## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE, comment = "#>", eval = FALSE, cache = FALSE,
  warning = FALSE, fig.width = 8, fig.height = 5
)
library(fastcpd)

## ----logistic-regression-setup------------------------------------------------
# #' Cost function for logistic regression, i.e., the binomial family in GLM.
# #'
# #' @param data Data to be used to calculate the cost values. The last column
# #'   is the response variable.
# #' @param family Family of the distribution.
# #' @keywords internal
# #'
# #' @noRd
# #' @return Cost value for the corresponding segment of data.
# cost_glm_binomial <- function(data, family = "binomial") {
#   data <- as.matrix(data)
#   p <- dim(data)[2] - 1
#   out <- fastglm::fastglm(
#     as.matrix(data[, 1:p]), data[, p + 1],
#     family = family
#   )
#   return(out$deviance / 2)
# }
#
# #' Implementation of vanilla PELT for logistic regression type data.
# #'
# #' @param data Data to be used for change point detection.
# #' @param beta Penalty coefficient for the number of change points.
# #' @param cost Cost function to be used to calculate cost values.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list consisting of two elements: the change point locations and
# #'   the negative log likelihood summed over the fitted segments.
# pelt_vanilla_binomial <- function(data, beta, cost = cost_glm_binomial) {
#   n <- dim(data)[1]
#   p <- dim(data)[2] - 1
#   Fobj <- c(-beta, 0)
#   cp_set <- list(NULL, 0)
#   set <- c(0, 1)
#   for (t in 2:n)
#   {
#     m <- length(set)
#     cval <- rep(NA, m)
#     for (i in 1:m)
#     {
#       k <- set[i] + 1
#       cval[i] <- 0
#       if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ]))
#     }
#     obj <- cval + Fobj[set + 1] + beta
#     min_val <- min(obj)
#     ind <- which(obj == min_val)[1]
#     cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
#     cp_set <- append(cp_set, list(cp_set_add))
#     ind2 <- (cval + Fobj[set + 1]) <= min_val
#     set <- c(set[ind2], t)
#     Fobj <- c(Fobj, min_val)
#   }
#   cp <- cp_set[[n + 1]]
#   nLL <- 0
#   cp_loc <- unique(c(0, cp, n))
#   for (i in 1:(length(cp_loc) - 1))
#   {
#     seg <- (cp_loc[i] + 1):cp_loc[i + 1]
#     data_seg <- data[seg, ]
#     out <- fastglm::fastglm(
#       as.matrix(data_seg[, 1:p]), data_seg[, p + 1],
#       family = "binomial"
#     )
#     nLL <- out$deviance / 2 + nLL
#   }
#
#   output <- list(cp, nLL)
#   names(output) <- c("cp", "nLL")
#   return(output)
# }
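#
# # pelt_vanilla_binomial() follows the standard PELT recursion
# # F(t) = min_{s in set} {F(s) + C(s + 1, t) + beta}, pruning every s with
# # F(s) + C(s + 1, t) > F(t). A minimal sketch of the same recursion with a
# # Gaussian mean-change cost (an illustration only; the function name and
# # data are made up and not part of the vignette):
# pelt_mean_toy <- function(z, beta) {
#   n <- length(z)
#   cost_mean <- function(seg) sum((seg - mean(seg))^2) / 2
#   Fobj <- c(-beta, 0)
#   cp_set <- list(NULL, 0)
#   set <- c(0, 1)
#   for (t in 2:n) {
#     cval <- sapply(set, function(s) cost_mean(z[(s + 1):t]))
#     obj <- cval + Fobj[set + 1] + beta
#     min_val <- min(obj)
#     ind <- which.min(obj)
#     cp_set <- append(cp_set, list(c(cp_set[[set[ind] + 1]], set[ind])))
#     set <- c(set[(cval + Fobj[set + 1]) <= min_val], t)
#     Fobj <- c(Fobj, min_val)
#   }
#   cp_set[[n + 1]]
# }
# set.seed(1)
# pelt_mean_toy(c(rnorm(50, 0), rnorm(50, 3)), beta = log(100))
# # should report a change point near 50 (a leading 0 is the boundary
# # sentinel, as in the functions above)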
#
# #' Function to update the coefficients using gradient descent.
# #'
# #' @param data_new New data point used to update the coefficient.
# #' @param coef Previous coefficient to be updated.
# #' @param cum_coef Summation of all the past coefficients to be used in
# #'   averaging.
# #' @param cmatrix Hessian matrix in gradient descent.
# #' @param epsilon Small adjustment to avoid singularity when inverting the
# #'   Hessian matrix.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list of values containing the new coefficients, summation of
# #'   coefficients so far and the updated Hessian matrix.
# cost_logistic_update <- function(
#     data_new, coef, cum_coef, cmatrix, epsilon = 1e-10) {
#   p <- length(data_new) - 1
#   X_new <- data_new[1:p]
#   Y_new <- data_new[p + 1]
#   eta <- X_new %*% coef
#   mu <- 1 / (1 + exp(-eta))
#   cmatrix <- cmatrix + (X_new %o% X_new) * as.numeric((1 - mu) * mu)
#   lik_dev <- as.numeric(-(Y_new - mu)) * X_new
#   coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev)
#   cum_coef <- cum_coef + coef
#   return(list(coef, cum_coef, cmatrix))
# }
#
# #' Calculate negative log likelihood given data segment and guess of
# #' coefficient.
# #'
# #' @param data Data to be used to calculate the negative log likelihood.
# #' @param b Guess of the coefficient.
# #' @keywords internal
# #'
# #' @noRd
# #' @return Negative log likelihood.
# neg_log_lik_binomial <- function(data, b) {
#   p <- dim(data)[2] - 1
#   X <- data[, 1:p, drop = FALSE]
#   Y <- data[, p + 1, drop = FALSE]
#   u <- as.numeric(X %*% b)
#   L <- -Y * u + log(1 + exp(u))
#   return(sum(L))
# }
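#
# # For a 0/1 response the saturated Bernoulli log likelihood is zero, so
# # the deviance / 2 returned by cost_glm_binomial() should equal
# # neg_log_lik_binomial() evaluated at the fitted coefficients. A quick
# # consistency check (a sketch; the simulated data are made up):
# set.seed(2)
# X_chk <- matrix(rnorm(200), 100, 2)
# y_chk <- rbinom(100, 1, 1 / (1 + exp(-X_chk %*% c(1, -1))))
# fit_chk <- fastglm::fastglm(X_chk, y_chk, family = "binomial")
# fit_chk$deviance / 2
# neg_log_lik_binomial(cbind(X_chk, y_chk), coef(fit_chk))
# # the two values should agree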
#
# #' Find change points using dynamic programming with pruning and SeGD.
# #'
# #' @param data Data used to find change points.
# #' @param beta Penalty coefficient for the number of change points.
# #' @param B Initial guess of the number of change points, used to
# #'   pre-segment the data.
# #' @param trim Proportion of the data to ignore the change points at the
# #'   beginning, ending and between change points.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list containing potential change point locations and the
# #'   negative log likelihood based on the change point guesses.
# segd_binomial <- function(data, beta, B = 10, trim = 0.025) {
#   n <- dim(data)[1]
#   p <- dim(data)[2] - 1
#   Fobj <- c(-beta, 0)
#   cp_set <- list(NULL, 0)
#   set <- c(0, 1)
#
#   # choose the initial values based on pre-segmentation
#
#   index <- rep(1:B, rep(n / B, B))
#   coef.int <- matrix(NA, B, p)
#   for (i in 1:B)
#   {
#     out <- fastglm::fastglm(
#       as.matrix(data[index == i, 1:p]),
#       data[index == i, p + 1],
#       family = "binomial"
#     )
#     coef.int[i, ] <- coef(out)
#   }
#   X1 <- data[1, 1:p]
#   cum_coef <- coef <- matrix(coef.int[1, ], p, 1)
#   e_eta <- exp(coef %*% X1)
#   const <- e_eta / (1 + e_eta)^2
#   cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1))
#
#   for (t in 2:n)
#   {
#     m <- length(set)
#     cval <- rep(NA, m)
#
#     for (i in 1:(m - 1))
#     {
#       coef_c <- coef[, i]
#       cum_coef_c <- cum_coef[, i]
#       cmatrix_c <- cmatrix[, , i]
#       out <- cost_logistic_update(data[t, ], coef_c, cum_coef_c, cmatrix_c)
#       coef[, i] <- out[[1]]
#       cum_coef[, i] <- out[[2]]
#       cmatrix[, , i] <- out[[3]]
#       k <- set[i] + 1
#       cval[i] <- 0
#       if (t - k >= p - 1) {
#         cval[i] <-
#           neg_log_lik_binomial(data[k:t, ], cum_coef[, i] / (t - k + 1))
#       }
#     }
#
#     # the choice of initial values requires further investigation
#
#     cval[m] <- 0
#     Xt <- data[t, 1:p]
#     cum_coef_add <- coef_add <- coef.int[index[t], ]
#     e_eta_t <- exp(coef_add %*% Xt)
#     const <- e_eta_t / (1 + e_eta_t)^2
#     cmatrix_add <- (Xt %o% Xt) * as.numeric(const)
#
#     coef <- cbind(coef, coef_add)
#     cum_coef <- cbind(cum_coef, cum_coef_add)
#     cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3)
#
#     # Adding a momentum term (TBD)
#
#     obj <- cval + Fobj[set + 1] + beta
#     min_val <- min(obj)
#     ind <- which(obj == min_val)[1]
#     cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
#     cp_set <- append(cp_set, list(cp_set_add))
#     ind2 <- (cval + Fobj[set + 1]) <= min_val
#     set <- c(set[ind2], t)
#     coef <- coef[, ind2, drop = FALSE]
#     cum_coef <- cum_coef[, ind2, drop = FALSE]
#     cmatrix <- cmatrix[, , ind2, drop = FALSE]
#     Fobj <- c(Fobj, min_val)
#   }
#
#   # Remove change points close to the boundaries
#
#   cp <- cp_set[[n + 1]]
#   if (length(cp) > 0) {
#     ind3 <- seq_along(cp)[(cp < trim * n) | (cp > (1 - trim) * n)]
#     if (length(ind3) > 0) cp <- cp[-ind3]
#   }
#
#   nLL <- 0
#   cp_loc <- unique(c(0, cp, n))
#   for (i in 1:(length(cp_loc) - 1))
#   {
#     seg <- (cp_loc[i] + 1):cp_loc[i + 1]
#     data_seg <- data[seg, ]
#     out <- fastglm::fastglm(
#       as.matrix(data_seg[, 1:p]), data_seg[, p + 1],
#       family = "binomial"
#     )
#     nLL <- out$deviance / 2 + nLL
#   }
#
#   output <- list(cp, nLL)
#   names(output) <- c("cp", "nLL")
#   return(output)
# }
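#
# # The idea behind SeGD is visible outside the dynamic program: streaming
# # cost_logistic_update() over a segment and averaging the iterates gives a
# # coefficient estimate close to the batch MLE without refitting the model
# # at every step. A sketch on one homogeneous segment (simulated data, with
# # a zero initial value instead of the pre-segmentation estimate):
# set.seed(3)
# X_seg <- cbind(1, rnorm(200))
# y_seg <- rbinom(200, 1, 1 / (1 + exp(-X_seg %*% c(-0.5, 1))))
# coef_s <- cum_s <- matrix(0, 2, 1)
# cmat_s <- diag(1e-10, 2)
# for (t in 1:200) {
#   upd <- cost_logistic_update(c(X_seg[t, ], y_seg[t]), coef_s, cum_s, cmat_s)
#   coef_s <- upd[[1]]
#   cum_s <- upd[[2]]
#   cmat_s <- upd[[3]]
# }
# cum_s / 200  # averaged SeGD estimate
# coef(fastglm::fastglm(X_seg, y_seg, family = "binomial"))  # batch MLE
# # the two should be close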

## ----poisson-regression-setup-------------------------------------------------
# #' Cost function for Poisson regression.
# #'
# #' @param data Data to be used to calculate the cost values. The last column
# #'   is the response variable.
# #' @param family Family of the distribution.
# #' @keywords internal
# #'
# #' @noRd
# #' @return Cost value for the corresponding segment of data.
# cost_glm_poisson <- function(data, family = "poisson") {
#   data <- as.matrix(data)
#   p <- dim(data)[2] - 1
#   out <- fastglm::fastglm(
#     as.matrix(data[, 1:p]), data[, p + 1],
#     family = family
#   )
#   return(out$deviance / 2)
# }
#
# #' Implementation of vanilla PELT for Poisson regression type data.
# #'
# #' @param data Data to be used for change point detection.
# #' @param beta Penalty coefficient for the number of change points.
# #' @param cost Cost function to be used to calculate cost values.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list containing the change point locations.
# pelt_vanilla_poisson <- function(data, beta, cost = cost_glm_poisson) {
#   n <- dim(data)[1]
#   p <- dim(data)[2] - 1
#   Fobj <- c(-beta, 0)
#   cp_set <- list(NULL, 0)
#   set <- c(0, 1)
#   for (t in 2:n)
#   {
#     m <- length(set)
#     cval <- rep(NA, m)
#     for (i in 1:m)
#     {
#       k <- set[i] + 1
#       if (t - k >= p - 1) {
#         cval[i] <- suppressWarnings(cost(data[k:t, ]))
#       } else {
#         cval[i] <- 0
#       }
#     }
#     obj <- cval + Fobj[set + 1] + beta
#     min_val <- min(obj)
#     ind <- which(obj == min_val)[1]
#     cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
#     cp_set <- append(cp_set, list(cp_set_add))
#     ind2 <- (cval + Fobj[set + 1]) <= min_val
#     set <- c(set[ind2], t)
#     Fobj <- c(Fobj, min_val)
#     # if (t %% 100 == 0) print(t)
#   }
#   cp <- cp_set[[n + 1]]
#   output <- list(cp)
#   names(output) <- c("cp")
#
#   return(output)
# }
#
# #' Function to update the coefficients using gradient descent.
# #'
# #' @param data_new New data point used to update the coefficient.
# #' @param coef Previous coefficient to be updated.
# #' @param cum_coef Summation of all the past coefficients to be used in
# #'   averaging.
# #' @param cmatrix Hessian matrix in gradient descent.
# #' @param epsilon Small adjustment to avoid singularity when inverting the
# #'   Hessian matrix.
# #' @param G Upper bound on the fitted Poisson mean used in the Hessian
# #'   update.
# #' @param L Winsorization lower bound for the coefficients.
# #' @param H Winsorization upper bound for the coefficients.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list of values containing the new coefficients, summation of
# #'   coefficients so far and the updated Hessian matrix.
# cost_poisson_update <- function(
#     data_new, coef, cum_coef, cmatrix, epsilon = 0.001, G = 10^10,
#     L = -20, H = 20) {
#   p <- length(data_new) - 1
#   X_new <- data_new[1:p]
#   Y_new <- data_new[p + 1]
#   eta <- X_new %*% coef
#   mu <- exp(eta)
#   cmatrix <- cmatrix + (X_new %o% X_new) * min(as.numeric(mu), G)
#   lik_dev <- as.numeric(-(Y_new - mu)) * X_new
#   coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev)
#   coef <- pmin(pmax(coef, L), H)
#   cum_coef <- cum_coef + coef
#   return(list(coef, cum_coef, cmatrix))
# }
#
# #' Calculate negative log likelihood given data segment and guess of
# #' coefficient.
# #'
# #' @param data Data to be used to calculate the negative log likelihood.
# #' @param b Guess of the coefficient.
# #' @keywords internal
# #'
# #' @noRd
# #' @return Negative log likelihood.
# neg_log_lik_poisson <- function(data, b) {
#   p <- dim(data)[2] - 1
#   X <- data[, 1:p, drop = FALSE]
#   Y <- data[, p + 1, drop = FALSE]
#   u <- as.numeric(X %*% b)
#   L <- -Y * u + exp(u) + lfactorial(Y)
#   return(sum(L))
# }
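#
# # The update direction in cost_poisson_update() uses the per-observation
# # gradient -(y - exp(x'b)) * x of the Poisson negative log likelihood. A
# # quick finite-difference check against neg_log_lik_poisson() (a sketch
# # with made-up numbers):
# set.seed(4)
# x1 <- rnorm(3)
# y1 <- rpois(1, 1)
# b0 <- rep(0.1, 3)
# analytic <- as.numeric(-(y1 - exp(sum(x1 * b0)))) * x1
# numeric <- sapply(1:3, function(j) {
#   h <- 1e-6
#   b1 <- b0
#   b1[j] <- b1[j] + h
#   (neg_log_lik_poisson(matrix(c(x1, y1), 1), b1) -
#      neg_log_lik_poisson(matrix(c(x1, y1), 1), b0)) / h
# })
# rbind(analytic, numeric)  # the two rows should agree to ~1e-5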
#
# #' Find change points using dynamic programming with pruning and SeGD.
# #'
# #' @param data Data used to find change points.
# #' @param beta Penalty coefficient for the number of change points.
# #' @param B Initial guess of the number of change points, used to
# #'   pre-segment the data.
# #' @param trim Proportion of the data to ignore the change points at the
# #'   beginning, ending and between change points.
# #' @param epsilon Small adjustment to avoid singularity when inverting the
# #'   Hessian matrix.
# #' @param G Upper bound on the fitted Poisson mean used in the Hessian
# #'   update.
# #' @param L Winsorization lower bound for the coefficients.
# #' @param H Winsorization upper bound for the coefficients.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list containing the potential change point locations.
# segd_poisson <- function(
#     data, beta, B = 10, trim = 0.03, epsilon = 0.001, G = 10^10,
#     L = -20, H = 20) {
#   n <- dim(data)[1]
#   p <- dim(data)[2] - 1
#   Fobj <- c(-beta, 0)
#   cp_set <- list(NULL, 0)
#   set <- c(0, 1)
#
#   # choose the initial values based on pre-segmentation
#
#   index <- rep(1:B, rep(n / B, B))
#   coef.int <- matrix(NA, B, p)
#   for (i in 1:B)
#   {
#     out <- fastglm::fastglm(
#       x = as.matrix(data[index == i, 1:p]),
#       y = data[index == i, p + 1],
#       family = "poisson"
#     )
#     coef.int[i, ] <- coef(out)
#   }
#   X1 <- data[1, 1:p]
#   cum_coef <- coef <- pmin(pmax(matrix(coef.int[1, ], p, 1), L), H)
#   e_eta <- exp(coef %*% X1)
#   const <- e_eta
#   cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1))
#
#   for (t in 2:n)
#   {
#     m <- length(set)
#     cval <- rep(NA, m)
#     for (i in 1:(m - 1))
#     {
#       coef_c <- coef[, i]
#       cum_coef_c <- cum_coef[, i]
#       cmatrix_c <- cmatrix[, , i]
#       out <- cost_poisson_update(
#         data[t, ], coef_c, cum_coef_c, cmatrix_c,
#         epsilon = epsilon, G = G, L = L, H = H
#       )
#       coef[, i] <- out[[1]]
#       cum_coef[, i] <- out[[2]]
#       cmatrix[, , i] <- out[[3]]
#       k <- set[i] + 1
#       cum_coef_win <- pmin(pmax(cum_coef[, i] / (t - k + 1), L), H)
#       if (t - k >= p - 1) {
#         cval[i] <- neg_log_lik_poisson(data[k:t, ], cum_coef_win)
#       } else {
#         cval[i] <- 0
#       }
#     }
#
#     # the choice of initial values requires further investigation
#
#     cval[m] <- 0
#     Xt <- data[t, 1:p]
#     cum_coef_add <- coef_add <- pmin(pmax(coef.int[index[t], ], L), H)
#     e_eta_t <- exp(coef_add %*% Xt)
#     const <- e_eta_t
#     cmatrix_add <- (Xt %o% Xt) * as.numeric(const)
#     coef <- cbind(coef, coef_add)
#     cum_coef <- cbind(cum_coef, cum_coef_add)
#     cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3)
#
#     # Adding a momentum term (TBD)
#
#     obj <- cval + Fobj[set + 1] + beta
#     min_val <- min(obj)
#     ind <- which(obj == min_val)[1]
#     cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
#     cp_set <- append(cp_set, list(cp_set_add))
#     ind2 <- (cval + Fobj[set + 1]) <= min_val
#     set <- c(set[ind2], t)
#     coef <- coef[, ind2, drop = FALSE]
#     cum_coef <- cum_coef[, ind2, drop = FALSE]
#     cmatrix <- cmatrix[, , ind2, drop = FALSE]
#     Fobj <- c(Fobj, min_val)
#   }
#
#   # Remove change points close to the boundaries
#
#   cp <- cp_set[[n + 1]]
#   if (length(cp) > 0) {
#     ind3 <- seq_along(cp)[(cp < trim * n) | (cp > (1 - trim) * n)]
#     if (length(ind3) > 0) cp <- cp[-ind3]
#   }
#
#   # Merge change points that are too close to each other
#
#   cp <- sort(unique(c(0, cp)))
#   index <- which(diff(cp) < trim * n)
#   if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
#   cp <- cp[cp > 0]
#
#   # nLL <- 0
#   # cp_loc <- unique(c(0, cp, n))
#   # for (i in 1:(length(cp_loc) - 1))
#   # {
#   #   seg <- (cp_loc[i] + 1):cp_loc[i + 1]
#   #   data_seg <- data[seg, ]
#   #   out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p + 1], family = "poisson")
#   #   nLL <- out$deviance / 2 + nLL
#   # }
#   # output <- list(cp, nLL)
#   # names(output) <- c("cp", "nLL")
#
#   output <- list(cp)
#   names(output) <- c("cp")
#
#   return(output)
# }
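#
# # The merging step at the end replaces change points that are closer than
# # trim * n with their midpoint. A toy illustration with made-up values
# # (n = 1000 and trim = 0.03, so the threshold is 30 observations):
# cp <- c(0, 380, 395, 751)
# index <- which(diff(cp) < 30)
# if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
# cp[cp > 0]
# #> [1] 387 751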
#
# #' Generate data from Poisson regression models with change points.
# #'
# #' @param n Number of observations.
# #' @param d Dimension of the covariates.
# #' @param true.coef True regression coefficients.
# #' @param true.cp.loc True change-point locations.
# #' @param Sigma Covariance matrix of the covariates.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list containing the generated data and the true cluster
# #'   assignments.
# data_gen_poisson <- function(n, d, true.coef, true.cp.loc, Sigma) {
#   loc <- unique(c(0, true.cp.loc, n))
#   if (dim(true.coef)[2] != length(loc) - 1) {
#     stop("true.coef and true.cp.loc do not match")
#   }
#   x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma)
#   y <- NULL
#   for (i in 1:(length(loc) - 1))
#   {
#     mu <- exp(
#       x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*%
#         true.coef[, i, drop = FALSE]
#     )
#     group <- rpois(length(mu), mu)
#     y <- c(y, group)
#   }
#   data <- cbind(x, y)
#   true_cluster <- rep(1:(length(loc) - 1), diff(loc))
#   result <- list(data, true_cluster)
#   return(result)
# }
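#
# # A quick sanity check of the generator (a sketch; the arguments are made
# # up for illustration):
# out_chk <- data_gen_poisson(
#   n = 300, d = 3, true.coef = cbind(rep(0.5, 3), rep(-0.5, 3)),
#   true.cp.loc = 150, Sigma = diag(3)
# )
# dim(out_chk[[1]])   # 300 x 4: three covariates plus the response
# table(out_chk[[2]]) # two clusters of 150 observations each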

## ----penalized-linear-regression-setup----------------------------------------
# #' Cost function for penalized linear regression.
# #'
# #' @param data Data to be used to calculate the cost values. The last column
# #'   is the response variable.
# #' @param lambda Penalty coefficient.
# #' @param family Family of the distribution.
# #' @keywords internal
# #'
# #' @noRd
# #' @return Cost value for the corresponding segment of data.
# cost_lasso <- function(data, lambda, family = "gaussian") {
#   data <- as.matrix(data)
#   n <- dim(data)[1]
#   p <- dim(data)[2] - 1
#   out <- glmnet::glmnet(
#     as.matrix(data[, 1:p]), data[, p + 1],
#     family = family, lambda = lambda
#   )
#   return(deviance(out) / 2)
# }
#
# #' Implementation of vanilla PELT for penalized linear regression type data.
# #'
# #' @param data Data to be used for change point detection.
# #' @param beta Penalty coefficient for the number of change points.
# #' @param B Number of blocks used to pre-segment the data.
# #' @param cost Cost function to be used to calculate cost values.
# #' @param family Family of the distribution.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list containing the change point locations.
# pelt_vanilla_lasso <- function(
#     data, beta, B = 10, cost = cost_lasso, family = "gaussian") {
#   n <- dim(data)[1]
#   p <- dim(data)[2] - 1
#   index <- rep(1:B, rep(n / B, B))
#   err_sd <- act_num <- rep(NA, B)
#   for (i in 1:B)
#   {
#     cvfit <- glmnet::cv.glmnet(
#       as.matrix(data[index == i, 1:p]), data[index == i, p + 1],
#       family = family
#     )
#     coef <- coef(cvfit, s = "lambda.1se")[-1]
#     resi <- data[index == i, p + 1] -
#       as.matrix(data[index == i, 1:p]) %*% as.numeric(coef)
#     err_sd[i] <- sqrt(mean(resi^2))
#     act_num[i] <- sum(abs(coef) > 0)
#   }
#   err_sd_mean <- mean(err_sd)  # only works if the error sd is unchanged
#   act_num_mean <- mean(act_num)
#   beta <- (act_num_mean + 1) * beta  # seems to work but there might be
#                                      # better choices
#
#   Fobj <- c(-beta, 0)
#   cp_set <- list(NULL, 0)
#   set <- c(0, 1)
#   for (t in 2:n)
#   {
#     m <- length(set)
#     cval <- rep(NA, m)
#     for (i in 1:m)
#     {
#       k <- set[i] + 1
#       if (t - k >= 1) {
#         cval[i] <- suppressWarnings(cost(
#           data[k:t, ], lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))
#         ))
#       } else {
#         cval[i] <- 0
#       }
#     }
#     obj <- cval + Fobj[set + 1] + beta
#     min_val <- min(obj)
#     ind <- which(obj == min_val)[1]
#     cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
#     cp_set <- append(cp_set, list(cp_set_add))
#     ind2 <- (cval + Fobj[set + 1]) <= min_val
#     set <- c(set[ind2], t)
#     Fobj <- c(Fobj, min_val)
#     if (t %% 100 == 0) print(t)
#   }
#   cp <- cp_set[[n + 1]]
#
#   # nLL <- 0
#   # cp_loc <- unique(c(0, cp, n))
#   # for (i in 1:(length(cp_loc) - 1))
#   # {
#   #   seg <- (cp_loc[i] + 1):cp_loc[i + 1]
#   #   data_seg <- data[seg, ]
#   #   out <- glmnet(as.matrix(data_seg[, 1:p]), data_seg[, p + 1], lambda = lambda, family = family)
#   #   nLL <- deviance(out) / 2 + nLL
#   # }
#   # output <- list(cp, nLL)
#
#   output <- list(cp)
#   names(output) <- c("cp")
#   return(output)
# }
#
# #' Soft-thresholding operator used in the sequential lasso update.
# #'
# #' @param a Coefficient to be updated.
# #' @param lambda Penalty coefficient.
# #' @keywords internal
# #'
# #' @noRd
# #' @return Updated coefficient.
# soft_threshold <- function(a, lambda) {
#   sign(a) * pmax(abs(a) - lambda, 0)
# }
#
# #' Function to update the coefficients using gradient descent.
# #'
# #' @param data_new New data point used to update the coefficient.
# #' @param coef Previous coefficient to be updated.
# #' @param cum_coef Summation of all the past coefficients to be used in
# #'   averaging.
# #' @param cmatrix Hessian matrix in gradient descent.
# #' @param lambda Penalty coefficient.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list of values containing the new coefficients, summation of
# #'   coefficients so far and the updated Hessian matrix.
# cost_lasso_update <- function(data_new, coef, cum_coef, cmatrix, lambda) {
#   p <- length(data_new) - 1
#   X_new <- data_new[1:p]
#   Y_new <- data_new[p + 1]
#   mu <- X_new %*% coef
#   cmatrix <- cmatrix + X_new %o% X_new
#   # B <- as.vector(cmatrix_inv %*% X_new)
#   # cmatrix_inv <- cmatrix_inv - B %o% B / (1 + sum(X_new * B))
#   lik_dev <- as.numeric(-(Y_new - mu)) * X_new
#   coef <- coef - solve(cmatrix, lik_dev)
#   # The choice of norm affects the speed: the spectral norm is more
#   # accurate but slower than the Frobenius norm.
#   nc <- norm(cmatrix, type = "F")
#   coef <- soft_threshold(coef, lambda / nc)
#   cum_coef <- cum_coef + coef
#   return(list(coef, cum_coef, cmatrix))
# }
#
# #' Calculate negative log likelihood given data segment and guess of
# #' coefficient.
# #'
# #' @param data Data to be used to calculate the negative log likelihood.
# #' @param b Guess of the coefficient.
# #' @param lambda Penalty coefficient.
# #' @keywords internal
# #'
# #' @noRd
# #' @return Negative log likelihood.
# neg_log_lik_lasso <- function(data, b, lambda) {
#   p <- dim(data)[2] - 1
#   X <- data[, 1:p, drop = FALSE]
#   Y <- data[, p + 1, drop = FALSE]
#   resi <- Y - X %*% b
#   L <- sum(resi^2) / 2 + lambda * sum(abs(b))
#   return(L)
# }
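#
# # soft_threshold() shrinks every coordinate toward zero and zeroes out the
# # small ones, which is what produces sparse estimates in
# # cost_lasso_update() above:
# soft_threshold(c(-3, -0.5, 0, 0.5, 3), lambda = 1)
# #> [1] -2  0  0  0  2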
#
# #' Find change points using dynamic programming with pruning and SeGD.
# #'
# #' @param data Data used to find change points.
# #' @param beta Penalty coefficient for the number of change points.
# #' @param B Initial guess of the number of change points, used to
# #'   pre-segment the data.
# #' @param trim Proportion of the data to ignore the change points at the
# #'   beginning, ending and between change points.
# #' @param epsilon Small adjustment to avoid singularity when inverting the
# #'   Hessian matrix.
# #' @param family Family of the distribution.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list containing the potential change point locations.
# segd_lasso <- function(
#     data, beta, B = 10, trim = 0.025, epsilon = 1e-5, family = "gaussian") {
#   n <- dim(data)[1]
#   p <- dim(data)[2] - 1
#   Fobj <- c(-beta, 0)
#   cp_set <- list(NULL, 0)
#   set <- c(0, 1)
#
#   # choose the initial values based on pre-segmentation
#
#   index <- rep(1:B, rep(n / B, B))
#   coef.int <- matrix(NA, B, p)
#   err_sd <- act_num <- rep(NA, B)
#   for (i in 1:B)
#   {
#     cvfit <- glmnet::cv.glmnet(
#       as.matrix(data[index == i, 1:p]), data[index == i, p + 1],
#       family = family
#     )
#     coef.int[i, ] <- coef(cvfit, s = "lambda.1se")[-1]
#     resi <- data[index == i, p + 1] -
#       as.matrix(data[index == i, 1:p]) %*% as.numeric(coef.int[i, ])
#     err_sd[i] <- sqrt(mean(resi^2))
#     act_num[i] <- sum(abs(coef.int[i, ]) > 0)
#   }
#   err_sd_mean <- mean(err_sd)  # only works if the error sd is unchanged
#   act_num_mean <- mean(act_num)
#   beta <- (act_num_mean + 1) * beta  # seems to work but there might be
#                                      # better choices
#
#   X1 <- data[1, 1:p]
#   cum_coef <- coef <- matrix(coef.int[1, ], p, 1)
#   eta <- coef %*% X1
#   # c_int <- diag(1 / epsilon, p) -
#   #   X1 %o% X1 / epsilon^2 / (1 + sum(X1^2) / epsilon)
#   # cmatrix_inv <- array(c_int, c(p, p, 1))
#   cmatrix <- array(X1 %o% X1 + epsilon * diag(1, p), c(p, p, 1))
#
#   for (t in 2:n)
#   {
#     m <- length(set)
#     cval <- rep(NA, m)
#
#     for (i in 1:(m - 1))
#     {
#       coef_c <- coef[, i]
#       cum_coef_c <- cum_coef[, i]
#       # cmatrix_inv_c <- cmatrix_inv[, , i]
#       cmatrix_c <- cmatrix[, , i]
#       k <- set[i] + 1
#       out <- cost_lasso_update(
#         data[t, ], coef_c, cum_coef_c, cmatrix_c,
#         lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))
#       )
#       coef[, i] <- out[[1]]
#       cum_coef[, i] <- out[[2]]
#       # cmatrix_inv[, , i] <- out[[3]]
#       cmatrix[, , i] <- out[[3]]
#       if (t - k >= 2) {
#         cval[i] <- neg_log_lik_lasso(
#           data[k:t, ], cum_coef[, i] / (t - k + 1),
#           lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))
#         )
#       } else {
#         cval[i] <- 0
#       }
#     }
#
#     # the choice of initial values requires further investigation
#
#     cval[m] <- 0
#     Xt <- data[t, 1:p]
#     cum_coef_add <- coef_add <- coef.int[index[t], ]
#     # cmatrix_inv_add <- diag(1 / epsilon, p) -
#     #   Xt %o% Xt / epsilon^2 / (1 + sum(Xt^2) / epsilon)
#
#     coef <- cbind(coef, coef_add)
#     cum_coef <- cbind(cum_coef, cum_coef_add)
#     # cmatrix_inv <- abind::abind(cmatrix_inv, cmatrix_inv_add, along = 3)
#     cmatrix <- abind::abind(cmatrix, Xt %o% Xt + epsilon * diag(1, p), along = 3)
#
#     obj <- cval + Fobj[set + 1] + beta
#     min_val <- min(obj)
#     ind <- which(obj == min_val)[1]
#     cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
#     cp_set <- append(cp_set, list(cp_set_add))
#     ind2 <- (cval + Fobj[set + 1]) <= min_val
#     set <- c(set[ind2], t)
#     coef <- coef[, ind2, drop = FALSE]
#     cum_coef <- cum_coef[, ind2, drop = FALSE]
#     cmatrix <- cmatrix[, , ind2, drop = FALSE]
#     # cmatrix_inv <- cmatrix_inv[, , ind2, drop = FALSE]
#     Fobj <- c(Fobj, min_val)
#   }
#
#   # Remove change points close to the boundaries and merge change points
#   # that are too close to each other
#
#   cp <- cp_set[[n + 1]]
#   if (length(cp) > 0) {
#     ind3 <- seq_along(cp)[(cp < trim * n) | (cp > (1 - trim) * n)]
#     if (length(ind3) > 0) cp <- cp[-ind3]
#   }
#
#   cp <- sort(unique(c(0, cp)))
#   index <- which(diff(cp) < trim * n)
#   if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
#   cp <- cp[cp > 0]
#
#   # nLL <- 0
#   # cp_loc <- unique(c(0, cp, n))
#   # for (i in 1:(length(cp_loc) - 1))
#   # {
#   #   seg <- (cp_loc[i] + 1):cp_loc[i + 1]
#   #   data_seg <- data[seg, ]
#   #   out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p + 1], family = "binomial")
#   #   nLL <- out$deviance / 2 + nLL
#   # }
#
#   output <- list(cp)
#   names(output) <- c("cp")
#   return(output)
# }
#
# #' Generate data from penalized linear regression models with change points.
# #'
# #' @param n Number of observations.
# #' @param d Dimension of the covariates.
# #' @param true.coef True regression coefficients.
# #' @param true.cp.loc True change-point locations.
# #' @param Sigma Covariance matrix of the covariates.
# #' @param evar Error variance.
# #' @keywords internal
# #'
# #' @noRd
# #' @return A list containing the generated data and the true cluster
# #'   assignments.
# data_gen_lasso <- function(n, d, true.coef, true.cp.loc, Sigma, evar) {
#   loc <- unique(c(0, true.cp.loc, n))
#   if (dim(true.coef)[2] != length(loc) - 1) {
#     stop("true.coef and true.cp.loc do not match")
#   }
#   x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma)
#   y <- NULL
#   for (i in 1:(length(loc) - 1))
#   {
#     Xb <- x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*%
#       true.coef[, i, drop = FALSE]
#     add <- Xb + rnorm(length(Xb), sd = sqrt(evar))
#     y <- c(y, add)
#   }
#   data <- cbind(x, y)
#   true_cluster <- rep(1:(length(loc) - 1), diff(loc))
#   result <- list(data, true_cluster)
#   return(result)
# }

## ----logistic-regression------------------------------------------------------
# set.seed(1)
# p <- 5
# x <- matrix(rnorm(300 * p, 0, 1), ncol = p)
#
# # Randomly generate coefficients with different means.
# theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1))
#
# # Randomly generate response variables based on the segmented data and
# # corresponding coefficients.
# y <- c(
#   rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))),
#   rbinom(300 - 125, 1, 1 / (1 + exp(-x[(125 + 1):300, ] %*% theta[2, ])))
# )
#
# segd_binomial(cbind(x, y), (p + 1) * log(300) / 2, B = 5)$cp
# #> [1] 125
#
# fastcpd.binomial(
#   cbind(y, x),
#   segment_count = 5,
#   beta = "BIC",
#   cost_adjustment = "BIC",
#   r.progress = FALSE
# )@cp_set
# #> [1] 125
#
# pelt_vanilla_binomial(cbind(x, y), (p + 1) * log(300) / 2)$cp
# #> [1]   0 125
#
# fastcpd.binomial(
#   cbind(y, x),
#   segment_count = 5,
#   vanilla_percentage = 1,
#   beta = "BIC",
#   cost_adjustment = "BIC",
#   r.progress = FALSE
# )@cp_set
# #> [1] 125
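#
# # Given the detected change point at 125, per-segment coefficients can be
# # recovered by refitting each segment (a sketch using the data simulated
# # above; the estimates should be close to theta[1, ] and theta[2, ]):
# for (seg in list(1:125, 126:300)) {
#   fit <- fastglm::fastglm(x[seg, ], y[seg], family = "binomial")
#   print(round(coef(fit), 2))
# }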

## ----poisson-regression-------------------------------------------------------
# set.seed(1)
# n <- 1500
# d <- 5
# rho <- 0.9
# Sigma <- array(0, c(d, d))
# for (i in 1:d) {
#   Sigma[i, ] <- rho^(abs(i - (1:d)))
# }
# delta <- c(5, 7, 9, 11, 13)
# a.sq <- 1
# delta.new <-
#   delta * sqrt(a.sq) / sqrt(as.numeric(t(delta) %*% Sigma %*% delta))
# true.cp.loc <- c(375, 750, 1125)
#
# # regression coefficients
# true.coef <- matrix(0, nrow = d, ncol = length(true.cp.loc) + 1)
# true.coef[, 1] <- c(1, 1.2, -1, 0.5, -2)
# true.coef[, 2] <- true.coef[, 1] + delta.new
# true.coef[, 3] <- true.coef[, 1]
# true.coef[, 4] <- true.coef[, 3] - delta.new
#
# out <- data_gen_poisson(n, d, true.coef, true.cp.loc, Sigma)
# data <- out[[1]]
# g_tr <- out[[2]]
# beta <- log(n) * (d + 1) / 2
#
# segd_poisson(
#   data, beta, trim = 0.03, B = 10, epsilon = 0.001, G = 10^10, L = -20, H = 20
# )$cp
# #> [1]  380  751 1136 1251
#
# fastcpd.poisson(
#   cbind(data[, d + 1], data[, 1:d]),
#   beta = beta,
#   cost_adjustment = "BIC",
#   epsilon = 0.001,
#   segment_count = 10,
#   r.progress = FALSE
# )@cp_set
# #> [1]  380  751 1136 1251
#
# pelt_vanilla_poisson(data, beta)$cp
# #> [1]    0  374  752 1133
#
# fastcpd.poisson(
#   cbind(data[, d + 1], data[, 1:d]),
#   segment_count = 10,
#   vanilla_percentage = 1,
#   beta = beta,
#   cost_adjustment = "BIC",
#   r.progress = FALSE
# )@cp_set
# #> [1]  374  752 1133

## ----penalized-linear-regression----------------------------------------------
# set.seed(1)
# n <- 1000
# s <- 3
# d <- 50
# evar <- 0.5
# Sigma <- diag(1, d)
# true.cp.loc <- c(100, 300, 500, 800, 900)
# seg <- length(true.cp.loc) + 1
# true.coef <- matrix(rnorm(seg * s), s, seg)
# true.coef <- rbind(true.coef, matrix(0, d - s, seg))
# out <- data_gen_lasso(n, d, true.coef, true.cp.loc, Sigma, evar)
# data <- out[[1]]
# beta <- log(n) / 2  # beta plays a different role here: it is rescaled
#                     # inside the functions by the mean active-set size
#
# segd_lasso(data, beta, B = 10, trim = 0.025)$cp
# #> [1] 100 300 520 800 901
#
# fastcpd.lasso(
#   cbind(data[, d + 1], data[, 1:d]),
#   epsilon = 1e-5,
#   beta = beta,
#   cost_adjustment = "BIC",
#   pruning_coef = 0,
#   r.progress = FALSE
# )@cp_set
# #> [1] 100 300 520 800 901
#
# pelt_vanilla_lasso(data, beta, cost = cost_lasso)$cp
# #> [1] 100
# #> [1] 200
# #> [1] 300
# #> [1] 400
# #> [1] 500
# #> [1] 600
# #> [1] 700
# #> [1] 800
# #> [1] 900
# #> [1] 1000
# #> [1]   0 103 299 510 800 900
#
# fastcpd.lasso(
#   cbind(data[, d + 1], data[, 1:d]),
#   vanilla_percentage = 1,
#   epsilon = 1e-5,
#   beta = beta,
#   cost_adjustment = "BIC",
#   pruning_coef = 0,
#   r.progress = FALSE
# )@cp_set
# #> [1] 103 299 510 800 900
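#
# # The main practical difference between the two implementations is speed:
# # pelt_vanilla_lasso() refits glmnet on every surviving candidate segment
# # at every time point, while segd_lasso() performs a single sequential
# # update per observation. A rough timing sketch (results are
# # machine-dependent; both calls reuse `data` and `beta` from above):
# system.time(segd_lasso(data, beta, B = 10, trim = 0.025))
# system.time(pelt_vanilla_lasso(data, beta, cost = cost_lasso))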