## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----read the dataset--------------------------------------------------------- data(Boston, package='MASS') dd <- Boston[, c(1, 3, 5:8, 10:14)] dd[, names(dd) != 'medv'] <- log( dd[, names(dd) != 'medv'] ) ## ----loadpckg----------------------------------------------------------------- library(RBF) ## ----bandwidths--------------------------------------------------------------- bandw <- apply(dd[, names(dd) != 'medv'], 2, sd) / 2 ## ----robustfit---------------------------------------------------------------- robust.fit <- backf.rob(medv ~ ., data = dd, degree = 0, type = 'Huber', windows = bandw) ## ----summary------------------------------------------------------------------ summary(robust.fit) ## ----plot, out.width = "45%"------------------------------------------------- plot(robust.fit) ## ----preds-------------------------------------------------------------------- po <- colMeans(dd[, names(dd) != 'medv']) robust.fit1 <- backf.rob(medv ~ ., data = dd, degree = 0, type = 'Huber', windows = bandw, point = po) ## ----showpred----------------------------------------------------------------- robust.fit1$prediction ## ----outliers----------------------------------------------------------------- dd2 <- dd dd2$medv[1:5]<- rep(400, 5) ## ----robustplotswithoutliers-------------------------------------------------- robust.fit.new <- backf.rob(medv ~ ., data = dd2, degree = 0, type = 'Huber', windows = bandw, point = po) summary(robust.fit.new) robust.fit.new$prediction ## ----robustplots2, warning=FALSE, out.width = "45%"-------------------------- for(j in 1:10) { name.x <- names(dd)[j] name.y <- bquote(paste(hat('g')[.(j)])) oo <- order(dd2[,j]) plot(dd2[oo,j], robust.fit.new$g.matrix[oo,j], type="l", lwd=2, col='blue', lty=1, xlab=name.x, ylab=name.y) lines(dd2[oo,j], robust.fit$g.matrix[oo,j], lwd=2, col='green', lty=2) } ## ----gam, warning=FALSE------------------------------------------------------- library(gam) fit.gam <- gam(medv ~ lo(crim, span=1.62) + lo(indus, span=0.58) + lo(nox, span=0.15) + lo(rm, span=0.08) + lo(age, span=0.46) + lo(dis, span=0.40) + lo(tax, span=0.30) + lo(ptratio, span=0.09) + lo(black, span=0.58) + lo(lstat, span=0.45), data=dd) fits <- predict(fit.gam, type='terms') fit.gam.new <- gam(medv ~ lo(crim, span=1.62) + lo(indus, span=0.58) + lo(nox, span=0.15) + lo(rm, span=0.08) + lo(age, span=0.46) + lo(dis, span=0.40) + lo(tax, span=0.30) + lo(ptratio, span=0.09) + lo(black, span=0.58) + lo(lstat, span=0.45), data=dd2) fits.new <- predict(fit.gam.new, type='terms') ## ----gamplots, out.width = "45%"--------------------------------------------- for(j in 1:10) { oo <- order(dd2[,j]) name.x <- names(dd)[j] name.y <- bquote(paste(hat('g')[.(j)])) plot(dd2[oo,j], fits.new[oo,j], type="l", lwd=2, col='purple', lty=1, xlab=name.x, ylab=name.y) lines(dd2[oo,j], fits[oo,j], lwd=2, col='darkorange2', lty=2) } ## ----scatterplot, out.width = "75%"------------------------------------------ data(airquality) ccs <- complete.cases(airquality) aircomplete <- airquality[ccs, c('Ozone', 'Solar.R', 'Wind', 'Temp')] pairs(aircomplete[, c('Ozone', 'Solar.R', 'Wind', 'Temp')], pch=19, col='gray30') ## ----robustcv, warning=FALSE, eval=FALSE-------------------------------------- # library(RBF) # # # Bandwidth selection with leave-one-out cross-validation # ## Without outliers # # This takes a long time to compute (approx 380 minutes running # # R 3.6.1 on an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz) # a <- c(1/2, 1, 1.5, 2, 2.5, 3) # h1 <- a * sd(aircomplete[,2]) # h2 <- a * sd(aircomplete[,3]) # h3 <- a * sd(aircomplete[,4]) # hh <- expand.grid(h1, h2, h3) # nh <- nrow(hh) # rmspe <- rep(NA, nh) # jbest <- 0 # cvbest <- +Inf # n <- nrow(aircomplete) # for(i in 1:nh) { # # leave-one-out CV loop # preds <- rep(NA, n) # for(j in 1:n) { # tmp <- try( backf.rob(Ozone ~ Solar.R + Wind + Temp, point = aircomplete[j, -1], # windows = hh[i, ], epsilon = 1e-6, data = aircomplete, # degree = 1, type = 'Tukey', subset = c(-j) )) # if (class(tmp)[1] != "try-error") { # preds[j] <- rowSums(tmp$prediction) + tmp$alpha # } # } # tmp.re <- RobStatTM::locScaleM(preds - aircomplete$Ozone, na.rm=TRUE) # rmspe[i] <- tmp.re$mu^2 + tmp.re$disper^2 # if( rmspe[i] < cvbest ) { # jbest <- i # cvbest <- rmspe[i] # } # } # (bandw <- hh[jbest,]) ## ----bandw-------------------------------------------------------------------- bandw <- c(136.7285, 10.67314, 4.764985) ## ----fitfull------------------------------------------------------------------ fit.full <- backf.rob(Ozone ~ Solar.R + Wind + Temp, windows = bandw, epsilon = 1e-6, degree = 1, type = 'Tukey', subset = ccs, data = airquality) ## ----plotfitfull, out.width = "45%"------------------------------------------ plot(fit.full) ## ----gamcv, warning=FALSE, eval=FALSE----------------------------------------- # library(gam) # a <- c(.3, .4, .5, .6, .7, .8, .9) # hh <- expand.grid(a, a, a) # nh <- nrow(hh) # jbest <- 0 # cvbest <- +Inf # n <- nrow(aircomplete) # for(i in 1:nh) { # fi <- rep(0, n) # for(j in 1:n) { # tmp <- gam(Ozone ~ lo(Solar.R, span=hh[i,1]) + lo(Wind, span=hh[i,2]) # + lo(Temp, span=hh[i,3]), data = aircomplete, subset=c(-j)) # fi[j] <- as.numeric(predict(tmp, newdata=aircomplete[j, -1], type='response')) # } # ss <- mean((aircomplete$Ozone - fi)^2) # if(ss < cvbest) { # jbest <- i # cvbest <- ss # } # } # (hh[jbest,]) # # Var1 Var2 Var3 # # 0.7 0.7 0.5 ## ----fitgam------------------------------------------------------------------- fit.gam <- gam(Ozone ~ lo(Solar.R, span=.7) + lo(Wind, span=.7)+ lo(Temp, span=.5), data = aircomplete) ## ----plotrobgam, out.width = "45%"------------------------------------------- x <- as.matrix( aircomplete[ , c('Solar.R', 'Wind', 'Temp')] ) y <- as.vector( aircomplete[ , 'Ozone'] ) fits <- predict(fit.gam, type='terms') for(j in 1:3) { re <- fit.full$yp - fit.full$alpha - rowSums(fit.full$g.matrix[,-j]) plot(re ~ x[,j], type='p', pch=20, col='gray45', xlab=colnames(x)[j], ylab='') oo <- order(x[,j]) lines(x[oo,j], fit.full$g.matrix[oo,j], lwd=2, col='blue', lty=1) lines(x[oo,j], fits[oo,j], lwd=2, col='magenta', lty=2) } ## ----boxplot, out.width = "50%"---------------------------------------------- re.ro <- residuals(fit.full) ou.ro <- boxplot(re.ro, col='gray80')$out in.ro <- (1:length(re.ro))[ re.ro %in% ou.ro ] points(rep(1, length(in.ro)), re.ro[in.ro], pch=20, col='red') (in.ro) ## ----scatterplotpoints, out.width = "75%"------------------------------------ cs <- rep('gray30', nrow(aircomplete)) cs[in.ro] <- 'red' os <- 1:nrow(aircomplete) os2 <- c(os[-in.ro], os[in.ro]) pairs(aircomplete[os2, c('Ozone', 'Solar.R', 'Wind', 'Temp')], pch=19, col=cs[os2]) ## ----plotoutred, out.width = "45%"------------------------------------------- # Plot both fits (robust and classical) x <- as.matrix( aircomplete[ , c('Solar.R', 'Wind', 'Temp')] ) y <- as.vector( aircomplete[ , 'Ozone'] ) fits <- predict(fit.gam, type='terms') for(j in 1:3) { re <- fit.full$yp - fit.full$alpha - rowSums(fit.full$g.matrix[,-j]) plot(re ~ x[,j], type='p', pch=20, col='gray45', xlab=colnames(x)[j], ylab='') points(re[in.ro] ~ x[in.ro,j], pch=20, col='red') oo <- order(x[,j]) lines(x[oo,j], fit.full$g.matrix[oo,j], lwd=2, col='blue', lty=1) lines(x[oo,j], fits[oo,j], lwd=2, col='magenta', lty=2) } ## ----cvgamclean, eval=FALSE--------------------------------------------------- # airclean <- aircomplete[-in.ro, c('Ozone', 'Solar.R', 'Wind', 'Temp')] # a <- c(.3, .4, .5, .6, .7, .8, .9) # hh <- expand.grid(a, a, a) # nh <- nrow(hh) # jbest <- 0 # cvbest <- +Inf # n <- nrow(airclean) # for(i in 1:nh) { # fi <- rep(0, n) # for(j in 1:n) { # tmp <- gam(Ozone ~ lo(Solar.R, span=hh[i,1]) + lo(Wind, span=hh[i,2]) # + lo(Temp, span=hh[i,3]), data=airclean, subset=c(-j)) # fi[j] <- as.numeric(predict(tmp, newdata=airclean[j,], type='response')) # } # ss <- mean((airclean$Ozone - fi)^2) # if(ss < cvbest) { # jbest <- i # cvbest <- ss # } # } # (hh[jbest,]) # # Var1 Var2 Var3 # # 0.7 0.8 0.3 ## ----fitgam2------------------------------------------------------------------ airclean <- aircomplete[-in.ro, c('Ozone', 'Solar.R', 'Wind', 'Temp')] fit.gam2 <- gam(Ozone ~ lo(Solar.R, span=.7) + lo(Wind, span=.8)+ lo(Temp, span=.3), data=airclean) ## ----finalplot, out.width = "45%"-------------------------------------------- fits2 <- predict(fit.gam2, type='terms') dd2 <- aircomplete[-in.ro, c('Solar.R', 'Wind', 'Temp')] for(j in 1:3) { re <- fit.full$yp - fit.full$alpha - rowSums(fit.full$g.matrix[,-j]) plot(re ~ x[,j], type='p', pch=20, col='gray45', xlab=colnames(x)[j], ylab='') points(re[in.ro] ~ x[in.ro,j], pch=20, col='red') oo <- order(dd2[,j]) lines(dd2[oo,j], fits2[oo,j], lwd=2, col='magenta', lty=2) oo <- order(x[,j]) lines(x[oo,j], fit.full$g.matrix[oo,j], lwd=2, col='blue', lty=1) } ## ----pred1-------------------------------------------------------------------- tms <- function(a, alpha=.1) { # alpha is the proportion to trim a2 <- sort(a^2, na.last=NA) n0 <- floor( length(a) * (1 - alpha) ) return( mean(a2[1:n0], na.rm=TRUE) ) } ## ----pred2, warning=FALSE, eval=FALSE----------------------------------------- # dd <- airquality # dd <- dd[complete.cases(dd), c('Ozone', 'Solar.R', 'Wind', 'Temp')] # # 100 runs of K-fold CV # M <- 100 # # 5-fold # K <- 5 # n <- nrow(dd) # # store (trimmed) TMSPE for robust and gam, and also # tmspe.ro <- tmspe.gam <- vector('numeric', M) # set.seed(123) # ii <- (1:n)%%K + 1 # for(runs in 1:M) { # tmpro <- tmpgam <- vector('numeric', n) # ii <- sample(ii) # for(j in 1:K) { # fit.full <- backf.rob(Ozone ~ Solar.R + Wind + Temp, # point=dd[ii==j, -1], windows = bandw, # epsilon = 1e-6, degree = 1, type = 'Tukey', # subset = (ii!=j), data = dd) # tmpro[ ii == j ] <- rowSums(fit.full$prediction) + fit.full$alpha # fit.gam <- gam(Ozone ~ lo(Solar.R, span=.7) + lo(Wind, span=.7)+ # lo(Temp, span=.5), data = dd[ii!=j, ]) # tmpgam[ ii == j ] <- predict(fit.gam, newdata=dd[ii==j, ], type='response') # } # tmspe.ro[runs] <- tms( dd$Ozone - tmpro, alpha=0.05) # tmspe.gam[runs] <- tms( dd$Ozone - tmpgam, alpha=0.05) # }