Linear Models
Lab#6 – Penalized regression
1 Previous work
In Lab#4 we fitted and briefly compare three candidate models for the diabetes dataset.
url <- "https://web.tecnico.ulisboa.pt/paulo.soares/aml/data/diabetes.txt"
db <- read.delim(url)set.seed(3185) # for reproducible example
n <- nrow(db)
test.index <- sample(n, 0.2 * n)
db.test <- db[test.index,]
db.train <- db[-test.index,]1.1 Manual variable selection
mod4 <- lm(Y ~ SEX + BMI + BP + S1 + S2 + S5 + BMI:BP + BMI:S1, data = db.train)1.2 Best subsets
library(leaps)
best <- regsubsets(Y ~ (.)^2, data = db.train, really.big = TRUE, nbest = 1,
nvmax = 8)
info <- summary(best)
plot(2:9, info$cp, xlab = 'P (# of predictors + 1)', ylab = 'Cp')
abline(a = 0, b = 1, col = "blue")columns <- info$which[6,]
columns[columns == TRUE](Intercept) AGE SEX AGE:SEX BMI:S1 BMI:S5
TRUE TRUE TRUE TRUE TRUE TRUE
BP:S2
TRUE
mod6 <- lm(Y ~ AGE + SEX + AGE:SEX + BMI:S1 + BMI:S5 + BP:S2, data = db.train)1.3 Stepwise regression
initial_model <- lm(Y ~ ., data = db.train)
mod7 <- step(initial_model, scope = list(lower = Y ~ 1, upper = Y ~ (.)^2),
trace = 0, direction = "both")1.4 Model comparison
models <- list(mod4, mod6, mod7)
measures <- function(model) {
summ <- summary(model)
pred <- predict(model, db.test)
dev <- pred - db.test$Y
data.frame(df = summ$df[1],
adjR2 = summ$adj.r.squared,
press = sum((residuals(model) / (1 - diag(hatvalues(model))))^2),
rmspe = sqrt(mean(dev^2)))
}
results <- do.call(rbind, lapply(models, measures))
rownames(results) <- c("Manual", "Best subsets", "Stepwise")
results df adjR2 press rmspe
Manual 9 0.5205 356054622 52.63
Best subsets 7 0.5351 347187035 53.39
Stepwise 17 0.5507 325947316 55.54
2 LASSO regression
Use the LASSO to fit a regression model with first-order interactions.
library(glmnet)
X <- model.matrix(Y ~ (.)^2 - 1, data = db.train)
# This may change because it is a randomized procedure
cv8 <- cv.glmnet(X, db.train$Y)
plot(cv8)cv8$lambda.min[1] 0.1368
cv8$lambda.1se[1] 6.807
mod8 <- glmnet(X, db.train$Y, lambda=cv8$lambda.1se)
coef8 <- coef(mod8)
coef8 <- coef8[coef8[,1]!=0,]
coef8(Intercept) S3 SEX:S3 BMI:BP BMI:S5 BP:S5
-24.191861 -0.233140 -0.220151 0.010343 1.020209 0.110179
S5:S6
0.007685
newX <- model.matrix(Y ~ (.)^2 - 1, data = db.test)
df8 <- length(coef8)
pred8 <- predict(mod8, newX, type = "response")
dev8 <- pred8 - db.test$Y
rmspe8 <- sqrt(mean(dev8^2))
results <- rbind(results, c(df8, NA, NA, rmspe8))
rownames(results)[4] <- "LASSO"
results df adjR2 press rmspe
Manual 9 0.5205 356054622 52.63
Best subsets 7 0.5351 347187035 53.39
Stepwise 17 0.5507 325947316 55.54
LASSO 7 NA NA 51.48