Linear Models

Lab#6 – Penalized regression

Author

Paulo Soares

Published

December 15, 2025

1 Previous work

In Lab#4 we fitted and briefly compare three candidate models for the diabetes dataset.

url <- "https://web.tecnico.ulisboa.pt/paulo.soares/aml/data/diabetes.txt"
db <- read.delim(url)
set.seed(3185)  # for reproducible example

n <- nrow(db)
test.index <- sample(n, 0.2 * n)
db.test <- db[test.index,]
db.train <- db[-test.index,]

1.1 Manual variable selection

mod4 <- lm(Y ~ SEX + BMI + BP + S1 + S2 + S5 + BMI:BP + BMI:S1, data = db.train)

1.2 Best subsets

library(leaps)

best <- regsubsets(Y ~ (.)^2, data = db.train, really.big = TRUE, nbest = 1,
                   nvmax = 8)
info <- summary(best)
plot(2:9, info$cp, xlab = 'P (# of predictors + 1)', ylab = 'Cp')
abline(a = 0, b = 1, col = "blue")

columns <- info$which[6,]
columns[columns == TRUE]
(Intercept)         AGE         SEX     AGE:SEX      BMI:S1      BMI:S5 
       TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
      BP:S2 
       TRUE 
mod6 <- lm(Y ~ AGE + SEX + AGE:SEX + BMI:S1 + BMI:S5 + BP:S2, data = db.train)

1.3 Stepwise regression

initial_model <- lm(Y ~ ., data = db.train)
mod7 <- step(initial_model, scope = list(lower = Y ~ 1, upper = Y ~ (.)^2),
             trace = 0, direction = "both")

1.4 Model comparison

models <- list(mod4, mod6, mod7)

measures <- function(model) {
  summ <- summary(model)
  
  pred <- predict(model, db.test)
  dev <- pred - db.test$Y
  
  data.frame(df = summ$df[1],
             adjR2 = summ$adj.r.squared,
             press = sum((residuals(model) / (1 - diag(hatvalues(model))))^2),
             rmspe = sqrt(mean(dev^2)))
}

results <- do.call(rbind, lapply(models, measures))
rownames(results) <- c("Manual", "Best subsets", "Stepwise")
results
             df  adjR2     press rmspe
Manual        9 0.5205 356054622 52.63
Best subsets  7 0.5351 347187035 53.39
Stepwise     17 0.5507 325947316 55.54

2 LASSO regression

Use the LASSO to fit a regression model with first-order interactions.

library(glmnet)

X <- model.matrix(Y ~ (.)^2 - 1, data = db.train)

# This may change because it is a randomized procedure
cv8 <- cv.glmnet(X, db.train$Y)
plot(cv8)

cv8$lambda.min
[1] 0.1368
cv8$lambda.1se
[1] 6.807
mod8 <- glmnet(X, db.train$Y, lambda=cv8$lambda.1se)

coef8 <- coef(mod8)
coef8 <- coef8[coef8[,1]!=0,]
coef8
(Intercept)          S3      SEX:S3      BMI:BP      BMI:S5       BP:S5 
 -24.191861   -0.233140   -0.220151    0.010343    1.020209    0.110179 
      S5:S6 
   0.007685 
newX <- model.matrix(Y ~ (.)^2 - 1, data = db.test)

df8 <- length(coef8)

pred8 <- predict(mod8, newX, type = "response")
dev8 <- pred8 - db.test$Y
rmspe8 <- sqrt(mean(dev8^2))

results <- rbind(results, c(df8, NA, NA, rmspe8))
rownames(results)[4] <- "LASSO"
results
             df  adjR2     press rmspe
Manual        9 0.5205 356054622 52.63
Best subsets  7 0.5351 347187035 53.39
Stepwise     17 0.5507 325947316 55.54
LASSO         7     NA        NA 51.48