Linear Models

Lab#3 – Multiple linear regression model 2

Author

Paulo Soares

Published

November 15, 2025

1 Heavy machinery

Experimental study carried out in a factory to understand how tool wear (y, quantitative, measured in mm) varies with tool speed (x1, quantitative, measured in rot/sec) and tool type (x2, qualitative with 3 levels: M1, M2 and M3). The data is available in the file machines.txt.

  1. Read and plot the data.

    # Read the machines data (space-separated) from the course website
    # and inspect variable ranges and types.
    url <- "https://web.tecnico.ulisboa.pt/paulo.soares/aml/data/machines.txt"
    md <- read.delim(url, sep = " ")
    summary(md)
           i               y               x1           x2           
     Min.   : 1.00   Min.   : 4.30   Min.   :200   Length:18         
     1st Qu.: 5.25   1st Qu.: 5.97   1st Qu.:200   Class :character  
     Median : 9.50   Median : 7.75   Median :300   Mode  :character  
     Mean   : 9.50   Mean   : 7.77   Mean   :300                     
     3rd Qu.:13.75   3rd Qu.: 9.30   3rd Qu.:400                     
     Max.   :18.00   Max.   :12.70   Max.   :400                     
    # Drop the row-index column i; it is not a covariate.
    md$i <- NULL
    # Base scatter plot (wear vs. speed, colored by tool type),
    # kept in `g` so fitted lines can be layered on it later.
    g <- ggplot(md) +
            geom_point(aes(x1, y, color = x2), size = 3, alpha = 0.5)
    g

  2. Fit a model with the quantitative covariate only and plot the fitted line over the data.

    # Model 0: wear regressed on speed only, pooling the three tool types.
    mod0 <- lm(y ~ x1, data = md)
    summary(mod0)
    
    Call:
    lm(formula = y ~ x1, data = md)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -1.972 -1.143 -0.114  0.899  2.736 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  1.19722    1.25644    0.95     0.35    
    x1           0.02192    0.00404    5.42  5.6e-05 ***
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 1.4 on 16 degrees of freedom
    Multiple R-squared:  0.648, Adjusted R-squared:  0.626 
    F-statistic: 29.4 on 1 and 16 DF,  p-value: 5.63e-05
    # Extract the fitted intercept and slope for plotting.
    coef0 <- coef(mod0)
    coef0
    (Intercept)          x1 
        1.19722     0.02192 
    # Plot the single fitted line over all points (types pooled).
    ggplot(md) +
      geom_point(aes(x1, y), size = 3, alpha = 0.5) +
      geom_abline(intercept = coef0[1], slope = coef0[2])

    # Same fitted line layered on the color-coded base plot.
    g + geom_abline(intercept = coef0[1], slope = coef0[2])

    # Alternative: let stat_smooth fit and draw the lm line itself.
    g + stat_smooth(aes(x1, y), method = "lm", formula = 'y ~ x', se = FALSE, color = "black")
  3. Fit a model with both variables using the label encoding (0, 1, 2) for the categorical one. Produce a plot of the fitted model and comment.

    # Label encoding: map tool type to 0/1/2. Caution: this forces the
    # machine effects to be ordered and equally spaced - a strong assumption.
    md$x3 <- ifelse(md$x2 == "M1", 0, ifelse(md$x2 == "M2", 1, 2))
    
    mod1 <- lm(y ~ x1 + x3, data = md)
    summary(mod1)
    
    Call:
    lm(formula = y ~ x1 + x3, data = md)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -1.972 -0.993  0.257  0.817  1.961 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  0.42222    1.19624    0.35    0.729    
    x1           0.02192    0.00366    5.98  2.5e-05 ***
    x3           0.77500    0.36627    2.12    0.051 .  
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 1.27 on 15 degrees of freedom
    Multiple R-squared:  0.729, Adjusted R-squared:  0.692 
    F-statistic: 20.1 on 2 and 15 DF,  p-value: 5.64e-05
    # Intercept, common slope, and the per-level intercept step (x3).
    coef1 <- coef(mod1)
    coef1
    (Intercept)          x1          x3 
        0.42222     0.02192     0.77500 
    # First 3 default ggplot colors, to match the base plot's legend.
    colors <- scales::hue_pal()(3)
    
    # One line per tool type: common slope, intercepts forced to be
    # equally spaced (0, d, 2d) by the 0/1/2 label encoding of x3.
    g + geom_abline(intercept = coef1[1], slope = coef1[2], color = colors[1]) +
      geom_abline(intercept = coef1[1] + coef1[3], slope = coef1[2], color = colors[2]) +
      geom_abline(intercept = coef1[1] + 2 * coef1[3], slope = coef1[2], color = colors[3])

  4. Fit a model with both variables using 3 dummy variables to encode the categorical one.

    # One 0/1 dummy per level. Including all three alongside an intercept
    # creates perfect collinearity (x21 + x22 + x23 = 1), so lm drops one
    # coefficient - the "dummy variable trap" seen as NA in the summary.
    md$x21 <- ifelse(md$x2 == "M1", 1, 0)
    md$x22 <- ifelse(md$x2 == "M2", 1, 0)
    md$x23 <- ifelse(md$x2 == "M3", 1, 0)
    
    mod2 <- lm(y ~ x1 + x21 + x22 + x23, data = md)
    summary(mod2)
    
    Call:
    lm(formula = y ~ x1 + x21 + x22 + x23, data = md)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -1.142 -0.352 -0.025  0.215  1.275 
    
    Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  2.65833    0.68665    3.87   0.0017 ** 
    x1           0.02192    0.00207   10.59  4.6e-08 ***
    x21         -1.55000    0.41406   -3.74   0.0022 ** 
    x22         -2.83333    0.41406   -6.84  8.0e-06 ***
    x23               NA         NA      NA       NA    
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 0.717 on 14 degrees of freedom
    Multiple R-squared:  0.919, Adjusted R-squared:  0.902 
    F-statistic:   53 on 3 and 14 DF,  p-value: 6.88e-08
  5. Fit a model with both variables using the reference level encoding for the categorical one. Produce a plot of the fitted model.

    # Reference-level (treatment) encoding: R builds it automatically once
    # x2 is a factor; the first level (M1) becomes the baseline.
    md$x4 <- as.factor(md$x2)
    contrasts(md$x4)
       M2 M3
    M1  0  0
    M2  1  0
    M3  0  1
    # Additive model: common slope, one free intercept offset per level.
    mod3 <- lm(y ~ x1 + x4, data = md)
    summary(mod3)
    
    Call:
    lm(formula = y ~ x1 + x4, data = md)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -1.142 -0.352 -0.025  0.215  1.275 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  1.10833    0.68665    1.61   0.1288    
    x1           0.02192    0.00207   10.59  4.6e-08 ***
    x4M2        -1.28333    0.41406   -3.10   0.0078 ** 
    x4M3         1.55000    0.41406    3.74   0.0022 ** 
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 0.717 on 14 degrees of freedom
    Multiple R-squared:  0.919, Adjusted R-squared:  0.902 
    F-statistic:   53 on 3 and 14 DF,  p-value: 6.88e-08
    # Intercept is the M1 line; x4M2/x4M3 are intercept offsets from M1.
    coef3 <- coef(mod3)
    coef3
    (Intercept)          x1        x4M2        x4M3 
        1.10833     0.02192    -1.28333     1.55000 
    # Three parallel fitted lines: same slope, level-specific intercepts.
    g + geom_abline(intercept = coef3[1], slope = coef3[2], color = colors[1]) +
      geom_abline(intercept = coef3[1] + coef3[3], slope = coef3[2], color = colors[2]) +
      geom_abline(intercept = coef3[1] + coef3[4], slope = coef3[2], color = colors[3])

  6. Fit a model with both variables using the reference level encoding for the categorical one and including interactions. Produce a plot of the fitted model and compare it with the previous one.

    # Interaction model: y ~ x1 * x4 expands to x1 + x4 + x1:x4, giving
    # each tool type its own slope as well as its own intercept.
    mod4 <- lm(y ~ x1 * x4, data = md)
    summary(mod4)
    
    Call:
    lm(formula = y ~ x1 * x4, data = md)
    
    Residuals:
        Min      1Q  Median      3Q     Max 
    -1.0083 -0.2250 -0.0792  0.4292  0.9917 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  1.15833    1.14229    1.01     0.33    
    x1           0.02175    0.00367    5.92    7e-05 ***
    x4M2        -0.45833    1.61545   -0.28     0.78    
    x4M3         0.57500    1.61545    0.36     0.73    
    x1:x4M2     -0.00275    0.00520   -0.53     0.61    
    x1:x4M3      0.00325    0.00520    0.63     0.54    
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 0.735 on 12 degrees of freedom
    Multiple R-squared:  0.927, Adjusted R-squared:  0.897 
    F-statistic: 30.6 on 5 and 12 DF,  p-value: 1.98e-06
    # x1:x4M2 and x1:x4M3 are slope offsets relative to the M1 slope.
    coef4 <- coef(mod4)
    coef4
    (Intercept)          x1        x4M2        x4M3     x1:x4M2     x1:x4M3 
        1.15833     0.02175    -0.45833     0.57500    -0.00275     0.00325 
    # Fitted line per tool type: intercept AND slope offsets come from the
    # dummy and interaction coefficients respectively.
    g + geom_abline(intercept = coef4[1], slope = coef4[2], color = colors[1]) +
      geom_abline(intercept = coef4[1] + coef4[3], slope = coef4[2] + coef4[5], color = colors[2]) +
      geom_abline(intercept = coef4[1] + coef4[4], slope = coef4[2] + coef4[6], color = colors[3])

    # Alternative: let ggplot fit one lm per color group directly.
    # (Fixed: a stray empty argument after `method = "lm"` made this call
    # error with "argument ... can't be empty".)
    g + stat_smooth(aes(x1, y, color = x2), method = "lm", formula = 'y ~ x', se = FALSE)
    # Partial F-test: do the interaction terms significantly reduce the RSS?
    anova(mod3, mod4)
    Analysis of Variance Table
    
    Model 1: y ~ x1 + x4
    Model 2: y ~ x1 * x4
      Res.Df  RSS Df Sum of Sq    F Pr(>F)
    1     14 7.20                         
    2     12 6.48  2     0.722 0.67   0.53

    Note that this last result alone should not be used to discard the larger model. The non-significant partial F-test (p ≈ 0.53) only says that the interaction terms do not reduce the residual sum of squares enough to justify rejecting the simpler, parallel-lines model; it is not evidence that the interactions are exactly zero.