Linear Models

Lab#3 – Multiple linear regression model 2

Author

Paulo Soares

Published

November 15, 2025

1 Heavy machinery

Experimental study carried out in a factory to understand how tool wear (y, quantitative, measured in mm) varies with tool speed (x1, quantitative, measured in rot/sec) and tool type (x2, qualitative with 3 levels: M1, M2 and M3). The data is available in the file machines.txt.

  1. Read and plot the data.

    # Read the machines data (space-separated) from the course website
    # and inspect variable ranges and types.
    url <- "https://web.tecnico.ulisboa.pt/paulo.soares/aml/data/machines.txt"
    md <- read.delim(url, sep = " ")
    summary(md)
           i               y               x1           x2           
     Min.   : 1.00   Min.   : 4.30   Min.   :200   Length:18         
     1st Qu.: 5.25   1st Qu.: 5.97   1st Qu.:200   Class :character  
     Median : 9.50   Median : 7.75   Median :300   Mode  :character  
     Mean   : 9.50   Mean   : 7.77   Mean   :300                     
     3rd Qu.:13.75   3rd Qu.: 9.30   3rd Qu.:400                     
     Max.   :18.00   Max.   :12.70   Max.   :400                     
    # Drop the row-index column i; it is not a covariate.
    md$i <- NULL
    # Base scatter plot (wear vs. speed, colored by tool type),
    # kept in `g` so fitted lines can be layered on it later.
    g <- ggplot(md) +
            geom_point(aes(x1, y, color = x2), size = 3, alpha = 0.5)
    g

  2. Fit a model with the quantitative covariate only and plot the fitted line over the data.

    # Model 0: wear regressed on speed only, pooling the three tool types.
    mod0 <- lm(y ~ x1, data = md)
    summary(mod0)
    
    Call:
    lm(formula = y ~ x1, data = md)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -1.972 -1.143 -0.114  0.899  2.736 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  1.19722    1.25644    0.95     0.35    
    x1           0.02192    0.00404    5.42  5.6e-05 ***
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 1.4 on 16 degrees of freedom
    Multiple R-squared:  0.648, Adjusted R-squared:  0.626 
    F-statistic: 29.4 on 1 and 16 DF,  p-value: 5.63e-05
    # Extract the fitted intercept and slope for plotting.
    coef0 <- coef(mod0)
    coef0
    (Intercept)          x1 
        1.19722     0.02192 
    # Plot the single fitted line over all points (types pooled).
    ggplot(md) +
      geom_point(aes(x1, y), size = 3, alpha = 0.5) +
      geom_abline(intercept = coef0[1], slope = coef0[2])

    # Same fitted line layered on the color-coded base plot.
    g + geom_abline(intercept = coef0[1], slope = coef0[2])

    # Alternative: let stat_smooth fit and draw the lm line itself.
    g + stat_smooth(aes(x1, y), method = "lm", formula = 'y ~ x', se = FALSE, color = "black")
  3. Fit a model with both variables using the label encoding (0, 1, 2) for the categorical one. Produce a plot of the fitted model and comment.

    # Label encoding: map tool type to 0/1/2. Caution: this forces the
    # machine effects to be ordered and equally spaced - a strong assumption.
    md$x3 <- ifelse(md$x2 == "M1", 0, ifelse(md$x2 == "M2", 1, 2))
    
    mod1 <- lm(y ~ x1 + x3, data = md)
    summary(mod1)
    
    Call:
    lm(formula = y ~ x1 + x3, data = md)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -1.972 -0.993  0.257  0.817  1.961 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  0.42222    1.19624    0.35    0.729    
    x1           0.02192    0.00366    5.98  2.5e-05 ***
    x3           0.77500    0.36627    2.12    0.051 .  
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 1.27 on 15 degrees of freedom
    Multiple R-squared:  0.729, Adjusted R-squared:  0.692 
    F-statistic: 20.1 on 2 and 15 DF,  p-value: 5.64e-05
    # Intercept, common slope, and the per-level intercept step (x3).
    coef1 <- coef(mod1)
    coef1
    (Intercept)          x1          x3 
        0.42222     0.02192     0.77500 
    # First 3 default ggplot colors, to match the base plot's legend.
    colors <- scales::hue_pal()(3)
    
    # One line per tool type: common slope, intercepts forced to be
    # equally spaced (0, d, 2d) by the 0/1/2 label encoding of x3.
    g + geom_abline(intercept = coef1[1], slope = coef1[2], color = colors[1]) +
      geom_abline(intercept = coef1[1] + coef1[3], slope = coef1[2], color = colors[2]) +
      geom_abline(intercept = coef1[1] + 2 * coef1[3], slope = coef1[2], color = colors[3])

  4. Fit a model with both variables using 3 dummy variables to encode the categorical one.

    # One 0/1 dummy per level. Including all three alongside an intercept
    # creates perfect collinearity (x21 + x22 + x23 = 1), so lm drops one
    # coefficient - the "dummy variable trap" seen as NA in the summary.
    md$x21 <- ifelse(md$x2 == "M1", 1, 0)
    md$x22 <- ifelse(md$x2 == "M2", 1, 0)
    md$x23 <- ifelse(md$x2 == "M3", 1, 0)
    
    mod2 <- lm(y ~ x1 + x21 + x22 + x23, data = md)
    summary(mod2)
    
    Call:
    lm(formula = y ~ x1 + x21 + x22 + x23, data = md)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -1.142 -0.352 -0.025  0.215  1.275 
    
    Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  2.65833    0.68665    3.87   0.0017 ** 
    x1           0.02192    0.00207   10.59  4.6e-08 ***
    x21         -1.55000    0.41406   -3.74   0.0022 ** 
    x22         -2.83333    0.41406   -6.84  8.0e-06 ***
    x23               NA         NA      NA       NA    
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 0.717 on 14 degrees of freedom
    Multiple R-squared:  0.919, Adjusted R-squared:  0.902 
    F-statistic:   53 on 3 and 14 DF,  p-value: 6.88e-08
  5. Fit a model with both variables using the reference level encoding for the categorical one. Produce a plot of the fitted model.

    # Reference-level (treatment) encoding: R builds it automatically once
    # x2 is a factor; the first level (M1) becomes the baseline.
    md$x4 <- as.factor(md$x2)
    contrasts(md$x4)
       M2 M3
    M1  0  0
    M2  1  0
    M3  0  1
    # Additive model: common slope, one free intercept offset per level.
    mod3 <- lm(y ~ x1 + x4, data = md)
    summary(mod3)
    
    Call:
    lm(formula = y ~ x1 + x4, data = md)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -1.142 -0.352 -0.025  0.215  1.275 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  1.10833    0.68665    1.61   0.1288    
    x1           0.02192    0.00207   10.59  4.6e-08 ***
    x4M2        -1.28333    0.41406   -3.10   0.0078 ** 
    x4M3         1.55000    0.41406    3.74   0.0022 ** 
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 0.717 on 14 degrees of freedom
    Multiple R-squared:  0.919, Adjusted R-squared:  0.902 
    F-statistic:   53 on 3 and 14 DF,  p-value: 6.88e-08
    # Intercept is the M1 line; x4M2/x4M3 are intercept offsets from M1.
    coef3 <- coef(mod3)
    coef3
    (Intercept)          x1        x4M2        x4M3 
        1.10833     0.02192    -1.28333     1.55000 
    # Three parallel fitted lines: same slope, level-specific intercepts.
    g + geom_abline(intercept = coef3[1], slope = coef3[2], color = colors[1]) +
      geom_abline(intercept = coef3[1] + coef3[3], slope = coef3[2], color = colors[2]) +
      geom_abline(intercept = coef3[1] + coef3[4], slope = coef3[2], color = colors[3])

  6. Fit a model with both variables using the reference level encoding for the categorical one and including interactions. Produce a plot of the fitted model and compare it with the previous one.

    # Interaction model: y ~ x1 * x4 expands to x1 + x4 + x1:x4, giving
    # each tool type its own slope as well as its own intercept.
    mod4 <- lm(y ~ x1 * x4, data = md)
    summary(mod4)
    
    Call:
    lm(formula = y ~ x1 * x4, data = md)
    
    Residuals:
        Min      1Q  Median      3Q     Max 
    -1.0083 -0.2250 -0.0792  0.4292  0.9917 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
    (Intercept)  1.15833    1.14229    1.01     0.33    
    x1           0.02175    0.00367    5.92    7e-05 ***
    x4M2        -0.45833    1.61545   -0.28     0.78    
    x4M3         0.57500    1.61545    0.36     0.73    
    x1:x4M2     -0.00275    0.00520   -0.53     0.61    
    x1:x4M3      0.00325    0.00520    0.63     0.54    
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 0.735 on 12 degrees of freedom
    Multiple R-squared:  0.927, Adjusted R-squared:  0.897 
    F-statistic: 30.6 on 5 and 12 DF,  p-value: 1.98e-06
    # x1:x4M2 and x1:x4M3 are slope offsets relative to the M1 slope.
    coef4 <- coef(mod4)
    coef4
    (Intercept)          x1        x4M2        x4M3     x1:x4M2     x1:x4M3 
        1.15833     0.02175    -0.45833     0.57500    -0.00275     0.00325 
    # Fitted line per tool type: intercept AND slope offsets come from the
    # dummy and interaction coefficients respectively.
    g + geom_abline(intercept = coef4[1], slope = coef4[2], color = colors[1]) +
      geom_abline(intercept = coef4[1] + coef4[3], slope = coef4[2] + coef4[5], color = colors[2]) +
      geom_abline(intercept = coef4[1] + coef4[4], slope = coef4[2] + coef4[6], color = colors[3])

    # Alternative: let ggplot fit one lm per color group directly.
    # (Fixed: a stray empty argument after `method = "lm"` made this call
    # error with "argument ... can't be empty".)
    g + stat_smooth(aes(x1, y, color = x2), method = "lm", formula = 'y ~ x', se = FALSE)
    # Partial F-test: do the interaction terms significantly reduce the RSS?
    anova(mod3, mod4)
    Analysis of Variance Table
    
    Model 1: y ~ x1 + x4
    Model 2: y ~ x1 * x4
      Res.Df  RSS Df Sum of Sq    F Pr(>F)
    1     14 7.20                         
    2     12 6.48  2     0.722 0.67   0.53

    Note that this last result alone should not be used to discard the larger model. The non-significant partial F-test (p ≈ 0.53) only says that the interaction terms do not reduce the residual sum of squares enough to justify rejecting the simpler, parallel-lines model; it is not evidence that the interactions are exactly zero.