1. Simple Linear Regression
  2. Multiple Linear Regression
    • Model Selection
  3. Multivariate Linear Regression
  4. ANOVA
    • Multiple Comparison Testing
  5. ANCOVA
  6. MANOVA
  7. MANCOVA
  8. Logistic Regression
  9. Poisson Regression

Load environment

rm(list=ls(all=TRUE))

# options
options(stringsAsFactors = FALSE)

Simple Linear Regression

# lets load the iris package
data("iris")

# Does Longer Sepal length mean longer Petal Length too?
linear.fit = lm(
  formula = Petal.Length ~ Sepal.Length,
  data = iris
)

# show results
summary(linear.fit)
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47747 -0.59072 -0.00668  0.60484  2.49512 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.10144    0.50666  -14.02   <2e-16 ***
## Sepal.Length  1.85843    0.08586   21.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8678 on 148 degrees of freedom
## Multiple R-squared:   0.76,  Adjusted R-squared:  0.7583 
## F-statistic: 468.6 on 1 and 148 DF,  p-value: < 2.2e-16

Multiple Linear Regression

# Can we explain/predict the petal length using all other variables in the iris data
multiple.linear.fit = lm(
  formula = Petal.Length ~ Sepal.Length + Sepal.Width + Petal.Width + Species,
  data = iris
)

# show results
summary(multiple.linear.fit)
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length + Sepal.Width + Petal.Width + 
##     Species, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78396 -0.15708  0.00193  0.14730  0.65418 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.11099    0.26987  -4.117 6.45e-05 ***
## Sepal.Length       0.60801    0.05024  12.101  < 2e-16 ***
## Sepal.Width       -0.18052    0.08036  -2.246   0.0262 *  
## Petal.Width        0.60222    0.12144   4.959 1.97e-06 ***
## Speciesversicolor  1.46337    0.17345   8.437 3.14e-14 ***
## Speciesvirginica   1.97422    0.24480   8.065 2.60e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2627 on 144 degrees of freedom
## Multiple R-squared:  0.9786, Adjusted R-squared:  0.9778 
## F-statistic:  1317 on 5 and 144 DF,  p-value: < 2.2e-16

Model Selection

step.fit = stepAIC(
  object = multiple.linear.fit,
  direction = 'both' # "forward" and "backward" are also possible
)
## Start:  AIC=-395.12
## Petal.Length ~ Sepal.Length + Sepal.Width + Petal.Width + Species
## 
##                Df Sum of Sq     RSS     AIC
## <none>                       9.9397 -395.12
## - Sepal.Width   1    0.3484 10.2880 -391.95
## - Petal.Width   1    1.6974 11.6371 -373.47
## - Species       2    4.9133 14.8529 -338.87
## - Sepal.Length  1   10.1075 20.0472 -291.88
summary(step.fit)
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length + Sepal.Width + Petal.Width + 
##     Species, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78396 -0.15708  0.00193  0.14730  0.65418 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.11099    0.26987  -4.117 6.45e-05 ***
## Sepal.Length       0.60801    0.05024  12.101  < 2e-16 ***
## Sepal.Width       -0.18052    0.08036  -2.246   0.0262 *  
## Petal.Width        0.60222    0.12144   4.959 1.97e-06 ***
## Speciesversicolor  1.46337    0.17345   8.437 3.14e-14 ***
## Speciesvirginica   1.97422    0.24480   8.065 2.60e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2627 on 144 degrees of freedom
## Multiple R-squared:  0.9786, Adjusted R-squared:  0.9778 
## F-statistic:  1317 on 5 and 144 DF,  p-value: < 2.2e-16

Multivariate Linear Regression

# Predict Petal Length and Petal Width using all othe variables in iris data set.
multivariate.fit = lm(
  data = iris,
  formula = cbind(Petal.Length, Petal.Width) ~ Sepal.Length + Sepal.Width + Species,
)
summary(multivariate.fit)
## Response Petal.Length :
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length + Sepal.Width + Species, 
##     data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75196 -0.18755  0.00432  0.16965  0.79580 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.63430    0.26783  -6.102 9.08e-09 ***
## Sepal.Length       0.64631    0.05353  12.073  < 2e-16 ***
## Sepal.Width       -0.04058    0.08113  -0.500    0.618    
## Speciesversicolor  2.17023    0.10657  20.364  < 2e-16 ***
## Speciesvirginica   3.04911    0.12267  24.857  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2833 on 145 degrees of freedom
## Multiple R-squared:  0.9749, Adjusted R-squared:  0.9742 
## F-statistic:  1410 on 4 and 145 DF,  p-value: < 2.2e-16
## 
## 
## Response Petal.Width :
## 
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Sepal.Width + Species, 
##     data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50805 -0.10042 -0.01221  0.11416  0.46455 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.86897    0.16985  -5.116 9.73e-07 ***
## Sepal.Length       0.06360    0.03395   1.873    0.063 .  
## Sepal.Width        0.23237    0.05145   4.516 1.29e-05 ***
## Speciesversicolor  1.17375    0.06758  17.367  < 2e-16 ***
## Speciesvirginica   1.78487    0.07779  22.944  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1797 on 145 degrees of freedom
## Multiple R-squared:  0.9459, Adjusted R-squared:  0.9444 
## F-statistic: 634.3 on 4 and 145 DF,  p-value: < 2.2e-16

ANOVA

# H0 = The dependent variable is equal in all groups of the independent variable.
# HA = The dependent variable is not equal in all groups of the independent variable.
lm.fit=lm(
  data = iris,
  formula = Sepal.Length~factor(Species)
)
aov.fit=aov(lm.fit)
summary(aov.fit)
##                  Df Sum Sq Mean Sq F value Pr(>F)    
## factor(Species)   2  63.21  31.606   119.3 <2e-16 ***
## Residuals       147  38.96   0.265                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Multiple Comparison Testing

TukeyHSD(aov.fit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = lm.fit)
## 
## $`factor(Species)`
##                       diff       lwr       upr p adj
## versicolor-setosa    0.930 0.6862273 1.1737727     0
## virginica-setosa     1.582 1.3382273 1.8257727     0
## virginica-versicolor 0.652 0.4082273 0.8957727     0

ANCOVA

ancova.lm.fit=lm(
  data = iris,
  formula = Sepal.Length~factor(Species)
)
ancova.fit=aov(ancova.lm.fit)
summary(ancova.fit)
##                  Df Sum Sq Mean Sq F value Pr(>F)    
## factor(Species)   2  63.21  31.606   119.3 <2e-16 ***
## Residuals       147  38.96   0.265                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(ancova.fit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = ancova.lm.fit)
## 
## $`factor(Species)`
##                       diff       lwr       upr p adj
## versicolor-setosa    0.930 0.6862273 1.1737727     0
## virginica-setosa     1.582 1.3382273 1.8257727     0
## virginica-versicolor 0.652 0.4082273 0.8957727     0

MANOVA

manova.lm.fit=lm(
  data = iris,
  formula = cbind(Sepal.Length, Sepal.Width)~factor(Species)
)
manova.fit=aov(manova.lm.fit)
summary(manova.fit)
##  Response Sepal.Length :
##                  Df Sum Sq Mean Sq F value    Pr(>F)    
## factor(Species)   2 63.212  31.606  119.26 < 2.2e-16 ***
## Residuals       147 38.956   0.265                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Sepal.Width :
##                  Df Sum Sq Mean Sq F value    Pr(>F)    
## factor(Species)   2 11.345  5.6725   49.16 < 2.2e-16 ***
## Residuals       147 16.962  0.1154                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

MANCOVA

mancova.lm.fit=lm(
  data = iris,
  formula = cbind(Sepal.Length, Sepal.Width)~factor(Species)+ Petal.Width + Petal.Length ,
)
mancova.fit=aov(mancova.lm.fit)
summary(mancova.fit)
##  Response Sepal.Length :
##                  Df Sum Sq Mean Sq F value    Pr(>F)    
## factor(Species)   2 63.212 31.6061  274.73 < 2.2e-16 ***
## Petal.Width       1  5.176  5.1759   44.99 4.125e-10 ***
## Petal.Length      1 17.099 17.0988  148.63 < 2.2e-16 ***
## Residuals       145 16.681  0.1150                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Sepal.Width :
##                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(Species)   2 11.3449  5.6725 64.7229 < 2.2e-16 ***
## Petal.Width       1  3.7554  3.7554 42.8496 9.548e-10 ***
## Petal.Length      1  0.4984  0.4984  5.6871   0.01838 *  
## Residuals       145 12.7081  0.0876                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Logistic Regression

# load the melanoma data
data("Melanoma")

# fit a logistic regression with all variables that 
logistic.fit = glm(
  data = Melanoma, 
  formula = ulcer ~ .,
  family = 'binomial'
)

# Perform stepwise variable selection
stepwise.logistic.fit = stepAIC(
  object = logistic.fit,
  direction = 'both'
)
## Start:  AIC=237.18
## ulcer ~ time + status + sex + age + year + thickness
## 
##             Df Deviance    AIC
## - age        1   223.23 235.23
## - year       1   223.40 235.40
## - sex        1   223.69 235.69
## <none>           223.19 237.19
## - time       1   226.06 238.06
## - status     1   226.70 238.70
## - thickness  1   250.89 262.89
## 
## Step:  AIC=235.23
## ulcer ~ time + status + sex + year + thickness
## 
##             Df Deviance    AIC
## - year       1   223.43 233.43
## - sex        1   223.73 233.73
## <none>           223.23 235.23
## - time       1   226.31 236.31
## - status     1   226.70 236.70
## + age        1   223.19 237.19
## - thickness  1   252.03 262.03
## 
## Step:  AIC=233.43
## ulcer ~ time + status + sex + thickness
## 
##             Df Deviance    AIC
## - sex        1   223.94 231.94
## <none>           223.43 233.43
## - time       1   226.88 234.88
## + year       1   223.23 235.23
## + age        1   223.40 235.40
## - status     1   228.12 236.12
## - thickness  1   254.88 262.88
## 
## Step:  AIC=231.94
## ulcer ~ time + status + thickness
## 
##             Df Deviance    AIC
## <none>           223.94 231.94
## + sex        1   223.43 233.43
## - time       1   227.62 233.62
## + year       1   223.73 233.73
## + age        1   223.90 233.90
## - status     1   228.68 234.68
## - thickness  1   258.08 264.08
# show the model summary
summary(stepwise.logistic.fit)
## 
## Call:
## glm(formula = ulcer ~ time + status + thickness, family = "binomial", 
##     data = Melanoma)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3190  -0.7996  -0.5980   0.9212   1.9530  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.5179847  0.6504255   0.796   0.4258    
## time        -0.0003055  0.0001610  -1.897   0.0578 .  
## status      -0.6787092  0.3145346  -2.158   0.0309 *  
## thickness    0.4080844  0.0851858   4.791 1.66e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 281.13  on 204  degrees of freedom
## Residual deviance: 223.94  on 201  degrees of freedom
## AIC: 231.94
## 
## Number of Fisher Scoring iterations: 5

Poisson Regression

# load warpbreaks data
data("warpbreaks")

# fit poisson regression
poisson.fit = glm(
  data = warpbreaks, 
  family = 'poisson',
  formula = breaks ~ wool + tension
)
summary(poisson.fit)
## 
## Call:
## glm(formula = breaks ~ wool + tension, family = "poisson", data = warpbreaks)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6871  -1.6503  -0.4269   1.1902   4.2616  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.69196    0.04541  81.302  < 2e-16 ***
## woolB       -0.20599    0.05157  -3.994 6.49e-05 ***
## tensionM    -0.32132    0.06027  -5.332 9.73e-08 ***
## tensionH    -0.51849    0.06396  -8.107 5.21e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 297.37  on 53  degrees of freedom
## Residual deviance: 210.39  on 50  degrees of freedom
## AIC: 493.06
## 
## Number of Fisher Scoring iterations: 4