# Exercise 1: The `cars` dataset contains two columns: speed (`speed`) and
# stopping distance (`dist`). Fit linear and quadratic regression models and
# predict the `dist` value when `speed` is 40.
# Proposed answer:
# Open the help page for the built-in `cars` dataset (50 observations of
# speed [mph] and stopping distance [ft], recorded in the 1920s).
?cars
# Scatter plot of speed vs. stopping distance; pch = 21 draws circle markers.
plot(cars, pch = 21)
# linear regression model: dist as a linear function of speed
model_lin <- lm(dist ~ speed, data = cars)
# Overlay the fitted regression line on the scatter plot.
abline(model_lin, col = "darkred", lwd = 2)
summary(model_lin)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# quadratic regression model: adds a speed^2 term (I() protects ^ from
# being interpreted as formula syntax)
model_qua <- lm(dist ~ speed + I(speed^2), data = cars)
# Extract the fitted coefficients (intercept, speed, speed^2) so the
# quadratic curve can be drawn explicitly.
coefs <- coef(model_qua)
# Overlay the fitted quadratic curve on the existing plot (add = TRUE).
curve(coefs[1] + coefs[2] * x + coefs[3] * x^2,
add = TRUE,
col = "blue",
lwd = 2)
# adding legend to plot
legend("topleft",
legend = c("linear regression model", "quadratic regression model"),
col = c("darkred", "blue"),
lty = 1,
lwd = 2)
summary(model_qua)
##
## Call:
## lm(formula = dist ~ speed + I(speed^2), data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.720 -9.184 -3.188 4.628 45.152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.47014 14.81716 0.167 0.868
## speed 0.91329 2.03422 0.449 0.656
## I(speed^2) 0.09996 0.06597 1.515 0.136
##
## Residual standard error: 15.18 on 47 degrees of freedom
## Multiple R-squared: 0.6673, Adjusted R-squared: 0.6532
## F-statistic: 47.14 on 2 and 47 DF, p-value: 5.852e-12
# Comparing R^2 (0.6673 vs 0.6511), the quadratic model fits slightly better.
# prediction: expected stopping distance at speed = 40 (an extrapolation —
# the data only covers speeds up to 25)
predict(model_qua, list(speed = 40))
## 1
## 198.9365
# Exercise 2: Using the `Boston` dataset from the `MASS` package, fit
# polynomial regression models (with `lstat` as the dependent variable and
# `medv` as the independent variable) with different degrees of the
# polynomial. In your opinion, which is the best? (use measures from the
# lecture) Predict the `lstat` value where `medv` = 50.04.
# Proposed answer:
# Exercise 2 solution: polynomial regression of lstat (% lower status of the
# population) on medv (median home value) in the Boston housing data.
#install.packages("MASS")
library(MASS)
?Boston
# Scatter plot of the data the polynomial curves are fitted to.
plot(Boston$medv, Boston$lstat)
# 2nd-degree polynomial (I() protects ^ from formula interpretation)
model1 <- lm(lstat ~ medv + I(medv^2), data = Boston)
# Extract fitted coefficients so the curve can be drawn explicitly.
coefs <- coef(model1)
# Overlay the fitted 2nd-degree curve on the scatter plot.
curve(coefs[1] + coefs[2] * x + coefs[3] * x^2,
      add = TRUE,
      col = "darkred",
      lwd = 2)
summary(model1)
##
## Call:
## lm(formula = lstat ~ medv + I(medv^2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.688 -2.550 -0.443 1.675 19.529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.047643 1.044126 37.40 <2e-16 ***
## medv -1.715027 0.081046 -21.16 <2e-16 ***
## I(medv^2) 0.020687 0.001424 14.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.055 on 503 degrees of freedom
## Multiple R-squared: 0.6789, Adjusted R-squared: 0.6776
## F-statistic: 531.7 on 2 and 503 DF, p-value: < 2.2e-16
# 3rd-degree polynomial
model2 <- lm(lstat ~ medv + I(medv^2) + I(medv^3), data = Boston)
coefs <- coef(model2)
# Overlay the fitted 3rd-degree curve.
curve(coefs[1] + coefs[2] * x + coefs[3] * x^2 + coefs[4] * x^3,
      add = TRUE,
      col = "blue",
      lwd = 2)
summary(model2)
##
## Call:
## lm(formula = lstat ~ medv + I(medv^2) + I(medv^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.7027 -2.5645 -0.4597 1.7146 19.5425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.993e+01 2.073e+00 19.259 < 2e-16 ***
## medv -1.841e+00 2.680e-01 -6.868 1.93e-11 ***
## I(medv^2) 2.586e-02 1.062e-02 2.435 0.0152 *
## I(medv^3) -6.189e-05 1.259e-04 -0.492 0.6232
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.058 on 502 degrees of freedom
## Multiple R-squared: 0.679, Adjusted R-squared: 0.6771
## F-statistic: 354 on 3 and 502 DF, p-value: < 2.2e-16
# 4th-degree polynomial
model3 <- lm(lstat ~ medv + I(medv^2) + I(medv^3) + I(medv^4), data = Boston)
coefs <- coef(model3)
# Overlay the fitted 4th-degree curve.
curve(coefs[1] + coefs[2] * x + coefs[3] * x^2 + coefs[4] * x^3 + coefs[5] * x^4,
      add = TRUE,
      col = "green",
      lwd = 2)
legend("topright",
       legend = c("2nd degree", "3rd degree", "4th degree"),
       col = c("darkred", "blue", "green"),
       lty = 1,
       lwd = 2)
summary(model3)
##
## Call:
## lm(formula = lstat ~ medv + I(medv^2) + I(medv^3) + I(medv^4),
## data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1885 -2.4067 -0.6392 1.7855 19.9394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.962e+01 4.031e+00 7.347 8.27e-13 ***
## medv 2.896e-01 7.641e-01 0.379 0.70488
## I(medv^2) -1.175e-01 4.934e-02 -2.381 0.01764 *
## I(medv^3) 3.758e-03 1.291e-03 2.912 0.00375 **
## I(medv^4) -3.460e-05 1.163e-05 -2.974 0.00308 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.027 on 501 degrees of freedom
## Multiple R-squared: 0.6846, Adjusted R-squared: 0.6821
## F-statistic: 271.8 on 4 and 501 DF, p-value: < 2.2e-16
# In my opinion, the 2nd-degree model is sufficient: the adjusted R^2 values
# of the three models differ only marginally (0.6776 vs 0.6771 vs 0.6821),
# and the cubic term in the 3rd-degree model is not significant.
# prediction: lstat at medv = 50.04 using the chosen 2nd-degree model
predict(model1, list(medv = 50.04))
## 1
## 5.02819
# Exercise 3: Use the `marketing` dataset from the `datarium` package. Read
# more information about this dataset (`?marketing`). Create a multiple
# regression model using method 1 or method 2 from the lecture. Predict the
# `sales` value for your selected variable values.
# Proposed answer:
# Exercise 3 solution: multiple regression of sales on the three advertising
# budgets (youtube, facebook, newspaper) in the `marketing` dataset.
# install.packages("datarium")
library(datarium)
?marketing
# multiple regression model: `.` means all remaining columns as predictors
model <- lm(sales ~ ., data = marketing)
summary(model)
##
## Call:
## lm(formula = sales ~ ., data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5932 -1.0690 0.2902 1.4272 3.3951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.526667 0.374290 9.422 <2e-16 ***
## youtube 0.045765 0.001395 32.809 <2e-16 ***
## facebook 0.188530 0.008611 21.893 <2e-16 ***
## newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
# We use method 2 from the lecture: stepwise model selection by AIC, which
# drops the non-significant `newspaper` predictor.
model2 <- step(model)
## Start: AIC=285.72
## sales ~ youtube + facebook + newspaper
##
## Df Sum of Sq RSS AIC
## - newspaper 1 0.1 802.0 283.75
## <none> 801.8 285.72
## - facebook 1 1960.9 2762.7 531.13
## - youtube 1 4403.5 5205.4 657.83
##
## Step: AIC=283.75
## sales ~ youtube + facebook
##
## Df Sum of Sq RSS AIC
## <none> 802.0 283.75
## - facebook 1 2225.7 3027.6 547.44
## - youtube 1 4408.7 5210.6 656.03
summary(model2) # the best model
##
## Call:
## lm(formula = sales ~ youtube + facebook, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5572 -1.0502 0.2906 1.4049 3.3994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50532 0.35339 9.919 <2e-16 ***
## youtube 0.04575 0.00139 32.909 <2e-16 ***
## facebook 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.018 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
# prediction: expected sales for a youtube budget of 150 and a facebook
# budget of 25 (newspaper was dropped by the stepwise selection)
newdata <- data.frame(youtube = 150,
                      facebook = 25)
predict(model2, newdata)
## 1
## 15.0684
# Exercise 4: Using the `Affairs` dataset, fit a logistic regression model
# and predict the probability of an affair when:
#   (a) `rating` is 3, `age` is 20, `yearsmarried` is 2, `gender` is male
#       and `religiousness` is 1;
#   (b) `rating` is 5, `age` is 80, `yearsmarried` is 10 or 20, `gender` is
#       female and `religiousness` is 3.
# Proposed answer:
# Exercise 4 solution: logistic regression on the Fair's Affairs data.
library(AER)
data(Affairs)
library(dplyr)
?Affairs
# `affairs` counts affairs in the past year; collapse it into a binary
# response ynaffairs (1 = at least one affair, 0 = none).
Affairs <- Affairs %>%
mutate(ynaffairs = ifelse(affairs > 0, 1, 0))
# Full logistic regression model (binomial family = logit link by default)
# on all available predictors.
model_glm <- glm(ynaffairs ~ gender + age +
yearsmarried + children +
religiousness + education +
occupation + rating,
data = Affairs,
family = 'binomial')
# Stepwise selection by AIC; drops education, occupation and children,
# as the trace below shows.
model_glm2 <- step(model_glm)
## Start: AIC=627.51
## ynaffairs ~ gender + age + yearsmarried + children + religiousness +
## education + occupation + rating
##
## Df Deviance AIC
## - education 1 609.68 625.68
## - occupation 1 609.70 625.70
## - gender 1 610.89 626.89
## - children 1 611.40 627.40
## <none> 609.51 627.51
## - age 1 615.67 631.67
## - yearsmarried 1 618.34 634.34
## - religiousness 1 622.92 638.92
## - rating 1 636.75 652.75
##
## Step: AIC=625.68
## ynaffairs ~ gender + age + yearsmarried + children + religiousness +
## occupation + rating
##
## Df Deviance AIC
## - occupation 1 610.15 624.15
## - gender 1 611.29 625.29
## - children 1 611.62 625.62
## <none> 609.68 625.68
## - age 1 615.78 629.78
## - yearsmarried 1 618.46 632.46
## - religiousness 1 623.27 637.27
## - rating 1 636.93 650.93
##
## Step: AIC=624.15
## ynaffairs ~ gender + age + yearsmarried + children + religiousness +
## rating
##
## Df Deviance AIC
## - children 1 611.86 623.86
## <none> 610.15 624.15
## - gender 1 613.41 625.41
## - age 1 616.00 628.00
## - yearsmarried 1 619.07 631.07
## - religiousness 1 623.98 635.98
## - rating 1 637.23 649.23
##
## Step: AIC=623.86
## ynaffairs ~ gender + age + yearsmarried + religiousness + rating
##
## Df Deviance AIC
## <none> 611.86 623.86
## - gender 1 615.36 625.36
## - age 1 618.05 628.05
## - religiousness 1 625.57 635.57
## - yearsmarried 1 626.23 636.23
## - rating 1 639.93 649.93
# Scenario (a): young, newly married man with low religiousness and a
# below-average marriage rating.
newdata1 <- data.frame(rating = 3,
age = 20,
yearsmarried = 2,
religiousness = 1,
gender = "male")
# type = "response" returns the predicted probability (not the log-odds).
predict(model_glm2,
newdata = newdata1,
type = "response")
## 1
## 0.4872761
# Scenario (b): 80-year-old woman, happy marriage, moderate religiousness;
# two rows compare 10 vs 20 years married.
newdata2 <- data.frame(rating = 5,
age = 80,
yearsmarried = c(10, 20),
religiousness = 3,
gender = "female")
predict(model_glm2,
newdata = newdata2,
type = "response")
## 1 2
## 0.02251841 0.06553592