Exercise: The built-in dataset `cars` contains two columns: speed (`speed`) and stopping distance (`dist`). Fit linear and quadratic regression models and predict the `dist` value when `speed` is 40. Proposed answer:
# Inspect the built-in `cars` dataset and scatter-plot speed vs stopping distance.
?cars
plot(cars, pch = 21)

# Linear regression: stopping distance as a linear function of speed.
model_lin <- lm(dist ~ speed, data = cars)
abline(model_lin, col = "darkred", lwd = 2)
summary(model_lin)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12

# Quadratic regression: add a squared speed term via I().
model_qua <- lm(dist ~ speed + I(speed^2), data = cars)
# Overlay the fitted parabola by evaluating predict() along x.
curve(predict(model_qua, data.frame(speed = x)),
      add = TRUE,
      col = "blue",
      lwd = 2)

# Annotate which curve belongs to which model.
legend("topleft",
       legend = c("linear regression model", "quadratic regression model"),
       col = c("darkred", "blue"),
       lty = 1,
       lwd = 2)
summary(model_qua)
##
## Call:
## lm(formula = dist ~ speed + I(speed^2), data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.720 -9.184 -3.188 4.628 45.152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.47014 14.81716 0.167 0.868
## speed 0.91329 2.03422 0.449 0.656
## I(speed^2) 0.09996 0.06597 1.515 0.136
##
## Residual standard error: 15.18 on 47 degrees of freedom
## Multiple R-squared: 0.6673, Adjusted R-squared: 0.6532
## F-statistic: 47.14 on 2 and 47 DF, p-value: 5.852e-12

# Judging by R^2, the quadratic model fits slightly better.
# Predict stopping distance at speed = 40.
predict(model_qua, data.frame(speed = 40))
## 1
## 198.9365
Exercise: Using the `Boston` dataset from the `MASS` package, fit polynomial regression models (with `lstat` as the dependent variable and `medv` as the independent variable) with different degrees of the polynomial. In your opinion, which is the best? (Use measures from the lecture.) Predict the `lstat` value where `medv` = 50.04. Proposed answer:
# install.packages("MASS")
library(MASS)
?Boston
# Scatter-plot: lstat (response) against medv (predictor).
plot(Boston$medv, Boston$lstat)

# Polynomial of degree 2.
model1 <- lm(lstat ~ medv + I(medv^2), data = Boston)
# Overlay each fitted polynomial by evaluating predict() along x.
curve(predict(model1, data.frame(medv = x)),
      add = TRUE,
      col = "darkred",
      lwd = 2)
summary(model1)
##
## Call:
## lm(formula = lstat ~ medv + I(medv^2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.688 -2.550 -0.443 1.675 19.529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.047643 1.044126 37.40 <2e-16 ***
## medv -1.715027 0.081046 -21.16 <2e-16 ***
## I(medv^2) 0.020687 0.001424 14.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.055 on 503 degrees of freedom
## Multiple R-squared: 0.6789, Adjusted R-squared: 0.6776
## F-statistic: 531.7 on 2 and 503 DF, p-value: < 2.2e-16

# Polynomial of degree 3.
model2 <- lm(lstat ~ medv + I(medv^2) + I(medv^3), data = Boston)
curve(predict(model2, data.frame(medv = x)),
      add = TRUE,
      col = "blue",
      lwd = 2)
summary(model2)
##
## Call:
## lm(formula = lstat ~ medv + I(medv^2) + I(medv^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.7027 -2.5645 -0.4597 1.7146 19.5425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.993e+01 2.073e+00 19.259 < 2e-16 ***
## medv -1.841e+00 2.680e-01 -6.868 1.93e-11 ***
## I(medv^2) 2.586e-02 1.062e-02 2.435 0.0152 *
## I(medv^3) -6.189e-05 1.259e-04 -0.492 0.6232
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.058 on 502 degrees of freedom
## Multiple R-squared: 0.679, Adjusted R-squared: 0.6771
## F-statistic: 354 on 3 and 502 DF, p-value: < 2.2e-16

# Polynomial of degree 4.
model3 <- lm(lstat ~ medv + I(medv^2) + I(medv^3) + I(medv^4), data = Boston)
curve(predict(model3, data.frame(medv = x)),
      add = TRUE,
      col = "green",
      lwd = 2)
legend("topright",
       legend = c("2th level", "3th level", "4th level"),
       col = c("darkred", "blue", "green"),
       lty = 1,
       lwd = 2)
summary(model3)
##
## Call:
## lm(formula = lstat ~ medv + I(medv^2) + I(medv^3) + I(medv^4),
## data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1885 -2.4067 -0.6392 1.7855 19.9394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.962e+01 4.031e+00 7.347 8.27e-13 ***
## medv 2.896e-01 7.641e-01 0.379 0.70488
## I(medv^2) -1.175e-01 4.934e-02 -2.381 0.01764 *
## I(medv^3) 3.758e-03 1.291e-03 2.912 0.00375 **
## I(medv^4) -3.460e-05 1.163e-05 -2.974 0.00308 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.027 on 501 degrees of freedom
## Multiple R-squared: 0.6846, Adjusted R-squared: 0.6821
## F-statistic: 271.8 on 4 and 501 DF, p-value: < 2.2e-16

# In my opinion, the degree-2 model is sufficient:
# the differences in (adjusted) R^2 between the models are very small.
# Predict lstat at medv = 50.04 with the degree-2 model.
predict(model1, data.frame(medv = 50.04))
## 1
## 5.02819
Exercise: Use the `marketing` dataset from the `datarium` package. Read more information about this dataset (`?marketing`). Create a multiple regression model using method 1 or method 2 from the lecture. Predict the `sales` value for your selected variable values. Proposed answer:
# install.packages("datarium")
library(datarium)
?marketing
# Full multiple regression model: sales on all remaining columns.
model <- lm(sales ~ ., data = marketing)
summary(model)
##
## Call:
## lm(formula = sales ~ ., data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5932 -1.0690 0.2902 1.4272 3.3951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.526667 0.374290 9.422 <2e-16 ***
## youtube 0.045765 0.001395 32.809 <2e-16 ***
## facebook 0.188530 0.008611 21.893 <2e-16 ***
## newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16

# Method 2 from the lecture: AIC-based stepwise selection.
model2 <- step(model)
## Start: AIC=285.72
## sales ~ youtube + facebook + newspaper
##
## Df Sum of Sq RSS AIC
## - newspaper 1 0.1 802.0 283.75
## <none> 801.8 285.72
## - facebook 1 1960.9 2762.7 531.13
## - youtube 1 4403.5 5205.4 657.83
##
## Step: AIC=283.75
## sales ~ youtube + facebook
##
## Df Sum of Sq RSS AIC
## <none> 802.0 283.75
## - facebook 1 2225.7 3027.6 547.44
## - youtube 1 4408.7 5210.6 656.03
summary(model2) # the best model
##
## Call:
## lm(formula = sales ~ youtube + facebook, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5572 -1.0502 0.2906 1.4049 3.3994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50532 0.35339 9.919 <2e-16 ***
## youtube 0.04575 0.00139 32.909 <2e-16 ***
## facebook 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.018 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16

# Predict sales for a chosen advertising budget.
new_obs <- data.frame(youtube = 150, facebook = 25)
predict(model2, new_obs)
## 1
## 15.0684
Exercise: Using the `Affairs` dataset, fit a logistic regression model and predict the probability of an affair when (a) `rating` is 3, `age` is 20, `yearsmarried` is 2, `gender` is male and `religiousness` is 1; and (b) `rating` is 5, `age` is 80, `yearsmarried` is 10 or 20, `gender` is female and `religiousness` is 3. Proposed answer:
library(AER)
data(Affairs)
library(dplyr)
?Affairs
# Recode the affair count into a binary response (1 = at least one affair).
Affairs <- Affairs %>%
  mutate(ynaffairs = as.numeric(affairs > 0))

# Logistic regression on all candidate predictors.
model_glm <- glm(
  ynaffairs ~ gender + age + yearsmarried + children +
    religiousness + education + occupation + rating,
  data = Affairs,
  family = "binomial"
)

# AIC-based stepwise selection to prune the model.
model_glm2 <- step(model_glm)
## Start: AIC=627.51
## ynaffairs ~ gender + age + yearsmarried + children + religiousness +
## education + occupation + rating
##
## Df Deviance AIC
## - education 1 609.68 625.68
## - occupation 1 609.70 625.70
## - gender 1 610.89 626.89
## - children 1 611.40 627.40
## <none> 609.51 627.51
## - age 1 615.67 631.67
## - yearsmarried 1 618.34 634.34
## - religiousness 1 622.92 638.92
## - rating 1 636.75 652.75
##
## Step: AIC=625.68
## ynaffairs ~ gender + age + yearsmarried + children + religiousness +
## occupation + rating
##
## Df Deviance AIC
## - occupation 1 610.15 624.15
## - gender 1 611.29 625.29
## - children 1 611.62 625.62
## <none> 609.68 625.68
## - age 1 615.78 629.78
## - yearsmarried 1 618.46 632.46
## - religiousness 1 623.27 637.27
## - rating 1 636.93 650.93
##
## Step: AIC=624.15
## ynaffairs ~ gender + age + yearsmarried + children + religiousness +
## rating
##
## Df Deviance AIC
## - children 1 611.86 623.86
## <none> 610.15 624.15
## - gender 1 613.41 625.41
## - age 1 616.00 628.00
## - yearsmarried 1 619.07 631.07
## - religiousness 1 623.98 635.98
## - rating 1 637.23 649.23
##
## Step: AIC=623.86
## ynaffairs ~ gender + age + yearsmarried + religiousness + rating
##
## Df Deviance AIC
## <none> 611.86 623.86
## - gender 1 615.36 625.36
## - age 1 618.05 628.05
## - religiousness 1 625.57 635.57
## - yearsmarried 1 626.23 636.23
## - rating 1 639.93 649.93

# Scenario (a): predicted probability of an affair.
case_a <- data.frame(
  rating = 3,
  age = 20,
  yearsmarried = 2,
  religiousness = 1,
  gender = "male"
)
predict(model_glm2, newdata = case_a, type = "response")
## 1
## 0.4872761

# Scenario (b): two marriage lengths at once (10 and 20 years).
case_b <- data.frame(
  rating = 5,
  age = 80,
  yearsmarried = c(10, 20),
  religiousness = 3,
  gender = "female"
)
predict(model_glm2, newdata = case_b, type = "response")
## 1 2
## 0.02251841 0.06553592