## assignment for lecture 9. To hand in before Tuesday 25th of November, 18.00
## (1)
# Have a look at built-in data set cars. This data set contains two variables:
# the speed that cars were driving when the brake was hit, and the distance (dist)
# that the car needed to come to a full stop.
# 1) plot the distance before stopping (y-axis) against the speed of the car (x-axis).
# Make both axes run from 0 to the maximum observed value on that axis.
# 2) Fit a linear model in which the distance is predicted by the speed
# and save to lmout1.
# 3) Add the regression line to the plot.
# 4) lmout1$fitted.values contains, for each case in the data set, the predicted distance
# given the speed, according to the linear model. Plot the predicted distances against
# the observed distances.
# 5) Add a green y = x line to this plot. (if the model fit was perfect, all points
# would lie on this line)
# =======================================================================================
#1)
# Scatterplot of stopping distance (y) against speed (x), both axes from 0
# up to the maximum observed value.
plot(cars$speed, cars$dist,
     xlab = "Speed", ylab = "Stopping distance",
     xlim = c(0, max(cars$speed)), ylim = c(0, max(cars$dist)))
#2)
# Linear model predicting distance from speed, saved as lmout1 (as asked).
lmout1 <- lm(dist ~ speed, data = cars)
#3)
# Add the fitted regression line to the scatterplot.
abline(lmout1)
#4)
# Predicted distances (y-axis) against the observed distances (x-axis).
plot(cars$dist, lmout1$fitted.values, pch = 19,
     xlab = "Observed distance", ylab = "Predicted distance")
#5)
# If the model fit were perfect, all points would lie on this y = x line.
abline(0, 1, col = "green")
# =========================================================================================
## 2
# Check the following code that simulates the effects of certain factors on success in an
# R-course:
# Simulate n students: IQ, creativity and motivation, plus an R-course score.
n <- 150
IQ <- rnorm(n, 100, 15)                        # IQ: mean 100, sd 15
creativity <- rnorm(n, 10, 3)                  # creativity: mean 10, sd 3
motivation <- sample(1:10, n, replace = TRUE)  # motivation on a 1-10 scale
# Standardize the predictors (mean 0, sd 1) so their simulated effects are
# on a comparable scale.
s.IQ <- scale(IQ)
s.creativity <- scale(creativity)
s.motivation <- scale(motivation)
Rscore <- s.creativity +    # main effect of creativity
  s.motivation * s.IQ +     # the higher your motivation, the more your IQ matters
  rnorm(n, 0, 3)            # and some noise
# 1) Why do you think I used scale (that does a standard normal transformation) on the
# predictors?
# 2) Create the full linear model predicting Rscore from the three predictors (s.IQ,
# s.creativity, and s.motivation) and all their interactions and save the output in
# lmout2.
# 3) Use stepwise selection of predictors on this output. Therefore, apply the function
# step() to the output of the lm call and save the output, which is the best model,
# in best.model2
# 4) Create the anova table for this best model. You might see that the effects don't turn
# out the way you expected (or not). Run the whole simulation and analysis a couple of
# times to see what chance does.
# 5) To not be dependent on chance, create and fit the following (true) model:
# truemodel = lm(Rscore ~ creativity + IQ : motivation). What is the estimate of the
# linear regression model now?
# (answer should not be code, but just the resulting model as a written formula,
# fill in:
# Rscore = ... + ... * ... + ... * ... * ...)
# ========================================================================================
#1)
# Creativity, IQ and motivation have different scales (magnitudes and spreads). In order
# to give them all a comparable magnitude of effect in the simulation, it is useful to
# give all three a mean of 0 and an sd of 1.
#2)
# Full factorial model: all three main effects and all their interactions,
# saved as lmout2 (as asked; also avoids reusing a generic name like `out`).
lmout2 <- lm(Rscore ~ s.creativity * s.IQ * s.motivation)
#3)
# AIC-based stepwise selection, starting from the full model.
best.model2 <- step(lmout2)
#4)
anova(best.model2)
#5)
# Depends on your simulation; fill in the dots with the estimated
# coefficients from truemodel = lm(Rscore ~ creativity + IQ:motivation),
# e.g. (with b0, b1, b2 the fitted estimates):
# Rscore = b0 + b1 * creativity + b2 * IQ * motivation
# =========================================================================================
## 3
# A rather fun rehearsal of things you have learned up to now!
# We're going to check whether the p-values of t-tests represent what we think they do.
# Therefore, we will simulate data under the null-hypothesis, i.e., simulate two normal
# distributions with the same mean. Then we see what p-value the t-test gives us. We will
# then repeat this procedure, say, 100 times and see whether the p-values behave as we
# expected. No worries, we'll do this step by step:
#
# a)
# - Create an empty numeric variable called pvalues.
# - Make a for loop in which you repeat the following 3 steps a hundred times (for (i in 1:100))
# (1) - create two groups of 40 random standard normal observations (use rnorm(n = 40))
# (2) - do a t-test between the two groups (use t.test()) and assign the output to, e.g.
# temp.output.
# (3) - take from this temp.output the $p.value and attach it to the
# variable pvalues using the "x = c(x, ...)"-trick.
# b) Make a histogram of the resulting pvalues. What percentage of the p-values
# distribution would you expect to fall below .05?
# c) In your simulation, which percentage of pvalues are smaller than .05? Don't tell me
# the answer, but show me how you ask R to calculate that. You'd probably need sum(), <,
# and length().
# d) Does it get closer to what you expected when you increase the number of simulations?
# =========================================================================================
# a)
# Collect the p-values of 100 t-tests on data simulated under the
# null hypothesis (both groups drawn from the same standard normal).
pvalues <- numeric()
for (i in seq_len(100)) {
  tdata1 <- rnorm(40)                          # (1) two groups of 40
  tdata2 <- rnorm(40)                          #     standard normal draws
  temp.output <- t.test(tdata1, tdata2)        # (2) t-test between them
  pvalues <- c(pvalues, temp.output$p.value)   # (3) collect the p-value
}
# b)
hist(pvalues)
# By definition, under the null hypothesis, the probability of observing a
# p-value smaller than 0.05 is 0.05. So, in five percent of the cases, you
# will reject the null, although it's true.
# c)
sum(pvalues < .05) / length(pvalues)
# d) If you increase the number of simulations from 100 to, say, 10000, the
# percentage of p-values below 0.05 will on average get closer and closer
# to the nominal 0.05.
# =========================================================================================
## 4. challenge!
# Do the same as in #3, but now 'contaminate' the simulated data with a very wide normal
# distribution. Do this by replacing some of the data by samples from a wider distribution,
# So, let's add 10 values with large variance by saying:
# data = c(rnorm(30, 0, 1), rnorm(10, 0, 20))
# Now, the data is not normal anymore, but has very thick tails on both sides. (check hist
# (data)). Say something about the robustness of the t-test. Can you try other things to study
# the robustness?
# =========================================================================================
# Same simulation, but each group is contaminated: 30 standard-normal
# values plus 10 values from a much wider normal (sd = 20), giving
# heavy-tailed (non-normal) data in both groups.
pvalues <- numeric()
for (i in seq_len(1000)) {
  tdata1 <- c(rnorm(30), rnorm(10, 0, 20))
  tdata2 <- c(rnorm(30), rnorm(10, 0, 20))
  temp.output <- t.test(tdata1, tdata2)
  pvalues <- c(pvalues, temp.output$p.value)
}
hist(pvalues)
sum(pvalues < .05) / length(pvalues)
# That's not too far off the nominal p. In fact, it seems like the
# null-hypothesis is less often rejected than the nominal 0.05.
## Another test could be to add some outliers, for example to data set 1
pvalues <- numeric()
for (i in seq_len(1000)) {
  tdata1 <- rnorm(40)
  tdata2 <- rnorm(40)
  # Replace five of the 40 values by uniform outliers in [-50, 50].
  # Note: positions are sampled from 1:40, not 1:100 — indexing past the
  # vector's length would silently grow tdata1 and pad it with NAs instead
  # of replacing existing observations.
  tdata1[sample(40, 5)] <- runif(5, -50, 50)
  temp.output <- t.test(tdata1, tdata2)
  pvalues <- c(pvalues, temp.output$p.value)
}
hist(pvalues)
sum(pvalues < .05) / length(pvalues)