### Session 3 - Generalized Linear Models ###
# Notes by Portia Osborne
# Read in the data set and subset to eastern hemlock records only.
dat <- read.csv(file.choose())
dat2 <- subset(dat, dat$species == "Tsuga canadensis")
# Cover values are discrete data (integers 1-10).
# Mean and variance are roughly equal, which suggests that a Poisson
# distribution may be appropriate for these counts.
mean(dat2$cover)
var(dat2$cover)
table(dat2$cover)
# Barplot of the contingency table; store the barplot as an object so we
# can reuse its bar midpoints as x-coordinates later.
# (seq(1:10) in the original was an accidental seq_along idiom; 1:10 is
# the intended sequence.)
bar1 <- barplot(as.vector(table(dat2$cover)), names.arg = 1:10)
# Superimpose the expected counts under a Poisson distribution whose mean
# is the observed mean cover (previously hard-coded as 4.66).
points(bar1, dpois(1:10, mean(dat2$cover)) * sum(table(dat2$cover)),
       cex = 2, type = "b", col = "sienna", lwd = 2, pch = 19)
# glm() works like lm(), but the family argument selects the desired
# error distribution.
# glm1 fits the cover data to a Poisson distribution; the formula ~1
# fits an intercept only.
glm1 <- glm(cover ~ 1, data = dat2, family = poisson)
summary(glm1)
# coef(glm1) is reported on the scale of the link function (log here),
# so back-transform with exp() to return to the original data scale.
exp(coef(glm1))
# Add elevation as a predictor.
with(dat2, plot(elev, cover, main = "Hemlock cover vs. elevation",
                cex = 1.5, col = "cyan", pch = 19))
glm2 <- glm(cover ~ elev, data = dat2, family = poisson)
# Test whether glm2 fits better than the intercept-only model. The
# chi-squared test gives a large p-value, so there is no difference and
# elevation does not improve the model.
anova(glm1, glm2, test = "Chisq")
# No trend is evident from the fitted line either.
# predict() returns fitted values of glm2 over a grid of elevations;
# note type = "response" back-transforms them to the original data scale.
# BUG FIX: the original lines() call omitted the x-coordinates, so the
# curve was drawn against the index 1..length(x) instead of elevation.
x <- seq(0, 1660)
lines(x, predict(glm2, list(elev = x), type = "response"),
      lwd = 2, col = "orange")
# glm3 uses the disturbance class as the predictor variable.
glm3 <- glm(cover ~ disturb, data = dat2, family = poisson)
summary(glm3)
# When reading a glm coefficient table, remember that the alphabetically
# first factor level becomes the intercept and the remaining levels are
# tested against it.
anova(glm1, glm3, test = "Chisq")
# The summary suggests a difference in hemlock cover between virgin and
# corporately logged sites, but the anova gives p = 0.07 -- some
# evidence for a relationship, though not much.
# Search for a maximal model with main effects and interactions.
# glm4 combines the disturb and elev main effects plus their interaction.
glm4 <- glm(cover ~ disturb * elev, data = dat2, family = poisson)
summary(glm4)
# step() performs backward stepwise selection. It removes the
# interaction first, but AIC increases, so the interaction is important.
step(glm4)
# AIC should not be the sole criterion for model choice. Models should
# reflect testable hypotheses formed at the outset and carry ecological
# meaning, not just a good AIC.
# The contrasts() function can specify which contrasts are returned.
# Overdispersion can arise in your data, e.g. when it contains many
# zeros. Quasi-likelihood introduces a parameter phi into the expected
# variance: phi > 1 accounts for overdispersion, phi = 1 recovers the
# ordinary glm, and phi < 1 accounts for underdispersion.
# glm4.qp refits glm4 with family = quasipoisson; its summary reports
# the phi estimate, which here is < 1, so the data are not overdispersed
# and using phi does not help.
glm4.qp <- update(glm4, family = quasipoisson)
summary(glm4.qp)
# AIC is NA because standard likelihoods are not used; compare the
# Poisson and quasi-Poisson fits with an F test instead.
anova(glm4, glm4.qp, test = "F")
# Negative binomial models handle overdispersion while keeping true
# likelihood characteristics; glm.nb() lives in package MASS.
library(MASS)
nb1 <- glm.nb(cover ~ disturb * elev, data = dat2)
summary(nb1)
# Compare AIC between the Poisson glm and the negative binomial model.
AIC(glm4, nb1)
# No large difference, so no support for the negative binomial model --
# which makes sense, since the data show no overdispersion.
# Build a new dataset with overdispersion by adding many plots with a
# cover value of 0. h.plots contain hemlock; noh.plots do not (cover = 0).
# Logical subsetting separates the hemlock-free plots from the vector of
# all plots (u.plots).
h.plots <- unique(as.character(dat2$plotID))
u.plots <- unique(as.character(dat$plotID))
# Plots never recorded with hemlock. Use ! negation rather than the
# original "== F", which is fragile because T and F can be reassigned.
noh.plots <- u.plots[!is.element(u.plots, h.plots)]
# dat3 keeps only the rows of dat whose plot is in noh.plots.
dat3 <- subset(dat, is.element(as.character(dat$plotID), noh.plots))
# dat4 drops duplicated plots from dat3, then cover is set to 0 because
# these plots contain no hemlock.
dat4 <- subset(dat3, !duplicated(dat3$plotID))
dat4$cover <- 0
# Combine the hemlock plots (dat2) and hemlock-free plots (dat4) with
# rbind, which stacks objects by rows (cbind combines by columns).
dat5 <- rbind(dat2, dat4)
# Plotting dat5 shows a preponderance of zeros in the dataset.
bar1 <- barplot(as.vector(table(dat5$cover)),
                names.arg = as.character(0:10))
# As before, superimpose expected counts, here under both the Poisson
# and the negative binomial error distributions.
points(bar1, dpois(0:10, mean(dat5$cover)) * sum(table(dat5$cover)),
       cex = 2, type = "b", col = "lightblue", lwd = 2, pch = 19)
# Negative binomial size parameter by method of moments:
# size = mu^2 / (var - mu).
nb.size <- (mean(dat5$cover)^2) / (var(dat5$cover) - mean(dat5$cover))
points(bar1,
       dnbinom(0:10, mu = mean(dat5$cover), size = nb.size) *
         sum(table(dat5$cover)),
       type = "b", cex = 2, col = "salmon", lwd = 2, pch = 19)
# Neither set of points fits perfectly, but the negative binomial beats
# the Poisson; recall the nb distribution includes an overdispersion
# parameter that can absorb the excess zeros.
# Compare negative binomial and Poisson models for this dataset.
nb2 <- glm.nb(cover ~ disturb * elev, data = dat5)
glm5 <- glm(cover ~ disturb * elev, data = dat5, family = poisson)
AIC(nb2, glm5)
# AIC is much better for nb2, so the negative binomial is the better
# model here -- but its diagnostic plots are still imperfect.
plot(nb2)
# Neither model accounts well for the overdispersion because it is too
# strong. Combining the cover (abundance) data with the presence/absence
# data was the problem: these are two different ecological processes and
# cannot be explained by one model. Better to split the data into two
# models -- abundance as above, and presence/absence as a binary
# variable. (A hierarchical framework could also handle this, later.)
### binary response variables ###
# Binary response variables cannot be overdispersed. Keep using dat5,
# but recode every cover value > 0 as 1, so dat5$pa is the presence or
# absence of hemlock.
dat5$pa <- dat5$cover
dat5$pa[dat5$pa > 0] <- 1
# Fit a glm with the binary response and a binomial distribution.
bin.glm1 <- glm(pa ~ disturb * elev, data = dat5, family = binomial)
summary(bin.glm1)
step(bin.glm1)
# step() evaluates bin.glm1: the interaction is important in this model.
# The link function can be changed to the complementary log-log
# (cloglog), often used for discrete data. The anova compares it with
# the default logit link, and the default is better; in general, prefer
# the link function with the lower residual deviance.
bin.glm2 <- update(bin.glm1, family = binomial(link = "cloglog"))
anova(bin.glm1, bin.glm2)
# With bin.glm1 as our best model, compare it with the plotted data.
# First plot only the virgin disturbance class, elevation on the x-axis.
x <- seq(250, 2000)
with(dat5[dat5$disturb == "VIRGIN", ],
     plot(elev, pa, cex = 1.5, col = "darkgreen", xlim = c(250, 2000)))
# Add predicted values from the fitted model, again for virgin plots.
lines(x, predict(bin.glm1,
                 newdata = list(elev = x,
                                disturb = factor(rep("VIRGIN", length(x)))),
                 type = "response"),
      lwd = 2, col = "darkgreen")
# Same for the CORPLOG disturbance class (points nudged up by 0.02 so
# the classes don't overplot).
with(dat5[dat5$disturb == "CORPLOG", ],
     points(elev, pa + .02, cex = 1.5, col = "blue"))
lines(x, predict(bin.glm1,
                 newdata = list(elev = x,
                                disturb = factor(rep("CORPLOG", length(x)))),
                 type = "response"),
      lwd = 2, col = "blue")
# And for the LT-SEL class (nudged down by 0.02).
with(dat5[dat5$disturb == "LT-SEL", ],
     points(elev, pa - .02, cex = 1.5, col = "darkred"))
lines(x, predict(bin.glm1,
                 newdata = list(elev = x,
                                disturb = factor(rep("LT-SEL", length(x)))),
                 type = "response"),
      lwd = 2, col = "darkred")
# The final graph shows three different fitted curves: the relationship
# between hemlock presence and elevation varies by disturbance type.
### note on proportion data ###
# Proportion data can also be modeled with glm and family = binomial.
# If each observation has an associated number of trials N, that can be
# incorporated to weight the observations as desired -- one way of
# dealing with the rarefaction problem in plot data.
# Model hemlock cover with respect to total cover: tapply sums the cover
# of all species within each plot.
sumcover <- tapply(dat$cover, dat$plotID, sum)
# Put the summed cover values in a data frame, then merge with dat5 on
# plotID. cbind builds the two-column response (successes, failures).
coverdat <- data.frame(names(sumcover), sumcover)
dat6 <- merge(dat5, coverdat, all.x = TRUE, by.x = 1, by.y = 1)
cover.y <- cbind(dat6$cover, dat6$sumcover - dat6$cover)
# Model the proportion response. (cover.y lives in the calling
# environment rather than in dat6; glm finds it there.)
glm6 <- glm(cover.y ~ disturb * elev, data = dat6, family = binomial)
summary(glm6)
# Overdispersion can occur with proportion data too; if so, use
# family = quasibinomial exactly as quasipoisson was used above.
### Example comparing lm and glm ###
# Compare a glm (family = gaussian, link = "log") with an lm fitted to a
# log-transformed response variable.
# glm: subset the tree data to hemlock only, then plot it with
# superimposed expected values from a normal distribution.
dat7 <- subset(dat, dat$species == "Tsuga canadensis")
glm7 <- glm(cover ~ disturb * elev, data = dat7,
            family = gaussian(link = "log"))
summary(glm7)
bar7 <- barplot(as.vector(table(dat7$cover)), names.arg = 1:10,
                ylim = c(0, 300))
# FIX (per Jason's inline note): specify BOTH mean and sd in dnorm
# (sd = 2.11). The original wrote exp(dnorm(..., log = TRUE)), which
# just undoes log = TRUE and silently used the default sd of 1.
points(bar7, dnorm(1:10, mean = 4.66, sd = 2.11) * sum(table(dat7$cover)),
       cex = 2, type = "b", col = "purple", lwd = 2, pch = 19)
# Clearly not the best fit; compare with a linear model.
# Use the same hemlock cover data, but create a log response variable,
# then cbind attaches it to dat7.
logcov <- log(dat7$cover)
dat8 <- cbind(dat7, logcov)
str(dat8)
# Linear model with logcov as the response variable.
lm1 <- lm(logcov ~ disturb * elev, data = dat8)
summary(lm1)
# Look at the residuals of both models.
bar8 <- barplot(as.vector(table(resid(glm7))))
hist(resid(glm7), freq = FALSE)
lines(density(resid(glm7)))
lines(seq(-10, 10, by = .1),
      dnorm(seq(-10, 10, by = .1),
            mean = mean(resid(glm7)), sd = sd(resid(glm7))),
      col = "blue")
# dev.new() instead of windows(): windows() exists only on Windows, so
# the original line errored on macOS/Linux.
dev.new()
bar8 <- barplot(as.vector(table(resid(lm1))))
hist(resid(lm1), freq = FALSE)
lines(density(resid(lm1)))
lines(seq(-10, 10, by = .1),
      dnorm(seq(-10, 10, by = .1),
            mean = mean(resid(lm1)), sd = sd(resid(lm1))),
      col = "red")
# Which is the better model?
# NOTE: likelihood-based comparisons (like AIC) CANNOT be used because
# the response variables differ between the two models.
# We can compare deviance-based R2 with variance-based R2:
#   lm1 R2 is 2.6%
#   glm7: 1 - (residual/null deviance) = 1 - (3233/3332) = 2.97%
# Conclusion: both models are poor.