# 15 February 2010 GAMs
# Ultimate in debate over model-fitting and parsimony
# Review of GLMs
# How do you evaluate values of something that is log transformed against something that is log-link transformed?
# Don't use AIC. It is based on the unique values of Y. If you transform the data, AIC is not applicable.
# Look at the residual fit of both models: linear models (r^2), GLM (deviance --> pseudo r^2)
# GAMs
# Look like GLMs in specification but with more difficult output to interpret; you must specify the smoothing function [s()]
# Fit separate polynomial functions of varying complexity to different parts of the data (knots = subsections) with x'' = 0 (no kinks) --> spline
# Can also use the LOESS approach (locally weighted regression): weights each point more heavily the closer it is to the value being predicted
# Based on minimizing residual deviance (fitted to observed) - will always be lower than a linear model - even with harsh penalties in the df calculations it still fits better by AIC, etc.
# But how do you do this and still keep parsimony in control?
# Useful for predictions within the range of the model - you receive a model built to the data you found in the range you sampled
# Observed trends could be caused by unmeasured variables - basically a really nice-looking curve - no mechanistic connection
# Good for exploratory analysis - see if it could be reduced to a GLM
# Data setup: build a presence/absence dataset for yellow birch ----
# NOTE(review): avoid setwd() in shared scripts; prefer setting the working
# directory per session or using full paths.
setwd("PUT YOUR DATA DIRECTORY HERE")
dat <- read.csv("treedata.csv")
dat2 <- subset(dat, dat$species == "Betula alleghaniensis") # a yellow birch subset
b.plots <- unique(as.character(dat2$plotID)) # plots with birch
u.plots <- unique(as.character(dat$plotID)) # all plots
nob.plots <- u.plots[!is.element(u.plots, b.plots)] # plots without birch (avoid == F)
dat3 <- subset(dat, is.element(as.character(dat$plotID), nob.plots)) # dataset of no birch
dat4 <- subset(dat3, !duplicated(dat3$plotID)) # one row per plot
dat4$cover <- 0 # cover of birch is zero in these plots
dat5 <- rbind(dat2, dat4) # new dataframe of presences and absences
str(dat5)
with(dat5, plot(elev, cover))
ls1 <- loess(cover > 0 ~ elev, data = dat5) # to plot use predict()
summary(ls1) # creates model that downweights values further from the predicted values
with(dat5, plot(elev, cover > 0))
x <- seq(0, 2000)
lines(x, predict(ls1, newdata = data.frame(elev = x)), col = "tomato", lwd = 2) # must be a data frame
# binary regression (presence-absence): nothing involving splines here; the model increases toward 1
# GAM fit with mgcv ----
install.packages("mgcv") # any library can be pulled down this way - select mirror
library(mgcv)
gam1 <- gam(cover > 0 ~ s(elev), family = binomial, data = dat5)
# gam() function: binomial family with the default logit link; the response (cover > 0) is logical.
# s(elev) fits elevation with a spline (without s() the fit is essentially a GLM);
# you could also fit a hybrid with some terms splined and others not.
summary(gam1)
# with large datasets you may run into computational problems because the fit is
# iterative (compare packages for efficiency)
# output shows the parametric fit associated with the intercept on the scale of the link function
# the smooth term returns estimated degrees of freedom (edf) and a Chi-sq significance test
# ref.df may show the original df before deductions for smoothing fits, or it may show
# the df calculated in another manner (the R documentation is unclear on this)
# degrees of freedom are calculated based on the smoothness of the function used in the GAM
# i.e. if we fit one parameter we lose one df; if we fit a line that connects every data
# point we lose as many degrees of freedom as there are points
# the GAM fits a smooth curve somewhere between these two extremes and we lose the respective df
# the smooth summary also returns residual deviance, adjusted R-sq., and deviance explained
# Compare to the loess fit
points(dat5$elev, fitted(gam1), col = "springgreen", lwd = 2)
# Is this a better fit? No likelihood measurements --- r2 vs pseudo-r2
# plogis() is the built-in inverse logit, identical to 1/(1 + 1/exp(.)).
plogis(coef(gam1)[1]) # convert the intercept back from the link (logit) scale to a probability
# Do it with a GLM without the smoothing term ----
glm1 <- glm(cover > 0 ~ elev, family = binomial, data = dat5)
lines(x, predict(glm1, newdata = list(elev = x), type = "response"), lwd = 3, col = "turquoise")
# Fewer bumps - the logistic curve cannot come back down
summary(glm1)
# can use likelihood again
AIC(gam1, glm1) # GAM better of course
# pseudo R-squared; spell out $deviance / $null.deviance instead of relying on
# partial matching of $dev / $null
1 - (glm1$deviance / glm1$null.deviance)
anova(gam1, glm1, test = "Chi")
# give the glm a way to go back down: add a quadratic term
glm2 <- glm(cover > 0 ~ elev + I(elev^2), family = binomial, data = dat5)
lines(x, predict(glm2, newdata = list(elev = x), type = "response"), lwd = 3, col = "purple")
# check models
anova(gam1, glm1, glm2, test = "Chi") # lowest residual deviance with the GAM
AIC(gam1, glm1, glm2) # still better
windows() # creates a new (second) plot window (Windows only; use x11()/quartz() elsewhere)
# trans = plogis back-transforms the smooth from the logit scale to [0, 1]
plot(gam1, residuals = TRUE, trans = plogis, shade = TRUE)
# Model is the same - y-axis shows the predicted model on the scale specified by trans=,
# the inverse logit function - returns the scale from 0 to 1
# Rug on the bottom shows a tick for every data point
# Dotted lines are the values of the residuals
# Shading is the error band (2*SE) -> robustness of fit (always publish with SEs)
# Question: How do these models relate to segmented (breakpoint) regression
# [two different functions for two parts of the data - you get df back when they join]
# (used in threshold modeling)?
# With a GAM you get back more df; it tests how many different sections to use, with
# penalties applied to the number of knots
# You can specify the number of knots and where they go - similar to segmented
# regression - and it may be more powerful
# Statement: If we have used our data to make a model, then the optimization routine is
# picking the function - is using likelihood justifiable? Probably. You have theoretically
# already found the best model, so of course it would be best by AIC
# Model selection may be more complicated
# When it comes to interactions you probably should not be using GAMs - it makes
# interpretation very difficult
# You can fit a GAM with an interaction, but what does the P-value mean?
# More models: multiple smooths and interactions ----
gam2 <- gam(cover > 0 ~ s(elev) + s(beers), family = binomial, data = dat5)
gam3 <- gam(cover > 0 ~ s(elev) + beers, family = binomial, data = dat5) # hybrid
gam4 <- gam(cover > 0 ~ s(elev) + s(beers) + s(elev, by = beers), family = binomial, data = dat5)
# smoothed version of elevation conditioned by radiation - isotropic fit -
# changes in one variable equal to changes in the other
gam5 <- gam(cover > 0 ~ s(elev) + s(beers) + te(elev, beers), family = binomial, data = dat5)
# te() lets changes in one variable relate to the other on different scales - gives away more df
summary(gam2)
anova(gam1, gam2, test = "Chi")
plot(gam2, trans = plogis, shade = TRUE, pages = 1) # pages=1 puts all variables on one page
anova(gam2, gam3, test = "Chi") # residual deviance will go up - lost 2 df but still support smoothing
summary(gam(cover > 0 ~ s(elev) + s(beers) + s(elev, beers), family = binomial, data = dat5)) # not an interaction model
# used essentially the same fit, different terms
summary(gam4) # no interaction
summary(gam5) # test interaction - not significant
plot(gam5, trans = plogis, shade = TRUE, pages = 1) # no change with interaction because not significant
gam6 <- gam(cover > 0 ~ s(elev) + disturb + te(elev, by = disturb, k = 4), family = binomial, data = dat5)
# GAM ANCOVA - by= evaluates the smoothing function across different portions of the data;
# we lose df because each smooth interacts with a parameter of the disturbance class
plot(gam6, trans = plogis, shade = TRUE, pages = 1)
# disturbance may be controlling some of the non-linear effects of elev on occurrence
# It is unclear why the SE lines converge on zero in the LT SEL and COPRLOG graphs
# Error lines are plotted at 2*SE for each variable, so SE must go to zero at these points
# It may have to do with a mathematical limit at zero involving the construction of the
# smoothing terms, the calculation of the SEs, and their back-transformation into scalable terms
# occurs at the particular x,y coordinates plotted (both plots - same spot)
summary(gam6) # no compelling evidence for disturbance effects on occurrence
## Bio et al. paper: GAMs vs linear methods ----
# Are GAMs always better?
# Forward regression - four ways of putting a parameter into the model:
# linear, quadratic, smoothed (3 df), smoothed (4 df)
# Iterated best fit of entering terms into the model in the four ways, for all variables
# and all species - a huge comparison
# Tested goodness of fit
# The majority of terms entered into the model were smoothed terms
# Polynomials with the same df as the smooths suffer from the fact that the whole fit
# varies according to the function - splines fit small parts
# GLMs stay less than cubic order
# Using the package from Bio et al.
install.packages("gam")
# NOTE(review): from here on gam::gam and gam::s mask mgcv's versions - the s(elev, 3)
# syntax below is the gam-package form, so the masking is relied on.
library(gam)
m0 <- glm(cover > 0 ~ 1, family = binomial, data = dat5)
m1 <- glm(cover > 0 ~ elev, family = binomial, data = dat5)
m2 <- glm(cover > 0 ~ elev + I(elev^2), family = binomial, data = dat5)
m3 <- gam(cover > 0 ~ s(elev, 3), family = binomial, data = dat5) # smoother with 3 df
m4 <- gam(cover > 0 ~ s(elev, 4), family = binomial, data = dat5) # smoother with 4 df
anova(m0, m1, m2, m3, m4, test = "Chi")
m5 <- glm(cover > 0 ~ elev + I(elev^2) + I(elev^3) + I(elev^4), family = binomial, data = dat5)
plot(x, predict(m4, newdata = list(elev = x), type = "response"), pch = 19, col = "steelblue",
     ylim = c(0, 1), ylab = "Probability", xlab = "Elevation")
points(x, predict(m5, newdata = list(elev = x), type = "response"), pch = 19, col = "orchid")
anova(m4, m5, test = "Chi")
# bruto: adaptive backfitting from the mda package ----
# The bruto function is purported to be more robust, but in attempting to build one from
# the class dataset, my system locked up under several iterations
# It may be applicable to smaller data sets such as the one in the example below
install.packages("mda")
library(mda) # fixed typo: the original read libary(mda), which errors at runtime
data(trees)
fit1 <- bruto(trees[, -3], trees[3])
fit1$type
fit1$df
## examine the fitted functions
par(mfrow = c(1, 2), pty = "s")
Xp <- matrix(sapply(trees[1:2], mean), nrow(trees), 2, byrow = TRUE)
# the per-column ranges do not depend on the loop index, so compute them once
xr <- sapply(trees, range)
for (i in 1:2) {
  Xp1 <- Xp
  Xp1[, i] <- seq(xr[1, i], xr[2, i], len = nrow(trees))
  Xf <- predict(fit1, Xp1)
  plot(Xp1[, i], Xf, xlab = names(trees)[i], ylab = "", type = "l")
}