## 15 February 2010 -- GAMs
## Ultimate in debate over model-fitting and parsimony.
##
## Review of GLMs:
## How do you evaluate values of something that is log transformed against
## something that is log-LINKED? Don't use AIC: it is based on the unique
## values of Y, so if you transform the data AIC is not applicable.
## Instead look at the residual fit of both models: linear models (r^2),
## GLM (deviance --> pseudo r^2).
##
## GAMs:
## Look like GLMs in specification, with more difficult output to interpret;
## you must specify the smoothing function [s()].
## Fit separate polynomial functions of varying complexity to different parts
## of the data (knots = subsections) with x'' = 0 (no kinks) --> spline.
## Can also use a LOESS approach (locally weighted regression): weights each
## point more the closer you get to the value.
## Based on minimizing residual deviance (fitted to observed) -- will always
## be lower than a linear model; even with harsh penalties in the df
## calculations it still fits better by AIC, etc.
## But how do you do this and still keep parsimony in control?
## Useful for predictions WITHIN the range of the model -- you get a model
## built to the data you found in the range you sampled.
## Observed trends could be caused by unmeasured variables -- basically a
## really nice-looking curve with no mechanistic connection.
## Good for exploratory analysis -- see if it could be reduced to a GLM.

setwd("PUT YOUR DATA DIRECTORY HERE")
dat <- read.csv("treedata.csv")
dat2 <- subset(dat, dat$species == "Betula alleghaniensis")  # a yellow birch subset
b.plots <- unique(as.character(dat2$plotID))                 # plots with birch
u.plots <- unique(as.character(dat$plotID))                  # all plots
nob.plots <- u.plots[is.element(u.plots, b.plots) == FALSE]  # plots without birch
dat3 <- subset(dat, is.element(as.character(dat$plotID), nob.plots))  # dataset of no birch
dat4 <- subset(dat3, duplicated(dat3$plotID) == FALSE)       # one row per plot
dat4$cover <- 0                                              # cover of birch is zero in these plots
dat5 <- rbind(dat2, dat4)                                    # new dataframe of presences and absences
str(dat5)
with(dat5, plot(elev, cover))

# LOESS fit; to plot, use predict().
# Creates a model that downweights values further from the predicted values.
ls1 <- loess(cover > 0 ~ elev, data = dat5)
summary(ls1)
with(dat5, plot(elev, cover > 0))
x <- seq(0, 2000)
lines(x, predict(ls1, newdata = data.frame(elev = x)), col = "tomato", lwd = 2)  # must be a data frame
# Binary regression (presence-absence); nothing involving splines here; model increases toward 1.

install.packages("mgcv")  # any library can be pulled down this way -- select mirror
library(mgcv)
# gam() function: define binomial (logit link is the default); cover>0 is a
# logical response variable; s(elev) fits with a spline (without s() it is
# almost a GLM). You could fit a hybrid with some terms splined and others not.
gam1 <- gam(cover > 0 ~ s(elev), family = binomial, data = dat5)
summary(gam1)
# With large datasets you may run into computational problems because the fit
# is iterative (compare both packages for efficiency).
# The output shows the parametric fit associated with the intercept on the
# scale of the link function. The smooth term returns estimated degrees of
# freedom (edf) and Chi-sq significance. ref.df may show the original df
# before deductions for smoothing fits, or it may show df calculated in
# another manner (R is unclear on this).
# Degrees of freedom are calculated based on the smoothness of the function
# used in the GAM: if we fit one parameter we lose one df; if we fit a line
# that connects all the data points we lose as many df as there are points.
# A GAM fits a smooth curve somewhere between these two extremes and we lose
# the respective df. The summary also returns residual deviance, adjusted
# R-sq., and deviance explained.

# Compare to the LOESS fit. Is this a better fit? No likelihood measurements
# here -- compare r^2 to pseudo r^2.
points(dat5$elev, fitted(gam1), col = "springgreen", lwd = 2)
1 / (1 + 1 / exp(coef(gam1)[1]))  # convert back to mean from the link scale

# Do it with a GLM, i.e. without the smoothing term.
glm1 <- glm(cover > 0 ~ elev, family = binomial, data = dat5)
lines(x, predict(glm1, newdata = list(elev = x), type = "response"),
      lwd = 3, col = "turquoise")
# Fewer bumps: the logistic curve cannot go back down.
summary(glm1)                     # can use likelihood again
AIC(gam1, glm1)                   # GAM better, of course
1 - (glm1$dev / glm1$null)        # pseudo r^2 for the GLM
anova(gam1, glm1, test = "Chi")

# Give the GLM a way to go down: add a quadratic term.
glm2 <- glm(cover > 0 ~ elev + I(elev^2), family = binomial, data = dat5)
lines(x, predict(glm2, newdata = list(elev = x), type = "response"),
      lwd = 3, col = "purple")
# Check the models against each other.
anova(gam1, glm1, glm2, test = "Chi")  # lowest residual deviance with the GAM
AIC(gam1, glm1, glm2)                  # still better

windows()  # this creates a new (second) plot window
# Same model; the y-axis shows the prediction on the scale specified in
# trans= with the inverse logit function -- returns the scale from 0 to 1.
# The rug on the bottom shows a tick for every data point; dotted points are
# the residuals; shading is the error band (2*SE) -> robustness of fit
# (always publish with SEs).
plot(gam1, residuals = TRUE, trans = function(x) exp(x) / (1 + exp(x)), shade = TRUE)

## Question: how do these models relate to segmented (breakpoint) regression
## [two different functions for two parts of the data -- you get df back when
## they join] as used in threshold modeling? With a GAM you get back more df;
## it tests how many different sections to use, with penalties applied to the
## number of knots. You can specify the number of knots and where they go --
## similar to segmented regression -- and it may be more powerful.
## Statement: if we have used our data to make a model, then the optimization
## routine is picking the function -- is using likelihood justifiable?
## Probably. You have theoretically already found the best model, so of
## course it would be best in AIC. Model selection may be more complicated.
## When it comes to interactions you probably should not be using GAMs --
## it makes interpretation very difficult. You can fit a GAM with an
## interaction, but what does the P-value mean?

# More models.
gam2 <- gam(cover > 0 ~ s(elev) + s(beers), family = binomial, data = dat5)
gam3 <- gam(cover > 0 ~ s(elev) + beers, family = binomial, data = dat5)  # hybrid
# Smoothed version of elevation conditioned by radiation -- isotropic fit:
# changes in one variable equal to changes in the other.
gam4 <- gam(cover > 0 ~ s(elev) + s(beers) + s(elev, by = beers),
            family = binomial, data = dat5)
# te() lets changes in one variable relate to the other on different scales
# -- gives away more df.
gam5 <- gam(cover > 0 ~ s(elev) + s(beers) + te(elev, beers),
            family = binomial, data = dat5)
summary(gam2)
anova(gam1, gam2, test = "Chi")
plot(gam2, trans = function(x) exp(x) / (1 + exp(x)), shade = TRUE,
     pages = 1)  # puts all variables on one page
anova(gam2, gam3, test = "Chi")  # residual deviance will go up -- lost 2 df but still supports smoothing
# Not an interaction model: s(elev, beers) uses essentially the same fit with
# different terms.
summary(gam(cover > 0 ~ s(elev) + s(beers) + s(elev, beers),
            family = binomial, data = dat5))
summary(gam4)  # no interaction
summary(gam5)  # test interaction -- not significant
plot(gam5, trans = function(x) exp(x) / (1 + exp(x)), shade = TRUE,
     pages = 1)  # no change with interaction because not significant

# GAM ANCOVA: use by= to evaluate the smoothing function across different
# portions of the data -- you lose df because each smooth interacts with a
# parameter of the disturbance class.
gam6 <- gam(cover > 0 ~ s(elev) + disturb + te(elev, by = disturb, k = 4),
            family = binomial, data = dat5)
plot(gam6, trans = function(x) exp(x) / (1 + exp(x)), shade = TRUE, pages = 1)
# Disturbance may be controlling some of the non-linear effects of elev on
# occurrence. It is unclear why the SE lines converge on zero in the LT SEL
# and COPLOG graphs: the error lines are plotted at 2*SE for each variable,
# so SE must go to zero at these points. It may have to do with a
# mathematical limit at zero involving the construction of the smoothing
# terms, the calculation of the SE, and their back-transformation into
# scalable terms -- it occurs at the same x,y coordinates in both plots.
summary(gam6)  # no compelling evidence for disturbance effects on occurrence

## Bio et al. paper: GAMs vs linear methods.
## Are GAMs always better? Forward regression with four ways of putting a
## parameter in the model: linear, quadratic, smoothed (3 df), smoothed
## (4 df). Iterated best fit of terms entered in the four ways for all
## variables and all species -- a huge comparison -- then tested goodness of
## fit. The majority of terms entered into the model were smoothed terms.
## Polynomials of the same df as the smooths suffer from the fact that the
## whole fit varies with the function -- splines fit small parts.
## GLMs stay less than cubic order.

# Using the package from Bio et al.
install.packages("gam")
library(gam)
m0 <- glm(cover > 0 ~ 1, family = binomial, data = dat5)
m1 <- glm(cover > 0 ~ elev, family = binomial, data = dat5)
m2 <- glm(cover > 0 ~ elev + I(elev^2), family = binomial, data = dat5)
m3 <- gam(cover > 0 ~ s(elev, 3), family = binomial, data = dat5)  # smoother with 3 df
m4 <- gam(cover > 0 ~ s(elev, 4), family = binomial, data = dat5)  # smoother with 4 df
anova(m0, m1, m2, m3, m4, test = "Chi")
m5 <- glm(cover > 0 ~ elev + I(elev^2) + I(elev^3) + I(elev^4),
          family = binomial, data = dat5)
plot(x, predict(m4, newdata = list(elev = x), type = "response"),
     pch = 19, col = "steelblue",
     ylim = c(0, 1), ylab = "Probability", xlab = "Elevation")
points(x, predict(m5, newdata = list(elev = x), type = "response"),
       pch = 19, col = "orchid")
anova(m4, m5, test = "Chi")

# bruto() -- purported to be more robust, but in attempting to build one from
# the class dataset my system locked up under several iterations; it may be
# applicable to smaller data sets such as the example below.
install.packages("mda")
library(mda)  # fixed: original had fatal typo libary(mda)
data(trees)
fit1 <- bruto(trees[, -3], trees[3])
fit1$type
fit1$df
## Examine the fitted functions: hold the other predictor at its mean and
## sweep each predictor across its observed range.
par(mfrow = c(1, 2), pty = "s")
Xp <- matrix(sapply(trees[1:2], mean), nrow(trees), 2, byrow = TRUE)
for (i in 1:2) {
  xr <- sapply(trees, range)
  Xp1 <- Xp
  Xp1[, i] <- seq(xr[1, i], xr[2, i], len = nrow(trees))
  Xf <- predict(fit1, Xp1)
  plot(Xp1[, i], Xf, xlab = names(trees)[i], ylab = "", type = "l")
}