###############################################
# #
# Session 2: Linear Models, #
# writing functions #
# notes by Lee Davis #
# #
# #
###############################################
# NOTE(review): hard-coded, machine-specific working directory; prefer running
# R from the project directory instead of calling setwd() in a script.
setwd("C:/Users/Lee.OWNER-PC/Documents/School_Teaching_2009-10/Pleco_R_Class")
#Basic review of syntax highlighting:
#Different types of commands (i.e. predefined functions or parameters
#for functions) are highlighted differentially.
#Makes coding and debugging much easier. Also easier to take code
#you've already written and reuse it for other purposes
#Possible programs: TINN-R, JGR and others
#Some more basic commands:
#Input data set
dat <- read.csv("treedata.csv", header = TRUE) #use TRUE, not T (T can be reassigned)
#Explore basic distribution with hist()
hist(dat$elev) #note that par() can be used to change axis labels,
#size of font, etc. Many par() settings can also be arguments to hist()
#recall that $ calls a specific column within a data frame
hist(dat$elev, xlab = "Elevation", main = "", breaks = 10, freq = FALSE)
#freq = FALSE gives densities (proportions) instead of counts
#lines() puts a line on top of existing graph
lines(density(dat$elev), lwd = 2, col = "thistle")
#lwd = line width, col = color of density line
#Keep only the first row for each plotID (drop resampled duplicates)
dat2 <- dat[!duplicated(dat$plotID), ]
#Subsetting uses logical indexing: rows where the condition is TRUE are kept
#So, this only returns elevation for each resampled plot once
#Explore data structure more
#Q-Q plots
qqnorm(dat2$elev)
qqline(dat2$elev)
#Test for normality with Shapiro
shapiro.test(dat2$elev) #If normal p is > 0.05
#fixed: was shapiro.test(dat2.elev) — R uses $ to pull a column, not a dot
#curve()
curve(dnorm) #Shows normal curve
#dnorm is one of the functions associated with the normal distribution in R
#d in front of a distribution gives the probability density function
#q gives quantiles (useful for 95% confidence intervals, etc.)
#p gives the cumulative probability up to a particular value
#note there are many built in functions in the base package
#Pairs and coplot
#You can also write your own function in R to carry out any task you want
#Basic structure:
# function(variable names) {equation}
#Don't overwrite other functions!!
#Calculate standard error
#Standard error of the mean: sqrt(var(x)/n), i.e. sd(x)/sqrt(n)
se <- function(x) {
  n <- length(x)
  sqrt(var(x) / n)
}
#x is the formal argument of se(); whatever you pass in becomes x inside
#the function, i.e. x is the data you want the standard error of
se(dat2$elev)
se(x = dat2$elev) #same as above. You can explicitly name the argument,
# or R will match it by position ('positional matching')
#Best to name the args explicitly with complex functions so you KNOW
#each argument is bound to the right parameter
#Can also define functions within functions
#Standard error computed via a hand-rolled sample-variance helper defined
#inside the function (equivalent to the built-in var()).
se2 <- function(x) {
  sample_var <- function(X = x) {
    deviations <- X - mean(X)
    sum(deviations^2) / (length(X) - 1)
  }
  sqrt(sample_var(x) / length(x))
}
#Note the x is passed on to X; caps are important! Try to figure out the exact
#order of operations in this function for yourself
#return() tells R what value the function gives back, which is a little
#extraneous here (the last expression is returned anyway). Can also use print()
se2(dat2$elev)
#Bootstrap confidence intervals
#Note, bootstrapping is a technique whereby you can get around funky
#distributions by sampling from your actual distribution (with replacement)
#in order to obtain your confidence intervals (in this case)
#sample(x, replace = TRUE) samples from your actual data
#nperm is the number of permutations
#lower and upper are the C.I. boundaries
#sapply() applies a function over a list; good way to get around for loops
#sapply and lapply are essentially the same function but they give different
#forms of the output: lapply gives lists, sapply simplifies to vectors
#Bootstrap confidence interval for the mean: resample x with replacement
#nperm times, record the mean of each resample, and return the requested
#quantiles of the resulting distribution of bootstrapped means.
#(Swap mean for any statistic — median, a slope, etc. — to bootstrap it.)
CI.boot <- function(x, nperm, lower, upper) {
  boot_means <- vapply(
    seq_len(nperm),
    function(i) mean(sample(x, replace = TRUE)),
    numeric(1)
  )
  #the lower/upper quantiles of the bootstrapped means are the C.I. bounds
  quantile(boot_means, c(lower, upper))
}
#now call the function.
CI.boot(1:100, 10000, 0.025, 0.975) #args matched positionally; c() around 1:100 was redundant
#Use tapply to get mean elevation for disturbance classes
dist.mean <- tapply(dat2$elev, dat2$disturb, mean)
#Do the same for standard error
dist.se <- tapply(dat2$elev, dat2$disturb, se)
#Create barplot with standard error bars
bar1 <- barplot(dist.mean, names.arg = names(dist.mean), ylim = c(0, 1500),
                ylab = "Elevation")
#error bars drawn as vertical segments with flat (90-degree) arrowheads
arrows(bar1, dist.mean, bar1, dist.mean + dist.se, angle = 90, lwd = 2)
#bar1 is x1, dist.mean is y1, bar1 is x2, dist.mean+dist.se = y2
#Standard errors are corrected by sample size and are therefore
#more useful for comparison because there is inference based on
#degrees of freedom (that's why s.e. goes down as sample size goes up)
#Now for some least squares linear modeling
#lm() is the basic command
#Regress the cover of beech trees on elevation
lm1 <- lm(cover ~ elev, data = dat, subset = species == "Fagus grandifolia")
#basic structure: lm(y~x) y is a function of x
#subset is used to choose only data for beech trees
summary(lm1)
#gives back quantiles, and estimates of coefficients with their standard
#errors as well as p value for each, r squared, adjusted r squared (which
#penalizes for sample size), & F stat and p value associated with full model
#note that the summary can be assigned to a variable name
lm1.sum <- summary(lm1) #renamed from 'sum': never mask base functions like sum()
#look at residuals, fitted values, etc
#attributes() returns all of the "stuff" you can get from your linear model
attributes(lm1)
lm1$residuals
lm1$fitted
lm1$coef
#ANOVA table to obtain sums of squares. F value should be the same
summary.aov(lm1)
#Confidence intervals for parameter estimates
confint(lm1)
#AIC likelihood values
AIC(lm1)
#plot the actual model diagnostic graphs
plot(lm1)
#First gives residuals against fitted values
#Second gives QQ plot for distribution of residuals
#Third gives scale-location graph for outlier analysis
#Fourth gives residuals against leverage (Cook's distance) to show
#outliers that have large impact on linear model
#MASS library has robust linear regression techniques
#load MASS
library(MASS)
?rlm #for robust linear regression to deal with outliers
#Quantile regression models desired quantile instead of mean so you can
#estimate the slope and intercept of quantiles
#quantreg package
# update() is useful for model fitting when you are say removing terms or transforming
# a variable without completely rewriting a big long model statement
lm2 <- update(lm1, formula = sqrt(cover) ~ elev)
#this tells R to square root transform cover and call the new model lm2
#ANOVA; uses the same underlying model
aov1 <- lm(cover ~ disturb, data = dat, subset = species == "Fagus grandifolia")
summary(aov1)
#This output from summary() is weird.
#The intercept is not the true intercept, but rather is the value for the
#first factor level that R has considered
#The following estimates are then the differences for each new level from the
#mean (intercept) of the first, so to get the mean you have to add them to the
# "intercept"
#The t tests are pairwise t tests, the overall F test is listed at the bottom
#the t test for the intercept tests if the intercept is different from zero
summary.aov(aov1) #Returns the actual anova table with associated f test
#it is best to tell R the actual contrasts you want to get back
#contrasts()
#The better thing to do is to use standard multiple comparison tests
# and NOT pairwise t tests to control pairwise error
#Tukey's honestly significant difference test
TukeyHSD(aov(aov1))
aov1 <- lm(cover ~ disturb - 1, data = dat, subset = species == "Fagus grandifolia")
#the -1 tells R to get rid of the intercept and test to see if each group
#is different from zero
#plot actual values and variance with boxplot. R knows you have factor vars
#and knows to use a boxplot.
plot(dat$disturb[dat$species == "Fagus grandifolia"],
     dat$cover[dat$species == "Fagus grandifolia"], notch = TRUE)
#ANCOVA
#model cover by disturbance class, elevation, and their interaction
ancova1 <- lm(cover ~ disturb * elev, data = dat,
              subset = species == "Fagus grandifolia")
#disturb*elev expands to disturb + elev + disturb:elev (full model
#with interaction term)
summary(ancova1)
#gives intercepts and differences for each treatment and slope and
#differences for each treatment
#use step() for model selection
step(ancova1)
#step() removes terms and evaluates goodness of fit after each removal,
#starting with the most complex terms.
#adding polynomial terms to a linear regression with I()
#y~I(x^2)
#Can also use poly(x,2)
#######################Non-linear least squares################################
#can specify model functions
#nls()
#fits functions through least squares optimization, but does it
#using an optimization routine: must now specify starting parameters
#for coefficients so a decent guess is generally required. Use
#method of moments
dat2 <- subset(dat, dat$species == "Betula lenta") #use <- for assignment, not =
with(dat2, plot(elev, tci))
model1 <- nls(tci ~ a * elev / (b + elev), start = list(a = 500, b = 5), data = dat2)
#basic Michaelis-Menten function
#a is the asymptote, b is the half-saturation constant (elev at which the
#curve reaches a/2); start=list() defines starting parameter values
#note that you are defining the actual equation for the function here
summary(model1)
#Many common non linear functions have a self-starting version built into R
#so that you do not have to estimate starting pars
model2 <- nls(tci ~ SSmicmen(elev, a, b), data = dat2)
summary(model2) #Should give same values as non SS version above
#gnls (in the nlme package) can take factor covariates
#Now put a curve on the data using predict()
x <- seq(0, 2000) #make a vector of values from 0 to 2000
lines(x, predict(model1, list(elev = x)), col = "red")
#now create new values for y as predicted from the model
model3 <- nls(tci ~ c * elev^z, start = list(c = 20, z = 1), data = dat2)
lines(x, predict(model3, list(elev = x)), col = "blue")
#Which model fits best??
AIC(model2, model3) #model1 and model2 fit the same curve; compare with model3
#Note that you can transform variables to avoid non-linear modeling,
#but there are many reasons to avoid this
#Crawley pg 205 lists many common transformations
model4 <- lm(tci ~ I(elev^2) + elev, data = dat2)
summary(model4)
lines(x, predict(model4, list(elev = x)), col = "green")
#TODO (homework): get a best fit linear model and report back how it was done