#####################################################
# #
# Lab 2 linear model examples #
# #
# #
#####################################################
setwd("C:/Users/Lee.OWNER-PC/Documents/School_Teaching_2009-10/Pleco_R_Class")
dat <- read.csv("treedata.csv", header = T)
head(dat)
#Let's see how stream distance is related to tci and elevation (check metadata
#for definitions to understand why possibly related)
#Subset data for enviro data of interest; note that attaching data frames
#is generally bad policy but will allow us to be a bit lazy with code here
dat <- dat[ , 9:11]
head(dat)
attach(dat)
#Let's get a graphical idea of how these vars may be correlated
pairs(dat, panel = panel.smooth)
#It looks like stream dist decreases as tci increases and increases as elev
#increases
#What is the empirical structure of streamdist?
hist(streamdist) #looks like a potential gamma distribution...
#No need to bother testing for normality
#Specify full model with interaction terms
mod1 <- lm(streamdist~tci*elev)
summary(mod1)
#The summary returns a significant interaction between tci and elev
#Explore the step function for comparison
step(mod1) #Note the coefficient structure is the same
#If the interaction were not signif it could be removed via he following:
mod2 <- update(mod1, ~.-tci:elev)
summary(mod2)
#Next explore diagnostic plots of mod1
plot(mod1)
#Are the resid normal?
shapiro.test(mod1$residuals)
#Error, sample size too large for shapiro function
#Not sure if this is a proper use of sample
res <- sample(mod1$residuals, size = 5000, replace = T)
shapiro.test(res)
hist(mod1$residuals)
#Resids don't appear to be normally distributed, so transformation advisable
mod1 <- lm(log(streamdist)~tci*elev)
#NaN/Inf returned;log(o)==INF, so:
mod1 <- lm(log(streamdist +1) ~tci*elev)
summary(mod1)
hist(mod1$residuals)
res <- sample(mod1$residuals, size = 5000, replace = T)
shapiro.test(res)
#Still looking pretty poor, perhaps lm() is a poor choice,
#or another transformation could be more useful
mod2 <-lm(sqrt(streamdist)~tci*elev)
summary(mod2)
hist(mod2$residuals)
res <- sample(mod2$residuals, size = 5000, replace = T)
shapiro.test(mod2$residuals) #Hmmm? At any rate our explanatory
#power from this model is limited...
#Let's look at this from another angle:ANOVA
#Is there a significant difference in distance to streams
#between "high" and "low" elevation plots (as i
#have arbitrarily defined them--let's imagine there is obvious some ecotone)?
dat$elev2 <- ifelse(elev>"800", "H", "L")
dat$elev2 <- as.factor(dat$elev2)
aov1 <- lm(streamdist~dat$elev2) #or t.test()
summary.aov(aov1)
plot(dat$elev2, streamdist, xlab = "high vs. low elevation",
ylab = "Distance to Stream", notch = T)
#Notches do not overlap, check p; difference
#How well represented are "H" and "L" is the data set?
length(dat$elev2[dat$elev2 == "H"])
length(dat$elev2[dat$elev2 == "L"])