# R para principiantes # PUCE, Quito, 5-8 enero 2010 # Simon Queenborough # JUEVES: clase 14 ##################### GENERALIZED LINEAR MODELS ###################### # We can use generalized linear models (GLMs) pronounced ‘glims’ – when the variance # is not constant, and/or when the errors are not normally distributed. Certain kinds of # response variables invariably suffer from these two important contraventions of the standard # assumptions, and GLMs are excellent at dealing with them. Specifically, we might consider # using GLMs when the response variable is: # • count data expressed as proportions (e.g. logistic regressions); # • count data that are not proportions (e.g. log-linear models of counts); # • binary response variables (e.g. dead or alive); # • data on time to death where the variance increases faster than linearly with the mean # (e.g. time data with gamma errors). # The central assumption that we have made up to this point is that variance was constant. # In count data, however, where the response variable is an integer and there # are often lots of zeros in the dataframe, the variance may increase linearly with the mean. # With proportion data, where we have a count of the number of failures of an # event as well as the number of successes, the variance will be an inverted U-shaped function # of the mean. Where the response variable follows a gamma distribution (as in # time-to-death data) the variance increases faster than linearly with the mean. # Many of the basic statistical methods such as regression and Student’s t test assume that # variance is constant, but in many applications this assumption is untenable. Hence the great # utility of GLMs. # A generalized linear model has three important properties: # • the error structure; # • the linear predictor; # • the link function. # Up to this point, we have dealt with the statistical analysis of data with normal errors. In # practice, however, many kinds of data have non-normal errors: for example: # • errors that are strongly skewed; # • errors that are kurtotic; # • errors that are strictly bounded (as in proportions); # • errors that cannot lead to negative fitted values (as in counts). # In the past, the only tools available to deal with these problems were transformation of # the response variable or the adoption of non-parametric methods. A GLM allows the # specification of a variety of different error distributions: # • Poisson errors, useful with count data; # • binomial errors, useful with data on proportions; # • gamma errors, useful with data showing a constant coefficient of variation; # • exponential errors, useful with data on time to death (survival analysis). # The error structure is defined by means of the family directive, used as part of the # model formula. Examples are # glm(y ~ z, family = poisson ) # which means that the response variable y has Poisson errors, and # glm(y ~ z, family = binomial ) # which means that the response is binary, and the model has binomial errors. As with previous # models, the explanatory variable z can be continuous (leading to a regression analysis) or # categorical (leading to an ANOVA-like procedure called analysis of deviance) ################## COUNT DATA (POISSON) ##################### clusters <- read.table("clusters.txt", header=T) hist(clusters$Cancers) plot(clusters$Distance, clusters$Cancers) model1<-glm(Cancers~Distance,poisson, data=clusters) summary(model1) # check for overdispersion model2<-glm(Cancers~Distance,quasipoisson, data=clusters) summary(model2) #################### PROPORTION DATA (BINOMIAL) ########### numbers <- read.table("sexratio.txt", header=T) par(mfrow=c(1,2)) p<-numbers$males/(numbers$males+numbers$females) plot(numbers$density,p,ylab="Proportion male") plot(log(numbers$density),p,ylab="Proportion male") y<-cbind(numbers$males,numbers$females) y model<-glm(y~numbers$density,binomial) summary(model) model<-glm(y~log(numbers$density),binomial) summary(model) ############## BINARY RESPONSE VARIABLES ###################### isol <- read.table("isolation.txt", header=T) # We begin by fitting a complex model involving an interaction between isolation and area: model1<-glm(incidence~area*isolation,binomial, data=isol) # Then we fit a simpler model with only main effects for isolation and area: model2<-glm(incidence~area+isolation,binomial, data=isol) #We now compare the two models using ANOVA: anova(model1,model2,test="Chi") summary(model2) # plot: modela<-glm(incidence~area,binomial, data=isol) modeli<-glm(incidence~isolation,binomial, data=isol) par(mfrow=c(2,2)) xv<-seq(0,9,0.01) yv<-predict(modela,list(area=xv),type="response") plot(isol$area,isol$incidence) lines(xv,yv) xv2<-seq(0,10,0.1) yv2<-predict(modeli,list(isolation=xv2),type="response") plot(isol$isolation,isol$incidence) lines(xv2,yv2) ######################################################################################## # # EJERCICIOS # ######################################################################################## # 15.1 species spp <- read.table("species.txt", header=T) # plot species as a function of biomass for different levels of pH # model species as a function of biomass # include pH as a covariate (= analysis of covariance) # check for overdispersion # 15.2 germination germ <- read.table("germination.txt", header=T) # Count is the number of seeds that germinated out of a batch of size = sample. So the # number that did not germinate is sample – count. # model germination as a function of orobanche and extract # check for overdispersion