# R for beginners
# PUCE, Quito, 5-8 January 2010
# Simon Queenborough

# WEDNESDAY: class 5
# SIMPLE FUNCTIONS

####################### tapply #######################

# Vegetation data: species richness in different transects.
# NOTE(review): the working directory is hard-coded; adjust it to the folder
# that holds the course data files on your machine.
setwd("C:/R_Quito/")
Veg <- read.table(file = "Vegetation2.txt", header = TRUE)

# Check what the data look like:
names(Veg)
str(Veg)

# Species richness overall and per transect - the long way!
m <- mean(Veg$R)
m1 <- mean(Veg$R[Veg$Transect == 1])
m2 <- mean(Veg$R[Veg$Transect == 2])
m3 <- mean(Veg$R[Veg$Transect == 3])
m4 <- mean(Veg$R[Veg$Transect == 4])
m5 <- mean(Veg$R[Veg$Transect == 5])
m6 <- mean(Veg$R[Veg$Transect == 6])
m7 <- mean(Veg$R[Veg$Transect == 7])
m8 <- mean(Veg$R[Veg$Transect == 8])
c(m, m1, m2, m3, m4, m5, m6, m7, m8)

# The variable m contains the mean richness of all 8 transects, and m1 through
# m8 show the mean richness values per transect. Note that the mean command is
# applied to Veg$R, which is a vector of data. It is not a matrix; hence there
# is no need for a comma between the square brackets.

# Mean species richness per transect - a better way!
# The R function tapply performs the same operation as the code above (for m1
# through m8), but with a single line of code:
tapply(Veg$R, Veg$Transect, mean)

# or, with the arguments named explicitly:
tapply(X = Veg$R, INDEX = Veg$Transect, FUN = mean)

# The tapply function splits the data of the first variable (R), based on the
# levels of the second variable (Transect). To each subgroup of data, it
# applies a function, in this case the mean, but we can also use the standard
# deviation (function sd), variance (function var), length (function length),
# and so on. The following lines of code calculate some of these functions for
# the vegetation data.
Me <- tapply(Veg$R, Veg$Transect, mean)
Sd <- tapply(Veg$R, Veg$Transect, sd)
Le <- tapply(Veg$R, Veg$Transect, length)
cbind(Me, Sd, Le)

# Each row in the output gives the mean richness, standard deviation, and
# number of observations per transect.
#################### sapply & lapply ##########################

# To calculate the mean, minimum, maximum, standard deviation, and length of
# the full series, we still need to use mean(Veg$R), min(Veg$R), max(Veg$R),
# sd(Veg$R), and length(Veg$R). This is laborious if we wish to calculate the
# mean of a large number of variables such as all the numerical variables of
# the vegetation data. We specifically say "numerical" as one cannot calculate
# the mean of a factor. There are 20 numerical variables in the vegetation
# dataset, columns 5-25 of the data frame Veg. However, we do not need to type
# in the mean command 20 times. R provides other functions similar to tapply
# to address this situation: lapply and sapply.
# The example below uses columns 5 to 9.
sapply(Veg[, 5:9], FUN = mean)

# It is important to realise that tapply calculates the mean (or any other
# function) for subsets of observations of a variable, whereas lapply and
# sapply calculate the mean (or any other function) of one or more variables,
# using all observations.

# The word FUN stands for function, and must be written in capitals. Instead
# of the mean, you can use any other function as an argument for FUN, and you
# can write your own functions. So what is the difference between sapply and
# lapply? The major difference lies in the presentation of the output, as can
# be seen in the following example.
lapply(Veg[, 5:9], FUN = mean)

# The output of lapply is presented as a list, whereas sapply gives it as a
# vector. The choice depends on the format in which you would like the output.

# To get one result per variable, the data passed to lapply and sapply need to
# be a data frame (its columns are the elements that FUN is applied to).
# The following call runs, but does not give the intended result:
sapply(
  cbind(Veg$R, Veg$ROCK, Veg$LITTER, Veg$ML, Veg$BARESOIL),
  FUN = mean
)

# It will produce one long vector of data, because the output of the cbind
# command is a matrix, not a data frame, so FUN is applied to every single
# value instead of to every column.
# It can easily be changed to a data frame:
sapply(
  data.frame(cbind(Veg$R, Veg$ROCK, Veg$LITTER, Veg$ML, Veg$BARESOIL)),
  FUN = mean
)

# Note that we have lost the variable labels. To avoid this, make a proper
# data frame before running the sapply function. Alternatively, use the
# colnames function after combining the data with the cbind function.

################################### summary ##############################

# Another function that gives basic information on variables is the summary
# command. The argument can be a variable, the output from a cbind command,
# or a data frame. It is run by the following commands.
Z <- cbind(Veg$R, Veg$ROCK, Veg$LITTER)
colnames(Z) <- c("R", "ROCK", "LITTER")
summary(Z)

# The summary command gives the minimum, first quartile, median, mean,
# third quartile, and maximum value of the variable. Alternative R code that
# selects the columns straight from the data frame, by name:
summary(Veg[, c("R", "ROCK", "LITTER")])

# or by position (assumes R, ROCK and LITTER are columns 5 to 7 of Veg):
summary(Veg[, c(5, 6, 7)])

################################### table ##################################

# The table function can be used to learn how many animals per farm were
# sampled, as well as the number of observations per sex and year. The
# following code imports the data, and shows the results.
# NOTE(review): the working directory is hard-coded; adjust it to the folder
# that holds the course data files on your machine.
setwd("c:/R_Quito/")

# stringsAsFactors = TRUE is spelled out because, since R 4.0.0, read.table no
# longer converts character columns to factors by default; without it Farm
# would be imported as plain character strings.
Deer <- read.table(
  file = "Deer.txt",
  header = TRUE,
  stringsAsFactors = TRUE
)
names(Deer)
str(Deer)

# Farm has been coded as AL, AU, and so on, and is imported as a factor.
# The other variables are all vectors of numerical or integer values.

# The number of observations per farm is obtained by
table(Deer$Farm)

# and the number of observations per sex and year by
table(Deer$Sex, Deer$Year)

##### DO THE EXERCISES !! #######