# R for beginners
# PUCE, Quito, 5-8 January 2010
# Simon Queenborough

# THURSDAY: class 12

######### CLASSICAL TESTS ##################

# TWO SAMPLES
#
# The classical tests for two samples include:
# • comparing two variances (Fisher’s F test, var.test)
# • comparing two sample means with normal errors (Student’s t test, t.test)
# • comparing two means with non-normal errors (Wilcoxon’s rank test,
#   wilcox.test)
# • comparing two proportions (the binomial test, prop.test)
# • correlating two variables (Pearson’s or Spearman’s rank correlation,
#   cor.test)
# • testing for independence of two variables in a contingency table
#   (chi-squared, chisq.test, or Fisher’s exact test, fisher.test).

# t.test
# A notched boxplot is useful for graphically showing the difference.
gardenA <- c(3, 4, 4, 3, 2, 3, 1, 3, 5, 2)
gardenB <- c(5, 5, 6, 7, 4, 4, 3, 5, 6, 5)

# With the default var.equal = FALSE this is Welch's two-sample t test.
t.test(gardenA, gardenB)

# wilcox.test
# Rank-based alternative to the t test for the same two samples.
wilcox.test(gardenA, gardenB)

# paired t test
# streams: each position in `down` and `up` belongs to the same pair, so the
# two vectors must stay in the same order.
down <- c(20, 15, 10, 5, 20, 15, 10, 5, 20, 15, 10, 5, 20, 15, 10, 5)
up <- c(23, 16, 10, 4, 22, 15, 12, 7, 21, 16, 11, 5, 22, 14, 10, 6)

# First ignoring the pairing (two independent samples) ...
t.test(down, up)
# ... then using it: the test is carried out on the within-pair differences.
t.test(down, up, paired = TRUE)

# sign test
# This is one of the simplest of all statistical tests. Suppose that you
# cannot measure a difference, but you can see it (e.g. in judging a diving
# contest). For example, nine springboard divers were scored as better or
# worse, having trained under a new regime and under the conventional regime
# (the regimes were allocated in a randomized sequence to each athlete: new
# then conventional, or conventional then new). Divers were judged twice: one
# diver was worse on the new regime, and 8 were better. What is the evidence
# that the new regime produces significantly better scores in competition?
# The answer comes from a two-tailed binomial test. How likely is a response
# of 1/9 (or 8/9 or more extreme than this, i.e. 0/9 or 9/9) if the
# populations are actually the same (i.e. p = 0.5)?
# We use a binomial test for this, specifying the number of ‘failures’ (1)
# and the total sample size (9):
binom.test(1, 9)

# prop.test
# successes, totals
prop.test(c(4, 196), c(40, 3270))

# chi square test
# 2 x 2 contingency table, filled column by column.
count <- matrix(c(38, 14, 11, 51), nrow = 2)
count
chisq.test(count)
# Expected cell counts under independence.
chisq.test(count)$expected

# unequal probabilities
# Goodness of fit of four observed counts against the proportions in `p`.
chisq.test(c(10, 3, 2, 6), p = c(0.2, 0.2, 0.3, 0.3))

# using table
# Simulate 100 rolls of a die, tabulate the faces and test the table.
die <- ceiling(runif(100, 0, 6))
table(die)
chisq.test(table(die))

# Fisher's Exact Test
x <- matrix(c(6, 4, 2, 8), nrow = 2)
x
fisher.test(x)

###### correlation coefficient ###
# Reads twosample.txt from the working directory; the code below uses its
# columns x and y.
two <- read.table("twosample.txt", header = TRUE)
plot(two$x, two$y)
cor(two$x, two$y)
cor.test(two$x, two$y)
cor.test(two$x, two$y, method = "spearman")

# Kolmogorov–Smirnov test
#
# • Are two sample distributions the same, or are they significantly
#   different from one another in one or more (unspecified) ways?
# • Does a particular sample distribution arise from a particular
#   hypothesized distribution?
ks.test(two$x, two$y)

#### bootstrapping ####
# We want to use bootstrapping to obtain a 95% confidence interval for the
# mean of a vector of numbers called values:
skew <- read.table("skewdata.txt", header = TRUE)
names(skew)

# We shall sample with replacement from values using
# sample(values, replace = TRUE), then work out the mean, repeating this
# operation 10 000 times, and storing the 10 000 different mean values in a
# vector called ms:
ms <- numeric(10000)
for (i in seq_along(ms)) {
  ms[i] <- mean(sample(skew$values, replace = TRUE))
}

# The answer to our problem is provided by the quantile function applied to
# ms: we want to know the values of ms associated with the 0.025 and the
# 0.975 tails of ms
quantile(ms, c(0.025, 0.975))

# so the intervals below and above the mean are
mean(skew$values) - quantile(ms, c(0.025, 0.975))

################################################################################
#
# EXERCISES
#
################################################################################

# 12.1
# make a notched boxplot of the garden
# data used in the t test

# 12.2
# In an effort to increase student retention, many colleges have tried block
# programs. Suppose 100 students are broken into two groups of 50 at random.
# One half are in a block program, the other half not. The number of years in
# attendance is then measured. We wish to test if the block program makes a
# difference in retention.
# The data is:
#   Program     1yr  2yr  3yr  4yr  5+yrs.
#   Non-Block    18   15    5    8      4
#   Block        10    5    7   18     10
# Do a test of hypothesis to decide if there is a difference between the two
# types of programs in terms of retention.

# 12.3
# A fish survey is done to see if the proportion of fish types is consistent
# with previous years. Suppose, the 3 types of fish recorded: parrotfish,
# grouper, tang are historically in a 5:3:4 proportion and in a survey the
# following counts are found
#   observed   53   22   49

# 12.4
# The R dataset UCBAdmissions contains data on admission to UC Berkeley by
# gender. We wish to investigate if the distribution of males admitted is
# similar to that of females.
# To do so, we need to first do some spade work as the data set is presented
# in a complex contingency table. The ftable (flatten table) command is
# needed. To use it try
#   data(UCBAdmissions)        # read in the dataset
#   x = ftable(UCBAdmissions)  # flatten
#   x                          # what is there
# We want to compare rows 1 and 2. Treating x as a matrix, we can access these
# with x[1:2,].
# Do a test for homogeneity between the two rows. What do you conclude? Repeat
# for the rejected group.

# 12.5
# An exit poll by a news station of 900 people in the state of Florida found
# 440 voting for Bush and 460 voting for Gore. Does the data support the
# hypothesis that Bush received p = 50% of the state's vote?

# 12.6
# Load the dataset blood (below). Do a significance test for equivalent
# centers. Which one did you use and why?
# What was the p-value?
# Data set for exercise 12.6: 15 rows with two numeric columns, Machine and
# Expert. Built with structure() so the row names are stored as the character
# labels "1" to "15".
blood <- structure(
  list(
    Machine = c(
      68, 82, 94, 106, 92, 80, 76, 74,
      110, 93, 86, 65, 74, 84, 100
    ),
    Expert = c(
      72, 84, 89, 100, 97, 88, 84, 70,
      103, 84, 86, 63, 69, 87, 93
    )
  ),
  row.names = as.character(1:15),
  class = "data.frame"
)