##### Lesson 99: Two-Sample Hypothesis Tests in R ##### # Setting the working directory setwd("path to your folder") setwd(getwd()) # Read the data file nyc_trash_data = read.csv("DSNY_Monthly_Tonnage_Data.csv",header=T) # Extract February refuse data for Manhattan's community district 9 feb_index = which((nyc_trash_data$BOROUGH=="Manhattan") & (nyc_trash_data$COMMUNITYDISTRICT==9) & (nyc_trash_data$MONTH==2)) feb_refuse_data = nyc_trash_data$REFUSETONSCOLLECTED[feb_index] # Extract February refuse data for Manhattan's community district 9 aug_index = which((nyc_trash_data$BOROUGH=="Manhattan") & (nyc_trash_data$COMMUNITYDISTRICT==9) & (nyc_trash_data$MONTH==8)) aug_refuse_data = nyc_trash_data$REFUSETONSCOLLECTED[aug_index] # Visualizing the distributions of the two samples boxplot(cbind(feb_refuse_data,aug_refuse_data),horizontal=T, main="Refuse from Manhattan's Community District 9") text(1500,1,"February Tonnage",font=2) text(1500,2,"August Tonnage",font=2) p_threshold = 2500 # tons of refuse abline(v=p_threshold,lty=2,col="maroon") # Preliminaries # sample sizes n1 = length(feb_refuse_data) n2 = length(aug_refuse_data) # sample means x1bar = mean(feb_refuse_data) x2bar = mean(aug_refuse_data) # sample variances x1var = var(feb_refuse_data) x2var = var(aug_refuse_data) # sample proportions p1 = length(which(feb_refuse_datamean(yboot)){null_mean[i]=1} } pvalue_mean = sum(null_mean)/N hist(null_mean_ratio,font=2,main="Null Distribution Assuming H0 is True",xlab="Xbar/Ybar",font.lab=2) abline(v=1,lwd=2,lty=2) text(0.95,1000,paste("p-value=",pvalue_mean),col="red") # Hypothesis Test on the Equality of Variances #1. F-Test f0 = x1var/x2var df_numerator = n1-1 df_denominator = n2-1 pval = 1-pf(f0,df1=df_numerator,df2=df_denominator) print(f0) print(df_numerator) print(df_denominator) print(pval) var.test(feb_refuse_data,aug_refuse_data,alternative = "two.sided") #2. Bootstrap N = 10000 null_var = matrix(0,nrow=N,ncol=1) null_var_ratio = matrix(0,nrow=N,ncol=1) for(i in 1:N) { xboot = sample(feb_refuse_data,replace=T) yboot = sample(aug_refuse_data,replace=T) null_var_ratio[i] = var(xboot)/var(yboot) if(var(xboot)>var(yboot)){null_var[i]=1} } pvalue_var = sum(null_var)/N hist(null_var_ratio,font=2,main="Null Distribution Assuming H0 is True",xlab="XVar/YVar",font.lab=2) abline(v=1,lwd=2,lty=2) text(2,500,paste("p-value=",pvalue_var),col="red") # Hypothesis Test on the Difference in Proportions #1. Fisher's Exact Test x1 = length(which(feb_refuse_datap2boot){null_prop[i]=1} } pvalue_prop = sum(null_prop)/N hist(null_prop_ratio,font=2,main="Null Distribution Assuming H0 is True",xlab="P1/P2",font.lab=2) abline(v=1,lwd=2,lty=2) text(2,250,paste("p-value=",pvalue_prop),col="red")