##### Lesson 70: Understanding Estimators part II ##### # Setting the working directory setwd(“provide the path to your folder“) library(locfit) library(logspline) # Reading the candy data file # candy_data = read.table("candy_samples.txt",header=F) hist(as.matrix(candy_data)) candy_combine = as.numeric(as.matrix(candy_data)) dum = which(candy_combine < 1.5) dummy_candies = candy_combine[dum] actual_candies = candy_combine[-dum] hist(dummy_candies) hist(actual_candies) true_mean = mean(actual_candies) true_var = (1/length(actual_candies))*sum((actual_candies - true_mean)^2) # true variance = (n-1/n)*variance function which is biased corrected true_sigma = sqrt(true_var) # or (sqrt(n-1/n)*sd(x)) true_p = length(dummy_candies)/length(candy_combine) ############### Wilfred Quality Jr. - Sampling Experiment ############### # Graphic analysis for proportion # # For each store, compute the sample proportion # # The expected value of these estimates is the true parameter # # With each store visited, we can recompute the sample proportion # # This will converge and stabilize at the true paraemter value # # Reading the candy data file # candy_data = read.table("candy_samples.txt",header=F) nstores = ncol(candy_data) # number of stores ncandies = nrow(candy_data) # number of candies in a pack candies = NULL # an empty vector that will be filled with actual cadies data from each store dummies = NULL # an empty vector that will be filled with dummy candies data from each store sample_proportion = NULL # as the sample increases, we re-compute the sample proportion of dummy candies sample_p = matrix(NA,nrow=nstores,ncol=1) # saving the estimate from each store. The expected value of this vector = truth plot(0,0,col="white",xlim=c(0,1300),ylim=c(0,0.08),xlab="Stores",ylab="Proportion of Dummies",font=2,font.lab=2) for (i in 1:nstores) { # sample pack of candies and the index of dummies in the pack candy_pack = candy_data[,i] dummy_index = which(candy_pack < 1.5) # separating candies from the dummies in each pack sample_dummies = candy_pack[dummy_index] if(sum(dummy_index)>0){sample_candies = candy_pack[-dummy_index]}else{sample_candies = candy_pack} # recording the weights on candies and dummies -- this vector will grow as we visit more stores, i.e., collect more samples dummies = c(dummies,sample_dummies) candies = c(candies,sample_candies) # computing the mean weight of the candies -- this vector will grow .. each incremental value is a better estimate with greater sample size. p = length(dummies)/(ncandies*i) sample_proportion = c(sample_proportion,p) # storing the sample weight for the store in a matrix sample_p[i,1] = length(dummy_index)/ncandies # plot showing the trace of the sample proportion -- converges to truth points(i,sample_proportion[i],cex=0.075,col="blue") points(1000,sample_p[i],cex=0.1,col="red") Sys.sleep(0.05) #print(i) } Sys.sleep(1) points(1000,mean(sample_p),pch=19,col="red",cex=2) Sys.sleep(1) boxplot(sample_p,add=T,at=1050,boxwex=25,range=0) Sys.sleep(1) text(1200,mean(sample_p),expression(E(hat(p))==0.02),col="red",font=2) # Graphic analysis for mean # # For each store, compute the sample mean # # The expected value of these estimates is the true parameter # # With each store visited, we can recompute the sample mean # # This will converge and stabilize at the true paraemter value # nstores = ncol(candy_data) # number of stores ncandies = nrow(candy_data) # number of candies in a pack candies = NULL # an empty vector that will be filled with actual cadies data from each store dummies = NULL # an empty vector that will be filled with dummy candies data from each store sample_meanweight = NULL # as the sample increases, we re-compute the sample mean of the candies sample_mean = matrix(NA,nrow=nstores,ncol=1) # save the value of sample mean for each store plot(0,0,xlim=c(0,1200),ylim=c(2.945,3.055),xlab="Stores",ylab="Average Candy Weight",font=2,font.lab=2) for (i in 1:nstores) { # sample pack of candies and the index of dummies in the pack candy_pack = candy_data[,i] dummy_index = which(candy_pack < 1.5) # separating candies from the dummies in each pack sample_dummies = candy_pack[dummy_index] if(sum(dummy_index)>0){sample_candies = candy_pack[-dummy_index]}else{sample_candies = candy_pack} # recording the weights on candies and dummies -- this vector will grow as we visit more stores, i.e., collect more samples dummies = c(dummies,sample_dummies) candies = c(candies,sample_candies) # computing the mean weight of the candies -- this vector will grow .. each incremental value is a better estimate with greater sample size. xbar = mean(candies) sample_meanweight = c(sample_meanweight,xbar) # storing the sample weight for the store in a matrix sample_mean[i,1] = mean(sample_candies) # plot showing the trace of the sample mean -- converges to truth points(i,sample_meanweight[i],cex=0.15,col="orange") points(1000,sample_mean[i],cex=0.15,col="red") Sys.sleep(0.05) print(i) } Sys.sleep(1) points(1000,mean(sample_mean),pch=19,col="red",cex=2) Sys.sleep(1) boxplot(sample_mean,add=T,range=0,boxwex=25,at=1050) Sys.sleep(1) text(1150,mean(sample_mean),expression(E(hat(mu))==3),col="red",font=2) # Graphic analysis for variance # # For each store, compute the sample variance # # The expected value of these estimates is the true parameter # # With each store visited, we can recompute the sample variance # # This will converge and stabilize at the true paraemter value # nstores = ncol(candy_data) # number of stores ncandies = nrow(candy_data) # number of candies in a pack candies = NULL # an empty vector that will be filled with actual cadies data from each store dummies = NULL # an empty vector that will be filled with dummy candies data from each store sample_varianceofweight = NULL # as the sample increases, we re-compute the sample variance of the candies sample_variance = matrix(NA,nrow=nstores,ncol=1) sample_variance_unbiased = matrix(NA,nrow=nstores,ncol=1) plot(0,0,xlim=c(0,1450),ylim=c(0.035,0.045),xlab="Stores",ylab="Variance of Candy Weight",font=2,font.lab=2) for (i in 1:nstores) { # sample pack of candies and the index of dummies in the pack candy_pack = candy_data[,i] dummy_index = which(candy_pack < 1.5) # separating candies from the dummies in each pack sample_dummies = candy_pack[dummy_index] if(sum(dummy_index)>0){sample_candies = candy_pack[-dummy_index]}else{sample_candies = candy_pack} # recording the weights on candies and dummies -- this vector will grow as we visit more stores, i.e., collect more samples dummies = c(dummies,sample_dummies) candies = c(candies,sample_candies) # computing the variance of weight of the candies -- this vector will grow .. each incremental value is a better estimate with greater sample size. s_sq = (1/(length(candies)))*sum((candies-mean(candies))^2) sample_varianceofweight = c(sample_varianceofweight,s_sq) # storing the sample weight for the store in a matrix sample_variance[i,1] = (1/length(sample_candies))*sum((sample_candies-mean(sample_candies))^2) sample_variance_unbiased[i,1] = var(sample_candies) # plot showing the trace of the sample mean -- converges to truth points(i,sample_varianceofweight[i],cex=0.1,col="magenta") points(1000,sample_variance[i],cex=0.15,col="red") Sys.sleep(0.05) print(i) } Sys.sleep(1) points(1000,mean(sample_variance),pch=19,col="red",cex=1) Sys.sleep(1) boxplot(sample_variance,add=T,range=0,boxwex=25,at=1050) Sys.sleep(1) points(1000,true_var,pch=22,col="red",cex=1) Sys.sleep(1) text(1350,mean(sample_variance),expression(E(hat(sigma^2))!=0.0401),col="red",font=2) Sys.sleep(1) text(900,round((true_var),2)-0.0002,"bias",col="red",font=2) boxplot(sample_variance_unbiased,add=T,range=0,boxwex=25,at=1100) points(1000,mean(sample_variance_unbiased),pch=19,col="green",cex=1) # Graphic analysis for standard deviation # # For each store, compute the sample standard deviation # # The expected value of these estimates is the true parameter # # With each store visited, we can recompute the sample mean, sample sd and sample proportion # # This will converge and stabilize at the true paraemter value # nstores = ncol(candy_data) # number of stores ncandies = nrow(candy_data) # number of candies in a pack candies = NULL # an empty vector that will be filled with actual cadies data from each store dummies = NULL # an empty vector that will be filled with dummy candies data from each store sample_sdofweight = NULL # as the sample increases, we re-compute the sample variance of the candies sample_sd = matrix(NA,nrow=nstores,ncol=1) sample_sd_unbiased = matrix(NA,nrow=nstores,ncol=1) plot(0,0,xlim=c(0,1400),ylim=c(0.18,0.22),xlab="Stores",ylab="Standard Deviation of Candy Weight",font=2,font.lab=2) for (i in 1:nstores) { # sample pack of candies and the index of dummies in the pack candy_pack = candy_data[,i] dummy_index = which(candy_pack < 1.5) # separating candies from the dummies in each pack sample_dummies = candy_pack[dummy_index] if(sum(dummy_index)>0){sample_candies = candy_pack[-dummy_index]}else{sample_candies = candy_pack} # recording the weights on candies and dummies -- this vector will grow as we visit more stores, i.e., collect more samples dummies = c(dummies,sample_dummies) candies = c(candies,sample_candies) # computing the variance (sd) of weight of the candies -- this vector will grow .. each incremental value is a better estimate with greater sample size. s_sq = (1/(length(candies)))*sum((candies-mean(candies))^2) s = sqrt(s_sq) sample_sdofweight = c(sample_sdofweight,s) # storing the sample weight for the store in a matrix sample_sd[i,1] = sqrt((1/length(sample_candies))*sum((sample_candies-mean(sample_candies))^2)) sample_sd_unbiased[i,1] = sd(sample_candies) # plot showing the trace of the sample mean -- converges to truth points(i,sample_sdofweight[i],cex=0.1) points(1000,sample_sd[i],cex=0.15,col="red") Sys.sleep(0.05) print(i) } Sys.sleep(1) points(1000,mean(sample_sd),pch=19,col="red",cex=1) Sys.sleep(1) boxplot(sample_sd,add=T,range=0,boxwex=25,at=1050) Sys.sleep(1) points(1000,true_sigma,pch=22,col="red",cex=1) Sys.sleep(1) text(1200,mean(sample_sd),expression(E(hat(sigma))!=0.200),col="red",font=2) Sys.sleep(1) text(900,round((true_sigma),2)-0.0006,"bias",col="red",font=2) Sys.sleep(1) boxplot(sample_sd_unbiased,add=T,range=0,boxwex=25,at=1100) points(1000,mean(sample_sd_unbiased),pch=19,col="green",cex=1)