##### Lesson 70: Understanding Estimators part II #####

# Setting the working directory 
setwd(“provide the path to your folder“)

library(locfit)
library(logspline)

# Reading the candy data file #
candy_data = read.table("candy_samples.txt",header=F)

hist(as.matrix(candy_data))

candy_combine = as.numeric(as.matrix(candy_data))
dum = which(candy_combine < 1.5)

dummy_candies = candy_combine[dum]
actual_candies = candy_combine[-dum]

hist(dummy_candies)
hist(actual_candies)

true_mean = mean(actual_candies)
true_var = (1/length(actual_candies))*sum((actual_candies - true_mean)^2) # true variance = (n-1/n)*variance function which is biased corrected

true_sigma = sqrt(true_var) # or (sqrt(n-1/n)*sd(x)) 

true_p = length(dummy_candies)/length(candy_combine)

############### Wilfred Quality Jr. - Sampling Experiment ###############

# Graphic analysis for proportion # 

# For each store, compute the sample proportion #
# The expected value of these estimates is the true parameter # 
# With each store visited, we can recompute the sample proportion #
# This will converge and stabilize at the true paraemter value # 

# Reading the candy data file #
candy_data = read.table("candy_samples.txt",header=F)

nstores = ncol(candy_data)    # number of stores 
ncandies = nrow(candy_data)   # number of candies in a pack

candies = NULL  # an empty vector that will be filled with actual cadies data from each store
dummies = NULL  # an empty vector that will be filled with dummy candies data from each store

sample_proportion = NULL # as the sample increases, we re-compute the sample proportion of dummy candies

sample_p = matrix(NA,nrow=nstores,ncol=1) # saving the estimate from each store. The expected value of this vector = truth

plot(0,0,col="white",xlim=c(0,1300),ylim=c(0,0.08),xlab="Stores",ylab="Proportion of Dummies",font=2,font.lab=2)

for (i in 1:nstores)
      {
        # sample pack of candies and the index of dummies in the pack
        candy_pack = candy_data[,i]
        dummy_index = which(candy_pack < 1.5)
        
        # separating candies from the dummies in each pack 
        sample_dummies = candy_pack[dummy_index]
        
        if(sum(dummy_index)>0){sample_candies = candy_pack[-dummy_index]}else{sample_candies = candy_pack}
        
        # recording the weights on candies and dummies -- this vector will grow as we visit more stores, i.e., collect more samples 
        dummies = c(dummies,sample_dummies)
        candies = c(candies,sample_candies)
        
        # computing the mean weight of the candies -- this vector will grow .. each incremental value is a better estimate with greater sample size. 
        p = length(dummies)/(ncandies*i)
        sample_proportion = c(sample_proportion,p)
        
        # storing the sample weight for the store in a matrix 
        sample_p[i,1] = length(dummy_index)/ncandies
        
        # plot showing the trace of the sample proportion -- converges to truth
        points(i,sample_proportion[i],cex=0.075,col="blue")
        points(1000,sample_p[i],cex=0.1,col="red")
        Sys.sleep(0.05)
        #print(i)
      }

Sys.sleep(1)
points(1000,mean(sample_p),pch=19,col="red",cex=2)
Sys.sleep(1)
boxplot(sample_p,add=T,at=1050,boxwex=25,range=0)
Sys.sleep(1)
text(1200,mean(sample_p),expression(E(hat(p))==0.02),col="red",font=2)


# Graphic analysis for mean # 

# For each store, compute the sample mean #
# The expected value of these estimates is the true parameter # 
# With each store visited, we can recompute the sample mean #
# This will converge and stabilize at the true paraemter value # 

nstores = ncol(candy_data)    # number of stores 
ncandies = nrow(candy_data)   # number of candies in a pack

candies = NULL  # an empty vector that will be filled with actual cadies data from each store
dummies = NULL  # an empty vector that will be filled with dummy candies data from each store

sample_meanweight = NULL # as the sample increases, we re-compute the sample mean of the candies

sample_mean = matrix(NA,nrow=nstores,ncol=1) # save the value of sample mean for each store

plot(0,0,xlim=c(0,1200),ylim=c(2.945,3.055),xlab="Stores",ylab="Average Candy Weight",font=2,font.lab=2)

for (i in 1:nstores)
{
  # sample pack of candies and the index of dummies in the pack
  candy_pack = candy_data[,i]
  dummy_index = which(candy_pack < 1.5)
  
  # separating candies from the dummies in each pack 
  sample_dummies = candy_pack[dummy_index]
  
  if(sum(dummy_index)>0){sample_candies = candy_pack[-dummy_index]}else{sample_candies = candy_pack}
  
  # recording the weights on candies and dummies -- this vector will grow as we visit more stores, i.e., collect more samples 
  dummies = c(dummies,sample_dummies)
  candies = c(candies,sample_candies)
  
  # computing the mean weight of the candies -- this vector will grow .. each incremental value is a better estimate with greater sample size. 
  xbar = mean(candies)
  sample_meanweight = c(sample_meanweight,xbar)
  
  # storing the sample weight for the store in a matrix 
  sample_mean[i,1] = mean(sample_candies)
  
  # plot showing the trace of the sample mean -- converges to truth
  points(i,sample_meanweight[i],cex=0.15,col="orange")
  points(1000,sample_mean[i],cex=0.15,col="red")
  Sys.sleep(0.05)
  print(i)
}

Sys.sleep(1)
points(1000,mean(sample_mean),pch=19,col="red",cex=2)
Sys.sleep(1)
boxplot(sample_mean,add=T,range=0,boxwex=25,at=1050)
Sys.sleep(1)
text(1150,mean(sample_mean),expression(E(hat(mu))==3),col="red",font=2)


# Graphic analysis for variance # 

# For each store, compute the sample variance #
# The expected value of these estimates is the true parameter # 
# With each store visited, we can recompute the sample variance #
# This will converge and stabilize at the true paraemter value # 

nstores = ncol(candy_data)    # number of stores 
ncandies = nrow(candy_data)   # number of candies in a pack

candies = NULL  # an empty vector that will be filled with actual cadies data from each store
dummies = NULL  # an empty vector that will be filled with dummy candies data from each store

sample_varianceofweight = NULL # as the sample increases, we re-compute the sample variance of the candies

sample_variance = matrix(NA,nrow=nstores,ncol=1)
sample_variance_unbiased = matrix(NA,nrow=nstores,ncol=1)

plot(0,0,xlim=c(0,1450),ylim=c(0.035,0.045),xlab="Stores",ylab="Variance of Candy Weight",font=2,font.lab=2)

for (i in 1:nstores)
{
  # sample pack of candies and the index of dummies in the pack
  candy_pack = candy_data[,i]
  dummy_index = which(candy_pack < 1.5)
  
  # separating candies from the dummies in each pack 
  sample_dummies = candy_pack[dummy_index]
  
  if(sum(dummy_index)>0){sample_candies = candy_pack[-dummy_index]}else{sample_candies = candy_pack}
  
  # recording the weights on candies and dummies -- this vector will grow as we visit more stores, i.e., collect more samples 
  dummies = c(dummies,sample_dummies)
  candies = c(candies,sample_candies)
  
  # computing the variance of weight of the candies -- this vector will grow .. each incremental value is a better estimate with greater sample size. 
  s_sq = (1/(length(candies)))*sum((candies-mean(candies))^2)
  sample_varianceofweight = c(sample_varianceofweight,s_sq)
  
  # storing the sample weight for the store in a matrix 
  sample_variance[i,1] = (1/length(sample_candies))*sum((sample_candies-mean(sample_candies))^2)
  sample_variance_unbiased[i,1] = var(sample_candies)
  
  # plot showing the trace of the sample mean -- converges to truth
  points(i,sample_varianceofweight[i],cex=0.1,col="magenta")
  points(1000,sample_variance[i],cex=0.15,col="red")
  Sys.sleep(0.05)
  print(i)
}

Sys.sleep(1)
points(1000,mean(sample_variance),pch=19,col="red",cex=1)
Sys.sleep(1)
boxplot(sample_variance,add=T,range=0,boxwex=25,at=1050)
Sys.sleep(1)
points(1000,true_var,pch=22,col="red",cex=1)
Sys.sleep(1)
text(1350,mean(sample_variance),expression(E(hat(sigma^2))!=0.0401),col="red",font=2)
Sys.sleep(1)
text(900,round((true_var),2)-0.0002,"bias",col="red",font=2)
boxplot(sample_variance_unbiased,add=T,range=0,boxwex=25,at=1100)
points(1000,mean(sample_variance_unbiased),pch=19,col="green",cex=1)


# Graphic analysis for standard deviation # 

# For each store, compute the sample standard deviation #
# The expected value of these estimates is the true parameter # 
# With each store visited, we can recompute the sample mean, sample sd and sample proportion #
# This will converge and stabilize at the true paraemter value # 

nstores = ncol(candy_data)    # number of stores 
ncandies = nrow(candy_data)   # number of candies in a pack

candies = NULL  # an empty vector that will be filled with actual cadies data from each store
dummies = NULL  # an empty vector that will be filled with dummy candies data from each store

sample_sdofweight = NULL # as the sample increases, we re-compute the sample variance of the candies

sample_sd = matrix(NA,nrow=nstores,ncol=1)
sample_sd_unbiased = matrix(NA,nrow=nstores,ncol=1)

plot(0,0,xlim=c(0,1400),ylim=c(0.18,0.22),xlab="Stores",ylab="Standard Deviation of Candy Weight",font=2,font.lab=2)

for (i in 1:nstores)
{
  # sample pack of candies and the index of dummies in the pack
  candy_pack = candy_data[,i]
  dummy_index = which(candy_pack < 1.5)
  
  # separating candies from the dummies in each pack 
  sample_dummies = candy_pack[dummy_index]
  
  if(sum(dummy_index)>0){sample_candies = candy_pack[-dummy_index]}else{sample_candies = candy_pack}
  
  # recording the weights on candies and dummies -- this vector will grow as we visit more stores, i.e., collect more samples 
  dummies = c(dummies,sample_dummies)
  candies = c(candies,sample_candies)
  
  # computing the variance (sd) of weight of the candies -- this vector will grow .. each incremental value is a better estimate with greater sample size. 
  s_sq = (1/(length(candies)))*sum((candies-mean(candies))^2)
  s = sqrt(s_sq)
  sample_sdofweight = c(sample_sdofweight,s)
  
  # storing the sample weight for the store in a matrix 
  sample_sd[i,1] = sqrt((1/length(sample_candies))*sum((sample_candies-mean(sample_candies))^2))
  sample_sd_unbiased[i,1] = sd(sample_candies)

  # plot showing the trace of the sample mean -- converges to truth
  points(i,sample_sdofweight[i],cex=0.1)
  points(1000,sample_sd[i],cex=0.15,col="red")
  Sys.sleep(0.05)
  print(i)
}

Sys.sleep(1)
points(1000,mean(sample_sd),pch=19,col="red",cex=1)
Sys.sleep(1)
boxplot(sample_sd,add=T,range=0,boxwex=25,at=1050)
Sys.sleep(1)
points(1000,true_sigma,pch=22,col="red",cex=1)
Sys.sleep(1)
text(1200,mean(sample_sd),expression(E(hat(sigma))!=0.200),col="red",font=2)
Sys.sleep(1)
text(900,round((true_sigma),2)-0.0006,"bias",col="red",font=2)
Sys.sleep(1)
boxplot(sample_sd_unbiased,add=T,range=0,boxwex=25,at=1100)
points(1000,mean(sample_sd_unbiased),pch=19,col="green",cex=1)