# illustration of central limit theorem using a UNIFORM random distribution
# To execute this script in R:
# 1) open R and set the working directory to the folder that contains this file.
# 2) at the command prompt, enter source("cltunif.R")
# The script will plot the log-normal distribution and then use 3 plots to examine the distribution of sample means:
# - the expected normal distribution drawn on top of a histogram of the sample means
# - a quantile-quantile QQ plot of the sample means: the points should fall along a straight lines if the means are distributed normally
# - a boxplot of the sample means: should be symmetrical and about 5% of the points should be "outliers"
# Note: Only a very small sample size is required for the sample means to follow a normal distribution.
# library(car) # need this package to call n.bins
# library(MASS)
print("central limit theorem using uniform random distribution")
N <- 5000; # number of samples drawn from the population
# sample size:
minsamplesize<-2;
maxsamplesize<-20;
m<-2; # distribution mean
sd<-3; # distribution standard deviation
mu<-m
# usually, a uniform random distribution has lower and upper limits of zero & one,
# a mean of 0.5, and a variance of 1/12...
# to get the correct mean and sd, we need to adjust limits of uniform distribution
d<-(sqrt(12)*sd)/2
rmin<-m-d # lower limit
rmax<-m+d # upper limit
# print mean & sd to make sure that they are correct...
tmp<-runif(10000,rmin,rmax)
print(paste("expected mean = ",m,"; obtained mean = ",mean(tmp)))
print(paste("expected sd = ",sd,"; obtained sd = ",sd(tmp)))
if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8)
# plot the distribution...
if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # create window
x<-seq((rmin-1),(rmax+1),len=500) # sequence of 500 x values ranging from rmin to rmax
y<-dunif(x,rmin,rmax) # calculate the uniform density at those x values
plot(x,y,main="uniform density") # scatter plot
lines(x,y) # add lines
# now plot distribution of sample means:
if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # create another window
par(mfrow=c(1,3)) # tell R to divide window into 1x3 plots
for (k in minsamplesize:maxsamplesize)
{
sampsize <- k; # current sample size
thenumbers<-runif(N* sampsize,min=rmin,max=rmax); # draw random numbers from population
thedata<-matrix(thenumbers,N,sampsize) # reformat numbers into N samples of k values
sample.means<-rowMeans(thedata); # clac the mean of each row
# now draw a histogram
hist(sample.means,breaks="FD",ylim=c(0,1.0),
main=paste("sample size = ",k),prob=TRUE,col="lemonchiffon") # draw histogram of means
# truehist(sample.means,ylim=c(0,1),col="lemonchiffon") # draw histogram of means
# to the histogram, add the normal distribution of means
# in other words, we will draw what we expect to get if the original numbers are
# drawn from a normal distribution
pu <-par("usr")[1:2] # fancy-pants R language for getting the range of histogram's x-axis
x <- seq(pu[1],pu[2],len=500) # make a sequence of x values
y <- dnorm(x,mean=m,sd=(sd/sqrt(sampsize)) ) # calc normal density at those x values
lines(x,y,col="red") # draw it on top of the histogram
# next, draw a qqplot to visualize departures from normality
qqnorm(sample.means,main="QQ Plot (should be straight line)")
qqline(sample.means)
# finally, draw a boxplot to look for asymmetry and outliers
boxplot(sample.means,main="should be symmetrical")
Sys.sleep(1)
# now calculate t-distribution when sample size = k
sample.sd<-apply(thedata,MARGIN=1,sd) # calculate the sd for each row/sample
sample.se<-sample.sd/sqrt(sampsize) # the STANDARD ERROR is sd/sqrt(sample size)
# the t-value for each sample is defined as the sample mean minus the population mean (i.e., mu) divided by the standard error
sample.t<-(sample.means-mu)/sample.se
type1.error.rate=0.05
critical.t<-qt(type1.error.rate,df=(sampsize-1)); # critical value of t for a 1-tailed test that the sample mean is less than mu;
# how many of our samples actually exceeded the critical t value?
num.type1.errors<-length(sample.t[sample.t<=critical.t])
p.type1.error<-num.type1.errors/length(sample.means)
print(" ")
print("---------- TYPE I ERROR RATE ----------------")
print(paste("sample size = ", sampsize))
print(paste("expected type I error rate = ",type1.error.rate,"; observed type I error rate = ",p.type1.error))
}