Machine Learning/brief statistics r

From Noisebridge
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

R code from A Brief Tour of Statistics

b_prob = c(0.05,0.5,0.35,0.1)
b_names = c("swish","basket","brick","air ball")
b_colors = c(2,2,1,1)
b_in_prob = c(0.55,0.45)
b_in_names = c("in","out")
b_in_colors = c(2,1)

png("basketball_shots.png")
barplot(b_prob,names.arg=b_names,legend.text=TRUE,ylim=c(0,1),col=b_colors,ylab="Probability")
dev.off()

png("basketball_in.png")
barplot(b_in_prob,names.arg=b_in_names,legend.text=TRUE,ylim=c(0,1),col=b_in_colors,ylab="Probability")
dev.off()


apple_lambda = 1181.25/(8*30)  # based on http://answers.yahoo.com/question/index?qid=20090205070515AAWn8PL 7.5 bushels * 45 lbs./bushel * 3.5 apples/lb.
				# growing season around 8 months
apple_range = (qpois(0.001,lambda=apple_lambda):qpois(0.999,lambda=apple_lambda))
apple_prob = dpois(apple_range,lambda=apple_lambda)

png("apple_distribution.png")
barplot(apple_prob,names.arg=apple_range,legend.text=TRUE,xlab="Apples per tree per day",ylab="Probability")
dev.off()

# Bus waiting times are Poisson
bus_lambda = 10
bus_range = (0:qpois(0.999,lambda=bus_lambda))
bus_prob = dpois(bus_range,lambda=bus_lambda)
png("bus_distribution.png",width=10*300,height=2*300)
barplot(bus_prob,names.arg=bus_range,legend.text=TRUE,xlab="Wait time",ylab="Probability")
dev.off()

# Rainfall is lognormal
avg_rainfall = 4.4
rain_amt = seq(0,50,by=0.01)
rain_prob = dlnorm(rain_amt,meanlog=log(avg_rainfall))

png("rain_distribution.png")
plot(rain_amt,rain_prob,type="l",xlab="Rain (inches)",ylab="Probability density")
dev.off()

png("rain_distribution_1020.png")
plot(rain_amt,rain_prob,type="l",xlab="Rain (inches)",ylab="Probability density")
for(rain in (seq(10,20,by=0.01))) {
	points(rep(rain,2),c(0,dlnorm(rain,meanlog=log(avg_rainfall))),type="l",col="grey")
}
points(rain_amt,rain_prob,type="l")
dev.off()


##  Balls in bins

ball_color=c("yellow","green","blue","red","white")
ball_label=c("yellow","green","blue","red","child")
ball_probability=c(0.32,0.24,0.18,0.16,0.1)
png("ball_distribution.png")
barplot(ball_probability,names.arg=ball_label,legend.text=TRUE,xlab="Ball color",ylab="Probability",col=ball_color)
dev.off()

png("ball_distribution2.png")
barplot(c(0.32,0.68),names.arg=c("yellow","non-yellow"),legend.text=TRUE,xlab="Ball color",ylab="Probability",col=c("yellow","white"))
dev.off()

## Distribution of number of baskets
n = 15
basket_range = (qbinom(0.001,size=n,prob=b_in_prob[1]):qbinom(0.999,size=n,prob=b_in_prob[1]))
basket_prob = dbinom(basket_range,size=n,prob=b_in_prob[1])
png(sprintf("basket_distribution_%d.png",n))
barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%d shots",n))
dev.off()

n = 30
basket_range = (qbinom(0.001,size=n,prob=b_in_prob[1]):qbinom(0.999,size=n,prob=b_in_prob[1]))
basket_prob = dbinom(basket_range,size=n,prob=b_in_prob[1])
png(sprintf("basket_distribution_%d.png",n))
barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%d shots",n))
dev.off()

horse_data=c(109,65,22,3,1)
png("horse_kick.png")
b=barplot(horse_data,names.arg=(0:4),legend.text=TRUE,xlab="Number of Deaths from Horse Kick",ylab="Occurrences")
mean_death = sum(horse_data*(0:4))/sum(horse_data)
death_points = (0:4)
#points(b,sum(horse_data)*dpois(death_points,lambda=mean_death),col=2,pch=19)
#points(b,sum(horse_data)*dpois(death_points,lambda=mean_death),type="l",col=2)
dev.off()

#
png("uniform.png")
barplot(rep(0.2,5),names.arg=(0:4),legend.text=TRUE,ylab="Probability",ylim=c(0,1))
dev.off()

## Ways of estimating

# distribution of the mean

n = 38
p = 21/38

basket_range = (qbinom(0.001,size=n,prob=p):qbinom(0.999,size=n,prob=p))
basket_prob = dbinom(basket_range,size=n,prob=p)
png(sprintf("estimate_distribution_%0.2f_%d.png",p,n))
barplot(basket_prob,names.arg=sprintf("%0.2f",basket_range/n),legend.text=TRUE,xlab="Estimated Field Goal Rate",ylab="Probability",main=sprintf("%0.2f Field Goal rate",p))
dev.off()


## Determining basket shooting percentage

n = 38
p = 0.5

for (p in c(0.25,0.5,0.75)) {

basket_range = (qbinom(0.001,size=n,prob=p):qbinom(0.999,size=n,prob=p))
basket_prob = dbinom(basket_range,size=n,prob=p)
png(sprintf("field_goal_distribution_%0.2f_%d.png",p,n))
barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%0.2f field goal rate",p))
dev.off()

print(sprintf("%0.2f: %s of %d",p,paste(rbinom(5,size=n,prob=p),sep=", "),n))
}

# 38 shots, make 21
basket_range = (qbinom(0.001,size=n,prob=p):qbinom(0.999,size=n,prob=p))
basket_prob = dbinom(basket_range,size=n,prob=p)
png(sprintf("field_goal_distribution_%0.2f_%d.png",p,n))
barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%0.2f field goal rate",p))
dev.off()


# confidence interval and p-value

for (p in seq(0,1,by=0.01)) {
	print(sprintf("%0.2f: P(>=21 baskets) = %0.4f",p,1-pbinom(20,size=n,prob=p)))
}

for (p in c(0.30,0.34,0.41,0.44,0.47)) {
	basket_range = (0:38)
	basket_prob = dbinom(basket_range,size=n,prob=p)
	basket_color = rep(1,39)
	basket_color[basket_range >= 21] = 2
	png(sprintf("field_goal_pval_%0.2f_%d.png",p,n))
	barplot(basket_prob, names.arg=basket_range, legend.text = TRUE, xlab="Number of baskets",ylab="Probability",main=sprintf("%0.2f field goal rate",p),col=basket_color)
	dev.off()
}


# CLT

p = 21/38
for (n in c(10,100,1000)) {
basket_range = (qbinom(0.001,size=n,prob=b_in_prob[1]):qbinom(0.999,size=n,prob=b_in_prob[1]))
basket_prob = dbinom(basket_range,size=n,prob=b_in_prob[1])
png(sprintf("basket_distribution_clt_%d.png",n))
barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%d shots",n))
dev.off()
}


# Regression

n = 100000

x = rpois(n,lambda=17)
y = 32.7 + x*12.3 + rnorm(n, mean=0, sd=50)

my_lm = lm(y~x)
sorted_x = unique(x[order(x)])
predicted = predict(my_lm,newdata=data.frame(x=sorted_x))

png("regression.png")
plot(x,y)
points(sorted_x,predicted,type="l",col=2,lwd=3)
dev.off()

png("residuals.png")
residuals = y - predict(my_lm,newdata=data.frame(x=x))
plot(x,residuals)
abline(a=0,b=0,col=2,lwd=3)
dev.off()

png("residuals2.png")
plot(residuals,x)
lines(x=c(0,0),y=range(x),col=2,lwd=3)
dev.off()

png("residuals_hist.png")
hist(residuals)
dev.off()



# logit

library("boot")
x = (-10:10)
png("inv_logit.png")
plot(x,inv.logit(x),type="l")
dev.off()

# non-normal

# mean 500, variance 30
mean = 500
variance = 500
k = 100000

png("normal.png")
hist(rnorm(k,mean=mean,sd=sqrt(variance)),xlab="Value",ylab="Frequency",main="Normal")
dev.off()

# chi-squared
#png("chisquared.png")
#hist(rchisq(k,df=mean),xlab="Value",ylab="Frequency",main="Chi-Squared")
#dev.off()

# bimodal
png("bimodal.png")
diff = sqrt(variance)-2
bimodal = c(rnorm(k/2,mean=mean-diff,sd=sqrt(variance-diff^2)),rnorm(k/2,mean=mean+diff,sd=sqrt(variance-diff^2)))
hist(bimodal, xlab="Value",ylab="Frequency",main="Bimodal")
dev.off()

png("uniform.png")
range=sqrt(12*variance)
hist(runif(k,mean-range/2,mean+range/2),xlab="Value",ylab="Frequency",main="Uniform")
dev.off()


# not enough black swans

for (n in c(100,1000,10000)) {
png(sprintf("hurricane_%d.png",n))
barplot(c(0,n),names.arg=c("Hurricane","No Hurricane"),legend.text=TRUE,ylab="Days")
dev.off()
}


#######################################################

sf_monthly_rainfall_mean = c(4.4,3.3,3.1,1.4,0.3,0.1,0.0,0.1,0.3,1.3,2.9,3.1)
this_month = as.numeric(format(Sys.time(),format="%m"))
cur_mean = sf_monthly_rainfall_mean[this_month]



png("heads_tails_fair.png")
barplot(c(0.5-1/12000,0.5-1/12000,1/6000),names.arg=c("heads","tails","side"),legend.text=TRUE,ylim=c(0,1),ylab="Probability")
dev.off()

png("heads_tails_biased.png")
barplot(c(0.8-1/12000,0.2-1/12000,1/6000),names.arg=c("heads","tails","side"),legend.text=TRUE,ylim=c(0,1),ylab="Probability")
dev.off()