set.seed(345) ################################################################ # Example: Rectangular plates # # Number of simulations k <- 10000 # Simulate X, Y and then A X <- rnorm(k, 2, 0.01) Y <- rnorm(k, 3, 0.02) A <- X*Y # The mean and std. deviation of the simulated values mean(A) sd(A) mean(abs(A-6)>0.1) set.seed(9876) ################################################################ # One-sample confidence interval for mu # # Read the data x <- c(32.6, 1.6, 42.1, 29.2, 53.4, 79.3, 2.3 , 4.7, 13.6, 2.0) n <- length(x) # Set the number of simulations k <- 100000 # 1. Simulate 10 exponentials with the sample mean k times simsamples <- replicate(k, rexp(10,1/26.08)) # 2. Compute the mean of the 10 simulated observations k times simmeans <- apply(simsamples, 2, mean) # 3. Find the two relevant quantiles of the k simulated means quantile(simmeans, c(0.025, 0.975)) # Histogram of the simulated means hist(simmeans, col="blue", nclass=30, cex.main=0.8) set.seed(9876) ################################################################ # Confidence interval for the median assuming an exponential distribution # # Load the data x <- c(32.6, 1.6, 42.1, 29.2, 53.4, 79.3, 2.3 , 4.7, 13.6, 2.0) n <- length(x) # Set the number of simulations k <- 100000 # 1. Simulate k samples of n=10 exponentials with the sample mean simsamples <- replicate(k, rexp(n,1/26.08)) # 2. Compute the median of the n=10 simulated observations k times: simmedians <- apply(simsamples, 2, median) # 3. Find the two relevant quantiles of the k simulated medians: quantile(simmedians, c(0.025, 0.975)) # See the simulated medians hist(simmedians, col="blue", nclass=30, cex.main=0.8) Q3 <- function(x){ quantile(x, 0.75) } set.seed(9876) # load in the data x <- c(168, 161, 167, 179, 184, 166, 198, 187, 191, 179) n <- length(x) # Set the number of simulations: k <- 100000 # 1. Simulate k samples of n=10 normals with the sample mean and # variance: simsamples <- replicate(k, rnorm(n, mean(x), sd(x))) # 2. Compute the Q3 of the n=10 simulated observations k times: simQ3 <- apply(simsamples, 2, Q3) # 3. Find the two relevant quantiles of the k simulated medians: quantile(simQ3, c(0.005, 0.995)) hist(simQ3, col="blue", cex.main=0.8) set.seed(9876) ################################################################ # CI for the difference of two means from exponential distributed data # # Read the data x <- c(32.6, 1.6, 42.1, 29.2, 53.4, 79.3, 2.3 , 4.7, 13.6, 2.0) y <- c(9.6, 22.2, 52.5, 12.6, 33.0, 15.2, 76.6, 36.3, 110.2, 18.0, 62.4, 10.3) n1 <- length(x) n2 <- length(y) # Set the number of simulations k <- 100000 # 1. Simulate k samples of each n1=10 and n2=12 exponentials # with the sample means simXsamples <- replicate(k, rexp(n1,1/mean(x))) simYsamples <- replicate(k, rexp(n2,1/mean(y))) # 2. Compute the difference between the simulated means k times simDifmeans <- apply(simXsamples,2,mean) - apply(simYsamples,2,mean) # 3. Find the two relevant quantiles of the k simulated differences # in sample means quantile(simDifmeans, c(0.025, 0.975), cex.main=0.8) # The histogram of the simulated sample mean differences hist(simDifmeans, col="blue", nclass=25, cex.main=0.8) ################################################################ # Nutrition study: comparing medians assuming normal distributions # # Read the data xA <- c(7.53, 7.48, 8.08, 8.09, 10.15, 8.4, 10.88, 6.13, 7.9) xB <- c(9.21, 11.51, 12.79, 11.85, 9.97, 8.79, 9.69, 9.68, 9.19) nA <- length(xA) nB <- length(xB) set.seed(9843) # Set the number of simulations k <- 100000 # 1. Simulate k samples of each nA=9 and nB=9 exponentials with the # sample means and standard deviations simAsamples <- replicate(k, rnorm(nA, mean(xA), sd(xA))) simBsamples <- replicate(k, rnorm(nB, mean(xB), sd(xB))) # 2. Compute the difference between the simulated medians k times simDifmedians <- apply(simAsamples, 2, median) - apply(simBsamples, 2, median) # 3. Find the two relevant quantiles of the k simulated differences # of means quantile(simDifmedians, c(0.025, 0.975)) ################################################################ # Example: Women's cigarette consumption # # Read and calculate the differences for each woman before and after x1 <- c(8, 24, 7, 20, 6, 20, 13, 15, 11, 22, 15) x2 <- c(5, 11, 0, 15, 0, 20, 15, 19, 12, 0, 6) dif <- x1-x2 dif set.seed(7266) t(replicate(5, sample(dif, replace=TRUE))) set.seed(112) # Number of simulated samples k <- 100000 # Simulate simsamples <- replicate(k, sample(dif, replace=TRUE)) # Calculate the mean of each simulated sample simmeans <- apply(simsamples, 2, mean) # Quantiles of the differences gives the CI quantile(simmeans, c(0.025,0.975)) # The 95% CI for the median change k <- 100000 simsamples <- replicate(k, sample(dif, replace = TRUE)) simmedians <- apply(simsamples, 2, median) quantile(simmedians, c(0.025,0.975)) ################################################################ # Example: Teeth and bottle # # Reading in "no bottle" group x <- c(9, 10, 12, 6, 10, 8, 6, 20, 12) # Reading in "yes bottle" group y <- c(14,15,19,12,13,13,16,14,9,12) # Number of simulations k <- 100000 # Simulate each sample k times simxsamples <- replicate(k, sample(x, replace=TRUE)) simysamples <- replicate(k, sample(y, replace=TRUE)) # Calculate the sample mean differences simmeandifs <- apply(simxsamples,2,mean) - apply(simysamples,2,mean) # Quantiles of the differences gives the CI quantile(simmeandifs, c(0.025,0.975)) # CI for the median differences simmediandifs <- apply(simxsamples,2,median)-apply(simysamples,2,median) quantile(simmediandifs, c(0.005,0.995)) ################################################################ # Bootstrapping - a further perspective # # Install the bootstrap package install.packages("bootstrap") set.seed(3249) # Calculate the 95% CI for the Teeth and bottle example above library(bootstrap) quantile(bootstrap(dif,k,mean)$thetastar, c(0.025,0.975)) ################################################################ # Example: Bootstrapping the mean mu with the boot package # # Read and calculate the differences for each woman before and after x1 <- c(8,24,7,20,6,20,13,15,11,22,15) x2 <- c(5,11,0,15,0,20,15,19,12,0,6) dif <- x1-x2 # Define function for calculating the mean of the d indexes samplemean <- function(x, d){ mean(x[d]) } # Call the new function mean(dif) samplemean(dif,1:11) samplemean(dif,c(1,3)) dif dif[c(1,3)] mean(dif[c(1,3)]) set.seed(231) # Load the boot package library(boot) # Non-parametric bootstrap of the mean difference: k <- 10000 meandifboot <- boot(dif, samplemean, k) plot(meandifboot) # Percentile bootstrap CI: boot.ci(meandifboot, type="perc") # Bias Corrected Accelerated CI: boot.ci(meandifboot, type="bca") # Define a function for taking the median in the needed format samplemedian <- function(x, d) { return(median(x[d])) } # Non-parametric bootstrap of the x1 median b <- boot(x1,samplemedian,k) boot.ci(b, type="bca") # Non-parametric bootstrap of the Dif coef. of var samplecoefvar <- function(x, d) { return(sd(x[d])/mean(x[d])) } # b <- boot(dif,samplecoefvar,k) boot.ci(b, type="bca") # Example with data frame and two variables # Making our own data for the example - into a data frame: x <- runif(100) y <- x + 2*runif(100) D <- data.frame(x, y) head(D) plot(D) cor(D$x,D$y) # The correlation function on the right form: mycor <- function(D, d) { E <- D[d,] return(cor(E$x, E$y)) } # Check: mycor(D, 1:100) mycor(D, 1:15) # Doing the bootstrap on the data frame: b <- boot(D, mycor, 10000) boot.ci(b, type="bca") # Bootstrapping an R-Squared from an MLR using # the data set mtcars from the boot package: # (And showing how to send EXTRA stuff to your function) # function to obtain R-Squared from the data # AND working for ANY model fit you want!! rsq <- function(formula, data, d) { fit <- lm(formula, data=data[d,]) return(summary(fit)$r.square) } # Bootstrapping with 1000 replications b <- boot(mtcars, rsq, 1000, formula=mpg~wt+disp) plot(b) boot.ci(b, type="bca")