library (resample)
library (tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
“I have followed all rules for collaboration for this project, and I have not used generative AI on this project.”
Setting 1: n = 1000, p = 0.45
n1 <- 1000 # sample size
p1 <- 0.45 # population proportion
generate_samp_prop <- function (n, p) {
x <- rbinom (1 , n, p) # randomly generate number of successes for the sample
## number of successes divided by sample size
phat <- x / n
## 90% confidence interval
lb <- phat - 1.645 * sqrt (phat * (1 - phat) / n)
ub <- phat + 1.645 * sqrt (phat * (1 - phat) / n)
prop_df <- tibble (phat, lb, ub)
return (prop_df)
}
generate_samp_prop (n = 1000 , p = 0.45 )
# A tibble: 1 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.434 0.408 0.460
## how many CI's do we want
n_sim <- 5000
prop_ci_df <- map (1 : n_sim,
\(i) generate_samp_prop (n = 1000 , p = 0.45 )) |>
bind_rows ()
prop_ci_df
# A tibble: 5,000 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.445 0.419 0.471
2 0.473 0.447 0.499
3 0.429 0.403 0.455
4 0.437 0.411 0.463
5 0.461 0.435 0.487
6 0.449 0.423 0.475
7 0.466 0.440 0.492
8 0.451 0.425 0.477
9 0.442 0.416 0.468
10 0.468 0.442 0.494
# ℹ 4,990 more rows
Average interval width and coverage rate
prop_ci_df <- prop_ci_df |> mutate (ci_width = ub - lb,
ci_cover_ind = if_else (p1 > lb & p1 < ub,
true = 1 ,
false = 0 ))
prop_ci_df |> summarise (avg_width = mean (ci_width),
coverage_rate = mean (ci_cover_ind))
# A tibble: 1 × 2
avg_width coverage_rate
<dbl> <dbl>
1 0.0517 0.903
Setting 2: n = 1000, p = .90
n1 <- 1000 # sample size
p2 <- 0.9 # population proportion
generate_samp_prop <- function (n, p) {
x <- rbinom (1 , n, p) # randomly generate number of successes for the sample
## number of successes divided by sample size
phat <- x / n
## 90% confidence interval
lb <- phat - 1.645 * sqrt (phat * (1 - phat) / n)
ub <- phat + 1.645 * sqrt (phat * (1 - phat) / n)
prop_df <- tibble (phat, lb, ub)
return (prop_df)
}
generate_samp_prop (n = 1000 , p = 0.9 )
# A tibble: 1 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.893 0.877 0.909
## how many CI's do we want
n_sim <- 5000
prop_ci_df1 <- map (1 : n_sim,
\(i) generate_samp_prop (n = 1000 , p = 0.9 )) |>
bind_rows ()
prop_ci_df1
# A tibble: 5,000 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.889 0.873 0.905
2 0.882 0.865 0.899
3 0.904 0.889 0.919
4 0.894 0.878 0.910
5 0.901 0.885 0.917
6 0.9 0.884 0.916
7 0.899 0.883 0.915
8 0.884 0.867 0.901
9 0.899 0.883 0.915
10 0.896 0.880 0.912
# ℹ 4,990 more rows
Average interval width and coverage rate
prop_ci_df1 <- prop_ci_df1 |> mutate (ci_width = ub - lb,
ci_cover_ind = if_else (p2 > lb & p2 < ub,
true = 1 ,
false = 0 ))
prop_ci_df1 |> summarise (avg_width = mean (ci_width),
coverage_rate = mean (ci_cover_ind))
# A tibble: 1 × 2
avg_width coverage_rate
<dbl> <dbl>
1 0.0312 0.889
Setting 3: n = 100, p = .45
n2 <- 100 # sample size
p1 <- 0.45 # population proportion
generate_samp_prop <- function (n, p) {
x <- rbinom (1 , n, p) # randomly generate number of successes for the sample
## number of successes divided by sample size
phat <- x / n
## 90% confidence interval
lb <- phat - 1.645 * sqrt (phat * (1 - phat) / n)
ub <- phat + 1.645 * sqrt (phat * (1 - phat) / n)
prop_df <- tibble (phat, lb, ub)
return (prop_df)
}
generate_samp_prop (n = 100 , p = 0.45 )
# A tibble: 1 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.5 0.418 0.582
## how many CI's do we want
n_sim <- 5000
prop_ci_df2 <- map (1 : n_sim,
\(i) generate_samp_prop (n = 100 , p = 0.45 )) |>
bind_rows ()
prop_ci_df2
# A tibble: 5,000 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.46 0.378 0.542
2 0.48 0.398 0.562
3 0.39 0.310 0.470
4 0.38 0.300 0.460
5 0.41 0.329 0.491
6 0.48 0.398 0.562
7 0.4 0.319 0.481
8 0.45 0.368 0.532
9 0.47 0.388 0.552
10 0.44 0.358 0.522
# ℹ 4,990 more rows
Average interval width and coverage rate
prop_ci_df2 <- prop_ci_df2 |> mutate (ci_width = ub - lb,
ci_cover_ind = if_else (p1 > lb & p1 < ub,
true = 1 ,
false = 0 ))
prop_ci_df2 |> summarise (avg_width = mean (ci_width),
coverage_rate = mean (ci_cover_ind))
# A tibble: 1 × 2
avg_width coverage_rate
<dbl> <dbl>
1 0.163 0.888
Setting 4: n = 100, p = 0.9
n2 <- 100 # sample size
p2 <- 0.9 # population proportion
generate_samp_prop <- function (n, p) {
x <- rbinom (1 , n, p) # randomly generate number of successes for the sample
## number of successes divided by sample size
phat <- x / n
## 90% confidence interval
lb <- phat - 1.645 * sqrt (phat * (1 - phat) / n)
ub <- phat + 1.645 * sqrt (phat * (1 - phat) / n)
prop_df <- tibble (phat, lb, ub)
return (prop_df)
}
generate_samp_prop (n = 100 , p = 0.9 )
# A tibble: 1 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.89 0.839 0.941
## how many CI's do we want
n_sim <- 5000
prop_ci_df3 <- map (1 : n_sim,
\(i) generate_samp_prop (n = 100 , p = 0.9 )) |>
bind_rows ()
prop_ci_df3
# A tibble: 5,000 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.84 0.780 0.900
2 0.89 0.839 0.941
3 0.91 0.863 0.957
4 0.92 0.875 0.965
5 0.89 0.839 0.941
6 0.86 0.803 0.917
7 0.93 0.888 0.972
8 0.88 0.827 0.933
9 0.89 0.839 0.941
10 0.92 0.875 0.965
# ℹ 4,990 more rows
Average interval width and coverage rate
prop_ci_df3 <- prop_ci_df3 |> mutate (ci_width = ub - lb,
ci_cover_ind = if_else (p2 > lb & p2 < ub,
true = 1 ,
false = 0 ))
prop_ci_df3 |> summarise (avg_width = mean (ci_width),
coverage_rate = mean (ci_cover_ind))
# A tibble: 1 × 2
avg_width coverage_rate
<dbl> <dbl>
1 0.0973 0.866
Setting 5: n = 10, p = 0.45
n3 <- 10 # sample size
p1 <- 0.45 # population proportion
generate_samp_prop <- function (n, p) {
x <- rbinom (1 , n, p) # randomly generate number of successes for the sample
## number of successes divided by sample size
phat <- x / n
## 90% confidence interval
lb <- phat - 1.645 * sqrt (phat * (1 - phat) / n)
ub <- phat + 1.645 * sqrt (phat * (1 - phat) / n)
prop_df <- tibble (phat, lb, ub)
return (prop_df)
}
generate_samp_prop (n = 10 , p = 0.45 )
# A tibble: 1 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.5 0.240 0.760
## how many CI's do we want
n_sim <- 5000
prop_ci_df4 <- map (1 : n_sim,
\(i) generate_samp_prop (n = 10 , p = 0.45 )) |>
bind_rows ()
prop_ci_df4
# A tibble: 5,000 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.2 -0.00808 0.408
2 0.4 0.145 0.655
3 0.3 0.0616 0.538
4 0.5 0.240 0.760
5 0.4 0.145 0.655
6 0.4 0.145 0.655
7 0.5 0.240 0.760
8 0.6 0.345 0.855
9 0.5 0.240 0.760
10 0.4 0.145 0.655
# ℹ 4,990 more rows
Average interval width and coverage rate
prop_ci_df4 <- prop_ci_df4 |> mutate (ci_width = ub - lb,
ci_cover_ind = if_else (p1 > lb & p1 < ub,
true = 1 ,
false = 0 ))
prop_ci_df4 |> summarise (avg_width = mean (ci_width),
coverage_rate = mean (ci_cover_ind))
# A tibble: 1 × 2
avg_width coverage_rate
<dbl> <dbl>
1 0.489 0.805
Seting 6: n = 10, p = 0.9
n3 <- 10 # sample size
p2 <- 0.9 # population proportion
generate_samp_prop <- function (n, p) {
x <- rbinom (1 , n, p) # randomly generate number of successes for the sample
## number of successes divided by sample size
phat <- x / n
## 90% confidence interval
lb <- phat - 1.645 * sqrt (phat * (1 - phat) / n)
ub <- phat + 1.645 * sqrt (phat * (1 - phat) / n)
prop_df <- tibble (phat, lb, ub)
return (prop_df)
}
generate_samp_prop (n = 10 , p = 0.9 )
# A tibble: 1 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.8 0.592 1.01
## how many CI's do we want
n_sim <- 5000
prop_ci_df5 <- map (1 : n_sim,
\(i) generate_samp_prop (n = 10 , p = 0.9 )) |>
bind_rows ()
prop_ci_df5
# A tibble: 5,000 × 3
phat lb ub
<dbl> <dbl> <dbl>
1 0.6 0.345 0.855
2 0.9 0.744 1.06
3 1 1 1
4 1 1 1
5 1 1 1
6 0.9 0.744 1.06
7 0.9 0.744 1.06
8 0.9 0.744 1.06
9 0.6 0.345 0.855
10 1 1 1
# ℹ 4,990 more rows
Average interval width and coverage rate
prop_ci_df5 <- prop_ci_df5 |> mutate (ci_width = ub - lb,
ci_cover_ind = if_else (p2 > lb & p2 < ub,
true = 1 ,
false = 0 ))
prop_ci_df5 |> summarise (avg_width = mean (ci_width),
coverage_rate = mean (ci_cover_ind))
# A tibble: 1 × 2
avg_width coverage_rate
<dbl> <dbl>
1 0.234 0.636
Table of Results
\(p = 0.45\)
Coverage Rate
0.8916
0.8878
0.8016
\(p = 0.90\)
Coverage Rate
0.8932
0.8632
0.635
\(p = 0.45\)
Average Width
0.05173266
0.1628876
0.4882621
\(p = 0.90\)
Average Width
0.03114799
0.09735256
0.2351569
Write-up:
My table of results shows the coverage rates and average widths for 90% confidence intervals for six different settings of a different sample size and population proportion. The coverage rates are closest to 90% level when the sample size is large, particularly for n = 1000, which makes sense for what we’ve been discussing in class. For both p = 0.45 and p = 0.9, the coverage rates are around 0.89, which is consistent with the confidence level. However, as the sample size decreases, the coverage rate tends to fall below 90%, especially for p = 0.9. This result highlights how smaller sample sizes are more likely to produce confidence intervals that do not contain the true population proportion, usually when the population proportion is far away from 0.5.
The average interval widths also show an expected pattern that we’ve discussed in class briefly. Larger sample sizes produce narrower confidence intervals, while smaller sample sizes make wider intervals. The average width for n = 1000 is significantly smaller than for n = 100 and n = 10, proving that there are more precise estimates with larger samples. For each sample size, the intervals are wider when p = 0.45 compared to p = 0.9. This happens because proportions near 0.5 have more variability making the confidence intervals wider.
These results prove the importance of the large sample assumption for the asymptotic confidence interval. When both n*p and n(1-p) are large, the intervals are more accurate and consistent with the theoretical confidence level. When these assumptions are violated with small sample sizes or extreme population proportions, the coverage rate decreases and the confidence intervals become less accurate. My simulation shows why we need to be careful when we apply asymptotic methods to small sample sizes (n = 10) or highly skewed population distributions (p = 0.9).