Mini Project 3 – Math Stat Blog

library(resample)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

“I have followed all rules for collaboration for this project, and I have not used generative AI on this project.”

Setting 1: n = 1000, p = 0.45

n1 <- 1000   # sample size
p1 <- 0.45  # population proportion
  

generate_samp_prop <-function(n, p) {
  x <- rbinom(1, n, p) # randomly generate number of successes for the sample
  
  ## number of successes divided by sample size
  phat <- x / n
  
  ## 90% confidence interval
  lb <- phat - 1.645 * sqrt(phat * (1 - phat) / n)
  ub <- phat + 1.645 * sqrt(phat * (1 - phat) / n)
  
  prop_df <- tibble(phat, lb, ub)
  return(prop_df)
}

generate_samp_prop(n = 1000, p = 0.45)

# A tibble: 1 × 3
   phat    lb    ub
  <dbl> <dbl> <dbl>
1 0.434 0.408 0.460

## how many CI's do we want
n_sim <- 5000

prop_ci_df <- map(1:n_sim, 
    \(i) generate_samp_prop(n = 1000, p = 0.45)) |>
  bind_rows()

prop_ci_df

# A tibble: 5,000 × 3
    phat    lb    ub
   <dbl> <dbl> <dbl>
 1 0.445 0.419 0.471
 2 0.473 0.447 0.499
 3 0.429 0.403 0.455
 4 0.437 0.411 0.463
 5 0.461 0.435 0.487
 6 0.449 0.423 0.475
 7 0.466 0.440 0.492
 8 0.451 0.425 0.477
 9 0.442 0.416 0.468
10 0.468 0.442 0.494
# ℹ 4,990 more rows

Average interval width and coverage rate

prop_ci_df <- prop_ci_df |> mutate(ci_width = ub - lb,
                     ci_cover_ind = if_else(p1 > lb & p1 < ub,
                                            true = 1,
                                            false = 0))
prop_ci_df |> summarise(avg_width = mean(ci_width),
                        coverage_rate = mean(ci_cover_ind))

# A tibble: 1 × 2
  avg_width coverage_rate
      <dbl>         <dbl>
1    0.0517         0.903

Setting 2: n = 1000, p = .90

n1 <- 1000   # sample size
p2 <- 0.9  # population proportion
  

generate_samp_prop <-function(n, p) {
  x <- rbinom(1, n, p) # randomly generate number of successes for the sample
  
  ## number of successes divided by sample size
  phat <- x / n
  
  ## 90% confidence interval
  lb <- phat - 1.645 * sqrt(phat * (1 - phat) / n)
  ub <- phat + 1.645 * sqrt(phat * (1 - phat) / n)
  
  prop_df <- tibble(phat, lb, ub)
  return(prop_df)
}

generate_samp_prop(n = 1000, p = 0.9)

# A tibble: 1 × 3
   phat    lb    ub
  <dbl> <dbl> <dbl>
1 0.893 0.877 0.909

## how many CI's do we want
n_sim <- 5000

prop_ci_df1 <- map(1:n_sim, 
    \(i) generate_samp_prop(n = 1000, p = 0.9)) |>
  bind_rows()

prop_ci_df1

# A tibble: 5,000 × 3
    phat    lb    ub
   <dbl> <dbl> <dbl>
 1 0.889 0.873 0.905
 2 0.882 0.865 0.899
 3 0.904 0.889 0.919
 4 0.894 0.878 0.910
 5 0.901 0.885 0.917
 6 0.9   0.884 0.916
 7 0.899 0.883 0.915
 8 0.884 0.867 0.901
 9 0.899 0.883 0.915
10 0.896 0.880 0.912
# ℹ 4,990 more rows

Average interval width and coverage rate

prop_ci_df1 <- prop_ci_df1 |> mutate(ci_width = ub - lb,
                   ci_cover_ind = if_else(p2 > lb & p2 < ub,
                                          true = 1,
                                          false = 0))
prop_ci_df1 |> summarise(avg_width = mean(ci_width),
                      coverage_rate = mean(ci_cover_ind))

# A tibble: 1 × 2
  avg_width coverage_rate
      <dbl>         <dbl>
1    0.0312         0.889

Setting 3: n = 100, p = .45

n2 <- 100   # sample size
p1 <- 0.45  # population proportion
  

generate_samp_prop <-function(n, p) {
  x <- rbinom(1, n, p) # randomly generate number of successes for the sample
  
  ## number of successes divided by sample size
  phat <- x / n
  
  ## 90% confidence interval
  lb <- phat - 1.645 * sqrt(phat * (1 - phat) / n)
  ub <- phat + 1.645 * sqrt(phat * (1 - phat) / n)
  
  prop_df <- tibble(phat, lb, ub)
  return(prop_df)
}

generate_samp_prop(n = 100, p = 0.45)

# A tibble: 1 × 3
   phat    lb    ub
  <dbl> <dbl> <dbl>
1   0.5 0.418 0.582

## how many CI's do we want
n_sim <- 5000

prop_ci_df2 <- map(1:n_sim, 
    \(i) generate_samp_prop(n = 100, p = 0.45)) |>
  bind_rows()

prop_ci_df2

# A tibble: 5,000 × 3
    phat    lb    ub
   <dbl> <dbl> <dbl>
 1  0.46 0.378 0.542
 2  0.48 0.398 0.562
 3  0.39 0.310 0.470
 4  0.38 0.300 0.460
 5  0.41 0.329 0.491
 6  0.48 0.398 0.562
 7  0.4  0.319 0.481
 8  0.45 0.368 0.532
 9  0.47 0.388 0.552
10  0.44 0.358 0.522
# ℹ 4,990 more rows

Average interval width and coverage rate

prop_ci_df2 <- prop_ci_df2 |> mutate(ci_width = ub - lb,
                   ci_cover_ind = if_else(p1 > lb & p1 < ub,
                                          true = 1,
                                          false = 0))
prop_ci_df2 |> summarise(avg_width = mean(ci_width),
                      coverage_rate = mean(ci_cover_ind))

# A tibble: 1 × 2
  avg_width coverage_rate
      <dbl>         <dbl>
1     0.163         0.888

Setting 4: n = 100, p = 0.9

n2 <- 100   # sample size
p2 <- 0.9  # population proportion
  

generate_samp_prop <-function(n, p) {
  x <- rbinom(1, n, p) # randomly generate number of successes for the sample
  
  ## number of successes divided by sample size
  phat <- x / n
  
  ## 90% confidence interval
  lb <- phat - 1.645 * sqrt(phat * (1 - phat) / n)
  ub <- phat + 1.645 * sqrt(phat * (1 - phat) / n)
  
  prop_df <- tibble(phat, lb, ub)
  return(prop_df)
}

generate_samp_prop(n = 100, p = 0.9)

# A tibble: 1 × 3
   phat    lb    ub
  <dbl> <dbl> <dbl>
1  0.89 0.839 0.941

## how many CI's do we want
n_sim <- 5000

prop_ci_df3 <- map(1:n_sim, 
    \(i) generate_samp_prop(n = 100, p = 0.9)) |>
  bind_rows()

prop_ci_df3

# A tibble: 5,000 × 3
    phat    lb    ub
   <dbl> <dbl> <dbl>
 1  0.84 0.780 0.900
 2  0.89 0.839 0.941
 3  0.91 0.863 0.957
 4  0.92 0.875 0.965
 5  0.89 0.839 0.941
 6  0.86 0.803 0.917
 7  0.93 0.888 0.972
 8  0.88 0.827 0.933
 9  0.89 0.839 0.941
10  0.92 0.875 0.965
# ℹ 4,990 more rows

Average interval width and coverage rate

prop_ci_df3 <- prop_ci_df3 |> mutate(ci_width = ub - lb,
                   ci_cover_ind = if_else(p2 > lb & p2 < ub,
                                          true = 1,
                                          false = 0))
prop_ci_df3 |> summarise(avg_width = mean(ci_width),
                      coverage_rate = mean(ci_cover_ind))

# A tibble: 1 × 2
  avg_width coverage_rate
      <dbl>         <dbl>
1    0.0973         0.866

Setting 5: n = 10, p = 0.45

n3 <- 10   # sample size
p1 <- 0.45  # population proportion
  

generate_samp_prop <-function(n, p) {
  x <- rbinom(1, n, p) # randomly generate number of successes for the sample
  
  ## number of successes divided by sample size
  phat <- x / n
  
  ## 90% confidence interval
  lb <- phat - 1.645 * sqrt(phat * (1 - phat) / n)
  ub <- phat + 1.645 * sqrt(phat * (1 - phat) / n)
  
  prop_df <- tibble(phat, lb, ub)
  return(prop_df)
}

generate_samp_prop(n = 10, p = 0.45)

# A tibble: 1 × 3
   phat    lb    ub
  <dbl> <dbl> <dbl>
1   0.5 0.240 0.760

## how many CI's do we want
n_sim <- 5000

prop_ci_df4 <- map(1:n_sim, 
    \(i) generate_samp_prop(n = 10, p = 0.45)) |>
  bind_rows()

prop_ci_df4

# A tibble: 5,000 × 3
    phat       lb    ub
   <dbl>    <dbl> <dbl>
 1   0.2 -0.00808 0.408
 2   0.4  0.145   0.655
 3   0.3  0.0616  0.538
 4   0.5  0.240   0.760
 5   0.4  0.145   0.655
 6   0.4  0.145   0.655
 7   0.5  0.240   0.760
 8   0.6  0.345   0.855
 9   0.5  0.240   0.760
10   0.4  0.145   0.655
# ℹ 4,990 more rows

Average interval width and coverage rate

prop_ci_df4 <- prop_ci_df4 |> mutate(ci_width = ub - lb,
                   ci_cover_ind = if_else(p1 > lb & p1 < ub,
                                          true = 1,
                                          false = 0))
prop_ci_df4 |> summarise(avg_width = mean(ci_width),
                      coverage_rate = mean(ci_cover_ind))

# A tibble: 1 × 2
  avg_width coverage_rate
      <dbl>         <dbl>
1     0.489         0.805

Seting 6: n = 10, p = 0.9

n3 <- 10   # sample size
p2 <- 0.9  # population proportion
  

generate_samp_prop <-function(n, p) {
  x <- rbinom(1, n, p) # randomly generate number of successes for the sample
  
  ## number of successes divided by sample size
  phat <- x / n
  
  ## 90% confidence interval
  lb <- phat - 1.645 * sqrt(phat * (1 - phat) / n)
  ub <- phat + 1.645 * sqrt(phat * (1 - phat) / n)
  
  prop_df <- tibble(phat, lb, ub)
  return(prop_df)
}

generate_samp_prop(n = 10, p = 0.9)

# A tibble: 1 × 3
   phat    lb    ub
  <dbl> <dbl> <dbl>
1   0.8 0.592  1.01

## how many CI's do we want
n_sim <- 5000

prop_ci_df5 <- map(1:n_sim, 
    \(i) generate_samp_prop(n = 10, p = 0.9)) |>
  bind_rows()

prop_ci_df5

# A tibble: 5,000 × 3
    phat    lb    ub
   <dbl> <dbl> <dbl>
 1   0.6 0.345 0.855
 2   0.9 0.744 1.06 
 3   1   1     1    
 4   1   1     1    
 5   1   1     1    
 6   0.9 0.744 1.06 
 7   0.9 0.744 1.06 
 8   0.9 0.744 1.06 
 9   0.6 0.345 0.855
10   1   1     1    
# ℹ 4,990 more rows

Average interval width and coverage rate

prop_ci_df5 <- prop_ci_df5 |> mutate(ci_width = ub - lb,
                   ci_cover_ind = if_else(p2 > lb & p2 < ub,
                                          true = 1,
                                          false = 0))
prop_ci_df5 |> summarise(avg_width = mean(ci_width),
                      coverage_rate = mean(ci_cover_ind))

# A tibble: 1 × 2
  avg_width coverage_rate
      <dbl>         <dbl>
1     0.234         0.636

Table of Results
		\(n = 1000\)	\(n = 100\)	\(n = 10\)
\(p = 0.45\)	Coverage Rate	0.8916	0.8878	0.8016
\(p = 0.90\)	Coverage Rate	0.8932	0.8632	0.635

\(p = 0.45\)	Average Width	0.05173266	0.1628876	0.4882621
\(p = 0.90\)	Average Width	0.03114799	0.09735256	0.2351569

Write-up:

My table of results shows the coverage rates and average widths for 90% confidence intervals for six different settings of a different sample size and population proportion. The coverage rates are closest to 90% level when the sample size is large, particularly for n = 1000, which makes sense for what we’ve been discussing in class. For both p = 0.45 and p = 0.9, the coverage rates are around 0.89, which is consistent with the confidence level. However, as the sample size decreases, the coverage rate tends to fall below 90%, especially for p = 0.9. This result highlights how smaller sample sizes are more likely to produce confidence intervals that do not contain the true population proportion, usually when the population proportion is far away from 0.5.

The average interval widths also show an expected pattern that we’ve discussed in class briefly. Larger sample sizes produce narrower confidence intervals, while smaller sample sizes make wider intervals. The average width for n = 1000 is significantly smaller than for n = 100 and n = 10, proving that there are more precise estimates with larger samples. For each sample size, the intervals are wider when p = 0.45 compared to p = 0.9. This happens because proportions near 0.5 have more variability making the confidence intervals wider.

These results prove the importance of the large sample assumption for the asymptotic confidence interval. When both n*p and n(1-p) are large, the intervals are more accurate and consistent with the theoretical confidence level. When these assumptions are violated with small sample sizes or extreme population proportions, the coverage rate decreases and the confidence intervals become less accurate. My simulation shows why we need to be careful when we apply asymptotic methods to small sample sizes (n = 10) or highly skewed population distributions (p = 0.9).