This is a reproducible example of a stratified sampling calculation. Full code can also be accessed here
Uncomment the two `install.packages()` lines below the first time you run this script.
#install.packages("simFrame")
#install.packages("sampling")
library(simFrame)
library(sampling)
This is the total number of people in the group you are trying to reach with the survey.
Here we use 300,000
# Population size: number of people the survey is trying to reach.
N <- 3e5
The confidence level is the probability of obtaining the same result if you re-sampled, all other things being equal. It is a measure of how certain you are that your sample accurately reflects the population, within its margin of error. Common standards used by researchers are 90%, 95%, and 99%.
Here we use 95%
# Confidence level of the survey estimate.
cl <- 0.95
# Two-sided critical value for that confidence level. The sample-size formula
# for a proportion uses the standard normal quantile, so qnorm() is used
# directly; the value does not depend on the population size.
z <- qnorm(1 - (1 - cl) / 2)
The margin of error describes the variability of the estimate: how close the answer your sample gave is to the “true value” in your population.
The smaller the margin of error is, the closer you are to having the exact answer at a given confidence level. A smaller margin of error means that you must have a larger sample size given the same population. Common standards used by researchers are ± 5%, ± 3%, and ± 1%.
Here we use 5%
# Margin of error, expressed as a proportion (5%).
e <- 5 / 100
Estimate of the prevalence or mean & STDev of the key indicator (e.g. 30% return intention).
Here we use 50%, the most conservative choice: it maximises p × q and therefore the required sample size.
# Expected proportion of the key indicator, and its complement.
p <- 1 / 2
q <- 1 - p
Note: you can skip this data-simulation step if you already have your own dataset.
# Simulate a population of N individuals, one row per individual.
# Fix the RNG seed so that the simulated population (and any sample drawn from
# it when the script is run top to bottom) is the same on every run.
set.seed(1234)
size <- sample(x = c(1, 2, 3, 4, 5), size = N, replace = TRUE,
               prob = c(.3, .4, .2, .07, .03))
# Kept under a different name so the vector does not shadow base::return();
# the data frame column below is still called "return".
return_intention <- sample(x = c("No", "Yes", "Unknown"), size = N,
                           replace = TRUE, prob = c(0.4, p, 0.1))
sex <- sample(x = c(0, 1), size = N, replace = TRUE, prob = c(.4, .6))
region <- sample(x = c("Egypt", "Iraq", "Jordan", "Lebanon"), size = N,
                 replace = TRUE, prob = c(.2, .3, .1, .4))
needs <- sample(x = c(0, 1), size = N, replace = TRUE, prob = c(.45, .55))
phone <- sample(x = c(0, 1), size = N, replace = TRUE, prob = c(.2, .8))
data <- data.frame(size, return = return_intention, sex, region, needs, phone)
# Sample size for estimating a proportion, before any finite population
# correction, rounded to a whole number of respondents.
n0 <- round(z^2 * p * q / e^2)
print(n0)
## [1] 384
# Finite population correction, using the number of rows actually present in
# the data as the population size.
N <- nrow(data)
n <- round(n0 / (1 + (n0 - 1) / N))
print(n)
## [1] 384
# Sampling frame: only individuals with phone == 1 (which() drops NA matches).
data_with_phone <- data[which(data[["phone"]] == 1), ]
# Stratify the frame by household size and needs (simFrame 'Strata' object).
st <- stratify(data_with_phone, design = c("size", "needs"))
#summary(st)
str(st)
## Formal class 'Strata' [package "simFrame"] with 7 slots
## ..@ values: int [1:239887] 6 7 1 6 2 8 6 7 7 1 ...
## ..@ split :List of 10
## .. ..$ : int [1:32402] 3 10 12 15 36 37 46 54 58 59 ...
## .. ..$ : int [1:43158] 5 13 14 16 18 20 23 25 30 31 ...
## .. ..$ : int [1:21643] 17 19 34 57 65 71 74 107 130 152 ...
## .. ..$ : int [1:7583] 40 72 75 78 93 178 182 193 275 318 ...
## .. ..$ : int [1:3260] 32 44 99 230 423 430 474 529 547 578 ...
## .. ..$ : int [1:39480] 1 4 7 11 21 24 39 47 51 52 ...
## .. ..$ : int [1:52455] 2 8 9 22 27 35 38 41 61 62 ...
## .. ..$ : int [1:26836] 6 26 29 45 49 53 82 96 97 116 ...
## .. ..$ : int [1:9065] 28 83 108 126 136 145 148 154 171 190 ...
## .. ..$ : int [1:4005] 110 180 211 242 282 309 319 448 464 486 ...
## ..@ design: chr [1:2] "size" "needs"
## ..@ nr : int [1:10] 1 2 3 4 5 6 7 8 9 10
## ..@ legend:'data.frame': 10 obs. of 2 variables:
## .. ..$ size : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5 1 2 3 4 5
## .. ..$ needs: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 2 2 2
## ..@ size : int [1:10] 32402 43158 21643 7583 3260 39480 52455 26836 9065 4005
## ..@ call : language stratify(x = data_with_phone, design = c("size", "needs"))
# Highest stratum number, i.e. the number of strata here.
max(st@nr)
## [1] 10
Proportional allocation: n_h = N_h / N × n for each stratum h, where N_h is the size of stratum h.
# Proportional allocation of the n interviews across strata.
# The denominator is the total of the stratum sizes, i.e. the frame that was
# actually stratified, so the allocations add up to n (up to rounding).
# Dividing by the full population size instead would under-allocate whenever
# the frame is a subset of the population.
n_size <- round(st@size / sum(st@size) * n)
print(n_size)
# For the stratum sizes shown in the str(st) output and n = 384:
## [1] 52 69 35 12 5 63 84 43 15 6
Draw the stratified sample with `strata()` from the sampling package, using the per-stratum sizes computed above.
# strata() applies the stratum sample sizes in the order in which the strata
# appear in the data. n_size follows the stratum numbering of `st`, where size
# varies fastest within each value of needs (see st@legend). Sorting by needs
# first and then by size makes the rows appear in that same order, so every
# stratum receives its own allocation.
data_with_phone <- data_with_phone[order(data_with_phone$needs,
                                         data_with_phone$size), ]
# Simple random sampling without replacement inside each stratum. `pik` is only
# used by the unequal-probability methods, so it is not passed.
stratified_sample <- strata(data_with_phone,
                            stratanames = c("size", "needs"),
                            size = n_size,
                            method = "srswor",
                            description = FALSE)
summary(stratified_sample)
## (Printed summary omitted: the values change from run to run. With the sizes
## aligned to their strata, Prob = n_h / N_h is close to the same sampling
## fraction in every stratum.)
# Pull the sampled rows out of the (sorted) frame and save them.
data_sampled <- getdata(data_with_phone, stratified_sample)
#print(data_sampled)
write.csv(data_sampled, "data_sampled.csv")
Verify that the proportion of the attribute in the sample is close to its counterpart in the population.
# Share of sampled rows whose "return" answer is "Yes": the count of "Yes"
# divided by the number of sampled rows.
freq <- table(data_sampled[["return"]])["Yes"]
relfreq <- freq / length(data_sampled[["return"]])
print(relfreq)
## Yes
## 0.4527687