Calibration of quantiles and a joint calibration of totals and quanitles for surveys
Maciej Beręsewicz
Source:vignettes/b_survey.Rmd
b_survey.Rmd
Setup
library(jointCalib)
library(survey)
#> Loading required package: grid
#> Loading required package: Matrix
#> Loading required package: survival
#>
#> Attaching package: 'survey'
#> The following object is masked from 'package:graphics':
#>
#> dotchart
library(laeken)
library(ebal)
#> ##
#> ## ebal Package: Implements Entropy Balancing.
#> ## See http://www.stanford.edu/~jhain/ for additional information.
Examples
Basic example
Setup
Generate small dataset
set.seed(123)
N <- 100
x1 <- rbinom(n = 100, prob = 0.7, size = 1)
x2 <- rlnorm(n = 100)*1000
pop<-data.frame(x1, x2)
sample_df <- pop[sample(1:N, 20), ]
sample_df$d <- as.integer(N/nrow(sample_df))
colSums(sample_df[, c("x1", "x2")]*sample_df$d)
#> x1 x2
#> 70.0 181454.5
with(sample_df, weightedMedian(x2, d))
#> [1] 1005.781
Calibrate using: + totals: x1
and x2
, +
median for: x2
.
res <- joint_calib(formula_totals = ~ x1 + x2,
formula_quantiles = ~ x2,
data = sample_df,
dweights = as.numeric(sample_df$d),
N = 100,
pop_totals = c(x1=sum(x1), x2=sum(x2)),
pop_quantiles = list(x2=quantile(x2,0.5)),
method = "linear",
control = control_calib(interpolation = "linear"))
for_example <- res$Xs[, c(3,4,2)] |> as.data.frame()
for_example$w <- sample_df$d * res$g
for_example
#> x1 x2 V3 w
#> 90 1 236.0072 0.0100000000 5.550384
#> 58 0 188.6349 0.0100000000 5.805215
#> 81 1 4239.9474 0.0000000000 3.892356
#> 29 1 1198.7789 0.0000000000 5.319617
#> 26 0 2788.6884 0.0000000000 4.806049
#> 27 1 752.1850 0.0100000000 5.308135
#> 85 1 128.3176 0.0100000000 5.600924
#> 7 1 212.5129 0.0100000000 5.561410
#> 60 1 2506.7739 0.0000000000 4.705757
#> 96 1 588.0716 0.0100000000 5.385155
#> 41 1 2700.6807 0.0000000000 4.614754
#> 84 0 655.4083 0.0100000000 5.586152
#> 6 1 4556.1164 0.0000000000 3.743974
#> 31 0 1005.7808 0.0008372057 5.624283
#> 17 1 1565.5071 0.0000000000 5.147506
#> 64 1 945.9534 0.0000000000 5.438271
#> 37 0 2994.6849 0.0000000000 4.709372
#> 57 1 456.1633 0.0100000000 5.447062
#> 20 0 7768.5590 0.0000000000 2.468930
#> 35 1 802.1284 0.0100000000 5.284696
Median of x2
after calibration.
weightedQuantile(sample_df$x2, res$g, 0.5)
#> [1] 945.9534
Example 1 – census case
Setup
Based on Haziza, D., and Lesage, É. (2016). A discussion of weighting procedures for unit nonresponse. Journal of Official Statistics, 32(1), 129-145.
set.seed(20230817)
N <- 1000
x <- runif(N, 0, 80)
#y <- 1000+10*x+rnorm(N, 0, 300)
#y <- exp(-0.1 + 0.1*x) + rnorm(N, 0, 300)
#y <- rbinom(N, 1, prob = 1/(exp(-0.5*(x-55))+1))
y <- 1300-(x-40)^2 + rnorm(N, 0, 300)
#p <- rbinom(N, 1, prob = 0.2+0.6*(1 + exp(-5 + x/8))^-1)
p <- rbinom(N, 1, prob = 0.07+0.45*(x/40-1)^2+0.0025*x)
#p <- rbinom(N, 1, prob = (1.2+0.024*x)^-1)
#p <- rbinom(N, 1, prob = exp(-0.2 - 0.014*x))
probs <- seq(0.1, 0.9, 0.1)
quants_known <- list(x=quantile(x, probs))
totals_known <- c(x=sum(x))
df <- data.frame(x, y, p)
df_resp <- df[df$p == 1, ]
df_resp$d <- N/nrow(df_resp)
y_quant_true <- quantile(y, probs)
head(df_resp)
#> x y p d
#> 6 12.35687 444.9053 1 3.134796
#> 7 61.90319 403.9473 1 3.134796
#> 13 60.96079 923.4975 1 3.134796
#> 14 76.85300 124.4110 1 3.134796
#> 18 71.52828 422.0934 1 3.134796
#> 19 65.32808 740.4801 1 3.134796
Using jointCalib
package
example 1a: calibrate only quantiles (deciles)
result1 <- joint_calib(formula_quantiles = ~x,
data=df_resp,
dweights=df_resp$d,
N = N,
pop_quantiles = quants_known,
method = "linear",
backend = "sampling")
result1
#> Weights calibrated using: linear with sampling backend.
#> Summary statistics for g-weights:
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4562 0.6773 0.8180 1.0000 1.4500 2.4538
#> Totals and precision (abs diff: 4.574665e-10)
#> totals precision
#> N 1e+03 4.557705e-10
#> x 0.10 1e-01 5.020984e-14
#> x 0.20 2e-01 1.055545e-13
#> x 0.30 3e-01 1.324496e-13
#> x 0.40 4e-01 1.580958e-13
#> x 0.50 5e-01 1.765255e-13
#> x 0.60 6e-01 1.991740e-13
#> x 0.70 7e-01 2.304823e-13
#> x 0.80 8e-01 2.882139e-13
#> x 0.90 9e-01 3.552714e-13
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result1$g*df_resp$d, probs))
#> true est
#> 10% 7.078067 7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 23.287657
#> 40% 31.641911 31.802986
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.062629
#> 90% 71.565441 71.567274
example 1b: calibrate only quantiles (deciles)
result2 <- joint_calib(formula_totals = ~x,
formula_quantiles = ~x,
data = df_resp,
dweights = df_resp$d,
N = N,
pop_quantiles = quants_known,
pop_totals = totals_known,
method = "linear",
backend = "sampling")
summary(result2$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.3563 0.6208 0.8199 1.0000 1.4368 2.5384
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs))
#> true est
#> 10% 7.078067 7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 23.287657
#> 40% 31.641911 31.802986
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.062629
#> 90% 71.565441 71.567274
We can restrict weights to specific range using
logit
.
result3 <- joint_calib(formula_totals = ~x,
formula_quantiles = ~x,
data = df_resp,
dweights = df_resp$d,
N = N,
pop_quantiles = quants_known,
pop_totals = totals_known,
method = "logit",
backend = "sampling",
maxit = 500,
bounds = c(0, 3))
summary(result3$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.3911 0.6254 0.8140 1.0000 1.4325 2.5186
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result3$g*df_resp$d, probs))
#> true est
#> 10% 7.078067 7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 22.872078
#> 40% 31.641911 31.297922
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.529683
#> 90% 71.565441 71.567274
Empirical likelihood method can be applied using the following code
result4a <- joint_calib(formula_quantiles = ~ x,
data = df_resp,
dweights = df_resp$d,
N = N,
pop_quantiles = quants_known,
method = "el",
backend = "base")
summary(result4a$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4563 0.6773 0.8180 1.0000 1.4500 2.4538
result4b <- joint_calib(formula_totals = ~ x,
formula_quantiles = ~ x,
data = df_resp,
dweights = df_resp$d,
N = N,
pop_quantiles = quants_known,
pop_totals = totals_known,
method = "el",
backend = "base")
summary(result4b$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4414 0.6584 0.8109 1.0000 1.4256 2.8513
result4c <- calib_el(X = result4b$Xs[, c(1, 11)],
d = df_resp$d,
totals = c(N,totals_known),
maxit = 50,
tol = 1e-8)
summary(result4c)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.7438 0.7910 0.8848 1.0000 1.2455 1.4923
data.frame(true = quants_known$x,
est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs),
est_el0 = weightedQuantile(df_resp$x, result4c*df_resp$d, probs),
est_el1 = weightedQuantile(df_resp$x, result4a$g*df_resp$d, probs),
est_el2 = weightedQuantile(df_resp$x, result4b$g*df_resp$d, probs))
#> true est est_el0 est_el1 est_el2
#> 10% 7.078067 7.085003 3.640246 7.085003 7.085003
#> 20% 14.831221 14.824424 8.024948 14.824424 14.824424
#> 30% 23.146180 23.287657 14.499018 23.287657 23.287657
#> 40% 31.641911 31.802986 24.010501 31.802986 31.802986
#> 50% 39.033812 39.154276 39.154276 38.892342 39.154276
#> 60% 47.527168 48.252065 54.685438 48.252065 48.252065
#> 70% 54.984229 55.311953 63.084405 54.954054 54.954054
#> 80% 64.073167 64.062629 70.050593 64.062629 64.062629
#> 90% 71.565441 71.567274 75.663078 71.567274 71.567274
Entropy balancing method can be applied using the following code
result5 <- joint_calib(formula_quantiles = ~ x,
data = df_resp,
dweights = df_resp$d,
N = N,
pop_quantiles = quants_known,
method = "eb")
summary(result5$g)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4576 0.6775 0.8180 1.0003 1.4500 2.4538
Finally, compare all method with true Y distribution where
-
est_y1
– weights calibrated to quantiles only, -
est_y2
– weights calibrated to quantiles and totals, -
est_y3
– weights calibrated to quantiles and totals usinglogit
distance and range limitations, -
est_y4
– weights calibrated to means only usingel
. -
est_y5
– weights calibrated to quantiles only usingel
. -
est_y6
– weights calibrated to quantiles and totals usingel
. -
est_y7
– weights calibrated to quantiles and totals usingeb
.
results_y <- data.frame(true_y = y_quant_true,
est_y1 = weightedQuantile(df_resp$y, result1$g * df_resp$d, probs),
est_y2 = weightedQuantile(df_resp$y, result2$g * df_resp$d, probs),
est_y3 = weightedQuantile(df_resp$y, result3$g * df_resp$d, probs),
est_y4 = weightedQuantile(df_resp$y, result4c * df_resp$d, probs),
est_y5 = weightedQuantile(df_resp$y, result4a$g * df_resp$d, probs),
est_y6 = weightedQuantile(df_resp$y, result4b$g * df_resp$d, probs),
est_y7 = weightedQuantile(df_resp$y, result5$g * df_resp$d, probs))
results_y
#> true_y est_y1 est_y2 est_y3 est_y4 est_y5
#> 10% -56.97982 -36.18473 -36.18473 -36.18473 -149.00996 -36.18473
#> 20% 210.58301 228.75429 228.75429 228.75429 17.27107 228.75429
#> 30% 444.81652 413.22517 417.60155 413.22517 203.58302 413.22517
#> 40% 646.06099 622.82192 622.82192 622.82192 381.99560 622.82192
#> 50% 803.50842 789.89936 789.89936 789.89936 503.35849 789.89936
#> 60% 975.31803 925.84730 925.84730 925.84730 679.04530 925.84730
#> 70% 1123.44643 1023.75230 1023.75230 1023.75230 811.84389 1023.75230
#> 80% 1245.62645 1162.06504 1162.06504 1162.06504 1009.46703 1162.06504
#> 90% 1449.23819 1471.33301 1471.33301 1471.33301 1242.60357 1471.33301
#> est_y6 est_y7
#> 10% -36.18473 -36.18473
#> 20% 228.75429 228.75429
#> 30% 411.47088 413.22517
#> 40% 622.82192 622.82192
#> 50% 789.89936 789.89936
#> 60% 925.84730 925.84730
#> 70% 1023.75230 1023.75230
#> 80% 1162.06504 1162.06504
#> 90% 1471.33301 1471.33301
Here is the sum of squares and calibration of totals and means seems to be the best.
Using survey
package
example 1a: calibrate only quantiles (deciles)
A <- joint_calib_create_matrix(X_q = model.matrix(~x+0, df_resp), N = N, pop_quantiles = quants_known)
colnames(A) <- paste0("quant_", gsub("\\D", "", names(quants_known$x)))
A <- as.data.frame(A)
df_resp <- cbind(df_resp, A)
head(df_resp)
#> x y p d quant_10 quant_20 quant_30 quant_40 quant_50
#> 6 12.35687 444.9053 1 3.134796 0 0.001 0.001 0.001 0.001
#> 7 61.90319 403.9473 1 3.134796 0 0.000 0.000 0.000 0.000
#> 13 60.96079 923.4975 1 3.134796 0 0.000 0.000 0.000 0.000
#> 14 76.85300 124.4110 1 3.134796 0 0.000 0.000 0.000 0.000
#> 18 71.52828 422.0934 1 3.134796 0 0.000 0.000 0.000 0.000
#> 19 65.32808 740.4801 1 3.134796 0 0.000 0.000 0.000 0.000
#> quant_60 quant_70 quant_80 quant_90
#> 6 0.001 0.001 0.001 0.001
#> 7 0.000 0.000 0.001 0.001
#> 13 0.000 0.000 0.001 0.001
#> 14 0.000 0.000 0.000 0.000
#> 18 0.000 0.000 0.000 0.001
#> 19 0.000 0.000 0.000 0.001
m1 <- svydesign(ids = ~1, data = df_resp, weights = ~d)
quants_formula <- as.formula(paste("~", paste(colnames(A), collapse = "+")))
quants_formula
#> ~quant_10 + quant_20 + quant_30 + quant_40 + quant_50 + quant_60 +
#> quant_70 + quant_80 + quant_90
svytotal(quants_formula, m1)
#> total SE
#> quant_10 0.10972 0.0175
#> quant_20 0.23197 0.0237
#> quant_30 0.29154 0.0255
#> quant_40 0.34796 0.0267
#> quant_50 0.38871 0.0273
#> quant_60 0.43887 0.0278
#> quant_70 0.50784 0.0280
#> quant_80 0.63323 0.0270
#> quant_90 0.78100 0.0232
Calibration using calibrate
pop_totals <- c(N, probs)
names(pop_totals) <- c("(Intercept)", colnames(A))
m1_cal <- calibrate(m1, quants_formula, pop_totals)
svytotal(quants_formula, m1_cal)
#> total SE
#> quant_10 0.1 0
#> quant_20 0.2 0
#> quant_30 0.3 0
#> quant_40 0.4 0
#> quant_50 0.5 0
#> quant_60 0.6 0
#> quant_70 0.7 0
#> quant_80 0.8 0
#> quant_90 0.9 0
Calibration using grake
(low level)
g1 <- grake(mm = as.matrix(cbind(1, A)),
ww = df_resp$d,
calfun = cal.linear,
population = pop_totals,
bounds = list(lower = -Inf, upper = Inf),
epsilon = 1e-7,
verbose = FALSE,
maxit = 50,
variance = NULL)
summary(as.numeric(g1))
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.4562 0.6773 0.8180 1.0000 1.4500 2.4538