
Commit 778b3df

Merge branch 'main' of github.com:PolicyEngine/code-snippets into paper
2 parents 62c81ea + 9d22c40

File tree

2 files changed: +674 -29 lines


reweighting/contradiction.py

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
import numpy as np
import pandas as pd


def simulate_contradictory_data(
    T: float,  # official grand total
    k: int,  # number of strata
    c: float,  # contradiction factor. Strata totals fall short by floor(c * T)
    n: int,  # Sample size, over all strata
    dirichlet_alpha = 5.0,  # low value creates uneven divisions
    gamma_shape: float = 2.0,
    gamma_scale: float = 2.0
):
    """Returns tuple containing the sample DataFrame and the official totals dict."""
    # Simulate stratum proportions (thetas)
    alphas = np.full(k, dirichlet_alpha)
    thetas = np.random.dirichlet(alphas)

    # Define stratum subtotals S_i
    S_i = thetas * T

    # Calculate and distribute the contradiction 'delta'
    delta = np.floor(c * T)
    p = np.full(k, 1 / k)
    delta_i = np.random.multinomial(n=int(delta), pvals=p)

    # Calculate final, contradictory official subtotals S_i^*
    S_i_star = S_i - delta_i

    # Allocate sample size n into strata sample sizes n_i
    n_i_float = thetas * n  # Assuming sample size in proportion to stratum total
    n_i = n_i_float.astype(int)
    remainder_n = n - n_i.sum()
    if remainder_n > 0:
        top_indices_n = np.argsort(n_i_float - n_i)[-remainder_n:]
        n_i[top_indices_n] += 1
    if n_i.sum() != n:
        diff = n_i.sum() - n
        while diff > 0:
            largest_idx = np.argmax(n_i)
            if n_i[largest_idx] > 1:
                n_i[largest_idx] -= 1
                diff -= 1
            else:
                break

    # Simulate population microdata, draw sample, and calculate weights
    all_samples = []
    for i in range(k):
        sample_y = np.random.gamma(shape=gamma_shape, scale=gamma_scale, size=n_i[i])
        weight = np.full(n_i[i], S_i_star[i] / np.sum(sample_y))  # baseline
        stratum_sample = pd.DataFrame({
            'stratum_id': i + 1,
            'y_ij': sample_y,
            'w_ij': weight
        })
        all_samples.append(stratum_sample)

    # Combine and return
    final_sample_df = pd.concat(all_samples, ignore_index=True)
    official_totals = {'T_official': T, 'S_star_official': S_i_star}
    return final_sample_df, official_totals


sample_df, totals = simulate_contradictory_data(
    T = 6000,
    k = 3,
    c = .1,
    n = 30
)

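# --- Illustrative check, not part of the committed file ---
# By construction the stratum targets sum to T - floor(c * T) while the grand
# total target stays at T, so the two sets of targets contradict each other.
# For T=6000 and c=0.1 the gap is floor(0.1 * 6000) = 600 (up to float rounding).
gap = totals['T_official'] - totals['S_star_official'].sum()
print(f"Sum of stratum targets: {totals['S_star_official'].sum():.2f} | gap to T: {gap:.2f}")
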
import torch

# Build the constraint matrix M: row 0 holds y for every observation (grand
# total); row i+1 holds y only for observations in stratum i (stratum totals).
num_strata = sample_df['stratum_id'].nunique()
num_obs = len(sample_df)
M = torch.zeros((num_strata + 1, num_obs), dtype=torch.float32)

M[0, :] = torch.tensor(sample_df['y_ij'].values, dtype=torch.float32)
for i in range(num_strata):
    stratum_mask = (sample_df['stratum_id'] == (i + 1))
    M[i + 1, :] = torch.tensor(np.where(stratum_mask, sample_df['y_ij'], 0), dtype=torch.float32)

# Target vector: the official grand total followed by the contradictory stratum totals
t_values = [totals['T_official']] + list(totals['S_star_official'])
t = torch.tensor(t_values, dtype=torch.float32)

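# --- Illustrative baseline check, not part of the committed file ---
# The simulator's baseline weights are calibrated to the stratum targets, so
# M @ w reproduces t[1:] exactly and undershoots the grand total t[0] by the
# injected gap of floor(c * T). The names below exist only for this sketch.
w_baseline = torch.tensor(sample_df['w_ij'].values, dtype=torch.float32)
baseline_estimates = torch.matmul(M, w_baseline)
print("Baseline estimates:", baseline_estimates.numpy())
print("Official targets:  ", t.numpy())
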
# Optimize in log space so the resulting weights stay positive
log_w = torch.tensor(np.log(sample_df['w_ij'].values), dtype=torch.float32, requires_grad=True)

def relative_loss(log_weights):
    """
    Calculates the squared relative error, split into two components:
    1. The loss for the grand total.
    2. The sum of losses for the stratum totals.
    """
    weights = torch.exp(log_weights)
    estimated_totals = torch.matmul(M, weights)
    grand_total_loss = ((estimated_totals[0] - t[0]) / t[0]) ** 2
    stratum_totals_loss = torch.sum(((estimated_totals[1:] - t[1:]) / t[1:]) ** 2)
    return grand_total_loss + stratum_totals_loss

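# Written out, the objective being minimized is
#   L(w) = ((sum_j y_j * w_j - T) / T)^2 + sum_i ((sum_{j in stratum i} y_j * w_j - S_i^*) / S_i^*)^2.
# Because the targets are contradictory by construction, no weight vector can
# drive this loss to zero; gradient descent instead settles on a compromise
# between the grand total and the stratum totals.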
# Set up the Adam optimizer to adjust the log-weights
optimizer = torch.optim.Adam([log_w], lr=0.1)
n_epochs = 100

print("--- Starting Optimization ---")
# Optimization loop
for epoch in range(n_epochs):
    optimizer.zero_grad()
    loss = relative_loss(log_w)
    loss.backward()
    optimizer.step()

    if epoch % 5 == 0:
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.8f}")
print("--- Optimization Finished ---\n")


# --- Results and Verification ---
final_weights = torch.exp(log_w).detach()
final_estimates = torch.matmul(M, final_weights)

print("--- Comparison of Totals ---")
print(f"{'Metric':<25} | {'Official Target':>15} | {'Final Estimate':>15} | {'Relative Error (%)':>20}")
print("-" * 85)

grand_total_official = t[0].item()
grand_total_estimate = final_estimates[0].item()
rel_err_grand_total = (grand_total_estimate - grand_total_official) / grand_total_official * 100
print(f"{'Grand Total (T)':<25} | {grand_total_official:>15.2f} | {grand_total_estimate:>15.2f} | {rel_err_grand_total:>19.4f}%")

for i in range(num_strata):
    stratum_official = t[i + 1].item()
    stratum_estimate = final_estimates[i + 1].item()
    rel_err_stratum = (stratum_estimate - stratum_official) / stratum_official * 100
    print(f"{f'Stratum {i+1} Total':<25} | {stratum_official:>15.2f} | {stratum_estimate:>15.2f} | {rel_err_stratum:>19.4f}%")

print("\nFinal optimized weights (first 5):")
print(final_weights.numpy()[:5])
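# Reading the comparison table: row 0 of M is the sum of rows 1..k (every
# observation belongs to exactly one stratum), so the estimated grand total
# always equals the sum of the estimated stratum totals, while the official
# targets differ by floor(c * T). The relative errors therefore cannot all be
# zero; the loss spreads the discrepancy across the grand total and the strata.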
