import numpy as np
import pandas as pd
import torch


def simulate_contradictory_data(
    T: float,                      # official grand total
    k: int,                        # number of strata
    c: float,                      # contradiction factor: stratum totals fall short of T by floor(c * T)
    n: int,                        # total sample size across all strata
    dirichlet_alpha: float = 5.0,  # lower values create more uneven stratum proportions
    gamma_shape: float = 2.0,
    gamma_scale: float = 2.0,
):
    """Simulate a stratified sample whose official totals are contradictory.

    Returns a tuple of (sample DataFrame, official totals dict); the official
    stratum subtotals deliberately sum to T - floor(c * T) rather than T.
    """
    # Simulate stratum proportions (thetas) from a symmetric Dirichlet
    alphas = np.full(k, dirichlet_alpha)
    thetas = np.random.dirichlet(alphas)

    # Define stratum subtotals S_i consistent with the grand total
    S_i = thetas * T

    # Calculate and distribute the contradiction 'delta' across strata
    delta = np.floor(c * T)
    p = np.full(k, 1 / k)
    delta_i = np.random.multinomial(n=int(delta), pvals=p)

    # Final, contradictory official subtotals S_i^* (they sum to T - delta, not T)
    S_i_star = S_i - delta_i

    # Allocate sample size n to strata in proportion to the stratum totals,
    # using the largest-remainder method to keep the allocation integral
    n_i_float = thetas * n
    n_i = n_i_float.astype(int)
    remainder_n = n - n_i.sum()
    if remainder_n > 0:
        top_indices_n = np.argsort(n_i_float - n_i)[-remainder_n:]
        n_i[top_indices_n] += 1
    if n_i.sum() != n:
        diff = n_i.sum() - n
        while diff > 0:
            largest_idx = np.argmax(n_i)
            if n_i[largest_idx] > 1:
                n_i[largest_idx] -= 1
                diff -= 1
            else:
                break
    assert (n_i > 0).all(), "each stratum needs at least one observation for the baseline weights"
    # Simulate sample microdata per stratum and calculate baseline weights
    all_samples = []
    for i in range(k):
        sample_y = np.random.gamma(shape=gamma_shape, scale=gamma_scale, size=n_i[i])
        # Baseline weight: constant within the stratum, chosen so that the
        # weighted sum exactly reproduces the official subtotal S_i^*
        weight = np.full(n_i[i], S_i_star[i] / np.sum(sample_y))
        stratum_sample = pd.DataFrame({
            'stratum_id': i + 1,
            'y_ij': sample_y,
            'w_ij': weight
        })
        all_samples.append(stratum_sample)

    # Combine and return
    final_sample_df = pd.concat(all_samples, ignore_index=True)
    official_totals = {'T_official': T, 'S_star_official': S_i_star}
    return final_sample_df, official_totals


sample_df, totals = simulate_contradictory_data(
    T=6000,
    k=3,
    c=0.1,
    n=30
)
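
# Sanity check: by construction the stratum subtotals fall short of the
# grand total by floor(c * T) (600 in this example).
print(f"Grand total T: {totals['T_official']}")
print(f"Sum of stratum subtotals S*: {totals['S_star_official'].sum():.2f}")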


# Build the constraint matrix M: row 0 holds y for the grand-total constraint,
# rows 1..k hold y masked to each stratum for the subtotal constraints.
num_strata = sample_df['stratum_id'].nunique()
num_obs = len(sample_df)
M = torch.zeros((num_strata + 1, num_obs), dtype=torch.float32)

M[0, :] = torch.tensor(sample_df['y_ij'].values, dtype=torch.float32)
for i in range(num_strata):
    stratum_mask = (sample_df['stratum_id'] == (i + 1))
    M[i + 1, :] = torch.tensor(np.where(stratum_mask, sample_df['y_ij'], 0), dtype=torch.float32)
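
# A loop-free alternative for rows 1..k (a sketch, assuming the stratum ids
# are the consecutive integers 1..k, as simulated above):
#   one_hot = torch.nn.functional.one_hot(
#       torch.tensor(sample_df['stratum_id'].values - 1), num_classes=num_strata
#   ).T.float()
#   M[1:, :] = one_hot * M[0, :]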

# Target vector: the official grand total first, then the stratum subtotals
t_values = [totals['T_official']] + list(totals['S_star_official'])
t = torch.tensor(t_values, dtype=torch.float32)

# Optimize the log-weights so the implied weights stay strictly positive
log_w = torch.tensor(np.log(sample_df['w_ij'].values), dtype=torch.float32, requires_grad=True)
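
# At the baseline weights each stratum row reproduces its target exactly, so
# the entire initial disagreement sits in the grand-total row:
with torch.no_grad():
    baseline_estimates = torch.matmul(M, torch.exp(log_w))
print(f"Baseline estimates: {baseline_estimates.numpy()}")
print(f"Targets:            {t.numpy()}")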

def relative_loss(log_weights):
    """
    Sum of squared relative errors against the official totals, split into:
    1. the loss for the grand total, and
    2. the sum of losses over the stratum totals.
    M and t are taken from the enclosing scope.
    """
    weights = torch.exp(log_weights)
    estimated_totals = torch.matmul(M, weights)
    grand_total_loss = ((estimated_totals[0] - t[0]) / t[0]) ** 2
    stratum_totals_loss = torch.sum(((estimated_totals[1:] - t[1:]) / t[1:]) ** 2)
    return grand_total_loss + stratum_totals_loss
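
# The two sets of targets cannot all be met at once, so the optimizer will
# settle on a compromise. The starting loss is driven entirely by the
# grand-total term (approximately c**2 = 0.01 in this example):
print(f"Initial loss: {relative_loss(log_w).item():.6f}")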

# Set up the Adam optimizer to adjust the log-weights
optimizer = torch.optim.Adam([log_w], lr=0.1)
n_epochs = 100

print("--- Starting Optimization ---")
# Optimization loop: each step re-estimates the totals and nudges the
# log-weights down the gradient of the combined relative loss
for epoch in range(n_epochs):
    optimizer.zero_grad()
    loss = relative_loss(log_w)
    loss.backward()
    optimizer.step()

    if epoch % 5 == 0:
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.8f}")
print("--- Optimization Finished ---\n")


# --- Results and Verification ---
final_weights = torch.exp(log_w).detach()
final_estimates = torch.matmul(M, final_weights)

print("--- Comparison of Totals ---")
print(f"{'Metric':<25} | {'Official Target':>15} | {'Final Estimate':>15} | {'Relative Error (%)':>20}")
print("-" * 85)

grand_total_official = t[0].item()
grand_total_estimate = final_estimates[0].item()
rel_err_grand_total = (grand_total_estimate - grand_total_official) / grand_total_official * 100
print(f"{'Grand Total (T)':<25} | {grand_total_official:>15.2f} | {grand_total_estimate:>15.2f} | {rel_err_grand_total:>19.4f}%")

for i in range(num_strata):
    stratum_official = t[i + 1].item()
    stratum_estimate = final_estimates[i + 1].item()
    rel_err_stratum = (stratum_estimate - stratum_official) / stratum_official * 100
    print(f"{f'Stratum {i+1} Total':<25} | {stratum_official:>15.2f} | {stratum_estimate:>15.2f} | {rel_err_stratum:>19.4f}%")

print("\nFinal optimized weights (first 5):")
print(final_weights.numpy()[:5])
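# How far calibration moved the weights from their baselines (a diagnostic
# sketch; 'adjustment' is introduced here for illustration only):
adjustment = final_weights.numpy() / sample_df['w_ij'].values
print(f"Weight adjustment factors: min={adjustment.min():.4f}, max={adjustment.max():.4f}")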