diff --git a/CLAUDE.md b/CLAUDE.md
index bd8570d..9bfeda0 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -351,6 +351,7 @@ python benchmarks/run_benchmarks.py --all
 
 # Run specific estimator
 python benchmarks/run_benchmarks.py --estimator callaway
+python benchmarks/run_benchmarks.py --estimator multiperiod
 ```
 
 See `docs/benchmarks.rst` for full methodology and validation results.
diff --git a/METHODOLOGY_REVIEW.md b/METHODOLOGY_REVIEW.md
index 233de9e..4752747 100644
--- a/METHODOLOGY_REVIEW.md
+++ b/METHODOLOGY_REVIEW.md
@@ -134,8 +134,10 @@ Each estimator in diff-diff should be periodically reviewed to ensure:
   fixed to use interaction sub-VCV instead of full regression VCV.
 
 **Outstanding Concerns:**
-- No R comparison benchmarks yet (unlike DifferenceInDifferences and CallawaySantAnna which
-  have formal R benchmark tests). Consider adding `benchmarks/R/multiperiod_benchmark.R`.
+- ~~No R comparison benchmarks yet~~ — **Resolved**: R comparison benchmark added via
+  `benchmarks/R/benchmark_multiperiod.R` using `fixest::feols(outcome ~ treated * time_f | unit)`.
+  Results match R exactly: ATT diff < 1e-11, SE diff 0.0%, period effects correlation 1.0.
+  Validated at small (200 units) and 1k scales.
 - Default SE is HC1 (not cluster-robust at unit level as fixest uses). Cluster-robust
   available via `cluster` parameter but not the default.
 - Endpoint binning for distant event times not yet implemented.
diff --git a/benchmarks/R/benchmark_multiperiod.R b/benchmarks/R/benchmark_multiperiod.R
new file mode 100644
index 0000000..7132488
--- /dev/null
+++ b/benchmarks/R/benchmark_multiperiod.R
@@ -0,0 +1,201 @@
+#!/usr/bin/env Rscript
+# Benchmark: MultiPeriodDiD event study (R `fixest` package)
+#
+# Usage:
+#   Rscript benchmark_multiperiod.R --data path/to/data.csv --output path/to/results.json \
+#     --n-pre 4 --n-post 4
+
+library(fixest)
+library(jsonlite)
+library(data.table)
+
+# Parse command line arguments
+args <- commandArgs(trailingOnly = TRUE)
+
+parse_args <- function(args) {
+  result <- list(
+    data = NULL,
+    output = NULL,
+    cluster = "unit",
+    n_pre = NULL,
+    n_post = NULL,
+    reference_period = NULL
+  )
+
+  i <- 1
+  while (i <= length(args)) {
+    if (args[i] == "--data") {
+      result$data <- args[i + 1]
+      i <- i + 2
+    } else if (args[i] == "--output") {
+      result$output <- args[i + 1]
+      i <- i + 2
+    } else if (args[i] == "--cluster") {
+      result$cluster <- args[i + 1]
+      i <- i + 2
+    } else if (args[i] == "--n-pre") {
+      result$n_pre <- as.integer(args[i + 1])
+      i <- i + 2
+    } else if (args[i] == "--n-post") {
+      result$n_post <- as.integer(args[i + 1])
+      i <- i + 2
+    } else if (args[i] == "--reference-period") {
+      result$reference_period <- as.integer(args[i + 1])
+      i <- i + 2
+    } else {
+      i <- i + 1
+    }
+  }
+
+  if (is.null(result$data) || is.null(result$output)) {
+    stop("Usage: Rscript benchmark_multiperiod.R --data <path> --output <path> --n-pre <n> --n-post <n>")
+  }
+  if (is.null(result$n_pre) || is.null(result$n_post)) {
+    stop("--n-pre and --n-post are required")
+  }
+
+  # Default reference period: last pre-period
+  if (is.null(result$reference_period)) {
+    result$reference_period <- result$n_pre
+  }
+
+  return(result)
+}
+
+config <- parse_args(args)
+
+# Load data
+message(sprintf("Loading data from: %s", config$data))
+data <- fread(config$data)
+
+ref_period <- config$reference_period
+message(sprintf("Reference period: %d", ref_period))
+message(sprintf("n_pre: %d, n_post: %d", config$n_pre, config$n_post))
+
+# Create factor for time with reference level
+data[, time_f := relevel(factor(time), ref = as.character(ref_period))]
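+# With the reference period as the factor's base level, its treated:time_f
+# interaction is dropped from the regression, so each remaining treated:time_fK
+# coefficient is the event-study effect for period K relative to the reference.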
+
+# Run benchmark
+message("Running MultiPeriodDiD estimation (fixest::feols)...")
+start_time <- Sys.time()
+
+# Regression: outcome ~ treated * time_f | unit, clustered SEs
+# With | unit, fixest absorbs unit fixed effects. The unit-invariant 'treated'
+# main effect is collinear with unit FE and is absorbed automatically.
+# Interaction coefficients treated:time_fK remain identified.
+cluster_formula <- as.formula(paste0("~", config$cluster))
+model <- feols(outcome ~ treated * time_f | unit, data = data, cluster = cluster_formula)
+
+estimation_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
+
+# Extract all coefficients and SEs
+coefs <- coef(model)
+ses <- se(model)
+vcov_mat <- vcov(model)
+
+# Extract interaction coefficients (treated:time_fK for each non-reference K)
+interaction_mask <- grepl("^treated:time_f", names(coefs))
+interaction_names <- names(coefs)[interaction_mask]
+interaction_coefs <- coefs[interaction_mask]
+interaction_ses <- ses[interaction_mask]
+
+message(sprintf("Found %d interaction coefficients", length(interaction_names)))
+
+# Build period effects list
+all_periods <- sort(unique(data$time))
+period_effects <- list()
+
+for (i in seq_along(interaction_names)) {
+  coef_name <- interaction_names[i]
+  # Extract period value from coefficient name "treated:time_fK"
+  period_val <- as.integer(sub("treated:time_f", "", coef_name))
+  event_time <- period_val - ref_period
+
+  period_effects[[i]] <- list(
+    period = period_val,
+    event_time = event_time,
+    att = unname(interaction_coefs[i]),
+    se = unname(interaction_ses[i])
+  )
+}
+
+# Compute average ATT across post-periods (covariance-aware SE)
+post_period_names <- c()
+for (coef_name in interaction_names) {
+  period_val <- as.integer(sub("treated:time_f", "", coef_name))
+  if (period_val > config$n_pre) {
+    post_period_names <- c(post_period_names, coef_name)
+  }
+}
+
+n_post_periods <- length(post_period_names)
+message(sprintf("Post-period interaction coefficients: %d", n_post_periods))
+
+if (n_post_periods > 0) {
+  avg_att <- mean(coefs[post_period_names])
+  vcov_sub <- vcov_mat[post_period_names, post_period_names, drop = FALSE]
+  avg_se <- sqrt(sum(vcov_sub) / n_post_periods^2)
+  # NaN guard: match registry convention (REGISTRY.md lines 179-183)
+  if (is.finite(avg_se) && avg_se > 0) {
+    avg_t <- avg_att / avg_se
+    avg_pval <- 2 * pt(abs(avg_t), df = model$nobs - length(coefs), lower.tail = FALSE)
+    avg_ci_lower <- avg_att - qt(0.975, df = model$nobs - length(coefs)) * avg_se
+    avg_ci_upper <- avg_att + qt(0.975, df = model$nobs - length(coefs)) * avg_se
+  } else {
+    avg_t <- NA
+    avg_pval <- NA
+    avg_ci_lower <- NA
+    avg_ci_upper <- NA
+  }
+} else {
+  avg_att <- NA
+  avg_se <- NA
+  avg_pval <- NA
+  avg_ci_lower <- NA
+  avg_ci_upper <- NA
+}
+
+message(sprintf("Average ATT: %.6f", avg_att))
+message(sprintf("Average SE: %.6f", avg_se))
+
+# Format output
+results <- list(
+  estimator = "fixest::feols (multiperiod)",
+  cluster = config$cluster,
+
+  # Average treatment effect
+  att = avg_att,
+  se = avg_se,
+  pvalue = avg_pval,
+  ci_lower = avg_ci_lower,
+  ci_upper = avg_ci_upper,
+
+  # Reference period
+  reference_period = ref_period,
+
+  # Period-level effects
+  period_effects = period_effects,
+
+  # Timing
+  timing = list(
+    estimation_seconds = estimation_time,
+    total_seconds = estimation_time
+  ),
+
+  # Metadata
+  metadata = list(
+    r_version = R.version.string,
+    fixest_version = as.character(packageVersion("fixest")),
+    n_units = length(unique(data$unit)),
+    n_periods = length(unique(data$time)),
+    n_obs = nrow(data),
+    n_pre = config$n_pre,
+    n_post = config$n_post
+  )
+)
+
+# Write output
+message(sprintf("Writing results to: %s", config$output))
+write_json(results, config$output, auto_unbox = TRUE, pretty = TRUE, digits = 10)
+
+message(sprintf("Completed in %.3f seconds", estimation_time))
diff --git a/benchmarks/python/benchmark_multiperiod.py b/benchmarks/python/benchmark_multiperiod.py
new file mode 100644
index 0000000..5507aa4
--- /dev/null
+++ b/benchmarks/python/benchmark_multiperiod.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+Benchmark: MultiPeriodDiD event study (diff-diff MultiPeriodDiD).
+
+Usage:
+    python benchmark_multiperiod.py --data path/to/data.csv --output path/to/results.json \
+        --n-pre 4 --n-post 4
+"""
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+# IMPORTANT: Parse --backend and set environment variable BEFORE importing diff_diff.
+# This ensures the backend configuration is respected by all modules.
+def _get_backend_from_args():
+    """Parse --backend argument without importing diff_diff."""
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument("--backend", default="auto", choices=["auto", "python", "rust"])
+    args, _ = parser.parse_known_args()
+    return args.backend
+
+_requested_backend = _get_backend_from_args()
+if _requested_backend in ("python", "rust"):
+    os.environ["DIFF_DIFF_BACKEND"] = _requested_backend
+
+# NOW import diff_diff and other dependencies (will see the env var)
+import pandas as pd
+
+# Add parent to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from diff_diff import MultiPeriodDiD, HAS_RUST_BACKEND
+from benchmarks.python.utils import Timer
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Benchmark MultiPeriodDiD estimator")
+    parser.add_argument("--data", required=True, help="Path to input CSV data")
+    parser.add_argument("--output", required=True, help="Path to output JSON results")
+    parser.add_argument(
+        "--cluster", default="unit", help="Column to cluster standard errors on"
+    )
+    parser.add_argument(
+        "--n-pre", type=int, required=True, help="Number of pre-treatment periods"
+    )
+    parser.add_argument(
+        "--n-post", type=int, required=True, help="Number of post-treatment periods"
+    )
+    parser.add_argument(
+        "--reference-period", type=int, default=None,
+        help="Reference period (default: last pre-period = n_pre)"
+    )
+    parser.add_argument(
+        "--backend", default="auto", choices=["auto", "python", "rust"],
+        help="Backend to use: auto (default), python (pure Python), rust (Rust backend)"
+    )
+    return parser.parse_args()
+
+
+def get_actual_backend() -> str:
+    """Return the actual backend being used based on HAS_RUST_BACKEND."""
+    return "rust" if HAS_RUST_BACKEND else "python"
+
+
+def main():
+    args = parse_args()
+
+    # Get actual backend (already configured via env var before imports)
+    actual_backend = get_actual_backend()
+    print(f"Using backend: {actual_backend}")
+
+    # Load data
+    print(f"Loading data from: {args.data}")
+    data = pd.read_csv(args.data)
+
+    # Compute post_periods and reference_period from args
+    all_periods = sorted(data["time"].unique())
+    n_pre = args.n_pre
+    post_periods = [p for p in all_periods if p > n_pre]
+    ref_period = args.reference_period if args.reference_period is not None else n_pre
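+    # Note: the synthetic benchmark data uses periods 1..n_pre+n_post with treatment
+    # starting at period n_pre + 1, so post_periods and the default reference period
+    # (the last pre-treatment period) follow directly from --n-pre.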
+    print(f"All periods: {all_periods}")
+    print(f"Post periods: {post_periods}")
+    print(f"Reference period: {ref_period}")
+
+    # Run benchmark
+    print("Running MultiPeriodDiD estimation...")
+
+    did = MultiPeriodDiD(robust=True, cluster=args.cluster)
+
+    with Timer() as timer:
+        results = did.fit(
+            data,
+            outcome="outcome",
+            treatment="treated",
+            time="time",
+            post_periods=post_periods,
+            reference_period=ref_period,
+            absorb=["unit"],
+        )
+
+    total_time = timer.elapsed
+
+    # Extract period effects (excluding reference period)
+    period_effects = []
+    for period, pe in sorted(results.period_effects.items()):
+        event_time = period - ref_period
+        period_effects.append({
+            "period": int(period),
+            "event_time": int(event_time),
+            "att": float(pe.effect),
+            "se": float(pe.se),
+        })
+
+    # Build output
+    output = {
+        "estimator": "diff_diff.MultiPeriodDiD",
+        "backend": actual_backend,
+        "cluster": args.cluster,
+        # Average treatment effect (across post-periods)
+        "att": float(results.avg_att),
+        "se": float(results.avg_se),
+        "pvalue": float(results.avg_p_value),
+        "ci_lower": float(results.avg_conf_int[0]),
+        "ci_upper": float(results.avg_conf_int[1]),
+        # Reference period
+        "reference_period": int(ref_period),
+        # Period-level effects
+        "period_effects": period_effects,
+        # Timing
+        "timing": {
+            "estimation_seconds": total_time,
+            "total_seconds": total_time,
+        },
+        # Metadata
+        "metadata": {
+            "n_units": int(data["unit"].nunique()),
+            "n_periods": int(data["time"].nunique()),
+            "n_obs": len(data),
+            "n_pre": n_pre,
+            "n_post": len(post_periods),
+        },
+    }
+
+    # Write output
+    print(f"Writing results to: {args.output}")
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(output, f, indent=2)
+
+    print(f"ATT: {results.avg_att:.6f}")
+    print(f"SE: {results.avg_se:.6f}")
+    print(f"Completed in {total_time:.3f} seconds")
+    return output
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/python/utils.py b/benchmarks/python/utils.py
index a71c9d9..3ccca80 100644
--- a/benchmarks/python/utils.py
+++ b/benchmarks/python/utils.py
@@ -331,6 +331,68 @@ def generate_sdid_data(
     return pd.DataFrame(data)
 
 
+def generate_multiperiod_data(
+    n_units: int = 200,
+    n_pre: int = 4,
+    n_post: int = 4,
+    treatment_effect: float = 3.0,
+    treatment_fraction: float = 0.5,
+    seed: int = 42,
+) -> pd.DataFrame:
+    """
+    Generate synthetic multi-period event study data for benchmarking.
+
+    All treated units receive treatment at the same time (no staggered adoption).
+
+    Parameters
+    ----------
+    n_units : int
+        Number of units.
+    n_pre : int
+        Number of pre-treatment periods.
+    n_post : int
+        Number of post-treatment periods.
+    treatment_effect : float
+        True treatment effect in post-periods.
+    treatment_fraction : float
+        Fraction of units that are treated.
+    seed : int
+        Random seed.
+
+    Returns
+    -------
+    pd.DataFrame
+        Panel data with columns: unit, time, outcome, treated.
+    """
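+    # DGP: outcome = unit fixed effect + linear time trend (0.5 * t)
+    #      + treatment_effect * treated * post + N(0, 1) noise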
+    rng = np.random.default_rng(seed)
+
+    n_treated = int(n_units * treatment_fraction)
+    n_periods = n_pre + n_post
+    data = []
+
+    for unit in range(n_units):
+        unit_fe = rng.normal(0, 2)
+        treated = 1 if unit < n_treated else 0
+
+        for t in range(1, n_periods + 1):
+            time_fe = t * 0.5
+            post = 1 if t > n_pre else 0
+
+            effect = treatment_effect if (treated and post) else 0
+            outcome = unit_fe + time_fe + effect + rng.normal(0, 1)
+
+            data.append(
+                {
+                    "unit": unit,
+                    "time": t,
+                    "outcome": outcome,
+                    "treated": treated,
+                }
+            )
+
+    return pd.DataFrame(data)
+
+
 def load_benchmark_data(path: Path) -> pd.DataFrame:
     """Load benchmark data from CSV."""
     return pd.read_csv(path)
diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py
index 284d2f6..7501ee7 100644
--- a/benchmarks/run_benchmarks.py
+++ b/benchmarks/run_benchmarks.py
@@ -33,11 +33,13 @@
     generate_staggered_data,
     generate_basic_did_data,
     generate_sdid_data,
+    generate_multiperiod_data,
     save_benchmark_data,
     compute_timing_stats,
 )
 from benchmarks.compare_results import (
     compare_estimates,
+    compare_event_study,
     generate_comparison_report,
     load_results,
 )
@@ -48,26 +50,31 @@
         "staggered": {"n_units": 200, "n_periods": 8, "n_cohorts": 3},
         "basic": {"n_units": 100, "n_periods": 4},
         "sdid": {"n_control": 40, "n_treated": 10, "n_pre": 15, "n_post": 5},
+        "multiperiod": {"n_units": 200, "n_pre": 4, "n_post": 4},
     },
     "1k": {
         "staggered": {"n_units": 1000, "n_periods": 10, "n_cohorts": 4},
         "basic": {"n_units": 1000, "n_periods": 6},
         "sdid": {"n_control": 800, "n_treated": 200, "n_pre": 20, "n_post": 10},
+        "multiperiod": {"n_units": 1000, "n_pre": 5, "n_post": 5},
     },
     "5k": {
         "staggered": {"n_units": 5000, "n_periods": 12, "n_cohorts": 5},
         "basic": {"n_units": 5000, "n_periods": 8},
         "sdid": {"n_control": 4000, "n_treated": 1000, "n_pre": 25, "n_post": 15},
+        "multiperiod": {"n_units": 5000, "n_pre": 6, "n_post": 6},
     },
     "10k": {
         "staggered": {"n_units": 10000, "n_periods": 15, "n_cohorts": 6},
         "basic": {"n_units": 10000, "n_periods": 10},
         "sdid": {"n_control": 8000, "n_treated": 2000, "n_pre": 30, "n_post": 20},
+        "multiperiod": {"n_units": 10000, "n_pre": 6, "n_post": 6},
     },
     "20k": {
         "staggered": {"n_units": 20000, "n_periods": 18, "n_cohorts": 7},
         "basic": {"n_units": 20000, "n_periods": 12},
         "sdid": {"n_control": 16000, "n_treated": 4000, "n_pre": 35, "n_post": 25},
+        "multiperiod": {"n_units": 20000, "n_pre": 8, "n_post": 8},
     },
 }
 
@@ -291,6 +298,22 @@ def generate_synthetic_datasets(
         save_benchmark_data(sdid_data, sdid_path)
         datasets[f"sdid_{scale}"] = sdid_path
 
+        # MultiPeriod event study data
+        mp_cfg = config["multiperiod"]
+        n_periods = mp_cfg["n_pre"] + mp_cfg["n_post"]
+        n_obs = mp_cfg["n_units"] * n_periods
+        print(f"  - multiperiod_{scale} ({mp_cfg['n_units']} units, {n_periods} periods, {n_obs:,} obs)")
+        multiperiod_data = generate_multiperiod_data(
+            n_units=mp_cfg["n_units"],
+            n_pre=mp_cfg["n_pre"],
+            n_post=mp_cfg["n_post"],
+            treatment_effect=3.0,
+            seed=seed,
+        )
+        multiperiod_path = DATA_DIR / "synthetic" / f"multiperiod_{scale}.csv"
+        save_benchmark_data(multiperiod_data, multiperiod_path)
+        datasets[f"multiperiod_{scale}"] = multiperiod_path
+
     print(f"\nGenerated {len(datasets)} datasets")
     return datasets
 
@@ -672,6 +695,144 @@ def run_basic_did_benchmark(
     return results
+
+
+def run_multiperiod_benchmark(
+    data_path: Path,
+    n_pre: int,
+    n_post: int,
+    name: str = "multiperiod",
+    scale: str = "small",
+    n_replications: int = 1,
+    backends: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """Run MultiPeriodDiD event study benchmarks (Python and R) with replications."""
+    print(f"\n{'='*60}")
+    print(f"MULTIPERIOD DID BENCHMARK ({scale})")
+    print(f"{'='*60}")
+
+    if backends is None:
+        backends = ["python", "rust"]
+
+    timeouts = TIMEOUT_CONFIGS.get(scale, TIMEOUT_CONFIGS["small"])
+    extra_args = ["--n-pre", str(n_pre), "--n-post", str(n_post)]
+    results = {
+        "name": name,
+        "scale": scale,
+        "n_replications": n_replications,
+        "python_pure": None,
+        "python_rust": None,
+        "python": None,
+        "r": None,
+        "comparison": None,
+    }
+
+    # Run Python benchmark for each backend
+    for backend in backends:
+        backend_label = f"python_{'pure' if backend == 'python' else backend}"
+        print(f"\nRunning Python (diff_diff.MultiPeriodDiD, backend={backend}) - {n_replications} replications...")
+        py_output = RESULTS_DIR / "accuracy" / f"{backend_label}_{name}_{scale}.json"
+        py_output.parent.mkdir(parents=True, exist_ok=True)
+
+        py_timings = []
+        py_result = None
+        for rep in range(n_replications):
+            try:
+                py_result = run_python_benchmark(
+                    "benchmark_multiperiod.py", data_path, py_output,
+                    extra_args=extra_args,
+                    timeout=timeouts["python"],
+                    backend=backend,
+                )
+                py_timings.append(py_result["timing"]["total_seconds"])
+                if rep == 0:
+                    print(f"  ATT: {py_result['att']:.4f}")
+                    print(f"  SE: {py_result['se']:.4f}")
+                print(f"  Rep {rep+1}/{n_replications}: {py_timings[-1]:.3f}s")
+            except Exception as e:
+                print(f"  Rep {rep+1} failed: {e}")
+
+        if py_result and py_timings:
+            timing_stats = compute_timing_stats(py_timings)
+            py_result["timing"] = timing_stats
+            results[backend_label] = py_result
+            print(f"  Mean time: {timing_stats['stats']['mean']:.3f}s ± {timing_stats['stats']['std']:.3f}s")
+
+    # For backward compatibility, also store as "python" (use rust if available)
+    if results.get("python_rust"):
+        results["python"] = results["python_rust"]
+    elif results.get("python_pure"):
+        results["python"] = results["python_pure"]
+
+    # R benchmark with replications
+    print(f"\nRunning R (fixest::feols multiperiod) - {n_replications} replications...")
+    r_output = RESULTS_DIR / "accuracy" / f"r_{name}_{scale}.json"
+
+    r_timings = []
+    r_result = None
+    for rep in range(n_replications):
+        try:
+            r_result = run_r_benchmark(
+                "benchmark_multiperiod.R", data_path, r_output,
+                extra_args=extra_args,
+                timeout=timeouts["r"]
+            )
+            r_timings.append(r_result["timing"]["total_seconds"])
+            if rep == 0:
+                print(f"  ATT: {r_result['att']:.4f}")
+                print(f"  SE: {r_result['se']:.4f}")
+            print(f"  Rep {rep+1}/{n_replications}: {r_timings[-1]:.3f}s")
+        except Exception as e:
+            print(f"  Rep {rep+1} failed: {e}")
+
+    if r_result and r_timings:
+        timing_stats = compute_timing_stats(r_timings)
+        r_result["timing"] = timing_stats
+        results["r"] = r_result
+        print(f"  Mean time: {timing_stats['stats']['mean']:.3f}s ± {timing_stats['stats']['std']:.3f}s")
+
+    # Compare results
+    if results["python"] and results["r"]:
+        print("\nComparison (Python vs R):")
+        comparison = compare_estimates(
+            results["python"], results["r"], "MultiPeriodDiD", scale=scale,
+            se_rtol=0.01,
+            python_pure_results=results.get("python_pure"),
+            python_rust_results=results.get("python_rust"),
+        )
+        results["comparison"] = comparison
+        print(f"  ATT diff: {comparison.att_diff:.2e}")
+        print(f"  SE rel diff: {comparison.se_rel_diff:.1%}")
+        print(f"  Status: {'PASS' if comparison.passed else 'FAIL'}")
+
+        # Period-level comparison
+        py_effects = results["python"].get("period_effects", [])
+        r_effects = results["r"].get("period_effects", [])
+        if py_effects and r_effects:
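+            # compare_event_study returns (correlation, max abs difference, all-close flag)
+            # for the matched period-level effects.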
+            corr, max_diff, all_close = compare_event_study(py_effects, r_effects)
+            print(f"  Period effects correlation: {corr:.6f}")
+            print(f"  Period effects max diff: {max_diff:.2e}")
+            print(f"  Period effects all close: {all_close}")
+
+    # Print timing comparison table
+    print("\nTiming Comparison:")
+    print(f"  {'Backend':<15} {'Time (s)':<12} {'vs R':<12} {'vs Pure Python':<15}")
+    print(f"  {'-'*54}")
+
+    r_mean = results["r"]["timing"]["stats"]["mean"] if results["r"] else None
+    pure_mean = results["python_pure"]["timing"]["stats"]["mean"] if results.get("python_pure") else None
+    rust_mean = results["python_rust"]["timing"]["stats"]["mean"] if results.get("python_rust") else None
+
+    if r_mean:
+        print(f"  {'R':<15} {r_mean:<12.3f} {'1.00x':<12} {'-':<15}")
+    if pure_mean:
+        r_speedup = f"{r_mean/pure_mean:.2f}x" if r_mean else "-"
+        print(f"  {'Python (pure)':<15} {pure_mean:<12.3f} {r_speedup:<12} {'1.00x':<15}")
+    if rust_mean:
+        r_speedup = f"{r_mean/rust_mean:.2f}x" if r_mean else "-"
+        pure_speedup = f"{pure_mean/rust_mean:.2f}x" if pure_mean else "-"
+        print(f"  {'Python (rust)':<15} {rust_mean:<12.3f} {r_speedup:<12} {pure_speedup:<15}")
+
+    return results
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Run diff-diff benchmarks against R packages"
     )
     parser.add_argument(
@@ -683,7 +844,7 @@ def main():
     )
     parser.add_argument(
         "--estimator",
-        choices=["callaway", "synthdid", "basic"],
+        choices=["callaway", "synthdid", "basic", "multiperiod"],
         help="Run specific estimator benchmark",
     )
     parser.add_argument(
@@ -771,6 +932,19 @@ def main():
             )
             all_results.append(results)
 
+        if args.all or args.estimator == "multiperiod":
+            mp_key = f"multiperiod_{scale}"
+            if mp_key in datasets:
+                mp_cfg = SCALE_CONFIGS[scale]["multiperiod"]
+                results = run_multiperiod_benchmark(
+                    datasets[mp_key],
+                    n_pre=mp_cfg["n_pre"],
+                    n_post=mp_cfg["n_post"],
+                    scale=scale,
+                    n_replications=args.replications,
+                )
+                all_results.append(results)
+
     # Generate summary report
     if all_results:
         print(f"\n{'='*60}")
diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
index 647f903..eee42d9 100644
--- a/docs/benchmarks.rst
+++ b/docs/benchmarks.rst
@@ -27,6 +27,9 @@ diff-diff is validated against the following R packages:
    * - ``CallawaySantAnna``
      - ``did::att_gt``
      - Callaway & Sant'Anna (2021)
+   * - ``MultiPeriodDiD``
+     - ``fixest::feols``
+     - Event study with treatment × period interactions
    * - ``SyntheticDiD``
      - ``synthdid::synthdid_estimate``
      - Arkhangelsky et al. (2021)
@@ -74,6 +77,11 @@ Summary Table
      - 0.0%
      - Yes
      - **PASS**
+   * - MultiPeriodDiD
+     - < 1e-11
+     - 0.0%
+     - Yes
+     - **PASS**
    * - CallawaySantAnna
      - < 1e-10
      - 0.0%
@@ -116,6 +124,46 @@ Basic DiD Results
 
 **Validation**: PASS - Results are numerically identical across all implementations.
 
+MultiPeriodDiD Results
+~~~~~~~~~~~~~~~~~~~~~~
+
+**Data**: 200 units, 8 periods (4 pre, 4 post), true ATT = 3.0 (small scale)
+
+.. list-table::
+   :header-rows: 1
+
+   * - Metric
+     - diff-diff (Pure)
+     - diff-diff (Rust)
+     - R fixest
+     - Difference
+   * - ATT
+     - 2.912
+     - 2.912
+     - 2.912
+     - < 1e-11
+   * - SE
+     - 0.158
+     - 0.158
+     - 0.158
+     - 0.0%
+   * - Period corr.
+     - 1.000
+     - 1.000
+     - (ref)
+     - Period max diff < 3e-11
+   * - Time (s)
+     - 0.005
+     - 0.035
+     - 0.035
+     - **7x faster** (pure)
+
+**Validation**: PASS - Both average ATT and all period-level effects match R's
+``fixest::feols(outcome ~ treated * time_f | unit)`` to machine precision. The
+regression includes unit fixed effects (absorbed via ``| unit`` in R,
+within-transformation via ``absorb=["unit"]`` in Python) and treatment × period
+interactions with cluster-robust SEs.
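+
+For reference, a sketch of the Python call being validated (it mirrors
+``benchmarks/python/benchmark_multiperiod.py``; at the small scale the post
+periods are 5-8 and the reference period is 4):
+
+.. code-block:: python
+
+   from benchmarks.python.utils import generate_multiperiod_data
+   from diff_diff import MultiPeriodDiD
+
+   # Synthetic panel: 200 units, periods 1-8, treatment starts at period 5
+   data = generate_multiperiod_data(n_units=200, n_pre=4, n_post=4, seed=42)
+
+   # Event study with unit FE absorbed and unit-clustered SEs, mirroring fixest
+   did = MultiPeriodDiD(robust=True, cluster="unit")
+   results = did.fit(
+       data,
+       outcome="outcome",
+       treatment="treated",
+       time="time",
+       post_periods=[5, 6, 7, 8],
+       reference_period=4,
+       absorb=["unit"],
+   )
+   print(results.avg_att, results.avg_se)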
+
 Synthetic DiD Results
 ~~~~~~~~~~~~~~~~~~~~~
 
@@ -363,35 +411,41 @@ Dataset Sizes
 
 .. list-table::
    :header-rows: 1
-   :widths: 12 22 22 22 22
+   :widths: 10 18 18 18 18 18
 
    * - Scale
      - BasicDiD
+     - MultiPeriodDiD
      - CallawaySantAnna
      - SyntheticDiD
      - Observations
    * - small
      - 100 × 4
      - 200 × 8
+     - 200 × 8
      - 50 × 20
      - 400 - 1,600
    * - 1k
      - 1,000 × 6
      - 1,000 × 10
+     - 1,000 × 10
      - 1,000 × 30
      - 6,000 - 30,000
    * - 5k
      - 5,000 × 8
      - 5,000 × 12
+     - 5,000 × 12
      - 5,000 × 40
      - 40,000 - 200,000
    * - 10k
      - 10,000 × 10
+     - 10,000 × 12
      - 10,000 × 15
      - 10,000 × 50
      - 100,000 - 500,000
    * - 20k
      - 20,000 × 12
+     - 20,000 × 16
      - 20,000 × 18
      - 20,000 × 60
      - 240,000 - 1,200,000
@@ -565,6 +619,7 @@ Running Benchmarks
    python benchmarks/run_benchmarks.py --estimator callaway --scale 1k --replications 3
    python benchmarks/run_benchmarks.py --estimator synthdid --scale small --replications 3
    python benchmarks/run_benchmarks.py --estimator basic --scale 20k --replications 3
+   python benchmarks/run_benchmarks.py --estimator multiperiod --scale small --replications 3
 
    # Available scales: small, 1k, 5k, 10k, 20k, all
    # Default: small (backward compatible)
@@ -591,6 +646,10 @@ When to Trust Results
 
 - **BasicDiD/TWFE**: Results are identical to R. Use with confidence.
 
+- **MultiPeriodDiD**: Results are identical to R's ``fixest::feols`` with
+  ``treated * time_f | unit`` interaction syntax (unit FE absorbed). Both the average
+  ATT and all period-level effects match to machine precision. Use with confidence.
+
 - **SyntheticDiD**: Both point estimates (0.3% diff) and standard errors (3.1%
   diff) match R closely. Use ``variance_method="placebo"`` (default) to match R's
   inference. Results are fully validated.