diff --git a/CLAUDE.md b/CLAUDE.md
index bd8570d..9bfeda0 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -351,6 +351,7 @@ python benchmarks/run_benchmarks.py --all
 
 # Run specific estimator
 python benchmarks/run_benchmarks.py --estimator callaway
+python benchmarks/run_benchmarks.py --estimator multiperiod
 ```
 
 See `docs/benchmarks.rst` for full methodology and validation results.
diff --git a/METHODOLOGY_REVIEW.md b/METHODOLOGY_REVIEW.md
index 233de9e..4752747 100644
--- a/METHODOLOGY_REVIEW.md
+++ b/METHODOLOGY_REVIEW.md
@@ -134,8 +134,10 @@ Each estimator in diff-diff should be periodically reviewed to ensure:
   fixed to use interaction sub-VCV instead of full regression VCV.
 
 **Outstanding Concerns:**
-- No R comparison benchmarks yet (unlike DifferenceInDifferences and CallawaySantAnna which
-  have formal R benchmark tests). Consider adding `benchmarks/R/multiperiod_benchmark.R`.
+- ~~No R comparison benchmarks yet~~ — **Resolved**: R comparison benchmark added via
+  `benchmarks/R/benchmark_multiperiod.R` using `fixest::feols(outcome ~ treated * time_f | unit)`.
+  Results match R exactly: ATT diff < 1e-11, SE diff 0.0%, period effects correlation 1.0.
+  Validated at small (200 units) and 1k scales.
 - Default SE is HC1 (not cluster-robust at unit level as fixest uses). Cluster-robust
   available via `cluster` parameter but not the default.
 - Endpoint binning for distant event times not yet implemented.
diff --git a/benchmarks/R/benchmark_multiperiod.R b/benchmarks/R/benchmark_multiperiod.R
new file mode 100644
index 0000000..7132488
--- /dev/null
+++ b/benchmarks/R/benchmark_multiperiod.R
@@ -0,0 +1,201 @@
+#!/usr/bin/env Rscript
+# Benchmark: MultiPeriodDiD event study (R `fixest` package)
+#
+# Usage:
+#   Rscript benchmark_multiperiod.R --data path/to/data.csv --output path/to/results.json \
+#     --n-pre 4 --n-post 4
+
+library(fixest)
+library(jsonlite)
+library(data.table)
+
+# Parse command line arguments
+args <- commandArgs(trailingOnly = TRUE)
+
+parse_args <- function(args) {
+  result <- list(
+    data = NULL,
+    output = NULL,
+    cluster = "unit",
+    n_pre = NULL,
+    n_post = NULL,
+    reference_period = NULL
+  )
+
+  i <- 1
+  while (i <= length(args)) {
+    if (args[i] == "--data") {
+      result$data <- args[i + 1]
+      i <- i + 2
+    } else if (args[i] == "--output") {
+      result$output <- args[i + 1]
+      i <- i + 2
+    } else if (args[i] == "--cluster") {
+      result$cluster <- args[i + 1]
+      i <- i + 2
+    } else if (args[i] == "--n-pre") {
+      result$n_pre <- as.integer(args[i + 1])
+      i <- i + 2
+    } else if (args[i] == "--n-post") {
+      result$n_post <- as.integer(args[i + 1])
+      i <- i + 2
+    } else if (args[i] == "--reference-period") {
+      result$reference_period <- as.integer(args[i + 1])
+      i <- i + 2
+    } else {
+      i <- i + 1
+    }
+  }
+
+  if (is.null(result$data) || is.null(result$output)) {
+    stop("Usage: Rscript benchmark_multiperiod.R --data <path> --output <path> --n-pre <n> --n-post <n>")
+  }
+  if (is.null(result$n_pre) || is.null(result$n_post)) {
+    stop("--n-pre and --n-post are required")
+  }
+
+  # Default reference period: last pre-period
+  if (is.null(result$reference_period)) {
+    result$reference_period <- result$n_pre
+  }
+
+  return(result)
+}
+
+config <- parse_args(args)
+
+# Load data
+message(sprintf("Loading data from: %s", config$data))
+data <- fread(config$data)
+
+ref_period <- config$reference_period
+message(sprintf("Reference period: %d", ref_period))
+message(sprintf("n_pre: %d, n_post: %d", config$n_pre, config$n_post))
+
+# Create factor for time with reference level
+data[, time_f := relevel(factor(time), ref = as.character(ref_period))]
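+# With the reference period as the factor's base level, its treated:time_f
+# interaction is dropped from the regression, so each remaining treated:time_fK
+# coefficient is the event-study effect for period K relative to the reference.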
+
+# Run benchmark
+message("Running MultiPeriodDiD estimation (fixest::feols)...")
+start_time <- Sys.time()
+
+# Regression: outcome ~ treated * time_f | unit, clustered SEs
+# With | unit, fixest absorbs unit fixed effects. The unit-invariant 'treated'
+# main effect is collinear with unit FE and is absorbed automatically.
+# Interaction coefficients treated:time_fK remain identified.
+cluster_formula <- as.formula(paste0("~", config$cluster))
+model <- feols(outcome ~ treated * time_f | unit, data = data, cluster = cluster_formula)
+
+estimation_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
+
+# Extract all coefficients and SEs
+coefs <- coef(model)
+ses <- se(model)
+vcov_mat <- vcov(model)
+
+# Extract interaction coefficients (treated:time_fK for each non-reference K)
+interaction_mask <- grepl("^treated:time_f", names(coefs))
+interaction_names <- names(coefs)[interaction_mask]
+interaction_coefs <- coefs[interaction_mask]
+interaction_ses <- ses[interaction_mask]
+
+message(sprintf("Found %d interaction coefficients", length(interaction_names)))
+
+# Build period effects list
+all_periods <- sort(unique(data$time))
+period_effects <- list()
+
+for (i in seq_along(interaction_names)) {
+  coef_name <- interaction_names[i]
+  # Extract period value from coefficient name "treated:time_fK"
+  period_val <- as.integer(sub("treated:time_f", "", coef_name))
+  event_time <- period_val - ref_period
+
+  period_effects[[i]] <- list(
+    period = period_val,
+    event_time = event_time,
+    att = unname(interaction_coefs[i]),
+    se = unname(interaction_ses[i])
+  )
+}
+
+# Compute average ATT across post-periods (covariance-aware SE)
+post_period_names <- c()
+for (coef_name in interaction_names) {
+  period_val <- as.integer(sub("treated:time_f", "", coef_name))
+  if (period_val > config$n_pre) {
+    post_period_names <- c(post_period_names, coef_name)
+  }
+}
+
+n_post_periods <- length(post_period_names)
+message(sprintf("Post-period interaction coefficients: %d", n_post_periods))
+
+if (n_post_periods > 0) {
+  avg_att <- mean(coefs[post_period_names])
+  vcov_sub <- vcov_mat[post_period_names, post_period_names, drop = FALSE]
+  avg_se <- sqrt(sum(vcov_sub) / n_post_periods^2)
+  # NaN guard: match registry convention (REGISTRY.md lines 179-183)
+  if (is.finite(avg_se) && avg_se > 0) {
+    avg_t <- avg_att / avg_se
+    avg_pval <- 2 * pt(abs(avg_t), df = model$nobs - length(coefs), lower.tail = FALSE)
+    avg_ci_lower <- avg_att - qt(0.975, df = model$nobs - length(coefs)) * avg_se
+    avg_ci_upper <- avg_att + qt(0.975, df = model$nobs - length(coefs)) * avg_se
+  } else {
+    avg_t <- NA
+    avg_pval <- NA
+    avg_ci_lower <- NA
+    avg_ci_upper <- NA
+  }
+} else {
+  avg_att <- NA
+  avg_se <- NA
+  avg_pval <- NA
+  avg_ci_lower <- NA
+  avg_ci_upper <- NA
+}
+
+message(sprintf("Average ATT: %.6f", avg_att))
+message(sprintf("Average SE: %.6f", avg_se))
+
+# Format output
+results <- list(
+  estimator = "fixest::feols (multiperiod)",
+  cluster = config$cluster,
+
+  # Average treatment effect
+  att = avg_att,
+  se = avg_se,
+  pvalue = avg_pval,
+  ci_lower = avg_ci_lower,
+  ci_upper = avg_ci_upper,
+
+  # Reference period
+  reference_period = ref_period,
+
+  # Period-level effects
+  period_effects = period_effects,
+
+  # Timing
+  timing = list(
+    estimation_seconds = estimation_time,
+    total_seconds = estimation_time
+  ),
+
+  # Metadata
+  metadata = list(
+    r_version = R.version.string,
+    fixest_version = as.character(packageVersion("fixest")),
+    n_units = length(unique(data$unit)),
+    n_periods = length(unique(data$time)),
+    n_obs = nrow(data),
+    n_pre = config$n_pre,
+    n_post = config$n_post
+  )
+)
+
+# Write output
+message(sprintf("Writing results to: %s", config$output))
+write_json(results, config$output, auto_unbox = TRUE, pretty = TRUE, digits = 10)
+
+message(sprintf("Completed in %.3f seconds", estimation_time))
diff --git a/benchmarks/python/benchmark_multiperiod.py b/benchmarks/python/benchmark_multiperiod.py
new file mode 100644
index 0000000..5507aa4
--- /dev/null
+++ b/benchmarks/python/benchmark_multiperiod.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+Benchmark: MultiPeriodDiD event study (diff-diff MultiPeriodDiD).
+
+Usage:
+    python benchmark_multiperiod.py --data path/to/data.csv --output path/to/results.json \
+        --n-pre 4 --n-post 4
+"""
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+# IMPORTANT: Parse --backend and set environment variable BEFORE importing diff_diff.
+# This ensures the backend configuration is respected by all modules.
+def _get_backend_from_args():
+    """Parse --backend argument without importing diff_diff."""
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument("--backend", default="auto", choices=["auto", "python", "rust"])
+    args, _ = parser.parse_known_args()
+    return args.backend
+
+_requested_backend = _get_backend_from_args()
+if _requested_backend in ("python", "rust"):
+    os.environ["DIFF_DIFF_BACKEND"] = _requested_backend
+
+# NOW import diff_diff and other dependencies (will see the env var)
+import pandas as pd
+
+# Add parent to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from diff_diff import MultiPeriodDiD, HAS_RUST_BACKEND
+from benchmarks.python.utils import Timer
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Benchmark MultiPeriodDiD estimator")
+    parser.add_argument("--data", required=True, help="Path to input CSV data")
+    parser.add_argument("--output", required=True, help="Path to output JSON results")
+    parser.add_argument(
+        "--cluster", default="unit", help="Column to cluster standard errors on"
+    )
+    parser.add_argument(
+        "--n-pre", type=int, required=True, help="Number of pre-treatment periods"
+    )
+    parser.add_argument(
+        "--n-post", type=int, required=True, help="Number of post-treatment periods"
+    )
+    parser.add_argument(
+        "--reference-period", type=int, default=None,
+        help="Reference period (default: last pre-period = n_pre)"
+    )
+    parser.add_argument(
+        "--backend", default="auto", choices=["auto", "python", "rust"],
+        help="Backend to use: auto (default), python (pure Python), rust (Rust backend)"
+    )
+    return parser.parse_args()
+
+
+def get_actual_backend() -> str:
+    """Return the actual backend being used based on HAS_RUST_BACKEND."""
+    return "rust" if HAS_RUST_BACKEND else "python"
+
+
+def main():
+    args = parse_args()
+
+    # Get actual backend (already configured via env var before imports)
+    actual_backend = get_actual_backend()
+    print(f"Using backend: {actual_backend}")
+
+    # Load data
+    print(f"Loading data from: {args.data}")
+    data = pd.read_csv(args.data)
+
+    # Compute post_periods and reference_period from args
+    all_periods = sorted(data["time"].unique())
+    n_pre = args.n_pre
+    post_periods = [p for p in all_periods if p > n_pre]
+    ref_period = args.reference_period if args.reference_period is not None else n_pre
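+    # Note: the synthetic benchmark data uses periods 1..n_pre+n_post with treatment
+    # starting at period n_pre + 1, so post_periods and the default reference period
+    # (the last pre-treatment period) follow directly from --n-pre.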
+    print(f"All periods: {all_periods}")
+    print(f"Post periods: {post_periods}")
+    print(f"Reference period: {ref_period}")
+
+    # Run benchmark
+    print("Running MultiPeriodDiD estimation...")
+
+    did = MultiPeriodDiD(robust=True, cluster=args.cluster)
+
+    with Timer() as timer:
+        results = did.fit(
+            data,
+            outcome="outcome",
+            treatment="treated",
+            time="time",
+            post_periods=post_periods,
+            reference_period=ref_period,
+            absorb=["unit"],
+        )
+
+    total_time = timer.elapsed
+
+    # Extract period effects (excluding reference period)
+    period_effects = []
+    for period, pe in sorted(results.period_effects.items()):
+        event_time = period - ref_period
+        period_effects.append({
+            "period": int(period),
+            "event_time": int(event_time),
+            "att": float(pe.effect),
+            "se": float(pe.se),
+        })
+
+    # Build output
+    output = {
+        "estimator": "diff_diff.MultiPeriodDiD",
+        "backend": actual_backend,
+        "cluster": args.cluster,
+        # Average treatment effect (across post-periods)
+        "att": float(results.avg_att),
+        "se": float(results.avg_se),
+        "pvalue": float(results.avg_p_value),
+        "ci_lower": float(results.avg_conf_int[0]),
+        "ci_upper": float(results.avg_conf_int[1]),
+        # Reference period
+        "reference_period": int(ref_period),
+        # Period-level effects
+        "period_effects": period_effects,
+        # Timing
+        "timing": {
+            "estimation_seconds": total_time,
+            "total_seconds": total_time,
+        },
+        # Metadata
+        "metadata": {
+            "n_units": int(data["unit"].nunique()),
+            "n_periods": int(data["time"].nunique()),
+            "n_obs": len(data),
+            "n_pre": n_pre,
+            "n_post": len(post_periods),
+        },
+    }
+
+    # Write output
+    print(f"Writing results to: {args.output}")
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(output, f, indent=2)
+
+    print(f"ATT: {results.avg_att:.6f}")
+    print(f"SE: {results.avg_se:.6f}")
+    print(f"Completed in {total_time:.3f} seconds")
+    return output
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/python/utils.py b/benchmarks/python/utils.py
index a71c9d9..3ccca80 100644
--- a/benchmarks/python/utils.py
+++ b/benchmarks/python/utils.py
@@ -331,6 +331,68 @@ def generate_sdid_data(
     return pd.DataFrame(data)
 
 
+def generate_multiperiod_data(
+    n_units: int = 200,
+    n_pre: int = 4,
+    n_post: int = 4,
+    treatment_effect: float = 3.0,
+    treatment_fraction: float = 0.5,
+    seed: int = 42,
+) -> pd.DataFrame:
+    """
+    Generate synthetic multi-period event study data for benchmarking.
+
+    All treated units receive treatment at the same time (no staggered adoption).
+
+    Parameters
+    ----------
+    n_units : int
+        Number of units.
+    n_pre : int
+        Number of pre-treatment periods.
+    n_post : int
+        Number of post-treatment periods.
+    treatment_effect : float
+        True treatment effect in post-periods.
+    treatment_fraction : float
+        Fraction of units that are treated.
+    seed : int
+        Random seed.
+
+    Returns
+    -------
+    pd.DataFrame
+        Panel data with columns: unit, time, outcome, treated.
+    """
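+    # DGP: outcome = unit fixed effect + linear time trend (0.5 * t)
+    #      + treatment_effect * treated * post + N(0, 1) noise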
+    rng = np.random.default_rng(seed)
+
+    n_treated = int(n_units * treatment_fraction)
+    n_periods = n_pre + n_post
+    data = []
+
+    for unit in range(n_units):
+        unit_fe = rng.normal(0, 2)
+        treated = 1 if unit < n_treated else 0
+
+        for t in range(1, n_periods + 1):
+            time_fe = t * 0.5
+            post = 1 if t > n_pre else 0
+
+            effect = treatment_effect if (treated and post) else 0
+            outcome = unit_fe + time_fe + effect + rng.normal(0, 1)
+
+            data.append(
+                {
+                    "unit": unit,
+                    "time": t,
+                    "outcome": outcome,
+                    "treated": treated,
+                }
+            )
+
+    return pd.DataFrame(data)
+
+
 def load_benchmark_data(path: Path) -> pd.DataFrame:
     """Load benchmark data from CSV."""
     return pd.read_csv(path)
diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py
index 284d2f6..7501ee7 100644
--- a/benchmarks/run_benchmarks.py
+++ b/benchmarks/run_benchmarks.py
@@ -33,11 +33,13 @@
     generate_staggered_data,
     generate_basic_did_data,
     generate_sdid_data,
+    generate_multiperiod_data,
     save_benchmark_data,
     compute_timing_stats,
 )
 from benchmarks.compare_results import (
     compare_estimates,
+    compare_event_study,
     generate_comparison_report,
     load_results,
 )
@@ -48,26 +50,31 @@
         "staggered": {"n_units": 200, "n_periods": 8, "n_cohorts": 3},
         "basic": {"n_units": 100, "n_periods": 4},
         "sdid": {"n_control": 40, "n_treated": 10, "n_pre": 15, "n_post": 5},
+        "multiperiod": {"n_units": 200, "n_pre": 4, "n_post": 4},
     },
     "1k": {
         "staggered": {"n_units": 1000, "n_periods": 10, "n_cohorts": 4},
         "basic": {"n_units": 1000, "n_periods": 6},
         "sdid": {"n_control": 800, "n_treated": 200, "n_pre": 20, "n_post": 10},
+        "multiperiod": {"n_units": 1000, "n_pre": 5, "n_post": 5},
     },
     "5k": {
         "staggered": {"n_units": 5000, "n_periods": 12, "n_cohorts": 5},
         "basic": {"n_units": 5000, "n_periods": 8},
         "sdid": {"n_control": 4000, "n_treated": 1000, "n_pre": 25, "n_post": 15},
+        "multiperiod": {"n_units": 5000, "n_pre": 6, "n_post": 6},
     },
     "10k": {
         "staggered": {"n_units": 10000, "n_periods": 15, "n_cohorts": 6},
         "basic": {"n_units": 10000, "n_periods": 10},
         "sdid": {"n_control": 8000, "n_treated": 2000, "n_pre": 30, "n_post": 20},
+        "multiperiod": {"n_units": 10000, "n_pre": 6, "n_post": 6},
     },
     "20k": {
         "staggered": {"n_units": 20000, "n_periods": 18, "n_cohorts": 7},
         "basic": {"n_units": 20000, "n_periods": 12},
         "sdid": {"n_control": 16000, "n_treated": 4000, "n_pre": 35, "n_post": 25},
+        "multiperiod": {"n_units": 20000, "n_pre": 8, "n_post": 8},
     },
 }
 
@@ -291,6 +298,22 @@ def generate_synthetic_datasets(
         save_benchmark_data(sdid_data, sdid_path)
         datasets[f"sdid_{scale}"] = sdid_path
 
+        # MultiPeriod event study data
+        mp_cfg = config["multiperiod"]
+        n_periods = mp_cfg["n_pre"] + mp_cfg["n_post"]
+        n_obs = mp_cfg["n_units"] * n_periods
+        print(f"  - multiperiod_{scale} ({mp_cfg['n_units']} units, {n_periods} periods, {n_obs:,} obs)")
+        multiperiod_data = generate_multiperiod_data(
+            n_units=mp_cfg["n_units"],
+            n_pre=mp_cfg["n_pre"],
+            n_post=mp_cfg["n_post"],
+            treatment_effect=3.0,
+            seed=seed,
+        )
+        multiperiod_path = DATA_DIR / "synthetic" / f"multiperiod_{scale}.csv"
+        save_benchmark_data(multiperiod_data, multiperiod_path)
+        datasets[f"multiperiod_{scale}"] = multiperiod_path
+
     print(f"\nGenerated {len(datasets)} datasets")
     return datasets
 
@@ -672,6 +695,144 @@ def run_basic_did_benchmark(
     return results
+
+
+def run_multiperiod_benchmark(
+    data_path: Path,
+    n_pre: int,
+    n_post: int,
+    name: str = "multiperiod",
+    scale: str = "small",
+    n_replications: int = 1,
+    backends: Optional[List[str]] = None,
+) -> Dict[str, Any]:
+    """Run MultiPeriodDiD event study benchmarks (Python and R) with replications."""
+    print(f"\n{'='*60}")
+    print(f"MULTIPERIOD DID BENCHMARK ({scale})")
+    print(f"{'='*60}")
+
+    if backends is None:
+        backends = ["python", "rust"]
+
+    timeouts = TIMEOUT_CONFIGS.get(scale, TIMEOUT_CONFIGS["small"])
+    extra_args = ["--n-pre", str(n_pre), "--n-post", str(n_post)]
+    results = {
+        "name": name,
+        "scale": scale,
+        "n_replications": n_replications,
+        "python_pure": None,
+        "python_rust": None,
+        "python": None,
+        "r": None,
+        "comparison": None,
+    }
+
+    # Run Python benchmark for each backend
+    for backend in backends:
+        backend_label = f"python_{'pure' if backend == 'python' else backend}"
+        print(f"\nRunning Python (diff_diff.MultiPeriodDiD, backend={backend}) - {n_replications} replications...")
+        py_output = RESULTS_DIR / "accuracy" / f"{backend_label}_{name}_{scale}.json"
+        py_output.parent.mkdir(parents=True, exist_ok=True)
+
+        py_timings = []
+        py_result = None
+        for rep in range(n_replications):
+            try:
+                py_result = run_python_benchmark(
+                    "benchmark_multiperiod.py", data_path, py_output,
+                    extra_args=extra_args,
+                    timeout=timeouts["python"],
+                    backend=backend,
+                )
+                py_timings.append(py_result["timing"]["total_seconds"])
+                if rep == 0:
+                    print(f"  ATT: {py_result['att']:.4f}")
+                    print(f"  SE: {py_result['se']:.4f}")
+                print(f"  Rep {rep+1}/{n_replications}: {py_timings[-1]:.3f}s")
+            except Exception as e:
+                print(f"  Rep {rep+1} failed: {e}")
+
+        if py_result and py_timings:
+            timing_stats = compute_timing_stats(py_timings)
+            py_result["timing"] = timing_stats
+            results[backend_label] = py_result
+            print(f"  Mean time: {timing_stats['stats']['mean']:.3f}s ± {timing_stats['stats']['std']:.3f}s")
+
+    # For backward compatibility, also store as "python" (use rust if available)
+    if results.get("python_rust"):
+        results["python"] = results["python_rust"]
+    elif results.get("python_pure"):
+        results["python"] = results["python_pure"]
+
+    # R benchmark with replications
+    print(f"\nRunning R (fixest::feols multiperiod) - {n_replications} replications...")
+    r_output = RESULTS_DIR / "accuracy" / f"r_{name}_{scale}.json"
+
+    r_timings = []
+    r_result = None
+    for rep in range(n_replications):
+        try:
+            r_result = run_r_benchmark(
+                "benchmark_multiperiod.R", data_path, r_output,
+                extra_args=extra_args,
+                timeout=timeouts["r"]
+            )
+            r_timings.append(r_result["timing"]["total_seconds"])
+            if rep == 0:
+                print(f"  ATT: {r_result['att']:.4f}")
+                print(f"  SE: {r_result['se']:.4f}")
+            print(f"  Rep {rep+1}/{n_replications}: {r_timings[-1]:.3f}s")
+        except Exception as e:
+            print(f"  Rep {rep+1} failed: {e}")
+
+    if r_result and r_timings:
+        timing_stats = compute_timing_stats(r_timings)
+        r_result["timing"] = timing_stats
+        results["r"] = r_result
+        print(f"  Mean time: {timing_stats['stats']['mean']:.3f}s ± {timing_stats['stats']['std']:.3f}s")
+
+    # Compare results
+    if results["python"] and results["r"]:
+        print("\nComparison (Python vs R):")
+        comparison = compare_estimates(
+            results["python"], results["r"], "MultiPeriodDiD", scale=scale,
+            se_rtol=0.01,
+            python_pure_results=results.get("python_pure"),
+            python_rust_results=results.get("python_rust"),
+        )
+        results["comparison"] = comparison
+        print(f"  ATT diff: {comparison.att_diff:.2e}")
+        print(f"  SE rel diff: {comparison.se_rel_diff:.1%}")
+        print(f"  Status: {'PASS' if comparison.passed else 'FAIL'}")
+
+        # Period-level comparison
+        py_effects = results["python"].get("period_effects", [])
+        r_effects = results["r"].get("period_effects", [])
+        if py_effects and r_effects:
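+            # compare_event_study returns (correlation, max abs difference, all-close flag)
+            # for the matched period-level effects.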
+            corr, max_diff, all_close = compare_event_study(py_effects, r_effects)
+            print(f"  Period effects correlation: {corr:.6f}")
+            print(f"  Period effects max diff: {max_diff:.2e}")
+            print(f"  Period effects all close: {all_close}")
+
+    # Print timing comparison table
+    print("\nTiming Comparison:")
+    print(f"  {'Backend':<15} {'Time (s)':<12} {'vs R':<12} {'vs Pure Python':<15}")
+    print(f"  {'-'*54}")
+
+    r_mean = results["r"]["timing"]["stats"]["mean"] if results["r"] else None
+    pure_mean = results["python_pure"]["timing"]["stats"]["mean"] if results.get("python_pure") else None
+    rust_mean = results["python_rust"]["timing"]["stats"]["mean"] if results.get("python_rust") else None
+
+    if r_mean:
+        print(f"  {'R':<15} {r_mean:<12.3f} {'1.00x':<12} {'-':<15}")
+    if pure_mean:
+        r_speedup = f"{r_mean/pure_mean:.2f}x" if r_mean else "-"
+        print(f"  {'Python (pure)':<15} {pure_mean:<12.3f} {r_speedup:<12} {'1.00x':<15}")
+    if rust_mean:
+        r_speedup = f"{r_mean/rust_mean:.2f}x" if r_mean else "-"
+        pure_speedup = f"{pure_mean/rust_mean:.2f}x" if pure_mean else "-"
+        print(f"  {'Python (rust)':<15} {rust_mean:<12.3f} {r_speedup:<12} {pure_speedup:<15}")
+
+    return results
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Run diff-diff benchmarks against R packages"
     )
     parser.add_argument(
@@ -683,7 +844,7 @@ def main():
     )
     parser.add_argument(
         "--estimator",
-        choices=["callaway", "synthdid", "basic"],
+        choices=["callaway", "synthdid", "basic", "multiperiod"],
         help="Run specific estimator benchmark",
     )
     parser.add_argument(
@@ -771,6 +932,19 @@ def main():
             )
             all_results.append(results)
 
+        if args.all or args.estimator == "multiperiod":
+            mp_key = f"multiperiod_{scale}"
+            if mp_key in datasets:
+                mp_cfg = SCALE_CONFIGS[scale]["multiperiod"]
+                results = run_multiperiod_benchmark(
+                    datasets[mp_key],
+                    n_pre=mp_cfg["n_pre"],
+                    n_post=mp_cfg["n_post"],
+                    scale=scale,
+                    n_replications=args.replications,
+                )
+                all_results.append(results)
+
     # Generate summary report
     if all_results:
         print(f"\n{'='*60}")
diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
index 647f903..eee42d9 100644
--- a/docs/benchmarks.rst
+++ b/docs/benchmarks.rst
@@ -27,6 +27,9 @@ diff-diff is validated against the following R packages:
    * - ``CallawaySantAnna``
      - ``did::att_gt``
      - Callaway & Sant'Anna (2021)
+   * - ``MultiPeriodDiD``
+     - ``fixest::feols``
+     - Event study with treatment × period interactions
    * - ``SyntheticDiD``
      - ``synthdid::synthdid_estimate``
      - Arkhangelsky et al. (2021)
@@ -74,6 +77,11 @@ Summary Table
      - 0.0%
      - Yes
      - **PASS**
+   * - MultiPeriodDiD
+     - < 1e-11
+     - 0.0%
+     - Yes
+     - **PASS**
    * - CallawaySantAnna
      - < 1e-10
      - 0.0%
@@ -116,6 +124,46 @@ Basic DiD Results
 
 **Validation**: PASS - Results are numerically identical across all implementations.
 
+MultiPeriodDiD Results
+~~~~~~~~~~~~~~~~~~~~~~
+
+**Data**: 200 units, 8 periods (4 pre, 4 post), true ATT = 3.0 (small scale)
+
+.. list-table::
+   :header-rows: 1
+
+   * - Metric
+     - diff-diff (Pure)
+     - diff-diff (Rust)
+     - R fixest
+     - Difference
+   * - ATT
+     - 2.912
+     - 2.912
+     - 2.912
+     - < 1e-11
+   * - SE
+     - 0.158
+     - 0.158
+     - 0.158
+     - 0.0%
+   * - Period corr.
+     - 1.000
+     - 1.000
+     - (ref)
+     - Period max diff < 3e-11
+   * - Time (s)
+     - 0.005
+     - 0.035
+     - 0.035
+     - **7x faster** (pure)
+
+**Validation**: PASS - Both average ATT and all period-level effects match R's
+``fixest::feols(outcome ~ treated * time_f | unit)`` to machine precision. The
+regression includes unit fixed effects (absorbed via ``| unit`` in R,
+within-transformation via ``absorb=["unit"]`` in Python) and treatment × period
+interactions with cluster-robust SEs.
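+
+For reference, a sketch of the Python call being validated (it mirrors
+``benchmarks/python/benchmark_multiperiod.py``; at the small scale the post
+periods are 5-8 and the reference period is 4):
+
+.. code-block:: python
+
+   from benchmarks.python.utils import generate_multiperiod_data
+   from diff_diff import MultiPeriodDiD
+
+   # Synthetic panel: 200 units, periods 1-8, treatment starts at period 5
+   data = generate_multiperiod_data(n_units=200, n_pre=4, n_post=4, seed=42)
+
+   # Event study with unit FE absorbed and unit-clustered SEs, mirroring fixest
+   did = MultiPeriodDiD(robust=True, cluster="unit")
+   results = did.fit(
+       data,
+       outcome="outcome",
+       treatment="treated",
+       time="time",
+       post_periods=[5, 6, 7, 8],
+       reference_period=4,
+       absorb=["unit"],
+   )
+   print(results.avg_att, results.avg_se)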
+
 Synthetic DiD Results
 ~~~~~~~~~~~~~~~~~~~~~
 
@@ -363,35 +411,41 @@ Dataset Sizes
 
 .. list-table::
    :header-rows: 1
-   :widths: 12 22 22 22 22
+   :widths: 10 18 18 18 18 18
 
    * - Scale
      - BasicDiD
+     - MultiPeriodDiD
      - CallawaySantAnna
      - SyntheticDiD
      - Observations
    * - small
      - 100 × 4
      - 200 × 8
+     - 200 × 8
      - 50 × 20
      - 400 - 1,600
    * - 1k
      - 1,000 × 6
      - 1,000 × 10
+     - 1,000 × 10
      - 1,000 × 30
      - 6,000 - 30,000
    * - 5k
      - 5,000 × 8
      - 5,000 × 12
+     - 5,000 × 12
      - 5,000 × 40
      - 40,000 - 200,000
    * - 10k
      - 10,000 × 10
+     - 10,000 × 12
      - 10,000 × 15
      - 10,000 × 50
      - 100,000 - 500,000
    * - 20k
      - 20,000 × 12
+     - 20,000 × 16
      - 20,000 × 18
      - 20,000 × 60
      - 240,000 - 1,200,000
@@ -565,6 +619,7 @@ Running Benchmarks
    python benchmarks/run_benchmarks.py --estimator callaway --scale 1k --replications 3
    python benchmarks/run_benchmarks.py --estimator synthdid --scale small --replications 3
    python benchmarks/run_benchmarks.py --estimator basic --scale 20k --replications 3
+   python benchmarks/run_benchmarks.py --estimator multiperiod --scale small --replications 3
 
    # Available scales: small, 1k, 5k, 10k, 20k, all
    # Default: small (backward compatible)
@@ -591,6 +646,10 @@ When to Trust Results
 
 - **BasicDiD/TWFE**: Results are identical to R. Use with confidence.
 
+- **MultiPeriodDiD**: Results are identical to R's ``fixest::feols`` with
+  ``treated * time_f | unit`` interaction syntax (unit FE absorbed). Both the average
+  ATT and all period-level effects match to machine precision. Use with confidence.
+
 - **SyntheticDiD**: Both point estimates (0.3% diff) and standard errors (3.1%
   diff) match R closely. Use ``variance_method="placebo"`` (default) to match R's
   inference. Results are fully validated.