import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import arviz as az
import pymc as pm 
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.robust.scale import mad

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/arviz/__init__.py:50: FutureWarning: 
ArviZ is undergoing a major refactor to improve flexibility and extensibility while maintaining a user-friendly interface.
Some upcoming changes may be backward incompatible.
For details and migration guidance, visit: https://python.arviz.org/en/latest/user_guide/migration_guide.html
  warn(

def fit_and_plot_lm(data, predictors, outcome, add_constant=True, show_plot=True, scatter_kws=None, line_kws=None):
    """
    Fit a linear model using statsmodels, print summary, plot, and show formula.
    Args:
        data: pandas DataFrame
        predictors: list of predictor column names (str)
        outcome: outcome column name (str)
        add_constant: whether to add intercept (default True)
        show_plot: whether to plot (default True)
        scatter_kws: dict, kwargs for scatterplot
        line_kws: dict, kwargs for regression line
    """
    X = data[predictors].copy()
    if add_constant:
        X = sm.add_constant(X, prepend=False)
    y = data[outcome]
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
    # Print formula
    params = results.params
    formula = f"{outcome} = " + " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    if add_constant:
        formula = f"{outcome} = {params['const']:.2f} + " + " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    print("Formula:", formula)
    # Print residual standard deviation and its uncertainty
    sigma = np.sqrt(results.mse_resid)
    sigma_se = sigma / np.sqrt(2 * results.df_resid)
    print(f"Residual std dev (σ): {sigma:.2f} ± {sigma_se:.2f}")
    # Print median absolute deviation of residuals
    print("MAD of residuals:", round(mad(results.resid), 2))
    # Plot if only one predictor
    if show_plot and len(predictors) == 1:
        x_name = predictors[0]
        ax = sns.scatterplot(data=data, x=x_name, y=outcome, **(scatter_kws or {}))
        x_vals = np.linspace(data[x_name].min(), data[x_name].max(), 100)
        y_vals = params['const'] + params[x_name] * x_vals if add_constant else params[x_name] * x_vals
        ax.plot(x_vals, y_vals, color='red', **(line_kws or {}))
        ax.set_title('Linear Regression Fit')
        plt.show()

def fit_and_plot_bayes(data, predictors, outcome,
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=True, show_forest=True,
                       show_posterior=True, show_regression=True,
                       show_residuals=False,
                       n_regression_lines=100):
    """
    Fit a Bayesian linear regression with PyMC and produce diagnostic plots.

    The model is:
        y ~ Normal(mu, sigma)
        mu = intercept + sum_k (slope_k * x_k)
        intercept ~ Normal(intercept_mu, intercept_sigma)
        slope_k   ~ Normal(slope_mu, slope_sigma)     for each predictor k
        sigma     ~ HalfNormal(sigma_sigma)

    Supports one or multiple predictors. After sampling, prints a posterior
    summary and the mean regression formula, then optionally shows trace,
    forest, posterior density, regression-line, and residual plots.

    Args:
        data (pd.DataFrame): DataFrame containing predictor and outcome columns.
        predictors (str | list[str]): Predictor column name(s). A single string
            is treated as a one-predictor model; a list allows multiple predictors.
        outcome (str): Outcome column name.
        intercept_mu (float): Prior mean for the intercept Normal prior.
        intercept_sigma (float): Prior std dev for the intercept Normal prior.
        slope_mu (float): Prior mean for each slope's Normal prior.
        slope_sigma (float): Prior std dev for each slope's Normal prior.
        sigma_sigma (float): Scale for the HalfNormal prior on residual noise.
        samples (int): Number of posterior draws per chain.
        tune (int): Number of NUTS tuning (warm-up) steps.
        hdi_prob (float): Probability mass used for HDI summaries and plots.
        show_trace (bool): If True, plot MCMC traces and marginal densities.
        show_forest (bool): If True, plot a forest plot of posterior means + HDIs.
        show_posterior (bool): If True, plot posterior density for each parameter.
        show_regression (bool): If True, plot data with overlaid posterior
            regression lines (one subplot per predictor; other predictors held at mean).
        show_residuals (bool): If True, plot residuals (y - y_hat using posterior
            mean coefficients) vs fitted values and vs each predictor.
        n_regression_lines (int): Number of posterior draws to overlay on the
            regression plot when show_regression is True.

    Returns:
        arviz.InferenceData: The PyMC trace (posterior samples and sampling stats).
    """
    if isinstance(predictors, str):  # Allow a single predictor to be passed as a bare string.
        predictors = [predictors]  # Normalise to a list so downstream code can iterate uniformly.
    y = data[outcome].values  # Extract outcome values as a NumPy array for PyMC.

    with pm.Model() as model:  # Open a PyMC model context; variables below are registered to it.
        intercept = pm.Normal("intercept", mu=intercept_mu, sigma=intercept_sigma)  # Normal prior on the intercept.
        slopes = []  # Collect slope RVs (not strictly needed, but handy if you want to inspect them).
        mu = intercept  # Start building the linear predictor; will accumulate slope*x terms below.
        for pred in predictors:  # Loop over each predictor column name.
            s = pm.Normal(f"slope_{pred}", mu=slope_mu, sigma=slope_sigma)  # Normal prior on this predictor's slope.
            slopes.append(s)  # Keep a reference to the slope RV.
            mu = mu + s * data[pred].values  # Add this predictor's contribution to the linear predictor.
        sigma = pm.HalfNormal("sigma", sigma=sigma_sigma)  # HalfNormal prior on residual std dev (must be positive).
        likelihood = pm.Normal("y", mu=mu, sigma=sigma, observed=y)  # Likelihood: observed y conditional on mu and sigma.
        trace = pm.sample(samples, tune=tune, idata_kwargs={"log_likelihood": True})  # Run NUTS; store log-likelihood for LOO.

    summary = pm.summary(trace, hdi_prob=hdi_prob)  # Compute posterior summary stats (mean, sd, HDI, r_hat, ess).
    print(summary)  # Print the summary table for the user.

    # Build and print the mean-posterior regression formula for quick inspection.
    posterior = trace.posterior  # Shortcut to the posterior group of the InferenceData.
    intercept_mean = posterior["intercept"].values.flatten().mean()  # Posterior mean of the intercept (flatten chains+draws).
    formula = f"{outcome} = {intercept_mean:.2f}"  # Start the formula string with the intercept.
    for pred in predictors:  # Append a "+ slope*predictor" term for each predictor.
        slope_mean = posterior[f"slope_{pred}"].values.flatten().mean()  # Posterior mean slope for this predictor.
        formula += f" + {slope_mean:.2f}*{pred}"  # Append this term to the formula string.
    print(f"\nRegression formula: {formula}")  # Print the assembled formula.

    sigma_draws = posterior["sigma"].values.flatten()
    print(f"Residual std dev (σ): {sigma_draws.mean():.2f} ± {sigma_draws.std():.2f}")

    intercept_draws = posterior["intercept"].values.flatten()
    X_mat = data[predictors].values
    slope_mat = np.column_stack([posterior[f"slope_{pred}"].values.flatten() for pred in predictors])
    y_hat_all = intercept_draws[:, None] + slope_mat @ X_mat.T
    bayes_r2 = az.r2_score(y, y_hat_all)
    print(f"Bayesian R²: {bayes_r2['r2']:.3f} ± {bayes_r2['r2_std']:.3f}")

    loo = az.loo(trace, pointwise=True)  # PSIS-LOO; pointwise=True gives per-obs Pareto-k diagnostics.
    n_obs = len(y)
    print(f"LOO-ELPD: {loo.elpd_loo:.2f} ± {loo.se:.2f}  (p_loo={loo.p_loo:.1f})")
    print(f"LOO log score (per obs): {loo.elpd_loo / n_obs:.3f} ± {loo.se / n_obs:.3f}")
    n_bad_k = int((loo.pareto_k.values > 0.7).sum())
    if n_bad_k > 0:
        print(f"  Warning: {n_bad_k} observations with Pareto-k > 0.7 (unreliable LOO estimates)")

    if show_trace:  # Optional: MCMC trace diagnostics.
        az.plot_trace(trace)  # Plot chain traces and marginal densities per parameter.
        plt.tight_layout()  # Tidy up subplot spacing.
        plt.show()  # Render the figure.

    if show_forest:  # Optional: forest plot of parameter posteriors.
        az.plot_forest(trace, hdi_prob=hdi_prob)  # Show posterior means and HDIs for all parameters.
        plt.show()  # Render the figure.

    if show_posterior:  # Optional: posterior density plots.
        az.plot_posterior(trace, hdi_prob=hdi_prob)  # Density plot per parameter, annotated with HDI.
        plt.show()  # Render the figure.

    if show_regression:  # Optional: overlay posterior regression lines on data.
        a_samples = posterior["intercept"].values.flatten()  # Flattened posterior draws of the intercept.
        slope_samples = {pred: posterior[f"slope_{pred}"].values.flatten() for pred in predictors}  # Flattened slope draws per predictor.
        idx = np.random.choice(len(a_samples), n_regression_lines, replace=False)  # Random subset of draw indices to plot.

        fig, axes = plt.subplots(1, len(predictors), figsize=(6 * len(predictors), 5))  # One subplot per predictor.
        if len(predictors) == 1:  # plt.subplots returns a single Axes when ncols=1; wrap it for uniform handling.
            axes = [axes]  # Make axes iterable in the single-predictor case.

        for ax, pred in zip(axes, predictors):  # Plot each predictor's partial regression view.
            x = data[pred].values  # Observed values of this predictor.
            ax.scatter(x, y, alpha=0.5)  # Scatter of y vs this predictor.
            x_grid = np.linspace(x.min(), x.max(), 100)  # Dense grid across this predictor for smooth lines.

            # For each posterior draw, compute the contribution of *other* predictors
            # held at their sample means so the plotted line isolates this predictor's effect.
            other_contribution = np.zeros(len(a_samples))  # Per-draw constant offset from held-fixed predictors.
            for other_pred in predictors:  # Sum contributions across all other predictors.
                if other_pred != pred:  # Skip the predictor currently on the x-axis.
                    other_contribution += slope_samples[other_pred] * data[other_pred].mean()  # slope_draw * mean(x_other).

            for i in idx:  # Overlay a thin gray line per sampled posterior draw.
                y_line = a_samples[i] + other_contribution[i] + slope_samples[pred][i] * x_grid  # Predicted y on the grid.
                ax.plot(x_grid, y_line, alpha=0.05, color="gray")  # Low alpha so overlap shades posterior uncertainty.

            # Posterior mean line for emphasis.
            mean_other = sum(slope_samples[op].mean() * data[op].mean() for op in predictors if op != pred)  # Mean offset from other predictors.
            y_mean = a_samples.mean() + mean_other + slope_samples[pred].mean() * x_grid  # Mean posterior prediction on the grid.
            ax.plot(x_grid, y_mean, color="red")  # Plot the mean line in red.
            ax.set_xlabel(pred)  # Label x-axis with the predictor name.
            ax.set_ylabel(outcome)  # Label y-axis with the outcome name.
            ax.set_title(f"{outcome} vs {pred} (others at mean)")  # Title notes that other predictors are held at mean.

        plt.tight_layout()  # Tidy spacing across subplots.
        plt.show()  # Render the figure.

    if show_residuals:  # Optional: residual diagnostics using posterior mean coefficients.
        a_mean = posterior["intercept"].values.flatten().mean()  # Posterior mean intercept (point estimate).
        slope_means = {pred: posterior[f"slope_{pred}"].values.flatten().mean() for pred in predictors}  # Posterior mean slope per predictor.
        y_hat = a_mean + sum(slope_means[pred] * data[pred].values for pred in predictors)  # Fitted values using mean coefficients.
        residuals = y - y_hat  # Residuals: observed minus fitted.

        fig, axes = plt.subplots(1, len(predictors) + 1, figsize=(6 * (len(predictors) + 1), 5))  # One subplot for fitted + one per predictor.

        axes[0].scatter(y_hat, residuals, alpha=0.5)  # Residuals vs fitted values — should look like noise around 0.
        axes[0].axhline(0, color="red", linestyle="--")  # Zero reference line to judge bias/structure.
        axes[0].set_xlabel("Fitted values")  # x-axis label.
        axes[0].set_ylabel("Residuals")  # y-axis label.
        axes[0].set_title("Residuals vs Fitted")  # Subplot title.

        for ax, pred in zip(axes[1:], predictors):  # Residuals vs each predictor — check for unmodeled structure.
            ax.scatter(data[pred].values, residuals, alpha=0.5)  # Scatter residuals against this predictor.
            ax.axhline(0, color="red", linestyle="--")  # Zero reference line.
            ax.set_xlabel(pred)  # x-axis label is the predictor name.
            ax.set_ylabel("Residuals")  # y-axis label.
            ax.set_title(f"Residuals vs {pred}")  # Subplot title.

        plt.tight_layout()  # Tidy subplot spacing.
        plt.show()  # Render the figure.

    return trace  # Return the InferenceData so the caller can do further analysis.

earnings = pd.read_csv('../ros_data/earnings.csv', skiprows=0)
# earnings['earn_k'] = earnings['earn'] / 1000
# earnings['c_height'] = earnings['height'] - 66 # Center height around 66 inches for better interpretability of the intercept

# calculate standard deviation of height
height_std = earnings['height'].std()
print(f"Standard deviation of height: {height_std:.2f} inches")

display(earnings.head())

earnings_clean = earnings.dropna(subset=['height', 'earn'])

earnings_model = fit_and_plot_bayes(earnings_clean, 'height', 'earn',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=True, show_forest=False,
                       show_posterior=False, show_regression=True,
                       n_regression_lines=100)

print(earnings_model)

Standard deviation of height: 3.83 inches

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_height, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.

                  mean      sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  \
intercept      -22.607  49.905  -121.872     73.514      0.499    0.502   
slope_height   321.520   2.443   316.797    326.370      0.025    0.026   
sigma         6718.922  24.638  6671.214   6768.193      0.244    0.290   

              ess_bulk  ess_tail  r_hat  
intercept       9963.0    6334.0    1.0  
slope_height    9314.0    6715.0    1.0  
sigma          10213.0    6076.0    1.0  

Regression formula: earn = -22.61 + 321.52*height
Residual std dev (σ): 6718.92 ± 24.64
Bayesian R²: 0.003 ± 0.000

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/arviz/stats/stats.py:782: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
  warnings.warn(

LOO-ELPD: -27668.41 ± 1775.19  (p_loo=111.8)
LOO log score (per obs): -15.236 ± 0.978
  Warning: 2 observations with Pareto-k > 0.7 (unreliable LOO estimates)

Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data

kidiq = pd.read_csv('../ros_data/kidiq.csv', skiprows=0)
display(kidiq.head())

# interaction term between mom_hs and mom_iq
kidiq['mom_hs_iq'] = kidiq['mom_hs'] * kidiq['mom_iq']

fit1 = fit_and_plot_bayes(kidiq, ['mom_hs', 'mom_iq', 'mom_hs_iq'], 'kid_score',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=False,
                       n_regression_lines=100)

# Centering by subtracting the mean
kidiq['c_mom_iq'] = kidiq['mom_iq'] - kidiq['mom_iq'].mean()  # Center mom_iq by subtracting its mean
kidiq['c_mom_hs'] = kidiq['mom_hs'] - kidiq['mom_hs'].mean()  # Center mom_hs by subtracting its mean
kidiq['c_mom_hs_iq'] = kidiq['c_mom_hs'] * kidiq['c_mom_iq']  # Interaction of centered predictors

fit2 = fit_and_plot_bayes(kidiq, ['c_mom_hs', 'c_mom_iq', 'c_mom_hs_iq'], 'kid_score',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=False,
                       n_regression_lines=100)

# Standardising by subtracting the mean and dividing by 2 standard deviations
kidiq['s_mom_iq'] = (kidiq['mom_iq'] - kidiq['mom_iq'].mean()) / (2 * kidiq['mom_iq'].std())  # Standardize mom_iq
kidiq['s_mom_hs'] = (kidiq['mom_hs'] - kidiq['mom_hs'].mean()) / (2 * kidiq['mom_hs'].std())  # Standardize mom_hs
kidiq['s_mom_hs_iq'] = kidiq['s_mom_hs'] * kidiq['s_mom_iq']  # Interaction of standardized predictors

fit3 = fit_and_plot_bayes(kidiq, ['s_mom_hs', 's_mom_iq', 's_mom_hs_iq'], 'kid_score',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=False,
                       n_regression_lines=100)

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_mom_hs, slope_mom_iq, slope_mom_hs_iq, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 6 seconds.

                   mean      sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  \
intercept        -6.862  13.138   -33.221     17.723      0.344    0.199   
slope_mom_hs     45.730  14.552    17.803     73.886      0.378    0.218   
slope_mom_iq      0.919   0.142     0.649      1.200      0.004    0.002   
slope_mom_hs_iq  -0.426   0.154    -0.721     -0.129      0.004    0.002   
sigma            18.035   0.612    16.874     19.222      0.010    0.008   

                 ess_bulk  ess_tail  r_hat  
intercept          1462.0    2339.0    1.0  
slope_mom_hs       1483.0    2156.0    1.0  
slope_mom_iq       1472.0    2196.0    1.0  
slope_mom_hs_iq    1473.0    2042.0    1.0  
sigma              3689.0    4005.0    1.0  

Regression formula: kid_score = -6.86 + 45.73*mom_hs + 0.92*mom_iq + -0.43*mom_hs_iq
Residual std dev (σ): 18.03 ± 0.61
Bayesian R²: 0.228 ± 0.030

Initializing NUTS using jitter+adapt_diag...

LOO-ELPD: -1872.42 ± 14.31  (p_loo=4.7)
LOO log score (per obs): -4.314 ± 0.033

Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_c_mom_hs, slope_c_mom_iq, slope_c_mom_hs_iq, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

	height	weight	male	earn	earnk	ethnicity	education	mother_education	father_education	walk	exercise	smokenow	tense	angry	age
0	74	210.0	1	50000.0	50.0	White	16.0	16.0	16.0	3	3	2.0	0.0	0.0	45
1	66	125.0	0	60000.0	60.0	White	16.0	16.0	16.0	6	5	1.0	0.0	0.0	58
2	64	126.0	0	30000.0	30.0	White	16.0	16.0	16.0	8	1	2.0	1.0	1.0	29
3	65	200.0	0	25000.0	25.0	White	17.0	17.0	NaN	8	1	2.0	0.0	0.0	57
4	63	110.0	0	50000.0	50.0	Other	16.0	16.0	16.0	5	6	2.0	0.0	0.0	91

	kid_score	mom_hs	mom_iq	mom_work	mom_age
0	65	1	121.117529	4	27
1	98	1	89.361882	4	25
2	85	1	115.443165	4	27
3	83	1	99.449639	3	25
4	115	1	92.745710	4	27

12. Transformations and regression¶

12.1 Linear transformations¶

Scaling of predictors and regression coefficients¶

Standardisation using z-scores¶

12.2 Centering and standardizing for models with interactions¶

Centering by subtracting the mean¶

Using a conventional centering point¶

Standardizing by subtracting the mean and dividing by 2 standard deviations¶

Why scaling by 2 standard deviations?¶

Multiplying each regression coefficient by 2 standard deviations of its predictor¶

12.3 Correlation and 'regression to the mean'¶

12.4 Logarithmic transformations¶

12.5 Other transformations¶

12.6 Building and comparing regression models for prediction¶

12.7 Models for regression coefficients¶

12.8 Bibliographic note¶

12.9 Exercises¶