import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import arviz as az
import pymc as pm 
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.robust.scale import mad

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/arviz/__init__.py:50: FutureWarning: 
ArviZ is undergoing a major refactor to improve flexibility and extensibility while maintaining a user-friendly interface.
Some upcoming changes may be backward incompatible.
For details and migration guidance, visit: https://python.arviz.org/en/latest/user_guide/migration_guide.html
  warn(

def fit_and_plot_lm(data, predictors, outcome, add_constant=True, show_plot=True, scatter_kws=None, line_kws=None):
    """
    Fit a linear model using statsmodels, print summary, plot, and show formula.
    Args:
        data: pandas DataFrame
        predictors: list of predictor column names (str)
        outcome: outcome column name (str)
        add_constant: whether to add intercept (default True)
        show_plot: whether to plot (default True)
        scatter_kws: dict, kwargs for scatterplot
        line_kws: dict, kwargs for regression line
    """
    X = data[predictors].copy()
    if add_constant:
        X = sm.add_constant(X, prepend=False)
    y = data[outcome]
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
    # Print formula
    params = results.params
    formula = f"{outcome} = " + " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    if add_constant:
        formula = f"{outcome} = {params['const']:.2f} + " + " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    print("Formula:", formula)
    # Print residual standard deviation and its uncertainty
    sigma = np.sqrt(results.mse_resid)
    sigma_se = sigma / np.sqrt(2 * results.df_resid)
    print(f"Residual std dev (σ): {sigma:.2f} ± {sigma_se:.2f}")
    # Print median absolute deviation of residuals
    print("MAD of residuals:", round(mad(results.resid), 2))
    # Plot if only one predictor
    if show_plot and len(predictors) == 1:
        x_name = predictors[0]
        ax = sns.scatterplot(data=data, x=x_name, y=outcome, **(scatter_kws or {}))
        x_vals = np.linspace(data[x_name].min(), data[x_name].max(), 100)
        y_vals = params['const'] + params[x_name] * x_vals if add_constant else params[x_name] * x_vals
        ax.plot(x_vals, y_vals, color='red', **(line_kws or {}))
        ax.set_title('Linear Regression Fit')
        plt.show()

def fit_and_plot_bayes(data, predictors, outcome,
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=True, show_forest=True,
                       show_posterior=True, show_regression=True,
                       show_residuals=False,
                       n_regression_lines=100):
    """
    Fit a Bayesian linear regression with PyMC and produce diagnostic plots.

    The model is:
        y ~ Normal(mu, sigma)
        mu = intercept + sum_k (slope_k * x_k)
        intercept ~ Normal(intercept_mu, intercept_sigma)
        slope_k   ~ Normal(slope_mu, slope_sigma)     for each predictor k
        sigma     ~ HalfNormal(sigma_sigma)

    Supports one or multiple predictors. After sampling, prints a posterior
    summary and the mean regression formula, then optionally shows trace,
    forest, posterior density, regression-line, and residual plots.

    Args:
        data (pd.DataFrame): DataFrame containing predictor and outcome columns.
        predictors (str | list[str]): Predictor column name(s). A single string
            is treated as a one-predictor model; a list allows multiple predictors.
        outcome (str): Outcome column name.
        intercept_mu (float): Prior mean for the intercept Normal prior.
        intercept_sigma (float): Prior std dev for the intercept Normal prior.
        slope_mu (float): Prior mean for each slope's Normal prior.
        slope_sigma (float): Prior std dev for each slope's Normal prior.
        sigma_sigma (float): Scale for the HalfNormal prior on residual noise.
        samples (int): Number of posterior draws per chain.
        tune (int): Number of NUTS tuning (warm-up) steps.
        hdi_prob (float): Probability mass used for HDI summaries and plots.
        show_trace (bool): If True, plot MCMC traces and marginal densities.
        show_forest (bool): If True, plot a forest plot of posterior means + HDIs.
        show_posterior (bool): If True, plot posterior density for each parameter.
        show_regression (bool): If True, plot data with overlaid posterior
            regression lines (one subplot per predictor; other predictors held at mean).
        show_residuals (bool): If True, plot residuals (y - y_hat using posterior
            mean coefficients) vs fitted values and vs each predictor.
        n_regression_lines (int): Number of posterior draws to overlay on the
            regression plot when show_regression is True.

    Returns:
        arviz.InferenceData: The PyMC trace (posterior samples and sampling stats).
    """
    if isinstance(predictors, str):  # Allow a single predictor to be passed as a bare string.
        predictors = [predictors]  # Normalise to a list so downstream code can iterate uniformly.
    y = data[outcome].values  # Extract outcome values as a NumPy array for PyMC.

    with pm.Model() as model:  # Open a PyMC model context; variables below are registered to it.
        intercept = pm.Normal("intercept", mu=intercept_mu, sigma=intercept_sigma)  # Normal prior on the intercept.
        slopes = []  # Collect slope RVs (not strictly needed, but handy if you want to inspect them).
        mu = intercept  # Start building the linear predictor; will accumulate slope*x terms below.
        for pred in predictors:  # Loop over each predictor column name.
            s = pm.Normal(f"slope_{pred}", mu=slope_mu, sigma=slope_sigma)  # Normal prior on this predictor's slope.
            slopes.append(s)  # Keep a reference to the slope RV.
            mu = mu + s * data[pred].values  # Add this predictor's contribution to the linear predictor.
        sigma = pm.HalfNormal("sigma", sigma=sigma_sigma)  # HalfNormal prior on residual std dev (must be positive).
        likelihood = pm.Normal("y", mu=mu, sigma=sigma, observed=y)  # Likelihood: observed y conditional on mu and sigma.
        trace = pm.sample(samples, tune=tune, idata_kwargs={"log_likelihood": True})  # Run NUTS; store log-likelihood for LOO.

    summary = pm.summary(trace, hdi_prob=hdi_prob)  # Compute posterior summary stats (mean, sd, HDI, r_hat, ess).
    print(summary)  # Print the summary table for the user.

    # Build and print the mean-posterior regression formula for quick inspection.
    posterior = trace.posterior  # Shortcut to the posterior group of the InferenceData.
    intercept_mean = posterior["intercept"].values.flatten().mean()  # Posterior mean of the intercept (flatten chains+draws).
    formula = f"{outcome} = {intercept_mean:.2f}"  # Start the formula string with the intercept.
    for pred in predictors:  # Append a "+ slope*predictor" term for each predictor.
        slope_mean = posterior[f"slope_{pred}"].values.flatten().mean()  # Posterior mean slope for this predictor.
        formula += f" + {slope_mean:.2f}*{pred}"  # Append this term to the formula string.
    print(f"\nRegression formula: {formula}")  # Print the assembled formula.

    sigma_draws = posterior["sigma"].values.flatten()
    print(f"Residual std dev (σ): {sigma_draws.mean():.2f} ± {sigma_draws.std():.2f}")

    intercept_draws = posterior["intercept"].values.flatten()
    X_mat = data[predictors].values
    slope_mat = np.column_stack([posterior[f"slope_{pred}"].values.flatten() for pred in predictors])
    y_hat_all = intercept_draws[:, None] + slope_mat @ X_mat.T
    bayes_r2 = az.r2_score(y, y_hat_all)
    print(f"Bayesian R²: {bayes_r2['r2']:.3f} ± {bayes_r2['r2_std']:.3f}")

    loo = az.loo(trace, pointwise=True)  # PSIS-LOO; pointwise=True gives per-obs Pareto-k diagnostics.
    n_obs = len(y)
    print(f"LOO-ELPD: {loo.elpd_loo:.2f} ± {loo.se:.2f}  (p_loo={loo.p_loo:.1f})")
    print(f"LOO log score (per obs): {loo.elpd_loo / n_obs:.3f} ± {loo.se / n_obs:.3f}")
    n_bad_k = int((loo.pareto_k.values > 0.7).sum())
    if n_bad_k > 0:
        print(f"  Warning: {n_bad_k} observations with Pareto-k > 0.7 (unreliable LOO estimates)")

    if show_trace:  # Optional: MCMC trace diagnostics.
        az.plot_trace(trace)  # Plot chain traces and marginal densities per parameter.
        plt.tight_layout()  # Tidy up subplot spacing.
        plt.show()  # Render the figure.

    if show_forest:  # Optional: forest plot of parameter posteriors.
        az.plot_forest(trace, hdi_prob=hdi_prob)  # Show posterior means and HDIs for all parameters.
        plt.show()  # Render the figure.

    if show_posterior:  # Optional: posterior density plots.
        az.plot_posterior(trace, hdi_prob=hdi_prob)  # Density plot per parameter, annotated with HDI.
        plt.show()  # Render the figure.

    if show_regression:  # Optional: overlay posterior regression lines on data.
        a_samples = posterior["intercept"].values.flatten()  # Flattened posterior draws of the intercept.
        slope_samples = {pred: posterior[f"slope_{pred}"].values.flatten() for pred in predictors}  # Flattened slope draws per predictor.
        idx = np.random.choice(len(a_samples), n_regression_lines, replace=False)  # Random subset of draw indices to plot.

        fig, axes = plt.subplots(1, len(predictors), figsize=(6 * len(predictors), 5))  # One subplot per predictor.
        if len(predictors) == 1:  # plt.subplots returns a single Axes when ncols=1; wrap it for uniform handling.
            axes = [axes]  # Make axes iterable in the single-predictor case.

        for ax, pred in zip(axes, predictors):  # Plot each predictor's partial regression view.
            x = data[pred].values  # Observed values of this predictor.
            ax.scatter(x, y, alpha=0.5)  # Scatter of y vs this predictor.
            x_grid = np.linspace(x.min(), x.max(), 100)  # Dense grid across this predictor for smooth lines.

            # For each posterior draw, compute the contribution of *other* predictors
            # held at their sample means so the plotted line isolates this predictor's effect.
            other_contribution = np.zeros(len(a_samples))  # Per-draw constant offset from held-fixed predictors.
            for other_pred in predictors:  # Sum contributions across all other predictors.
                if other_pred != pred:  # Skip the predictor currently on the x-axis.
                    other_contribution += slope_samples[other_pred] * data[other_pred].mean()  # slope_draw * mean(x_other).

            for i in idx:  # Overlay a thin gray line per sampled posterior draw.
                y_line = a_samples[i] + other_contribution[i] + slope_samples[pred][i] * x_grid  # Predicted y on the grid.
                ax.plot(x_grid, y_line, alpha=0.05, color="gray")  # Low alpha so overlap shades posterior uncertainty.

            # Posterior mean line for emphasis.
            mean_other = sum(slope_samples[op].mean() * data[op].mean() for op in predictors if op != pred)  # Mean offset from other predictors.
            y_mean = a_samples.mean() + mean_other + slope_samples[pred].mean() * x_grid  # Mean posterior prediction on the grid.
            ax.plot(x_grid, y_mean, color="red")  # Plot the mean line in red.
            ax.set_xlabel(pred)  # Label x-axis with the predictor name.
            ax.set_ylabel(outcome)  # Label y-axis with the outcome name.
            ax.set_title(f"{outcome} vs {pred} (others at mean)")  # Title notes that other predictors are held at mean.

        plt.tight_layout()  # Tidy spacing across subplots.
        plt.show()  # Render the figure.

    if show_residuals:  # Optional: residual diagnostics using posterior mean coefficients.
        a_mean = posterior["intercept"].values.flatten().mean()  # Posterior mean intercept (point estimate).
        slope_means = {pred: posterior[f"slope_{pred}"].values.flatten().mean() for pred in predictors}  # Posterior mean slope per predictor.
        y_hat = a_mean + sum(slope_means[pred] * data[pred].values for pred in predictors)  # Fitted values using mean coefficients.
        residuals = y - y_hat  # Residuals: observed minus fitted.

        fig, axes = plt.subplots(1, len(predictors) + 1, figsize=(6 * (len(predictors) + 1), 5))  # One subplot for fitted + one per predictor.

        axes[0].scatter(y_hat, residuals, alpha=0.5)  # Residuals vs fitted values — should look like noise around 0.
        axes[0].axhline(0, color="red", linestyle="--")  # Zero reference line to judge bias/structure.
        axes[0].set_xlabel("Fitted values")  # x-axis label.
        axes[0].set_ylabel("Residuals")  # y-axis label.
        axes[0].set_title("Residuals vs Fitted")  # Subplot title.

        for ax, pred in zip(axes[1:], predictors):  # Residuals vs each predictor — check for unmodeled structure.
            ax.scatter(data[pred].values, residuals, alpha=0.5)  # Scatter residuals against this predictor.
            ax.axhline(0, color="red", linestyle="--")  # Zero reference line.
            ax.set_xlabel(pred)  # x-axis label is the predictor name.
            ax.set_ylabel("Residuals")  # y-axis label.
            ax.set_title(f"Residuals vs {pred}")  # Subplot title.

        plt.tight_layout()  # Tidy subplot spacing.
        plt.show()  # Render the figure.

    return trace  # Return the InferenceData so the caller can do further analysis.

# Simualte some fake data

n = 100
x = np.random.uniform(0, 1, n)                  
z = np.random.choice([0, 1], size=n, replace=True)
a = 1
b = 2
theta = 5
sigma = 2
y = a + b * x + theta * z + np.random.normal(0, sigma, n)
data = pd.DataFrame({"x": x, "z": z, "y": y})
display(data.head())
z0 = data[data["z"] == 0]
z1 = data[data["z"] == 1]

# fit_and_plot_lm(z0, predictors=["x"], outcome="y")
# fit_and_plot_lm(z1, predictors=["x"], outcome="y")

fit_and_plot_bayes(data = z0, predictors = ['x'], outcome = 'y',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=True,
                       show_residuals=True,
                       n_regression_lines=100)

fit_and_plot_bayes(data = z1, predictors = ['x'], outcome = 'y',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=True,
                       show_residuals=True,
                       n_regression_lines=100)

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_x, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.

            mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
intercept  0.071  0.541    -0.999      1.110      0.009    0.007    3304.0   
slope_x    2.925  0.945     1.104      4.782      0.016    0.013    3362.0   
sigma      1.891  0.195     1.540      2.297      0.003    0.003    4535.0   

           ess_tail  r_hat  
intercept    3671.0    1.0  
slope_x      3751.0    1.0  
sigma        4044.0    1.0  

Regression formula: y = 0.07 + 2.93*x
Residual std dev (σ): 1.89 ± 0.19
Bayesian R²: 0.173 ± 0.082
LOO-ELPD: -105.72 ± 5.67  (p_loo=3.2)
LOO log score (per obs): -2.073 ± 0.111

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_x, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.

            mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
intercept  6.536  0.642     5.259      7.788      0.011    0.008    3609.0   
slope_x    0.982  1.154    -1.272      3.199      0.020    0.015    3443.0   
sigma      2.376  0.254     1.912      2.882      0.004    0.004    4570.0   

           ess_tail  r_hat  
intercept    3834.0    1.0  
slope_x      3757.0    1.0  
sigma        4454.0    1.0  

Regression formula: y = 6.54 + 0.98*x
Residual std dev (σ): 2.38 ± 0.25
Bayesian R²: 0.033 ± 0.039
LOO-ELPD: -112.48 ± 4.29  (p_loo=2.6)
LOO log score (per obs): -2.295 ± 0.088

# simulate fake data

N = 100                                              # Number of observations (sample size).
K = 10                                               # Number of predictors (features).
X = np.random.uniform(0, 1, size=(N, K))             # N×K design matrix of Uniform(0,1) draws.
# print(X[:5])                                         # Show the first 5 rows of the design matrix
z = np.random.choice([0, 1], size=N, replace=True)   # Binary treatment indicator per observation.
a = 1                                                # Intercept (constant term).
b = np.arange(1, K + 1)                              # Length-K slope vector: [1, 2, ..., K].
theta = 5                                            # Treatment effect coefficient on z.
sigma = 2                                            # Residual standard deviation.
# X @ b is matrix multiplication: for each row of X, multiply each value by the matching entry in b and sum — one number per observation (same as np.dot(X, b)).
y = a + X @ b + theta * z + np.random.normal(0, sigma, N)  # Outcome generated by the linear model with noise.
data = pd.DataFrame(X, columns=[f"x{k+1}" for k in range(K)])  # Create DataFrame with predictor columns named x1, x2, ..., xK.
data["z"] = z  # Add treatment indicator to the DataFrame.
data["y"] = y  # Add outcome variable to the DataFrame.
display(data.head())  # Show the first few rows of the simulated dataset.
# describe the data
# print(data.describe())

z0 = data[data["z"] == 0]  # Subset of data where treatment indicator z is 0.
z1 = data[data["z"] == 1]  # Subset of data where

fit_and_plot_bayes(data=z0, predictors=[f"x{k+1}" for k in range(K)], outcome="y",
                   intercept_mu=0, intercept_sigma=50,
                   slope_mu=0, slope_sigma=50,
                   sigma_sigma=50,
                   samples=2000, tune=1000, hdi_prob=0.95,
                   show_trace=False, show_forest=False,
                   show_posterior=False, show_regression=True,
                   show_residuals=True,
                   n_regression_lines=100)

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_x1, slope_x2, slope_x3, slope_x4, slope_x5, slope_x6, slope_x7, slope_x8, slope_x9, slope_x10, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 4 seconds.

             mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
intercept  -0.199  1.785    -3.669      3.410      0.023    0.022    5839.0   
slope_x1    2.423  1.062     0.408      4.601      0.012    0.013    8016.0   
slope_x2    1.574  1.109    -0.716      3.690      0.013    0.013    7408.0   
slope_x3    1.640  1.116    -0.515      3.865      0.012    0.013    8343.0   
slope_x4    3.203  1.147     0.986      5.466      0.013    0.013    8356.0   
slope_x5    4.763  1.155     2.578      7.088      0.013    0.015    8040.0   
slope_x6    7.626  1.322     5.015     10.197      0.016    0.015    7081.0   
slope_x7    6.541  1.203     4.150      8.878      0.014    0.013    7215.0   
slope_x8    7.775  1.222     5.420     10.226      0.014    0.014    7868.0   
slope_x9   10.043  1.144     7.871     12.399      0.013    0.014    7501.0   
slope_x10  10.637  1.151     8.450     12.939      0.013    0.013    7698.0   
sigma       2.155  0.248     1.717      2.668      0.003    0.003    5885.0   

           ess_tail  r_hat  
intercept    4644.0    1.0  
slope_x1     5844.0    1.0  
slope_x2     5625.0    1.0  
slope_x3     5223.0    1.0  
slope_x4     5638.0    1.0  
slope_x5     5801.0    1.0  
slope_x6     5623.0    1.0  
slope_x7     6250.0    1.0  
slope_x8     6195.0    1.0  
slope_x9     5290.0    1.0  
slope_x10    5908.0    1.0  
sigma        5380.0    1.0  

Regression formula: y = -0.20 + 2.42*x1 + 1.57*x2 + 1.64*x3 + 3.20*x4 + 4.76*x5 + 7.63*x6 + 6.54*x7 + 7.77*x8 + 10.04*x9 + 10.64*x10
Residual std dev (σ): 2.15 ± 0.25
Bayesian R²: 0.884 ± 0.015
LOO-ELPD: -122.75 ± 5.44  (p_loo=12.3)
LOO log score (per obs): -2.316 ± 0.103
  Warning: 1 observations with Pareto-k > 0.7 (unreliable LOO estimates)

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/arviz/stats/stats.py:782: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
  warnings.warn(

<xarray.Dataset> Size: 784kB
Dimensions:    (chain: 4, draw: 2000)
Coordinates:
  * chain      (chain) int64 32B 0 1 2 3
  * draw       (draw) int64 16kB 0 1 2 3 4 5 6 ... 1994 1995 1996 1997 1998 1999
Data variables:
    intercept  (chain, draw) float64 64kB 0.6593 1.075 ... -1.733 -0.5384
    slope_x1   (chain, draw) float64 64kB 2.437 0.7655 4.487 ... 4.868 4.52 2.59
    slope_x2   (chain, draw) float64 64kB 2.77 0.7728 2.016 ... 2.61 2.483 2.557
    slope_x3   (chain, draw) float64 64kB 2.857 0.04978 2.969 ... 2.047 0.1386
    slope_x4   (chain, draw) float64 64kB 4.503 2.468 3.083 ... 3.275 2.759 3.24
    slope_x5   (chain, draw) float64 64kB 3.212 7.074 3.169 ... 1.914 4.987
    slope_x6   (chain, draw) float64 64kB 5.582 8.671 5.807 ... 9.141 5.452
    slope_x7   (chain, draw) float64 64kB 6.996 5.081 8.09 ... 6.613 6.227 8.688
    slope_x8   (chain, draw) float64 64kB 8.627 7.771 8.292 ... 9.32 8.912 8.456
    slope_x9   (chain, draw) float64 64kB 8.754 8.907 10.01 ... 9.699 9.822
    slope_x10  (chain, draw) float64 64kB 8.06 11.32 8.755 ... 12.24 11.8 10.75
    sigma      (chain, draw) float64 64kB 2.283 1.931 1.922 ... 2.021 2.174 2.14
Attributes:
    created_at:                 2026-04-29T07:00:57.001070+00:00
    arviz_version:              0.23.4
    inference_library:          pymc
    inference_library_version:  5.28.2
    sampling_time:              3.8593673706054688
    tuning_steps:               1000

array([0, 1, 2, 3])

array([   0,    1,    2, ..., 1997, 1998, 1999], shape=(2000,))

array([[ 0.659261  ,  1.07534163, -0.6314563 , ...,  2.54743372,
        -0.03963473,  0.56361164],
       [-2.56738798,  1.11078748, -1.06883906, ..., -0.93649978,
         1.70669729,  1.35469173],
       [ 1.42755547,  1.98204049,  1.11902833, ...,  0.2567874 ,
         0.29159445,  1.0119711 ],
       [-2.93372553,  1.0749503 , -2.56214487, ..., -2.33643222,
        -1.73280783, -0.53836776]], shape=(4, 2000))

# Import gradesW4315.dat from /ros_data
grades = pd.read_csv('../ros_data/gradesW4315.dat', 
                     sep=r'\s+')

# Display the first few rows of the dataset
display(grades.head(10))

fit_and_plot_bayes(data=grades, predictors=["midterm"], outcome="final",
                     intercept_mu=0, intercept_sigma=50,
                     slope_mu=0, slope_sigma=50,
                     sigma_sigma=50,
                     samples=2000, tune=1000, hdi_prob=0.95,
                     show_trace=False, show_forest=False,
                     show_posterior=False, show_regression=True,
                     show_residuals=True,
                     n_regression_lines=100)

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_midterm, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 2 seconds.

                 mean      sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  \
intercept      57.736  16.722    26.616     90.677      0.349    0.251   
slope_midterm   0.789   0.211     0.387      1.196      0.004    0.003   
sigma          15.103   1.535    12.253     18.174      0.026    0.021   

               ess_bulk  ess_tail  r_hat  
intercept        2302.0    3062.0    1.0  
slope_midterm    2323.0    3050.0    1.0  
sigma            3379.0    3274.0    1.0  

Regression formula: final = 57.74 + 0.79*midterm
Residual std dev (σ): 15.10 ± 1.54
Bayesian R²: 0.211 ± 0.080
LOO-ELPD: -216.22 ± 5.44  (p_loo=3.4)
LOO log score (per obs): -4.158 ± 0.105

<xarray.Dataset> Size: 208kB
Dimensions:        (chain: 4, draw: 2000)
Coordinates:
  * chain          (chain) int64 32B 0 1 2 3
  * draw           (draw) int64 16kB 0 1 2 3 4 5 ... 1995 1996 1997 1998 1999
Data variables:
    intercept      (chain, draw) float64 64kB 54.06 49.2 57.57 ... 52.49 50.71
    slope_midterm  (chain, draw) float64 64kB 0.8117 0.8722 ... 0.9173 0.8182
    sigma          (chain, draw) float64 64kB 15.48 13.51 13.52 ... 15.54 15.64
Attributes:
    created_at:                 2026-04-29T07:01:01.821555+00:00
    arviz_version:              0.23.4
    inference_library:          pymc
    inference_library_version:  5.28.2
    sampling_time:              2.4053969383239746
    tuning_steps:               1000

array([0, 1, 2, 3])

array([   0,    1,    2, ..., 1997, 1998, 1999], shape=(2000,))

array([[54.06140119, 49.19791635, 57.57081119, ..., 56.03097253,
        44.30725101, 36.20968289],
       [77.68917924, 75.08995137, 75.40488869, ..., 31.42233148,
        43.55833196, 43.55833196],
       [69.15300694, 71.11707757, 82.00498459, ..., 38.96762794,
        39.44146605, 62.67034526],
       [57.09143096, 58.39663951, 65.5706168 , ..., 48.08757355,
        52.48867068, 50.71341153]], shape=(4, 2000))

array([[0.8117044 , 0.87222632, 0.81376283, ..., 0.80819193, 0.9618803 ,
        1.07709491],
       [0.56655731, 0.54933781, 0.56478379, ..., 1.11027455, 0.98584288,
        0.98584288],
       [0.60612526, 0.62413849, 0.49098806, ..., 1.02569847, 1.01724878,
        0.71006307],
       [0.80773335, 0.82447506, 0.69045279, ..., 0.84647492, 0.91734581,
        0.81822213]], shape=(4, 2000))

# read csv
newcomb = pd.read_csv('../ros_data/newcomb.txt', sep=' ')
display(newcomb.head())

#plot histogram of the data
sns.histplot(newcomb['y'], bins=30, kde=True)
plt.title('Newcomb\'s Speed of Light Measurements')
plt.xlabel('Time (microseconds)')
plt.ylabel('Frequency')
plt.show()

# using pymc to fit a normal distribution to the data, with no predictors (just an intercept) - dont use fit_and_plot_bayes since it expects a predictor, so we will write a custom pymc model for this
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sigma=50)  # Prior for the mean (intercept).
    sigma = pm.HalfNormal('sigma', sigma=50)  # Prior for the standard deviation (must be positive).
    likelihood = pm.Normal('y', mu=mu, sigma=sigma, observed=newcomb['y'].values)  # Likelihood of observed data given mu and sigma.
    trace = pm.sample(2000, tune=1000, chains=4)  # Sample from the posterior distribution using NUTS.

az.plot_trace(trace)  # Plot the MCMC traces and marginal densities for mu and sigma.
plt.tight_layout()  # Adjust subplot spacing.
plt.show()  # Render the trace plot.

# Stack the posterior samples of mu and sigma into a 2-column matrix.
# Each row is one posterior draw: [mu_draw, sigma_draw].
# This mirrors R's `sims <- as.matrix(fit)`.
sims = np.column_stack([
    trace.posterior["mu"].values.ravel(),       # flatten chains x draws into 1D
    trace.posterior["sigma"].values.ravel(),
])

# Total number of posterior draws we have available.
n_sims = sims.shape[0]

# Build fake ("replicated") datasets from the fitted model.
# Seeded RNG so results are reproducible.
rng = np.random.default_rng(0)

# Pre-allocate an (n_sims x n) array; each row will be one simulated dataset of size n.
y_rep = np.empty((n_sims, n))

# For each posterior draw, simulate n new observations from Normal(mu, sigma).
for s in range(n_sims):
    y_rep[s] = rng.normal(sims[s, 0], sims[s, 1], size=n)

# Reset the RNG and pick 20 random draws to visualize (no repeats).
rng = np.random.default_rng(0)
sample_idx = rng.choice(n_sims, size=20, replace=False)

# Create a 5-row x 4-column grid of subplots (20 panels total).
fig, axes = plt.subplots(5, 4, figsize=(12, 10))

# Plot a histogram of one replicated dataset in each panel.
for ax, s in zip(axes.flat, sample_idx):       # axes.flat iterates all 20 subplots
    ax.hist(y_rep[s, :], bins=15, edgecolor="black")
    ax.set_title(f"draw {s}", fontsize=8)      # label which posterior draw it came from

plt.tight_layout()   # stop titles/axes from overlapping
plt.show()           # render the figure

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 4 seconds.

# Define a test statistic: here, the minimum of a dataset.
def test(y):
    return np.min(y)

# Apply the test statistic to each row of y_rep.
# axis=1 means: collapse columns, one result per row (one per replicated dataset).
# Equivalent to R's apply(y_rep, 1, test).
test_rep = np.apply_along_axis(test, 1, y_rep)
# Simpler / faster alternative: test_rep = y_rep.min(axis=1)

# Observed test statistic (minimum of the real data).
t_obs = test(newcomb["y"].values)

# xlim spans both the observed value and the full range of replicated minima
# so the observed-data line is always visible on the axis.
lo = min(t_obs, test_rep.min())
hi = max(t_obs, test_rep.max())

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(test_rep, bins=30, edgecolor="black", range=(lo, hi))

# Vertical line marking the observed test statistic.
# R: lines(rep(test(y),2), c(0,n))
ax.axvline(t_obs, color="red", linewidth=2)

ax.set_xlabel("min(y_rep)")
ax.set_ylabel("Frequency")
ax.set_title("Posterior predictive check: minimum")
plt.show()

# read csv
unemp = pd.read_csv('../ros_data/unemp.txt', sep=' ')
display(unemp.head())

# plot year vs y
sns.lineplot(data=unemp, x='year', y='y')
plt.title('Unemployment Rate Over Time')
plt.xlabel('Year')
plt.ylabel('Unemployment Rate (%)')
plt.show()

unemp['y_lag'] = unemp['y'].shift(1)

unemp_model = fit_and_plot_bayes(data=unemp.dropna(), predictors=['y_lag'], outcome='y',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=True,
                       show_residuals=True,
                       n_regression_lines=100)

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_y_lag, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.

              mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
intercept    1.365  0.472     0.414      2.250      0.009    0.007    2951.0   
slope_y_lag  0.767  0.078     0.618      0.923      0.001    0.001    2994.0   
sigma        1.043  0.092     0.879      1.238      0.002    0.001    3693.0   

             ess_tail  r_hat  
intercept      2811.0    1.0  
slope_y_lag    2563.0    1.0  
sigma          3801.0    1.0  

Regression formula: y = 1.37 + 0.77*y_lag
Residual std dev (σ): 1.04 ± 0.09
Bayesian R²: 0.592 ± 0.051
LOO-ELPD: -102.03 ± 8.29  (p_loo=3.7)
LOO log score (per obs): -1.479 ± 0.120

# print(unemp_model)
posterior = unemp_model.posterior
# print(posterior)

intercept_sims = posterior["intercept"].values.ravel()
slope_sims = posterior["slope_y_lag"].values.ravel()
sigma_sims = posterior["sigma"].values.ravel()

print("Posterior mean of intercept:", np.mean(intercept_sims))
print("Posterior mean of slope:", np.mean(slope_sims))
print("Posterior mean of sigma:", np.mean(sigma_sims))

# len of sims
print("Number of posterior draws:", len(intercept_sims))

n = len(unemp) # number of time points in the original data
n_sims = len(intercept_sims) # number of posterior draws
rng = np.random.default_rng(0) # seeded RNG for reproducibility

y_rep = np.full((n_sims, n), np.nan) # Start with an array of NaNs; we'll fill in the first column with the initial value and then iteratively fill in the rest.
# print(y_rep)

for s in range(n_sims):
    y_rep[s, 0] = unemp['y'].iloc[0] # Set the first value of each simulated series to the same initial value as the real data.
    for t in range(1, n):
        y_rep[s, t] = intercept_sims[s] + slope_sims[s] * y_rep[s, t-1] + rng.normal(0, sigma_sims[s]) # Simulate the next value based on the previous one using the AR(1) model with noise.
        
print(y_rep.shape) # Should be (n_sims, n)

# plot a random 10 simulated series, each as sub plots in a 5x2 grid and plot the original data on top of each subplot for comparison
rng = np.random.default_rng(0)
sample_idx = rng.choice(n_sims, size=10, replace=False) # Randomly select 10 posterior draws to visualize.
fig, axes = plt.subplots(5, 2, figsize=(12, 15)) # Create a 5-row x 2-column grid of subplots.
for ax, s in zip(axes.flat, sample_idx):
    ax.plot(unemp['year'], y_rep[s, :], label='Simulated', alpha=0.7) # Plot the simulated series for this draw.
    ax.plot(unemp['year'], unemp['y'], label='Observed', color='red') # Overlay the original data in red for comparison.
    ax.set_title(f"Draw {s}") # Title indicating which posterior draw it is.
    ax.set_xlabel('Year') # x-axis label.
    ax.set_ylabel('Unemployment Rate (%)') # y-axis label.
    ax.legend() # Show legend to differentiate simulated vs observed.
plt.tight_layout() # Adjust subplot spacing.
plt.show() # Render the figure.

Posterior mean of intercept: 1.365171392702648
Posterior mean of slope: 0.7671765233022904
Posterior mean of sigma: 1.042686294763032
Number of posterior draws: 8000
(8000, 70)

# Test statistic: count the number of turning points in a series.
# A turning point is where the direction changes (rising to falling or vice versa).
def test(y):
    n = len(y)
    y_lag = np.roll(y, 1)       # y[t-1]
    y_lag[0] = np.nan
    y_lag_2 = np.roll(y, 2)     # y[t-2]
    y_lag_2[:2] = np.nan
    diff1 = np.sign(y - y_lag)          # direction of current move
    diff2 = np.sign(y_lag - y_lag_2)    # direction of previous move
    return np.nansum(diff1 != diff2)    # count where direction changed

# Observed test statistic
test_y = test(unemp['y'].values)

# Apply to each simulated series (each row of y_rep)
test_rep = np.apply_along_axis(test, 1, y_rep)

# Plot
lo = min(test_y, test_rep.min())
hi = max(test_y, test_rep.max())

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(test_rep, bins=30, edgecolor="black", range=(lo, hi))
ax.axvline(test_y, color="red", linewidth=2, label=f"Observed = {test_y}")
ax.set_xlabel("Number of turning points")
ax.set_ylabel("Frequency")
ax.set_title("Posterior predictive check: turning points")
ax.legend()
plt.show()

# simulated dataset

n = 20
# array 1:n
x = np.arange(1, n + 1)
a = 0.2
b = 0.3
sigma = 1
# set seed for reproducibility
rng = np.random.default_rng(2141)
y = a + b * x + rng.normal(0, sigma, n)
data = pd.DataFrame({"x": x, "y": y})
display(data.head())

fit_and_plot_bayes(data, 'x', 'y',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=True,
                       show_residuals=True,
                       n_regression_lines=100)

# remove 15th observation
data_dropped = data.drop(index=15).reset_index(drop=True)
display(data_dropped.head())

new_model = fit_and_plot_bayes(data_dropped, 'x', 'y',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=True,
                       show_residuals=True,
                       n_regression_lines=100)

# predict the 15 observation using the new model
posterior = new_model.posterior
intercept_sims = posterior["intercept"].values.ravel()
slope_sims = posterior["slope_x"].values.ravel()
sigma_sims = posterior["sigma"].values.ravel()
x_new = 15
y_pred_sims = intercept_sims + slope_sims * x_new + rng.normal(0, sigma_sims)
y_pred_mean = np.mean(y_pred_sims)
y_pred_hdi = az.hdi(y_pred_sims, hdi_prob=0.95)
print(f"Predicted value for x=15: {y_pred_mean:.2f} (95% HDI: [{y_pred_hdi[0]:.2f}, {y_pred_hdi[1]:.2f}])")

# plot distribution of predicted values for x=15
sns.histplot(y_pred_sims, bins=30, kde=True)
plt.axvline(y_pred_mean, color='red', label='Predicted mean')
plt.axvline(y_pred_hdi[0], color='blue', linestyle='--', label='95% HDI')
plt.axvline(y_pred_hdi[1], color='blue', linestyle='--')
plt.title('Posterior predictive distribution for x=15')
plt.xlabel('Predicted y')
plt.ylabel('Frequency')
plt.legend()
plt.show()

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_x, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.

            mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
intercept  0.305  0.541    -0.740      1.408      0.010    0.008    3184.0   
slope_x    0.300  0.045     0.211      0.387      0.001    0.001    3181.0   
sigma      1.156  0.217     0.774      1.576      0.004    0.004    3585.0   

           ess_tail  r_hat  
intercept    3740.0    1.0  
slope_x      3513.0    1.0  
sigma        3090.0    1.0  

Regression formula: y = 0.30 + 0.30*x
Residual std dev (σ): 1.16 ± 0.22
Bayesian R²: 0.722 ± 0.070
LOO-ELPD: -31.89 ± 2.56  (p_loo=2.4)
LOO log score (per obs): -1.595 ± 0.128

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_x, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.

            mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
intercept  0.395  0.491    -0.548      1.390      0.009    0.007    2801.0   
slope_x    0.281  0.041     0.201      0.362      0.001    0.001    2946.0   
sigma      1.054  0.196     0.706      1.443      0.004    0.003    2888.0   

           ess_tail  r_hat  
intercept    3453.0    1.0  
slope_x      3219.0    1.0  
sigma        2493.0    1.0  

Regression formula: y = 0.39 + 0.28*x
Residual std dev (σ): 1.05 ± 0.20
Bayesian R²: 0.735 ± 0.070
LOO-ELPD: -28.49 ± 2.42  (p_loo=2.4)
LOO log score (per obs): -1.499 ± 0.128

kidiq = pd.read_csv('../ros_data/kidiq.csv', skiprows=0)
display(kidiq.head())

# fit_and_plot_lm(kidiq, ['mom_hs'], 'kid_score', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)

fit1 = fit_and_plot_bayes(kidiq, ['mom_hs', 'mom_iq'], 'kid_score',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=False,
                       n_regression_lines=100)

# add 5 pure noise predictors to the data — matches R: array(rnorm(5*n), c(n,5))
n = len(kidiq)
noise = rng.normal(0, 1, size=5 * n).reshape(n, 5, order='F')
for i in range(5):
    kidiq[f'noise_{i+1}'] = noise[:, i]

fit2 = fit_and_plot_bayes(kidiq, ['mom_hs', 'mom_iq', 'noise_1', 'noise_2', 'noise_3', 'noise_4', 'noise_5'], 'kid_score',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=False,
                       n_regression_lines=100)

fit3 = fit_and_plot_bayes(kidiq, ['mom_hs'], 'kid_score',
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=False, show_forest=False,
                       show_posterior=False, show_regression=False,
                       n_regression_lines=100)

az.compare({'model1': fit1, 'model2': fit2, 'model3': fit3}, ic='loo')

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_mom_hs, slope_mom_iq, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 3 seconds.

                mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  \
intercept     25.315  5.943    13.465     36.531      0.093    0.075   
slope_mom_hs   5.895  2.196     1.707     10.264      0.031    0.027   
slope_mom_iq   0.568  0.061     0.445      0.685      0.001    0.001   
sigma         18.186  0.623    17.039     19.471      0.009    0.008   

              ess_bulk  ess_tail  r_hat  
intercept       4126.0    4585.0    1.0  
slope_mom_hs    5167.0    4453.0    1.0  
slope_mom_iq    4061.0    4277.0    1.0  
sigma           5310.0    4574.0    1.0  

Regression formula: kid_score = 25.31 + 5.89*mom_hs + 0.57*mom_iq
Residual std dev (σ): 18.19 ± 0.62
Bayesian R²: 0.217 ± 0.031
LOO-ELPD: -1876.06 ± 14.19  (p_loo=4.0)
LOO log score (per obs): -4.323 ± 0.033

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [intercept, slope_mom_hs, slope_mom_iq, slope_noise_1, slope_noise_2, slope_noise_3, slope_noise_4, slope_noise_5, sigma]

/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for 
Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 5 seconds.

                 mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  \
intercept      26.137  5.918    14.888     37.916      0.079    0.067   
slope_mom_hs    5.948  2.245     1.509     10.193      0.022    0.029   
slope_mom_iq    0.560  0.061     0.439      0.680      0.001    0.001   
slope_noise_1   0.880  0.857    -0.802      2.558      0.008    0.010   
slope_noise_2  -0.360  0.885    -2.090      1.423      0.009    0.012   
slope_noise_3  -0.878  0.856    -2.545      0.796      0.009    0.010   
slope_noise_4  -0.491  0.891    -2.326      1.202      0.009    0.010   
slope_noise_5   0.404  0.861    -1.271      2.101      0.008    0.011   
sigma          18.234  0.613    17.078     19.463      0.006    0.007   

               ess_bulk  ess_tail  r_hat  
intercept        5609.0    5608.0    1.0  
slope_mom_hs    10261.0    4910.0    1.0  
slope_mom_iq     5538.0    5411.0    1.0  
slope_noise_1   11330.0    5827.0    1.0  
slope_noise_2   10340.0    5061.0    1.0  
slope_noise_3    9593.0    5846.0    1.0  
slope_noise_4   10568.0    5944.0    1.0  
slope_noise_5   10876.0    5628.0    1.0  
sigma           10033.0    5774.0    1.0  

Regression formula: kid_score = 26.14 + 5.95*mom_hs + 0.56*mom_iq + 0.88*noise_1 + -0.36*noise_2 + -0.88*noise_3 + -0.49*noise_4 + 0.40*noise_5
Residual std dev (σ): 18.23 ± 0.61
Bayesian R²: 0.226 ± 0.030

	x	z	y
0	0.454181	0	-0.878259
1	0.045388	1	8.343305
2	0.869509	1	8.853543
3	0.608016	0	1.662860
4	0.876591	0	1.602928

	x1	x2	x3	x4	x5	x6	x7	x8	x9	x10	z	y
0	0.613026	0.339508	0.610554	0.524063	0.060038	0.373551	0.778760	0.970273	0.416569	0.192414	1	32.956398
1	0.568241	0.511727	0.355487	0.502818	0.973214	0.458990	0.618661	0.919096	0.955292	0.107091	0	34.749450
2	0.707802	0.010413	0.983991	0.519175	0.924139	0.742563	0.089703	0.706747	0.554307	0.875053	0	38.297788
3	0.139362	0.989462	0.740087	0.051541	0.601359	0.139781	0.604497	0.090807	0.892681	0.931586	0	27.729313
4	0.648994	0.508998	0.785647	0.091857	0.470251	0.622525	0.060170	0.631203	0.295915	0.295320	0	21.876481

	year	y
0	1947	3.9
1	1948	3.8
2	1949	5.9
3	1950	5.3
4	1951	3.3

	x	y
0	1	0.640604
1	2	2.168083
2	3	2.893252
3	4	1.091838
4	5	2.274745

	x	y
0	1	0.640604
1	2	2.168083
2	3	2.893252
3	4	1.091838
4	5	2.274745

11. Assumptions, diagnostics, and model evaluation¶

11.1 Assumptions of regression analysis¶

Failures of the assumptions¶

Causal inference¶

11.2 Plotting the data and fitted model¶

Displaying a regression line as a function of one input variable¶

Forming a linear predictor from a multiple regression¶

11.3 Residual plots¶

A confusing choice: plot residuals vs. predicted values, or residuals vs. observed values?¶

11.4 Comparing data to replications from a fitted model¶

Example: simulation-based checking of a fitted normal distribution¶

Checking model fit using a numerical data summary¶

11.5 Example: predictive simulation to check the fit of a time-series model¶

Fitting a first-order autoregression to the unemployment series¶

Visual and numerical comparisons of replicated to actual data¶

11.6 Residual standard deviation σ and explained variance R²¶

Difficulties in interpreting residual standard deviation and explained variance¶

Bayesian R²¶

11.7 External validation: checking fitted model on new data¶

11.8 Cross validation¶

Leave-one-out cross validation (LOO)¶

Fast leave-one-out cross validation¶

Summarizing prediction error using the log score and deviance¶

Overfitting and AIC¶

Interpreting differences in log scores¶

Demonstration of adding pure noise predictors to a model¶

K-fold cross validation¶

Demonstration of K-fold cross validation¶

Concerns about model selection¶

11.9 Bibliographic note¶

11.10 Exercises¶

	hw1	hw2	hw3	hw4	midterm	hw5	hw6	hw7	final
0	95	88	100	95	80	96	99	0	103
1	0	74	74	0	53	83	97	0	79
2	100	0	105	100	91	96	100	96	122
3	0	90	76	100	63	91	95	0	78
4	100	96	99	100	91	93	100	92	135
5	90	83	95	100	73	89	100	90	117
6	95	98	100	100	59	98	98	94	135
7	80	100	97	100	69	94	98	101	123
8	95	90	98	90	78	95	99	100	109
9	90	94	95	98	91	94	100	89	126

	kid_score	mom_hs	mom_iq	mom_work	mom_age
0	65	1	121.117529	4	27
1	98	1	89.361882	4	25
2	85	1	115.443165	4	27
3	83	1	99.449639	3	25
4	115	1	92.745710	4	27

	rank	elpd_loo	p_loo	elpd_diff	weight	se	dse	warning	scale
model1	0	-1876.059170	4.007582	0.000000	9.843872e-01	14.189357	0.000000	False	log
model2	1	-1879.663074	8.926097	3.603904	9.466809e-16	14.088293	1.788628	False	log
model3	2	-1914.768343	3.019895	38.709173	1.561281e-02	13.754367	8.401360	False	log

	hw1	hw2	hw3	hw4	midterm	hw5	hw6	hw7	final
0	95	88	100	95	80	96	99	0	103
1	0	74	74	0	53	83	97	0	79
2	100	0	105	100	91	96	100	96	122
3	0	90	76	100	63	91	95	0	78
4	100	96	99	100	91	93	100	92	135
5	90	83	95	100	73	89	100	90	117
6	95	98	100	100	59	98	98	94	135
7	80	100	97	100	69	94	98	101	123
8	95	90	98	90	78	95	99	100	109
9	90	94	95	98	91	94	100	89	126

	hw1	hw2	hw3	hw4	midterm	hw5	hw6	hw7	final
0	95	88	100	95	80	96	99	0	103
1	0	74	74	0	53	83	97	0	79
2	100	0	105	100	91	96	100	96	122
3	0	90	76	100	63	91	95	0	78
4	100	96	99	100	91	93	100	92	135
5	90	83	95	100	73	89	100	90	117
6	95	98	100	100	59	98	98	94	135
7	80	100	97	100	69	94	98	101	123
8	95	90	98	90	78	95	99	100	109
9	90	94	95	98	91	94	100	89	126