# Tutorial 6: Model Calibration Guide

Learn how to calibrate models to historical data using the calibration module.
## Prerequisites

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from privatecredit.calibration import (
    HistoricalCalibrator,
    TransitionCalibrator,
    ParameterEstimator,
    BayesianEstimator,
)
```
## 1. Historical Default Rate Calibration

### Prepare Historical Data

```python
# Generate synthetic historical data (replace with actual data)
np.random.seed(42)

n_periods = 60   # Monthly observations
true_pd = 0.025  # 2.5% annual default rate

exposure_per_period = np.random.poisson(1000, n_periods)
defaults = np.random.binomial(exposure_per_period, true_pd / 12)

print(f"Total periods: {n_periods}")
print(f"Total defaults: {defaults.sum()}")
print(f"Total exposure: {exposure_per_period.sum()}")
print(f"Observed default rate: {defaults.sum() / exposure_per_period.sum():.4f}")
```
### Fit Default Rate Distribution

```python
# Initialize calibrator
calibrator = HistoricalCalibrator()

# Fit to historical data
params = calibrator.fit_default_rates(defaults, exposure_per_period)

print("\nCalibrated Parameters:")
print(f"  Mean default rate: {params['mean_default_rate']:.4f}")
print(f"  Std default rate: {params['std_default_rate']:.4f}")
print(f"  Beta alpha: {params['beta_alpha']:.2f}")
print(f"  Beta beta: {params['beta_beta']:.2f}")
```
### Bootstrap Confidence Intervals

```python
# Compute observed per-period rates (guard against zero exposure)
observed_rates = defaults / (exposure_per_period + 1e-10)

# Bootstrap CI for the mean rate
ci = calibrator.bootstrap_confidence_intervals(
    observed_rates,
    statistic='mean',
    n_bootstrap=10000,
    confidence_level=0.95
)

print("\n95% Bootstrap CI for mean default rate:")
print(f"  Point estimate: {ci['point_estimate']:.4f}")
print(f"  Lower bound: {ci['lower_bound']:.4f}")
print(f"  Upper bound: {ci['upper_bound']:.4f}")
print(f"  Standard error: {ci['std_error']:.4f}")
```
## 2. Transition Matrix Estimation

### From Cohort Data

```python
# Loan-status states; Default, Prepaid, and Matured are absorbing
states = ['Performing', '30DPD', '60DPD', '90DPD', 'Default', 'Prepaid', 'Matured']

# Simulated cohort transitions (n_from x n_to counts)
transition_counts = np.array([
    [850, 30, 10, 5, 2, 12, 91],         # From Performing
    [400, 350, 100, 50, 20, 30, 50],     # From 30DPD
    [200, 200, 300, 150, 50, 50, 50],    # From 60DPD
    [100, 100, 150, 350, 150, 50, 100],  # From 90DPD
    [0, 0, 0, 0, 1000, 0, 0],            # From Default (absorbing)
    [0, 0, 0, 0, 0, 1000, 0],            # From Prepaid (absorbing)
    [0, 0, 0, 0, 0, 0, 1000]             # From Matured (absorbing)
])

# Fit transition matrix by maximum likelihood
trans_calibrator = TransitionCalibrator()
P_mle = trans_calibrator.fit_from_counts(transition_counts)

print("MLE Transition Matrix:")
print(pd.DataFrame(P_mle, index=states, columns=states).round(4))
```
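For cohort counts, the maximum-likelihood estimate has a closed form: each row of counts divided by its row total. Assuming `fit_from_counts` implements this standard MLE, you can verify the output directly:

```python
# Closed-form MLE check: P_ij = n_ij / sum_j n_ij
P_check = transition_counts / transition_counts.sum(axis=1, keepdims=True)
print("Matches row-normalized counts:", np.allclose(P_mle, P_check))
```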
### Calibrate to Target Rates

```python
# Calibrate to hit specific default/prepayment targets
target_default = 0.02      # 2% cumulative default by maturity
target_prepayment = 0.15   # 15% cumulative prepayment by maturity
maturity = 60              # months

P_calibrated = trans_calibrator.fit_to_target_rates(
    target_default_rate=target_default,
    target_prepayment_rate=target_prepayment,
    maturity=maturity
)

# Verify by projecting the state distribution forward to maturity
state_probs = np.zeros(7)
state_probs[0] = 1.0  # Start in Performing
for _ in range(maturity):
    state_probs = state_probs @ P_calibrated

print(f"\nVerification ({maturity}-month projection):")
print(f"  Target default: {target_default:.2%}, Achieved: {state_probs[4]:.2%}")
print(f"  Target prepay: {target_prepayment:.2%}, Achieved: {state_probs[5]:.2%}")
```
### Add Dirichlet Prior (Regularization)

```python
# Weak uniform Dirichlet prior for smoothing sparse rows
prior_alpha = np.ones((7, 7)) * 0.1

P_regularized = trans_calibrator.fit_with_prior(
    transition_counts,
    prior_alpha=prior_alpha
)

print("\nRegularized Transition Matrix:")
print(pd.DataFrame(P_regularized, index=states, columns=states).round(4))
```
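Under a Dirichlet prior on each row, a standard smoothing rule is to add the prior pseudo-counts to the observed counts and renormalize. This sketch assumes `fit_with_prior` returns the posterior mean; the library may use the posterior mode instead:

```python
# Dirichlet smoothing sketch: add pseudo-counts, then renormalize rows
smoothed = transition_counts + prior_alpha
P_smooth = smoothed / smoothed.sum(axis=1, keepdims=True)
print("Close to fit_with_prior output:", np.allclose(P_regularized, P_smooth))
```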
## 3. LGD Distribution Fitting

### Maximum Likelihood Estimation

```python
# Generate synthetic LGD data (bimodal: secured vs unsecured)
np.random.seed(42)
lgd_secured = np.random.beta(2, 8, size=300)    # Mean ~20%
lgd_unsecured = np.random.beta(4, 4, size=200)  # Mean ~50%
lgd_data = np.concatenate([lgd_secured, lgd_unsecured])
lgd_data = np.clip(lgd_data, 0.01, 0.99)  # Avoid the 0/1 boundaries

# Fit a single beta distribution with MLE
estimator = ParameterEstimator()
beta_params = estimator.fit_lgd_mle(lgd_data, model='beta')

print("Beta Distribution MLE:")
print(f"  Alpha: {beta_params['alpha']:.3f}")
print(f"  Beta: {beta_params['beta']:.3f}")
print(f"  Mean LGD: {beta_params['mean']:.3f}")
print(f"  Mode LGD: {beta_params['mode']:.3f}")
print(f"  Variance: {beta_params['variance']:.4f}")
```
### Compare Distribution Models

```python
# Try different candidate distributions
models = ['beta', 'kumaraswamy', 'mixture_beta']
results = {}

for model in models:
    params = estimator.fit_lgd_mle(lgd_data, model=model)
    results[model] = params

# Compare via AIC/BIC (lower is better); guard against missing values,
# since a float format spec would crash on the string 'N/A'
print("\nModel Comparison:")
print(f"{'Model':<15} {'AIC':<10} {'BIC':<10} {'Mean':<10}")
print("-" * 45)
for model, params in results.items():
    aic = params.get('aic')
    bic = params.get('bic')
    aic_str = f"{aic:<10.1f}" if aic is not None else f"{'N/A':<10}"
    bic_str = f"{bic:<10.1f}" if bic is not None else f"{'N/A':<10}"
    print(f"{model:<15} {aic_str} {bic_str} {params['mean']:<10.3f}")
```
### Plot Fitted Distribution

```python
fig, ax = plt.subplots(figsize=(10, 6))

# Histogram of observed data
ax.hist(lgd_data, bins=30, density=True, alpha=0.6, label='Observed', color='steelblue')

# Overlay the fitted beta density
x = np.linspace(0.01, 0.99, 100)
for model, params in results.items():
    if model == 'beta':
        pdf = stats.beta.pdf(x, params['alpha'], params['beta'])
        ax.plot(x, pdf, label=f"Beta (a={params['alpha']:.2f}, b={params['beta']:.2f})")

ax.set_xlabel('LGD')
ax.set_ylabel('Density')
ax.set_title('LGD Distribution Fitting')
ax.legend()
plt.tight_layout()
plt.show()
```
## 4. Bayesian Parameter Estimation

### Default Rate with Prior

```python
bayes_estimator = BayesianEstimator()

# Informative prior based on industry knowledge:
# Beta(2, 80) implies a ~2.4% prior mean with moderate uncertainty
bayes_results = bayes_estimator.fit_default_rate_bayesian(
    defaults,
    exposure_per_period,
    prior_alpha=2.0,
    prior_beta=80.0,
    n_samples=10000
)

print("Bayesian Estimation Results:")
print(f"  Prior mean: {2.0 / (2.0 + 80.0):.4f}")
print(f"  Posterior mean: {bayes_results['mean']:.4f}")
print(f"  Posterior median: {bayes_results['median']:.4f}")
print(f"  95% Credible Interval: [{bayes_results['ci_lower']:.4f}, {bayes_results['ci_upper']:.4f}]")
print(f"  95% HPD Interval: [{bayes_results['hpd_lower']:.4f}, {bayes_results['hpd_upper']:.4f}]")
```
### Prior Sensitivity Analysis

```python
def prior_sensitivity(defaults, exposure, priors):
    """Analyze how different priors affect the posterior."""
    results = []
    for name, (alpha, beta) in priors.items():
        posterior = BayesianEstimator().fit_default_rate_bayesian(
            defaults, exposure, prior_alpha=alpha, prior_beta=beta
        )
        results.append({
            'Prior': name,
            'Prior_Mean': alpha / (alpha + beta),
            'Posterior_Mean': posterior['mean'],
            'CI_Width': posterior['ci_upper'] - posterior['ci_lower']
        })
    return pd.DataFrame(results)

# Compare priors of varying strength and location
priors = {
    'Uninformative': (1, 1),
    'Weak': (2, 80),
    'Strong': (5, 200),
    'Pessimistic': (5, 100)
}

sensitivity = prior_sensitivity(defaults, exposure_per_period, priors)
print("\nPrior Sensitivity Analysis:")
print(sensitivity.to_string(index=False))
```
### Posterior Visualization

```python
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the Beta(2, 80) prior density
x = np.linspace(0, 0.10, 200)
prior_pdf = stats.beta.pdf(x, 2, 80)
ax.plot(x, prior_pdf, 'b--', linewidth=2, label='Prior')

# Plot the posterior samples
ax.hist(bayes_results['samples'], bins=50, density=True,
        alpha=0.7, color='steelblue', label='Posterior')

# Mark the posterior mean and the true monthly rate
ax.axvline(bayes_results['mean'], color='red', linestyle='-',
           linewidth=2, label=f"Mean: {bayes_results['mean']:.4f}")
ax.axvline(true_pd / 12, color='black', linestyle='-',
           linewidth=2, label=f"True: {true_pd / 12:.4f}")

ax.set_xlabel('Monthly Default Rate')
ax.set_ylabel('Density')
ax.set_title('Bayesian Default Rate Estimation')
ax.legend()
plt.tight_layout()
plt.show()
```
## 5. Bayesian vs MLE Comparison

```python
def compare_mle_bayesian(defaults, exposure, true_rate):
    """Compare MLE and Bayesian estimates as the sample grows."""
    results = []
    for n in [10, 30, 60]:
        bayes = BayesianEstimator().fit_default_rate_bayesian(
            defaults[:n], exposure[:n],
            prior_alpha=2, prior_beta=80
        )
        results.append({
            'N_Months': n,
            'MLE': defaults[:n].sum() / exposure[:n].sum(),
            'Bayesian_Mean': bayes['mean'],
            'Bayesian_CI_Width': bayes['ci_upper'] - bayes['ci_lower'],
            'True_Rate': true_rate
        })
    return pd.DataFrame(results)

comparison = compare_mle_bayesian(defaults, exposure_per_period, true_pd / 12)
print("\nMLE vs Bayesian by Sample Size:")
print(comparison.round(5).to_string(index=False))
```
## 6. Calibrating Model Parameters

### Macro VAE Calibration

```python
from privatecredit.calibration import ModelCalibrator

# Load historical macro data (replace with your data)
historical_macro = pd.read_csv('historical_macro.csv')

# Calibrate the VAE to match historical distributional targets
model_calibrator = ModelCalibrator()
calibration_results = model_calibrator.calibrate_macro_vae(
    historical_data=historical_macro,
    target_metrics={
        'gdp_mean': 0.025,
        'gdp_std': 0.015,
        'unemployment_mean': 0.045,
        'correlation_gdp_unemp': -0.65
    },
    n_iterations=100
)

print("Calibration Results:")
for metric, value in calibration_results['achieved_metrics'].items():
    target = calibration_results['target_metrics'].get(metric, 'N/A')
    print(f"  {metric}: Target={target}, Achieved={value:.4f}")
```
## 7. Validation and Backtesting

### Out-of-Sample Validation

```python
from privatecredit.calibration import BacktestFramework

# Split data: first 48 months for training, last 12 for testing
train_defaults = defaults[:48]
train_exposure = exposure_per_period[:48]
test_defaults = defaults[48:]
test_exposure = exposure_per_period[48:]

# Fit on training data only
train_params = HistoricalCalibrator().fit_default_rates(
    train_defaults, train_exposure
)

# Validate on the held-out test data
backtester = BacktestFramework()
validation = backtester.validate_default_model(
    predicted_pd=train_params['mean_default_rate'],
    actual_defaults=test_defaults,
    actual_exposure=test_exposure
)

print("\nOut-of-Sample Validation:")
print(f"  Predicted PD: {train_params['mean_default_rate']:.4f}")
print(f"  Actual PD: {validation['actual_pd']:.4f}")
print(f"  Brier Score: {validation['brier_score']:.4f}")
print(f"  Log Loss: {validation['log_loss']:.4f}")
```
## Summary
| Method | Use Case | Data Requirements |
|---|---|---|
| MLE | Point estimates | Moderate sample |
| Bootstrap CI | Uncertainty quantification | Moderate sample |
| Bayesian | Prior incorporation | Any sample size |
| Transition MLE | Panel/cohort data | State observations |
| LGD Fitting | Loss distribution | Resolved defaults |
Key Takeaways:

- Use MLE for large samples; prefer Bayesian estimation for small samples or when prior knowledge is available
- Bootstrapping provides non-parametric uncertainty estimates
- Validate calibrated transition matrices against their target default and prepayment rates
- Compare candidate distribution models via AIC/BIC
- Always validate out-of-sample
Next: Model Selection Guide