Python Code for Portfolio Optimization Chapter 3 – Financial Data: I.I.D. Modeling
Daniel P. Palomar (2025). Portfolio Optimization: Theory and Application. Cambridge University Press.
Last update: February 18, 2025
Contributors:
Packages¶
The following packages are used in the examples:
# Core numerical computing import numpy as np import pandas as pd from scipy.stats import multivariate_normal, multivariate_t from scipy import linalg # Financial data handling import yfinance as yf # Book data (pip install "git+https://github.com/dppalomar/pob.git#subdirectory=python") from pob_python import SP500_stocks_2015to2020, SP500_index_2015to2020 # Visualization import matplotlib.pyplot as plt import seaborn as sns sns.set_theme(style="darkgrid") from tqdm import tqdm # For progress tracking
IID model¶
Example of a synthetic Gaussian i.i.d. time series:
# Use S&P 500 index from the book data SP500_index_2015to2020.head()
| SP500_INDEX | |
|---|---|
| Date | |
| 2015-01-05 | 2020.58 |
| 2015-01-06 | 2002.61 |
| 2015-01-07 | 2025.90 |
| 2015-01-08 | 2062.14 |
| 2015-01-09 | 2044.81 |
sp500_prices = SP500_index_2015to2020.values.squeeze()
# Compute log returns and drop the first difference sp500_returns = np.diff(np.log(sp500_prices))[1:] # Calculate sample mean and sample variance mu = np.mean(sp500_returns) var = np.var(sp500_returns) # Generate synthetic data np.random.seed(42) T = 500 x = np.random.normal(mu, np.sqrt(var), T)
# Create a DataFrame with a time index and the synthetic data df = pd.DataFrame({'t': np.arange(1, T + 1), 'x': x}) # Plot the time series fig, ax = plt.subplots(figsize=(10, 6)) plt.plot(df['t'], df['x'], linewidth=0.8, color='blue') plt.title('i.i.d. time series') plt.xlabel('') plt.ylabel('') plt.show()
Sample estimators¶
Let's warm up using the classical estimators for the mean and covariance matrix, namely the sample mean and the sample covariance matrix: $$ \begin{aligned} \hat{\boldsymbol{\mu}} & = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t\\ \hat{\boldsymbol{\Sigma}} & = \frac{1}{T-1}\sum_{t=1}^T(\mathbf{x}_t-\boldsymbol{\mu})(\mathbf{x}_t-\boldsymbol{\mu})^T. \end{aligned} $$
Illustration of sample estimators in two dimensions:
# Generate Gaussian synthetic return data np.random.seed(42) N = 2 T = 10 mu_true = np.zeros(N) Sigma_true = np.array([[0.0012, 0.0011], [0.0011, 0.0014]]) X = np.random.multivariate_normal(mean=mu_true, cov=Sigma_true, size=200) #X = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=200) X_ = X[:T, :] # Estimators mu_sm = np.mean(X_, axis=0) # classical mean estimator (sample mean) X_centered = X_ - mu_sm Sigma_scm = np.cov(X_centered.T) # classical covariance estimator (sample covariance matrix) # Print errors print(np.linalg.norm(mu_sm - mu_true)) print(np.linalg.norm(Sigma_scm - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro'))
0.009323178928090176 0.17331755138107485
from matplotlib.patches import Ellipse def confidence_ellipse(cov, mean, ax, n_std=2, **kwargs): """Create covariance ellipse with proper scaling""" vals, vecs = np.linalg.eigh(cov) theta = np.degrees(np.arctan2(*vecs[:,0][::-1])) width, height = 2 * n_std * np.sqrt(vals) return Ellipse(xy=mean, width=width, height=height, angle=theta, fill=False, **kwargs)
# Scatter plot with corrected ellipses fig, ax = plt.subplots(figsize=(10, 8)) # Plot data points ax.scatter(X[:, 0], X[:, 1], color='black', s=2) # Add ellipses with 95% confidence (n_std=2) ax.add_patch(confidence_ellipse(Sigma_scm, mu_sm, ax, n_std=2, edgecolor='red', linewidth=2, label='sample estimator')) ax.add_patch(confidence_ellipse(Sigma_true, mu_true, ax, n_std=2, edgecolor='blue', linewidth=2, label='true')) # Plot mean estimates with big colored dots ax.scatter(mu_sm[0], mu_sm[1], color='red', s=100) ax.scatter(mu_true[0], mu_true[1], color='blue', s=100) # Configure plot ax.set_xlabel('X-axis') ax.set_ylabel('Y-axis') ax.legend() plt.tight_layout() plt.show()
Estimation error of sample estimators versus number of observations (for Gaussian data with $N=100$):
# Configuration np.random.seed(42) N = 100 T_max = 2000 # Get realistic mu and Sigma X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N] mu_true = np.mean(X_market, axis=0) Sigma_true = np.cov(X_market.T) # # Alternatively, create synthetic true parameters # mu_true = np.random.randn(N) * 0.01 # Sigma_true = np.diag(np.abs(np.random.randn(N))) + 0.1*np.eye(N) # Generate synthetic Gaussian data X_gaussian = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=10*T_max)
# Benchmarking framework T_sweep = np.linspace(50, T_max, 50, dtype=int) error_mu_vs_T = [] error_Sigma_vs_T = [] for T in T_sweep: X_ = X_gaussian[:T,] # Sample estimators mu_sm = np.mean(X_, axis=0) Sigma_scm = np.cov(X_.T) # Compute error error_mu_vs_T.append(np.linalg.norm(mu_sm - mu_true) / np.linalg.norm(mu_true)) error_Sigma_vs_T.append(np.linalg.norm(Sigma_scm - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro')) error_mu_vs_T = np.asarray(error_mu_vs_T) error_Sigma_vs_T = np.asarray(error_Sigma_vs_T)
# Plots fig, ax = plt.subplots(figsize=(10, 6)) plt.plot(T_sweep, 100*error_mu_vs_T) plt.title("Estimation error of sample mean") plt.xlabel("T") plt.ylabel("normalized error (%)") plt.show()
fig, ax = plt.subplots(figsize=(10, 6)) plt.plot(T_sweep, 100*error_Sigma_vs_T) plt.title("Estimation error of sample covariance matrix") plt.xlabel("T") plt.ylabel("normalized error (%)") plt.show()
Location estimators¶
Illustration of different location estimators in 2D:
# Configuration np.random.seed(42) N = 2 T = 10 mu_true = np.zeros(N) Sigma_true = np.array([[0.0012, 0.0011], [0.0011, 0.0014]]) nu = 4 scatter_true = (nu - 2)/nu * Sigma_true # Generate synthetic data X = multivariate_t.rvs( loc=mu_true, shape=scatter_true, df=nu, size=800 )
def spatial_median(X, max_iter=100, tol=1e-5): """Weiszfeld algorithm for spatial median""" med = np.median(X, axis=0) for _ in range(max_iter): dist = np.linalg.norm(X - med, axis=1) np.clip(dist, 1e-12, None, out=dist) # Avoid division by zero weights = 1 / dist new_med = X.T @ weights / weights.sum() if np.linalg.norm(new_med - med) < tol: break med = new_med return med
# Calculate estimators using first T samples X_est = X[:T] estimators = { "true": mu_true, "sample mean": np.mean(X_est, axis=0), "median": np.median(X_est, axis=0), "spatial median": spatial_median(X_est) # Assume this function exists } # Create comparison dataframe df = pd.DataFrame([ {"location": key, "x": val[0], "y": val[1]} for key, val in estimators.items() ]) df["location"] = pd.Categorical( df["location"], categories=["true", "sample mean", "median", "spatial median"] )
# Visualization fig, ax = plt.subplots(figsize=(10, 8)) # Plot data points ax.scatter(X[:, 0], X[:, 1], alpha=0.7, s=10, label='Data points') # Plot estimators with custom markers marker_map = { "true": "X", # Big X for ground truth "sample mean": "o", # Circle for sample mean "median": "s", # Square for median "spatial median": "D" # Diamond for spatial median } for location in df.location.cat.categories: subset = df[df.location == location] ax.scatter(subset.x, subset.y, s=150, edgecolor='w', linewidth=1.5, marker=marker_map[location], label=location) # Formatting ax.set_xlim(-0.08, 0.08) ax.set_ylim(-0.08, 0.08) ax.set_title("Scatter plot of data points with different location estimators", pad=20) ax.set_xlabel("") ax.set_ylabel("") ax.grid(True, alpha=0.3) ax.legend(loc='upper right', frameon=True) plt.tight_layout() plt.show()
Estimation error of location estimators versus number of observations (with $N=100$):
# Configuration np.random.seed(42) N = 100 T_max = 2000 n_trials = 100 # Get realistic mu and Sigma X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N] mu_true = np.mean(X_market, axis=0) Sigma_true = np.cov(X_market.T) # Generate synthetic Gaussian data X_gaussian = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=10*T_max) # Generate synthetic heavy-tailed distribution data nu = 4 scatter_true = ((nu-2)/nu) * Sigma_true X_heavy = multivariate_t.rvs(loc=mu_true, shape=scatter_true, df=nu, size=10*T_max)
# Benchmarking framework T_sweep = np.linspace(50, T_max, 50, dtype=int) results = [] for T in T_sweep: for dist_name, data in [('Gaussian', X_gaussian), ('heavy-tailed', X_heavy)]: for _ in range(n_trials): # Random subsample idx = np.random.choice(len(data), size=T, replace=False) X = data[idx] # Calculate estimators mu_sm = np.mean(X, axis=0) mu_med = np.median(X, axis=0) mu_spmed = spatial_median(X) # Calculate relative errors error = lambda x: 100*np.linalg.norm(x - mu_true)/np.linalg.norm(mu_true) results.append({ 'T': T, 'distribution': dist_name, 'method': 'sample mean', 'error': error(mu_sm) }) results.append({ 'T': T, 'distribution': dist_name, 'method': 'median', 'error': error(mu_med) }) results.append({ 'T': T, 'distribution': dist_name, 'method': 'spatial median', 'error': error(mu_spmed) })
# Create DataFrame and aggregate results df = pd.DataFrame(results) df_agg = df.groupby(['method', 'T', 'distribution']).mean().reset_index()
# Visualization fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True) for method in df_agg['method'].unique(): subset = df_agg[(df_agg['distribution'] == 'Gaussian') & (df_agg['method'] == method)] ax1.plot(subset['T'], subset['error'], label=method) subset = df_agg[(df_agg['distribution'] == 'heavy-tailed') & (df_agg['method'] == method)] ax2.plot(subset['T'], subset['error'], label=method) ax1.set_title('Error of location estimators under Gaussian data') ax2.set_title('Error of location estimators under heavy-tailed data') ax2.set_xlabel('Sample Size (T)') for ax in (ax1, ax2): ax.set_ylabel('Relative Error (%)') ax.legend() ax.grid(True) plt.tight_layout() plt.show()
Estimation error of location estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration np.random.seed(42) N = 100 T = 200 n_trials = 200 nu_values = np.arange(3, 31, 2) # Get realistic mu and Sigma X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N] mu_true = np.mean(X_market, axis=0) Sigma_true = np.cov(X_market.T) # Benchmarking framework results = [] for nu in tqdm(nu_values, desc="Processing ν values"): scatter_matrix = ((nu - 2)/nu) * Sigma_true for _ in range(n_trials): # Generate multivariate t-distributed data X = multivariate_t.rvs(loc=mu_true, shape=scatter_matrix, df=nu, size=T) # Calculate estimators methods = { 'sample mean': np.mean(X, axis=0), 'median': np.median(X, axis=0), 'spatial median': spatial_median(X) } # Calculate errors for method, estimate in methods.items(): error = 100 * np.linalg.norm(estimate - mu_true) / np.linalg.norm(mu_true) results.append({ 'ν': nu, 'method': method, 'error': error }) # Create and process DataFrame df = pd.DataFrame(results) df_agg = df.groupby(['method', 'ν'], as_index=False).agg( mean_error=('error', 'mean'), std_error=('error', 'std') )
# Visualization fig, ax = plt.subplots(1, 1, figsize=(12, 7)) colors = {'sample mean': '#1f77b4', 'median': '#ff7f0e', 'spatial median': '#2ca02c'} for method in df_agg['method'].unique(): subset = df_agg[df_agg['method'] == method] ax.plot(subset['ν'], subset['mean_error'], label=method, color=colors[method], lw=2) ax.set_title('Error of location estimators under heavy-tailed data') ax.set_xlabel('ν') ax.set_ylabel('Relative Error (%)') plt.legend() plt.tight_layout() plt.show()
Gaussian ML estimators¶
Estimation error of Gaussian ML estimators versus number of observations (with $N=100$):
# Configuration np.random.seed(42) N = 100 T_max = 1000 n_trials = 100 # Get realistic mu and Sigma X_market = np.diff(np.log(SP500_stocks_2015to2020.values), axis=0)[1:, :N] mu_true = X_market.mean(axis=0) Sigma_true = np.cov(X_market, rowvar=False) nu = 4 scatter_true = (nu-2)/nu * Sigma_true # Generate synthetic datasets X_gaussian = multivariate_normal.rvs( mean=mu_true, cov=Sigma_true, size=10*T_max ) X_heavy = multivariate_t.rvs( loc=mu_true, shape=scatter_true, df=nu, size=10*T_max )
# Benchmarking framework T_sweep = np.linspace(50, T_max, num=20, dtype=int) results = [] for T in T_sweep: for dist_name, data in [('Gaussian', X_gaussian), ('heavy-tailed', X_heavy)]: for _ in range(n_trials): # Subsample data idx = np.random.choice(len(data), size=T, replace=False) X = data[idx] # Calculate estimators mu_hat = X.mean(axis=0) Sigma_hat = (T-1)/T * np.cov(X, rowvar=False) # Calculate relative errors rel_error_mu = (np.linalg.norm(mu_hat - mu_true, 2) / np.linalg.norm(mu_true, 2)) rel_error_Sigma = (np.linalg.norm(Sigma_hat - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro')) results.append({ 'T': T, 'distribution': dist_name, 'error_mu': 100 * rel_error_mu, 'error_Sigma': 100 * rel_error_Sigma }) # Create DataFrame and aggregate df = pd.DataFrame(results) df_agg = df.groupby(['distribution', 'T']).mean().reset_index()
# Visualization fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True) for dist in ['Gaussian', 'heavy-tailed']: subset = df_agg[df_agg['distribution'] == dist] ax1.plot(subset['T'], subset['error_mu'], label=dist) ax2.plot(subset['T'], subset['error_Sigma'], label=dist) ax1.set_title('Error of Gaussian ML mean estimator') ax2.set_title('Error of Gaussian ML covariance estimator') ax2.set_xlabel('Sample Size (T)') for ax in (ax1, ax2): ax.set_ylabel('Normalized Error (%)') ax.legend(title='Data distribution') ax.grid(True) ax.label_outer() plt.tight_layout() plt.show()
Estimation error of Gaussian ML estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration np.random.seed(42) N = 100 T = 200 n_trials = 200 nu_values = np.arange(3, 31, 2) # Get realistic mu and Sigma (assuming SP500_stocks_2015to2020 is a DataFrame) X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N] mu_true = np.mean(X_market, axis=0) Sigma_true = np.cov(X_market, rowvar=False)
# Benchmarking framework results = [] for nu in tqdm(nu_values, desc="Processing ν values"): scatter_matrix = ((nu - 2)/nu) * Sigma_true for _ in range(n_trials): # Generate multivariate t-distributed data X = multivariate_t.rvs( loc=mu_true, shape=scatter_matrix, df=nu, size=T, ) # Calculate estimators mu_est = np.mean(X, axis=0) Sigma_est = np.cov(X, rowvar=False) # MLE covariance # Calculate normalized errors mu_error = np.linalg.norm(mu_est - mu_true) / np.linalg.norm(mu_true) Sigma_error = np.linalg.norm(Sigma_est - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro') results.append({ 'ν': nu, 'method': 'Gaussian ML', 'error mu': 100 * mu_error, 'error Sigma': 100 * Sigma_error }) # Create and process DataFrame df = pd.DataFrame(results) df_agg = df.groupby(['method', 'ν'], as_index=False).agg({ 'error mu': 'mean', 'error Sigma': 'mean' })
# Visualization plt.figure(figsize=(12, 10)) # Plot for mean estimation error plt.subplot(2, 1, 1) plt.plot(df_agg['ν'], df_agg['error mu'], color='#1f77b4', lw=2) plt.title('Estimation error of Gaussian ML mean estimator') plt.xlabel('ν') plt.ylabel('Normalized Error (%)') plt.ylim(200, 250) # Plot for covariance estimation error plt.subplot(2, 1, 2) plt.plot(df_agg['ν'], df_agg['error Sigma'], color='#2ca02c', lw=2) plt.title('Estimation error of Gaussian ML covariance estimator') plt.xlabel('ν') plt.ylabel('Normalized Error (%)') plt.tight_layout() plt.show()
The failure of Gaussian ML estimators¶
def create_2covariance_plot(X, Sigma_true, Sigma_scm, title): """Helper function to create consistent plots""" fig, ax = plt.subplots(figsize=(10, 8)) # Scatter plot ax.scatter(X[:,0], X[:,1], s=8, color='black', alpha=0.8) # Add ellipses for cov, color, label in zip([Sigma_true, Sigma_scm], ['blue', 'red'], ['True', 'Gaussian ML']): ellipse = confidence_ellipse(cov, [0,0], ax=ax, edgecolor=color, linewidth=1.5) ax.add_patch(ellipse) # Plot configuration ax.set_xlim(-0.2, 0.2) ax.set_ylim(-0.2, 0.2) ax.set_title(title, pad=20) ax.legend(handles=[plt.Line2D([0], [0], color=c, lw=2) for c in ['blue', 'red']], labels=['True covariance', 'Gaussian ML estimator']) plt.tight_layout() return fig
# Parameters mu_true = np.array([0.0, 0.0]) Sigma_true = np.array([[0.0012, 0.0011], [0.0011, 0.0014]]) T = 10 np.random.seed(42)
- Gaussian Data:
X_gaussian = multivariate_normal.rvs( mean=mu_true, cov=Sigma_true, size=200 ) X_subset = X_gaussian[:T] Sigma_scm = (X_subset.T @ X_subset) / T create_2covariance_plot(X_gaussian, Sigma_true, Sigma_scm, "Gaussian Data") plt.show()
- Heavy-tailed data:
nu = 4 scatter_matrix = (nu-2)/nu * Sigma_true # For multivariate_t shape param X_heavy = multivariate_t.rvs( loc=mu_true, shape=scatter_matrix, df=nu, size=200 ) X_subset = X_heavy[:T] Sigma_scm = (X_subset.T @ X_subset) / T create_2covariance_plot(X_heavy, Sigma_true, Sigma_scm, "Heavy-tailed Data (ν=4)") plt.show()
- Gaussian data with outliers:
X_gaussian = multivariate_normal.rvs( mean=mu_true, cov=Sigma_true, size=200 ) # Add outlier at (-0.15, 0.05) X_outlier = np.vstack([[-0.15, 0.05], X_gaussian]) X_subset = X_outlier[:T] Sigma_scm = (X_subset.T @ X_subset) / T create_2covariance_plot(X_outlier, Sigma_true, Sigma_scm, "Gaussian Data with Outlier") plt.show()
Heavy-tailed ML estimators¶
Recall that the ML estimators can be iteratively computed as the weighted sample mean and sample covariance matrix: $$ \begin{aligned} \boldsymbol{\mu}^{k+1} &= \frac{\frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^k,\boldsymbol{\Sigma}^k) \times \mathbf{x}_t}{\frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^k,\boldsymbol{\Sigma}^k)}\\ \boldsymbol{\Sigma}^{k+1} &= \frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^{k+1},\boldsymbol{\Sigma}^k) \times (\mathbf{x}_t - \boldsymbol{\mu}^{k+1})(\mathbf{x}_t - \boldsymbol{\mu}^{k+1})^{T} \end{aligned} $$ where the weights $w_t(\boldsymbol{\mu},\boldsymbol{\Sigma})$ are defined as $$w_t(\boldsymbol{\mu},\boldsymbol{\Sigma}) = \frac{\nu+N}{\nu + (\mathbf{x}_t - \boldsymbol{\mu})^{T}\boldsymbol{\Sigma}^{-1}(\mathbf{x}_t - \boldsymbol{\mu})}.$$
For convenience, we define the function fit_mvt() to estimate the mean and covariance matrix for a heavy-tailed distribution:
def nu_POP_estimator(r2, nu, N): """ Estimate the degrees of freedom of a heavy-tailed t distribution based on the POP estimator. Based on: Frédéric Pascal, Esa Ollila, and Daniel P. Palomar, "Improved estimation of the degree of freedom parameter of multivariate t-distribution," in Proc. European Signal Processing Conference (EUSIPCO), Dublin, Ireland, Aug. 23-27, 2021. <https://doi.org/10.23919/EUSIPCO54536.2021.9616162> """ T = len(r2) u = (N + nu) / (nu + r2 * T / (T - 1)) r2i = r2 / (1 - r2 * u / T) theta = (1 - N / T) * np.sum(r2i) / T / N nu = 2 * theta / (theta - 1) nu = min(100, max(2.5, nu)) return nu def fit_mvt(X, verbose=False, max_iter=100, ptol=1e-3): """ Fit of a multivariate t distribution to the data X. References: Chuanhai Liu and Donald B. Rubin, "ML estimation of the t-distribution using EM and its extensions, ECM and ECME," Statistica Sinica (5), pp. 19-39, 1995. Chuanhai Liu, Donald B. Rubin, and Ying Nian Wu, "Parameter Expansion to Accelerate EM: The PX-EM Algorithm," Biometrika, Vol. 85, No. 4, pp. 755-770, Dec., 1998 Rui Zhou, Junyan Liu, Sandeep Kumar, and Daniel P. Palomar, "Robust factor analysis parameter estimation," Lecture Notes in Computer Science (LNCS), 2019. <https://arxiv.org/abs/1909.12530> Esa Ollila, Daniel P. Palomar, and Frédéric Pascal, "Shrinking the Eigenvalues of M-estimators of Covariance Matrix," IEEE Trans. on Signal Processing, vol. 69, pp. 256-269, Jan. 2021. <https://doi.org/10.1109/TSP.2020.3043952> Frédéric Pascal, Esa Ollila, and Daniel P. Palomar, "Improved estimation of the degree of freedom parameter of multivariate t-distribution," in Proc. European Signal Processing Conference (EUSIPCO), Dublin, Ireland, Aug. 23-27, 2021. <https://doi.org/10.23919/EUSIPCO54536.2021.9616162> """ X = np.asarray(X) T, N = np.shape(X) # initial points nu = 4 mu = np.mean(X, axis=0) cov = np.cov(X, rowvar=False) Sigma = (nu - 2)/nu * cov def fnu(nu): return nu/(nu - 2) # # loop # Sigma_diff_record = np.empty(max_iter) for k in range(max_iter): nu_prev = np.copy(nu) mu_prev = np.copy(mu) Sigma_prev = np.copy(Sigma) # intermediate variables Xc = X - mu #r2 = np.sum(Xc * np.dot(Xc, np.linalg.inv(Sigma)), axis=1) # np.diag( Xc @ np.linalg.inv(Sigma) @ t(Xc) ) r2 = np.sum(Xc * np.linalg.solve(Sigma + 1e-6 * np.eye(N), Xc.T).T, axis=1) # update nu, mu, Sigma nu = nu_POP_estimator(r2=r2, nu=nu, N=N) E_tau = (N + nu) / (nu + r2) ave_E_tau = np.mean(E_tau) ave_E_tau_X = np.mean(X * E_tau[:, None], axis=0) mu = ave_E_tau_X / ave_E_tau Xc = X - mu ave_E_tau_XX = (1 / T) * np.dot((Xc * np.sqrt(E_tau)[:, None]).T, Xc * np.sqrt(E_tau)[:, None]) #ave_E_tau_XX_ = (1 / T) * Xc.T @ np.diag(E_tau) @ Xc Sigma = ave_E_tau_XX Sigma = Sigma / ave_E_tau # PX_EM_acceleration # stopping criterion Sigma_diff_record[k] = np.linalg.norm(Sigma - Sigma_prev) / np.linalg.norm(Sigma_prev) if (np.all(np.abs(mu - mu_prev) <= .5 * ptol * (np.abs(mu_prev) + np.abs(mu))) and np.abs(fnu(nu) - fnu(nu_prev)) <= .5 * ptol * (np.abs(fnu(nu_prev)) + np.abs(fnu(nu))) and np.all(np.abs(Sigma - Sigma_prev) <= .5 * ptol * (np.abs(Sigma_prev) + np.abs(Sigma)))): break Sigma_diff_record = Sigma_diff_record[:k] Sigma_cov = nu / (nu - 2) * Sigma if verbose: fig, ax = plt.subplots(figsize=(16, 6)) ax.plot(np.arange(k), Sigma_diff_record) plt.xlabel("Iterations") plt.ylabel("Sigma differences") plt.show() return mu, Sigma_cov
Now we can assess the effect of heavy tails and outliers in heavy-tailed ML covariance matrix estimator:
# Parameters mu_true = np.array([0.0, 0.0]) Sigma_true = np.array([[0.0012, 0.0011], [0.0011, 0.0014]]) T = 10 np.random.seed(42)
def create_3covariance_plot(X, Sigma_true, Sigma_scm, Sigma_heavytail, title): """Helper function to create consistent plots""" fig, ax = plt.subplots(figsize=(10, 8)) # Scatter plot ax.scatter(X[:,0], X[:,1], s=8, color='black', alpha=0.8) # Add ellipses for cov, color, label in zip([Sigma_true, Sigma_scm, Sigma_heavytail], ['blue', 'red', 'green'], ['True', 'Gaussian ML', 'Heavy-tailed ML']): ellipse = confidence_ellipse(cov, [0,0], ax=ax, edgecolor=color, linewidth=1.5) ax.add_patch(ellipse) # Plot configuration ax.set_xlim(-0.2, 0.2) ax.set_ylim(-0.2, 0.2) ax.set_title(title, pad=20) ax.legend(handles=[plt.Line2D([0], [0], color=c, lw=2) for c in ['blue', 'red', 'green']], labels=['True covariance', 'Gaussian ML estimator', 'Heavy-tailed ML estimator']) plt.tight_layout() return fig
- Gaussian Data:
# Data generation X_gaussian = multivariate_normal.rvs( mean=mu_true, cov=Sigma_true, size=200 ) X_subset = X_gaussian[:T] # Estimators Sigma_scm = (X_subset.T @ X_subset) / T mu_heavytail, Sigma_heavytail = fit_mvt(X_subset) # Plot create_3covariance_plot(X_gaussian, Sigma_true, Sigma_scm, Sigma_heavytail, "Gaussian Data") plt.show()
- Heavy-tailed data:
# Data generation nu = 4 scatter_matrix = (nu-2)/nu * Sigma_true # For multivariate_t shape param X_heavy = multivariate_t.rvs( loc=mu_true, shape=scatter_matrix, df=nu, size=200 ) X_subset = X_heavy[:T] # Estimators Sigma_scm = (X_subset.T @ X_subset) / T mu_heavytail, Sigma_heavytail = fit_mvt(X_subset) # Plot create_3covariance_plot(X_gaussian, Sigma_true, Sigma_scm, Sigma_heavytail, "Heavy-tailed Data (ν=4)") plt.show()
- Gaussian data with outliers:
# Data generation X_gaussian = multivariate_normal.rvs( mean=mu_true, cov=Sigma_true, size=200 ) # Add outlier at (-0.15, 0.05) X_outlier = np.vstack([[-0.15, 0.05], X_gaussian]) X_subset = X_outlier[:T] # Estimators Sigma_scm = (X_subset.T @ X_subset) / T mu_heavytail, Sigma_heavytail = fit_mvt(X_subset) # Plot create_3covariance_plot(X_gaussian, Sigma_true, Sigma_scm, Sigma_heavytail, "Gaussian Data with Outlier") plt.show()
Estimation error of different ML estimators versus number of observations (for $t$-distributed heavy-tailed data with $\nu=4$ and $N=100$):
# Configuration np.random.seed(42) N = 100 T_max = 1000 n_trials = 100 nu = 4 # Get realistic mu and Sigma X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N] mu_true = np.mean(X_market, axis=0) Sigma_true = np.cov(X_market.T) # Generate synthetic heavy-tailed data scatter_true = ((nu-2)/nu) * Sigma_true X_heavtytailed = multivariate_t.rvs( loc=mu_true, shape=scatter_true, df=nu, size=10*T_max ) # Benchmarking framework T_sweep = np.arange(150, T_max+1, 50) results = [] for T in T_sweep: for _ in range(n_trials): # Random subsample idx = np.random.choice(len(X_heavtytailed), size=T, replace=False) X_sub = X_heavtytailed[idx] # Gaussian MLE mu_gauss = np.mean(X_sub, axis=0) Sigma_gauss = (T-1)/T * np.cov(X_sub.T) # Heavy-tailed MLE (assuming fit_mvt exists) mu_heavy, Sigma_heavy = fit_mvt(X_sub) # Calculate errors errors = { 'Gaussian MLE': { 'mu': np.linalg.norm(mu_gauss - mu_true)/np.linalg.norm(mu_true), 'Sigma': np.linalg.norm(Sigma_gauss - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro') }, 'heavy-tailed MLE': { 'mu': np.linalg.norm(mu_heavy - mu_true)/np.linalg.norm(mu_true), 'Sigma': np.linalg.norm(Sigma_heavy - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro') } } for method in errors: results.append({ 'T': T, 'method': method, 'error mu': errors[method]['mu'], 'error Sigma': errors[method]['Sigma'] }) # Process results df = pd.DataFrame(results) df_agg = df.groupby(['method', 'T'], as_index=False).agg({ 'error mu': lambda x: 100*np.mean(x), 'error Sigma': lambda x: 100*np.mean(x) })
# Visualization fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True) for method in df_agg['method'].unique(): subset = df_agg[df_agg['method'] == method] ax1.plot(subset['T'], subset['error mu'], label=method) ax2.plot(subset['T'], subset['error Sigma'], label=method) ax1.set_title('Error of Mean Estimators') ax2.set_title('Error of Covariance Estimators') ax2.set_xlabel('Sample Size (T)') ax1.set_ylabel('Normalized Error (%)') ax2.set_ylabel('Normalized Error (%)') ax1.legend() ax1.grid(True) ax2.grid(True) plt.tight_layout() plt.show()
Estimation error of different ML estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration np.random.seed(42) N = 100 T = 200 n_trials = 200 nu_values = np.arange(3, 31, 2) # Get realistic mu and Sigma X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N] mu_true = np.mean(X_market, axis=0) Sigma_true = np.cov(X_market.T) # Benchmarking framework results = [] for nu in tqdm(nu_values, desc="Processing ν values"): scatter_matrix = ((nu - 2)/nu) * Sigma_true for _ in range(n_trials): # Generate multivariate t-distributed data X = multivariate_t.rvs( loc=mu_true, shape=scatter_matrix, df=nu, size=T, ) # Gaussian MLE mu_gauss = np.mean(X, axis=0) Sigma_gauss = (T-1)/T * np.cov(X, rowvar=False) mu_error_gauss = np.linalg.norm(mu_gauss - mu_true) / np.linalg.norm(mu_true) Sigma_error_gauss = np.linalg.norm(Sigma_gauss - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro') results.append({ 'ν': nu, 'method': 'Gaussian MLE', 'error mu': 100 * mu_error_gauss, 'error Sigma': 100 * Sigma_error_gauss }) # Heavy-tailed MLE (assuming fit_mvt exists) mu_heavy, Sigma_heavy = fit_mvt(X) # Replace with actual implementation of fit_mvt mu_error_heavy = np.linalg.norm(mu_heavy - mu_true) / np.linalg.norm(mu_true) Sigma_error_heavy = np.linalg.norm(Sigma_heavy - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro') results.append({ 'ν': nu, 'method': 'Heavy-tailed MLE', 'error mu': 100 * mu_error_heavy, 'error Sigma': 100 * Sigma_error_heavy }) # Create and process DataFrame df = pd.DataFrame(results) df_agg = df.groupby(['method', 'ν'], as_index=False).agg({ 'error mu': 'mean', 'error Sigma': 'mean' })
# Visualization plt.figure(figsize=(12, 10)) # Plot for mean estimation error plt.subplot(2, 1, 1) for method in df_agg['method'].unique(): subset = df_agg[df_agg['method'] == method] plt.plot(subset['ν'], subset['error mu'], label=method) plt.title('Estimation error of mean estimators') plt.xlabel('ν') plt.ylabel('Normalized Error (%)') plt.legend() plt.grid(True) # Plot for covariance estimation error plt.subplot(2, 1, 2) for method in df_agg['method'].unique(): subset = df_agg[df_agg['method'] == method] plt.plot(subset['ν'], subset['error Sigma'], label=method) plt.title('Estimation error of covariance estimators') plt.xlabel('ν') plt.ylabel('Normalized Error (%)') plt.legend() plt.grid(True) plt.tight_layout() plt.show()
Convergence of iterative robust heavy-tailed ML estimators:
# Configuration np.random.seed(42) N = 100 T = 200 n_trials = 200 nu_values = np.arange(3, 31, 2) # Get realistic mu and Sigma X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N] mu_true = np.mean(X_market, axis=0) Sigma_true = np.cov(X_market.T) nu <- 4 scatter_matrix = (nu-2)/nu * Sigma_true # Generate multivariate t-distributed data X = multivariate_t.rvs( loc=mu_true, shape=scatter_matrix, df=nu, size=T, ) # heavy-tailed MLE mu_heavy, Sigma_heavy = fit_mvt(X, verbose=True)
Prior information¶
Shrinkage¶
First, we define a simple and convenient function to shrink the mean vector:
def shrink_mu(X, mu=None, cov=None, mu_sh_use_cor=True, mu_target="grand mean"): T, N = X.shape if mu is None: mu = np.mean(X, axis=0) if cov is None: cov = np.cov(X, rowvar=False) # target if mu_target == "zero": mu_target = np.zeros(N) elif mu_target == "grand mean": mu_target = np.mean(mu) elif mu_target == "volatility weighted": vol = np.sqrt(np.diag(cov)) mu_target = np.mean(mu/vol) * vol elif mu_target == "covmat weighted": if mu_sh_use_cor: corr = cov / np.outer(np.sqrt(np.diag(cov)), np.sqrt(np.diag(cov))) invSigma_ones = linalg.solve(corr, np.ones(N)) else: invSigma_ones = linalg.solve(cov, np.ones(N)) invSigma_ones = np.maximum(0, invSigma_ones) mu_target = np.dot(mu, invSigma_ones) / np.sum(invSigma_ones) else: raise ValueError("Shrinkage estimation method for mu not recognized!") # shrinkage diff = mu - mu_target rho = (N + 2) / ((N + 2) + T * np.dot(diff, linalg.solve(cov, diff))) # improved James-Stein estimator #rho = (N - 2) / (T * np.dot(diff, linalg.solve(cov, diff))) # original James-Stein estimator rho = max(0, min(1, rho)) mu_shrinked = (1 - rho) * mu + rho * mu_target return mu_shrinked
To shrink the covariance matrix we will use Ledoit-Wolf's estimator. This can be founc in the library scikit-learn, but we will implement it ourselves.
Estimation error of different shrinkage estimators versus number of observations (for Gaussian data with $N=100$):
# Configuration np.random.seed(42) N = 100 T_max = 550 n_trials = 100 nu = 5 # Get realistic mu and Sigma X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N] mu_true = np.mean(X_market, axis=0) Sigma_true = np.cov(X_market.T) # Generate synthetic heavy-tailed data scatter_true = ((nu-2)/nu) * Sigma_true X_heavtytailed = multivariate_t.rvs( loc=mu_true, shape=scatter_true, df=nu, size=10*T_max ) # Benchmarking framework T_sweep = np.arange(50, T_max+1, 50) results = [] for T in T_sweep: for _ in range(n_trials): # Random subsample idx = np.random.choice(len(X_heavtytailed), size=T, replace=False) X_sub = X_heavtytailed[idx] # # Mean estimation # Sigma = np.cov(X_sub.T) Sigma_reg = Sigma + np.mean(np.diag(Sigma)) * np.eye(N) # regularize Sigma for stability # sample mean mu = np.mean(X_sub, axis=0) # Shrinkage estimators mu_shrinked_0 = shrink_mu(X_sub, mu=mu, mu_target="zero", cov=Sigma_reg) mu_shrinked_grand_mean = shrink_mu(X_sub, mu=mu, mu_target="grand mean", cov=Sigma_reg) mu_shrinked_covmat_weighted = shrink_mu(X_sub, mu=mu, mu_target="covmat weighted", cov=Sigma_reg) # Calculate errors results.append({ 'T': T, 'parameter': 'mu', 'method': 'sample mean', 'error': np.linalg.norm(mu - mu_true)/np.linalg.norm(mu_true), }) results.append({ 'T': T, 'parameter': 'mu', 'method': 'shrinkage to zero', 'error': np.linalg.norm(mu_shrinked_0 - mu_true)/np.linalg.norm(mu_true), }) results.append({ 'T': T, 'parameter': 'mu', 'method': 'shrinkage to grand mean', 'error': np.linalg.norm(mu_shrinked_grand_mean - mu_true)/np.linalg.norm(mu_true), }) results.append({ 'T': T, 'parameter': 'mu', 'method': 'shrinkage to volatility-weighted mean', 'error': np.linalg.norm(mu_shrinked_covmat_weighted - mu_true)/np.linalg.norm(mu_true), }) # # Covariance matrix estimation # # Sample covariance matrix (SCM) S = np.cov(X_sub.T) Xc = X_sub - mu # center data matrix # SCM + Ledoit-Wolf shrinked to scaled identity Sigma_target = np.mean(np.diag(S)) * np.eye(N) rho = min(1, ((1/T**2) * np.sum(np.sum(Xc**2, axis=1)**2) - 1/T * np.sum(S**2)) / np.sum((S - Sigma_target)**2)) Sigma_shrinked_identity = (1 - rho)*S + rho*Sigma_target # SCM + Ledoit-Wolf shrinked to diagonal matrix Sigma_target = np.diag(np.diag(S)) rho = min(1, ((1/T**2) * np.sum(np.sum(Xc**2, axis=1)**2) - 1/T * np.sum(S**2)) / np.sum((S - Sigma_target)**2)) Sigma_shrinked_diagonal = (1 - rho) * S + rho * Sigma_target # SCM + Ledoit-Wolf shrinked to equal-correlation matrix sigma = np.sqrt(np.diag(S)) C = S / np.outer(sigma, sigma) ave_corr = np.mean(C[np.tril_indices_from(C, k=-1)]) C[np.tril_indices_from(C, k=-1)] = ave_corr C[np.triu_indices_from(C, k=1)] = ave_corr Sigma_target = np.outer(sigma, sigma) * C rho = min(1, ((1/T**2) * np.sum(np.sum(Xc**2, axis=1)**2) - 1/T * np.sum(S**2)) / np.sum((S - Sigma_target)**2)) Sigma_shrinked_eq_corr = (1 - rho) * S + rho * Sigma_target # Calculate errors results.append({ 'T': T, 'parameter': 'Sigma', 'method': 'sample covariance matrix', 'error': np.linalg.norm(S - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro') }) results.append({ 'T': T, 'parameter': 'Sigma', 'method': 'shrinkage to scaled identity', 'error': np.linalg.norm(Sigma_shrinked_identity - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro') }) results.append({ 'T': T, 'parameter': 'Sigma', 'method': 'shrinkage to diagonal matrix', 'error': np.linalg.norm(Sigma_shrinked_diagonal - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro') }) results.append({ 'T': T, 'parameter': 'Sigma', 'method': 'shrinkage to equal-correlation matrix', 'error': np.linalg.norm(Sigma_shrinked_eq_corr - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro') }) # Process results df = pd.DataFrame(results) df_agg = df.groupby(['parameter', 'method', 'T']).agg({ 'error': lambda x: 100*np.mean(x) }).reset_index()
# Plots fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True) # Plot mu methods mu_methods = df_agg[df_agg['parameter'] == 'mu']['method'].unique() for method in mu_methods: subset_mu = df_agg[(df_agg['method'] == method) & (df_agg['parameter'] == 'mu')] ax1.plot(subset_mu['T'], subset_mu['error'], label=method) # Plot Sigma methods sigma_methods = df_agg[df_agg['parameter'] == 'Sigma']['method'].unique() for method in sigma_methods: subset_sigma = df_agg[(df_agg['method'] == method) & (df_agg['parameter'] == 'Sigma')] ax2.plot(subset_sigma['T'], subset_sigma['error'], label=method) ax1.set_title('Error of Mean Estimators') ax2.set_title('Error of Covariance Estimators') ax2.set_xlabel('Sample Size (T)') ax1.set_ylabel('Normalized Error (%)') ax2.set_ylabel('Normalized Error (%)') ax1.legend() ax2.legend() ax1.grid(True) ax2.grid(True) plt.tight_layout() plt.show()
Factor models¶
TBD