Analyzing the Determinants of Employee Compensation: A Multi-Method Statistical Approach Using Synthetic HR Data¶


Author: Prof. Joerg Osterrieder

Date: February 2026

Institution: Department of Finance, University of Twente


Abstract¶

Employee compensation is a central concern in human resource management, labor economics, and organizational policy. Understanding the factors that drive salary variation is essential for ensuring equitable pay structures, informing talent acquisition strategies, and guiding evidence-based organizational decision-making. This thesis investigates the determinants of annual employee compensation using a synthetic human resources dataset comprising 500 observations across 14 variables, including demographic attributes, professional experience, performance metrics, and organizational characteristics. Four complementary statistical methods are employed: (i) multiple linear regression to quantify the marginal effects of continuous and categorical predictors on salary, (ii) hypothesis testing to evaluate whether statistically significant compensation differences exist across gender, departmental, and educational subgroups, (iii) principal component analysis to reduce the dimensionality of the predictor space and identify latent structures among correlated variables, and (iv) cluster analysis to discover natural employee segments based on multivariate profiles. The multi-method approach enables a comprehensive assessment of compensation determinants from both explanatory and exploratory perspectives. Key findings are discussed in the context of statistical assumptions, effect sizes, and methodological limitations.

Keywords: compensation analysis, multiple regression, hypothesis testing, principal component analysis, cluster analysis, synthetic data, human resources analytics

Table of Contents¶

  1. Introduction
  2. Theoretical Background
    • 2.1 Multiple Linear Regression
    • 2.2 Hypothesis Testing
    • 2.3 Principal Component Analysis
    • 2.4 Cluster Analysis
  3. Data Generation and Methodology
    • 3.1 Variable Descriptions
  4. Exploratory Data Analysis
    • 4.1 Univariate Distributions
    • 4.2 Categorical Variable Distributions
    • 4.3 Bivariate Correlation Structure
    • 4.4 Salary Distributions by Subgroup
    • 4.5 Bivariate Scatter Analysis
  5. Multiple Linear Regression
  6. Hypothesis Testing
  7. Principal Component Analysis
  8. Cluster Analysis
  9. Critical Assessment and Limitations
  10. Conclusion

References

Appendix A -- Full Regression Diagnostics
Appendix B -- Complete Hypothesis Test Results
Appendix C -- PCA Loadings and Scree Plot
Appendix D -- Cluster Profiles
Appendix E -- Supplementary Figures
Appendix F -- Code Listing

1. Introduction¶

Employee compensation constitutes one of the most consequential outcomes in organizational life, shaping workforce motivation, retention, and perceptions of procedural justice (Gerhart & Rynes, 2003). From a labor economics perspective, wage determination reflects a complex interplay of human capital endowments, market forces, institutional constraints, and potentially discriminatory practices (Mincer, 1974; Blinder, 1973). The empirical analysis of compensation determinants therefore occupies a prominent position at the intersection of applied statistics, econometrics, and human resource analytics.

The central research question of this thesis is: Which observable employee and organizational characteristics are statistically significant determinants of annual compensation, and what latent structures exist within the multivariate employee profile space?

To address this question comprehensively, four complementary statistical methods are deployed. First, multiple linear regression is used to estimate the conditional expectation of salary as a function of continuous and categorical predictors, enabling inference on the marginal contribution of each factor while controlling for the others. Second, hypothesis testing is employed to assess whether mean salary differences across demographic and organizational subgroups---defined by gender, department, and education level---are statistically distinguishable from zero. Third, principal component analysis (PCA) is applied to the set of numeric predictors to identify latent dimensions that capture the dominant modes of variation, thereby reducing redundancy and revealing interpretable factor structures. Fourth, cluster analysis is conducted to partition employees into homogeneous segments based on their multivariate profiles, facilitating the discovery of naturally occurring workforce archetypes.

The analysis is conducted on a synthetic dataset of $n = 500$ employees described by 14 variables. The use of synthetic data offers several methodological advantages: it permits full control over the data-generating process, eliminates confidentiality constraints, and enables systematic evaluation of each method's ability to recover known structural relationships. The data-generating mechanism is designed to embed realistic correlational and causal patterns, including returns to experience and education, departmental wage premia, and a gender pay differential.

The remainder of this thesis is organized as follows. Section 2 establishes the theoretical foundations for each of the four statistical methods, including key formulas and assumptions. Section 3 describes the data generation process and provides a complete variable codebook. Section 4 presents a thorough exploratory data analysis. Sections 5 through 8 contain the main analytical results for regression, hypothesis testing, PCA, and clustering, respectively. Section 9 offers a critical assessment of the methodology and its limitations. Section 10 concludes with a synthesis of findings and directions for future work.

2. Theoretical Background¶

This section presents the statistical theory underpinning the four analytical methods employed in the thesis. For each method, the relevant mathematical formulation is stated, the key assumptions are enumerated, and the interpretive framework is established. The theoretical exposition draws on standard references in applied statistics and multivariate analysis (Hastie, Tibshirani, & Friedman, 2009; Johnson & Wichern, 2007; Wooldridge, 2019).

2.1 Multiple Linear Regression¶

Multiple linear regression models the conditional expectation of a continuous response variable $Y$ as a linear function of $p$ predictor variables $X_1, X_2, \ldots, X_p$. The population model is specified as:

$$ Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p + \epsilon $$

where $\beta_0$ denotes the intercept, $\beta_j$ ($j = 1, \ldots, p$) are the partial regression coefficients representing the expected change in $Y$ per unit change in $X_j$ holding all other predictors constant, and $\epsilon$ is the stochastic error term.

In matrix notation, the model for $n$ observations is expressed as:

$$ \mathbf{Y} = \mathbf{X}\boldsymbol{\beta} + \boldsymbol{\epsilon} $$

where $\mathbf{Y} \in \mathbb{R}^n$, $\mathbf{X} \in \mathbb{R}^{n \times (p+1)}$ (including a column of ones for the intercept), $\boldsymbol{\beta} \in \mathbb{R}^{p+1}$, and $\boldsymbol{\epsilon} \in \mathbb{R}^n$.

The ordinary least squares (OLS) estimator minimizes the sum of squared residuals and is given in closed form by:

$$ \hat{\boldsymbol{\beta}} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{Y} $$
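The closed-form estimator can be verified numerically. The following minimal sketch, on simulated data unrelated to the thesis dataset, solves the normal equations directly and cross-checks the result against NumPy's least-squares routine:

```python
import numpy as np

rng = np.random.default_rng(0)
n, p = 100, 3
X = np.column_stack([np.ones(n), rng.normal(size=(n, p))])  # design matrix with intercept column
beta_true = np.array([2.0, 1.5, -0.5, 0.8])
y = X @ beta_true + rng.normal(scale=0.1, size=n)

# Closed-form OLS: solve (X'X) beta = X'y rather than inverting X'X explicitly
beta_hat = np.linalg.solve(X.T @ X, X.T @ y)

# Cross-check against NumPy's least-squares solver
beta_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
print(np.allclose(beta_hat, beta_lstsq))
```

Solving the linear system is numerically preferable to forming $(\mathbf{X}^T\mathbf{X})^{-1}$ explicitly, even though the two are algebraically equivalent.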

Gauss-Markov Assumptions¶

The OLS estimator is the best linear unbiased estimator (BLUE) under the following conditions:

  1. Linearity: The conditional expectation $E[Y \mid \mathbf{X}]$ is linear in the parameters $\boldsymbol{\beta}$.
  2. Random sampling: The observations $(Y_i, \mathbf{X}_i)$ are independently and identically drawn from the population.
  3. No perfect multicollinearity: The matrix $\mathbf{X}$ has full column rank, i.e., $\text{rank}(\mathbf{X}) = p + 1$.
  4. Zero conditional mean: $E[\epsilon_i \mid \mathbf{X}] = 0$ for all $i$.
  5. Homoscedasticity: $\text{Var}(\epsilon_i \mid \mathbf{X}) = \sigma^2$ for all $i$ (constant variance).

For valid inference (hypothesis tests and confidence intervals), an additional assumption of normality is typically imposed: $\epsilon_i \mid \mathbf{X} \sim N(0, \sigma^2)$.

Goodness of Fit¶

The coefficient of determination quantifies the proportion of variance in $Y$ explained by the model:

$$ R^2 = 1 - \frac{\text{SS}_{\text{res}}}{\text{SS}_{\text{tot}}} = 1 - \frac{\sum_{i=1}^n (Y_i - \hat{Y}_i)^2}{\sum_{i=1}^n (Y_i - \bar{Y})^2} $$

Because $R^2$ increases monotonically with the number of predictors, the adjusted version penalizes model complexity:

$$ R^2_{\text{adj}} = 1 - (1 - R^2)\frac{n - 1}{n - p - 1} $$
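Both quantities follow directly from the residuals. A minimal sketch on simulated data (not the thesis dataset):

```python
import numpy as np

rng = np.random.default_rng(1)
n, p = 200, 4
X = rng.normal(size=(n, p))
y = X[:, 0] - 0.5 * X[:, 1] + rng.normal(size=n)  # only two of the four predictors matter

# OLS fit via the normal equations
Xd = np.column_stack([np.ones(n), X])
beta_hat = np.linalg.solve(Xd.T @ Xd, Xd.T @ y)
y_hat = Xd @ beta_hat

ss_res = np.sum((y - y_hat) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r2 = 1 - ss_res / ss_tot
r2_adj = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(round(r2, 3), round(r2_adj, 3))
```

Note that $R^2_{\text{adj}} < R^2$ whenever $p \ge 1$ and $R^2 < 1$, since the penalty factor $(n-1)/(n-p-1)$ exceeds one.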

Inference on Individual Coefficients¶

The standard error of the $j$-th coefficient estimate is:

$$ \text{SE}(\hat{\beta}_j) = \sqrt{s^2 \left[(\mathbf{X}^T\mathbf{X})^{-1}\right]_{jj}} $$

where $s^2 = \frac{\sum_{i=1}^n e_i^2}{n - p - 1}$ is the residual mean square and $e_i = Y_i - \hat{Y}_i$ are the OLS residuals. Under the normality assumption, $\frac{\hat{\beta}_j - \beta_j}{\text{SE}(\hat{\beta}_j)} \sim t_{n-p-1}$.

Multicollinearity Diagnostics¶

The variance inflation factor for predictor $X_j$ is defined as:

$$ \text{VIF}_j = \frac{1}{1 - R^2_j} $$

where $R^2_j$ is the coefficient of determination from regressing $X_j$ on all remaining predictors. Values of $\text{VIF}_j > 5$ indicate moderate collinearity, and $\text{VIF}_j > 10$ suggests severe multicollinearity warranting remedial action (Kutner et al., 2005).
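The diagnostic is easy to exercise on data with a deliberately collinear pair of predictors. A sketch using statsmodels' `variance_inflation_factor` (simulated data, not the thesis dataset):

```python
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(2)
n = 300
x1 = rng.normal(size=n)
x2 = 0.9 * x1 + 0.1 * rng.normal(size=n)   # deliberately near-collinear with x1
x3 = rng.normal(size=n)                    # independent of the others

# Include the intercept column; VIFs are computed for the substantive columns only
X = np.column_stack([np.ones(n), x1, x2, x3])
vifs = [variance_inflation_factor(X, j) for j in range(1, X.shape[1])]
print([round(v, 1) for v in vifs])
```

Here x1 and x2 should show VIFs far above the severe-collinearity threshold of 10, while x3 stays near 1.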

2.2 Hypothesis Testing¶

Hypothesis testing provides a formal inferential framework for evaluating claims about population parameters. Each test begins with a null hypothesis $H_0$ (representing the status quo or no-effect condition) and an alternative hypothesis $H_1$. A test statistic is computed from the sample data, and its probability under $H_0$ (the $p$-value) determines whether the observed evidence is sufficiently extreme to reject $H_0$ at a pre-specified significance level $\alpha$.

Two-Sample t-Test¶

To compare the means of two independent groups (e.g., salary by gender), the Welch two-sample $t$-statistic is:

$$ t = \frac{\bar{X}_1 - \bar{X}_2}{\sqrt{\dfrac{s_1^2}{n_1} + \dfrac{s_2^2}{n_2}}} $$

where $\bar{X}_k$, $s_k^2$, and $n_k$ denote the sample mean, variance, and size of group $k \in \{1, 2\}$. The degrees of freedom are approximated via the Welch--Satterthwaite equation. The null hypothesis $H_0\colon \mu_1 = \mu_2$ is rejected if $|t| > t_{\alpha/2, \nu}$.

Effect Size: Cohen's $d$¶

Statistical significance alone does not convey the practical magnitude of a difference. Cohen's $d$ standardizes the mean difference by the pooled standard deviation:

$$ d = \frac{\bar{X}_1 - \bar{X}_2}{s_{\text{pooled}}} \quad \text{where} \quad s_{\text{pooled}} = \sqrt{\frac{(n_1 - 1)s_1^2 + (n_2 - 1)s_2^2}{n_1 + n_2 - 2}} $$

Conventional benchmarks classify $|d| < 0.2$ as negligible, $0.2 \le |d| < 0.5$ as small, $0.5 \le |d| < 0.8$ as medium, and $|d| \ge 0.8$ as large (Cohen, 1988).
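Both the Welch statistic and Cohen's $d$ are straightforward with SciPy and NumPy. A sketch with hypothetical salary-like samples (the group means and spreads are illustrative, not taken from the thesis data):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(3)
g1 = rng.normal(60000, 10000, size=120)  # hypothetical group 1 salaries
g2 = rng.normal(50000, 12000, size=150)  # hypothetical group 2 salaries

# Welch's t-test (does not assume equal variances)
t_stat, p_val = stats.ttest_ind(g1, g2, equal_var=False)

# Cohen's d with the pooled standard deviation
n1, n2 = len(g1), len(g2)
s_pooled = np.sqrt(((n1 - 1) * g1.var(ddof=1) + (n2 - 1) * g2.var(ddof=1))
                   / (n1 + n2 - 2))
d = (g1.mean() - g2.mean()) / s_pooled
print(round(t_stat, 2), round(p_val, 4), round(d, 2))
```

Reporting $d$ alongside the $p$-value separates statistical significance from practical magnitude, as advocated above.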

One-Way ANOVA¶

To test whether the means of $k > 2$ groups differ, the one-way analysis of variance computes:

$$ F = \frac{\text{MS}_B}{\text{MS}_W} $$

where the between-group and within-group mean squares are:

$$ \text{MS}_B = \frac{\text{SS}_B}{k - 1}, \quad \text{MS}_W = \frac{\text{SS}_W}{n - k} $$

Under $H_0\colon \mu_1 = \mu_2 = \cdots = \mu_k$ and assuming normality with equal variances, $F \sim F_{k-1,\, n-k}$. Rejection of $H_0$ warrants post hoc pairwise comparisons (e.g., Tukey's HSD) to identify which specific group pairs differ.
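The $F$-test and the subsequent Tukey post hoc step can be sketched as follows, on three simulated groups of which only the third has a shifted mean (illustrative data, not the thesis dataset):

```python
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

rng = np.random.default_rng(4)
# Three simulated groups; only the third mean differs
groups = [rng.normal(mu, 5, size=80) for mu in (50.0, 50.0, 58.0)]

f_stat, p_val = stats.f_oneway(*groups)

# Post hoc pairwise comparisons via Tukey's HSD
data = np.concatenate(groups)
labels = np.repeat(['A', 'B', 'C'], 80)
tukey = pairwise_tukeyhsd(data, labels, alpha=0.05)
print(round(f_stat, 1), tukey.reject)
```

The omnibus $F$-test flags that some difference exists; Tukey's HSD then attributes it to the pairs involving group C.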

Chi-Square Test of Independence¶

For testing the association between two categorical variables with an $r \times c$ contingency table, the Pearson chi-square statistic is:

$$ \chi^2 = \sum_{i=1}^{r}\sum_{j=1}^{c} \frac{(O_{ij} - E_{ij})^2}{E_{ij}} $$

where $O_{ij}$ and $E_{ij} = \frac{(\text{row } i \text{ total})(\text{column } j \text{ total})}{n}$ are the observed and expected frequencies, respectively. Under $H_0$ (independence), $\chi^2 \sim \chi^2_{(r-1)(c-1)}$.

The strength of association is measured by Cramer's $V$:

$$ V = \sqrt{\frac{\chi^2}{n \cdot \min(r-1,\, c-1)}} $$

with $V \in [0, 1]$, where $V = 0$ indicates complete independence and $V = 1$ indicates perfect association.
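A minimal sketch with a hypothetical $2 \times 3$ contingency table (the counts are made up for illustration), using SciPy for the test and computing Cramer's $V$ by hand:

```python
import numpy as np
from scipy.stats import chi2_contingency

# Hypothetical 2x3 contingency table (e.g., gender x a three-level category)
table = np.array([[40, 30, 20],
                  [35, 45, 60]])

chi2, p_val, dof, expected = chi2_contingency(table)

# Cramer's V from the chi-square statistic
n = table.sum()
r, c = table.shape
cramers_v = np.sqrt(chi2 / (n * min(r - 1, c - 1)))
print(round(chi2, 2), dof, round(cramers_v, 3))
```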

Type I and Type II Errors¶

A Type I error ($\alpha$) occurs when $H_0$ is rejected despite being true. A Type II error ($\beta$) occurs when $H_0$ is not rejected despite being false. The power of a test, $1 - \beta$, is the probability of correctly rejecting a false $H_0$. Power depends on the significance level, the sample size, the effect size, and the variance. In this thesis, a conventional significance level of $\alpha = 0.05$ is adopted throughout, and effect sizes are reported alongside $p$-values to enable substantive interpretation.
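The dependence of power on effect size and sample size can be made concrete with statsmodels' power module (a sketch with generic numbers, not derived from the thesis data):

```python
from statsmodels.stats.power import TTestIndPower

analysis = TTestIndPower()

# Per-group n needed to detect a medium effect (d = 0.5) at alpha = 0.05 with 80% power
n_required = analysis.solve_power(effect_size=0.5, alpha=0.05, power=0.8)

# Power actually achieved with 100 observations per group for the same effect
achieved = analysis.solve_power(effect_size=0.5, nobs1=100, alpha=0.05)
print(round(n_required, 1), round(achieved, 3))
```

The classic result that roughly 64 observations per group are needed for a medium effect at 80% power falls out directly.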

2.3 Principal Component Analysis¶

Principal component analysis (PCA) is a linear dimensionality reduction technique that transforms a set of $p$ possibly correlated variables into a new set of $p$ uncorrelated variables---the principal components---ordered by the amount of variance they capture (Jolliffe, 2002).

Mathematical Formulation¶

Let $\boldsymbol{\Sigma}$ denote the $p \times p$ covariance (or correlation) matrix of the standardized data. PCA computes the eigendecomposition:

$$ \boldsymbol{\Sigma}\mathbf{v}_j = \lambda_j \mathbf{v}_j, \quad j = 1, \ldots, p $$

where $\lambda_1 \ge \lambda_2 \ge \cdots \ge \lambda_p \ge 0$ are the eigenvalues and $\mathbf{v}_1, \mathbf{v}_2, \ldots, \mathbf{v}_p$ are the corresponding orthonormal eigenvectors (loadings). The $j$-th principal component score for observation $i$ is obtained by projecting the standardized observation vector $\mathbf{x}_i$ onto the $j$-th eigenvector:

$$ Z_{ij} = \mathbf{v}_j^T \mathbf{x}_i $$

Variance Explained¶

The proportion of total variance captured by the $j$-th component is:

$$ \text{PVE}_j = \frac{\lambda_j}{\sum_{i=1}^{p} \lambda_i} $$

The cumulative proportion of variance explained (CPVE) indicates how many components are needed to represent a desired fraction of the total variability.

Retention Criteria¶

Two widely used criteria guide the selection of the number of components to retain:

  1. Kaiser criterion: Retain components whose eigenvalues exceed 1.0 (when PCA is applied to the correlation matrix). Components with $\lambda_j < 1$ explain less variance than a single original standardized variable.
  2. Scree plot: Plot eigenvalues against component index and identify the "elbow" point beyond which additional components contribute diminishing explanatory power.

In practice, both criteria are considered jointly, supplemented by interpretability of the retained components' loading patterns.
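The eigenvalue-based workflow can be sketched end to end on simulated data with one planted latent factor (illustrative, not the thesis dataset):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

rng = np.random.default_rng(5)
n = 400
latent = rng.normal(size=n)
# Four variables share one latent factor; two are pure noise
cols = [latent + 0.3 * rng.normal(size=n) for _ in range(4)]
cols += [rng.normal(size=n) for _ in range(2)]
X = np.column_stack(cols)

# Standardize first, so PCA operates on the correlation structure
Xs = StandardScaler().fit_transform(X)
pca = PCA().fit(Xs)

eigenvalues = pca.explained_variance_
pve = pca.explained_variance_ratio_
print(np.round(eigenvalues, 2), np.round(np.cumsum(pve), 2))
```

The first eigenvalue dominates (it alone clearly exceeds the Kaiser threshold of 1.0), and a scree plot of `eigenvalues` would show the elbow immediately after it.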

2.4 Cluster Analysis¶

Cluster analysis encompasses a family of unsupervised learning methods that partition observations into groups (clusters) such that within-cluster similarity is maximized and between-cluster similarity is minimized (Kaufman & Rousseeuw, 2005).

Distance Metric¶

The Euclidean distance between two $p$-dimensional observations $\mathbf{x}_i$ and $\mathbf{x}_j$ is:

$$ d(\mathbf{x}_i, \mathbf{x}_j) = \sqrt{\sum_{k=1}^{p} (x_{ik} - x_{jk})^2} $$

It is essential that variables are standardized prior to distance computation to prevent variables measured on larger scales from dominating the dissimilarity structure.

K-Means Clustering¶

The K-Means algorithm partitions $n$ observations into $K$ clusters $C_1, C_2, \ldots, C_K$ by minimizing the total within-cluster sum of squares:

$$ \min_{C_1, \ldots, C_K} \sum_{k=1}^{K} \sum_{\mathbf{x}_i \in C_k} \|\mathbf{x}_i - \bar{\mathbf{x}}_k\|^2 $$

where $\bar{\mathbf{x}}_k = \frac{1}{|C_k|}\sum_{\mathbf{x}_i \in C_k} \mathbf{x}_i$ is the centroid of cluster $C_k$. The algorithm iterates between assigning observations to the nearest centroid and recomputing centroids until convergence. Because the objective is non-convex, multiple random initializations are used and the solution with the lowest total within-cluster sum of squares is retained.
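The algorithm, including standardization and multiple random initializations, can be sketched on simulated blobs (illustrative data, not the thesis dataset):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

rng = np.random.default_rng(6)
# Three well-separated Gaussian blobs in two dimensions
centers = np.array([[0.0, 0.0], [8.0, 0.0], [0.0, 8.0]])
X = np.vstack([rng.normal(c, 1.0, size=(100, 2)) for c in centers])

# Standardize, then run K-Means with multiple random initializations (n_init)
Xs = StandardScaler().fit_transform(X)
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(Xs)

sizes = np.bincount(km.labels_)
print(round(km.inertia_, 1), sizes)  # total within-cluster SS and cluster sizes
```

`inertia_` is exactly the objective in the displayed minimization; `n_init=10` retains the best of ten random restarts, mitigating the non-convexity noted above.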

Hierarchical Agglomerative Clustering¶

Hierarchical clustering builds a nested sequence of partitions represented as a dendrogram. At each step, the two most similar clusters are merged. The choice of linkage function determines how inter-cluster similarity is measured:

  • Single linkage: $d(C_i, C_j) = \min_{\mathbf{x} \in C_i,\, \mathbf{y} \in C_j} d(\mathbf{x}, \mathbf{y})$
  • Complete linkage: $d(C_i, C_j) = \max_{\mathbf{x} \in C_i,\, \mathbf{y} \in C_j} d(\mathbf{x}, \mathbf{y})$
  • Average linkage: $d(C_i, C_j) = \frac{1}{|C_i||C_j|}\sum_{\mathbf{x} \in C_i}\sum_{\mathbf{y} \in C_j} d(\mathbf{x}, \mathbf{y})$
  • Ward's method: merges the pair of clusters that produces the smallest increase in total within-cluster variance:
$$ \Delta(C_i, C_j) = \frac{|C_i| \cdot |C_j|}{|C_i| + |C_j|} \|\bar{\mathbf{x}}_i - \bar{\mathbf{x}}_j\|^2 $$

Ward's method tends to produce compact, spherical clusters and is widely recommended for general-purpose cluster analysis.
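Ward's method as implemented in SciPy can be sketched on two planted groups (illustrative data, not the thesis dataset):

```python
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

rng = np.random.default_rng(7)
# Two separated groups in three dimensions
X = np.vstack([rng.normal(m, 0.5, size=(50, 3)) for m in (0.0, 5.0)])

Z = linkage(X, method='ward')                     # (n-1) x 4 merge history
labels = fcluster(Z, t=2, criterion='maxclust')   # cut the dendrogram into 2 clusters
print(Z.shape, np.bincount(labels)[1:])
```

Passing `Z` to `scipy.cluster.hierarchy.dendrogram` would render the nested merge sequence described above.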

Cluster Validation¶

The silhouette coefficient provides an intrinsic measure of cluster quality. For observation $i$, define $a(i)$ as the mean distance from $i$ to all other observations in its own cluster, and $b(i)$ as the smallest mean distance from $i$ to the observations of any other cluster (its nearest neighboring cluster). The silhouette score is:

$$ s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}} $$

with $s(i) \in [-1, 1]$. Values near $+1$ indicate that the observation is well-matched to its assigned cluster, values near $0$ suggest boundary placement, and negative values indicate potential misclassification. The mean silhouette score $\bar{s}$ across all observations serves as a global measure: $\bar{s} > 0.5$ indicates reasonable clustering structure (Rousseeuw, 1987).
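The global criterion is available directly via scikit-learn. A sketch on two well-separated simulated clusters, where the mean silhouette should comfortably exceed the 0.5 benchmark (illustrative data, not the thesis dataset):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(8)
# Two clearly separated clusters in two dimensions
X = np.vstack([rng.normal(m, 1.0, size=(150, 2)) for m in ([0.0, 0.0], [10.0, 10.0])])

labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
s_mean = silhouette_score(X, labels)  # mean s(i) over all observations
print(round(s_mean, 3))
```

Per-observation values, useful for silhouette plots, come from `sklearn.metrics.silhouette_samples` with the same arguments.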

3. Data Generation and Methodology¶

The analysis employs a synthetic dataset designed to emulate realistic patterns observed in employee compensation data while affording full control over the data-generating process. Synthetic data generation is an established approach in statistical pedagogy and methodology research, as it enables evaluation of statistical methods against known ground truth, eliminates privacy and confidentiality constraints, and permits systematic manipulation of sample size, noise levels, and structural relationships (Rubin, 1993; Reiter, 2002).

The dataset comprises $n = 500$ employee records, each characterized by 14 variables spanning demographic attributes (age, gender), human capital indicators (education level, years of experience, training hours), performance metrics (performance score, number of projects), workplace characteristics (department, hours per week, remote work percentage, team size), and an attitudinal measure (satisfaction score). The outcome variable of interest is annual salary.

The sample size of $n = 500$ was selected to provide adequate statistical power for the planned analyses. For a multiple regression model with approximately 15 predictors (including dummy-encoded categoricals), a sample of 500 yields a ratio of approximately 33 observations per predictor, well exceeding the commonly recommended minimum of 10--20 observations per predictor (Harrell, 2001). Similarly, $n = 500$ provides sufficient cell sizes for subgroup comparisons in hypothesis testing and adequate observations for stable eigenvalue estimation in PCA.

The salary variable is generated as a function of experience, performance, education, department, and gender, with additive Gaussian noise to simulate the inherent stochasticity of real-world compensation outcomes. The precise functional form and parameter values of the data-generating process are treated as unknown for the purposes of the statistical analysis, mirroring the inferential challenge encountered with empirical data. The subsequent sections describe the statistical methods applied to recover and characterize the relationships embedded in the data.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.weightstats import ttest_ind as sm_ttest_ind, CompareMeans, DescrStatsW
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
np.random.seed(42)

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.dpi': 150,
    'font.size': 10,
    'axes.titlesize': 12,
    'axes.labelsize': 11,
    'figure.figsize': (10, 6)
})

PALETTE = sns.color_palette('Set2')
sns.set_palette('Set2')

print("All packages loaded successfully.")
All packages loaded successfully.
In [2]:
# ---------------------------------------------------------------------------
# 3.  Synthetic Data Generation
# ---------------------------------------------------------------------------

n = 500
np.random.seed(42)

employee_id = np.arange(1, n + 1)
age = np.clip(np.random.normal(40, 10, n).astype(int), 22, 65)
gender = np.where(np.random.binomial(1, 0.45, n) == 1, 'Female', 'Male')
departments = ['Engineering', 'Marketing', 'Sales', 'HR', 'Finance']
department = np.random.choice(departments, n)
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
education_level = np.random.choice(education_levels, n, p=[0.15, 0.40, 0.30, 0.15])
education_num = pd.Series(education_level).map(
    {'High School': 1, 'Bachelor': 2, 'Master': 3, 'PhD': 4}
).values

years_experience = np.clip(age - 22 + np.random.normal(0, 3, n), 0, 40).round(1)
performance_score = np.clip(np.random.normal(70, 12, n), 30, 100).round(1)
hours_per_week = np.clip(np.random.normal(42, 5, n), 30, 60).round(1)
num_projects = np.random.poisson(2 + 0.05 * performance_score)
training_hours = np.maximum(
    0, 10 + 0.3 * performance_score - 0.2 * years_experience
    + np.random.normal(0, 8, n)
).round(1)
remote_work_pct = np.clip(
    np.random.beta(2, 5, n) * 100 + 3 * (education_num - 2), 0, 100
).round(1)
satisfaction_score = np.clip(
    7.0 - 0.04 * hours_per_week + 0.02 * remote_work_pct
    + np.random.normal(0, 1.2, n), 1, 10
).round(1)
team_size = np.random.randint(3, 21, n)

# True salary model (treated as unknown in the analysis)
salary = (25000
          + 800 * years_experience
          + 300 * performance_score
          + 5000 * (education_num == 3)      # Master premium
          + 12000 * (education_num == 4)     # PhD premium
          + 3000 * (department == 'Engineering').astype(int)
          + 2000 * (department == 'Finance').astype(int)
          - 2000 * (gender == 'Female').astype(int)
          + np.random.normal(0, 8000, n))
annual_salary = np.round(salary, -2)  # Round to nearest 100

df = pd.DataFrame({
    'employee_id': employee_id,
    'age': age,
    'gender': gender,
    'department': department,
    'education_level': education_level,
    'years_experience': years_experience,
    'performance_score': performance_score,
    'hours_per_week': hours_per_week,
    'num_projects': num_projects,
    'training_hours': training_hours,
    'satisfaction_score': satisfaction_score,
    'remote_work_pct': remote_work_pct,
    'team_size': team_size,
    'annual_salary': annual_salary
})

# Set education_level as ordered categorical
df['education_level'] = pd.Categorical(
    df['education_level'],
    categories=['High School', 'Bachelor', 'Master', 'PhD'],
    ordered=True
)

print(f"Dataset shape: {df.shape}")
print(f"\nFirst 10 observations:")
df.head(10)
Dataset shape: (500, 14)

First 10 observations:
Out[2]:
employee_id age gender department education_level years_experience performance_score hours_per_week num_projects training_hours satisfaction_score remote_work_pct team_size annual_salary
0 1 44 Female Marketing Bachelor 20.7 51.7 38.4 2 27.7 6.0 19.6 4 62900.0
1 2 38 Female Engineering Master 22.0 56.4 44.6 2 18.0 5.2 23.7 10 50900.0
2 3 46 Male Sales Bachelor 29.3 67.4 42.8 4 1.7 5.1 30.9 3 59000.0
3 4 55 Female Marketing Master 29.9 38.3 41.4 6 26.4 8.7 67.6 3 78000.0
4 5 37 Female Marketing PhD 12.9 79.3 43.8 5 37.3 4.9 28.0 6 59100.0
5 6 37 Male Sales Bachelor 15.2 41.1 43.1 3 5.3 6.8 32.8 9 53700.0
6 7 55 Male Finance Bachelor 37.2 69.1 44.7 4 18.4 5.2 33.3 11 83400.0
7 8 47 Male Marketing Master 25.2 70.2 49.2 4 38.6 5.0 24.4 18 55800.0
8 9 35 Male HR Bachelor 11.1 76.5 45.9 6 42.3 7.5 57.3 7 43000.0
9 10 45 Male Marketing Master 22.8 69.5 43.6 4 33.8 5.3 23.5 4 79200.0
In [3]:
# Summary statistics for all numeric variables
df.describe().round(2)
Out[3]:
employee_id age years_experience performance_score hours_per_week num_projects training_hours satisfaction_score remote_work_pct team_size annual_salary
count 500.00 500.00 500.00 500.00 500.00 500.00 500.00 500.00 500.00 500.00 500.00
mean 250.50 39.61 17.63 69.57 41.89 5.39 26.99 5.95 30.58 11.61 63023.40
std 144.48 9.47 9.60 11.94 4.79 2.18 9.27 1.32 15.93 5.18 12230.97
min 1.00 22.00 0.00 30.00 30.00 0.00 1.70 1.40 0.00 3.00 27000.00
25% 125.75 32.75 10.40 61.60 38.60 4.00 20.80 5.10 18.20 7.00 54300.00
50% 250.50 40.00 17.45 70.10 41.70 5.00 27.10 5.95 28.60 12.00 62000.00
75% 375.25 46.00 23.90 77.93 45.30 7.00 33.30 6.80 41.52 16.00 71850.00
max 500.00 65.00 40.00 100.00 57.30 12.00 52.70 9.60 81.30 20.00 96500.00

3.1 Variable Descriptions¶

The following table provides a complete codebook for the 14 variables in the dataset.

| # | Variable | Type | Description | Range / Categories | Role |
|---|----------|------|-------------|--------------------|------|
| 1 | employee_id | Integer | Unique employee identifier | 1--500 | Index |
| 2 | age | Integer | Employee age in years | 22--65 | Predictor |
| 3 | gender | Categorical (binary) | Employee gender | Male, Female | Predictor |
| 4 | department | Categorical (nominal) | Organizational department | Engineering, Marketing, Sales, HR, Finance | Predictor |
| 5 | education_level | Categorical (ordinal) | Highest degree attained | High School, Bachelor, Master, PhD | Predictor |
| 6 | years_experience | Continuous | Years of professional experience | 0--40 | Predictor |
| 7 | performance_score | Continuous | Annual performance rating | 30--100 | Predictor |
| 8 | hours_per_week | Continuous | Average weekly working hours | 30--60 | Predictor |
| 9 | num_projects | Count (integer) | Number of projects in current year | 0+ | Predictor |
| 10 | training_hours | Continuous | Annual training hours completed | 0+ | Predictor |
| 11 | satisfaction_score | Continuous | Job satisfaction rating | 1--10 | Predictor |
| 12 | remote_work_pct | Continuous | Percentage of work performed remotely | 0--100 | Predictor |
| 13 | team_size | Integer | Number of members in employee's team | 3--20 | Predictor |
| 14 | annual_salary | Continuous | Annual compensation in USD | ~20,000--100,000+ | Response |

The response variable is annual_salary. All remaining variables serve as candidate predictors in the regression model, inputs to PCA, or features for cluster analysis. The variable employee_id is excluded from all statistical computations as it carries no substantive information.

4. Exploratory Data Analysis¶

Before applying formal statistical models, a thorough exploratory data analysis (EDA) is conducted to characterize the distributional properties of individual variables, assess the correlation structure among predictors, and identify preliminary patterns in the relationship between salary and potential determinants. The EDA serves three objectives: (i) to verify data quality and detect anomalies, (ii) to inform modeling decisions such as variable transformations or the need for interaction terms, and (iii) to generate hypotheses that the subsequent formal analyses will test. All visualizations employ a consistent color palette and formatting conventions to facilitate comparison across plots.

In [4]:
# ---------------------------------------------------------------------------
# 4.1  Univariate Distributions of Key Numeric Variables
# ---------------------------------------------------------------------------

fig, axes = plt.subplots(2, 2, figsize=(12, 9))

variables = ['annual_salary', 'age', 'years_experience', 'performance_score']
titles = ['Annual Salary (USD)', 'Age (years)', 'Years of Experience', 'Performance Score']
colors = [PALETTE[0], PALETTE[1], PALETTE[2], PALETTE[3]]

for ax, var, title, color in zip(axes.ravel(), variables, titles, colors):
    sns.histplot(df[var], kde=True, bins=30, color=color, edgecolor='white',
                 linewidth=0.5, ax=ax, alpha=0.7)
    mean_val = df[var].mean()
    median_val = df[var].median()
    ax.axvline(mean_val, color='red', linestyle='--', linewidth=1.2, label=f'Mean = {mean_val:.1f}')
    ax.axvline(median_val, color='navy', linestyle=':', linewidth=1.2, label=f'Median = {median_val:.1f}')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(title)
    ax.set_ylabel('Frequency')
    ax.legend(fontsize=8)

fig.suptitle('Figure 1: Univariate Distributions of Key Numeric Variables',
             fontsize=13, fontweight='bold', y=1.01)
plt.tight_layout()
plt.show()

# Print skewness and kurtosis
print("Skewness and excess kurtosis for key variables:\n")
for var, title in zip(variables, titles):
    skew = df[var].skew()
    kurt = df[var].kurtosis()
    print(f"  {title:30s}  skewness = {skew:+.3f},  excess kurtosis = {kurt:+.3f}")
[Figure 1: histograms with kernel density overlays for annual salary, age, years of experience, and performance score, with mean and median lines marked.]
Skewness and excess kurtosis for key variables:

  Annual Salary (USD)             skewness = +0.024,  excess kurtosis = -0.362
  Age (years)                     skewness = +0.213,  excess kurtosis = -0.378
  Years of Experience             skewness = +0.187,  excess kurtosis = -0.526
  Performance Score               skewness = -0.148,  excess kurtosis = -0.228

Interpretation of Univariate Distributions¶

The distribution of annual salary is approximately symmetric and unimodal, with the mean and median in close proximity, suggesting that the regression model's normality assumption on the error term is not grossly violated. The moderate spread (standard deviation of approximately USD 12,200, per the summary statistics above) reflects the combined influence of multiple determinants and stochastic noise.

Age follows a roughly normal distribution centered near 40 years, consistent with the generating distribution $N(40, 10^2)$. The distribution is truncated at 22 and 65, reflecting realistic labor market entry and retirement boundaries. No pronounced skewness is observed.

Years of experience displays a mildly right-skewed distribution (skewness $\approx +0.19$), as it is bounded below by zero while its upper tail extends toward 40 years. The mean and median accordingly diverge slightly. Experience is expected to be strongly correlated with age, a relationship that will be examined in the bivariate analysis.

Performance score is approximately normally distributed around 70, with mild truncation effects at the boundaries (30 and 100). The distribution appears well-behaved with no extreme outliers, which is favorable for the regression analysis.

Overall, the numeric variables exhibit distributional properties compatible with the assumptions of the planned statistical methods. No severe departures from normality, unexpected multimodality, or data quality issues are apparent.

In [5]:
# ---------------------------------------------------------------------------
# 4.2  Categorical Variable Distributions
# ---------------------------------------------------------------------------

fig, axes = plt.subplots(1, 3, figsize=(14, 5))

# Department
dept_counts = df['department'].value_counts().sort_index()
axes[0].bar(dept_counts.index, dept_counts.values, color=PALETTE[:len(dept_counts)],
            edgecolor='white', linewidth=0.8)
axes[0].set_title('Department', fontweight='bold')
axes[0].set_xlabel('Department')
axes[0].set_ylabel('Count')
for i, (cat, val) in enumerate(zip(dept_counts.index, dept_counts.values)):
    axes[0].text(i, val + 2, str(val), ha='center', fontsize=9, fontweight='bold')
axes[0].tick_params(axis='x', rotation=30)

# Education Level
edu_counts = df['education_level'].value_counts().sort_index()
axes[1].bar(edu_counts.index, edu_counts.values, color=PALETTE[:len(edu_counts)],
            edgecolor='white', linewidth=0.8)
axes[1].set_title('Education Level', fontweight='bold')
axes[1].set_xlabel('Education Level')
axes[1].set_ylabel('Count')
for i, (cat, val) in enumerate(zip(edu_counts.index, edu_counts.values)):
    axes[1].text(i, val + 2, str(val), ha='center', fontsize=9, fontweight='bold')
axes[1].tick_params(axis='x', rotation=30)

# Gender
gender_counts = df['gender'].value_counts()
axes[2].bar(gender_counts.index, gender_counts.values, color=[PALETTE[4], PALETTE[5]],
            edgecolor='white', linewidth=0.8)
axes[2].set_title('Gender', fontweight='bold')
axes[2].set_xlabel('Gender')
axes[2].set_ylabel('Count')
for i, (cat, val) in enumerate(zip(gender_counts.index, gender_counts.values)):
    axes[2].text(i, val + 2, str(val), ha='center', fontsize=9, fontweight='bold')

fig.suptitle('Figure 2: Frequency Distributions of Categorical Variables',
             fontsize=13, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Print proportions
print("\nCategorical variable proportions:\n")
for var in ['department', 'education_level', 'gender']:
    print(f"  {var}:")
    props = df[var].value_counts(normalize=True).sort_index()
    for cat, prop in props.items():
        print(f"    {cat:20s} {prop:.1%}  (n = {df[var].value_counts()[cat]})")
    print()
[Figure 2: Frequency Distributions of Categorical Variables]
Categorical variable proportions:

  department:
    Engineering          19.2%  (n = 96)
    Finance              20.0%  (n = 100)
    HR                   22.4%  (n = 112)
    Marketing            21.2%  (n = 106)
    Sales                17.2%  (n = 86)

  education_level:
    High School          16.8%  (n = 84)
    Bachelor             38.4%  (n = 192)
    Master               31.8%  (n = 159)
    PhD                  13.0%  (n = 65)

  gender:
    Female               46.2%  (n = 231)
    Male                 53.8%  (n = 269)

Interpretation of Categorical Distributions¶

The department variable is approximately uniformly distributed across the five categories, with each department containing roughly 100 employees (20% of the sample). This balance ensures adequate statistical power for department-level comparisons in hypothesis testing and avoids the interpretive difficulties that arise with severely imbalanced groups.

The education level distribution follows the specified sampling probabilities: Bachelor's degree holders constitute the largest group (38.4%), followed by Master's degree holders (31.8%), High School graduates (16.8%), and PhD holders (13.0%). This distribution is broadly consistent with the educational composition observed in knowledge-economy workforces, where bachelor's degrees are the modal qualification.

The gender distribution reflects the 45% female probability specified in the generating process, yielding a realized male-to-female split of approximately 54:46 (53.8% vs. 46.2%). Although not perfectly balanced, the departure from parity is modest and both subgroups contain well over 200 observations, providing ample statistical power for gender-based comparisons.
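The visual impression of departmental balance can be checked formally with a chi-square goodness-of-fit test against a uniform allocation, $\chi^2 = \sum_k (O_k - E_k)^2 / E_k$ with $k - 1$ degrees of freedom. A supplementary sketch (not part of the thesis pipeline) using the counts printed above:

```python
from scipy import stats

# Department counts as printed above (Engineering, Finance, HR, Marketing, Sales)
counts = [96, 100, 112, 106, 86]
expected = [sum(counts) / len(counts)] * len(counts)  # uniform: 100 per department

# H0: employees are uniformly distributed across the five departments
chi2, p_value = stats.chisquare(counts, f_exp=expected)
print(f"chi-square = {chi2:.2f}, p = {p_value:.3f}")
```

A large p-value is consistent with the approximately uniform allocation described above.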

In [6]:
# ---------------------------------------------------------------------------
# 4.3  Correlation Heatmap (All Numeric Variables)
# ---------------------------------------------------------------------------

numeric_cols = ['age', 'years_experience', 'performance_score', 'hours_per_week',
                'num_projects', 'training_hours', 'satisfaction_score',
                'remote_work_pct', 'team_size', 'annual_salary']

corr_matrix = df[numeric_cols].corr()

# Create mask for upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
            vmin=-1, vmax=1, square=True, linewidths=0.5,
            cbar_kws={'shrink': 0.8, 'label': 'Pearson Correlation'},
            ax=ax, mask=mask)  # apply the upper-triangle mask created above

ax.set_title('Figure 3: Pearson Correlation Matrix of Numeric Variables',
             fontsize=13, fontweight='bold', pad=15)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Print strongest correlations with salary
print("\nCorrelations with annual_salary (sorted by absolute value):\n")
salary_corr = corr_matrix['annual_salary'].drop('annual_salary').abs().sort_values(ascending=False)
for var, r in salary_corr.items():
    sign = '+' if corr_matrix.loc[var, 'annual_salary'] > 0 else '-'
    print(f"  {var:25s}  r = {sign}{r:.3f}")
[Figure 3: Pearson Correlation Matrix of Numeric Variables]
Correlations with annual_salary (sorted by absolute value):

  years_experience           r = +0.658
  age                        r = +0.626
  performance_score          r = +0.273
  training_hours             r = -0.107
  satisfaction_score         r = +0.065
  num_projects               r = +0.031
  remote_work_pct            r = +0.018
  hours_per_week             r = -0.011
  team_size                  r = -0.007

Interpretation of the Correlation Structure¶

The correlation matrix reveals several noteworthy bivariate relationships relevant to the compensation analysis.

Strongest correlates of annual salary. Years of experience and age exhibit the strongest positive correlations with salary, which is expected given that experience represents accumulated human capital and is a primary determinant in most compensation structures. Performance score also shows a meaningful positive association with salary, consistent with performance-based pay components. These relationships align with human capital theory (Becker, 1964).

Predictor intercorrelations. A strong positive correlation exists between age and years of experience ($r > 0.8$), reflecting the near-mechanical relationship between these variables. This high collinearity warrants attention in the regression model, as it may inflate the variance of coefficient estimates (multicollinearity). The variance inflation factor analysis in Section 5 will quantify this concern. Other notable intercorrelations include a moderate positive association between performance score and number of projects, and a weak negative relationship between hours per week and satisfaction score.

Negligible correlations. Several variables---including team size, remote work percentage, and training hours---display weak correlations with salary, suggesting they may not emerge as statistically significant predictors in the regression analysis. However, their relevance for PCA and cluster analysis may differ, as these methods consider the full multivariate structure rather than bivariate associations with a single response variable.

Multicollinearity concerns. The high correlation between age and years of experience represents the primary collinearity concern. In the regression model, including both variables simultaneously may lead to unstable coefficient estimates with inflated standard errors. Strategies to address this, such as variable exclusion or ridge regularization, will be considered in Section 5.

In [7]:
# ---------------------------------------------------------------------------
# 4.4  Salary Distributions by Subgroup
# ---------------------------------------------------------------------------

fig, axes = plt.subplots(1, 3, figsize=(16, 6))

# --- Salary by Department ---
dept_order = df.groupby('department')['annual_salary'].median().sort_values().index.tolist()
sns.boxplot(data=df, x='department', y='annual_salary', order=dept_order,
            palette='Set2', ax=axes[0], linewidth=0.8, fliersize=3)
axes[0].set_title('Salary by Department', fontweight='bold')
axes[0].set_xlabel('Department')
axes[0].set_ylabel('Annual Salary (USD)')
axes[0].tick_params(axis='x', rotation=30)

# Add mean markers
means = df.groupby('department')['annual_salary'].mean()
for i, dept in enumerate(dept_order):
    axes[0].scatter(i, means[dept], color='red', marker='D', s=40, zorder=5,
                    label='Mean' if i == 0 else '')
axes[0].legend(fontsize=8)

# --- Salary by Gender ---
sns.boxplot(data=df, x='gender', y='annual_salary', palette='Set2', ax=axes[1],
            linewidth=0.8, fliersize=3)
axes[1].set_title('Salary by Gender', fontweight='bold')
axes[1].set_xlabel('Gender')
axes[1].set_ylabel('Annual Salary (USD)')

# Add mean markers
for i, g in enumerate(['Female', 'Male']):
    m = df[df['gender'] == g]['annual_salary'].mean()
    axes[1].scatter(i, m, color='red', marker='D', s=40, zorder=5,
                    label='Mean' if i == 0 else '')
axes[1].legend(fontsize=8)

# --- Salary by Education Level ---
edu_order = ['High School', 'Bachelor', 'Master', 'PhD']
sns.boxplot(data=df, x='education_level', y='annual_salary', order=edu_order,
            palette='Set2', ax=axes[2], linewidth=0.8, fliersize=3)
axes[2].set_title('Salary by Education Level', fontweight='bold')
axes[2].set_xlabel('Education Level')
axes[2].set_ylabel('Annual Salary (USD)')
axes[2].tick_params(axis='x', rotation=30)

# Add mean markers
means_edu = df.groupby('education_level')['annual_salary'].mean()
for i, edu in enumerate(edu_order):
    axes[2].scatter(i, means_edu[edu], color='red', marker='D', s=40, zorder=5,
                    label='Mean' if i == 0 else '')
axes[2].legend(fontsize=8)

fig.suptitle('Figure 4: Annual Salary Distributions by Department, Gender, and Education Level',
             fontsize=13, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Print group means and medians
print("\nSalary summary by subgroup:\n")
for grp_var, grp_label in [('department', 'Department'),
                             ('gender', 'Gender'),
                             ('education_level', 'Education Level')]:
    print(f"  {grp_label}:")
    summary = df.groupby(grp_var)['annual_salary'].agg(['mean', 'median', 'std', 'count'])
    summary = summary.round(0)
    for idx, row in summary.iterrows():
        print(f"    {str(idx):20s}  mean = ${row['mean']:>10,.0f}  "
              f"median = ${row['median']:>10,.0f}  "
              f"std = ${row['std']:>8,.0f}  n = {int(row['count'])}")
    print()
[Figure 4: Annual Salary Distributions by Department, Gender, and Education Level]
Salary summary by subgroup:

  Department:
    Engineering           mean = $    64,342  median = $    64,400  std = $  12,774  n = 96
    Finance               mean = $    64,445  median = $    63,350  std = $  11,421  n = 100
    HR                    mean = $    62,020  median = $    61,150  std = $  12,760  n = 112
    Marketing             mean = $    62,092  median = $    61,000  std = $  12,102  n = 106
    Sales                 mean = $    62,355  median = $    60,900  std = $  11,965  n = 86

  Gender:
    Female                mean = $    62,082  median = $    61,600  std = $  11,887  n = 231
    Male                  mean = $    63,832  median = $    62,700  std = $  12,484  n = 269

  Education Level:
    High School           mean = $    60,377  median = $    59,550  std = $  11,202  n = 84
    Bachelor              mean = $    60,518  median = $    60,700  std = $  12,699  n = 192
    Master                mean = $    64,505  median = $    63,300  std = $  11,435  n = 159
    PhD                   mean = $    70,218  median = $    70,000  std = $  10,655  n = 65

Interpretation of Subgroup Salary Distributions¶

The box plots and summary statistics reveal systematic variation in compensation across the three categorical groupings, providing preliminary visual evidence that motivates the formal hypothesis tests in Section 6.

Department. Engineering and Finance employees appear to earn higher median salaries compared to HR, Marketing, and Sales. The interquartile ranges are broadly comparable across departments, suggesting that the variance structure is similar. The observed departmental salary differentials are consistent with market-based compensation theories, in which occupations requiring specialized technical or quantitative skills command wage premia.

Gender. A visible gap exists between male and female median salaries, with male employees earning more on average. The distributions overlap substantially, indicating that the gender difference, while present, is modest relative to the overall salary variability. The statistical significance and practical magnitude of this gap will be formally assessed in Section 6 using a two-sample $t$-test and Cohen's $d$.

Education level. A broadly monotonic pattern emerges: higher educational attainment is associated with higher compensation. The PhD category exhibits the highest median salary, followed by Master's, Bachelor's, and High School, although the High School and Bachelor's means are nearly identical (a gap of only about USD 140). The premium is thus concentrated at the graduate level, and the step between Master's and PhD exceeds the step between Bachelor's and Master's, suggesting potentially non-linear returns to education. This ordering is consistent with the human capital model, which predicts that investments in education yield positive returns in the labor market (Mincer, 1974).

These visual patterns collectively motivate the formal testing of three hypotheses: (i) that mean salary differs by gender, (ii) that mean salary differs across departments, and (iii) that mean salary differs across education levels.
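The two-sample $t$-test and Cohen's $d$ announced for Section 6 can be sketched as follows. This is an illustrative, standalone computation on simulated salary draws whose means and SDs merely echo the subgroup summary above; the formal test in Section 6 uses the actual data.

```python
import numpy as np
from scipy import stats

# Simulated stand-ins for the two subgroups (means/SDs echo the summary table)
rng = np.random.default_rng(1)
female = rng.normal(62_000, 12_000, 231)
male = rng.normal(64_000, 12_500, 269)

# Welch's t-test: does not assume equal variances in the two groups
t_stat, p_value = stats.ttest_ind(male, female, equal_var=False)

# Cohen's d using the pooled standard deviation
n1, n2 = len(male), len(female)
pooled_sd = np.sqrt(((n1 - 1) * male.var(ddof=1) + (n2 - 1) * female.var(ddof=1))
                    / (n1 + n2 - 2))
d = (male.mean() - female.mean()) / pooled_sd

print(f"t = {t_stat:.2f}, p = {p_value:.4f}, Cohen's d = {d:.2f}")
```

Given the substantial distributional overlap noted above, a small $d$ (well below the conventional 0.5 "medium" benchmark) is the expected outcome.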

In [8]:
# ---------------------------------------------------------------------------
# 4.5  Bivariate Scatter Plots with Regression Lines
# ---------------------------------------------------------------------------

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# --- Salary vs Years of Experience (colored by Gender) ---
for i, g in enumerate(['Male', 'Female']):
    subset = df[df['gender'] == g]
    axes[0].scatter(subset['years_experience'], subset['annual_salary'],
                    alpha=0.45, s=25, color=PALETTE[i], label=g, edgecolors='white',
                    linewidth=0.3)
    # Add regression line
    z = np.polyfit(subset['years_experience'], subset['annual_salary'], 1)
    p = np.poly1d(z)
    x_line = np.linspace(subset['years_experience'].min(),
                         subset['years_experience'].max(), 100)
    axes[0].plot(x_line, p(x_line), color=PALETTE[i], linewidth=2, linestyle='--')

axes[0].set_title('Salary vs. Years of Experience by Gender', fontweight='bold')
axes[0].set_xlabel('Years of Experience')
axes[0].set_ylabel('Annual Salary (USD)')
axes[0].legend(title='Gender', fontsize=9)

# --- Salary vs Performance Score (colored by Department) ---
dept_list = sorted(df['department'].unique())
for i, dept in enumerate(dept_list):
    subset = df[df['department'] == dept]
    axes[1].scatter(subset['performance_score'], subset['annual_salary'],
                    alpha=0.45, s=25, color=PALETTE[i], label=dept, edgecolors='white',
                    linewidth=0.3)

# Overall regression line
z_all = np.polyfit(df['performance_score'], df['annual_salary'], 1)
p_all = np.poly1d(z_all)
x_all = np.linspace(df['performance_score'].min(), df['performance_score'].max(), 100)
axes[1].plot(x_all, p_all(x_all), color='black', linewidth=2, linestyle='-',
             label='Overall trend')

axes[1].set_title('Salary vs. Performance Score by Department', fontweight='bold')
axes[1].set_xlabel('Performance Score')
axes[1].set_ylabel('Annual Salary (USD)')
axes[1].legend(title='Department', fontsize=8, ncol=2)

fig.suptitle('Figure 5: Bivariate Relationships Between Salary and Key Predictors',
             fontsize=13, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Print regression slopes
print("\nBivariate regression slopes (salary ~ predictor):\n")
for var, label in [('years_experience', 'Years of Experience'),
                   ('performance_score', 'Performance Score')]:
    slope, intercept = np.polyfit(df[var], df['annual_salary'], 1)
    r = df[var].corr(df['annual_salary'])
    print(f"  {label:25s}  slope = ${slope:>8,.0f}/unit,  r = {r:+.3f}")
[Figure 5: Bivariate Relationships Between Salary and Key Predictors]
Bivariate regression slopes (salary ~ predictor):

  Years of Experience        slope = $     838/unit,  r = +0.658
  Performance Score          slope = $     279/unit,  r = +0.273

Summary of Exploratory Data Analysis¶

The exploratory analysis has revealed several key findings that inform the subsequent formal statistical modeling:

  1. Distributional adequacy. The continuous variables exhibit approximately normal distributions without severe skewness, heavy tails, or outlier contamination. The categorical variables are reasonably balanced across their respective categories. These properties are favorable for the parametric methods employed in Sections 5--8.

  2. Dominant predictors. Years of experience and age emerge as the strongest bivariate correlates of salary, followed by performance score. The box plot analysis additionally shows a clear monotonic salary pattern across education levels and systematic mean differences across departments.

  3. Multicollinearity. The high correlation between age and years of experience ($r > 0.8$) presents a potential multicollinearity concern for the regression model. This will be monitored through variance inflation factor diagnostics.

  4. Gender pay differential. A visible gap in median salary between male and female employees is observed, warranting formal hypothesis testing to assess its statistical significance and practical magnitude.

  5. Education premium. A monotonically increasing relationship between education level and salary is evident, with a particularly pronounced premium for PhD holders. The one-way ANOVA and post hoc comparisons in Section 6 will determine which pairwise differences are statistically significant.

  6. Departmental variation. Engineering and Finance departments appear to command salary premia relative to the other departments. This pattern will be tested formally through ANOVA.

These preliminary observations establish a foundation for the formal statistical analyses that follow. The multiple regression model in Section 5 will estimate the conditional effects of each predictor while controlling for the others, and the hypothesis tests in Section 6 will provide rigorous statistical evaluation of the group differences identified here.

5. Multiple Linear Regression¶

Multiple linear regression is employed to identify and quantify the relationship between annual salary and the set of employee characteristics. The ordinary least squares (OLS) method minimizes the sum of squared residuals $\sum_{i=1}^n e_i^2$, where $e_i = Y_i - \hat{Y}_i$ denotes the $i$-th residual. The model specification for this analysis includes both continuous predictors (years of experience, performance score, hours per week, number of projects, training hours, satisfaction score, remote work percentage, team size) and categorical predictors (gender, department, education level) encoded as dummy variables with one reference category dropped to avoid perfect multicollinearity.

The null hypothesis for each coefficient is $H_0: \beta_j = 0$, tested against $H_1: \beta_j \neq 0$ using t-statistics. The overall model significance is assessed via the F-test, which evaluates $H_0: \beta_1 = \beta_2 = \cdots = \beta_p = 0$.

In [9]:
# 5.1 Model Fitting
# Create dummy variables for categorical predictors
df_reg = df.copy()
df_reg = pd.get_dummies(df_reg, columns=['gender', 'department', 'education_level'],
                         drop_first=True, dtype=float)

# Define feature matrix and response
exclude_cols = ['employee_id', 'annual_salary']
feature_cols = [c for c in df_reg.columns if c not in exclude_cols]
X = df_reg[feature_cols]
y = df_reg['annual_salary']

# Add constant for intercept
X_const = sm.add_constant(X)

# Fit OLS model
model = sm.OLS(y, X_const).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:          annual_salary   R-squared:                       0.619
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     46.04
Date:                Sat, 14 Feb 2026   Prob (F-statistic):           1.83e-89
Time:                        13:48:24   Log-Likelihood:                -5173.7
No. Observations:                 500   AIC:                         1.038e+04
Df Residuals:                     482   BIC:                         1.046e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
============================================================================================
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                     2.473e+04   5265.592      4.697      0.000    1.44e+04    3.51e+04
age                        107.0399    125.672      0.852      0.395    -139.893     353.973
years_experience           733.0495    124.517      5.887      0.000     488.386     977.713
performance_score          294.5629     32.413      9.088      0.000     230.875     358.251
hours_per_week              -6.6707     73.387     -0.091      0.928    -150.868     137.527
num_projects               -90.8473    166.043     -0.547      0.585    -417.105     235.411
training_hours             -82.1055     41.914     -1.959      0.051    -164.462       0.251
satisfaction_score         384.3332    279.511      1.375      0.170    -164.877     933.544
remote_work_pct            -29.0946     23.036     -1.263      0.207     -74.359      16.169
team_size                  -69.6536     67.662     -1.029      0.304    -202.602      63.295
gender_Male               2081.6546    700.748      2.971      0.003     704.756    3458.553
department_Finance         289.6094   1113.175      0.260      0.795   -1897.665    2476.884
department_HR            -2006.3716   1082.572     -1.853      0.064   -4133.515     120.772
department_Marketing     -1351.9471   1097.580     -1.232      0.219   -3508.579     804.685
department_Sales         -3743.9406   1157.148     -3.235      0.001   -6017.619   -1470.262
education_level_Bachelor   331.8999   1026.019      0.323      0.746   -1684.122    2347.922
education_level_Master    4969.7855   1063.901      4.671      0.000    2879.329    7060.242
education_level_PhD       1.175e+04   1305.354      8.999      0.000    9181.594    1.43e+04
==============================================================================
Omnibus:                        0.306   Durbin-Watson:                   2.068
Prob(Omnibus):                  0.858   Jarque-Bera (JB):                0.414
Skew:                           0.038   Prob(JB):                        0.813
Kurtosis:                       2.882   Cond. No.                     1.57e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.57e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

5.1.1 Interpretation of the Regression Results¶

Overall Model Fit¶

The multiple linear regression model achieves a strong overall fit to the data. The coefficient of determination $R^2$ indicates that the included predictors collectively explain a substantial proportion of the variation in annual salary. The adjusted $R^2_{adj}$, which penalizes for model complexity, remains close to the unadjusted value, confirming that the explanatory power is not merely an artifact of including many predictors. The F-statistic for the joint test $H_0: \beta_1 = \beta_2 = \cdots = \beta_p = 0$ is highly significant ($p < 0.001$), providing strong evidence to reject $H_0$ in favor of the conclusion that at least one predictor has a non-zero partial effect on salary.
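As a quick arithmetic cross-check, the F-statistic reported in the summary follows directly from $R^2$ and the degrees of freedom via $F = \dfrac{R^2 / p}{(1 - R^2)/(n - p - 1)}$:

```python
# Values taken from the OLS summary table above
r_squared = 0.619
df_model = 17            # p
df_resid = 482           # n - p - 1 = 500 - 17 - 1

f_stat = (r_squared / df_model) / ((1 - r_squared) / df_resid)
print(f"F = {f_stat:.2f}")  # close to the reported 46.04 (difference is R^2 rounding)
```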

Significant Predictors¶

Among the continuous predictors, years of experience emerges as the dominant determinant of compensation. The estimated coefficient $\hat{\beta}_{\text{experience}} \approx 733$ indicates that each additional year of professional experience is associated with an increase in annual salary of approximately USD 730, holding all other predictors constant. This finding is consistent with human capital theory, which posits that accumulated work experience enhances worker productivity and is compensated accordingly (Mincer, 1974).

Performance score is the second most important continuous predictor. The positive and highly significant coefficient $\hat{\beta}_{\text{performance}}$ implies that a one-unit increase in the performance rating is associated with a salary increment of approximately USD 300, ceteris paribus. This result aligns with the expectation that organizations reward measurable contributions to productivity.

The education level dummy variables reveal a clear education premium. Relative to the omitted reference category (High School), the Bachelor's coefficient is small and not statistically significant, whereas Master's degree holders earn a significant premium of approximately USD 5,000 and PhD holders a substantially larger premium of approximately USD 11,800. The widening of the premium from Master's to PhD suggests increasing marginal returns to graduate education, particularly at the doctoral level.

Department effects are partially significant. Relative to the omitted reference department (Engineering), the Sales coefficient is negative and statistically significant (approximately USD 3,700 lower, $p = 0.001$), and the HR coefficient is negative but only marginally significant ($p = 0.064$); the Finance and Marketing coefficients are not statistically distinguishable from zero. Engineering and Finance thus sit at the top of the conditional pay distribution, consistent with compensating differentials theory, in which occupations requiring specialized technical or quantitative skills command higher wages, while the Sales shortfall persists even after controlling for individual characteristics.

The gender coefficient (for the Male dummy) is positive and statistically significant (approximately USD 2,100, $p = 0.003$), indicating that female employees earn less than their male counterparts after controlling for experience, performance, education, department, and all other included covariates. This finding is consistent with the gender pay differential embedded in the data-generating process and represents the conditional wage gap net of observable human capital differences.

Non-Significant Predictors¶

Several predictors fail to achieve statistical significance at the $\alpha = 0.05$ level. Most notably, age is non-significant ($p = 0.395$) despite its strong bivariate correlation with salary: once years of experience is included, age carries little independent information, precisely the multicollinearity pattern anticipated in Section 4. The coefficients for hours per week, number of projects, satisfaction score, remote work percentage, and team size are likewise not statistically distinguishable from zero, and training hours is only borderline ($p = 0.051$). This does not necessarily imply that these variables are irrelevant to compensation in general; rather, it indicates that their partial effects, conditional on the other included predictors, are not detectable given the sample size and noise level. In particular, the non-significance of hours per week may reflect the fact that salary in the generating process depends on experience and performance rather than effort intensity, while the weak training-hours effect may indicate that training investments are not directly rewarded in short-term compensation.

In [10]:
# 5.2 Regression Diagnostic Plots
fitted_vals = model.fittedvalues
residuals = model.resid
standardized_resid = model.get_influence().resid_studentized_internal

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (1) Residuals vs Fitted
axes[0, 0].scatter(fitted_vals, residuals, alpha=0.4, s=15, color=PALETTE[0])
axes[0, 0].axhline(y=0, color='red', linestyle='--', linewidth=1)
axes[0, 0].set_xlabel('Fitted Values')
axes[0, 0].set_ylabel('Residuals')
axes[0, 0].set_title('Residuals vs. Fitted Values')
# Add lowess smooth
from statsmodels.nonparametric.smoothers_lowess import lowess
smooth = lowess(residuals, fitted_vals, frac=0.3)
axes[0, 0].plot(smooth[:, 0], smooth[:, 1], color='red', linewidth=1.5)

# (2) Q-Q Plot
from scipy.stats import probplot
probplot(standardized_resid, dist="norm", plot=axes[0, 1])
axes[0, 1].set_title('Normal Q-Q Plot')
axes[0, 1].get_lines()[0].set(color=PALETTE[1], markersize=3, alpha=0.5)
axes[0, 1].get_lines()[1].set(color='red', linewidth=1.5)

# (3) Scale-Location
sqrt_abs_resid = np.sqrt(np.abs(standardized_resid))
axes[1, 0].scatter(fitted_vals, sqrt_abs_resid, alpha=0.4, s=15, color=PALETTE[2])
axes[1, 0].set_xlabel('Fitted Values')
axes[1, 0].set_ylabel(r'$\sqrt{|\mathrm{Standardized\ Residuals}|}$')
axes[1, 0].set_title('Scale-Location Plot')
smooth2 = lowess(sqrt_abs_resid, fitted_vals, frac=0.3)
axes[1, 0].plot(smooth2[:, 0], smooth2[:, 1], color='red', linewidth=1.5)

# (4) Residuals vs Leverage
influence = model.get_influence()
leverage = influence.hat_matrix_diag
cooks_d = influence.cooks_distance[0]
axes[1, 1].scatter(leverage, standardized_resid, alpha=0.4, s=15,
                    c=cooks_d, cmap='YlOrRd', edgecolors='gray', linewidth=0.3)
axes[1, 1].axhline(y=0, color='gray', linestyle='--', linewidth=0.8)
axes[1, 1].set_xlabel("Leverage ($h_{ii}$)")
axes[1, 1].set_ylabel('Standardized Residuals')
axes[1, 1].set_title("Residuals vs. Leverage (Cook's Distance)")
# Mark Cook's distance threshold
p = X_const.shape[1]   # number of estimated parameters (incl. intercept)
n = X_const.shape[0]   # number of observations
cooks_threshold = 4 / (n - p)
high_cook = cooks_d > cooks_threshold
if high_cook.any():
    axes[1, 1].scatter(leverage[high_cook], standardized_resid[high_cook],
                        s=50, facecolors='none', edgecolors='red', linewidth=1.5)

plt.tight_layout()
plt.savefig('regression_diagnostics.png', dpi=150, bbox_inches='tight')
plt.show()
[Regression diagnostic plots: residuals vs. fitted, normal Q-Q, scale-location, residuals vs. leverage]

5.2.1 Interpretation of Diagnostic Plots¶

The four-panel diagnostic display provides a comprehensive assessment of the OLS regression assumptions. Each plot targets a specific assumption, and together they determine whether the inferential conclusions drawn from the model are statistically valid.

Residuals vs. Fitted Values¶

The residuals-versus-fitted-values plot assesses the linearity and homoscedasticity assumptions. In a well-specified model, the residuals should be randomly scattered around the horizontal zero line with no discernible systematic pattern. The lowess smoother overlaid on the plot provides a non-parametric assessment of any remaining curvature. If the smoother is approximately flat and centered at zero, the linearity assumption is supported. A funnel-shaped or fan-shaped pattern in the scatter would indicate heteroscedasticity---a violation of the constant-variance assumption. For the present model, the residuals appear to be approximately randomly distributed around zero without a pronounced systematic pattern, and the lowess curve remains close to the horizontal axis across the range of fitted values. This suggests that the linearity assumption is reasonable and that severe heteroscedasticity is not present.

Normal Q-Q Plot¶

The quantile-quantile plot compares the empirical distribution of the standardized residuals against the theoretical quantiles of the standard normal distribution. If the normality assumption holds, the points should fall approximately along the 45-degree reference line. Deviations in the tails indicate heavier or lighter tails than the normal distribution, while systematic curvature suggests skewness. For the present analysis, the central portion of the Q-Q plot adheres closely to the reference line, indicating that the bulk of the residual distribution is well-approximated by normality. Minor deviations in the extreme tails are common in practice and do not invalidate inference for the sample size ($n = 500$), as the Central Limit Theorem ensures that OLS coefficient estimates are approximately normally distributed in large samples regardless of the exact error distribution.
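Beyond the visual Q-Q check, residual normality can be tested formally. A sketch using the Jarque-Bera test on simulated residuals (names illustrative):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
resid_sim = rng.normal(size=500)   # stand-in for the model residuals

# Jarque-Bera combines sample skewness and excess kurtosis;
# under normality the statistic is asymptotically chi-square with 2 df.
jb_stat, jb_pvalue = stats.jarque_bera(resid_sim)
print(f"JB = {jb_stat:.3f}, p = {jb_pvalue:.3f}")
```

As with the Q-Q plot, mild rejections at $n = 500$ are of limited practical concern for coefficient inference, given the Central Limit Theorem.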

Scale-Location Plot¶

The scale-location plot displays $\sqrt{|\text{standardized residuals}|}$ against fitted values and provides an alternative assessment of homoscedasticity. A horizontal lowess line indicates constant variance, while an upward or downward trend suggests that the residual spread changes with the predicted salary level. For the present model, the lowess curve is approximately flat, consistent with the absence of systematic heteroscedasticity. If mild departures are observed, they may motivate the use of heteroscedasticity-robust (HC) standard errors, though the primary OLS inference remains valid as a first-order approximation.

Residuals vs. Leverage¶

The residuals-versus-leverage plot identifies influential observations---data points that exert a disproportionate influence on the fitted regression surface. Observations with high leverage ($h_{ii}$ values far from the mean leverage $\bar{h} = p/n$) are located in regions of predictor space that are sparsely populated. Observations with both high leverage and large residuals are particularly influential. Cook's distance, encoded through the color gradient, provides a summary measure: observations with Cook's $D_i > 4/(n-p)$ are flagged as potentially influential and highlighted with red circles. The presence of a small number of influential points does not invalidate the regression per se, but it warrants investigation to determine whether these observations represent genuine data patterns or are artifacts of data entry errors or outlier contamination. For the synthetic dataset, any flagged observations are a consequence of the random noise and extreme predictor combinations rather than data quality issues.

In [11]:
# 5.3 Variance Inflation Factor Analysis
# Note: X omits the intercept column, so these are uncentered VIFs,
# which are inflated for variables whose means are far from zero.
vif_data = pd.DataFrame()
vif_data['Variable'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data = vif_data.sort_values('VIF', ascending=False).reset_index(drop=True)
print("Variance Inflation Factors (VIF)")
print("=" * 45)
print(vif_data.to_string(index=False))
print(f"\nVariables with VIF > 5: {vif_data[vif_data['VIF'] > 5]['Variable'].tolist()}")
print(f"Variables with VIF > 10: {vif_data[vif_data['VIF'] > 10]['Variable'].tolist()}")
Variance Inflation Factors (VIF)
=============================================
                Variable        VIF
                     age 166.248065
          hours_per_week  48.912517
        years_experience  42.540888
       performance_score  40.572153
      satisfaction_score  21.922551
          training_hours  12.022048
            num_projects   7.867542
               team_size   6.054581
         remote_work_pct   5.330057
education_level_Bachelor   3.375360
  education_level_Master   3.001300
           department_HR   2.209555
             gender_Male   2.207027
    department_Marketing   2.143510
      department_Finance   2.066237
        department_Sales   1.906436
     education_level_PhD   1.828515

Variables with VIF > 5: ['age', 'hours_per_week', 'years_experience', 'performance_score', 'satisfaction_score', 'training_hours', 'num_projects', 'team_size', 'remote_work_pct']
Variables with VIF > 10: ['age', 'hours_per_week', 'years_experience', 'performance_score', 'satisfaction_score', 'training_hours']

5.3.1 Interpretation of Variance Inflation Factors¶

The variance inflation factor (VIF) analysis quantifies the degree to which each predictor's coefficient variance is inflated due to linear dependence with the other predictors. Recall that $\text{VIF}_j = 1/(1 - R^2_j)$, where $R^2_j$ is the coefficient of determination from regressing $X_j$ on all remaining predictors. A VIF of 1 indicates no collinearity, values between 1 and 5 are generally considered acceptable, values between 5 and 10 indicate moderate collinearity warranting caution, and values exceeding 10 signal severe multicollinearity that may substantially distort coefficient estimates and their standard errors (Kutner et al., 2005).

As anticipated from the exploratory correlation analysis, age and years of experience are strongly dependent ($r > 0.8$) and both appear near the top of the VIF table. The extreme magnitudes in the table, however (e.g., a VIF of 166 for age), are largely a computational artifact: the VIFs above are calculated on a design matrix that excludes the intercept, and such uncentered VIFs conflate a variable's mean level with genuine linear dependence. Predictors with large means relative to their spread (age, hours per week, performance score) are therefore inflated regardless of collinearity; with the constant included, only the age-experience pair would be expected to exceed conventional thresholds. That pair's collinearity inflates the standard errors of both coefficients, potentially masking their individual significance even though their joint contribution to the model is substantial. In applied settings, one common remediation strategy is to drop one of the two collinear variables; however, since both carry distinct substantive meaning (biological age versus accumulated professional tenure), retaining both provides a more complete picture at the cost of less precise individual coefficient estimates.

Beyond this pair, multicollinearity is not a pervasive concern: after the intercept correction, the remaining continuous predictors are expected to display centered VIFs well below 5. The dummy variables for the categorical predictors (gender, department, education level) show modest VIF values below 3.5, confirming that these grouping variables are approximately orthogonal to the continuous predictors and to each other after conditioning on the rest of the model.

5.4 Critical Assessment of the Regression Analysis¶

While the OLS regression model provides a useful characterization of the conditional salary structure, several limitations and caveats merit discussion.

Linearity assumption. The model assumes that the conditional expectation of salary is a linear function of the predictors. While the diagnostic plots do not reveal severe departures from linearity, the true relationship may include non-linear effects (e.g., diminishing returns to experience at very high tenure levels) or interaction terms (e.g., differential returns to education across departments). The present specification assumes purely additive effects, and no interaction terms have been included. A richer model incorporating interactions and polynomial terms could be explored in future work.
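A richer specification of the kind suggested above is straightforward with the statsmodels formula interface. A sketch on simulated data (column names are illustrative, not the thesis variables):

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(4)
n = 300
demo = pd.DataFrame({
    'salary': rng.normal(60000, 10000, n),
    'experience': rng.uniform(0, 30, n),
    'department': rng.choice(['Eng', 'HR'], n),
})

# Relax the purely additive specification: an experience-by-department
# interaction plus a quadratic term for diminishing returns to experience.
model = smf.ols('salary ~ experience * department + I(experience ** 2)',
                data=demo).fit()
print(model.params.index.tolist())
```

The `*` operator expands to main effects plus the interaction, so the fit estimates five parameters: intercept, department dummy, experience, their interaction, and the quadratic term.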

Homoscedasticity. The constant-variance assumption underpins the standard OLS inference. Although the diagnostic plots suggest approximate homoscedasticity, mild heteroscedasticity---particularly if the residual variance increases with predicted salary---could lead to inefficient estimates and incorrect standard errors. The use of heteroscedasticity-consistent (HC) standard errors (White, 1980) would provide robustness against this concern without requiring correct specification of the variance function.

Omitted variable bias. The model includes a fixed set of predictors, and the estimated coefficients are conditional on this particular specification. If relevant determinants of salary are omitted from the model (e.g., industry tenure, negotiation skill, firm-specific factors), the included coefficients may be biased. In the context of synthetic data, the data-generating process is known and the model is approximately correctly specified, but in real-world applications, omitted variable bias is a pervasive concern that limits the causal interpretability of OLS coefficients.

Endogeneity. The OLS estimator assumes that the regressors are exogenous, i.e., $E[\epsilon \mid \mathbf{X}] = 0$. In observational compensation data, endogeneity may arise from reverse causality (e.g., higher salary enabling more training hours) or simultaneous determination (e.g., performance and compensation jointly influenced by unobserved ability). Instrumental variable methods or quasi-experimental designs would be required to establish causal effects, but these are beyond the scope of the present analysis.

Synthetic data caveat. Because the dataset is generated from a known process, the regression analysis is guaranteed to recover the embedded structure (up to noise). In real-world applications, model misspecification, measurement error, and selection bias introduce additional challenges that are absent here. The results should therefore be interpreted as a methodological demonstration rather than an empirical finding about actual compensation determinants.

6. Hypothesis Testing¶

This section applies formal hypothesis testing procedures to evaluate specific claims about compensation differences across demographic and organizational groups. Three distinct tests are conducted: a two-sample Welch's t-test to assess gender-based salary differences, a one-way analysis of variance (ANOVA) to examine departmental compensation variation, and a chi-square test of independence to evaluate the association between department membership and educational attainment. For each test, the null and alternative hypotheses are stated explicitly, test statistics and p-values are reported, and effect sizes are computed to assess practical significance alongside statistical significance.

In [12]:
# 6.1 Two-Sample Welch's t-Test: Gender Pay Gap

male_salary = df[df['gender'] == 'Male']['annual_salary']
female_salary = df[df['gender'] == 'Female']['annual_salary']

print("Hypothesis Test: Gender Pay Gap")
print("=" * 60)
print(f"H\u2080: \u03bc_male = \u03bc_female (no difference in mean salary)")
print(f"H\u2081: \u03bc_male \u2260 \u03bc_female (two-sided test)")
print(f"Significance level: \u03b1 = 0.05")
print()

# Descriptive statistics
print(f"Male employees:   n = {len(male_salary)}, Mean = ${male_salary.mean():,.0f}, SD = ${male_salary.std():,.0f}")
print(f"Female employees: n = {len(female_salary)}, Mean = ${female_salary.mean():,.0f}, SD = ${female_salary.std():,.0f}")
print(f"Mean difference:  ${male_salary.mean() - female_salary.mean():,.0f}")
print()

# Welch's t-test using statsmodels for df and CI
d1 = DescrStatsW(male_salary)
d2 = DescrStatsW(female_salary)
cm = CompareMeans(d1, d2)

# t-test
t_stat, p_value, df_test = cm.ttest_ind(usevar='unequal')
print(f"Welch's t-statistic: t = {t_stat:.4f}")
print(f"Degrees of freedom:  df = {df_test:.1f}")
print(f"p-value (two-sided): p = {p_value:.6f}")

# Confidence interval for the difference
ci_low, ci_high = cm.tconfint_diff(usevar='unequal')
print(f"95% CI for difference: [${ci_low:,.0f}, ${ci_high:,.0f}]")

# Cohen's d
pooled_std = np.sqrt(((len(male_salary) - 1) * male_salary.std()**2 +
                       (len(female_salary) - 1) * female_salary.std()**2) /
                      (len(male_salary) + len(female_salary) - 2))
cohens_d = (male_salary.mean() - female_salary.mean()) / pooled_std
print(f"\nEffect size (Cohen's d): d = {cohens_d:.4f}")
interpretation = "negligible" if abs(cohens_d) < 0.2 else "small" if abs(cohens_d) < 0.5 else "medium" if abs(cohens_d) < 0.8 else "large"
print(f"Interpretation: {interpretation} effect")

# Decision
if p_value < 0.05:
    print(f"\nDecision: Reject H\u2080 at \u03b1 = 0.05. There is statistically significant evidence")
    print(f"of a difference in mean salary between male and female employees.")
else:
    print(f"\nDecision: Fail to reject H\u2080 at \u03b1 = 0.05.")
Hypothesis Test: Gender Pay Gap
============================================================
H₀: μ_male = μ_female (no difference in mean salary)
H₁: μ_male ≠ μ_female (two-sided test)
Significance level: α = 0.05

Male employees:   n = 269, Mean = $63,832, SD = $12,484
Female employees: n = 231, Mean = $62,082, SD = $11,887
Mean difference:  $1,749

Welch's t-statistic: t = 1.6029
Degrees of freedom:  df = 492.7
p-value (two-sided): p = 0.109597
95% CI for difference: [$-395, $3,894]

Effect size (Cohen's d): d = 0.1432
Interpretation: negligible effect

Decision: Fail to reject H₀ at α = 0.05.

6.1.1 Interpretation of the Gender Pay Gap Test¶

The Welch two-sample $t$-test provides formal statistical evidence regarding the gender pay differential observed in the exploratory analysis. The test employs the Welch correction for unequal variances, which adjusts the degrees of freedom via the Satterthwaite approximation to yield a more robust inference when the homoscedasticity assumption may not hold.
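Concretely, for sample variances $s_1^2, s_2^2$ and group sizes $n_1, n_2$, the Satterthwaite approximation gives

$$\nu \;\approx\; \frac{\left(s_1^2/n_1 + s_2^2/n_2\right)^2}{\dfrac{\left(s_1^2/n_1\right)^2}{n_1 - 1} + \dfrac{\left(s_2^2/n_2\right)^2}{n_2 - 1}}.$$

Substituting the reported group statistics ($n_1 = 269$, $s_1 \approx 12{,}484$; $n_2 = 231$, $s_2 \approx 11{,}887$) reproduces the printed $df \approx 492.7$.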

The Welch test does not reject $H_0$: the observed mean difference of approximately $1,750 yields $t = 1.60$ with $p \approx 0.11$, above the $\alpha = 0.05$ threshold. The 95% confidence interval for the difference, [-$395, $3,894], contains zero, corroborating this conclusion while also showing that gaps as large as roughly $3,900 remain compatible with the data. The effect size tells a consistent story: the data-generating process embedded a $2,000 gender penalty against female employees within a salary distribution with a standard deviation of approximately $12,000, implying a true standardized effect of only about 0.17, and the observed Cohen's $d = 0.14$ is negligible by conventional benchmarks. With roughly 250 employees per group, the test has limited power to detect an effect of this magnitude, so the non-significant result is best read as a plausible Type II error rather than as evidence that no gap exists.
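A quick power calculation puts this in context. A sketch using statsmodels, where the effect size of 0.17 follows from the stated $2,000 shift against a standard deviation of roughly $12,000:

```python
from statsmodels.stats.power import TTestIndPower

# Power to detect the embedded effect at the observed group sizes
# (n1 = 269 male, n2 = 231 female employees).
analysis = TTestIndPower()
power = analysis.power(effect_size=0.17, nobs1=269, ratio=231 / 269, alpha=0.05)
print(f"Approximate power: {power:.2f}")
```

Power well below the conventional 0.80 target means a true effect of this size would frequently go undetected at this sample size.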

An important limitation of this bivariate test is that it does not control for confounding variables. Male and female employees may differ systematically in years of experience, education level, department membership, or performance score, any of which could partially explain the observed salary gap. The multiple regression analysis in Section 5 addresses this limitation by estimating the gender coefficient while holding other predictors constant, thereby providing a more nuanced assessment of the conditional gender effect. The regression framework reveals the extent to which the unadjusted gap persists after accounting for legitimate determinants of compensation.

In [13]:
# 6.2 One-Way ANOVA: Salary by Department

print("One-Way ANOVA: Annual Salary by Department")
print("=" * 60)
print(f"H\u2080: \u03bc\u2081 = \u03bc\u2082 = \u03bc\u2083 = \u03bc\u2084 = \u03bc\u2085 (equal mean salaries across departments)")
print(f"H\u2081: At least one \u03bc\u1d62 differs")
print(f"Significance level: \u03b1 = 0.05")
print()

# Group statistics
dept_stats = df.groupby('department')['annual_salary'].agg(['count', 'mean', 'std'])
dept_stats.columns = ['n', 'Mean', 'SD']
dept_stats['Mean'] = dept_stats['Mean'].map('${:,.0f}'.format)
dept_stats['SD'] = dept_stats['SD'].map('${:,.0f}'.format)
print("Department Statistics:")
print(dept_stats.to_string())
print()

# ANOVA
groups = [group['annual_salary'].values for name, group in df.groupby('department')]
f_stat, p_value_anova = stats.f_oneway(*groups)
print(f"F-statistic: F = {f_stat:.4f}")
print(f"p-value: p = {p_value_anova:.6f}")

# Eta-squared effect size
ss_between = sum(len(g) * (g.mean() - df['annual_salary'].mean())**2 for g in groups)
ss_total = sum((df['annual_salary'] - df['annual_salary'].mean())**2)
eta_squared = ss_between / ss_total
print(f"Effect size (\u03b7\u00b2): {eta_squared:.4f}")
interpretation_eta = "small" if eta_squared < 0.06 else "medium" if eta_squared < 0.14 else "large"
print(f"Interpretation: {interpretation_eta} effect")

if p_value_anova < 0.05:
    print(f"\nDecision: Reject H\u2080. Significant differences exist between departments.")
else:
    print(f"\nDecision: Fail to reject H\u2080.")

# Post-hoc Tukey HSD
print("\n" + "=" * 60)
print("Post-Hoc Analysis: Tukey's HSD")
print("=" * 60)
tukey = pairwise_tukeyhsd(df['annual_salary'], df['department'], alpha=0.05)
print(tukey.summary())

# Visualization: Means plot with 95% CI
fig, ax = plt.subplots(figsize=(10, 6))
dept_means = df.groupby('department')['annual_salary'].agg(['mean', 'std', 'count'])
dept_means['se'] = dept_means['std'] / np.sqrt(dept_means['count'])
dept_means['ci95'] = 1.96 * dept_means['se']
dept_means = dept_means.sort_values('mean', ascending=True)

colors = [PALETTE[i] for i in range(len(dept_means))]
bars = ax.barh(range(len(dept_means)), dept_means['mean'],
               xerr=dept_means['ci95'], capsize=5, color=colors,
               edgecolor='gray', linewidth=0.5, alpha=0.85)
ax.set_yticks(range(len(dept_means)))
ax.set_yticklabels(dept_means.index)
ax.set_xlabel('Mean Annual Salary ($)')
ax.set_title('Mean Annual Salary by Department with 95% Confidence Intervals')
ax.axvline(x=df['annual_salary'].mean(), color='red', linestyle='--',
           linewidth=1, label=f'Overall Mean (${df["annual_salary"].mean():,.0f})')
ax.legend()
plt.tight_layout()
plt.show()
One-Way ANOVA: Annual Salary by Department
============================================================
H₀: μ₁ = μ₂ = μ₃ = μ₄ = μ₅ (equal mean salaries across departments)
H₁: At least one μᵢ differs
Significance level: α = 0.05

Department Statistics:
               n     Mean       SD
department                        
Engineering   96  $64,342  $12,774
Finance      100  $64,445  $11,421
HR           112  $62,020  $12,760
Marketing    106  $62,092  $12,102
Sales         86  $62,355  $11,965

F-statistic: F = 1.0234
p-value: p = 0.394618
Effect size (η²): 0.0082
Interpretation: small effect

Decision: Fail to reject H₀.

============================================================
Post-Hoc Analysis: Tukey's HSD
============================================================
        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
===================================================================
   group1     group2   meandiff  p-adj    lower      upper   reject
-------------------------------------------------------------------
Engineering   Finance   103.3333    1.0 -4681.0268 4887.6935  False
Engineering        HR -2322.0238 0.6504 -6979.1551 2335.1075  False
Engineering Marketing -2250.1572 0.6877 -6967.7297 2467.4152  False
Engineering     Sales -1987.0155 0.8095 -6958.4566 2984.4256  False
    Finance        HR -2425.3571 0.6011 -7032.0617 2181.3474  False
    Finance Marketing -2353.4906 0.6405 -7021.2893 2314.3082  False
    Finance     Sales -2090.3488 0.7729 -7014.5829 2833.8853  False
         HR Marketing    71.8666    1.0 -4465.4361 4609.1692  False
         HR     Sales   335.0083 0.9997 -4465.7053  5135.722  False
  Marketing     Sales   263.1417 0.9999 -4596.2275  5122.511  False
-------------------------------------------------------------------
[Figure: mean annual salary by department with 95% confidence intervals and overall-mean reference line]

6.2.1 Interpretation of the ANOVA Results¶

The one-way ANOVA tests whether mean annual salary differs significantly across the five departments. The $F$-statistic compares the between-group variance (systematic departmental differences) to the within-group variance (individual variation within departments). A statistically significant $F$-test indicates that at least one department's mean salary differs from the others, but does not identify which specific pairs differ.

The $\eta^2$ (eta-squared) effect size quantifies the proportion of total salary variance attributable to departmental membership. Following conventional benchmarks, $\eta^2 < 0.06$ indicates a small effect, $0.06 \le \eta^2 < 0.14$ a medium effect, and $\eta^2 \ge 0.14$ a large effect. The observed effect size should be interpreted in conjunction with the $p$-value: statistical significance confirms the existence of departmental differences, while $\eta^2$ characterizes their practical magnitude.

The post-hoc Tukey HSD (Honestly Significant Difference) procedure controls the family-wise error rate at $\alpha = 0.05$ across all $\binom{5}{2} = 10$ pairwise comparisons. Here the omnibus $F$-test is not significant ($F = 1.02$, $p \approx 0.39$, $\eta^2 \approx 0.008$), and, consistently, none of the ten Tukey comparisons is rejected: every simultaneous confidence interval spans zero. The data-generating process embedded a $3,000 premium for Engineering and a $2,000 premium for Finance, and the sample means point in that direction (Engineering and Finance exceed the other three departments by roughly $2,000 to $2,400), but against a within-group standard deviation of about $12,000 and roughly 100 employees per department these premiums are not statistically distinguishable. As with the gender comparison, the non-significant result likely reflects limited power (a plausible Type II error) rather than the absence of departmental effects; the Tukey intervals make this residual uncertainty explicit.

Regarding the assumptions underlying ANOVA: (i) independence is satisfied by design, as observations are generated independently; (ii) normality within each group is approximately satisfied, supported by the Central Limit Theorem given approximately 100 observations per department; and (iii) homogeneity of variances is a reasonable approximation, as the within-group standard deviations are similar across departments. Levene's test could be employed to formally verify the equal-variance assumption, and Welch's ANOVA provides a robust alternative if the assumption is violated.
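Levene's test is available directly in SciPy. A minimal sketch on simulated departmental salaries (parameters illustrative):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(6)
# Five simulated departments with equal variances by construction
groups = [rng.normal(62000, 12000, 100) for _ in range(5)]

# Median-centered Levene's test (the Brown-Forsythe variant) is robust
# to non-normality when checking homogeneity of variances.
w_stat, p_levene = stats.levene(*groups, center='median')
print(f"W = {w_stat:.3f}, p = {p_levene:.3f}")
```

A small p-value would signal unequal variances and motivate Welch's ANOVA in place of the classical $F$-test.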

In [14]:
# 6.3 Chi-Square Test of Independence

print("Chi-Square Test of Independence: Department \u00d7 Education Level")
print("=" * 60)
print("H\u2080: Department and education level are independent")
print("H\u2081: Department and education level are not independent")
print(f"Significance level: \u03b1 = 0.05")
print()

# Contingency table
contingency = pd.crosstab(df['department'], df['education_level'])
print("Observed Frequencies:")
print(contingency)
print()

# Chi-square test
chi2, p_chi, dof, expected = stats.chi2_contingency(contingency)
print(f"\u03c7\u00b2 statistic: {chi2:.4f}")
print(f"Degrees of freedom: {dof}")
print(f"p-value: {p_chi:.4f}")

# Cramer's V
n_obs = contingency.sum().sum()
min_dim = min(contingency.shape[0] - 1, contingency.shape[1] - 1)
cramers_v = np.sqrt(chi2 / (n_obs * min_dim))
print(f"Cram\u00e9r's V: {cramers_v:.4f}")
interpretation_v = "negligible" if cramers_v < 0.1 else "small" if cramers_v < 0.3 else "medium" if cramers_v < 0.5 else "large"
print(f"Interpretation: {interpretation_v} association")

if p_chi < 0.05:
    print(f"\nDecision: Reject H\u2080. Department and education level are not independent.")
else:
    print(f"\nDecision: Fail to reject H\u2080. No significant association detected.")

# Visualization: Heatmap of observed vs expected
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Observed
sns.heatmap(contingency, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            cbar_kws={'label': 'Count'})
axes[0].set_title('Observed Frequencies')
axes[0].set_xlabel('Education Level')
axes[0].set_ylabel('Department')

# Expected
expected_df = pd.DataFrame(expected, index=contingency.index, columns=contingency.columns)
sns.heatmap(expected_df, annot=True, fmt='.1f', cmap='Blues', ax=axes[1],
            cbar_kws={'label': 'Expected Count'})
axes[1].set_title('Expected Frequencies (Under Independence)')
axes[1].set_xlabel('Education Level')
axes[1].set_ylabel('Department')

plt.tight_layout()
plt.show()

# Check expected cell counts (assumption)
min_expected = expected.min()
pct_below_5 = (expected < 5).sum() / expected.size * 100
print(f"\nAssumption check: Minimum expected frequency = {min_expected:.1f}")
print(f"Cells with expected frequency < 5: {pct_below_5:.1f}%")
Chi-Square Test of Independence: Department × Education Level
============================================================
H₀: Department and education level are independent
H₁: Department and education level are not independent
Significance level: α = 0.05

Observed Frequencies:
education_level  High School  Bachelor  Master  PhD
department                                         
Engineering               18        37      30   11
Finance                   14        40      34   12
HR                        23        46      28   15
Marketing                 14        40      39   13
Sales                     15        29      28   14

χ² statistic: 6.8850
Degrees of freedom: 12
p-value: 0.8651
Cramér's V: 0.0677
Interpretation: negligible association

Decision: Fail to reject H₀. No significant association detected.
[Figure: heatmaps of observed and expected frequencies (under independence) for department by education level]
Assumption check: Minimum expected frequency = 11.2
Cells with expected frequency < 5: 0.0%

6.3.1 Interpretation of the Chi-Square Test and Critical Assessment¶

The Pearson chi-square test of independence examines whether a systematic association exists between department membership and educational attainment. In the data-generating process, department and education level are assigned independently: department is drawn uniformly from five categories, while education level is sampled from a fixed probability distribution that does not vary by department. Consequently, the chi-square test yields a non-significant result ($\chi^2 = 6.89$, $p \approx 0.87$), reflecting the true independence of these two variables in the population. This outcome constitutes a correct failure to reject $H_0$ (a true negative) and serves as a deliberate pedagogical illustration that "failure to reject $H_0$ is an equally valid and informative statistical outcome." Not every hypothesis test should yield a significant result; the ability to correctly identify the absence of an association is as important as detecting one that exists. Cramér's $V \approx 0.07$ corroborates the statistical conclusion by indicating a negligible magnitude of association, confirming that the lack of significance is not merely a consequence of insufficient statistical power but reflects a genuine absence of dependence in the data.

The assumption underlying the chi-square test---that all expected cell frequencies are sufficiently large (conventionally, at least 5)---is satisfied in this dataset. With $n = 500$ observations distributed across a $5 \times 4$ contingency table and approximately balanced marginal distributions, the expected frequencies are well above the minimum threshold. The heatmap comparison of observed and expected frequencies visually confirms the close correspondence between the two tables, further supporting the conclusion of independence. Had the expected frequencies been small (e.g., due to sparse categories), Fisher's exact test would have been a more appropriate alternative.
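Had the test been significant, Pearson (standardized) residuals, $(O - E)/\sqrt{E}$, would localize which cells drive the dependence. A sketch on a toy contingency table (counts illustrative, not the thesis data):

```python
import numpy as np
import pandas as pd
from scipy import stats

# Toy 3x3 contingency table of counts
observed = pd.DataFrame(
    [[18, 37, 30], [14, 40, 34], [23, 46, 28]],
    index=['Engineering', 'Finance', 'HR'],
    columns=['High School', 'Bachelor', 'Master'],
)

chi2, p, dof, expected = stats.chi2_contingency(observed)

# Pearson residuals; |r| > 2 is a rough flag for cells that depart
# notably from the independence expectation.
pearson_resid = (observed - expected) / np.sqrt(expected)
print(pearson_resid.round(2))
```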

Critical Assessment of the Hypothesis Testing Section. Several methodological considerations warrant discussion. First, three hypothesis tests are conducted in this section without applying a multiple testing correction. Under the Bonferroni correction, the family-wise significance level would be adjusted to $\alpha^* = 0.05 / 3 \approx 0.0167$ to maintain a global Type I error rate of 5%. Since all three tests fail to reject $H_0$ even at the unadjusted $\alpha = 0.05$, the correction would not alter any conclusion here, but the practice of reporting uncorrected $p$-values for a family of tests should nonetheless be acknowledged as a limitation. Second, the two-sample $t$-test assesses the unconditional gender pay gap without controlling for confounding variables such as years of experience, education level, or department. As the regression analysis in Section 5 demonstrates, the gender coefficient may differ in magnitude and significance when estimated conditionally. Third, the ANOVA $F$-test assumes homogeneity of variances across groups. While this assumption is approximately satisfied here, Levene's test could be applied for formal verification, and Welch's ANOVA provides a robust alternative when the assumption is violated. Fourth, effect sizes (Cohen's $d$, $\eta^2$, and Cramér's $V$) provide crucial context beyond $p$-values: in this section all three effect sizes are small or negligible, consistent with the uniformly non-significant test results, and the gender and department comparisons illustrate how a non-significant result obtained with low power can mask a meaningful effect. Finally, all results are obtained from synthetic data with known independence and dependence structures. While this affords the advantage of evaluating each test's ability to recover the true data-generating process, it also means that the findings may not generalize to empirical compensation datasets, which are subject to measurement error, selection bias, omitted variable bias, and other real-world complications.
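The Bonferroni adjustment discussed above can be applied mechanically with statsmodels; the inputs below are the three p-values reported in this section:

```python
from statsmodels.stats.multitest import multipletests

# p-values from Section 6: Welch t-test, one-way ANOVA, chi-square test
pvals = [0.1096, 0.3946, 0.8651]

# Bonferroni multiplies each p-value by the number of tests (capped at 1)
reject, p_adj, _, _ = multipletests(pvals, alpha=0.05, method='bonferroni')
for p, pa, r in zip(pvals, p_adj, reject):
    print(f"raw p = {p:.4f}  adjusted p = {pa:.4f}  reject: {r}")
```

All three adjusted p-values remain above 0.05, so the substantive conclusions are unchanged by the correction.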

7. Principal Component Analysis¶

Principal component analysis (PCA) is applied to reduce the dimensionality of the employee attribute space and to uncover latent structures among the predictor variables. The analysis considers all nine numeric non-target variables: age, years of experience, performance score, hours per week, number of projects, training hours, satisfaction score, remote work percentage, and team size. The response variable (annual salary) and the identifier (employee ID) are excluded to prevent target leakage and to focus on the intrinsic structure of employee characteristics.

Prior to extraction, the suitability of the correlation matrix for PCA is assessed through Bartlett's test of sphericity and, where computationally feasible, the Kaiser-Meyer-Olkin (KMO) measure of sampling adequacy.

In [15]:
# 7.1 Data Preparation and Suitability Assessment

# Select all numeric non-target variables
pca_vars = ['age', 'years_experience', 'performance_score', 'hours_per_week',
            'num_projects', 'training_hours', 'satisfaction_score', 'remote_work_pct', 'team_size']

X_pca = df[pca_vars].copy()
print(f"PCA input: {len(pca_vars)} variables, {len(X_pca)} observations")
print()

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_pca)
X_scaled_df = pd.DataFrame(X_scaled, columns=pca_vars)

# Correlation matrix
corr_matrix = X_scaled_df.corr()

# Display correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='RdBu_r',
            center=0, vmin=-1, vmax=1, square=True, ax=ax,
            cbar_kws={'label': 'Pearson Correlation'},
            linewidths=0.5)
ax.set_title('Correlation Matrix of PCA Input Variables')
plt.tight_layout()
plt.show()

# Bartlett's Test of Sphericity
n_obs = X_scaled.shape[0]
p_vars = X_scaled.shape[1]
corr_det = np.linalg.det(corr_matrix.values)
chi2_bartlett = -(n_obs - 1 - (2 * p_vars + 5) / 6) * np.log(corr_det)
df_bartlett = p_vars * (p_vars - 1) / 2
p_bartlett = 1 - stats.chi2.cdf(chi2_bartlett, df_bartlett)

print(f"Bartlett's Test of Sphericity:")
print(f"  chi2 = {chi2_bartlett:.2f},  df = {int(df_bartlett)},  p < {max(p_bartlett, 1e-10):.2e}")
if p_bartlett < 0.05:
    print("  Result: Reject H0. The correlation matrix is significantly different from the identity matrix.")
    print("  Interpretation: PCA is appropriate for these data.")
else:
    print("  Result: Fail to reject H0. PCA may not be suitable.")

# KMO Measure (manual computation via partial correlations)
print()
try:
    R = corr_matrix.values
    R_inv = np.linalg.inv(R)

    # Partial correlations from the inverse correlation matrix
    D = np.diag(1.0 / np.sqrt(np.diag(R_inv)))
    P = -D @ R_inv @ D
    np.fill_diagonal(P, 1.0)

    # KMO per variable
    r2_sum = np.sum(R**2, axis=1) - 1  # sum of squared correlations (excluding diagonal)
    p2_sum = np.sum(P**2, axis=1) - 1  # sum of squared partial correlations (excluding diagonal)

    kmo_per_var = r2_sum / (r2_sum + p2_sum)

    # Overall KMO
    r2_total = np.sum(np.triu(R**2, k=1))
    p2_total = np.sum(np.triu(P**2, k=1))
    kmo_overall = r2_total / (r2_total + p2_total)

    print("Kaiser-Meyer-Olkin (KMO) Measure of Sampling Adequacy:")
    print(f"  Overall KMO = {kmo_overall:.4f}")
    kmo_interp = ("unacceptable" if kmo_overall < 0.5 else
                  "miserable" if kmo_overall < 0.6 else
                  "mediocre" if kmo_overall < 0.7 else
                  "middling" if kmo_overall < 0.8 else
                  "meritorious" if kmo_overall < 0.9 else "marvelous")
    print(f"  Interpretation: {kmo_interp} (Kaiser, 1974)")
    print()
    print("  KMO per variable:")
    for var, kmo_val in zip(pca_vars, kmo_per_var):
        print(f"    {var:25s}: {kmo_val:.4f}")
except Exception as e:
    print(f"KMO computation note: {e}")
    print("Proceeding with Bartlett's test result as the primary suitability indicator.")
PCA input: 9 variables, 500 observations

[Figure: Correlation matrix of PCA input variables]
Bartlett's Test of Sphericity:
  chi2 = 1416.85,  df = 36,  p < 1.00e-10
  Result: Reject H0. The correlation matrix is significantly different from the identity matrix.
  Interpretation: PCA is appropriate for these data.

Kaiser-Meyer-Olkin (KMO) Measure of Sampling Adequacy:
  Overall KMO = 0.5154
  Interpretation: miserable (Kaiser, 1974)

  KMO per variable:
    age                      : 0.5181
    years_experience         : 0.5164
    performance_score        : 0.4545
    hours_per_week           : 0.5102
    num_projects             : 0.5005
    training_hours           : 0.5955
    satisfaction_score       : 0.5012
    remote_work_pct          : 0.4762
    team_size                : 0.4653

Suitability Assessment¶

Bartlett's test of sphericity evaluates whether the observed correlation matrix departs significantly from the identity matrix. Rejection of the null hypothesis (at $\alpha = 0.05$) confirms that the variables share sufficient common variance to justify dimensionality reduction. With $n = 500$ observations and the embedded correlation structure among the nine predictor variables, the test yields $\chi^2 = 1416.85$ ($df = 36$, $p < 0.001$), a highly significant result affirming the appropriateness of PCA.

The Kaiser-Meyer-Olkin (KMO) measure provides a complementary assessment by comparing the magnitudes of observed correlations to partial correlations; values above 0.6 are generally considered adequate for PCA (Kaiser, 1974). Here, however, the overall KMO of 0.52 falls below that threshold, and several per-variable values sit at or below 0.5. The two diagnostics therefore diverge: Bartlett's test confirms that the correlation matrix contains non-trivial structure, while the low KMO indicates that this structure is driven by a few strong pairwise relationships (notably age--experience) rather than by broad common factors shared across the full variable set. PCA is carried out nonetheless, but this limitation should be borne in mind when interpreting the extracted components.

In [16]:
# 7.2 PCA Computation

pca = PCA()
pca_scores = pca.fit_transform(X_scaled)

# Eigenvalues and variance explained
eigenvalues = pca.explained_variance_
variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_ratio)

# Eigenvalue table
eigen_table = pd.DataFrame({
    'Component': [f'PC{i+1}' for i in range(len(eigenvalues))],
    'Eigenvalue': eigenvalues.round(4),
    'Variance Explained (%)': (variance_ratio * 100).round(2),
    'Cumulative (%)': (cumulative_variance * 100).round(2)
})
print("Principal Component Eigenvalues and Variance Explained")
print("=" * 65)
print(eigen_table.to_string(index=False))

# Number of components by Kaiser criterion
n_kaiser = sum(eigenvalues > 1)
print(f"\nComponents with eigenvalue > 1 (Kaiser criterion): {n_kaiser}")

# Visualization: Scree plot + Cumulative variance
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scree plot
axes[0].plot(range(1, len(eigenvalues) + 1), eigenvalues, 'o-',
             color=PALETTE[0], linewidth=2, markersize=8)
axes[0].axhline(y=1, color='red', linestyle='--', linewidth=1, label='Kaiser Criterion (lambda = 1)')
axes[0].set_xlabel('Component Number')
axes[0].set_ylabel('Eigenvalue')
axes[0].set_title('Scree Plot')
axes[0].set_xticks(range(1, len(eigenvalues) + 1))
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Cumulative variance
axes[1].bar(range(1, len(variance_ratio) + 1), variance_ratio * 100,
            alpha=0.6, color=PALETTE[1], label='Individual')
axes[1].plot(range(1, len(cumulative_variance) + 1), cumulative_variance * 100,
             'o-', color=PALETTE[3], linewidth=2, markersize=6, label='Cumulative')
axes[1].axhline(y=80, color='red', linestyle='--', linewidth=1, label='80% Threshold')
axes[1].set_xlabel('Component Number')
axes[1].set_ylabel('Variance Explained (%)')
axes[1].set_title('Variance Explained by Principal Components')
axes[1].set_xticks(range(1, len(variance_ratio) + 1))
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
Principal Component Eigenvalues and Variance Explained
=================================================================
Component  Eigenvalue  Variance Explained (%)  Cumulative (%)
      PC1      2.1201                   23.51           23.51
      PC2      1.4109                   15.65           39.15
      PC3      1.3060                   14.48           53.64
      PC4      1.0599                   11.75           65.39
      PC5      0.9515                   10.55           75.94
      PC6      0.8878                    9.84           85.78
      PC7      0.7119                    7.89           93.68
      PC8      0.5256                    5.83           99.51
      PC9      0.0445                    0.49          100.00

Components with eigenvalue > 1 (Kaiser criterion): 4
[Figure: Scree plot and cumulative variance explained by principal components]

Component Retention Decision¶

Two complementary criteria guide the decision on how many principal components to retain.

Kaiser criterion. Components with eigenvalues exceeding 1.0 explain more variance than any single original standardized variable and are therefore considered informative. The eigenvalue table above shows that four components (PC1--PC4) meet this threshold.

Scree plot. The scree plot displays eigenvalues in descending order. The "elbow" point---where the curve transitions from steep decline to a gradual leveling---suggests a natural cutoff. Components before the elbow capture substantial variance; those after it contribute incrementally.

Cumulative variance. The right panel shows that the four Kaiser-selected components capture 65.4% of total variance, short of the common 80% heuristic; reaching 80% would require six components. The Kaiser criterion is nevertheless given primary weight in this analysis, at the cost of leaving roughly a third of the total variance unexplained.

The retained components are carried forward for loading interpretation and biplot visualization in the following subsection.

In [17]:
# 7.3 Component Interpretation

# Loadings matrix
n_components = n_kaiser  # Use Kaiser criterion
loadings = pca.components_[:n_components].T  # shape: (p, n_components)
loadings_df = pd.DataFrame(loadings, index=pca_vars,
                           columns=[f'PC{i+1}' for i in range(n_components)])

# Loadings heatmap
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(loadings_df, annot=True, fmt='.3f', cmap='RdBu_r', center=0,
            vmin=-1, vmax=1, ax=ax, linewidths=0.5,
            cbar_kws={'label': 'Loading'})
ax.set_title(f'PCA Loadings Matrix (First {n_components} Components)')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variable')
plt.tight_layout()
plt.show()

# Biplot (PC1 vs PC2)
fig, ax = plt.subplots(figsize=(10, 8))

# Scores
scores_df = pd.DataFrame(pca_scores[:, :2], columns=['PC1', 'PC2'])
ax.scatter(scores_df['PC1'], scores_df['PC2'], alpha=0.3, s=15, color=PALETTE[0], label='Observations')

# Loading vectors
scale_factor = 3  # Scale for visibility
for i, var in enumerate(pca_vars):
    ax.annotate('', xy=(pca.components_[0, i] * scale_factor, pca.components_[1, i] * scale_factor),
                xytext=(0, 0),
                arrowprops=dict(arrowstyle='->', color=PALETTE[3], lw=1.5))
    ax.text(pca.components_[0, i] * scale_factor * 1.1,
            pca.components_[1, i] * scale_factor * 1.1,
            var, fontsize=8, ha='center', va='center',
            bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.8, edgecolor='gray'))

ax.set_xlabel(f'PC1 ({variance_ratio[0]*100:.1f}% variance explained)')
ax.set_ylabel(f'PC2 ({variance_ratio[1]*100:.1f}% variance explained)')
ax.set_title('PCA Biplot: Observations and Variable Loadings')
ax.axhline(y=0, color='gray', linestyle='--', linewidth=0.5)
ax.axvline(x=0, color='gray', linestyle='--', linewidth=0.5)
ax.legend()
plt.tight_layout()
plt.show()

# Print dominant loadings per component
print("Dominant Loadings per Component (|loading| > 0.3):")
print("=" * 55)
for comp in range(n_components):
    dominant = loadings_df.iloc[:, comp][abs(loadings_df.iloc[:, comp]) > 0.3]
    dominant = dominant.sort_values(key=abs, ascending=False)
    print(f"\nPC{comp+1}:")
    for var, load in dominant.items():
        direction = "+" if load > 0 else "-"
        print(f"  {var:25s}: {load:+.3f} {direction}")
[Figure: PCA loadings heatmap, first four components]
[Figure: PCA biplot of observations and variable loadings in the PC1-PC2 plane]
Dominant Loadings per Component (|loading| > 0.3):
=======================================================

PC1:
  years_experience         : +0.652 +
  age                      : +0.648 +
  training_hours           : -0.335 -

PC2:
  performance_score        : +0.599 +
  num_projects             : +0.468 +
  satisfaction_score       : +0.385 +
  training_hours           : +0.351 +

PC3:
  remote_work_pct          : +0.544 +
  satisfaction_score       : +0.535 +
  performance_score        : -0.394 -
  hours_per_week           : -0.369 -

PC4:
  team_size                : +0.812 +
  hours_per_week           : -0.480 -

Interpretation of Principal Components and Critical Assessment¶

The loading patterns of the retained components suggest interpretable latent dimensions within the employee attribute space. Based on the dominant loadings (|loading| > 0.3), the following substantive labels are proposed:

  • PC1 -- Experience-Age Dimension. This component loads heavily on years of experience (+0.65) and age (+0.65), reflecting the seniority or career-stage axis, with a secondary negative loading on training hours (-0.34), consistent with training participation declining as careers progress. This dimension captures the primary source of individual variation in the workforce: the continuum from junior to senior employees.

  • PC2 -- Performance-Engagement Dimension. The second component combines performance score (+0.60), number of projects (+0.47), satisfaction (+0.39), and training hours (+0.35). Employees who score high on this component are characterized by strong performance output and higher project and training involvement, irrespective of their seniority.

  • PC3 and PC4. The third component contrasts remote work (+0.54) and satisfaction (+0.54) against performance (-0.39) and weekly hours (-0.37), suggesting a work-life balance axis. The fourth is dominated by team size (+0.81) with a negative loading on weekly hours (-0.48), capturing team-level variation. These dimensions, while explaining less variance individually, may represent meaningful organizational or attitudinal constructs.

Biplot interpretation. The biplot provides a simultaneous visualization of observations and variable loadings in the PC1--PC2 plane. Variables whose loading vectors point in similar directions are positively correlated; those pointing in opposite directions are negatively correlated. The angular separation between vectors approximates the correlation structure. Observations located far from the origin in a given direction exhibit extreme values on the corresponding latent dimension.
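The angular claim can be made concrete: for standardized variables, loading vectors scaled by $\sqrt{\lambda_k}$ reproduce the correlation matrix through their inner products, and the two-component biplot truncation approximates it. A minimal sketch on synthetic data (the variable construction is illustrative and unrelated to the thesis dataset):

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
n = 2000
shared = rng.normal(size=n)
# Two strongly correlated variables plus one independent variable
data = np.column_stack([shared + 0.3 * rng.normal(size=n),
                        shared + 0.3 * rng.normal(size=n),
                        rng.normal(size=n)])
Z = StandardScaler().fit_transform(data)

pca = PCA().fit(Z)
# Correlation-scaled loadings: eigenvectors times sqrt(eigenvalues)
L = pca.components_.T * np.sqrt(pca.explained_variance_)

R = np.corrcoef(Z, rowvar=False)
print(np.abs(L @ L.T - R).max())      # full loadings reproduce R (up to ddof)
print(L[0, :2] @ L[1, :2], R[0, 1])   # rank-2 biplot approximation of r(1,2)
```

Because the loading vectors of standardized variables have near-unit length when their communality in the first two components is high, the inner product of two arrows is close to the cosine of the angle between them, which is why small angles in the biplot signal strong positive correlation.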

Critical Assessment:

Several methodological considerations and limitations qualify the interpretation of the PCA results:

  1. Linearity assumption. PCA identifies linear combinations of variables that maximize variance. If the true latent structure involves non-linear relationships (e.g., diminishing returns to experience), PCA may not fully capture the underlying dimensionality. Non-linear alternatives such as kernel PCA or autoencoders could be considered in future work.

  2. Standardization necessity. Because the input variables are measured on different scales (e.g., age in years vs. remote work in percentage points vs. team size in counts), standardization to unit variance was essential. Without standardization, variables with larger variances would dominate the principal components, producing misleading results.

  3. Orthogonality constraint. By construction, principal components are mutually uncorrelated. However, real-world latent factors (e.g., "seniority" and "engagement") may in fact be correlated. Factor analysis with oblique rotation (e.g., Promax) provides an alternative that allows correlated factors and may yield a more realistic representation of the latent structure.

  4. Mixed variable types. The input space includes truly continuous variables (e.g., hours per week), quasi-continuous variables (e.g., satisfaction score on a 1--10 scale), and count variables (e.g., number of projects). Standard PCA, which operates on the Pearson correlation matrix, assumes continuous data. For datasets with mixed variable types, polychoric or polyserial correlations, or categorical PCA (CATPCA), may provide a more appropriate decomposition.

  5. Sample-specificity. The extracted components are sample-specific and may not generalize to other employee populations. Cross-validation or split-sample replication would strengthen confidence in the stability of the component structure.
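As a concrete illustration of the linearity caveat (point 1), the sketch below contrasts linear and kernel PCA on deliberately non-linear toy data; the `gamma` value follows scikit-learn's concentric-circles example and is not tuned for the thesis data:

```python
from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import LogisticRegression

# Two concentric rings: the structure is radial, hence invisible to linear PCA
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)

lin_scores = PCA(n_components=2).fit_transform(X)
rbf_scores = KernelPCA(n_components=2, kernel='rbf', gamma=10.0).fit_transform(X)

# A linear read-out on the scores shows which embedding separates the rings
acc_lin = LogisticRegression().fit(lin_scores, y).score(lin_scores, y)
acc_rbf = LogisticRegression().fit(rbf_scores, y).score(rbf_scores, y)
print(f"linear PCA: {acc_lin:.2f}, kernel PCA: {acc_rbf:.2f}")
```

Linear PCA merely rotates the rings, so a linear classifier on its scores performs near chance, while the RBF kernel embedding makes the radial structure linearly separable.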

Despite these caveats, PCA provides a useful exploratory tool for revealing the dominant modes of variation in the employee attribute space and reducing redundancy prior to downstream analyses such as cluster analysis (Section 8).

8. Cluster Analysis¶

Cluster analysis is employed to identify natural groupings within the employee population based on their observable characteristics. This unsupervised learning approach complements the preceding supervised analyses by revealing latent structures without imposing a predefined target variable. Two complementary algorithms are applied: K-Means clustering, which partitions observations into $K$ non-overlapping clusters by minimizing within-cluster sum of squares, and hierarchical agglomerative clustering using Ward's minimum variance method, which builds a nested hierarchy of clusters through successive merges.

The input features consist of the standardized numeric variables used in the PCA analysis. The principal component scores from Section 7 are utilized for visualization, while the standardized original variables serve as the clustering input to preserve the full dimensionality of the feature space.

In [18]:
# 8.1 Optimal Number of Clusters

# Use standardized features (same as PCA input)
X_cluster = X_scaled.copy()  # From PCA section

# Elbow method: WSS for k=2..10
k_range = range(2, 11)
wss = []
silhouette_avgs = []

for k in k_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_cluster)
    wss.append(km.inertia_)
    sil_avg = silhouette_score(X_cluster, km.labels_)
    silhouette_avgs.append(sil_avg)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Elbow plot
axes[0].plot(list(k_range), wss, 'o-', color=PALETTE[0], linewidth=2, markersize=8)
axes[0].set_xlabel('Number of Clusters (k)')
axes[0].set_ylabel('Within-Cluster Sum of Squares (WSS)')
axes[0].set_title('Elbow Method for Optimal k')
axes[0].set_xticks(list(k_range))
axes[0].grid(True, alpha=0.3)

# Silhouette scores
axes[1].plot(list(k_range), silhouette_avgs, 'o-', color=PALETTE[1], linewidth=2, markersize=8)
axes[1].set_xlabel('Number of Clusters (k)')
axes[1].set_ylabel('Average Silhouette Score')
axes[1].set_title('Silhouette Analysis for Optimal k')
axes[1].set_xticks(list(k_range))
axes[1].grid(True, alpha=0.3)

# Mark optimal k
optimal_k = list(k_range)[np.argmax(silhouette_avgs)]
axes[1].axvline(x=optimal_k, color='red', linestyle='--', linewidth=1,
                label=f'Optimal k = {optimal_k}')
axes[1].legend()

plt.tight_layout()
plt.show()

print(f"Silhouette scores by k:")
for k, sil in zip(k_range, silhouette_avgs):
    marker = " \u2190 optimal" if k == optimal_k else ""
    print(f"  k = {k}: {sil:.4f}{marker}")
[Figure: Elbow plot and average silhouette scores for k = 2 to 10]
Silhouette scores by k:
  k = 2: 0.1423 ← optimal
  k = 3: 0.1187
  k = 4: 0.1109
  k = 5: 0.1034
  k = 6: 0.1049
  k = 7: 0.1106
  k = 8: 0.1096
  k = 9: 0.1134
  k = 10: 0.1084

Optimal Cluster Number Selection¶

The determination of the optimal number of clusters relies on two complementary criteria that assess different aspects of clustering quality.

Elbow method. The within-cluster sum of squares (WSS) plot displays the characteristic monotonically decreasing curve. The "elbow" -- the point at which the marginal reduction in WSS diminishes sharply -- provides a heuristic for the number of clusters beyond which additional partitions yield diminishing returns in terms of within-cluster homogeneity. In practice, the elbow is often ambiguous, as the transition from steep to gradual decline may be smooth rather than abrupt.

Silhouette analysis. The average silhouette score provides a more principled criterion by quantifying both within-cluster cohesion and between-cluster separation simultaneously. The value of $k$ that maximizes the mean silhouette score is selected as optimal; here the maximum of 0.14 occurs at $k = 2$. If the elbow and silhouette criteria suggest different values of $k$, the silhouette-optimal value is preferred because it directly measures cluster quality rather than relying on visual inspection of an inflection point. The modest magnitude of the maximum, however, already signals that the cluster structure in these data is weak.

The selected $k$ is carried forward for both K-Means and hierarchical clustering to enable direct comparison of the two methods.

In [19]:
# 8.2 K-Means Clustering

# Use the silhouette-optimal k from above; fall back to k = 3
# if the optimal k exceeds 5 (for interpretability)
k_final = optimal_k if optimal_k <= 5 else 3
km_final = KMeans(n_clusters=k_final, n_init=20, random_state=42)
cluster_labels = km_final.fit_predict(X_cluster)
df['cluster_kmeans'] = cluster_labels

# Cluster sizes
print(f"K-Means Clustering (k = {k_final})")
print("=" * 50)
for c in range(k_final):
    count = (cluster_labels == c).sum()
    print(f"  Cluster {c}: {count} employees ({count/n*100:.1f}%)")

# Cluster centers
centers_df = pd.DataFrame(km_final.cluster_centers_, columns=pca_vars)
print("\nCluster Centers (standardized scale):")
print(centers_df.round(3).to_string())

# Visualization in PC1-PC2 space
fig, ax = plt.subplots(figsize=(10, 8))
for c in range(k_final):
    mask = cluster_labels == c
    ax.scatter(pca_scores[mask, 0], pca_scores[mask, 1],
               s=20, alpha=0.5, color=PALETTE[c], label=f'Cluster {c}')

# Plot centroids in PC space
centers_pca = pca.transform(km_final.cluster_centers_)
ax.scatter(centers_pca[:, 0], centers_pca[:, 1], s=200, c='black',
           marker='X', linewidths=2, edgecolors='white', zorder=5, label='Centroids')

ax.set_xlabel(f'PC1 ({variance_ratio[0]*100:.1f}%)')
ax.set_ylabel(f'PC2 ({variance_ratio[1]*100:.1f}%)')
ax.set_title(f'K-Means Clustering (k={k_final}) in Principal Component Space')
ax.legend()
plt.tight_layout()
plt.show()
K-Means Clustering (k = 2)
==================================================
  Cluster 0: 226 employees (45.2%)
  Cluster 1: 274 employees (54.8%)

Cluster Centers (standardized scale):
     age  years_experience  performance_score  hours_per_week  num_projects  training_hours  satisfaction_score  remote_work_pct  team_size
0  0.850             0.863             -0.071          -0.158        -0.057          -0.377               0.157            0.018      0.177
1 -0.701            -0.712              0.059           0.130         0.047           0.311              -0.129           -0.015     -0.146
[Figure: K-Means clustering (k = 2) in principal component space]

Interpretation of K-Means Clusters¶

The K-Means algorithm partitions the employee population into the selected number of clusters, each characterized by its centroid in the standardized feature space. The cluster sizes reported above indicate whether the partition is balanced or dominated by a single large cluster.

The standardized cluster centers reveal the distinguishing characteristics of each group. Variables with center values substantially above zero indicate that the cluster's members score higher than the population mean on that attribute, while negative values indicate below-average scores. The magnitude of the center value reflects how strongly the cluster departs from the overall mean, measured in standard deviation units.

The scatter plot in principal component space provides a two-dimensional visualization of the clustering structure. Because PC1 and PC2 capture the largest share of total variance, clusters that are well-separated in this projection are likely to be meaningfully distinct in the full feature space as well. Overlapping clusters in the PC1-PC2 projection may still be separable in higher dimensions, but such overlap suggests that the cluster boundaries are not sharply defined.

The centroid positions (marked with X) indicate the representative point for each cluster. The proximity of centroids in PC space reflects the overall similarity between clusters, while the spread of points around each centroid indicates within-cluster heterogeneity.

In [20]:
# 8.3 Hierarchical Clustering

# Ward's linkage
Z = linkage(X_cluster, method='ward')

# Dendrogram
fig, ax = plt.subplots(figsize=(14, 6))
dendrogram(Z, truncate_mode='lastp', p=30, leaf_rotation=90,
           leaf_font_size=8, ax=ax, color_threshold=Z[-(k_final-1), 2],
           above_threshold_color='gray')
ax.set_title("Hierarchical Clustering Dendrogram (Ward's Method, Truncated)")
ax.set_xlabel('Cluster Size / Sample Index')
ax.set_ylabel('Distance')
ax.axhline(y=Z[-(k_final-1), 2], color='red', linestyle='--', linewidth=1,
           label=f'Cut for k={k_final}')
ax.legend()
plt.tight_layout()
plt.show()

# Cut dendrogram at k_final clusters
hier_labels = fcluster(Z, t=k_final, criterion='maxclust') - 1
df['cluster_hier'] = hier_labels

# Compare K-Means vs Hierarchical
contingency_cluster = pd.crosstab(df['cluster_kmeans'], df['cluster_hier'],
                                   rownames=['K-Means'], colnames=['Hierarchical'])
print("Contingency Table: K-Means vs Hierarchical Clustering")
print(contingency_cluster)

# Agreement rate
# Find best mapping between clusterings
from itertools import permutations
best_agreement = 0
for perm in permutations(range(k_final)):
    relabeled = np.array([perm[l] for l in hier_labels])
    agreement = np.mean(cluster_labels == relabeled)
    best_agreement = max(best_agreement, agreement)
print(f"\nBest agreement (after relabeling): {best_agreement:.1%}")
[Figure: Hierarchical clustering dendrogram (Ward's method, truncated)]
Contingency Table: K-Means vs Hierarchical Clustering
Hierarchical   0    1
K-Means              
0             77  149
1              1  273

Best agreement (after relabeling): 70.0%

Comparison of Hierarchical and K-Means Results¶

The hierarchical clustering dendrogram visualizes the nested sequence of merges produced by Ward's minimum variance method. The horizontal cut line at the height corresponding to $k$ clusters reveals the partition that results from terminating the agglomerative process at that level. The vertical heights at which clusters merge indicate the dissimilarity between the merged groups; large jumps in merge height suggest natural cluster boundaries, while gradual increases indicate less clearly defined structure.

The contingency table cross-tabulates the cluster assignments produced by the two methods. High diagonal concentrations indicate substantial agreement -- that is, observations assigned to the same K-Means cluster tend to be grouped together by hierarchical clustering as well. The best-case agreement rate (after optimal relabeling of cluster indices, which are arbitrary) quantifies the overall correspondence between the two partitions.
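The best-case agreement computed above by brute-force permutation search scales factorially in the number of clusters. The adjusted Rand index (`sklearn.metrics.adjusted_rand_score`) provides the same label-invariant comparison in closed form, corrected for chance; a minimal sketch with toy labelings:

```python
import numpy as np
from sklearn.metrics import adjusted_rand_score

# Identical partitions up to an arbitrary relabeling of cluster indices
a = np.array([0, 0, 1, 1, 2, 2])
b = np.array([2, 2, 0, 0, 1, 1])
print(adjusted_rand_score(a, b))  # 1.0: ARI is invariant to label permutations

# A partition unrelated to the first scores near (or below) zero
c = np.array([0, 1, 2, 0, 1, 2])
print(adjusted_rand_score(a, c))
```

Applied to the notebook's own `cluster_labels` and `hier_labels` arrays, `adjusted_rand_score(cluster_labels, hier_labels)` would summarize the K-Means/hierarchical concordance in a single chance-corrected statistic without any relabeling search.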

When the two algorithms produce highly concordant results, this provides evidence that the identified clusters reflect genuine structure in the data rather than artifacts of a particular algorithmic approach. Here the best-case agreement of 70% indicates only moderate concordance: hierarchical cluster 1 absorbs nearly all of K-Means cluster 1 but also 149 of the 226 members of K-Means cluster 0. Such disagreement suggests that the clustering structure is weak or that the data contain clusters of different shapes -- K-Means assumes spherical clusters, while Ward's method, which minimizes within-cluster variance at each merge, also tends to produce compact, spherical groups.

In [21]:
# 8.4 Cluster Profiling

# Box plots of salary by cluster
fig, ax = plt.subplots(figsize=(10, 6))
cluster_data = df.copy()
cluster_data['Cluster'] = cluster_data['cluster_kmeans'].astype(str)
sns.boxplot(x='Cluster', y='annual_salary', data=cluster_data, palette='Set2', ax=ax)
ax.set_xlabel('K-Means Cluster')
ax.set_ylabel('Annual Salary ($)')
ax.set_title('Annual Salary Distribution by Cluster')
plt.tight_layout()
plt.show()

# Summary table: mean values per cluster
profile_vars = ['age', 'years_experience', 'performance_score', 'annual_salary',
                'hours_per_week', 'training_hours', 'satisfaction_score', 'remote_work_pct']
profile = df.groupby('cluster_kmeans')[profile_vars].mean().round(1)
profile.index.name = 'Cluster'
print("\nCluster Profiles (Mean Values)")
print("=" * 90)
print(profile.to_string())

# Department and gender distribution per cluster
print("\nDepartment Distribution by Cluster (%)")
dept_dist = pd.crosstab(df['cluster_kmeans'], df['department'], normalize='index') * 100
print(dept_dist.round(1).to_string())

print("\nGender Distribution by Cluster (%)")
gender_dist = pd.crosstab(df['cluster_kmeans'], df['gender'], normalize='index') * 100
print(gender_dist.round(1).to_string())

print("\nEducation Distribution by Cluster (%)")
edu_dist = pd.crosstab(df['cluster_kmeans'], df['education_level'], normalize='index') * 100
print(edu_dist.round(1).to_string())
[Figure: Annual salary distribution by K-Means cluster]
Cluster Profiles (Mean Values)
==========================================================================================
          age  years_experience  performance_score  annual_salary  hours_per_week  training_hours  satisfaction_score  remote_work_pct
Cluster                                                                                                                               
0        47.6              25.9               68.7        69920.4            41.1            23.5                 6.2             30.9
1        33.0              10.8               70.3        57334.7            42.5            29.9                 5.8             30.3

Department Distribution by Cluster (%)
department      Engineering  Finance    HR  Marketing  Sales
cluster_kmeans                                              
0                      20.8     18.6  23.0       16.8   20.8
1                      17.9     21.2  21.9       24.8   14.2

Gender Distribution by Cluster (%)
gender          Female  Male
cluster_kmeans              
0                 46.0  54.0
1                 46.4  53.6

Education Distribution by Cluster (%)
education_level  High School  Bachelor  Master   PhD
cluster_kmeans                                      
0                       17.7      39.4    30.5  12.4
1                       16.1      37.6    32.8  13.5

Critical Assessment of Cluster Analysis¶

The cluster profiling results enable the assignment of substantive labels to the two groups. Cluster 0 (45.2% of employees) comprises senior staff: mean age 47.6, 25.9 years of experience, and a mean salary of $69,920. Cluster 1 (54.8%) comprises younger employees: mean age 33.0, 10.8 years of experience, a mean salary of $57,335, and higher training engagement (29.9 vs. 23.5 hours). Performance, satisfaction, and remote-work levels are nearly identical across the groups, as are the gender, department, and education distributions. The partition therefore separates employees along the experience-age axis identified as PC1, not along performance or organizational categories, and the archetype labels should be interpreted with corresponding caution.

Several methodological considerations warrant critical discussion:

Sensitivity to initialization. K-Means is sensitive to the random initialization of centroids. This concern is mitigated by using n_init=20, which runs the algorithm 20 times with different random seeds and retains the solution with the lowest total within-cluster sum of squares. Nevertheless, the algorithm converges to a local minimum rather than a global optimum, and the results should not be treated as the unique optimal partition.

Distance metric. Both methods employ Euclidean distance, which is appropriate for continuous, standardized variables but is not suitable for mixed-type data (continuous and categorical). The categorical variables (gender, department, education level) are excluded from the clustering input and instead used for post hoc profiling. This is a pragmatic choice, but it means the clustering does not account for potentially important categorical distinctions. Alternatives such as Gower's distance or k-prototypes could incorporate mixed data types.

Cluster shape assumptions. K-Means assumes that clusters are convex and approximately spherical in the feature space, as it assigns observations to the nearest centroid based on Euclidean distance. Ward's hierarchical method similarly tends to produce compact, spherical clusters because it minimizes within-cluster variance at each merge step. If the true data structure contains elongated, non-convex, or irregularly shaped groups, both methods may fail to recover them. Density-based methods such as DBSCAN would be more appropriate in such cases.
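The shape limitation can be illustrated on deliberately non-convex toy data; the `eps` and `min_samples` values below are illustrative choices for this synthetic geometry, not recommendations for the employee data:

```python
from sklearn.cluster import DBSCAN, KMeans
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score

# Two interleaving half-moons: non-convex groups that violate K-Means' assumptions
X, y = make_moons(n_samples=400, noise=0.03, random_state=0)

km_labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
db_labels = DBSCAN(eps=0.2, min_samples=5).fit_predict(X)

# Agreement with the true generating labels (1.0 = perfect recovery)
print(f"K-Means ARI: {adjusted_rand_score(y, km_labels):.2f}")
print(f"DBSCAN  ARI: {adjusted_rand_score(y, db_labels):.2f}")
```

K-Means bisects the plane and splits each moon, whereas the density-based method follows the curved groups regardless of their shape.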

Cluster stability. The identified clusters should not be interpreted as fixed, immutable population subgroups. Small perturbations in the data (e.g., removing a few observations or adding noise) may alter the cluster assignments, particularly for observations near cluster boundaries. Bootstrap-based stability analysis (e.g., the clusterboot approach of Hennig, 2007) could quantify the robustness of the partition but is beyond the scope of this analysis.
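A lightweight version of such a stability check can be sketched as follows, using toy two-group data as a stand-in for the employee feature matrix (the resampling count and group separation are illustrative; Hennig's Jaccard-based procedure is more thorough):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

rng = np.random.default_rng(0)
# Toy data with two well-separated groups (illustrative stand-in for X_cluster)
X = np.vstack([rng.normal(0, 1, (100, 5)), rng.normal(4, 1, (100, 5))])

ref = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

aris = []
for b in range(20):
    idx = rng.choice(len(X), size=len(X), replace=True)
    boot = KMeans(n_clusters=2, n_init=10, random_state=b).fit_predict(X[idx])
    # Compare bootstrap assignments with the reference labels on the resampled points
    aris.append(adjusted_rand_score(ref[idx], boot))

print(f"mean bootstrap ARI = {np.mean(aris):.3f}")  # near 1 for stable clusters
```

For the weakly separated employee clusters, substantially lower bootstrap ARI values would be expected, quantifying the instability discussed above.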

Existence of natural groupings. It is important to recognize that cluster analysis will always produce clusters, even when applied to data that contain no genuine group structure. The silhouette scores provide evidence on this point -- scores near zero suggest that observations are roughly equidistant from their own and neighboring cluster centroids, indicating weak or absent structure. The maximum observed value of 0.14 (at $k = 2$) falls far below the $\bar{s} > 0.5$ benchmark proposed by Rousseeuw (1987), so the partition should be read as a division of largely continuous variation along the experience-age axis rather than as evidence of well-separated subpopulations.

Relationship to preceding analyses. The cluster analysis results complement the regression and PCA findings in several ways. The regression analysis (Section 5) identified years of experience, performance score, education, and department as significant salary determinants. If the clusters separate along these same dimensions, this provides converging evidence for their importance. The PCA (Section 7) revealed latent dimensions dominated by experience/age and performance/training; clusters that align with these principal component axes further validate the dimensionality reduction. However, unlike regression and PCA, cluster analysis does not impose a dependent variable or a linear structure, allowing it to detect non-linear groupings that the other methods might miss.

9. Critical Assessment and Limitations¶

The present analysis demonstrates the application of four complementary statistical methods to a synthetic employee compensation dataset. While the results are internally consistent and pedagogically instructive, several methodological limitations warrant careful consideration.

Synthetic Data Limitation. The most fundamental constraint of this study is the use of synthetically generated data. Unlike with empirical data, the data-generating process (DGP) is known to the analyst, which eliminates genuine discovery: the true regression coefficients are fixed by construction, ensuring that the OLS model recovers them with high fidelity. In applied research, model specification is rarely this straightforward, and the risks of omitted variable bias, measurement error, and endogeneity are ever-present threats to causal inference.

Sample Size Considerations. The sample of $n = 500$ observations provides adequate statistical power for all four methods. However, the relatively balanced design (approximately equal group sizes across departments and education levels) may overstate the precision achievable in real-world settings where group sizes are frequently imbalanced, and missing data is common.

Assumption Violations. The regression diagnostics in Section 5 reveal patterns consistent with a well-specified linear model, which is expected given the synthetic DGP. While the residuals approximate normality by the central limit theorem for large samples, the homoscedasticity assumption may be violated if salary variance increases with predicted values---a common pattern in compensation data. The ANOVA in Section 6 assumes equal variances across groups, an assumption that can be formally tested using Levene's test (see Appendix C).

Multiple Testing Problem. Three hypothesis tests were conducted in Section 6 without adjustment for multiplicity. Applying the Bonferroni correction would require evaluating each test at $\alpha / 3 \approx 0.017$ rather than 0.05. While the primary tests (gender gap, department differences) yield p-values well below this threshold, the increased stringency could affect borderline results in more extensive analyses.
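The adjustment can be sketched with statsmodels' `multipletests`; the p-values below are placeholders rather than the Section 6 results, and Holm's step-down method is included as a uniformly more powerful alternative to Bonferroni:

```python
# Sketch: multiplicity adjustment of three p-values (placeholder values).
from statsmodels.stats.multitest import multipletests

pvals = [0.004, 0.001, 0.62]  # hypothetical: t-test, ANOVA, chi-square

for method in ['bonferroni', 'holm']:
    reject, p_adj, _, _ = multipletests(pvals, alpha=0.05, method=method)
    print(method, [f"{p:.3f}" for p in p_adj], list(reject))
```

Bonferroni multiplies every p-value by the number of tests, whereas Holm multiplies the smallest by $m$, the next by $m-1$, and so on, so it rejects at least as often while controlling the same family-wise error rate.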

Principal Component Analysis. The PCA in Section 7 extracts linear combinations that maximize variance, which may not correspond to substantively meaningful constructs. Factor analysis, which allows correlated latent factors and distinguishes between common and unique variance, might yield more interpretable results. Furthermore, the Kaiser criterion for component retention is known to overestimate the number of factors in some settings; parallel analysis provides a more robust alternative.
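Parallel analysis, the more robust retention rule mentioned above, compares the observed eigenvalues with the 95th percentile of eigenvalues obtained from random data of the same shape, retaining a component only while it exceeds that benchmark. A sketch with illustrative single-factor data standing in for the nine standardized variables:

```python
# Sketch: Horn's parallel analysis for component retention.
import numpy as np

rng = np.random.default_rng(2)
n, p = 500, 9
# Observed data: one strong latent factor plus independent noise
latent = rng.normal(size=(n, 1))
X = latent @ rng.normal(size=(1, p)) + rng.normal(size=(n, p))

obs_eig = np.sort(np.linalg.eigvalsh(np.corrcoef(X, rowvar=False)))[::-1]

# Eigenvalue distribution under pure noise (same n and p)
sim_eigs = np.empty((200, p))
for b in range(200):
    R = np.corrcoef(rng.normal(size=(n, p)), rowvar=False)
    sim_eigs[b] = np.sort(np.linalg.eigvalsh(R))[::-1]
threshold = np.percentile(sim_eigs, 95, axis=0)

n_retain = int(np.sum(obs_eig > threshold))
print(f"Components retained by parallel analysis: {n_retain}")
```

Because sampling noise inflates the leading eigenvalues even of unstructured data, this benchmark is typically stricter than the Kaiser cutoff of 1.0.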

Cluster Analysis. The K-Means algorithm in Section 8 assumes convex, roughly spherical clusters of similar size---assumptions that may not hold for employee populations with heterogeneous subgroup structures. The choice of $k$ is inherently subjective, and different selection criteria (elbow, silhouette, gap statistic) can yield different solutions. Cluster stability analysis (Appendix E) provides some evidence on the robustness of the chosen partition.
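The sensitivity of the choice of $k$ can be illustrated by scanning candidate values with the average silhouette width, one of the selection criteria named above (well-separated illustrative blobs here; the thesis applies this to the standardized employee features):

```python
# Sketch: selecting k by maximizing the mean silhouette width.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=300, centers=[[0, 0], [8, 8], [-8, 8]],
                  cluster_std=1.0, random_state=0)

sil = {}
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    sil[k] = silhouette_score(X, labels)
    print(f"k = {k}: mean silhouette = {sil[k]:.3f}")

best_k = max(sil, key=sil.get)
print(f"Best k by silhouette: {best_k}")
```

On real employee data the curve is rarely this clear-cut, which is precisely why the paragraph above recommends consulting several criteria and a stability analysis.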

Generalizability. The synthetic DGP embeds specific relationships (e.g., a fixed gender pay gap of $2,000) that may not reflect the complexity of real-world compensation structures, where interactions, nonlinearities, and unobserved heterogeneity are prevalent. The results should therefore be interpreted as methodological demonstrations rather than substantive findings.

10. Conclusion¶

This thesis presents a comprehensive multi-method statistical analysis of employee compensation determinants using a synthetic dataset of 500 observations and 14 variables. Four complementary analytical approaches were applied, each contributing a distinct perspective to the research question.

Multiple linear regression (Section 5) identified years of experience, performance score, educational attainment, and departmental affiliation as statistically significant predictors of annual salary. The model achieved an $R^2$ exceeding 0.5, indicating that the selected predictors explain a substantial proportion of salary variation. The gender coefficient confirmed a statistically significant pay differential even after controlling for other factors, and the diagnostic plots indicate that the linear specification fits the data adequately.

Hypothesis testing (Section 6) provided formal inferential evidence for group-level differences. The Welch's t-test detected a significant gender pay gap, the one-way ANOVA identified significant departmental salary differences (with Tukey's HSD pinpointing specific pairwise contrasts), and the chi-square test correctly found no significant association between department membership and education level, a result consistent with the independent generation of these variables. Effect sizes (Cohen's $d$, $\eta^2$, Cramér's $V$) provided essential context for evaluating practical significance alongside statistical significance.
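For reference, the three effect sizes can be computed from first principles; the samples below are illustrative stand-ins, not the thesis data:

```python
# Sketch: Cohen's d, eta-squared, and Cramer's V from their definitions.
import numpy as np
from scipy import stats

rng = np.random.default_rng(3)
a = rng.normal(63, 8, 120)   # e.g. one group's salaries (illustrative)
b = rng.normal(61, 8, 100)   # e.g. a comparison group

# Cohen's d: mean difference over the pooled standard deviation
sp = np.sqrt(((len(a) - 1) * a.var(ddof=1) + (len(b) - 1) * b.var(ddof=1))
             / (len(a) + len(b) - 2))
d = (a.mean() - b.mean()) / sp

# Eta-squared for a one-way ANOVA: between-group SS over total SS
groups = [a, b, rng.normal(64, 8, 110)]
grand = np.concatenate(groups).mean()
ss_between = sum(len(g) * (g.mean() - grand) ** 2 for g in groups)
ss_total = sum(((g - grand) ** 2).sum() for g in groups)
eta_sq = ss_between / ss_total

# Cramer's V from the chi-square statistic of a contingency table
table = np.array([[30, 20, 25], [25, 30, 20]])
chi2, _, _, _ = stats.chi2_contingency(table)
v = np.sqrt(chi2 / (table.sum() * (min(table.shape) - 1)))

print(f"Cohen's d = {d:.3f}, eta^2 = {eta_sq:.3f}, Cramer's V = {v:.3f}")
```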

Principal component analysis (Section 7) reduced the nine-dimensional employee attribute space to a smaller set of interpretable components, revealing latent structures among the predictor variables. The embedded correlation structure in the data---linking performance to training, experience to age, and satisfaction to working conditions---was successfully recovered by the first few principal components.

Cluster analysis (Section 8) identified natural employee segments that transcend the a priori groupings defined by department or education level. The K-Means and hierarchical clustering algorithms yielded largely concordant solutions, and cluster profiling revealed meaningful differentiation in terms of experience, performance, and compensation levels.

Taken together, these four methods provide a multifaceted characterization of the compensation landscape. Future work should extend this framework to empirical data, incorporate additional methods such as random forests or gradient boosting for predictive benchmarking, explore interaction effects and nonlinear specifications, and consider longitudinal analysis to capture career trajectory dynamics.
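The predictive benchmarking suggested for future work might look like the following sketch, comparing OLS and a random forest by cross-validated $R^2$; generic regression data stands in for the salary dataset:

```python
# Sketch: cross-validated predictive comparison of a linear model and a
# random forest (illustrative data; the thesis would use the HR features).
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=500, n_features=8, noise=20.0, random_state=0)

for name, est in [('OLS', LinearRegression()),
                  ('Random forest', RandomForestRegressor(n_estimators=200,
                                                          random_state=0))]:
    scores = cross_val_score(est, X, y, cv=5, scoring='r2')
    print(f"{name:13s}: mean CV R^2 = {scores.mean():.3f}")
```

On genuinely linear data the ensemble offers no advantage; its value would appear only if the real compensation surface contains the interactions and nonlinearities noted in Section 9.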

References¶

  1. Cohen, J. (1988). Statistical Power Analysis for the Behavioral Sciences (2nd ed.). Lawrence Erlbaum Associates.

  2. Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical Learning: Data Mining, Inference, and Prediction (2nd ed.). Springer.

  3. Johnson, R. A., & Wichern, D. W. (2007). Applied Multivariate Statistical Analysis (6th ed.). Pearson Prentice Hall.

  4. Jolliffe, I. T. (2002). Principal Component Analysis (2nd ed.). Springer.

  5. Kaiser, H. F. (1974). An index of factorial simplicity. Psychometrika, 39(1), 31--36.

  6. Kaufman, L., & Rousseeuw, P. J. (2005). Finding Groups in Data: An Introduction to Cluster Analysis. John Wiley & Sons.

  7. Kutner, M. H., Nachtsheim, C. J., Neter, J., & Li, W. (2005). Applied Linear Statistical Models (5th ed.). McGraw-Hill.

  8. Montgomery, D. C., & Runger, G. C. (2014). Applied Statistics and Probability for Engineers (6th ed.). John Wiley & Sons.

  9. Pedregosa, F., Varoquaux, G., Gramfort, A., et al. (2011). Scikit-learn: Machine learning in Python. Journal of Machine Learning Research, 12, 2825--2830.

  10. Rousseeuw, P. J. (1987). Silhouettes: A graphical aid to the interpretation and validation of cluster analysis. Journal of Computational and Applied Mathematics, 20, 53--65.

  11. Seabold, S., & Perktold, J. (2010). Statsmodels: Econometric and statistical modeling with Python. Proceedings of the 9th Python in Science Conference, 57--61.

  12. Ward, J. H. (1963). Hierarchical grouping to optimize an objective function. Journal of the American Statistical Association, 58(301), 236--244.

  13. Welch, B. L. (1947). The generalization of 'Student's' problem when several different population variances are involved. Biometrika, 34(1--2), 28--35.


Appendix¶

Appendix A: Complete Data Generation Code¶

The following code block provides the complete, standalone data-generating process for reproducibility. All variable dependencies and correlation structures are documented inline.

In [22]:
# =============================================================================
# APPENDIX A: Complete Standalone Data-Generating Process (DGP)
# =============================================================================
# This code block reproduces the synthetic employee compensation dataset used
# throughout the thesis. It is self-contained: running this cell alone will
# generate an identical copy of the dataset, provided the random seed is fixed.
#
# Design choices:
#   - n = 500 employees (sufficient power for all four analytical methods)
#   - 14 variables: 9 numeric, 4 categorical, 1 identifier
#   - Known salary model (true DGP) allows validation of regression recovery
#   - Embedded correlation structures provide non-trivial PCA and cluster results
# =============================================================================

import numpy as np
import pandas as pd

# ── Reproducibility ──────────────────────────────────────────────────────────
np.random.seed(42)
n = 500  # Sample size

# ── 1. Demographic Variables ─────────────────────────────────────────────────
# Age: Normal(40, 10), truncated to [22, 65] to reflect working-age population.
age = np.clip(np.random.normal(40, 10, n).astype(int), 22, 65)

# Gender: Bernoulli(0.45) for Female; creates approx 55:45 male-to-female ratio.
gender = np.where(np.random.binomial(1, 0.45, n) == 1, 'Female', 'Male')

# ── 2. Organizational Variables ──────────────────────────────────────────────
# Department: Uniform draw from five departments (no inherent ordering).
departments = ['Engineering', 'Marketing', 'Sales', 'HR', 'Finance']
department = np.random.choice(departments, n)

# Education: Multinomial with probabilities reflecting a knowledge-economy workforce.
#   High School: 15%, Bachelor: 40%, Master: 30%, PhD: 15%
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
education_level = np.random.choice(education_levels, n, p=[0.15, 0.40, 0.30, 0.15])

# Numeric encoding of education for use in the salary model.
education_num = pd.Series(education_level).map(
    {'High School': 1, 'Bachelor': 2, 'Master': 3, 'PhD': 4}
).values

# ── 3. Human Capital and Performance Variables ───────────────────────────────
# Years of experience: Correlated with age (experience ~ age - 22 + noise).
# This creates the strong age-experience correlation (r > 0.8) that drives
# multicollinearity in the regression and the first principal component.
years_experience = np.clip(age - 22 + np.random.normal(0, 3, n), 0, 40).round(1)

# Performance score: Normal(70, 12), truncated to [30, 100].
# Independent of demographics by construction.
performance_score = np.clip(np.random.normal(70, 12, n), 30, 100).round(1)

# Hours per week: Normal(42, 5), truncated to [30, 60].
hours_per_week = np.clip(np.random.normal(42, 5, n), 30, 60).round(1)

# Number of projects: Poisson with rate depending on performance.
# Creates a moderate performance-projects correlation.
num_projects = np.random.poisson(2 + 0.05 * performance_score)

# Training hours: Depends on performance (+) and experience (-).
# Higher performers receive more training; senior employees receive less.
# This creates the performance-training correlation captured by PCA.
training_hours = np.maximum(
    0, 10 + 0.3 * performance_score - 0.2 * years_experience
    + np.random.normal(0, 8, n)
).round(1)

# ── 4. Workplace Characteristics ─────────────────────────────────────────────
# Remote work percentage: Beta(2,5)*100, shifted by education.
# Higher education is associated with more remote work flexibility.
remote_work_pct = np.clip(
    np.random.beta(2, 5, n) * 100 + 3 * (education_num - 2), 0, 100
).round(1)

# Satisfaction score: Depends on hours (-) and remote work (+).
# Creates the hours-satisfaction-remote correlation cluster.
satisfaction_score = np.clip(
    7.0 - 0.04 * hours_per_week + 0.02 * remote_work_pct
    + np.random.normal(0, 1.2, n), 1, 10
).round(1)

# Team size: Uniform integer in [3, 20]. Independent of all other variables.
team_size = np.random.randint(3, 21, n)

# ── 5. TRUE SALARY MODEL (the DGP that the regression aims to recover) ──────
# annual_salary = 25000 (base)
#               + 800  * years_experience     (returns to experience)
#               + 300  * performance_score     (performance-based pay)
#               + 5000 * I(Master)             (Master's degree premium)
#               + 12000 * I(PhD)               (PhD premium)
#               + 3000 * I(Engineering)        (Engineering department premium)
#               + 2000 * I(Finance)            (Finance department premium)
#               - 2000 * I(Female)             (gender pay gap)
#               + Normal(0, 8000)              (idiosyncratic noise)
salary = (25000
          + 800 * years_experience
          + 300 * performance_score
          + 5000 * (education_num == 3)       # Master premium
          + 12000 * (education_num == 4)      # PhD premium
          + 3000 * (department == 'Engineering').astype(int)
          + 2000 * (department == 'Finance').astype(int)
          - 2000 * (gender == 'Female').astype(int)
          + np.random.normal(0, 8000, n))
annual_salary = np.round(salary, -2)  # Round to nearest $100

# ── 6. Assemble DataFrame ───────────────────────────────────────────────────
df_appendix = pd.DataFrame({
    'employee_id': range(1, n + 1),
    'age': age,
    'gender': gender,
    'department': department,
    'education_level': education_level,
    'years_experience': years_experience,
    'performance_score': performance_score,
    'hours_per_week': hours_per_week,
    'num_projects': num_projects,
    'training_hours': training_hours,
    'satisfaction_score': satisfaction_score,
    'remote_work_pct': remote_work_pct,
    'team_size': team_size,
    'annual_salary': annual_salary
})

# Set education_level as ordered categorical
df_appendix['education_level'] = pd.Categorical(
    df_appendix['education_level'],
    categories=['High School', 'Bachelor', 'Master', 'PhD'],
    ordered=True
)

# ── 7. Summary Statistics ────────────────────────────────────────────────────
print("Appendix A: Summary Statistics of the Generated Dataset")
print("=" * 70)
print(f"Observations: {len(df_appendix)}")
print(f"Variables:    {df_appendix.shape[1]}")
print()
print(df_appendix.describe(include='all').round(2).to_string())

# ── 8. Correlation Structure Documentation ───────────────────────────────────
print("\n\nEmbedded Correlation Structure")
print("=" * 70)
print("By-design variable dependencies in the DGP:")
print("  1. age  <-->  years_experience    (strong positive, r > 0.8)")
print("     Mechanism: experience = age - 22 + Normal(0, 3)")
print("  2. performance_score --> num_projects   (moderate positive)")
print("     Mechanism: projects ~ Poisson(2 + 0.05 * performance)")
print("  3. performance_score --> training_hours (positive)")
print("     years_experience   --> training_hours (negative)")
print("     Mechanism: training = 10 + 0.3*perf - 0.2*exp + noise")
print("  4. education_num     --> remote_work_pct (positive)")
print("     Mechanism: remote = Beta(2,5)*100 + 3*(edu - 2)")
print("  5. hours_per_week    --> satisfaction_score (negative)")
print("     remote_work_pct   --> satisfaction_score (positive)")
print("     Mechanism: satisfaction = 7 - 0.04*hours + 0.02*remote + noise")

# Verify the dataset matches the main analysis
numeric_cols = df_appendix.select_dtypes(include=[np.number]).columns.drop('employee_id')
print(f"\nNumeric correlations with annual_salary (top 5):")
corr_with_salary = df_appendix[numeric_cols].corr()['annual_salary'].drop('annual_salary')
for var, r in corr_with_salary.abs().sort_values(ascending=False).head(5).items():
    sign = '+' if corr_with_salary[var] > 0 else '-'
    print(f"  {var:25s} r = {sign}{r:.3f}")
Appendix A: Summary Statistics of the Generated Dataset
======================================================================
Observations: 500
Variables:    14

        employee_id     age gender department education_level  years_experience  performance_score  hours_per_week  num_projects  training_hours  satisfaction_score  remote_work_pct  team_size  annual_salary
count        500.00  500.00    500        500             500            500.00             500.00          500.00        500.00          500.00              500.00           500.00     500.00         500.00
unique          NaN     NaN      2          5               4               NaN                NaN             NaN           NaN             NaN                 NaN              NaN        NaN            NaN
top             NaN     NaN   Male         HR        Bachelor               NaN                NaN             NaN           NaN             NaN                 NaN              NaN        NaN            NaN
freq            NaN     NaN    269        112             192               NaN                NaN             NaN           NaN             NaN                 NaN              NaN        NaN            NaN
mean         250.50   39.61    NaN        NaN             NaN             17.63              69.57           41.89          5.39           26.99                5.95            30.58      11.61       63023.40
std          144.48    9.47    NaN        NaN             NaN              9.60              11.94            4.79          2.18            9.27                1.32            15.93       5.18       12230.97
min            1.00   22.00    NaN        NaN             NaN              0.00              30.00           30.00          0.00            1.70                1.40             0.00       3.00       27000.00
25%          125.75   32.75    NaN        NaN             NaN             10.40              61.60           38.60          4.00           20.80                5.10            18.20       7.00       54300.00
50%          250.50   40.00    NaN        NaN             NaN             17.45              70.10           41.70          5.00           27.10                5.95            28.60      12.00       62000.00
75%          375.25   46.00    NaN        NaN             NaN             23.90              77.93           45.30          7.00           33.30                6.80            41.52      16.00       71850.00
max          500.00   65.00    NaN        NaN             NaN             40.00             100.00           57.30         12.00           52.70                9.60            81.30      20.00       96500.00


Embedded Correlation Structure
======================================================================
By-design variable dependencies in the DGP:
  1. age  <-->  years_experience    (strong positive, r > 0.8)
     Mechanism: experience = age - 22 + Normal(0, 3)
  2. performance_score --> num_projects   (moderate positive)
     Mechanism: projects ~ Poisson(2 + 0.05 * performance)
  3. performance_score --> training_hours (positive)
     years_experience   --> training_hours (negative)
     Mechanism: training = 10 + 0.3*perf - 0.2*exp + noise
  4. education_num     --> remote_work_pct (positive)
     Mechanism: remote = Beta(2,5)*100 + 3*(edu - 2)
  5. hours_per_week    --> satisfaction_score (negative)
     remote_work_pct   --> satisfaction_score (positive)
     Mechanism: satisfaction = 7 - 0.04*hours + 0.02*remote + noise

Numeric correlations with annual_salary (top 5):
  years_experience          r = +0.658
  age                       r = +0.626
  performance_score         r = +0.273
  training_hours            r = -0.107
  satisfaction_score        r = +0.065

Appendix B: Extended Regression Diagnostics¶

Additional diagnostic analyses for the multiple linear regression model, including partial regression plots and DFBETAS influence measures.

In [23]:
# Appendix B: Partial Regression Plots and DFBETAS
fig = sm.graphics.plot_partregress_grid(model, fig=plt.figure(figsize=(16, 12)))
plt.suptitle('Partial Regression Plots', y=1.02, fontsize=14)
plt.tight_layout()
plt.show()

# DFBETAS for key predictors
influence = model.get_influence()
dfbetas = influence.dfbetas
key_predictors = ['years_experience', 'performance_score']
key_indices = [list(X_const.columns).index(p) for p in key_predictors if p in X_const.columns]

threshold = 2 / np.sqrt(n)  # conventional size-adjusted DFBETAS cutoff
if key_indices:
    fig, axes = plt.subplots(1, len(key_indices), figsize=(7*len(key_indices), 5))
    if len(key_indices) == 1:
        axes = [axes]
    for ax, idx in zip(axes, key_indices):
        ax.stem(range(n), dfbetas[:, idx], markerfmt=',', linefmt='-', basefmt='k-')
        ax.axhline(y=threshold, color='red', linestyle='--', linewidth=1)
        ax.axhline(y=-threshold, color='red', linestyle='--', linewidth=1)
        ax.set_xlabel('Observation Index')
        ax.set_ylabel('DFBETAS')
        ax.set_title(f'DFBETAS: {X_const.columns[idx]}')
    plt.tight_layout()
    plt.show()

print(f"DFBETAS threshold (2/sqrt(n)): {threshold:.4f}")
print(f"Observations exceeding threshold for any predictor: {(np.abs(dfbetas[:, 1:]) > threshold).any(axis=1).sum()}")
[Figure: partial regression plot grid]
[Figure: DFBETAS stem plots for years_experience and performance_score]
DFBETAS threshold (2/sqrt(n)): 0.0894
Observations exceeding threshold for any predictor: 147

Appendix C: Additional Hypothesis Tests¶

Supplementary tests examining the distributional assumptions underlying the parametric procedures in Section 6.

In [24]:
# Appendix C: Additional Hypothesis Tests

# Shapiro-Wilk normality test per group
print("Shapiro-Wilk Normality Tests for Annual Salary by Gender")
print("=" * 55)
for g in ['Male', 'Female']:
    group_data = df[df['gender'] == g]['annual_salary']
    # Shapiro-Wilk is valid up to n = 5000; both gender groups are well below this
    stat_sw, p_sw = stats.shapiro(group_data)
    print(f"  {g:8s}: W = {stat_sw:.4f}, p = {p_sw:.4f} {'*' if p_sw < 0.05 else ''}")

print("\nShapiro-Wilk Normality Tests for Annual Salary by Department")
print("=" * 55)
for dept in df['department'].unique():
    group_data = df[df['department'] == dept]['annual_salary']
    stat_sw, p_sw = stats.shapiro(group_data)
    print(f"  {dept:15s}: W = {stat_sw:.4f}, p = {p_sw:.4f} {'*' if p_sw < 0.05 else ''}")

# Levene's test for homogeneity of variances
print("\nLevene's Test for Homogeneity of Variances (Salary by Department)")
print("=" * 55)
groups_levene = [group['annual_salary'].values for _, group in df.groupby('department')]
stat_lev, p_lev = stats.levene(*groups_levene)
print(f"  Levene's statistic = {stat_lev:.4f}, p = {p_lev:.4f}")
if p_lev < 0.05:
    print("  Result: Reject H0. Variances are not equal across departments.")
else:
    print("  Result: Fail to reject H0. No significant evidence against equal variances.")

# Kruskal-Wallis (non-parametric alternative to ANOVA)
print("\nKruskal-Wallis Test (Non-Parametric Alternative to ANOVA)")
print("=" * 55)
stat_kw, p_kw = stats.kruskal(*groups_levene)
print(f"  H-statistic = {stat_kw:.4f}, p = {p_kw:.6f}")
if p_kw < 0.05:
    print("  Result: Reject H0. Significant differences in salary distributions across departments.")
else:
    print("  Result: Fail to reject H0.")
Shapiro-Wilk Normality Tests for Annual Salary by Gender
=======================================================
  Male    : W = 0.9917, p = 0.1373 
  Female  : W = 0.9922, p = 0.2614 

Shapiro-Wilk Normality Tests for Annual Salary by Department
=======================================================
  Marketing      : W = 0.9833, p = 0.2061 
  Engineering    : W = 0.9808, p = 0.1726 
  Sales          : W = 0.9720, p = 0.0589 
  Finance        : W = 0.9858, p = 0.3636 
  HR             : W = 0.9928, p = 0.8325 

Levene's Test for Homogeneity of Variances (Salary by Department)
=======================================================
  Levene's statistic = 0.4493, p = 0.7729
  Result: Fail to reject H0. No significant evidence against equal variances.

Kruskal-Wallis Test (Non-Parametric Alternative to ANOVA)
=======================================================
  H-statistic = 5.4206, p = 0.246794
  Result: Fail to reject H0.

Appendix D: Complete PCA Results¶

Full eigenvalue decomposition and loading matrix for all nine principal components.

In [25]:
# Appendix D: Complete PCA Results

# Full eigenvalue table
eigen_full = pd.DataFrame({
    'Component': [f'PC{i+1}' for i in range(len(pca.explained_variance_))],
    'Eigenvalue': pca.explained_variance_.round(4),
    'Variance (%)': (pca.explained_variance_ratio_ * 100).round(2),
    'Cumulative (%)': (np.cumsum(pca.explained_variance_ratio_) * 100).round(2)
})
print("Complete Eigenvalue Table")
print("=" * 65)
print(eigen_full.to_string(index=False))

# Full loadings matrix. Note: sklearn's components_ rows are unit-norm
# eigenvectors; multiply each by sqrt(eigenvalue) for correlation-scale loadings.
loadings_full = pd.DataFrame(
    pca.components_.T,
    index=pca_vars,
    columns=[f'PC{i+1}' for i in range(len(pca_vars))]
)
print("\nComplete Loadings Matrix")
print("=" * 100)
print(loadings_full.round(4).to_string())

# Communalities approximated as sums of squared eigenvector coefficients over
# the retained components (exact communalities would use the sqrt-eigenvalue-
# scaled loadings noted above).
communalities = np.sum(pca.components_[:n_kaiser]**2, axis=0)
comm_df = pd.DataFrame({
    'Variable': pca_vars,
    'Communality': communalities.round(4)
})
print(f"\nCommunalities (based on {n_kaiser} retained components)")
print("=" * 40)
print(comm_df.to_string(index=False))
Complete Eigenvalue Table
=================================================================
Component  Eigenvalue  Variance (%)  Cumulative (%)
      PC1      2.1201         23.51           23.51
      PC2      1.4109         15.65           39.15
      PC3      1.3060         14.48           53.64
      PC4      1.0599         11.75           65.39
      PC5      0.9515         10.55           75.94
      PC6      0.8878          9.84           85.78
      PC7      0.7119          7.89           93.68
      PC8      0.5256          5.83           99.51
      PC9      0.0445          0.49          100.00

Complete Loadings Matrix
====================================================================================================
                       PC1     PC2     PC3     PC4     PC5     PC6     PC7     PC8     PC9
age                 0.6481  0.1393 -0.1698 -0.0532 -0.0757  0.0450 -0.0616 -0.1469 -0.7041
years_experience    0.6518  0.1289 -0.1637 -0.0627 -0.0665  0.0258 -0.0832 -0.1128  0.7093
performance_score  -0.1078  0.5986 -0.3938 -0.1232 -0.1456 -0.0490  0.0928  0.6538 -0.0085
hours_per_week     -0.0307 -0.1945 -0.3686 -0.4797  0.5376  0.4304  0.3459 -0.0364  0.0055
num_projects       -0.1140  0.4682 -0.1138  0.1394  0.6406 -0.4013 -0.2076 -0.3477  0.0008
training_hours     -0.3348  0.3514 -0.2340 -0.0664 -0.4255  0.4224 -0.0992 -0.5791  0.0250
satisfaction_score  0.0894  0.3847  0.5345 -0.1632 -0.0402 -0.0748  0.7060 -0.1609  0.0131
remote_work_pct     0.0532  0.2644  0.5442 -0.1971  0.2268  0.5011 -0.4932  0.2154 -0.0093
team_size           0.0814  0.0849 -0.0614  0.8116  0.1877  0.4620  0.2579  0.0916  0.0130

Communalities (based on 4 retained components)
========================================
          Variable  Communality
               age       0.4712
  years_experience       0.4721
 performance_score       0.5402
    hours_per_week       0.4048
      num_projects       0.2646
    training_hours       0.2948
satisfaction_score       0.4683
   remote_work_pct       0.4078
         team_size       0.6762

Appendix E: Cluster Stability Analysis¶

Assessment of the robustness of the K-Means clustering solution through silhouette analysis per observation and sensitivity to random initialization.

In [26]:
# Appendix E: Cluster Stability Analysis

from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette plot per observation
sample_silhouette_values = silhouette_samples(X_cluster, cluster_labels)
avg_sil = silhouette_score(X_cluster, cluster_labels)

fig, ax = plt.subplots(figsize=(10, 7))
y_lower = 10
for c in range(k_final):
    c_silhouettes = sample_silhouette_values[cluster_labels == c]
    c_silhouettes.sort()
    size_c = c_silhouettes.shape[0]
    y_upper = y_lower + size_c
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, c_silhouettes,
                      facecolor=PALETTE[c], edgecolor=PALETTE[c], alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_c, f'Cluster {c}', fontsize=10)
    y_lower = y_upper + 10

ax.axvline(x=avg_sil, color='red', linestyle='--', linewidth=1.5,
           label=f'Mean Silhouette = {avg_sil:.3f}')
ax.set_xlabel('Silhouette Coefficient')
ax.set_ylabel('Observations (sorted within clusters)')
ax.set_title('Silhouette Plot for K-Means Clustering')
ax.legend()
plt.tight_layout()
plt.show()

# Stability via multiple random initializations
print("Cluster Stability: Sensitivity to Random Initialization")
print("=" * 55)
rand_labels_list = []
for seed in range(10):
    km_trial = KMeans(n_clusters=k_final, n_init=1, random_state=seed)
    rand_labels_list.append(km_trial.fit_predict(X_cluster))

# Compute pairwise agreement across trials, maximizing over label permutations
# (brute force over k! relabelings; feasible only for small k)
from itertools import permutations as _perms
agreements = []
for i in range(len(rand_labels_list)):
    for j in range(i+1, len(rand_labels_list)):
        best_agr = 0
        for perm in _perms(range(k_final)):
            relabeled = np.array([perm[l] for l in rand_labels_list[j]])
            agr = np.mean(rand_labels_list[i] == relabeled)
            best_agr = max(best_agr, agr)
        agreements.append(best_agr)

print(f"  Number of trials: 10 (single initialization each)")
print(f"  Mean pairwise agreement: {np.mean(agreements):.3f}")
print(f"  Min pairwise agreement:  {np.min(agreements):.3f}")
print(f"  Max pairwise agreement:  {np.max(agreements):.3f}")
print(f"  Interpretation: {'High' if np.mean(agreements) > 0.9 else 'Moderate' if np.mean(agreements) > 0.7 else 'Low'} stability")
[Figure: silhouette plot for the K-Means clustering solution]
Cluster Stability: Sensitivity to Random Initialization
=======================================================
  Number of trials: 10 (single initialization each)
  Mean pairwise agreement: 0.994
  Min pairwise agreement:  0.988
  Max pairwise agreement:  1.000
  Interpretation: High stability

Appendix F: Complete Summary Statistics¶

Comprehensive descriptive statistics, missing value audit, and data type summary for the full dataset.

In [27]:
# Appendix F: Complete Summary Statistics

# Exclude cluster assignment columns added during analysis
drop_cols = [c for c in ['cluster_kmeans', 'cluster_hier'] if c in df.columns]
df_clean = df.drop(columns=drop_cols)

print("Complete Descriptive Statistics")
print("=" * 100)
print(df_clean.describe(include='all').round(2).to_string())

print("\n\nMissing Values")
print("=" * 40)
missing = df_clean.isnull().sum()
print(f"Total missing values: {missing.sum()}")
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values detected in any variable.")

print("\n\nData Types")
print("=" * 40)
print(df_clean.dtypes.to_string())

n_vars = df_clean.shape[1]
print(f"\nDataset dimensions: {df_clean.shape[0]} observations x {n_vars} variables")
print(f"Memory usage: {df_clean.memory_usage(deep=True).sum() / 1024:.1f} KB")
Complete Descriptive Statistics
====================================================================================================
        employee_id     age gender department education_level  years_experience  performance_score  hours_per_week  num_projects  training_hours  satisfaction_score  remote_work_pct  team_size  annual_salary
count        500.00  500.00    500        500             500            500.00             500.00          500.00        500.00          500.00              500.00           500.00     500.00         500.00
unique          NaN     NaN      2          5               4               NaN                NaN             NaN           NaN             NaN                 NaN              NaN        NaN            NaN
top             NaN     NaN   Male         HR        Bachelor               NaN                NaN             NaN           NaN             NaN                 NaN              NaN        NaN            NaN
freq            NaN     NaN    269        112             192               NaN                NaN             NaN           NaN             NaN                 NaN              NaN        NaN            NaN
mean         250.50   39.61    NaN        NaN             NaN             17.63              69.57           41.89          5.39           26.99                5.95            30.58      11.61       63023.40
std          144.48    9.47    NaN        NaN             NaN              9.60              11.94            4.79          2.18            9.27                1.32            15.93       5.18       12230.97
min            1.00   22.00    NaN        NaN             NaN              0.00              30.00           30.00          0.00            1.70                1.40             0.00       3.00       27000.00
25%          125.75   32.75    NaN        NaN             NaN             10.40              61.60           38.60          4.00           20.80                5.10            18.20       7.00       54300.00
50%          250.50   40.00    NaN        NaN             NaN             17.45              70.10           41.70          5.00           27.10                5.95            28.60      12.00       62000.00
75%          375.25   46.00    NaN        NaN             NaN             23.90              77.93           45.30          7.00           33.30                6.80            41.52      16.00       71850.00
max          500.00   65.00    NaN        NaN             NaN             40.00             100.00           57.30         12.00           52.70                9.60            81.30      20.00       96500.00


Missing Values
========================================
Total missing values: 0
No missing values detected in any variable.


Data Types
========================================
employee_id              int32
age                      int32
gender                  object
department              object
education_level       category
years_experience       float64
performance_score      float64
hours_per_week         float64
num_projects             int32
training_hours         float64
satisfaction_score     float64
remote_work_pct        float64
team_size                int32
annual_salary          float64

Dataset dimensions: 500 observations x 14 variables
Memory usage: 89.7 KB