# Cell 1: Setup and load results

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy import stats
import seaborn as sns

# Data path
DATA_PATH = "/home/psdehal/pangenome_science/data/temporal_core"

# Load results from notebook 02
cumulative_df = pd.read_parquet(f"{DATA_PATH}/cumulative_results.parquet")
window_df = pd.read_parquet(f"{DATA_PATH}/window_results.parquet")
jaccard_df = pd.read_csv(f"{DATA_PATH}/inter_window_jaccard.csv")
stability_df = pd.read_csv(f"{DATA_PATH}/core_stability.csv")
exit_df = pd.read_parquet(f"{DATA_PATH}/gene_exit_points.parquet")

SPECIES = ['p_aeruginosa', 'a_baumannii']
THRESHOLDS = [0.90, 0.95, 0.99]

print(f"Loaded cumulative results: {len(cumulative_df)} rows")
print(f"Loaded window results: {len(window_df)} rows")
print(f"Loaded exit data: {len(exit_df)} gene-species pairs")

# Cell 2: Fit power law decay models

def power_law(x, c0, alpha):
    """Power law: C(n) = c0 * n^(-alpha)"""
    return c0 * np.power(x, -alpha)

def log_linear(x, c0, k):
    """Log-linear: C(n) = c0 - k * log(n)"""
    return c0 - k * np.log(x)

fit_results = []

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for row_idx, species in enumerate(SPECIES):
    species_data = cumulative_df[cumulative_df['species'] == species]
    
    for col_idx, threshold in enumerate(THRESHOLDS):
        ax = axes[row_idx, col_idx]
        
        thresh_data = species_data[species_data['threshold'] == threshold].copy()
        x = thresh_data['n_genomes'].values
        y = thresh_data['core_size'].values
        
        # Scatter plot of actual data
        ax.scatter(x, y, alpha=0.5, s=20, label='Observed')
        
        # Fit power law
        try:
            popt_power, pcov = curve_fit(power_law, x, y, p0=[y[0]*10, 0.1], maxfev=5000)
            y_pred_power = power_law(x, *popt_power)
            
            # R-squared
            ss_res = np.sum((y - y_pred_power) ** 2)
            ss_tot = np.sum((y - np.mean(y)) ** 2)
            r2_power = 1 - (ss_res / ss_tot)
            
            ax.plot(x, y_pred_power, 'r-', linewidth=2, 
                    label=f'Power law (α={popt_power[1]:.3f}, R²={r2_power:.3f})')
            
            fit_results.append({
                'species': species,
                'threshold': threshold,
                'model': 'power_law',
                'c0': popt_power[0],
                'alpha': popt_power[1],
                'r_squared': r2_power
            })
        except Exception as e:
            print(f"Power law fit failed for {species} {threshold}: {e}")
        
        # Fit log-linear for comparison
        try:
            popt_log, _ = curve_fit(log_linear, x, y, p0=[y[0], 100])
            y_pred_log = log_linear(x, *popt_log)
            ss_res = np.sum((y - y_pred_log) ** 2)
            r2_log = 1 - (ss_res / ss_tot)
            
            ax.plot(x, y_pred_log, 'g--', linewidth=2, alpha=0.7,
                    label=f'Log-linear (R²={r2_log:.3f})')
            
            fit_results.append({
                'species': species,
                'threshold': threshold,
                'model': 'log_linear',
                'c0': popt_log[0],
                'k': popt_log[1],
                'r_squared': r2_log
            })
        except Exception as e:
            print(f"Log-linear fit failed for {species} {threshold}: {e}")
        
        ax.set_xlabel('Number of Genomes')
        ax.set_ylabel('Core Size')
        ax.set_title(f'{species} - {int(threshold*100)}% threshold')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f"{DATA_PATH}/decay_model_fits.png", dpi=150)
plt.show()

fit_df = pd.DataFrame(fit_results)
print("\nModel fit results:")
print(fit_df.to_string(index=False))

# Cell 3: Compare decay rates between species

# Extract power law alpha values for comparison
power_fits = fit_df[fit_df['model'] == 'power_law'].copy()

fig, ax = plt.subplots(figsize=(8, 5))

x_pos = np.arange(len(THRESHOLDS))
width = 0.35

for i, species in enumerate(SPECIES):
    species_fits = power_fits[power_fits['species'] == species]
    alphas = [species_fits[species_fits['threshold'] == t]['alpha'].values[0] 
              for t in THRESHOLDS]
    ax.bar(x_pos + i*width, alphas, width, label=species, alpha=0.8)

ax.set_xlabel('Core Threshold')
ax.set_ylabel('Decay Exponent (α)')
ax.set_title('Core Decay Rate Comparison\n(Higher α = Faster Decay)')
ax.set_xticks(x_pos + width/2)
ax.set_xticklabels([f'{int(t*100)}%' for t in THRESHOLDS])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(f"{DATA_PATH}/decay_rate_comparison.png", dpi=150)
plt.show()

# Cell 4: Analyze gene exit patterns

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for idx, species in enumerate(SPECIES):
    ax = axes[idx]
    
    species_exits = exit_df[exit_df['species'] == species].copy()
    
    # Get exit points (excluding never_exits)
    exits_with_point = species_exits[species_exits['exit_at_n_genomes'].notna()]
    never_exits = species_exits[species_exits['never_exits']]
    
    if len(exits_with_point) > 0:
        ax.hist(exits_with_point['exit_at_n_genomes'], bins=30, 
                edgecolor='black', alpha=0.7, label='Exits core')
    
    ax.axvline(exits_with_point['exit_at_n_genomes'].median(), 
               color='red', linestyle='--', linewidth=2,
               label=f'Median exit: {exits_with_point["exit_at_n_genomes"].median():.0f}')
    
    ax.set_xlabel('Number of Genomes When Gene Exits Core')
    ax.set_ylabel('Number of Genes')
    ax.set_title(f'{species}\n({len(never_exits)} genes never exit, {len(exits_with_point)} exit)')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f"{DATA_PATH}/gene_exit_distribution.png", dpi=150)
plt.show()

# Cell 5: Categorize genes by exit timing

exit_categories = []

for species in SPECIES:
    species_exits = exit_df[exit_df['species'] == species].copy()
    
    # Get total genome count for this species
    species_cumulative = cumulative_df[cumulative_df['species'] == species]
    max_n = species_cumulative['n_genomes'].max()
    
    # Categorize
    categories = {
        'Stable (never exits)': species_exits['never_exits'].sum(),
        'Late exit (>75%)': ((species_exits['exit_at_n_genomes'] > max_n * 0.75) & 
                             (~species_exits['never_exits'])).sum(),
        'Mid exit (25-75%)': ((species_exits['exit_at_n_genomes'] > max_n * 0.25) & 
                              (species_exits['exit_at_n_genomes'] <= max_n * 0.75)).sum(),
        'Early exit (<25%)': (species_exits['exit_at_n_genomes'] <= max_n * 0.25).sum()
    }
    
    for cat, count in categories.items():
        exit_categories.append({
            'species': species,
            'category': cat,
            'count': count,
            'percentage': 100 * count / len(species_exits)
        })

exit_cat_df = pd.DataFrame(exit_categories)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for idx, species in enumerate(SPECIES):
    ax = axes[idx]
    species_data = exit_cat_df[exit_cat_df['species'] == species]
    
    colors = ['#2ecc71', '#3498db', '#f39c12', '#e74c3c']  # green, blue, orange, red
    ax.pie(species_data['count'], labels=species_data['category'],
           autopct='%1.1f%%', colors=colors, startangle=90)
    ax.set_title(f'{species}\nCore Gene Exit Categories')

plt.tight_layout()
plt.savefig(f"{DATA_PATH}/gene_exit_categories.png", dpi=150)
plt.show()

print("Exit categories:")
print(exit_cat_df.pivot(index='category', columns='species', values='percentage').round(1))

# Cell 6: Visualize inter-window Jaccard similarities

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for idx, species in enumerate(SPECIES):
    ax = axes[idx]
    
    species_jaccard = jaccard_df[jaccard_df['species'] == species]
    
    for threshold in THRESHOLDS:
        thresh_data = species_jaccard[species_jaccard['threshold'] == threshold]
        
        # Create transition labels
        transitions = [f"{row['window_1']}→{row['window_2'].split('-')[1]}" 
                      for _, row in thresh_data.iterrows()]
        
        ax.plot(range(len(thresh_data)), thresh_data['jaccard'], 
                marker='o', label=f"{int(threshold*100)}%")
    
    ax.set_xlabel('Window Transition')
    ax.set_ylabel('Jaccard Similarity')
    ax.set_title(f'{species}\nCore Similarity Between Adjacent Time Windows')
    ax.set_ylim(0, 1)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.axhline(0.8, color='gray', linestyle='--', alpha=0.5, label='80% threshold')

plt.tight_layout()
plt.savefig(f"{DATA_PATH}/inter_window_similarity.png", dpi=150)
plt.show()

# Cell 7: Core stability summary

print("=== CORE STABILITY SUMMARY ===")
print()

for species in SPECIES:
    print(f"\n{species}:")
    species_stability = stability_df[stability_df['species'] == species]
    
    for _, row in species_stability.iterrows():
        print(f"  {int(row['threshold']*100)}% threshold:")
        print(f"    Stable core: {row['stable_core_size']} genes")
        print(f"    Ever-core: {row['ever_core_size']} genes")
        print(f"    Stability ratio: {row['stability_ratio']:.1%}")

# Visualize stability ratios
fig, ax = plt.subplots(figsize=(8, 5))

x_pos = np.arange(len(THRESHOLDS))
width = 0.35

for i, species in enumerate(SPECIES):
    species_data = stability_df[stability_df['species'] == species]
    ratios = [species_data[species_data['threshold'] == t]['stability_ratio'].values[0] 
              for t in THRESHOLDS]
    ax.bar(x_pos + i*width, ratios, width, label=species, alpha=0.8)

ax.set_xlabel('Core Threshold')
ax.set_ylabel('Stability Ratio\n(Stable Core / Ever-Core)')
ax.set_title('Core Stability Across Time Windows')
ax.set_xticks(x_pos + width/2)
ax.set_xticklabels([f'{int(t*100)}%' for t in THRESHOLDS])
ax.set_ylim(0, 1)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(f"{DATA_PATH}/core_stability_ratio.png", dpi=150)
plt.show()

# Cell 8: Generate summary statistics

print("="*60)
print("TEMPORAL CORE GENOME DYNAMICS - KEY FINDINGS")
print("="*60)

for species in SPECIES:
    print(f"\n{'='*60}")
    print(f"{species.upper()}")
    print(f"{'='*60}")
    
    # Cumulative decay
    species_cumulative = cumulative_df[(cumulative_df['species'] == species) & 
                                        (cumulative_df['threshold'] == 0.95)]
    initial_core = species_cumulative.iloc[0]['core_size']
    final_core = species_cumulative.iloc[-1]['core_size']
    n_genomes = species_cumulative.iloc[-1]['n_genomes']
    
    print(f"\n1. CORE DECAY (95% threshold):")
    print(f"   Initial core (n={species_cumulative.iloc[0]['n_genomes']}): {initial_core} genes")
    print(f"   Final core (n={n_genomes}): {final_core} genes")
    print(f"   Reduction: {initial_core - final_core} genes ({100*(initial_core-final_core)/initial_core:.1f}%)")
    
    # Decay rate
    species_fit = fit_df[(fit_df['species'] == species) & 
                         (fit_df['threshold'] == 0.95) &
                         (fit_df['model'] == 'power_law')]
    if len(species_fit) > 0:
        alpha = species_fit.iloc[0]['alpha']
        r2 = species_fit.iloc[0]['r_squared']
        print(f"   Power law decay: α = {alpha:.4f} (R² = {r2:.3f})")
    
    # Stability
    species_stability = stability_df[(stability_df['species'] == species) & 
                                      (stability_df['threshold'] == 0.95)]
    if len(species_stability) > 0:
        stable = species_stability.iloc[0]['stable_core_size']
        ever = species_stability.iloc[0]['ever_core_size']
        ratio = species_stability.iloc[0]['stability_ratio']
        print(f"\n2. CORE STABILITY (across time windows):")
        print(f"   Stable core: {stable} genes (present in ALL windows)")
        print(f"   Ever-core: {ever} genes (present in ANY window)")
        print(f"   Stability ratio: {ratio:.1%}")
    
    # Exit patterns
    species_exits = exit_df[exit_df['species'] == species]
    never_exit = species_exits['never_exits'].sum()
    early_exit = species_exits['exits_early'].sum()
    print(f"\n3. GENE EXIT PATTERNS:")
    print(f"   Never exit (most stable): {never_exit} genes")
    print(f"   Early exit (most volatile): {early_exit} genes")

print(f"\n{'='*60}")
print("CONCLUSION")
print(f"{'='*60}")
print("""
The analysis reveals that core genome composition is NOT static over
collection time. Both species show significant core erosion as genomes
sampled from more time points are included. This supports the hypothesis
that the observed small cores in heavily-sampled species partly reflect
temporal dynamics rather than true genome fluidity.

Key evidence:
- Core size decreases predictably (power law) with more genomes
- Substantial fraction of 'core' genes are transient (window-specific)
- Some genes reliably exit core early, others persist indefinitely
""")

# Cell 9: Save analysis results

# Save fit results
fit_df.to_csv(f"{DATA_PATH}/decay_fit_results.csv", index=False)
print(f"Saved: decay_fit_results.csv")

# Save exit categories
exit_cat_df.to_csv(f"{DATA_PATH}/gene_exit_categories.csv", index=False)
print(f"Saved: gene_exit_categories.csv")

print("\nAll analysis results saved!")

03 Analysis

Temporal Core Genome Dynamics - Analysis¶

Goal¶

1. Core Decay Model Fitting¶

2. Core Turnover Analysis¶

3. Window-to-Window Core Dynamics¶

4. Key Findings Summary¶

Summary¶

Figures Created¶

Next Steps¶