03 Analysis
Jupyter notebook from the Temporal Core Genome Dynamics project.
In [ ]:
# Cell 1: Setup and load results
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy import stats
import seaborn as sns
# Data path
DATA_PATH = "/home/psdehal/pangenome_science/data/temporal_core"
# Load results from notebook 02
cumulative_df = pd.read_parquet(f"{DATA_PATH}/cumulative_results.parquet")
window_df = pd.read_parquet(f"{DATA_PATH}/window_results.parquet")
jaccard_df = pd.read_csv(f"{DATA_PATH}/inter_window_jaccard.csv")
stability_df = pd.read_csv(f"{DATA_PATH}/core_stability.csv")
exit_df = pd.read_parquet(f"{DATA_PATH}/gene_exit_points.parquet")
SPECIES = ['p_aeruginosa', 'a_baumannii']
THRESHOLDS = [0.90, 0.95, 0.99]
print(f"Loaded cumulative results: {len(cumulative_df)} rows")
print(f"Loaded window results: {len(window_df)} rows")
print(f"Loaded exit data: {len(exit_df)} gene-species pairs")
1. Core Decay Model Fitting¶
Fit a power law model to the cumulative core decay: $$C(n) = C_0 \cdot n^{-\alpha}$$
Where:
- $C(n)$ = core size at n genomes
- $C_0$ = initial scaling factor
- $\alpha$ = decay exponent (higher = faster decay)
In [ ]:
# Cell 2: Fit power law decay models
def power_law(x, c0, alpha):
"""Power law: C(n) = c0 * n^(-alpha)"""
return c0 * np.power(x, -alpha)
def log_linear(x, c0, k):
"""Log-linear: C(n) = c0 - k * log(n)"""
return c0 - k * np.log(x)
fit_results = []
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for row_idx, species in enumerate(SPECIES):
species_data = cumulative_df[cumulative_df['species'] == species]
for col_idx, threshold in enumerate(THRESHOLDS):
ax = axes[row_idx, col_idx]
thresh_data = species_data[species_data['threshold'] == threshold].copy()
x = thresh_data['n_genomes'].values
y = thresh_data['core_size'].values
# Scatter plot of actual data
ax.scatter(x, y, alpha=0.5, s=20, label='Observed')
# Fit power law
try:
popt_power, pcov = curve_fit(power_law, x, y, p0=[y[0]*10, 0.1], maxfev=5000)
y_pred_power = power_law(x, *popt_power)
# R-squared
ss_res = np.sum((y - y_pred_power) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
r2_power = 1 - (ss_res / ss_tot)
ax.plot(x, y_pred_power, 'r-', linewidth=2,
label=f'Power law (α={popt_power[1]:.3f}, R²={r2_power:.3f})')
fit_results.append({
'species': species,
'threshold': threshold,
'model': 'power_law',
'c0': popt_power[0],
'alpha': popt_power[1],
'r_squared': r2_power
})
except Exception as e:
print(f"Power law fit failed for {species} {threshold}: {e}")
# Fit log-linear for comparison
try:
popt_log, _ = curve_fit(log_linear, x, y, p0=[y[0], 100])
y_pred_log = log_linear(x, *popt_log)
ss_res = np.sum((y - y_pred_log) ** 2)
r2_log = 1 - (ss_res / ss_tot)
ax.plot(x, y_pred_log, 'g--', linewidth=2, alpha=0.7,
label=f'Log-linear (R²={r2_log:.3f})')
fit_results.append({
'species': species,
'threshold': threshold,
'model': 'log_linear',
'c0': popt_log[0],
'k': popt_log[1],
'r_squared': r2_log
})
except Exception as e:
print(f"Log-linear fit failed for {species} {threshold}: {e}")
ax.set_xlabel('Number of Genomes')
ax.set_ylabel('Core Size')
ax.set_title(f'{species} - {int(threshold*100)}% threshold')
ax.legend(fontsize=8)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/decay_model_fits.png", dpi=150)
plt.show()
fit_df = pd.DataFrame(fit_results)
print("\nModel fit results:")
print(fit_df.to_string(index=False))
In [ ]:
# Cell 3: Compare decay rates between species
# Extract power law alpha values for comparison
power_fits = fit_df[fit_df['model'] == 'power_law'].copy()
fig, ax = plt.subplots(figsize=(8, 5))
x_pos = np.arange(len(THRESHOLDS))
width = 0.35
for i, species in enumerate(SPECIES):
species_fits = power_fits[power_fits['species'] == species]
alphas = [species_fits[species_fits['threshold'] == t]['alpha'].values[0]
for t in THRESHOLDS]
ax.bar(x_pos + i*width, alphas, width, label=species, alpha=0.8)
ax.set_xlabel('Core Threshold')
ax.set_ylabel('Decay Exponent (α)')
ax.set_title('Core Decay Rate Comparison\n(Higher α = Faster Decay)')
ax.set_xticks(x_pos + width/2)
ax.set_xticklabels([f'{int(t*100)}%' for t in THRESHOLDS])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/decay_rate_comparison.png", dpi=150)
plt.show()
In [ ]:
# Cell 4: Analyze gene exit patterns
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for idx, species in enumerate(SPECIES):
ax = axes[idx]
species_exits = exit_df[exit_df['species'] == species].copy()
# Get exit points (excluding never_exits)
exits_with_point = species_exits[species_exits['exit_at_n_genomes'].notna()]
never_exits = species_exits[species_exits['never_exits']]
if len(exits_with_point) > 0:
ax.hist(exits_with_point['exit_at_n_genomes'], bins=30,
edgecolor='black', alpha=0.7, label='Exits core')
ax.axvline(exits_with_point['exit_at_n_genomes'].median(),
color='red', linestyle='--', linewidth=2,
label=f'Median exit: {exits_with_point["exit_at_n_genomes"].median():.0f}')
ax.set_xlabel('Number of Genomes When Gene Exits Core')
ax.set_ylabel('Number of Genes')
ax.set_title(f'{species}\n({len(never_exits)} genes never exit, {len(exits_with_point)} exit)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/gene_exit_distribution.png", dpi=150)
plt.show()
In [ ]:
# Cell 5: Categorize genes by exit timing
exit_categories = []
for species in SPECIES:
species_exits = exit_df[exit_df['species'] == species].copy()
# Get total genome count for this species
species_cumulative = cumulative_df[cumulative_df['species'] == species]
max_n = species_cumulative['n_genomes'].max()
# Categorize
categories = {
'Stable (never exits)': species_exits['never_exits'].sum(),
'Late exit (>75%)': ((species_exits['exit_at_n_genomes'] > max_n * 0.75) &
(~species_exits['never_exits'])).sum(),
'Mid exit (25-75%)': ((species_exits['exit_at_n_genomes'] > max_n * 0.25) &
(species_exits['exit_at_n_genomes'] <= max_n * 0.75)).sum(),
'Early exit (<25%)': (species_exits['exit_at_n_genomes'] <= max_n * 0.25).sum()
}
for cat, count in categories.items():
exit_categories.append({
'species': species,
'category': cat,
'count': count,
'percentage': 100 * count / len(species_exits)
})
exit_cat_df = pd.DataFrame(exit_categories)
# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for idx, species in enumerate(SPECIES):
ax = axes[idx]
species_data = exit_cat_df[exit_cat_df['species'] == species]
colors = ['#2ecc71', '#3498db', '#f39c12', '#e74c3c'] # green, blue, orange, red
ax.pie(species_data['count'], labels=species_data['category'],
autopct='%1.1f%%', colors=colors, startangle=90)
ax.set_title(f'{species}\nCore Gene Exit Categories')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/gene_exit_categories.png", dpi=150)
plt.show()
print("Exit categories:")
print(exit_cat_df.pivot(index='category', columns='species', values='percentage').round(1))
3. Window-to-Window Core Dynamics¶
In [ ]:
# Cell 6: Visualize inter-window Jaccard similarities
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for idx, species in enumerate(SPECIES):
ax = axes[idx]
species_jaccard = jaccard_df[jaccard_df['species'] == species]
for threshold in THRESHOLDS:
thresh_data = species_jaccard[species_jaccard['threshold'] == threshold]
# Create transition labels
transitions = [f"{row['window_1']}→{row['window_2'].split('-')[1]}"
for _, row in thresh_data.iterrows()]
ax.plot(range(len(thresh_data)), thresh_data['jaccard'],
marker='o', label=f"{int(threshold*100)}%")
ax.set_xlabel('Window Transition')
ax.set_ylabel('Jaccard Similarity')
ax.set_title(f'{species}\nCore Similarity Between Adjacent Time Windows')
ax.set_ylim(0, 1)
ax.legend()
ax.grid(True, alpha=0.3)
ax.axhline(0.8, color='gray', linestyle='--', alpha=0.5, label='80% threshold')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/inter_window_similarity.png", dpi=150)
plt.show()
In [ ]:
# Cell 7: Core stability summary
print("=== CORE STABILITY SUMMARY ===")
print()
for species in SPECIES:
print(f"\n{species}:")
species_stability = stability_df[stability_df['species'] == species]
for _, row in species_stability.iterrows():
print(f" {int(row['threshold']*100)}% threshold:")
print(f" Stable core: {row['stable_core_size']} genes")
print(f" Ever-core: {row['ever_core_size']} genes")
print(f" Stability ratio: {row['stability_ratio']:.1%}")
# Visualize stability ratios
fig, ax = plt.subplots(figsize=(8, 5))
x_pos = np.arange(len(THRESHOLDS))
width = 0.35
for i, species in enumerate(SPECIES):
species_data = stability_df[stability_df['species'] == species]
ratios = [species_data[species_data['threshold'] == t]['stability_ratio'].values[0]
for t in THRESHOLDS]
ax.bar(x_pos + i*width, ratios, width, label=species, alpha=0.8)
ax.set_xlabel('Core Threshold')
ax.set_ylabel('Stability Ratio\n(Stable Core / Ever-Core)')
ax.set_title('Core Stability Across Time Windows')
ax.set_xticks(x_pos + width/2)
ax.set_xticklabels([f'{int(t*100)}%' for t in THRESHOLDS])
ax.set_ylim(0, 1)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/core_stability_ratio.png", dpi=150)
plt.show()
4. Key Findings Summary¶
In [ ]:
# Cell 8: Generate summary statistics
print("="*60)
print("TEMPORAL CORE GENOME DYNAMICS - KEY FINDINGS")
print("="*60)
for species in SPECIES:
print(f"\n{'='*60}")
print(f"{species.upper()}")
print(f"{'='*60}")
# Cumulative decay
species_cumulative = cumulative_df[(cumulative_df['species'] == species) &
(cumulative_df['threshold'] == 0.95)]
initial_core = species_cumulative.iloc[0]['core_size']
final_core = species_cumulative.iloc[-1]['core_size']
n_genomes = species_cumulative.iloc[-1]['n_genomes']
print(f"\n1. CORE DECAY (95% threshold):")
print(f" Initial core (n={species_cumulative.iloc[0]['n_genomes']}): {initial_core} genes")
print(f" Final core (n={n_genomes}): {final_core} genes")
print(f" Reduction: {initial_core - final_core} genes ({100*(initial_core-final_core)/initial_core:.1f}%)")
# Decay rate
species_fit = fit_df[(fit_df['species'] == species) &
(fit_df['threshold'] == 0.95) &
(fit_df['model'] == 'power_law')]
if len(species_fit) > 0:
alpha = species_fit.iloc[0]['alpha']
r2 = species_fit.iloc[0]['r_squared']
print(f" Power law decay: α = {alpha:.4f} (R² = {r2:.3f})")
# Stability
species_stability = stability_df[(stability_df['species'] == species) &
(stability_df['threshold'] == 0.95)]
if len(species_stability) > 0:
stable = species_stability.iloc[0]['stable_core_size']
ever = species_stability.iloc[0]['ever_core_size']
ratio = species_stability.iloc[0]['stability_ratio']
print(f"\n2. CORE STABILITY (across time windows):")
print(f" Stable core: {stable} genes (present in ALL windows)")
print(f" Ever-core: {ever} genes (present in ANY window)")
print(f" Stability ratio: {ratio:.1%}")
# Exit patterns
species_exits = exit_df[exit_df['species'] == species]
never_exit = species_exits['never_exits'].sum()
early_exit = species_exits['exits_early'].sum()
print(f"\n3. GENE EXIT PATTERNS:")
print(f" Never exit (most stable): {never_exit} genes")
print(f" Early exit (most volatile): {early_exit} genes")
print(f"\n{'='*60}")
print("CONCLUSION")
print(f"{'='*60}")
print("""
The analysis reveals that core genome composition is NOT static over
collection time. Both species show significant core erosion as genomes
sampled from more time points are included. This supports the hypothesis
that the observed small cores in heavily-sampled species partly reflect
temporal dynamics rather than true genome fluidity.
Key evidence:
- Core size decreases predictably (power law) with more genomes
- Substantial fraction of 'core' genes are transient (window-specific)
- Some genes reliably exit core early, others persist indefinitely
""")
In [ ]:
# Cell 9: Save analysis results
# Save fit results
fit_df.to_csv(f"{DATA_PATH}/decay_fit_results.csv", index=False)
print(f"Saved: decay_fit_results.csv")
# Save exit categories
exit_cat_df.to_csv(f"{DATA_PATH}/gene_exit_categories.csv", index=False)
print(f"Saved: gene_exit_categories.csv")
print("\nAll analysis results saved!")
Summary¶
Figures Created¶
decay_model_fits.png- Power law and log-linear fits to core decaydecay_rate_comparison.png- Comparison of decay rates between speciesgene_exit_distribution.png- When genes exit the coregene_exit_categories.png- Pie charts of exit timing categoriesinter_window_similarity.png- Jaccard similarity between time windowscore_stability_ratio.png- Stable vs ever-core ratios
Next Steps¶
Run 04_functional.ipynb to:
- Identify what functions characterize early-exiting genes
- Compare COG categories between stable and transient core