04 Cross Species Patterns
Jupyter notebook from the Metabolic Capability vs Metabolic Dependency project.
NB04: Cross-Species Patterns¶
Test secondary hypotheses using data from all 27K species:
- H2 (Black Queen): Species with more latent capabilities have more open pangenomes
- H3 (Metabolic ecotypes): Within-species pathway variation correlates with pangenome openness
Runs locally — uses pre-extracted data from NB01.
Inputs¶
data/organism_classification_summary.csv(from NB03) — per-FB-organism classificationsdata/species_pathway_summary.csv(from NB01) — species-level GapMind for all 27K speciesdata/pangenome_stats.csv(from NB01) — pangenome openness metricsdata/organism_mapping.csv(from NB01)
Outputs¶
data/cross_species_analysis.csvfigures/openness_vs_latent.pngfigures/pathway_heterogeneity_vs_openness.pngfigures/pathway_completeness_vs_openness.png
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')
FIG_DIR.mkdir(exist_ok=True)
# Load data
org_class = pd.read_csv(DATA_DIR / 'organism_classification_summary.csv')
species_pw = pd.read_csv(DATA_DIR / 'species_pathway_summary.csv')
pangenome = pd.read_csv(DATA_DIR / 'pangenome_stats.csv')
org_mapping = pd.read_csv(DATA_DIR / 'organism_mapping.csv')
print(f"FB organism classifications: {len(org_class)}")
print(f"Species pathway summaries: {len(species_pw):,}")
print(f"Pangenome stats: {len(pangenome):,}")
1. Test H2: Latent Capabilities vs Pangenome Openness (FB Organisms)¶
Among the ~33 FB organisms, test whether the fraction of latent capabilities correlates with pangenome openness (accessory/total gene cluster ratio).
In [ ]:
# Merge FB organism classifications with pangenome openness
fb_analysis = org_class.merge(
org_mapping[['orgId', 'gtdb_species_clade_id']],
on='orgId'
).merge(
pangenome[['gtdb_species_clade_id', 'openness', 'pct_core', 'pct_singleton', 'no_genomes', 'no_gene_clusters']],
on='gtdb_species_clade_id'
)
print(f"FB organisms with pangenome data: {len(fb_analysis)}")
# Filter to organisms with enough complete pathways for meaningful ratios
min_complete = 5
fb_testable = fb_analysis[fb_analysis.get('total_complete', 0) >= min_complete].copy()
print(f"With ≥{min_complete} complete pathways: {len(fb_testable)}")
if len(fb_testable) > 5 and 'pct_latent' in fb_testable.columns:
# Spearman correlation: pct_latent vs openness
rho, p_val = stats.spearmanr(
fb_testable['pct_latent'].astype(float),
fb_testable['openness'].astype(float)
)
print(f"\nH2 Test (Spearman): rho = {rho:.3f}, p = {p_val:.4f}")
print(f" Interpretation: {'Supports H2' if rho > 0 and p_val < 0.05 else 'Does not support H2'}")
# Also test with genome count as covariate (partial correlation)
from scipy.stats import pearsonr
# Rank-transform for partial Spearman
fb_testable['rank_latent'] = fb_testable['pct_latent'].rank()
fb_testable['rank_openness'] = fb_testable['openness'].rank()
fb_testable['rank_n_genomes'] = fb_testable['no_genomes'].rank()
# Partial correlation (controlling for genome count)
from numpy.linalg import lstsq
x = fb_testable[['rank_latent', 'rank_n_genomes']].values
y = fb_testable['rank_openness'].values
# Residualize both variables on genome count
z = fb_testable['rank_n_genomes'].values.reshape(-1, 1)
res_latent = fb_testable['rank_latent'].values - z.flatten() * lstsq(z, fb_testable['rank_latent'].values, rcond=None)[0]
res_openness = y - z.flatten() * lstsq(z, y, rcond=None)[0]
partial_r, partial_p = pearsonr(res_latent, res_openness)
print(f" Partial correlation (controlling for genome count): r = {partial_r:.3f}, p = {partial_p:.4f}")
else:
print("Insufficient data for H2 test")
In [ ]:
# Figure: Openness vs Latent Capabilities
if len(fb_testable) > 5 and 'pct_latent' in fb_testable.columns:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Panel A: Scatter with regression line
ax = axes[0]
ax.scatter(fb_testable['pct_latent'], fb_testable['openness'],
s=fb_testable['no_genomes'].clip(upper=500) / 5,
alpha=0.7, c='steelblue', edgecolors='black', linewidth=0.5)
# Add organism labels
for _, row in fb_testable.iterrows():
ax.annotate(row['orgId'], (row['pct_latent'], row['openness']),
fontsize=6, alpha=0.7, ha='center', va='bottom')
# Regression line
x = fb_testable['pct_latent'].astype(float)
y = fb_testable['openness'].astype(float)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
x_line = np.linspace(x.min(), x.max(), 100)
ax.plot(x_line, p(x_line), 'r--', alpha=0.5)
ax.set_xlabel('% Latent Capabilities (complete but fitness-neutral)')
ax.set_ylabel('Pangenome Openness (accessory / total clusters)')
ax.set_title(f'H2: Black Queen Hypothesis\n(rho={rho:.3f}, p={p_val:.4f})')
# Panel B: Active vs Latent by openness quartile
ax = axes[1]
fb_testable['openness_q'] = pd.qcut(fb_testable['openness'], q=3, labels=['Closed', 'Medium', 'Open'])
for q in ['Closed', 'Medium', 'Open']:
q_data = fb_testable[fb_testable['openness_q'] == q]
if len(q_data) > 0 and 'pct_active' in q_data.columns:
ax.bar(q, q_data['pct_active'].mean(), yerr=q_data['pct_active'].std(),
color='#dc3545', alpha=0.7, label='Active' if q == 'Closed' else '')
ax.set_ylabel('Mean % Active Dependencies')
ax.set_title('Active Dependencies by Openness Tertile')
plt.tight_layout()
plt.savefig(FIG_DIR / 'openness_vs_latent.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: figures/openness_vs_latent.png")
else:
print("Skipping figure — insufficient data")
2. Test H3: Within-Species Pathway Heterogeneity (All 27K Species)¶
Species with more within-species variation in pathway completeness (high std_complete)
may have more open pangenomes — different genomes in the species have different
metabolic capabilities, defining "metabolic ecotypes."
In [ ]:
# Merge species pathway stats with pangenome openness
species_analysis = species_pw.merge(
pangenome,
left_on='clade_name',
right_on='gtdb_species_clade_id',
how='inner'
)
print(f"Species with both pathway and pangenome data: {len(species_analysis):,}")
# Filter to species with enough genomes for meaningful heterogeneity
min_genomes = 10
species_testable = species_analysis[species_analysis['n_genomes'] >= min_genomes].copy()
print(f"With ≥{min_genomes} genomes: {len(species_testable):,}")
# Compute pathway heterogeneity = coefficient of variation
species_testable['pathway_cv'] = (
species_testable['std_complete'] / species_testable['mean_complete'].replace(0, np.nan)
).fillna(0)
# Spearman: pathway heterogeneity vs pangenome openness
rho_h3, p_h3 = stats.spearmanr(
species_testable['pathway_cv'],
species_testable['openness']
)
print(f"\nH3 Test (Spearman): rho = {rho_h3:.3f}, p = {p_h3:.2e}")
print(f" Interpretation: {'Supports H3' if rho_h3 > 0 and p_h3 < 0.05 else 'Does not support H3'}")
# Also test: mean pathway completeness vs openness
rho_comp, p_comp = stats.spearmanr(
species_testable['mean_complete'],
species_testable['openness']
)
print(f"\nComplementary: Mean completeness vs openness: rho = {rho_comp:.3f}, p = {p_comp:.2e}")
In [ ]:
# Control for genome count (confound: more genomes → more variation by sampling)
# Partial correlation
from scipy.stats import pearsonr
# Rank-transform
species_testable['rank_cv'] = species_testable['pathway_cv'].rank()
species_testable['rank_open'] = species_testable['openness'].rank()
species_testable['rank_ng'] = species_testable['n_genomes'].rank()
# Residualize on genome count
from numpy.linalg import lstsq
z = species_testable['rank_ng'].values.reshape(-1, 1)
res_cv = species_testable['rank_cv'].values - z.flatten() * lstsq(z, species_testable['rank_cv'].values, rcond=None)[0]
res_open = species_testable['rank_open'].values - z.flatten() * lstsq(z, species_testable['rank_open'].values, rcond=None)[0]
partial_r, partial_p = pearsonr(res_cv, res_open)
print(f"Partial correlation (controlling for genome count): r = {partial_r:.3f}, p = {partial_p:.2e}")
In [ ]:
# Figure: Pathway heterogeneity vs pangenome openness
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# Panel A: Pathway heterogeneity vs openness
ax = axes[0]
ax.hexbin(species_testable['pathway_cv'], species_testable['openness'],
gridsize=30, cmap='Blues', mincnt=1)
ax.set_xlabel('Pathway Completeness CV (within-species variation)')
ax.set_ylabel('Pangenome Openness')
ax.set_title(f'H3: Metabolic Ecotypes\n(rho={rho_h3:.3f}, p={p_h3:.2e})')
plt.colorbar(ax.collections[0], ax=ax, label='Species count')
# Panel B: Mean completeness vs openness
ax = axes[1]
ax.hexbin(species_testable['mean_complete'], species_testable['openness'],
gridsize=30, cmap='Oranges', mincnt=1)
ax.set_xlabel('Mean Complete Pathways per Genome')
ax.set_ylabel('Pangenome Openness')
ax.set_title(f'Pathway Completeness vs Openness\n(rho={rho_comp:.3f}, p={p_comp:.2e})')
plt.colorbar(ax.collections[0], ax=ax, label='Species count')
# Panel C: Distribution of pathway CV by openness quartile
ax = axes[2]
species_testable['openness_q'] = pd.qcut(
species_testable['openness'], q=4,
labels=['Q1 (Closed)', 'Q2', 'Q3', 'Q4 (Open)']
)
species_testable.boxplot(column='pathway_cv', by='openness_q', ax=ax)
ax.set_xlabel('Pangenome Openness Quartile')
ax.set_ylabel('Pathway Completeness CV')
ax.set_title('Pathway Heterogeneity by Openness Quartile')
plt.suptitle('') # Remove auto-title from boxplot
plt.tight_layout()
plt.savefig(FIG_DIR / 'pathway_heterogeneity_vs_openness.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: figures/pathway_heterogeneity_vs_openness.png")
3. Additional Analyses¶
In [ ]:
# Analysis: Pathway completeness range (max - min) as alternative heterogeneity metric
species_testable['pathway_range'] = species_testable['max_complete'] - species_testable['min_complete']
rho_range, p_range = stats.spearmanr(
species_testable['pathway_range'],
species_testable['openness']
)
print(f"Pathway completeness range vs openness: rho = {rho_range:.3f}, p = {p_range:.2e}")
# Genome count as confounder check
rho_ng, p_ng = stats.spearmanr(
species_testable['pathway_cv'],
species_testable['n_genomes']
)
print(f"Pathway CV vs genome count: rho = {rho_ng:.3f}, p = {p_ng:.2e}")
print(f" (Positive rho expected: more genomes → more observed variation)")
In [ ]:
# Analysis: Which pathways show the most within-species variation?
# (This requires per-pathway species-level data — would need additional extraction)
# For now, use the aggregate metrics
# Top species by pathway heterogeneity
top_heterogeneous = species_testable.nlargest(20, 'pathway_cv')[
['clade_name', 'n_genomes', 'mean_complete', 'std_complete', 'pathway_cv', 'openness']
]
print("Top 20 species by pathway heterogeneity:")
print(top_heterogeneous.to_string(index=False))
4. Save Results¶
In [ ]:
# Save cross-species analysis
species_testable.to_csv(DATA_DIR / 'cross_species_analysis.csv', index=False)
print(f"Saved: cross_species_analysis.csv ({len(species_testable):,} rows)")
print("\n" + "=" * 60)
print("NB04 SUMMARY")
print("=" * 60)
print(f"H2 (Black Queen): rho = {rho:.3f}, p = {p_val:.4f}")
print(f" {'SUPPORTED' if rho > 0 and p_val < 0.05 else 'NOT SUPPORTED'}")
print(f"H3 (Metabolic ecotypes): rho = {rho_h3:.3f}, p = {p_h3:.2e}")
print(f" {'SUPPORTED' if rho_h3 > 0 and p_h3 < 0.05 else 'NOT SUPPORTED'}")
print(f" Partial (controlling genome count): r = {partial_r:.3f}, p = {partial_p:.2e}")
print(f"Species analyzed: {len(species_testable):,} (≥{min_genomes} genomes)")
print("=" * 60)
print("\nNext: Run NB05 (summary figures) — runs locally")