import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.patches import Patch
import seaborn as sns
from scipy import stats
from pathlib import Path

# Matplotlib defaults applied to every figure produced below.
plt.rcParams['font.size'] = 10
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['figure.dpi'] = 150

# Input tables live in ../data; figures are written to ../figures, which is
# created here if it does not exist yet.
DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')
FIG_DIR.mkdir(exist_ok=True)


def _read_table(csv_name):
    """Read one CSV file from DATA_DIR into a DataFrame."""
    return pd.read_csv(DATA_DIR / csv_name)


# Load every results table used by the figures and the summary.
classifications = _read_table('pathway_classifications.csv')
org_summary = _read_table('organism_classification_summary.csv')
pgf = _read_table('pathway_gene_fitness.csv')
cross_species = _read_table('cross_species_analysis.csv')
org_mapping = _read_table('organism_mapping.csv')
pangenome = _read_table('pangenome_stats.csv')

# Quick row-count report for the three largest inputs.
n_classification_rows = len(classifications)
n_pgf_rows = len(pgf)
n_cross_species_rows = len(cross_species)
print(f"Classifications: {n_classification_rows}")
print(f"Pathway-gene-fitness: {n_pgf_rows:,}")
print(f"Cross-species: {n_cross_species_rows:,}")

# Hex colour used to draw each pathway classification.
CLASS_COLORS = dict(
    active_dependency='#d62728',  # red
    latent_capability='#2ca02c',  # green
    partial='#ff7f0e',            # orange
    absent='#e0e0e0',             # light gray
    unmapped='#bdbdbd',           # gray
)

# Numeric code for each classification, used as the heatmap cell value.
CLASS_NUMERIC = dict(
    absent=0,
    unmapped=0.5,
    partial=1,
    latent_capability=2,
    active_dependency=3,
)

# Figure 1: organism x pathway heatmap of pathway classifications.
#
# Build the matrix: one row per organism, one column per pathway, each cell
# holding the numeric code of its classification (see CLASS_NUMERIC).
classifications['class_num'] = classifications['classification'].map(CLASS_NUMERIC)
pivot = classifications.pivot_table(
    index='orgId',
    columns='gapmind_pathway',
    values='class_num',
    aggfunc='first'
).fillna(0)  # organism/pathway pairs without a value are drawn as 'absent'

# Sort organisms by total active dependencies (most first).
org_order = (
    (pivot == CLASS_NUMERIC['active_dependency'])
    .sum(axis=1)
    .sort_values(ascending=False)
    .index
)

# Sort pathways by metabolic category (ascending, so 'amino_acid' sorts before
# 'carbon'), then by how often the pathway is an active dependency.
pathway_info = classifications.drop_duplicates('gapmind_pathway')[['gapmind_pathway', 'metabolic_category']]
pathway_active_freq = (classifications[classifications['classification'] == 'active_dependency']
                       .groupby('gapmind_pathway').size())
pathway_info = pathway_info.merge(
    pathway_active_freq.rename('n_active').reset_index(),
    on='gapmind_pathway', how='left'
)
# Only the count column is filled: filling the whole frame would turn a
# missing metabolic_category into the integer 0 and make the sort column
# a mix of strings and numbers.
pathway_info['n_active'] = pathway_info['n_active'].fillna(0)
pathway_info = pathway_info.sort_values(['metabolic_category', 'n_active'], ascending=[True, False])
pathway_order = [p for p in pathway_info['gapmind_pathway'] if p in pivot.columns]

pivot = pivot.reindex(index=org_order, columns=pathway_order)

# Plot
fig, ax = plt.subplots(figsize=(22, 8))

from matplotlib.colors import ListedColormap, BoundaryNorm
# Derive the colour list from CLASS_COLORS, ordered by numeric code, so the
# heatmap cannot drift out of sync with the shared colour scheme.
class_levels = sorted(CLASS_NUMERIC, key=CLASS_NUMERIC.get)
cmap = ListedColormap([CLASS_COLORS[c] for c in class_levels])
# One bin per numeric code: 0, 0.5, 1, 2, 3.
bounds = [-0.25, 0.25, 0.75, 1.5, 2.5, 3.5]
norm = BoundaryNorm(bounds, cmap.N)

im = ax.imshow(pivot.values, aspect='auto', cmap=cmap, norm=norm, interpolation='none')

ax.set_yticks(range(len(pivot.index)))
ax.set_yticklabels(pivot.index, fontsize=7)
ax.set_xticks(range(len(pivot.columns)))
ax.set_xticklabels(pivot.columns, rotation=90, fontsize=6)

# Add category divider line between amino acid and carbon pathways.
# This relies on the amino-acid columns forming the leading block of
# pathway_order, which holds as long as 'amino_acid' is the first category
# in sort order.
category_of = dict(zip(pathway_info['gapmind_pathway'], pathway_info['metabolic_category']))
aa_count = sum(category_of[p] == 'amino_acid' for p in pathway_order)
n_pathway_cols = len(pathway_order)
ax.axvline(x=aa_count - 0.5, color='black', linewidth=2, linestyle='-')
# imshow centres column j at x = j, so a block of columns a..b is centred
# at (a + b) / 2.
ax.text((aa_count - 1) / 2, -1.5, 'Amino Acid Biosynthesis', ha='center', fontsize=9, fontweight='bold')
ax.text((aa_count + n_pathway_cols - 1) / 2, -1.5, 'Carbon Source Utilization',
        ha='center', fontsize=9, fontweight='bold')

# Legend: the four main classes, plus 'unmapped' whenever the matrix actually
# contains unmapped cells (they are drawn in their own colour).
legend_classes = ['active_dependency', 'latent_capability', 'partial', 'absent']
if (pivot.values == CLASS_NUMERIC['unmapped']).any():
    legend_classes.append('unmapped')
legend_elements = [
    Patch(facecolor=CLASS_COLORS[c], label=c.replace('_', ' ').title())
    for c in legend_classes
]
ax.legend(handles=legend_elements, loc='upper right', fontsize=9,
         framealpha=0.9, edgecolor='black')

ax.set_title('Metabolic Pathway Classification: Capability vs Dependency', fontsize=14, pad=20)

plt.tight_layout()
plt.savefig(FIG_DIR / 'fig1_classification_heatmap.png', dpi=200, bbox_inches='tight')
plt.show()
print("Saved: figures/fig1_classification_heatmap.png")

# Figure 2: one pie chart per metabolic category showing how its
# organism/pathway pairs split across the classification classes.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

pie_panels = (
    ('amino_acid', 'Amino Acid Biosynthesis'),
    ('carbon', 'Carbon Source Utilization'),
)

for panel_ax, (category, panel_title) in zip(axes, pie_panels):
    in_category = classifications['metabolic_category'] == category
    class_counts = classifications.loc[in_category, 'classification'].value_counts()

    # Colour and display label for each slice, in value_counts order.
    slice_colors = []
    slice_labels = []
    for class_name in class_counts.index:
        slice_colors.append(CLASS_COLORS.get(class_name, '#999'))
        slice_labels.append(class_name.replace('_', ' ').title())

    _, _, pct_texts = panel_ax.pie(
        class_counts.values,
        labels=slice_labels,
        colors=slice_colors,
        autopct='%1.1f%%',
        pctdistance=0.85,
        textprops={'fontsize': 9},
    )
    # The percentage annotations are drawn slightly smaller than the labels.
    for pct_text in pct_texts:
        pct_text.set_fontsize(8)
    panel_ax.set_title(panel_title, fontsize=12)

plt.suptitle('Pathway Classification by Metabolic Category', fontsize=14)
plt.tight_layout()
plt.savefig(FIG_DIR / 'fig2_aa_vs_carbon_proportions.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: figures/fig2_aa_vs_carbon_proportions.png")

# Figure 3: gene fitness in active-dependency vs latent-capability pathways.
#
# Attach each gene's pathway classification to its fitness record.
gene_class = pgf.merge(
    classifications[['orgId', 'gapmind_pathway', 'classification']],
    on=['orgId', 'gapmind_pathway'],
    how='left'
)

# The two classes compared in this figure, in display order, with their
# tick labels.
fitness_classes = ['active_dependency', 'latent_capability']
fitness_tick_labels = {
    'active_dependency': 'Active\nDependency',
    'latent_capability': 'Latent\nCapability',
}

# Filter to genes with fitness data in active/latent pathways
plot_data = gene_class[
    gene_class['classification'].isin(fitness_classes) &
    gene_class['mean_abs_fit'].notna()
].copy()

if len(plot_data) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Classes that actually occur, in the fixed display order. Both panels
    # take their category order and tick labels from this list, so the
    # labels always describe the data drawn at that position, even when the
    # data happens to list latent pathways first or only one class is present.
    observed = set(plot_data['classification'])
    present = [c for c in fitness_classes if c in observed]
    tick_labels = [fitness_tick_labels[c] for c in present]

    # Panel A: Violin plot of mean absolute fitness
    sns.violinplot(
        data=plot_data, x='classification', y='mean_abs_fit',
        order=present,
        palette={c: CLASS_COLORS[c] for c in present},
        ax=axes[0], cut=0
    )
    axes[0].set_xlabel('')
    axes[0].set_ylabel('Mean |Fitness Effect|')
    axes[0].set_title('Gene Fitness Importance')
    axes[0].set_xticks(range(len(present)))
    axes[0].set_xticklabels(tick_labels)

    # Panel B: Fraction of genes with significant fitness effects
    if 'n_sig_important' in plot_data.columns:
        # A gene counts as significant when n_sig_important is positive;
        # missing values are treated as zero.
        plot_data['has_sig_effect'] = plot_data['n_sig_important'].fillna(0) > 0
        sig_rates = (
            plot_data.groupby('classification')['has_sig_effect'].mean() * 100
        ).reindex(present)

        bars = axes[1].bar(
            range(len(sig_rates)),
            sig_rates.values,
            color=[CLASS_COLORS[c] for c in sig_rates.index],
            edgecolor='black', linewidth=0.5
        )
        axes[1].set_xticks(range(len(sig_rates)))
        axes[1].set_xticklabels(tick_labels)
        axes[1].set_ylabel('% Genes with Significant Fitness Effect')
        axes[1].set_title('Fitness Significance Rate')

        # Print the percentage just above each bar.
        for bar, val in zip(bars, sig_rates.values):
            axes[1].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                        f'{val:.1f}%', ha='center', fontsize=10)

    plt.tight_layout()
    plt.savefig(FIG_DIR / 'fig3_fitness_by_classification.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Saved: figures/fig3_fitness_by_classification.png")
else:
    print("No fitness data available for visualization")

# Figure 4: pangenome openness vs percentage of latent capabilities.
#
# Merge organism classifications with pangenome data (inner joins: organisms
# without a clade mapping or pangenome row are dropped).
fb_plot = org_summary.merge(
    org_mapping[['orgId', 'gtdb_species_clade_id']],
    on='orgId'
).merge(
    pangenome[['gtdb_species_clade_id', 'openness', 'no_genomes']],
    on='gtdb_species_clade_id'
)

# Drop rows lacking either plotted value BEFORE checking the size threshold,
# so the "enough data" decision reflects the points that can be drawn.
has_pct_latent = 'pct_latent' in fb_plot.columns
fb_valid = fb_plot.dropna(subset=['pct_latent', 'openness']) if has_pct_latent else fb_plot

if has_pct_latent and len(fb_valid) > 5:
    fb_plot = fb_valid

    # Convert the two plotted columns to float arrays once and reuse them.
    x = fb_plot['pct_latent'].astype(float).values
    y = fb_plot['openness'].astype(float).values

    fig, ax = plt.subplots(figsize=(8, 6))

    scatter = ax.scatter(
        x, y,
        s=80, c='steelblue', alpha=0.7, edgecolors='black', linewidth=0.5
    )

    # Label every point with its organism id, slightly above the marker.
    for org_id, x_val, y_val in zip(fb_plot['orgId'], x, y):
        ax.annotate(
            org_id, (x_val, y_val),
            fontsize=7, alpha=0.7, ha='center', va='bottom',
            xytext=(0, 4), textcoords='offset points'
        )

    # Trend line and rank correlation, computed on finite points only and
    # only when more than two of them are available.
    mask = np.isfinite(x) & np.isfinite(y)
    if mask.sum() > 2:
        z = np.polyfit(x[mask], y[mask], 1)
        p = np.poly1d(z)
        x_line = np.linspace(x[mask].min(), x[mask].max(), 100)
        ax.plot(x_line, p(x_line), 'r--', alpha=0.5, linewidth=2)

        rho, pval = stats.spearmanr(x[mask], y[mask])
        ax.text(0.05, 0.95, f'Spearman rho = {rho:.3f}\np = {pval:.4f}',
               transform=ax.transAxes, fontsize=10, verticalalignment='top',
               bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    ax.set_xlabel('% Latent Capabilities\n(complete pathways with fitness-neutral genes)', fontsize=11)
    ax.set_ylabel('Pangenome Openness\n(accessory / total gene clusters)', fontsize=11)
    ax.set_title('H2: Black Queen Hypothesis\nMore Latent Capabilities → More Open Pangenome?', fontsize=13)

    plt.tight_layout()
    plt.savefig(FIG_DIR / 'fig4_black_queen.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Saved: figures/fig4_black_queen.png")
else:
    print("Insufficient data for Black Queen figure")

# Figure 5: density of species in (pathway CV, pangenome openness) space.
# The figure is skipped unless the table has more than 100 rows.
if len(cross_species) <= 100:
    print("Insufficient cross-species data for figure")
else:
    fig, ax = plt.subplots(figsize=(8, 6))

    # Hexagonal binning; bins with no species are left blank (mincnt=1).
    hex_layer = ax.hexbin(
        cross_species['pathway_cv'],
        cross_species['openness'],
        gridsize=40,
        cmap='YlOrRd',
        mincnt=1,
        edgecolors='gray',
        linewidths=0.2,
    )
    plt.colorbar(hex_layer, ax=ax, label='Number of species')

    # Spearman correlation between the two plotted columns, shown in a box
    # together with the number of rows in the table.
    spearman_matrix = cross_species[['pathway_cv', 'openness']].corr(method='spearman')
    rho = spearman_matrix.iloc[0, 1]
    n_species = len(cross_species)
    ax.text(
        0.05, 0.95,
        f'Spearman rho = {rho:.3f}\nn = {n_species:,} species',
        transform=ax.transAxes,
        fontsize=10,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
    )

    ax.set_xlabel('Within-Species Pathway Completeness CV\n(metabolic heterogeneity)', fontsize=11)
    ax.set_ylabel('Pangenome Openness', fontsize=11)
    ax.set_title('H3: Metabolic Ecotypes\nPathway Heterogeneity vs Pangenome Openness (27K species)', fontsize=13)

    plt.tight_layout()
    plt.savefig(FIG_DIR / 'fig5_species_heterogeneity.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Saved: figures/fig5_species_heterogeneity.png")

# Console summary of the headline numbers.
rule = "=" * 60
print(rule)
print("SUMMARY STATISTICS FOR REPORT")
print(rule)

# Overall scale: distinct organisms, distinct pathways, and table rows.
n_orgs = classifications['orgId'].nunique()
n_pathways = classifications['gapmind_pathway'].nunique()
n_pairs = len(classifications)
print(f"\nScale: {n_orgs} organisms × {n_pathways} pathways = {n_pairs} pairs")

# Count and percentage of rows in each classification.
print("\nClassification breakdown:")
class_counts = classifications['classification'].value_counts()
for cls, count in class_counts.items():
    pct = count / n_pairs * 100
    print(f"  {cls:25s}: {count:5d} ({pct:.1f}%)")

# Complete pathways only: rows classified as active dependency or latent
# capability.
is_complete = classifications['classification'].isin(['active_dependency', 'latent_capability'])
complete = classifications[is_complete]
if len(complete) > 0:
    n_active = (complete['classification'] == 'active_dependency').sum()
    n_latent = (complete['classification'] == 'latent_capability').sum()
    n_complete = n_active + n_latent
    print("\nAmong complete pathways:")
    print(f"  Active dependencies: {n_active} ({n_active/n_complete*100:.1f}%)")
    print(f"  Latent capabilities: {n_latent} ({n_latent/n_complete*100:.1f}%)")

    # Active share within each metabolic category; empty categories are skipped.
    for cat in ('amino_acid', 'carbon'):
        cat_complete = complete[complete['metabolic_category'] == cat]
        if len(cat_complete) == 0:
            continue
        cat_active = (cat_complete['classification'] == 'active_dependency').sum()
        cat_total = len(cat_complete)
        print(f"  {cat}: {cat_active}/{cat_total} active ({cat_active/cat_total*100:.1f}%)")

print("\nCross-species analysis:")
print(f"  Species analyzed: {len(cross_species):,}")

print("\n" + rule)
print("All figures saved to figures/")
print("Next: Run /synthesize to create REPORT.md")
print(rule)

05 Summary Figures

NB05: Summary Figures

Figures

Figure 1: Overview Heatmap — Capability vs Dependency Landscape

Figure 2: Amino Acid vs Carbon Source Classification Proportions

Figure 3: Fitness Distributions by Pathway Classification

Figure 4: Black Queen Hypothesis — Openness vs Latent Capabilities

Figure 5: Species-Level Pathway Heterogeneity vs Openness

Summary Statistics for Report