03 Pathway Classification
Jupyter notebook from the Metabolic Capability vs Metabolic Dependency project.
NB03: Pathway Classification¶
Classify each organism-pathway pair as active dependency, latent capability, or absent based on GapMind completeness + fitness importance of pathway genes.
Runs locally — no Spark required.
Inputs (from NB01-02)¶
data/pathway_gene_fitness.csvdata/fb_gapmind_scores.csvdata/organism_mapping.csv
Outputs¶
data/pathway_classifications.csv— organism × pathway classification matrixfigures/pathway_classification_heatmap.pngfigures/classification_proportions.pngfigures/fitness_by_classification.png
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')
FIG_DIR.mkdir(exist_ok=True)
# Load data
pgf = pd.read_csv(DATA_DIR / 'pathway_gene_fitness.csv')
gapmind = pd.read_csv(DATA_DIR / 'fb_gapmind_scores.csv')
org_mapping = pd.read_csv(DATA_DIR / 'organism_mapping.csv')
print(f"Pathway-gene-fitness: {len(pgf):,} rows")
print(f"GapMind scores: {len(gapmind)} genome-pathway pairs")
print(f"Organisms: {org_mapping['orgId'].nunique()}")
1. Aggregate Fitness Importance per Organism-Pathway¶
For each organism-pathway pair, compute:
- Number of genes assigned to the pathway
- Number of fitness-important genes (|fit| > 1 in any condition)
- Number of essential genes (absent from genefitness)
- Fraction of genes that are fitness-important or essential
- Mean absolute fitness across pathway genes
In [ ]:
# Aggregate to organism-pathway level
pathway_agg = pgf.groupby(['orgId', 'gapmind_pathway']).agg(
n_genes=('gene_cluster_id', 'nunique'),
n_essential=('fitness_category', lambda x: (x == 'essential').sum()),
n_important=('fitness_category', lambda x: (x == 'important').sum()),
n_neutral=('fitness_category', lambda x: (x == 'neutral').sum()),
mean_abs_fit=('mean_abs_fit', 'mean'),
min_fit=('min_fit', 'min'),
n_sig_sick_total=('n_sig_sick', 'sum'),
pct_core=('is_core', 'mean')
).reset_index()
pathway_agg['n_fitness_active'] = pathway_agg['n_essential'] + pathway_agg['n_important']
pathway_agg['frac_fitness_active'] = pathway_agg['n_fitness_active'] / pathway_agg['n_genes']
print(f"Organism-pathway aggregates: {len(pathway_agg)}")
print(f"Organisms: {pathway_agg['orgId'].nunique()}")
print(f"Pathways: {pathway_agg['gapmind_pathway'].nunique()}")
pathway_agg.head()
2. Merge with GapMind Pathway Completeness¶
In [ ]:
# Get best GapMind score per organism-pathway
gapmind_summary = gapmind[['orgId', 'pathway', 'best_score_rank', 'best_score_category', 'metabolic_category']].copy()
gapmind_summary = gapmind_summary.rename(columns={'pathway': 'gapmind_pathway'})
# Merge pathway fitness aggregates with GapMind completeness
classification_df = gapmind_summary.merge(
pathway_agg,
on=['orgId', 'gapmind_pathway'],
how='left' # Keep all GapMind pathways, even those without gene assignments
)
# Fill NaN for pathways with no gene assignments
classification_df['n_genes'] = classification_df['n_genes'].fillna(0).astype(int)
classification_df['n_fitness_active'] = classification_df['n_fitness_active'].fillna(0).astype(int)
classification_df['frac_fitness_active'] = classification_df['frac_fitness_active'].fillna(0)
print(f"Classification table: {len(classification_df)} organism-pathway pairs")
print(f"With gene assignments: {(classification_df['n_genes'] > 0).sum()}")
print(f"Without gene assignments: {(classification_df['n_genes'] == 0).sum()}")
3. Classify Pathways¶
Classification scheme:
- Active dependency: Pathway is complete/likely_complete AND has ≥1 fitness-important or essential gene
- Latent capability: Pathway is complete/likely_complete AND has NO fitness-important genes
- Absent: Pathway is not_present or steps_missing_medium
- Partial: Pathway is steps_missing_low (borderline)
- Unmapped: Pathway is complete but we couldn't assign any genes to it via KEGG
In [ ]:
def classify_pathway(row):
"""Classify an organism-pathway pair."""
score = row['best_score_rank']
n_genes = row['n_genes']
n_active = row['n_fitness_active']
# Pathway not present in genome
if score <= 1:
return 'absent'
# Pathway only partially present
if score <= 3:
return 'partial'
# Pathway is complete or likely_complete (score >= 4)
if n_genes == 0:
return 'unmapped' # Complete but no genes mapped via KEGG
if n_active > 0:
return 'active_dependency'
else:
return 'latent_capability'
classification_df['classification'] = classification_df.apply(classify_pathway, axis=1)
print("Pathway classification results:")
print(classification_df['classification'].value_counts())
print(f"\nBy metabolic category:")
print(pd.crosstab(classification_df['metabolic_category'], classification_df['classification']))
In [ ]:
# Focus on classifiable pathways (exclude unmapped and partial)
classifiable = classification_df[classification_df['classification'].isin(
['active_dependency', 'latent_capability', 'absent']
)].copy()
print(f"Classifiable pathway-organism pairs: {len(classifiable)}")
print(f"\nClassification breakdown:")
print(classifiable['classification'].value_counts())
print(f"\nPer metabolic category:")
ct = pd.crosstab(
classifiable['metabolic_category'],
classifiable['classification'],
normalize='index'
) * 100
print(ct.round(1))
In [ ]:
# Test 1: Chi-square test — classification vs metabolic category
complete_pathways = classification_df[
classification_df['classification'].isin(['active_dependency', 'latent_capability'])
].copy()
if len(complete_pathways) > 0:
ct_test = pd.crosstab(
complete_pathways['metabolic_category'],
complete_pathways['classification']
)
chi2, p_val, dof, expected = stats.chi2_contingency(ct_test)
print("Test 1: Classification × Metabolic Category")
print(f" Chi-squared = {chi2:.2f}, p = {p_val:.2e}, df = {dof}")
print(f" Contingency table:")
print(ct_test)
# Proportions
aa = complete_pathways[complete_pathways['metabolic_category'] == 'amino_acid']
carbon = complete_pathways[complete_pathways['metabolic_category'] == 'carbon']
aa_active_pct = (aa['classification'] == 'active_dependency').mean() * 100
carbon_active_pct = (carbon['classification'] == 'active_dependency').mean() * 100
print(f"\n Amino acid pathways: {aa_active_pct:.1f}% active dependencies")
print(f" Carbon pathways: {carbon_active_pct:.1f}% active dependencies")
else:
print("No complete pathways to test")
In [ ]:
# Test 2: Mann-Whitney — fitness importance in active vs latent
active = complete_pathways[complete_pathways['classification'] == 'active_dependency']
latent = complete_pathways[complete_pathways['classification'] == 'latent_capability']
if len(active) > 0 and len(latent) > 0:
# Compare mean absolute fitness of pathway genes
u_stat, p_val = stats.mannwhitneyu(
active['mean_abs_fit'].dropna(),
latent['mean_abs_fit'].dropna(),
alternative='greater'
)
print("Test 2: Fitness Importance (Active vs Latent)")
print(f" Active mean|fit|: {active['mean_abs_fit'].median():.3f}")
print(f" Latent mean|fit|: {latent['mean_abs_fit'].median():.3f}")
print(f" Mann-Whitney U = {u_stat:.0f}, p = {p_val:.2e} (one-sided)")
In [ ]:
# Test 3: Conservation — are active dependency genes more conserved?
if 'pct_core' in complete_pathways.columns:
active_core = active['pct_core'].dropna()
latent_core = latent['pct_core'].dropna()
if len(active_core) > 0 and len(latent_core) > 0:
u_stat, p_val = stats.mannwhitneyu(
active_core, latent_core, alternative='greater'
)
print("Test 3: Conservation (% Core Genes)")
print(f" Active median % core: {active_core.median():.1f}%")
print(f" Latent median % core: {latent_core.median():.1f}%")
print(f" Mann-Whitney U = {u_stat:.0f}, p = {p_val:.2e} (one-sided)")
5. Visualizations¶
In [ ]:
# Figure 1: Heatmap — organisms × pathways colored by classification
class_colors = {
'active_dependency': 3,
'latent_capability': 2,
'partial': 1,
'absent': 0,
'unmapped': -1
}
pivot = classification_df.pivot_table(
index='orgId',
columns='gapmind_pathway',
values='classification',
aggfunc=lambda x: class_colors.get(x.iloc[0], -1) if len(x) > 0 else -1
)
# Sort organisms by total active dependencies
org_order = (pivot == 3).sum(axis=1).sort_values(ascending=False).index
# Sort pathways by category, then by frequency of active classification
pathway_cats = classification_df.drop_duplicates('gapmind_pathway')[['gapmind_pathway', 'metabolic_category']]
pathway_order = pathway_cats.sort_values('metabolic_category')['gapmind_pathway'].tolist()
pathway_order = [p for p in pathway_order if p in pivot.columns]
pivot_sorted = pivot.reindex(index=org_order, columns=pathway_order)
fig, ax = plt.subplots(figsize=(20, 10))
cmap = plt.cm.colors.ListedColormap(['#cccccc', '#fff3cd', '#ffc107', '#28a745', '#dc3545'])
bounds = [-1.5, -0.5, 0.5, 1.5, 2.5, 3.5]
norm = plt.cm.colors.BoundaryNorm(bounds, cmap.N)
im = ax.imshow(pivot_sorted.values, aspect='auto', cmap=cmap, norm=norm)
ax.set_yticks(range(len(pivot_sorted.index)))
ax.set_yticklabels(pivot_sorted.index, fontsize=7)
ax.set_xticks(range(len(pivot_sorted.columns)))
ax.set_xticklabels(pivot_sorted.columns, rotation=90, fontsize=6)
ax.set_title('Metabolic Pathway Classification: Capability vs Dependency', fontsize=14)
# Legend
from matplotlib.patches import Patch
legend_elements = [
Patch(facecolor='#dc3545', label='Active Dependency'),
Patch(facecolor='#28a745', label='Latent Capability'),
Patch(facecolor='#ffc107', label='Partial'),
Patch(facecolor='#fff3cd', label='Absent'),
Patch(facecolor='#cccccc', label='Unmapped'),
]
ax.legend(handles=legend_elements, loc='upper right', fontsize=8)
plt.tight_layout()
plt.savefig(FIG_DIR / 'pathway_classification_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: figures/pathway_classification_heatmap.png")
In [ ]:
# Figure 2: Stacked bar — amino acid vs carbon pathway proportions
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for idx, cat in enumerate(['amino_acid', 'carbon']):
cat_data = classification_df[classification_df['metabolic_category'] == cat]
props = cat_data.groupby('orgId')['classification'].value_counts(normalize=True).unstack(fill_value=0)
colors = {
'active_dependency': '#dc3545',
'latent_capability': '#28a745',
'partial': '#ffc107',
'absent': '#6c757d',
'unmapped': '#cccccc'
}
# Sort by active dependency fraction
if 'active_dependency' in props.columns:
props = props.sort_values('active_dependency', ascending=True)
bottom = np.zeros(len(props))
for cls in ['active_dependency', 'latent_capability', 'partial', 'absent', 'unmapped']:
if cls in props.columns:
axes[idx].barh(range(len(props)), props[cls], left=bottom,
color=colors[cls], label=cls.replace('_', ' ').title())
bottom += props[cls].values
axes[idx].set_yticks(range(len(props)))
axes[idx].set_yticklabels(props.index, fontsize=7)
axes[idx].set_xlabel('Proportion of Pathways')
axes[idx].set_title(f"{'Amino Acid' if cat == 'amino_acid' else 'Carbon Source'} Pathways")
if idx == 0:
axes[idx].legend(fontsize=7, loc='lower right')
plt.suptitle('Pathway Classifications by Metabolic Category', fontsize=14)
plt.tight_layout()
plt.savefig(FIG_DIR / 'classification_proportions.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: figures/classification_proportions.png")
In [ ]:
# Figure 3: Fitness distributions for genes in active vs latent pathways
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Panel A: Mean absolute fitness
if len(active) > 0 and len(latent) > 0:
for cls, data, color in [
('Active Dependency', active, '#dc3545'),
('Latent Capability', latent, '#28a745')
]:
vals = data['mean_abs_fit'].dropna()
axes[0].hist(vals, bins=30, alpha=0.6, color=color, label=cls, density=True)
axes[0].set_xlabel('Mean |Fitness| of Pathway Genes')
axes[0].set_ylabel('Density')
axes[0].set_title('Fitness Importance by Classification')
axes[0].legend(fontsize=9)
# Panel B: Fraction of fitness-active genes
for cls, data, color in [
('Active Dependency', active, '#dc3545'),
('Latent Capability', latent, '#28a745')
]:
vals = data['frac_fitness_active'].dropna()
axes[1].hist(vals, bins=20, alpha=0.6, color=color, label=cls, density=True)
axes[1].set_xlabel('Fraction Fitness-Active Genes per Pathway')
axes[1].set_ylabel('Density')
axes[1].set_title('Gene Fitness Composition')
axes[1].legend(fontsize=9)
plt.tight_layout()
plt.savefig(FIG_DIR / 'fitness_by_classification.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: figures/fitness_by_classification.png")
6. Per-Organism Summary¶
In [ ]:
# Per-organism classification summary
org_class = classification_df.groupby('orgId')['classification'].value_counts().unstack(fill_value=0)
# Add organism metadata
org_class = org_class.merge(
org_mapping[['orgId', 'genus', 'species', 'gtdb_species_clade_id']],
left_index=True, right_on='orgId'
).set_index('orgId')
# Compute summary metrics
if 'active_dependency' in org_class.columns and 'latent_capability' in org_class.columns:
org_class['total_complete'] = org_class.get('active_dependency', 0) + org_class.get('latent_capability', 0)
org_class['pct_active'] = (org_class.get('active_dependency', 0) /
org_class['total_complete'].replace(0, np.nan) * 100).round(1)
org_class['pct_latent'] = (org_class.get('latent_capability', 0) /
org_class['total_complete'].replace(0, np.nan) * 100).round(1)
print("Per-organism classification summary:")
display_cols = ['genus', 'species']
for col in ['active_dependency', 'latent_capability', 'absent', 'partial', 'total_complete', 'pct_active', 'pct_latent']:
if col in org_class.columns:
display_cols.append(col)
print(org_class[display_cols].sort_values('pct_latent', ascending=False).to_string())
7. Save Classifications¶
In [ ]:
# Save full classification table
classification_df.to_csv(DATA_DIR / 'pathway_classifications.csv', index=False)
print(f"Saved: pathway_classifications.csv ({len(classification_df)} rows)")
# Save per-organism summary
org_class.to_csv(DATA_DIR / 'organism_classification_summary.csv')
print(f"Saved: organism_classification_summary.csv ({len(org_class)} rows)")
print("\n" + "=" * 60)
print("NB03 SUMMARY")
print("=" * 60)
for cls in ['active_dependency', 'latent_capability', 'absent', 'partial', 'unmapped']:
n = (classification_df['classification'] == cls).sum()
pct = n / len(classification_df) * 100
print(f" {cls:25s}: {n:5d} ({pct:.1f}%)")
print("=" * 60)
print("\nNext: Run NB04 (cross-species patterns) — runs locally")