02 Pathway Analysis
Jupyter notebook from the Pangenome Openness, Metabolic Pathways, and Phylogenetic Distances project.
Phase 2: Pathway Correlation Analysis¶
Analyze how pangenome openness correlates with metabolic pathway completeness across species.
Approach:¶
- Aggregate pathway scores to species level
- Calculate pathway diversity and completeness metrics
- Correlate with pangenome openness
- Analyze pathway category distributions
In [ ]:
# Import libraries
# Note: Spark session is pre-initialized in JupyterHub kernel
# Do NOT use: from get_spark_session import get_spark_session
# See docs/pitfalls.md for details on JupyterHub environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
print("Libraries imported. Spark session is pre-initialized in the JupyterHub kernel.")
1. Load Species Openness Data¶
In [ ]:
# Load pre-computed openness scores
species_openness = pd.read_csv('../data/species_pangenome_openness.csv')
print(f"Loaded {len(species_openness)} species with openness scores")
print(species_openness.head())
2. Aggregate Pathway Scores to Species Level¶
In [ ]:
# Get pathway statistics aggregated by species
pathway_by_species = spark.sql("""
SELECT
g.gtdb_species_clade_id,
gp.metabolic_category,
gp.pathway,
AVG(gp.score) as mean_pathway_score,
COUNT(DISTINCT g.genome_id) as genomes_with_pathway,
SUM(CASE WHEN gp.score_simplified = 'High' THEN 1 ELSE 0 END) as high_score_genomes,
SUM(CASE WHEN gp.score_simplified = 'Medium' THEN 1 ELSE 0 END) as medium_score_genomes,
SUM(CASE WHEN gp.score_simplified = 'Low' THEN 1 ELSE 0 END) as low_score_genomes,
SUM(CASE WHEN gp.score_simplified = 'Unknown' THEN 1 ELSE 0 END) as unknown_score_genomes
FROM kbase_ke_pangenome.genome g
JOIN kbase_ke_pangenome.gapmind_pathways gp
ON g.genome_id = gp.genome_id
GROUP BY
g.gtdb_species_clade_id,
gp.metabolic_category,
gp.pathway
""").toPandas()
print(f"Loaded pathway data for {len(pathway_by_species)} pathway-species combinations")
print(pathway_by_species.head())
3. Calculate Species-Level Pathway Metrics¶
In [ ]:
# Aggregate to species level
species_pathway_profile = pathway_by_species.groupby('gtdb_species_clade_id').agg({
'pathway': 'count',
'mean_pathway_score': 'mean',
'metabolic_category': lambda x: x.nunique(),
'high_score_genomes': 'sum',
'genomes_with_pathway': 'mean'
}).rename(columns={
'pathway': 'total_pathways',
'mean_pathway_score': 'avg_pathway_score',
'metabolic_category': 'unique_categories',
'genomes_with_pathway': 'avg_genomes_per_pathway'
})
# Calculate pathway diversity
species_pathway_profile['pathway_diversity'] = (
species_pathway_profile['unique_categories'] /
species_pathway_profile['total_pathways']
)
species_pathway_profile.reset_index(inplace=True)
print(f"Species pathway profile shape: {species_pathway_profile.shape}")
print(species_pathway_profile.head())
4. Merge with Openness Data¶
In [ ]:
# Merge openness with pathway metrics
merged_data = species_openness[['gtdb_species_clade_id', 'GTDB_species', 'no_genomes', 'openness_score', 'closedness_score', 'core_fraction']].merge(
species_pathway_profile,
on='gtdb_species_clade_id',
how='inner'
)
print(f"Merged data for {len(merged_data)} species")
print(merged_data.head())
print("\nData summary:")
print(merged_data[['openness_score', 'avg_pathway_score', 'pathway_diversity', 'total_pathways']].describe())
5. Correlation Analysis¶
In [ ]:
# Calculate correlations
correlations = {}
metrics = ['avg_pathway_score', 'pathway_diversity', 'total_pathways', 'unique_categories']
for metric in metrics:
# Remove NaN values for correlation
valid = merged_data[['openness_score', metric]].dropna()
if len(valid) > 2:
pearson_r, pearson_p = pearsonr(valid['openness_score'], valid[metric])
spearman_r, spearman_p = spearmanr(valid['openness_score'], valid[metric])
correlations[metric] = {
'pearson_r': pearson_r,
'pearson_p': pearson_p,
'spearman_r': spearman_r,
'spearman_p': spearman_p,
'n_species': len(valid)
}
# Display results
print("\n=== Correlation of Openness with Pathway Metrics ===")
for metric, corr in correlations.items():
print(f"\n{metric}:")
print(f" Pearson r={corr['pearson_r']:.3f} (p={corr['pearson_p']:.2e})")
print(f" Spearman r={corr['spearman_r']:.3f} (p={corr['spearman_p']:.2e})")
print(f" n={corr['n_species']}")
6. Visualize Correlations¶
In [ ]:
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
# Plot each correlation
metrics_to_plot = [
('avg_pathway_score', 'Average Pathway Score'),
('pathway_diversity', 'Pathway Diversity'),
('total_pathways', 'Total Pathways'),
('unique_categories', 'Unique Categories')
]
for idx, (metric, label) in enumerate(metrics_to_plot):
ax = axes[idx // 2, idx % 2]
valid = merged_data[['openness_score', metric]].dropna()
ax.scatter(valid['openness_score'], valid[metric], alpha=0.6, s=50)
# Add trend line
z = np.polyfit(valid['openness_score'], valid[metric], 1)
p = np.poly1d(z)
x_trend = np.linspace(valid['openness_score'].min(), valid['openness_score'].max(), 100)
ax.plot(x_trend, p(x_trend), "r--", alpha=0.8, linewidth=2)
# Labels
if metric in correlations:
corr = correlations[metric]
ax.set_title(f'{label}\nPearson r={corr["pearson_r"]:.3f} (p={corr["pearson_p"]:.2e})')
else:
ax.set_title(label)
ax.set_xlabel('Pangenome Openness Score')
ax.set_ylabel(label)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('../figures/02_pathway_openness_correlations.png', dpi=300, bbox_inches='tight')
print("Saved: 02_pathway_openness_correlations.png")
plt.show()
7. Pathway Categories Analysis¶
In [ ]:
# Analyze pathway categories by openness
category_stats = spark.sql("""
SELECT
gp.metabolic_category,
COUNT(DISTINCT gp.pathway) as unique_pathways,
AVG(gp.score) as mean_score,
COUNT(DISTINCT gp.genome_id) as genomes_with_category
FROM kbase_ke_pangenome.gapmind_pathways gp
GROUP BY gp.metabolic_category
ORDER BY unique_pathways DESC
""").toPandas()
print("\nPathway Categories Summary:")
print(category_stats.to_string())
8. Export Results¶
In [ ]:
# Save merged analysis dataset
merged_data.to_csv(
'../data/species_pathway_openness_merged.csv',
index=False
)
print("Saved: species_pathway_openness_merged.csv")
# Save correlation results
corr_df = pd.DataFrame(correlations).T
corr_df.to_csv('../data/pathway_openness_correlations.csv')
print("Saved: pathway_openness_correlations.csv")
# Save category stats
category_stats.to_csv(
'../data/pathway_categories_summary.csv',
index=False
)
print("Saved: pathway_categories_summary.csv")
Summary¶
This analysis explores the relationship between pangenome openness and metabolic pathway characteristics.
Key outputs:
- Correlation of openness with pathway completeness and diversity
- Pathway category distributions across species
- Target species for deeper structural distance analysis
Next steps: Move to Phase 3 (03_structural_distances.ipynb) for AlphaEarth embedding analysis