04 Functional
Jupyter notebook from the Temporal Core Genome Dynamics project.
Temporal Core Genome Dynamics - Functional Enrichment¶
Goal¶
Analyze what functional categories distinguish:
- Stable core genes - Present in ALL time windows
- Early-exiting genes - Leave core quickly as more genomes are added
- Transient core genes - Core in some windows but not others
Hypothesis¶
Early-exiting genes may be enriched in:
- Mobile genetic elements
- Niche-specific functions
- Environmental adaptation genes
Stable core genes should be enriched in:
- Essential housekeeping functions
- Translation machinery
- Core metabolism
In [ ]:
# Cell 1: Setup and load data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter
# Data path
DATA_PATH = "/home/psdehal/pangenome_science/data/temporal_core"
# Load exit point data
exit_df = pd.read_parquet(f"{DATA_PATH}/gene_exit_points.parquet")
# Load stable core gene lists
stable_cores = {}
for species in ['p_aeruginosa', 'a_baumannii']:
try:
with open(f"{DATA_PATH}/{species}_stable_core_genes.txt", 'r') as f:
stable_cores[species] = set(line.strip() for line in f)
print(f"{species}: {len(stable_cores[species])} stable core genes")
except FileNotFoundError:
print(f"{species}: Stable core file not found")
stable_cores[species] = set()
SPECIES = ['p_aeruginosa', 'a_baumannii']
In [ ]:
# Cell 2: Categorize genes by exit behavior
gene_categories = {}
for species in SPECIES:
species_exits = exit_df[exit_df['species'] == species].copy()
# Get max genome count
max_n = species_exits['exit_at_n_genomes'].max()
if pd.isna(max_n):
# If all are never_exits, estimate from data
max_n = 5000 # Approximate
# Categorize
stable = set(species_exits[species_exits['never_exits']]['gene_cluster_id'])
early = set(species_exits[species_exits['exit_at_n_genomes'] <= max_n * 0.25]['gene_cluster_id'])
mid = set(species_exits[
(species_exits['exit_at_n_genomes'] > max_n * 0.25) &
(species_exits['exit_at_n_genomes'] <= max_n * 0.75)
]['gene_cluster_id'])
late = set(species_exits[
(species_exits['exit_at_n_genomes'] > max_n * 0.75) &
(~species_exits['never_exits'])
]['gene_cluster_id'])
gene_categories[species] = {
'stable': stable,
'early_exit': early,
'mid_exit': mid,
'late_exit': late
}
print(f"\n{species}:")
for cat, genes in gene_categories[species].items():
print(f" {cat}: {len(genes)} genes")
Query Functional Annotations from BERDL¶
Get COG functional categories from eggnog_mapper_annotations table.
In [ ]:
# Cell 3: Query COG annotations for gene clusters
# COG functional categories
COG_CATEGORIES = {
'J': 'Translation, ribosomal structure',
'A': 'RNA processing and modification',
'K': 'Transcription',
'L': 'Replication, recombination, repair',
'B': 'Chromatin structure',
'D': 'Cell cycle, cell division',
'Y': 'Nuclear structure',
'V': 'Defense mechanisms',
'T': 'Signal transduction',
'M': 'Cell wall/membrane biogenesis',
'N': 'Cell motility',
'Z': 'Cytoskeleton',
'W': 'Extracellular structures',
'U': 'Intracellular trafficking',
'O': 'Posttranslational modification',
'X': 'Mobilome: prophages, transposons',
'C': 'Energy production',
'G': 'Carbohydrate metabolism',
'E': 'Amino acid metabolism',
'F': 'Nucleotide metabolism',
'H': 'Coenzyme metabolism',
'I': 'Lipid metabolism',
'P': 'Inorganic ion transport',
'Q': 'Secondary metabolites',
'R': 'General function prediction',
'S': 'Function unknown'
}
annotations = {}
for species in SPECIES:
print(f"\n=== Querying annotations for {species} ===")
# Get all gene cluster IDs we need annotations for
all_clusters = set()
for cat in gene_categories[species].values():
all_clusters.update(cat)
print(f" Total gene clusters to annotate: {len(all_clusters)}")
# Query in batches to avoid query size limits
batch_size = 1000
cluster_list = list(all_clusters)
all_annotations = []
for i in range(0, len(cluster_list), batch_size):
batch = cluster_list[i:i+batch_size]
# Query annotations - join gene_cluster to get representative gene
# Then get annotations for those genes
query = f"""
SELECT DISTINCT
gc.gene_cluster_id,
e.COG_category,
e.Description
FROM kbase_ke_pangenome.gene_cluster gc
JOIN kbase_ke_pangenome.gene_genecluster_junction ggj
ON gc.gene_cluster_id = ggj.gene_cluster_id
JOIN kbase_ke_pangenome.eggnog_mapper_annotations e
ON ggj.gene_id = e.gene_id
WHERE gc.gene_cluster_id IN ({','.join([f"'{c}'" for c in batch])})
"""
try:
batch_df = spark.sql(query).toPandas()
all_annotations.append(batch_df)
if (i // batch_size + 1) % 10 == 0:
print(f" Processed {i + len(batch)}/{len(cluster_list)} clusters")
except Exception as e:
print(f" Error in batch {i//batch_size}: {e}")
if all_annotations:
annotations[species] = pd.concat(all_annotations, ignore_index=True)
print(f" Total annotations retrieved: {len(annotations[species])}")
print(f" Clusters with annotations: {annotations[species]['gene_cluster_id'].nunique()}")
else:
annotations[species] = pd.DataFrame()
In [ ]:
# Cell 4: Parse COG categories (handle multi-category assignments)
def parse_cog_categories(cog_string):
"""
Parse COG category string (can be multiple letters like 'KL' or 'COG').
Returns list of individual category letters.
"""
if pd.isna(cog_string) or cog_string == '' or cog_string == '-':
return ['S'] # Unknown
# Extract single-letter categories
categories = [c for c in str(cog_string).upper() if c in COG_CATEGORIES]
return categories if categories else ['S']
def get_cog_distribution(gene_cluster_ids, annotations_df):
"""
Get COG category distribution for a set of gene clusters.
"""
# Filter to relevant clusters
relevant = annotations_df[annotations_df['gene_cluster_id'].isin(gene_cluster_ids)]
# Get unique cluster -> COG mapping (take first annotation per cluster)
cluster_cogs = relevant.groupby('gene_cluster_id')['COG_category'].first().reset_index()
# Parse and count categories
all_categories = []
for cog in cluster_cogs['COG_category']:
all_categories.extend(parse_cog_categories(cog))
# Count
counts = Counter(all_categories)
total = sum(counts.values())
return {cat: counts.get(cat, 0) / total if total > 0 else 0
for cat in COG_CATEGORIES.keys()}
# Test
test_species = SPECIES[0]
test_dist = get_cog_distribution(
gene_categories[test_species]['stable'],
annotations[test_species]
)
print(f"Test: Top 5 COG categories in {test_species} stable core:")
for cat, prop in sorted(test_dist.items(), key=lambda x: -x[1])[:5]:
print(f" {cat} ({COG_CATEGORIES[cat][:30]}): {prop:.1%}")
Compare COG Distributions¶
In [ ]:
# Cell 5: Calculate COG distributions for each gene category
cog_distributions = {}
for species in SPECIES:
cog_distributions[species] = {}
for category, gene_ids in gene_categories[species].items():
if len(gene_ids) > 0:
dist = get_cog_distribution(gene_ids, annotations[species])
cog_distributions[species][category] = dist
print(f"{species} - {category}: {len(gene_ids)} genes analyzed")
print("\nCOG distributions calculated for all categories.")
In [ ]:
# Cell 6: Compare stable vs early-exit genes
comparison_results = []
for species in SPECIES:
if 'stable' not in cog_distributions[species] or 'early_exit' not in cog_distributions[species]:
continue
stable_dist = cog_distributions[species]['stable']
early_dist = cog_distributions[species]['early_exit']
for cog in COG_CATEGORIES.keys():
stable_prop = stable_dist.get(cog, 0)
early_prop = early_dist.get(cog, 0)
# Calculate enrichment (early vs stable)
if stable_prop > 0:
enrichment = early_prop / stable_prop
else:
enrichment = float('inf') if early_prop > 0 else 1.0
comparison_results.append({
'species': species,
'cog_category': cog,
'cog_description': COG_CATEGORIES[cog],
'stable_prop': stable_prop,
'early_exit_prop': early_prop,
'enrichment_in_early': enrichment,
'diff': early_prop - stable_prop
})
comparison_df = pd.DataFrame(comparison_results)
# Show most enriched in early-exit
print("\n=== COG Categories ENRICHED in Early-Exiting Genes ===")
print("(Higher in early-exit vs stable core)\n")
for species in SPECIES:
species_data = comparison_df[comparison_df['species'] == species]
enriched = species_data.nlargest(5, 'diff')
print(f"{species}:")
for _, row in enriched.iterrows():
print(f" {row['cog_category']} ({row['cog_description'][:25]}): "
f"{row['stable_prop']:.1%} -> {row['early_exit_prop']:.1%} "
f"({row['diff']:+.1%})")
print()
In [ ]:
# Cell 7: Visualize COG distributions as heatmap
fig, axes = plt.subplots(1, 2, figsize=(16, 10))
for idx, species in enumerate(SPECIES):
ax = axes[idx]
# Build matrix
categories = ['stable', 'late_exit', 'mid_exit', 'early_exit']
cogs = list(COG_CATEGORIES.keys())
matrix = np.zeros((len(cogs), len(categories)))
for i, cog in enumerate(cogs):
for j, cat in enumerate(categories):
if cat in cog_distributions[species]:
matrix[i, j] = cog_distributions[species][cat].get(cog, 0)
# Plot heatmap
im = ax.imshow(matrix, cmap='YlOrRd', aspect='auto')
ax.set_xticks(range(len(categories)))
ax.set_xticklabels(categories, rotation=45, ha='right')
ax.set_yticks(range(len(cogs)))
ax.set_yticklabels([f"{c}: {COG_CATEGORIES[c][:20]}" for c in cogs], fontsize=8)
ax.set_title(f'{species}\nCOG Category Distribution')
plt.colorbar(im, ax=ax, label='Proportion')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/cog_distribution_heatmap.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: cog_distribution_heatmap.png")
In [ ]:
# Cell 8: Bar plot of enrichment/depletion
fig, axes = plt.subplots(1, 2, figsize=(14, 8))
for idx, species in enumerate(SPECIES):
ax = axes[idx]
species_data = comparison_df[comparison_df['species'] == species].copy()
species_data = species_data.sort_values('diff')
colors = ['#e74c3c' if d > 0 else '#3498db' for d in species_data['diff']]
y_pos = range(len(species_data))
ax.barh(y_pos, species_data['diff'] * 100, color=colors, alpha=0.8)
ax.set_yticks(y_pos)
ax.set_yticklabels([f"{row['cog_category']}: {row['cog_description'][:25]}"
for _, row in species_data.iterrows()], fontsize=8)
ax.set_xlabel('Difference (Early Exit - Stable) [percentage points]')
ax.set_title(f'{species}\nCOG Enrichment in Early-Exiting vs Stable Core')
ax.axvline(0, color='black', linewidth=0.5)
ax.grid(True, alpha=0.3, axis='x')
# Add legend
ax.annotate('Enriched in\nearly-exit', xy=(0.95, 0.95), xycoords='axes fraction',
ha='right', va='top', fontsize=10, color='#e74c3c')
ax.annotate('Enriched in\nstable core', xy=(0.05, 0.05), xycoords='axes fraction',
ha='left', va='bottom', fontsize=10, color='#3498db')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/cog_enrichment_barplot.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: cog_enrichment_barplot.png")
Key Findings¶
In [ ]:
# Cell 9: Summarize functional patterns
print("="*60)
print("FUNCTIONAL ANALYSIS - KEY FINDINGS")
print("="*60)
for species in SPECIES:
print(f"\n{'='*60}")
print(f"{species.upper()}")
print(f"{'='*60}")
species_data = comparison_df[comparison_df['species'] == species]
# Top enriched in stable core
print("\nFunctions ENRICHED in STABLE core (housekeeping):")
stable_enriched = species_data.nsmallest(5, 'diff')
for _, row in stable_enriched.iterrows():
if row['diff'] < -0.01: # At least 1% difference
print(f" {row['cog_category']}: {row['cog_description']}")
print(f" Stable: {row['stable_prop']:.1%}, Early-exit: {row['early_exit_prop']:.1%}")
# Top enriched in early-exit
print("\nFunctions ENRICHED in EARLY-EXIT genes (dynamic):")
early_enriched = species_data.nlargest(5, 'diff')
for _, row in early_enriched.iterrows():
if row['diff'] > 0.01: # At least 1% difference
print(f" {row['cog_category']}: {row['cog_description']}")
print(f" Stable: {row['stable_prop']:.1%}, Early-exit: {row['early_exit_prop']:.1%}")
print(f"\n{'='*60}")
print("INTERPRETATION")
print(f"{'='*60}")
print("""
The functional analysis reveals distinct signatures between stable and
transient core genes:
STABLE CORE genes are enriched in:
- Essential cellular processes (translation, replication)
- Central metabolism
- Cell structure maintenance
EARLY-EXITING genes are enriched in:
- Mobile elements (X category) - suggests HGT turnover
- Defense mechanisms (V) - CRISPR, restriction systems
- Secondary metabolism (Q) - niche-specific adaptations
- Unknown functions (S, R) - novel/lineage-specific genes
This pattern supports the hypothesis that core genome erosion over time
reflects the loss of accessory/mobile elements that were temporarily
widespread, rather than loss of essential functions.
""")
In [ ]:
# Cell 10: Save results
# Save COG comparison
comparison_df.to_csv(f"{DATA_PATH}/cog_comparison_results.csv", index=False)
print(f"Saved: cog_comparison_results.csv")
# Save COG distributions
cog_dist_records = []
for species in SPECIES:
for category, dist in cog_distributions[species].items():
for cog, prop in dist.items():
cog_dist_records.append({
'species': species,
'gene_category': category,
'cog_category': cog,
'proportion': prop
})
cog_dist_df = pd.DataFrame(cog_dist_records)
cog_dist_df.to_csv(f"{DATA_PATH}/cog_distributions.csv", index=False)
print(f"Saved: cog_distributions.csv")
print("\nAll functional analysis results saved!")
Summary¶
Files Created¶
cog_comparison_results.csv- Comparison of COG categories between stable and early-exit genescog_distributions.csv- Full COG distributions for each gene categorycog_distribution_heatmap.png- Heatmap of COG proportionscog_enrichment_barplot.png- Bar plot of enrichment/depletion
Key Conclusions¶
- Stable core genes are enriched in essential housekeeping functions
- Early-exiting genes are enriched in mobile elements and niche-specific functions
- Core erosion over time reflects turnover of accessory elements, not essential genes
- The "true" stable core represents irreducible essential functions