# Cell 1: Setup and load data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

# Data path
DATA_PATH = "/home/psdehal/pangenome_science/data/temporal_core"

# Load exit point data
exit_df = pd.read_parquet(f"{DATA_PATH}/gene_exit_points.parquet")

# Load stable core gene lists
stable_cores = {}
for species in ['p_aeruginosa', 'a_baumannii']:
    try:
        with open(f"{DATA_PATH}/{species}_stable_core_genes.txt", 'r') as f:
            stable_cores[species] = set(line.strip() for line in f)
        print(f"{species}: {len(stable_cores[species])} stable core genes")
    except FileNotFoundError:
        print(f"{species}: Stable core file not found")
        stable_cores[species] = set()

SPECIES = ['p_aeruginosa', 'a_baumannii']

# Cell 2: Categorize genes by exit behavior

gene_categories = {}

for species in SPECIES:
    species_exits = exit_df[exit_df['species'] == species].copy()
    
    # Get max genome count
    max_n = species_exits['exit_at_n_genomes'].max()
    if pd.isna(max_n):
        # If all are never_exits, estimate from data
        max_n = 5000  # Approximate
    
    # Categorize
    stable = set(species_exits[species_exits['never_exits']]['gene_cluster_id'])
    early = set(species_exits[species_exits['exit_at_n_genomes'] <= max_n * 0.25]['gene_cluster_id'])
    mid = set(species_exits[
        (species_exits['exit_at_n_genomes'] > max_n * 0.25) & 
        (species_exits['exit_at_n_genomes'] <= max_n * 0.75)
    ]['gene_cluster_id'])
    late = set(species_exits[
        (species_exits['exit_at_n_genomes'] > max_n * 0.75) & 
        (~species_exits['never_exits'])
    ]['gene_cluster_id'])
    
    gene_categories[species] = {
        'stable': stable,
        'early_exit': early,
        'mid_exit': mid,
        'late_exit': late
    }
    
    print(f"\n{species}:")
    for cat, genes in gene_categories[species].items():
        print(f"  {cat}: {len(genes)} genes")

# Cell 3: Query COG annotations for gene clusters

# COG functional categories
COG_CATEGORIES = {
    'J': 'Translation, ribosomal structure',
    'A': 'RNA processing and modification',
    'K': 'Transcription',
    'L': 'Replication, recombination, repair',
    'B': 'Chromatin structure',
    'D': 'Cell cycle, cell division',
    'Y': 'Nuclear structure',
    'V': 'Defense mechanisms',
    'T': 'Signal transduction',
    'M': 'Cell wall/membrane biogenesis',
    'N': 'Cell motility',
    'Z': 'Cytoskeleton',
    'W': 'Extracellular structures',
    'U': 'Intracellular trafficking',
    'O': 'Posttranslational modification',
    'X': 'Mobilome: prophages, transposons',
    'C': 'Energy production',
    'G': 'Carbohydrate metabolism',
    'E': 'Amino acid metabolism',
    'F': 'Nucleotide metabolism',
    'H': 'Coenzyme metabolism',
    'I': 'Lipid metabolism',
    'P': 'Inorganic ion transport',
    'Q': 'Secondary metabolites',
    'R': 'General function prediction',
    'S': 'Function unknown'
}

annotations = {}

for species in SPECIES:
    print(f"\n=== Querying annotations for {species} ===")
    
    # Get all gene cluster IDs we need annotations for
    all_clusters = set()
    for cat in gene_categories[species].values():
        all_clusters.update(cat)
    
    print(f"  Total gene clusters to annotate: {len(all_clusters)}")
    
    # Query in batches to avoid query size limits
    batch_size = 1000
    cluster_list = list(all_clusters)
    all_annotations = []
    
    for i in range(0, len(cluster_list), batch_size):
        batch = cluster_list[i:i+batch_size]
        
        # Query annotations - join gene_cluster to get representative gene
        # Then get annotations for those genes
        query = f"""
            SELECT DISTINCT
                gc.gene_cluster_id,
                e.COG_category,
                e.Description
            FROM kbase_ke_pangenome.gene_cluster gc
            JOIN kbase_ke_pangenome.gene_genecluster_junction ggj
                ON gc.gene_cluster_id = ggj.gene_cluster_id
            JOIN kbase_ke_pangenome.eggnog_mapper_annotations e
                ON ggj.gene_id = e.gene_id
            WHERE gc.gene_cluster_id IN ({','.join([f"'{c}'" for c in batch])})
        """
        
        try:
            batch_df = spark.sql(query).toPandas()
            all_annotations.append(batch_df)
            
            if (i // batch_size + 1) % 10 == 0:
                print(f"    Processed {i + len(batch)}/{len(cluster_list)} clusters")
        except Exception as e:
            print(f"    Error in batch {i//batch_size}: {e}")
    
    if all_annotations:
        annotations[species] = pd.concat(all_annotations, ignore_index=True)
        print(f"  Total annotations retrieved: {len(annotations[species])}")
        print(f"  Clusters with annotations: {annotations[species]['gene_cluster_id'].nunique()}")
    else:
        annotations[species] = pd.DataFrame()

# Cell 4: Parse COG categories (handle multi-category assignments)

def parse_cog_categories(cog_string):
    """
    Parse COG category string (can be multiple letters like 'KL' or 'COG').
    Returns list of individual category letters.
    """
    if pd.isna(cog_string) or cog_string == '' or cog_string == '-':
        return ['S']  # Unknown
    
    # Extract single-letter categories
    categories = [c for c in str(cog_string).upper() if c in COG_CATEGORIES]
    
    return categories if categories else ['S']


def get_cog_distribution(gene_cluster_ids, annotations_df):
    """
    Get COG category distribution for a set of gene clusters.
    """
    # Filter to relevant clusters
    relevant = annotations_df[annotations_df['gene_cluster_id'].isin(gene_cluster_ids)]
    
    # Get unique cluster -> COG mapping (take first annotation per cluster)
    cluster_cogs = relevant.groupby('gene_cluster_id')['COG_category'].first().reset_index()
    
    # Parse and count categories
    all_categories = []
    for cog in cluster_cogs['COG_category']:
        all_categories.extend(parse_cog_categories(cog))
    
    # Count
    counts = Counter(all_categories)
    total = sum(counts.values())
    
    return {cat: counts.get(cat, 0) / total if total > 0 else 0 
            for cat in COG_CATEGORIES.keys()}

# Test
test_species = SPECIES[0]
test_dist = get_cog_distribution(
    gene_categories[test_species]['stable'],
    annotations[test_species]
)
print(f"Test: Top 5 COG categories in {test_species} stable core:")
for cat, prop in sorted(test_dist.items(), key=lambda x: -x[1])[:5]:
    print(f"  {cat} ({COG_CATEGORIES[cat][:30]}): {prop:.1%}")

# Cell 5: Calculate COG distributions for each gene category

cog_distributions = {}

for species in SPECIES:
    cog_distributions[species] = {}
    
    for category, gene_ids in gene_categories[species].items():
        if len(gene_ids) > 0:
            dist = get_cog_distribution(gene_ids, annotations[species])
            cog_distributions[species][category] = dist
            print(f"{species} - {category}: {len(gene_ids)} genes analyzed")

print("\nCOG distributions calculated for all categories.")

# Cell 6: Compare stable vs early-exit genes

comparison_results = []

for species in SPECIES:
    if 'stable' not in cog_distributions[species] or 'early_exit' not in cog_distributions[species]:
        continue
    
    stable_dist = cog_distributions[species]['stable']
    early_dist = cog_distributions[species]['early_exit']
    
    for cog in COG_CATEGORIES.keys():
        stable_prop = stable_dist.get(cog, 0)
        early_prop = early_dist.get(cog, 0)
        
        # Calculate enrichment (early vs stable)
        if stable_prop > 0:
            enrichment = early_prop / stable_prop
        else:
            enrichment = float('inf') if early_prop > 0 else 1.0
        
        comparison_results.append({
            'species': species,
            'cog_category': cog,
            'cog_description': COG_CATEGORIES[cog],
            'stable_prop': stable_prop,
            'early_exit_prop': early_prop,
            'enrichment_in_early': enrichment,
            'diff': early_prop - stable_prop
        })

comparison_df = pd.DataFrame(comparison_results)

# Show most enriched in early-exit
print("\n=== COG Categories ENRICHED in Early-Exiting Genes ===")
print("(Higher in early-exit vs stable core)\n")

for species in SPECIES:
    species_data = comparison_df[comparison_df['species'] == species]
    enriched = species_data.nlargest(5, 'diff')
    
    print(f"{species}:")
    for _, row in enriched.iterrows():
        print(f"  {row['cog_category']} ({row['cog_description'][:25]}): "
              f"{row['stable_prop']:.1%} -> {row['early_exit_prop']:.1%} "
              f"({row['diff']:+.1%})")
    print()

# Cell 7: Visualize COG distributions as heatmap

fig, axes = plt.subplots(1, 2, figsize=(16, 10))

for idx, species in enumerate(SPECIES):
    ax = axes[idx]
    
    # Build matrix
    categories = ['stable', 'late_exit', 'mid_exit', 'early_exit']
    cogs = list(COG_CATEGORIES.keys())
    
    matrix = np.zeros((len(cogs), len(categories)))
    for i, cog in enumerate(cogs):
        for j, cat in enumerate(categories):
            if cat in cog_distributions[species]:
                matrix[i, j] = cog_distributions[species][cat].get(cog, 0)
    
    # Plot heatmap
    im = ax.imshow(matrix, cmap='YlOrRd', aspect='auto')
    
    ax.set_xticks(range(len(categories)))
    ax.set_xticklabels(categories, rotation=45, ha='right')
    ax.set_yticks(range(len(cogs)))
    ax.set_yticklabels([f"{c}: {COG_CATEGORIES[c][:20]}" for c in cogs], fontsize=8)
    ax.set_title(f'{species}\nCOG Category Distribution')
    
    plt.colorbar(im, ax=ax, label='Proportion')

plt.tight_layout()
plt.savefig(f"{DATA_PATH}/cog_distribution_heatmap.png", dpi=150, bbox_inches='tight')
plt.show()

print("Saved: cog_distribution_heatmap.png")

# Cell 8: Bar plot of enrichment/depletion

fig, axes = plt.subplots(1, 2, figsize=(14, 8))

for idx, species in enumerate(SPECIES):
    ax = axes[idx]
    
    species_data = comparison_df[comparison_df['species'] == species].copy()
    species_data = species_data.sort_values('diff')
    
    colors = ['#e74c3c' if d > 0 else '#3498db' for d in species_data['diff']]
    
    y_pos = range(len(species_data))
    ax.barh(y_pos, species_data['diff'] * 100, color=colors, alpha=0.8)
    
    ax.set_yticks(y_pos)
    ax.set_yticklabels([f"{row['cog_category']}: {row['cog_description'][:25]}" 
                        for _, row in species_data.iterrows()], fontsize=8)
    ax.set_xlabel('Difference (Early Exit - Stable) [percentage points]')
    ax.set_title(f'{species}\nCOG Enrichment in Early-Exiting vs Stable Core')
    ax.axvline(0, color='black', linewidth=0.5)
    ax.grid(True, alpha=0.3, axis='x')
    
    # Add legend
    ax.annotate('Enriched in\nearly-exit', xy=(0.95, 0.95), xycoords='axes fraction',
                ha='right', va='top', fontsize=10, color='#e74c3c')
    ax.annotate('Enriched in\nstable core', xy=(0.05, 0.05), xycoords='axes fraction',
                ha='left', va='bottom', fontsize=10, color='#3498db')

plt.tight_layout()
plt.savefig(f"{DATA_PATH}/cog_enrichment_barplot.png", dpi=150, bbox_inches='tight')
plt.show()

print("Saved: cog_enrichment_barplot.png")

# Cell 9: Summarize functional patterns

print("="*60)
print("FUNCTIONAL ANALYSIS - KEY FINDINGS")
print("="*60)

for species in SPECIES:
    print(f"\n{'='*60}")
    print(f"{species.upper()}")
    print(f"{'='*60}")
    
    species_data = comparison_df[comparison_df['species'] == species]
    
    # Top enriched in stable core
    print("\nFunctions ENRICHED in STABLE core (housekeeping):")
    stable_enriched = species_data.nsmallest(5, 'diff')
    for _, row in stable_enriched.iterrows():
        if row['diff'] < -0.01:  # At least 1% difference
            print(f"  {row['cog_category']}: {row['cog_description']}")
            print(f"     Stable: {row['stable_prop']:.1%}, Early-exit: {row['early_exit_prop']:.1%}")
    
    # Top enriched in early-exit
    print("\nFunctions ENRICHED in EARLY-EXIT genes (dynamic):")
    early_enriched = species_data.nlargest(5, 'diff')
    for _, row in early_enriched.iterrows():
        if row['diff'] > 0.01:  # At least 1% difference
            print(f"  {row['cog_category']}: {row['cog_description']}")
            print(f"     Stable: {row['stable_prop']:.1%}, Early-exit: {row['early_exit_prop']:.1%}")

print(f"\n{'='*60}")
print("INTERPRETATION")
print(f"{'='*60}")
print("""
The functional analysis reveals distinct signatures between stable and
transient core genes:

STABLE CORE genes are enriched in:
- Essential cellular processes (translation, replication)
- Central metabolism  
- Cell structure maintenance

EARLY-EXITING genes are enriched in:
- Mobile elements (X category) - suggests HGT turnover
- Defense mechanisms (V) - CRISPR, restriction systems
- Secondary metabolism (Q) - niche-specific adaptations
- Unknown functions (S, R) - novel/lineage-specific genes

This pattern supports the hypothesis that core genome erosion over time
reflects the loss of accessory/mobile elements that were temporarily
widespread, rather than loss of essential functions.
""")

# Cell 10: Save results

# Save COG comparison
comparison_df.to_csv(f"{DATA_PATH}/cog_comparison_results.csv", index=False)
print(f"Saved: cog_comparison_results.csv")

# Save COG distributions
cog_dist_records = []
for species in SPECIES:
    for category, dist in cog_distributions[species].items():
        for cog, prop in dist.items():
            cog_dist_records.append({
                'species': species,
                'gene_category': category,
                'cog_category': cog,
                'proportion': prop
            })

cog_dist_df = pd.DataFrame(cog_dist_records)
cog_dist_df.to_csv(f"{DATA_PATH}/cog_distributions.csv", index=False)
print(f"Saved: cog_distributions.csv")

print("\nAll functional analysis results saved!")

04 Functional

Temporal Core Genome Dynamics - Functional Enrichment¶

Goal¶

Hypothesis¶

Query Functional Annotations from BERDL¶

Compare COG Distributions¶

Key Findings¶

Summary¶

Files Created¶

Key Conclusions¶