01 Prophage Gene Discovery
Jupyter notebook from the Prophage Gene Modules and Terminase-Defined Lineages Across Bacterial Phylogeny and Environmental Gradients project.
NB01: Prophage Gene Discovery¶
Project: Prophage Ecology Across Bacterial Phylogeny and Environmental Gradients
Goal: Identify all prophage-associated gene clusters in the BERDL pangenome using eggNOG annotations, classify into 7 operationally defined modules (A-G), and extract terminase large subunit (TerL) sequences for lineage clustering.
Environment: Requires BERDL JupyterHub (Spark SQL)
Outputs:
data/prophage_gene_clusters.tsv — all prophage-associated gene clusters with module assignments
data/terL_sequences.fasta — TerL protein sequences for lineage clustering
data/species_module_summary.tsv — per-species prophage module presence/absence
import sys
import os
import pandas as pd
import numpy as np
# Spark session (on BERDL JupyterHub — no import needed)
spark = get_spark_session()
# Add project src to path for prophage_utils
sys.path.insert(0, '../src')
from prophage_utils import MODULES, classify_gene_to_module, is_terL, build_spark_where_clause
# Output directories
os.makedirs('../data', exist_ok=True)
os.makedirs('../figures', exist_ok=True)
print(f'Spark session active. Modules defined: {list(MODULES.keys())}')
Spark session active. Modules defined: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
1. Explore Prophage-Related Annotations¶
Before running the full extraction, let's check what prophage-related annotations exist in the eggNOG table and estimate hit counts.
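One caveat with substring `LIKE` patterns: they can over-match. As the annotation samples later in this notebook show, `'%holin%'` also picks up choline-metabolism enzymes. A minimal standalone sketch of the problem and a word-boundary alternative (regexes of this form are also usable in Spark SQL via `RLIKE`; the example descriptions are illustrative):

```python
import re

# Substring matching (like SQL LIKE '%holin%') over-matches:
# 'choline dehydrogenase' contains 'holin' even though it is not a phage holin.
descriptions = [
    "phage holin family protein",
    "choline dehydrogenase activity",
]

substring_hits = [d for d in descriptions if "holin" in d.lower()]

# A word-boundary regex keeps only whole-word hits.
word_re = re.compile(r"\bholin\b", re.IGNORECASE)
word_hits = [d for d in descriptions if word_re.search(d)]

print(substring_hits)  # both descriptions match the bare substring
print(word_hits)       # only the true holin annotation survives
```

The module classifier downstream compensates for some of this, but it is worth keeping in mind when interpreting raw keyword counts.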
# Check counts for key prophage-related description keywords
keywords = [
'terminase', 'capsid', 'portal protein', 'holin', 'endolysin',
'integrase', 'tail sheath', 'tail tube', 'tape measure', 'baseplate',
'excisionase', 'lysozyme', 'repressor', 'anti-crispr', 'anti-restriction',
'phage', 'prophage', 'tail fiber', 'tail spike'
]
keyword_counts = []
for kw in keywords:
    count_df = spark.sql(f"""
        SELECT COUNT(*) as n
        FROM kbase_ke_pangenome.eggnog_mapper_annotations
        WHERE LOWER(Description) LIKE '%{kw}%'
    """).toPandas()
    keyword_counts.append({'keyword': kw, 'count': int(count_df['n'].iloc[0])})
    print(f' {kw}: {count_df["n"].iloc[0]:,}')
kw_df = pd.DataFrame(keyword_counts).sort_values('count', ascending=False)
print(f'\nTotal unique keywords checked: {len(keywords)}')
print(kw_df.to_string(index=False))
terminase: 104,064
capsid: 46,221
portal protein: 43,503
holin: 88,790
endolysin: 1,601
integrase: 554,830
tail sheath: 20,537
tail tube: 23,691
tape measure: 24,194
baseplate: 30,605
excisionase: 21,990
lysozyme: 37,329
repressor: 294,090
anti-crispr: 0
anti-restriction: 327
phage: 958,269
prophage: 40,646
tail fiber: 3,569
tail spike: 0
Total unique keywords checked: 19
keyword count
phage 958269
integrase 554830
repressor 294090
terminase 104064
holin 88790
capsid 46221
portal protein 43503
prophage 40646
lysozyme 37329
baseplate 30605
tape measure 24194
tail tube 23691
excisionase 21990
tail sheath 20537
tail fiber 3569
endolysin 1601
anti-restriction 327
anti-crispr 0
tail spike 0
# Check PFam domain counts for phage-related domains
pfam_keywords = [
'Terminase', 'Phage_portal', 'HK97', 'Phage_cap',
'Phage_holin', 'Phage_integrase', 'Phage_tail',
'Phage_sheath', 'Phage_fiber', 'Phage_lysozyme',
'XRE_N', 'ArdA', 'Phage_int_SAM'
]
pfam_counts = []
for pf in pfam_keywords:
    count_df = spark.sql(f"""
        SELECT COUNT(*) as n
        FROM kbase_ke_pangenome.eggnog_mapper_annotations
        WHERE PFAMs LIKE '%{pf}%'
    """).toPandas()
    pfam_counts.append({'pfam_keyword': pf, 'count': int(count_df['n'].iloc[0])})
    print(f' {pf}: {count_df["n"].iloc[0]:,}')
pf_df = pd.DataFrame(pfam_counts).sort_values('count', ascending=False)
print(f'\nPFam keyword counts:')
print(pf_df.to_string(index=False))
Terminase: 82,562
Phage_portal: 31,929
HK97: 16,975
Phage_cap: 35,973
Phage_holin: 46,122
Phage_integrase: 475,222
Phage_tail: 41,718
Phage_sheath: 15,762
Phage_fiber: 9,854
Phage_lysozyme: 13,715
XRE_N: 0
ArdA: 10,470
Phage_int_SAM: 309,188
PFam keyword counts:
pfam_keyword count
Phage_integrase 475222
Phage_int_SAM 309188
Terminase 82562
Phage_holin 46122
Phage_tail 41718
Phage_cap 35973
Phage_portal 31929
HK97 16975
Phage_sheath 15762
Phage_lysozyme 13715
ArdA 10470
Phage_fiber 9854
XRE_N 0
# Sample some annotations for each key prophage marker to understand annotation patterns
for marker in ['terminase large subunit', 'major capsid protein', 'holin', 'phage integrase', 'portal protein']:
    print(f'\n--- {marker.upper()} ---')
    sample = spark.sql(f"""
        SELECT PFAMs, Description, KEGG_ko, COG_category
        FROM kbase_ke_pangenome.eggnog_mapper_annotations
        WHERE LOWER(Description) LIKE '%{marker}%'
        LIMIT 10
    """).toPandas()
    for _, row in sample.iterrows():
        print(f' PFAMs={row["PFAMs"]} Desc={row["Description"][:80]} KO={row["KEGG_ko"]} COG={row["COG_category"]}')
--- TERMINASE LARGE SUBUNIT ---
 PFAMs=Terminase_3,Terminase_3C Desc=Phage terminase large subunit KO=ko:K06909 COG=L
 PFAMs=Terminase_3,Terminase_3C Desc=Phage terminase large subunit KO=ko:K06909 COG=L
 PFAMs=Terminase_3 Desc=Phage terminase large subunit KO=ko:K06909 COG=S
 PFAMs=Terminase_3,Terminase_3C Desc=Phage terminase large subunit KO=- COG=S
 PFAMs=Terminase_GpA Desc=Phage terminase large subunit (GpA) KO=- COG=S
 PFAMs=Terminase_3,Terminase_6,Terminase_6C Desc=phage Terminase large subunit KO=- COG=S
 PFAMs=Terminase_3 Desc=Phage terminase large subunit KO=ko:K06909 COG=S
 PFAMs=Terminase_GpA Desc=Phage terminase large subunit (GpA) KO=- COG=S
 PFAMs=Terminase_GpA Desc=Phage terminase large subunit (GpA) KO=ko:K21512 COG=S
 PFAMs=Terminase_GpA Desc=Phage terminase large subunit (GpA) KO=- COG=S

--- MAJOR CAPSID PROTEIN ---
 PFAMs=Phage_cap_P2 Desc=Major capsid protein KO=- COG=S
 PFAMs=Phage_capsid Desc=phage major capsid protein, HK97 family KO=- COG=S
 PFAMs=Phage_capsid Desc=phage major capsid protein, HK97 family KO=- COG=S
 PFAMs=Phage_capsid Desc=phage major capsid protein, HK97 family KO=- COG=S
 PFAMs=Phage_capsid Desc=Phage major capsid protein KO=- COG=S
 PFAMs=Phage_capsid Desc=phage major capsid protein, HK97 family KO=- COG=S
 PFAMs=Phage_capsid Desc=phage major capsid protein, HK97 family KO=- COG=S
 PFAMs=Phage_cap_P2 Desc=Major capsid protein KO=- COG=S
 PFAMs=Phage_cap_P2 Desc=Major capsid protein KO=- COG=S
 PFAMs=Phage_cap_P2 Desc=Major capsid protein KO=- COG=S

--- HOLIN ---
 PFAMs=GMC_oxred_C,GMC_oxred_N Desc=choline dehydrogenase activity KO=ko:K00108 COG=C
 PFAMs=GMC_oxred_C,GMC_oxred_N Desc=Involved in the biosynthesis of the osmoprotectant glycine betaine. Catalyzes th KO=- COG=E
 PFAMs=GMC_oxred_C,GMC_oxred_N Desc=Involved in the biosynthesis of the osmoprotectant glycine betaine. Catalyzes th KO=ko:K00108 COG=C
 PFAMs=APH Desc=Choline/ethanolamine kinase KO=ko:K18844 COG=S
 PFAMs=GMC_oxred_C,GMC_oxred_N Desc=Involved in the biosynthesis of the osmoprotectant glycine betaine. Catalyzes th KO=ko:K00108 COG=E
 PFAMs=GMC_oxred_C,GMC_oxred_N Desc=PFAM glucose-methanol-choline oxidoreductase KO=- COG=E
 PFAMs=APH Desc=Choline/ethanolamine kinase KO=- COG=S
 PFAMs=GMC_oxred_C,GMC_oxred_N Desc=Involved in the biosynthesis of the osmoprotectant glycine betaine. Catalyzes th KO=ko:K00108 COG=E
 PFAMs=Gly_radical,PFL-like Desc=Glycine radical enzyme that catalyzes the cleavage of a C-N bond in choline, pro KO=ko:K00656,ko:K20038 COG=C
 PFAMs=- Desc=Phosphorylcholine phosphatase KO=ko:K21830 COG=E

--- PHAGE INTEGRASE ---
 PFAMs=Phage_integrase Desc=PFAM phage integrase family protein KO=- COG=L
 PFAMs=Phage_integrase Desc=Phage integrase family KO=- COG=L
 PFAMs=Arm-DNA-bind_4,Phage_int_SAM_3,Phage_integrase Desc=Phage integrase family KO=- COG=L
 PFAMs=Phage_int_SAM_4,Phage_integrase Desc=Phage integrase family KO=- COG=L
 PFAMs=Arm-DNA-bind_4,Phage_int_SAM_3,Phage_integrase Desc=Phage integrase family KO=- COG=L
 PFAMs=Phage_integrase Desc=Phage integrase family KO=- COG=L
 PFAMs=Phage_integrase Desc=Phage integrase family KO=ko:K04763 COG=L
 PFAMs=Phage_integrase Desc=Phage integrase family KO=- COG=L
 PFAMs=Phage_int_SAM_4,Phage_integrase Desc=Phage integrase family KO=- COG=L
 PFAMs=Phage_integrase Desc=Phage integrase family KO=- COG=L

--- PORTAL PROTEIN ---
 PFAMs=Fil_haemagg_2,Phage_portal Desc=Portal protein KO=ko:K15125 COG=N
 PFAMs=Phage_TAC_5 Desc=Phage portal protein KO=- COG=S
 PFAMs=- Desc=Phage portal protein, lambda family KO=- COG=S
 PFAMs=Phage_portal Desc=Phage portal protein, HK97 family KO=- COG=S
 PFAMs=Phage_TAC_5 Desc=Phage portal protein KO=- COG=S
 PFAMs=Phage_portal_2 Desc=Phage portal protein, lambda family KO=- COG=F
 PFAMs=Phage_portal Desc=portal protein KO=- COG=S
 PFAMs=Phage_portal Desc=portal protein KO=- COG=S
 PFAMs=Phage_portal Desc=Phage portal protein KO=- COG=S
 PFAMs=Phage_portal Desc=portal protein KO=- COG=S
2. Extract All Prophage Gene Clusters¶
Use the comprehensive WHERE clause from prophage_utils to find all gene clusters matching any prophage module marker. Join with gene_cluster to get species, core/accessory status, and representative sequences.
# Build the full WHERE clause from our module definitions
where_clause = build_spark_where_clause()
print(f'WHERE clause has {where_clause.count("OR")+1} conditions')
print(f'First 500 chars: {where_clause[:500]}...')
WHERE clause has 112 conditions
First 500 chars: LOWER(ann.Description) LIKE '%anti-cbass%' OR LOWER(ann.Description) LIKE '%anti-crispr%' OR LOWER(ann.Description) LIKE '%anti-defense%' OR LOWER(ann.Description) LIKE '%anti-restriction%' OR LOWER(ann.Description) LIKE '%antitermination%' OR LOWER(ann.Description) LIKE '%antiterminator%' OR LOWER(ann.Description) LIKE '%arda%' OR LOWER(ann.Description) LIKE '%baseplate%' OR LOWER(ann.Description) LIKE '%capsid protease%' OR LOWER(ann.Description) LIKE '%capsid protei...
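The shape of `build_spark_where_clause()` can be sketched as below. This is a hypothetical mirror of the helper: the real `MODULES` definitions live in `src/prophage_utils.py`, so the keyword lists and the `"keywords"`-style structure here are assumptions chosen only to reproduce the alphabetically sorted, OR-joined clause seen in the output above.

```python
# Hypothetical mirror of build_spark_where_clause(); keyword lists are
# illustrative, not the project's actual module definitions.
MODULES_SKETCH = {
    "A_packaging": ["terminase", "portal protein"],
    "D_lysis": ["holin", "endolysin"],
}

def build_where_clause(modules: dict) -> str:
    """OR together one LIKE condition per keyword, deduplicated and sorted
    alphabetically (matching the ordering visible in the printed clause)."""
    keywords = sorted({kw.lower() for kws in modules.values() for kw in kws})
    return " OR ".join(
        f"LOWER(ann.Description) LIKE '%{kw}%'" for kw in keywords
    )

clause = build_where_clause(MODULES_SKETCH)
print(clause)
```

Building one big OR chain lets Spark evaluate all markers in a single table scan instead of one query per keyword.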
# Main extraction query: join eggnog_mapper_annotations with gene_cluster
# This is the most expensive query — ~93M annotation rows joined with 132M cluster rows
# Filter significantly reduces the scan via the WHERE clause
# NOTE: faa_sequence is EXCLUDED here to stay under spark.driver.maxResultSize (1024 MiB).
# TerL sequences are fetched separately in the save-outputs cell.
query = f"""
SELECT gc.gene_cluster_id,
gc.gtdb_species_clade_id,
gc.is_core,
gc.is_auxiliary,
gc.is_singleton,
ann.PFAMs,
ann.Description,
ann.KEGG_ko,
ann.COG_category,
ann.EC
FROM kbase_ke_pangenome.gene_cluster gc
JOIN kbase_ke_pangenome.eggnog_mapper_annotations ann
ON gc.gene_cluster_id = ann.query_name
WHERE {where_clause}
"""
print('Running prophage gene extraction query (without faa_sequence)...')
prophage_spark = spark.sql(query)
# Cache in Spark for reuse
prophage_spark.cache()
n_total = prophage_spark.count()
print(f'Total prophage-related gene clusters found: {n_total:,}')
Running prophage gene extraction query (without faa_sequence)...
Total prophage-related gene clusters found: 4,005,537
# Convert to pandas for module classification
# Without faa_sequence, this should be well under the 1 GB driver limit
print('Converting to pandas (without faa_sequence)...')
prophage_df = prophage_spark.toPandas()
print(f'DataFrame shape: {prophage_df.shape}')
print(f'Memory: {prophage_df.memory_usage(deep=True).sum() / 1e6:.1f} MB')
Converting to pandas (without faa_sequence)...
DataFrame shape: (4005537, 10)
Memory: 993.5 MB
3. Classify Gene Clusters into Modules (A-G)¶
Apply the classify_gene_to_module() function from prophage_utils to assign each gene cluster to one or more of the 7 prophage modules.
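The classifier can be thought of as keyword matching per module, with one cluster allowed to land in several modules. A minimal standalone approximation (the real `classify_gene_to_module()` also consults PFAMs, KEGG_ko, and COG_category; the keyword lists below are examples, not the project's definitions):

```python
# Illustrative approximation of classify_gene_to_module(); keyword lists
# are hypothetical examples, not the project's actual module rules.
MODULE_KEYWORDS = {
    "A_packaging": ["terminase", "portal"],
    "C_tail": ["tail fiber", "baseplate", "tape measure"],
    "D_lysis": ["holin", "endolysin"],
}

def classify_description(description: str) -> list:
    """Return every module whose keywords appear in the description,
    so one gene cluster can receive multiple module assignments."""
    desc = (description or "").lower()
    return [m for m, kws in MODULE_KEYWORDS.items()
            if any(kw in desc for kw in kws)]

print(classify_description("Phage terminase large subunit"))  # ['A_packaging']
print(classify_description("tail fiber / endolysin fusion"))  # ['C_tail', 'D_lysis']
```

Returning a list rather than a single label is what makes the multi-module counts below possible.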
# Classify each gene cluster into module(s)
prophage_df['modules'] = prophage_df.apply(
    lambda r: classify_gene_to_module(r['Description'], r['PFAMs'], r['KEGG_ko'], r['COG_category']),
    axis=1
)
# Count how many clusters are assigned to each module
from collections import Counter
module_counter = Counter()
for modules_list in prophage_df['modules']:
    for m in modules_list:
        module_counter[m] += 1
print('Gene clusters per module:')
for module_id in sorted(MODULES.keys()):
    count = module_counter.get(module_id, 0)
    print(f' {module_id}: {count:,} ({MODULES[module_id]["full_name"]})')
# How many clusters have no module assignment?
n_unassigned = sum(1 for m in prophage_df['modules'] if len(m) == 0)
print(f'\nUnassigned (matched WHERE but not any module): {n_unassigned:,}')
# How many are assigned to multiple modules?
n_multi = sum(1 for m in prophage_df['modules'] if len(m) > 1)
print(f'Multi-module assignments: {n_multi:,}')
Gene clusters per module:
 A_packaging: 707,217 (Packaging Module)
 B_head_morphogenesis: 87,336 (Head Morphogenesis Module)
 C_tail: 189,271 (Tail Module)
 D_lysis: 724,499 (Lysis Module)
 E_integration: 773,417 (Integration Module)
 F_lysogenic_regulation: 1,682,902 (Lysogenic Regulation Module)
 G_anti_defense: 63,508 (Anti-Defense Module)
Unassigned (matched WHERE but not any module): 807
Multi-module assignments: 222,464
# Create a flat representation: one row per cluster, comma-separated module list
prophage_df['module_str'] = prophage_df['modules'].apply(lambda x: ','.join(x) if x else 'unassigned')
# Also create boolean columns for each module
for module_id in MODULES.keys():
    prophage_df[f'has_{module_id}'] = prophage_df['modules'].apply(lambda x: module_id in x)
print('Module assignment summary:')
print(prophage_df['module_str'].value_counts().head(20))
Module assignment summary:
module_str
F_lysogenic_regulation                               1472947
E_integration                                         750100
D_lysis                                               718228
A_packaging                                           542404
C_tail                                                182000
A_packaging,F_lysogenic_regulation                    159408
B_head_morphogenesis                                   79113
G_anti_defense                                         37474
F_lysogenic_regulation,G_anti_defense                  25895
E_integration,F_lysogenic_regulation                   23219
A_packaging,B_head_morphogenesis                        4824
C_tail,D_lysis                                          3880
B_head_morphogenesis,C_tail                             2338
D_lysis,F_lysogenic_regulation                          1294
unassigned                                               807
B_head_morphogenesis,C_tail,D_lysis                      780
A_packaging,C_tail                                       272
B_head_morphogenesis,D_lysis                             230
A_packaging,F_lysogenic_regulation,G_anti_defense        139
A_packaging,E_integration                                 83
Name: count, dtype: int64
# Identify TerL clusters specifically
prophage_df['is_terL'] = prophage_df.apply(
    lambda r: is_terL(r['Description'], r['PFAMs'], r['KEGG_ko']),
    axis=1
)
n_terL = prophage_df['is_terL'].sum()
n_terL_species = prophage_df.loc[prophage_df['is_terL'], 'gtdb_species_clade_id'].nunique()
print(f'TerL clusters: {n_terL:,}')
print(f'Species with TerL: {n_terL_species:,}')
# Sample some TerL annotations
print('\nSample TerL annotations:')
terL_sample = prophage_df[prophage_df['is_terL']].head(10)
for _, r in terL_sample.iterrows():
    print(f' {r["gene_cluster_id"][:40]} PFAMs={r["PFAMs"]} Desc={r["Description"][:60]}')
TerL clusters: 38,085
Species with TerL: 11,789

Sample TerL annotations:
 BROK01000081.1_37 PFAMs=Terminase_3,Terminase_6,Terminase_6C Desc=Phage terminase large subunit
 CAADGA010000023.1_12 PFAMs=Terminase_GpA Desc=Phage terminase large subunit (GpA)
 CAAEFA010000148.1_12 PFAMs=Terminase_1 Desc=Phage Terminase
 CABPCV010000034.1_4 PFAMs=Terminase_1 Desc=Phage Terminase
 CACWKU010000099.1_14 PFAMs=Terminase_GpA Desc=Phage terminase large subunit (GpA)
 CACWWM010000014.1_43 PFAMs=Terminase_1 Desc=Terminase
 CACWWS010000182.1_7 PFAMs=Terminase_1 Desc=large subunit
 CACXPF010000002.1_40 PFAMs=Terminase_1 Desc=Terminase
 CACYUS010000017.1_4 PFAMs=- Desc=Phage terminase, large subunit, PBSX family
 CADADH010000052.1_6 PFAMs=Terminase_1 Desc=Phage Terminase
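The samples above suggest TerL detection rests on `Terminase_*` PFAM hits and "terminase large subunit" descriptions. A hedged standalone sketch consistent with those samples (the real `is_terL()` in `prophage_utils` also takes KEGG_ko and may use different rules; the PFAM set below is an assumption):

```python
# Sketch of a TerL detector; the PFAM whitelist is an assumption based on
# the sampled annotations, not the project's actual is_terL() logic.
TERL_PFAMS = {"Terminase_1", "Terminase_3", "Terminase_6", "Terminase_GpA"}

def looks_like_terL(description: str, pfams: str) -> bool:
    """Flag a cluster as TerL on description phrase or PFAM membership."""
    desc = (description or "").lower()
    pfam_set = set((pfams or "").split(","))
    return "terminase large subunit" in desc or bool(pfam_set & TERL_PFAMS)

print(looks_like_terL("Phage terminase large subunit", "Terminase_3,Terminase_3C"))  # True
print(looks_like_terL("Phage terminase small subunit", "-"))                          # False
```

Note the sampled `CACYUS010000017.1_4` entry (PFAMs=`-`, description "Phage terminase, large subunit, PBSX family") shows why checking both fields matters: PFAM evidence alone would miss it.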
4. Core/Accessory/Singleton Status of Prophage Genes¶
# Analyze core/accessory/singleton distribution per module
def classify_conservation(row):
    if row['is_core'] == 1 or row['is_core'] == True:
        return 'core'
    elif row['is_singleton'] == 1 or row['is_singleton'] == True:
        return 'singleton'
    else:
        return 'accessory'
prophage_df['conservation'] = prophage_df.apply(classify_conservation, axis=1)
print('Overall conservation of prophage gene clusters:')
print(prophage_df['conservation'].value_counts())
print('\nConservation by module:')
for module_id in sorted(MODULES.keys()):
    mask = prophage_df[f'has_{module_id}']
    if mask.sum() > 0:
        vc = prophage_df.loc[mask, 'conservation'].value_counts(normalize=True)
        core_pct = vc.get('core', 0) * 100
        acc_pct = vc.get('accessory', 0) * 100
        sing_pct = vc.get('singleton', 0) * 100
        print(f' {module_id}: core={core_pct:.1f}% acc={acc_pct:.1f}% sing={sing_pct:.1f}% (n={mask.sum():,})')
Overall conservation of prophage gene clusters:
conservation
core         1752067
singleton    1466169
accessory     787301
Name: count, dtype: int64

Conservation by module:
 A_packaging: core=57.0% acc=15.4% sing=27.5% (n=707,217)
 B_head_morphogenesis: core=11.6% acc=32.4% sing=56.0% (n=87,336)
 C_tail: core=15.6% acc=34.6% sing=49.8% (n=189,271)
 D_lysis: core=62.6% acc=14.2% sing=23.2% (n=724,499)
 E_integration: core=14.1% acc=26.7% sing=59.3% (n=773,417)
 F_lysogenic_regulation: core=50.4% acc=17.5% sing=32.1% (n=1,682,902)
 G_anti_defense: core=55.1% acc=15.4% sing=29.5% (n=63,508)
5. Species-Level Module Summary¶
# Aggregate to species level: which modules are present, how many clusters per module
species_modules = []
for species_id, grp in prophage_df.groupby('gtdb_species_clade_id'):
    row = {'gtdb_species_clade_id': species_id, 'n_prophage_clusters': len(grp)}
    for module_id in MODULES.keys():
        n_mod = grp[f'has_{module_id}'].sum()
        row[f'n_{module_id}'] = n_mod
        row[f'has_{module_id}'] = n_mod > 0
    row['n_terL'] = grp['is_terL'].sum()
    row['n_modules_present'] = sum(1 for m in MODULES.keys() if row[f'has_{m}'])
    species_modules.append(row)
species_mod_df = pd.DataFrame(species_modules)
# Merge with pangenome stats for context
pangenome_stats = spark.sql("""
SELECT gtdb_species_clade_id, no_genomes, no_gene_clusters, no_core, no_aux_genome
FROM kbase_ke_pangenome.pangenome
""").toPandas()
species_mod_df = species_mod_df.merge(pangenome_stats, on='gtdb_species_clade_id', how='left')
print(f'Species with any prophage annotation: {len(species_mod_df):,}')
print(f'Total species in pangenome: {len(pangenome_stats):,}')
print(f'Fraction with prophage: {len(species_mod_df)/len(pangenome_stats)*100:.1f}%')
print(f'\nModule presence across species:')
for module_id in sorted(MODULES.keys()):
    n = species_mod_df[f'has_{module_id}'].sum()
    pct = n / len(pangenome_stats) * 100
    print(f' {module_id}: {n:,} species ({pct:.1f}%)')
print(f'\nNumber of modules present per species:')
print(species_mod_df['n_modules_present'].value_counts().sort_index())
Species with any prophage annotation: 27,702
Total species in pangenome: 27,702
Fraction with prophage: 100.0%

Module presence across species:
 A_packaging: 27,701 species (100.0%)
 B_head_morphogenesis: 15,536 species (56.1%)
 C_tail: 15,412 species (55.6%)
 D_lysis: 27,670 species (99.9%)
 E_integration: 27,451 species (99.1%)
 F_lysogenic_regulation: 27,701 species (100.0%)
 G_anti_defense: 17,802 species (64.3%)

Number of modules present per species:
n_modules_present
2       3
3     178
4    4577
5    6897
6    6389
7    9658
Name: count, dtype: int64
6. Save Outputs¶
# Save prophage gene clusters (no faa_sequence column present)
prophage_df.to_csv('../data/prophage_gene_clusters.tsv', sep='\t', index=False)
print(f'Saved data/prophage_gene_clusters.tsv: {len(prophage_df):,} rows')
# Save species module summary
species_mod_df.to_csv('../data/species_module_summary.tsv', sep='\t', index=False)
print(f'Saved data/species_module_summary.tsv: {len(species_mod_df):,} rows')
# Fetch TerL protein sequences SEPARATELY (much smaller than full dataset)
# Only query faa_sequence for TerL clusters, avoiding the maxResultSize limit
terL_ids = prophage_df.loc[prophage_df['is_terL'], 'gene_cluster_id'].tolist()
print(f'\nFetching faa_sequence for {len(terL_ids):,} TerL clusters...')
# Batch the IDs to avoid query length limits
batch_size = 5000
terL_seq_dfs = []
for i in range(0, len(terL_ids), batch_size):
    batch = terL_ids[i:i+batch_size]
    id_list = ",".join(f"'{cid}'" for cid in batch)
    seq_query = f"""
        SELECT gene_cluster_id, gtdb_species_clade_id, faa_sequence
        FROM kbase_ke_pangenome.gene_cluster
        WHERE gene_cluster_id IN ({id_list})
          AND faa_sequence IS NOT NULL
    """
    batch_df = spark.sql(seq_query).toPandas()
    terL_seq_dfs.append(batch_df)
    print(f' Batch {i//batch_size+1}: {len(batch_df)} sequences')
terL_seqs = pd.concat(terL_seq_dfs, ignore_index=True) if terL_seq_dfs else pd.DataFrame()
n_with_seq = len(terL_seqs)
print(f'TerL clusters with protein sequence: {n_with_seq:,}')
# Write FASTA
with open('../data/terL_sequences.fasta', 'w') as f:
    for _, row in terL_seqs.iterrows():
        seq = row['faa_sequence']
        if seq and len(seq) > 10:
            header = f">{row['gene_cluster_id']}|{row['gtdb_species_clade_id']}"
            f.write(f"{header}\n{seq}\n")
print(f'Saved data/terL_sequences.fasta')
Saved data/prophage_gene_clusters.tsv: 4,005,537 rows
Saved data/species_module_summary.tsv: 27,702 rows

Fetching faa_sequence for 38,085 TerL clusters...
Batch 1: 5000 sequences
Batch 2: 5000 sequences
Batch 3: 5000 sequences
Batch 4: 5000 sequences
Batch 5: 5000 sequences
Batch 6: 5000 sequences
Batch 7: 5000 sequences
Batch 8: 3085 sequences
TerL clusters with protein sequence: 38,085
Saved data/terL_sequences.fasta
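The ID-batching pattern used in the cell above generalizes to any long `IN (...)` query. A minimal standalone sketch (`batched_in_queries` is a hypothetical helper name; in the notebook each yielded list would be interpolated into `spark.sql(...)`):

```python
# Generic sketch of the batching pattern above: chunk a long ID list so each
# SQL IN (...) clause stays below query-length limits.
def batched_in_queries(ids, batch_size=5000):
    """Yield one quoted, comma-joined IN-list string per batch of ids."""
    for i in range(0, len(ids), batch_size):
        batch = ids[i:i + batch_size]
        yield ",".join(f"'{x}'" for x in batch)

ids = [f"cluster_{n}" for n in range(12)]
in_lists = list(batched_in_queries(ids, batch_size=5))
print(len(in_lists))  # 3 batches: 5 + 5 + 2 ids
print(in_lists[-1])   # "'cluster_10','cluster_11'"
```

With 38,085 TerL IDs and `batch_size=5000`, this yields the eight batches seen in the output above (seven of 5,000 plus one of 3,085).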
7. Validation: Check Known Prophage-Carrying Species¶
Verify that well-known prophage-carrying species (e.g., E. coli with lambda, S. typhimurium with P22) are detected.
# Check known prophage-carrying species
known_species = [
's__Escherichia_coli', # lambda phage
's__Salmonella_enterica', # P22 phage
's__Staphylococcus_aureus', # phi11, phiSa3
's__Pseudomonas_aeruginosa', # multiple prophages
's__Mycobacterium_tuberculosis', # phiRv1, phiRv2
's__Bacillus_subtilis', # SPBeta, PBSX
's__Vibrio_cholerae', # CTXphi
's__Streptococcus_pyogenes', # multiple prophages encoding virulence
]
for species_name in known_species:
    matches = species_mod_df[
        species_mod_df['gtdb_species_clade_id'].str.startswith(species_name)
    ]
    if len(matches) > 0:
        row = matches.iloc[0]
        modules_present = [m for m in MODULES.keys() if row[f'has_{m}']]
        print(f'{species_name}: {row["n_prophage_clusters"]} clusters, '
              f'{row["n_modules_present"]}/7 modules: {modules_present}')
    else:
        print(f'{species_name}: NOT FOUND in prophage results')
s__Escherichia_coli: 343 clusters, 7/7 modules: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
s__Salmonella_enterica: 7278 clusters, 7/7 modules: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
s__Staphylococcus_aureus: 4400 clusters, 7/7 modules: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
s__Pseudomonas_aeruginosa: 5918 clusters, 7/7 modules: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
s__Mycobacterium_tuberculosis: 1004 clusters, 7/7 modules: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
s__Bacillus_subtilis: 1191 clusters, 7/7 modules: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
s__Vibrio_cholerae: 1666 clusters, 7/7 modules: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
s__Streptococcus_pyogenes: 895 clusters, 7/7 modules: ['A_packaging', 'B_head_morphogenesis', 'C_tail', 'D_lysis', 'E_integration', 'F_lysogenic_regulation', 'G_anti_defense']
8. Summary Statistics¶
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# Panel 1: Gene clusters per module
module_names = [MODULES[m]['full_name'] for m in sorted(MODULES.keys())]
module_counts = [module_counter.get(m, 0) for m in sorted(MODULES.keys())]
axes[0].barh(module_names, module_counts, color='steelblue')
axes[0].set_xlabel('Gene Clusters')
axes[0].set_title('Prophage Gene Clusters by Module')
for i, (v, n) in enumerate(zip(module_counts, module_names)):
    axes[0].text(v + max(module_counts)*0.01, i, f'{v:,}', va='center', fontsize=9)
# Panel 2: Species per module
species_counts = [species_mod_df[f'has_{m}'].sum() for m in sorted(MODULES.keys())]
axes[1].barh(module_names, species_counts, color='darkorange')
axes[1].set_xlabel('Species')
axes[1].set_title('Species with Module Present')
for i, v in enumerate(species_counts):
    axes[1].text(v + max(species_counts)*0.01, i, f'{v:,}', va='center', fontsize=9)
# Panel 3: Distribution of modules per species
mod_dist = species_mod_df['n_modules_present'].value_counts().sort_index()
axes[2].bar(mod_dist.index, mod_dist.values, color='seagreen')
axes[2].set_xlabel('Number of Modules Present')
axes[2].set_ylabel('Number of Species')
axes[2].set_title('Prophage Module Richness per Species')
axes[2].set_xticks(range(0, 8))
plt.tight_layout()
plt.savefig('../figures/prophage_module_discovery.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved figures/prophage_module_discovery.png')
Saved figures/prophage_module_discovery.png
# Conservation status by module (stacked bar)
fig, ax = plt.subplots(figsize=(10, 5))
conservation_data = []
for module_id in sorted(MODULES.keys()):
    mask = prophage_df[f'has_{module_id}']
    if mask.sum() > 0:
        vc = prophage_df.loc[mask, 'conservation'].value_counts(normalize=True)
        conservation_data.append({
            'module': MODULES[module_id]['full_name'],
            'core': vc.get('core', 0),
            'accessory': vc.get('accessory', 0),
            'singleton': vc.get('singleton', 0),
        })
cons_df = pd.DataFrame(conservation_data)
cons_df.plot.barh(
x='module', stacked=True, ax=ax,
color=['#2ca02c', '#ff7f0e', '#d62728'],
)
ax.set_xlabel('Fraction')
ax.set_title('Core/Accessory/Singleton Status of Prophage Gene Clusters by Module')
ax.legend(title='Conservation')
plt.tight_layout()
plt.savefig('../figures/prophage_conservation_by_module.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved figures/prophage_conservation_by_module.png')
Saved figures/prophage_conservation_by_module.png
# Final summary
print('='*60)
print('NB01 SUMMARY')
print('='*60)
print(f'Total prophage gene clusters: {len(prophage_df):,}')
print(f'Species with prophage annotations: {species_mod_df.shape[0]:,} / {len(pangenome_stats):,} ({species_mod_df.shape[0]/len(pangenome_stats)*100:.1f}%)')
print(f'TerL clusters (for lineage clustering): {n_terL:,}')
print(f'TerL clusters with sequences: {n_with_seq:,}')
print(f'\nModule prevalence (species with >= 1 cluster):')
for module_id in sorted(MODULES.keys()):
    n = species_mod_df[f'has_{module_id}'].sum()
    print(f' {module_id}: {n:,} species')
print(f'\nFiles saved:')
print(f' data/prophage_gene_clusters.tsv ({len(prophage_df):,} rows)')
print(f' data/species_module_summary.tsv ({len(species_mod_df):,} rows)')
print(f' data/terL_sequences.fasta ({n_with_seq:,} sequences)')
print(f' figures/prophage_module_discovery.png')
print(f' figures/prophage_conservation_by_module.png')
============================================================
NB01 SUMMARY
============================================================
Total prophage gene clusters: 4,005,537
Species with prophage annotations: 27,702 / 27,702 (100.0%)
TerL clusters (for lineage clustering): 38,085
TerL clusters with sequences: 38,085

Module prevalence (species with >= 1 cluster):
 A_packaging: 27,701 species
 B_head_morphogenesis: 15,536 species
 C_tail: 15,412 species
 D_lysis: 27,670 species
 E_integration: 27,451 species
 F_lysogenic_regulation: 27,701 species
 G_anti_defense: 17,802 species

Files saved:
 data/prophage_gene_clusters.tsv (4,005,537 rows)
 data/species_module_summary.tsv (27,702 rows)
 data/terL_sequences.fasta (38,085 sequences)
 figures/prophage_module_discovery.png
 figures/prophage_conservation_by_module.png