import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats as scipy_stats
from statsmodels.stats.multitest import multipletests

DATA_DIR = Path('../data')
ANNOT_DIR = DATA_DIR / 'annotations'
MODULE_DIR = DATA_DIR / 'modules'
ANNOT_DIR.mkdir(parents=True, exist_ok=True)

pilots = pd.read_csv(DATA_DIR / 'pilot_organisms.csv')
pilot_ids = pilots['orgId'].tolist()
print(f"Pilot organisms: {pilot_ids}")

Pilot organisms: ['DvH', 'Btheta', 'Methanococcus_S2', 'psRCH2', 'Putida', 'Phaeo', 'Marino', 'pseudo3_N2E3', 'Koxy', 'Cola', 'WCS417', 'Caulo', 'SB2B', 'pseudo6_N2E2', 'Dino', 'pseudo5_N2C3_1', 'Miya', 'Pedo557', 'MR1', 'Keio', 'Korea', 'PV4', 'pseudo1_N1B4', 'acidovorax_3H11', 'SynE', 'Methanococcus_JJ', 'BFirm', 'Kang', 'ANA3', 'Cup4G11', 'pseudo13_GW456_L13', 'Ponti']

# Initialize Spark (comment out if running Part 2 locally)
try:
    spark = get_spark_session()
    HAS_SPARK = True
    print(f"Spark version: {spark.version}")
except Exception:
    HAS_SPARK = False
    print("No Spark available — running Part 2 only (local mode)")

Spark version: 4.0.1

if HAS_SPARK:
    for org_id in pilot_ids:
        # KEGG annotations (join through besthitkegg — keggmember has no orgId/locusId)
        kegg_file = ANNOT_DIR / f'{org_id}_kegg.csv'
        if not (kegg_file.exists() and kegg_file.stat().st_size > 0):
            kegg = spark.sql(f"""
                SELECT bk.locusId, km.kgroup, kd.desc as kgroup_desc,
                       ke.ecnum as ec
                FROM kescience_fitnessbrowser.besthitkegg bk
                JOIN kescience_fitnessbrowser.keggmember km
                    ON bk.keggOrg = km.keggOrg AND bk.keggId = km.keggId
                LEFT JOIN kescience_fitnessbrowser.kgroupdesc kd
                    ON km.kgroup = kd.kgroup
                LEFT JOIN kescience_fitnessbrowser.kgroupec ke
                    ON km.kgroup = ke.kgroup
                WHERE bk.orgId = '{org_id}'
            """).toPandas()
            kegg.to_csv(kegg_file, index=False)
            print(f"KEGG: {org_id} — {len(kegg)} annotations")
        else:
            print(f"CACHED: {org_id} KEGG")

        # SEED annotations (seedclass has no subsystem columns — just use seed_desc)
        seed_file = ANNOT_DIR / f'{org_id}_seed.csv'
        if not (seed_file.exists() and seed_file.stat().st_size > 0):
            seed = spark.sql(f"""
                SELECT locusId, seed_desc
                FROM kescience_fitnessbrowser.seedannotation
                WHERE orgId = '{org_id}'
            """).toPandas()
            seed.to_csv(seed_file, index=False)
            print(f"SEED: {org_id} — {len(seed)} annotations")
        else:
            print(f"CACHED: {org_id} SEED")

        # Domain annotations
        domain_file = ANNOT_DIR / f'{org_id}_domains.csv'
        if not (domain_file.exists() and domain_file.stat().st_size > 0):
            domains = spark.sql(f"""
                SELECT locusId, domainDb, domainId, domainName,
                       definition, geneSymbol, ec
                FROM kescience_fitnessbrowser.genedomain
                WHERE orgId = '{org_id}'
            """).toPandas()
            domains.to_csv(domain_file, index=False)
            print(f"Domains: {org_id} — {len(domains)} annotations")
        else:
            print(f"CACHED: {org_id} domains")

        # Specific phenotypes
        pheno_file = ANNOT_DIR / f'{org_id}_specific_phenotypes.csv'
        if not (pheno_file.exists() and pheno_file.stat().st_size > 0):
            pheno = spark.sql(f"""
                SELECT sp.locusId, sp.expName,
                       e.expDesc, e.expGroup, e.condition_1
                FROM kescience_fitnessbrowser.specificphenotype sp
                JOIN kescience_fitnessbrowser.experiment e
                    ON sp.orgId = e.orgId AND sp.expName = e.expName
                WHERE sp.orgId = '{org_id}'
            """).toPandas()
            pheno.to_csv(pheno_file, index=False)
            print(f"Phenotypes: {org_id} — {len(pheno)} entries")
        else:
            print(f"CACHED: {org_id} phenotypes")

CACHED: DvH KEGG
CACHED: DvH SEED
CACHED: DvH domains
CACHED: DvH phenotypes
CACHED: Btheta KEGG
CACHED: Btheta SEED
CACHED: Btheta domains
CACHED: Btheta phenotypes
CACHED: Methanococcus_S2 KEGG
CACHED: Methanococcus_S2 SEED
CACHED: Methanococcus_S2 domains
CACHED: Methanococcus_S2 phenotypes
CACHED: psRCH2 KEGG
CACHED: psRCH2 SEED
CACHED: psRCH2 domains
CACHED: psRCH2 phenotypes
CACHED: Putida KEGG
CACHED: Putida SEED
CACHED: Putida domains
CACHED: Putida phenotypes
CACHED: Phaeo KEGG
CACHED: Phaeo SEED
CACHED: Phaeo domains
CACHED: Phaeo phenotypes
CACHED: Marino KEGG
CACHED: Marino SEED
CACHED: Marino domains
CACHED: Marino phenotypes
CACHED: pseudo3_N2E3 KEGG
CACHED: pseudo3_N2E3 SEED
CACHED: pseudo3_N2E3 domains
CACHED: pseudo3_N2E3 phenotypes
CACHED: Koxy KEGG
CACHED: Koxy SEED
CACHED: Koxy domains
CACHED: Koxy phenotypes
CACHED: Cola KEGG
CACHED: Cola SEED
CACHED: Cola domains
CACHED: Cola phenotypes
CACHED: WCS417 KEGG
CACHED: WCS417 SEED
CACHED: WCS417 domains
CACHED: WCS417 phenotypes
CACHED: Caulo KEGG
CACHED: Caulo SEED
CACHED: Caulo domains
CACHED: Caulo phenotypes
CACHED: SB2B KEGG
CACHED: SB2B SEED
CACHED: SB2B domains
CACHED: SB2B phenotypes
CACHED: pseudo6_N2E2 KEGG
CACHED: pseudo6_N2E2 SEED
CACHED: pseudo6_N2E2 domains
CACHED: pseudo6_N2E2 phenotypes
CACHED: Dino KEGG
CACHED: Dino SEED
CACHED: Dino domains
CACHED: Dino phenotypes
CACHED: pseudo5_N2C3_1 KEGG
CACHED: pseudo5_N2C3_1 SEED
CACHED: pseudo5_N2C3_1 domains
CACHED: pseudo5_N2C3_1 phenotypes
CACHED: Miya KEGG
CACHED: Miya SEED
CACHED: Miya domains
CACHED: Miya phenotypes
CACHED: Pedo557 KEGG
CACHED: Pedo557 SEED
CACHED: Pedo557 domains
CACHED: Pedo557 phenotypes
CACHED: MR1 KEGG
CACHED: MR1 SEED
CACHED: MR1 domains
CACHED: MR1 phenotypes
CACHED: Keio KEGG
CACHED: Keio SEED
CACHED: Keio domains
CACHED: Keio phenotypes
CACHED: Korea KEGG
CACHED: Korea SEED
CACHED: Korea domains
CACHED: Korea phenotypes
CACHED: PV4 KEGG
CACHED: PV4 SEED
CACHED: PV4 domains
CACHED: PV4 phenotypes
CACHED: pseudo1_N1B4 KEGG
CACHED: pseudo1_N1B4 SEED
CACHED: pseudo1_N1B4 domains
CACHED: pseudo1_N1B4 phenotypes
CACHED: acidovorax_3H11 KEGG
CACHED: acidovorax_3H11 SEED
CACHED: acidovorax_3H11 domains
CACHED: acidovorax_3H11 phenotypes
CACHED: SynE KEGG
CACHED: SynE SEED
CACHED: SynE domains
CACHED: SynE phenotypes
CACHED: Methanococcus_JJ KEGG
CACHED: Methanococcus_JJ SEED
CACHED: Methanococcus_JJ domains
CACHED: Methanococcus_JJ phenotypes
CACHED: BFirm KEGG
CACHED: BFirm SEED
CACHED: BFirm domains
CACHED: BFirm phenotypes
CACHED: Kang KEGG
CACHED: Kang SEED
CACHED: Kang domains
CACHED: Kang phenotypes
CACHED: ANA3 KEGG
CACHED: ANA3 SEED
CACHED: ANA3 domains
CACHED: ANA3 phenotypes
CACHED: Cup4G11 KEGG
CACHED: Cup4G11 SEED
CACHED: Cup4G11 domains
CACHED: Cup4G11 phenotypes
CACHED: pseudo13_GW456_L13 KEGG
CACHED: pseudo13_GW456_L13 SEED
CACHED: pseudo13_GW456_L13 domains
CACHED: pseudo13_GW456_L13 phenotypes
CACHED: Ponti KEGG
CACHED: Ponti SEED
CACHED: Ponti domains
CACHED: Ponti phenotypes

def enrichment_analysis(module_genes, all_genes, annotation_map, min_annotated=2):
    """Fisher exact test enrichment for a single module.
    
    Parameters
    ----------
    module_genes : set
        Genes in the module.
    all_genes : set
        All genes in the organism.
    annotation_map : dict
        {term: set_of_genes} mapping.
    min_annotated : int
        Minimum annotated genes in module for testing.
    
    Returns
    -------
    results : list of dict
        Enrichment results per term.
    """
    results = []
    n_total = len(all_genes)
    n_module = len(module_genes)
    
    for term, term_genes in annotation_map.items():
        term_genes = term_genes & all_genes  # intersect with valid genes
        overlap = module_genes & term_genes
        
        if len(overlap) < min_annotated:
            continue
        
        # 2x2 contingency table
        a = len(overlap)                          # in module AND annotated
        b = len(module_genes - term_genes)         # in module NOT annotated
        c = len(term_genes - module_genes)          # NOT in module but annotated
        d = n_total - len(module_genes | term_genes)  # neither
        
        odds_ratio, p_value = scipy_stats.fisher_exact([[a, b], [c, d]],
                                                        alternative='greater')
        results.append({
            'term': term,
            'n_overlap': a,
            'n_module': n_module,
            'n_term': len(term_genes),
            'odds_ratio': odds_ratio,
            'p_value': p_value
        })
    
    return results

for org_id in pilot_ids:
    out_file = MODULE_DIR / f'{org_id}_module_annotations.csv'
    cond_file = MODULE_DIR / f'{org_id}_module_conditions.csv'
    
    if out_file.exists() and out_file.stat().st_size > 0:
        print(f"CACHED: {org_id} annotations")
        continue
    
    print(f"\nAnnotating {org_id} modules...")
    
    # Load membership
    membership = pd.read_csv(MODULE_DIR / f'{org_id}_gene_membership.csv', index_col=0)
    all_genes = set(membership.index.astype(str))
    module_names = membership.columns.tolist()
    
    # Load annotations and build term->gene maps
    annotation_maps = {}
    
    # KEGG
    kegg_file = ANNOT_DIR / f'{org_id}_kegg.csv'
    if kegg_file.exists():
        kegg = pd.read_csv(kegg_file)
        kegg_map = kegg.groupby('kgroup')['locusId'].apply(lambda x: set(x.astype(str))).to_dict()
        annotation_maps['KEGG'] = kegg_map
    
    # SEED (group by seed_desc — seedclass has no subsystem hierarchy)
    seed_file = ANNOT_DIR / f'{org_id}_seed.csv'
    if seed_file.exists():
        seed = pd.read_csv(seed_file).dropna(subset=['seed_desc'])
        if len(seed) > 0:
            seed_map = seed.groupby('seed_desc')['locusId'].apply(
                lambda x: set(x.astype(str))).to_dict()
            # Keep terms with 2+ genes
            annotation_maps['SEED'] = {k: v for k, v in seed_map.items() if len(v) >= 2}
    
    # Domains (TIGRFam and PFam)
    domain_file = ANNOT_DIR / f'{org_id}_domains.csv'
    if domain_file.exists():
        domains = pd.read_csv(domain_file)
        domains['locusId'] = domains['locusId'].astype(str)
        tigr = domains[domains['domainDb'] == 'TIGRFam']
        if len(tigr) > 0:
            tigr_map = tigr.groupby('domainId')['locusId'].apply(
                lambda x: set(x.astype(str))).to_dict()
            annotation_maps['TIGRFam'] = tigr_map
        pfam = domains[domains['domainDb'] == 'PFam']
        if len(pfam) > 0:
            pfam_map = pfam.groupby('domainId')['locusId'].apply(
                lambda x: set(x.astype(str))).to_dict()
            annotation_maps['PFam'] = pfam_map
    
    # Run enrichment for each module
    all_results = []
    for mod in module_names:
        mod_genes = set(membership.index[membership[mod] == 1].astype(str))
        if len(mod_genes) == 0:
            continue
        
        for db_name, term_map in annotation_maps.items():
            results = enrichment_analysis(mod_genes, all_genes, term_map)
            for r in results:
                r['module'] = mod
                r['database'] = db_name
            all_results.extend(results)
    
    if all_results:
        enrich_df = pd.DataFrame(all_results)
        # FDR correction
        reject, fdr, _, _ = multipletests(enrich_df['p_value'], method='fdr_bh')
        enrich_df['fdr'] = fdr
        enrich_df['significant'] = reject
        enrich_df = enrich_df.sort_values(['module', 'fdr'])
        enrich_df.to_csv(out_file, index=False)
        
        n_sig = enrich_df['significant'].sum()
        n_modules_annotated = enrich_df[enrich_df['significant']]['module'].nunique()
        print(f"  {n_sig} significant enrichments across {n_modules_annotated} modules")
    else:
        print(f"  No enrichments found")
    
    # Map module activity to experiment conditions
    profiles = pd.read_csv(MODULE_DIR / f'{org_id}_module_profiles.csv', index_col=0)
    exp_meta = pd.read_csv(ANNOT_DIR / f'{org_id}_experiments.csv')
    
    condition_results = []
    for mod in profiles.index:
        activity = profiles.loc[mod]
        # Top 5 most activated experiments
        top_activated = activity.abs().nlargest(5)
        for exp_name, act_value in top_activated.items():
            exp_info = exp_meta[exp_meta['expName'] == exp_name]
            if len(exp_info) > 0:
                condition_results.append({
                    'module': mod,
                    'expName': exp_name,
                    'activity': float(activity[exp_name]),
                    'abs_activity': float(act_value),
                    'expDesc': exp_info.iloc[0].get('expDesc', ''),
                    'expGroup': exp_info.iloc[0].get('expGroup', ''),
                    'condition_1': exp_info.iloc[0].get('condition_1', '')
                })
    
    if condition_results:
        cond_df = pd.DataFrame(condition_results)
        cond_df.to_csv(cond_file, index=False)
        print(f"  Saved condition mappings")

Annotating DvH modules...

  133 significant enrichments across 31 modules
  Saved condition mappings

Annotating Btheta modules...

  134 significant enrichments across 24 modules
  Saved condition mappings

Annotating Methanococcus_S2 modules...

  40 significant enrichments across 10 modules
  Saved condition mappings

Annotating psRCH2 modules...

  177 significant enrichments across 35 modules
  Saved condition mappings

Annotating Putida modules...

  250 significant enrichments across 36 modules
  Saved condition mappings

Annotating Phaeo modules...

  170 significant enrichments across 32 modules
  Saved condition mappings

Annotating Marino modules...

  162 significant enrichments across 31 modules
  Saved condition mappings

Annotating pseudo3_N2E3 modules...

  307 significant enrichments across 34 modules
  Saved condition mappings

Annotating Koxy modules...

  173 significant enrichments across 34 modules
  Saved condition mappings

Annotating Cola modules...

# Summary: top enrichments per organism
for org_id in pilot_ids:
    ann_file = MODULE_DIR / f'{org_id}_module_annotations.csv'
    if not ann_file.exists() or ann_file.stat().st_size < 10:
        continue
    ann = pd.read_csv(ann_file)
    if len(ann) == 0:
        continue
    sig = ann[ann['significant']]
    print(f"\n{org_id}: {len(sig)} significant enrichments")
    # Show top enrichment per module
    top = sig.groupby('module').first().reset_index()
    if len(top) > 0:
        print(top[['module', 'database', 'term', 'n_overlap', 'odds_ratio', 'fdr']].head(10).to_string(index=False))

DvH: 133 significant enrichments
module database      term  n_overlap  odds_ratio          fdr
  M000     PFam   PF12399          2  455.166667 1.770892e-04
  M002     PFam   PF00590          4  725.600000 2.877661e-07
  M003     PFam   PF00072          4   27.220513 2.783908e-04
  M006     PFam   PF00994          3  340.500000 1.845777e-05
  M007     PFam   PF13186          3  291.642857 2.266124e-05
  M008     PFam   PF01061          2         inf 4.120635e-05
  M010     PFam   PF02653          2  303.333333 2.534944e-04
  M011     PFam   PF00460          7  437.906977 4.820529e-10
  M012  TIGRFam TIGR01539          3         inf 3.966784e-06
  M013     PFam   PF02508          2         inf 1.770892e-04

Btheta: 134 significant enrichments
module database                                                                  term  n_overlap  odds_ratio          fdr
  M000     PFam                                                               PF01915          2   44.175824 3.270124e-03
  M002     SEED                                     putative glycosyltransferase yibD          2  576.857143 1.865003e-04
  M003     SEED                                                BexA, membrane protein          2         inf 1.510169e-04
  M004     KEGG                                                                K03701          2         inf 1.510169e-04
  M005     SEED Ferric siderophore transport system, periplasmic binding protein TonB          2  336.833333 2.242119e-04
  M007     PFam                                                               PF00534          8   44.210526 2.153416e-08
  M008     SEED                                   Cardiolipin synthetase (EC 2.7.8.-)          2  298.148148 5.264009e-04
  M012     PFam                                                               PF00370          2  385.047619 1.865003e-04
  M014     PFam                                                               PF13522          2         inf 1.510169e-04
  M015     SEED                                        conserved hypothetical protein          3  207.982759 7.090474e-05

Methanococcus_S2: 40 significant enrichments
module database      term  n_overlap  odds_ratio      fdr
  M000  TIGRFam TIGR02090          2         inf 0.000827
  M010     KEGG    K07079          2  223.636364 0.000827
  M014     PFam   PF04021          4   51.200000 0.000232
  M015     PFam   PF12838          3   43.023529 0.000827
  M016     KEGG    K07321          2         inf 0.000827
  M018     PFam   PF00072          2 1239.000000 0.000341
  M019     PFam   PF01472          2  205.833333 0.000827
  M021     PFam   PF00148          4         inf 0.000002
  M022     PFam   PF00753          3   73.560000 0.000534
  M025     PFam   PF04055          3   17.382775 0.002688

psRCH2: 177 significant enrichments
module database      term  n_overlap  odds_ratio          fdr
  M000     PFam   PF05157          2  100.363636 1.366850e-03
  M002     PFam   PF00440          3   51.843750 4.219746e-04
  M003     PFam   PF02239          3  382.961538 6.637691e-05
  M004     PFam   PF16363          5  296.964286 8.255354e-08
  M005     PFam   PF00361          2   60.363636 2.022822e-03
  M006     PFam   PF01584          4   57.286957 1.049572e-04
  M007  TIGRFam TIGR00786          2   45.777778 3.509183e-03
  M008     PFam   PF04324          2  333.500000 4.259970e-04
  M009     PFam   PF01144          2  184.833333 6.155129e-04
  M012     PFam   PF00072          5    7.015152 2.457249e-03

Putida: 250 significant enrichments
module database                                   term  n_overlap  odds_ratio          fdr
  M000     PFam                                PF03471          2  288.606061 3.728746e-04
  M001     SEED Transcriptional regulator, LysR family          2   11.223810 2.290103e-02
  M002     PFam                                PF01012          2         inf 1.971565e-04
  M003     PFam                                PF00590          3  198.041667 3.799954e-05
  M004     PFam                                PF00668          4         inf 3.362548e-07
  M005     PFam                                PF00171          3   30.581897 8.592811e-04
  M006     PFam                                PF13193          2   39.541667 3.639559e-03
  M007  TIGRFam                              TIGR00177          2  278.882353 6.278973e-04
  M008  TIGRFam                              TIGR01726          2   35.133333 4.198886e-03
  M009     KEGG                                 K03808          2         inf 1.498389e-04

Phaeo: 170 significant enrichments
module database                                                  term  n_overlap  odds_ratio          fdr
  M000     PFam                                               PF05138          2  440.285714 4.749231e-04
  M001     PFam                                               PF00994          3  256.333333 6.955303e-05
  M002     SEED Pyrimidine ABC transporter, transmembrane component 1          2         inf 6.768074e-04
  M004     PFam                                               PF00460          5         inf 5.443611e-08
  M005     PFam                                               PF06568          2  171.222222 6.768074e-04
  M006     PFam                                               PF01979          2   85.555556 1.191026e-03
  M007     PFam                                               PF01571          2   61.540000 1.641622e-03
  M008     PFam                                               PF00512          2   30.670000 4.418259e-03
  M009     PFam                                               PF04321          2  102.666667 9.718327e-04
  M011     PFam                                               PF01471          3  153.950000 9.273318e-05

Marino: 162 significant enrichments
module database                                          term  n_overlap  odds_ratio      fdr
  M001     PFam                                       PF13538          2  329.545455 0.000648
  M002     KEGG                                        K10040          2         inf 0.000256
  M003     PFam                                       PF05138          2         inf 0.000561
  M005     PFam                                       PF00482          2  161.333333 0.000686
  M006     KEGG                                        K00951          2         inf 0.000581
  M007     PFam                                       PF13531          3   94.408696 0.000453
  M008     SEED NADPH dependent preQ0 reductase (EC 1.7.1.13)          2         inf 0.000561
  M009     PFam                                       PF01565          2  346.476190 0.000581
  M010     PFam                                       PF00440          2   11.832237 0.019219
  M011     SEED      Magnesium and cobalt efflux protein CorC          2  180.350000 0.001111

pseudo3_N2E3: 307 significant enrichments
module database                                            term  n_overlap  odds_ratio          fdr
  M000     PFam                                         PF08028          3   39.946524 5.175953e-04
  M001     SEED                 FIG002188: hypothetical protein          2         inf 2.102014e-04
  M003     SEED Sensory box histidine kinase/response regulator          2  589.176471 3.092783e-04
  M004     KEGG                                          K00673          2  118.928571 1.021151e-03
  M006     KEGG                                          K12979          3         inf 2.287327e-05
  M007     PFam                                         PF02653          2   61.641975 1.972041e-03
  M009     PFam                                         PF00905          2   69.097222 2.035097e-03
  M010  TIGRFam                                       TIGR00254          8   31.415873 1.749435e-07
  M012     PFam                                         PF01144          2  227.363636 5.273625e-04
  M013     PFam                                         PF00361          2         inf 4.587149e-04

Koxy: 173 significant enrichments
module database                                              term  n_overlap  odds_ratio      fdr
  M001     PFam                                           PF00696          3   58.123404 0.000477
  M002     PFam                                           PF00294          2   21.738095 0.010464
  M003     KEGG                                            K01607          2  228.250000 0.001025
  M005     PFam                                           PF00440          2    8.215580 0.037094
  M006  TIGRFam                                         TIGR01494          2   37.941667 0.005479
  M008     PFam                                           PF02518          5   17.352490 0.000370
  M009     PFam                                           PF07690          2   12.652482 0.025755
  M010     KEGG                                            K00068          2         inf 0.000403
  M011     SEED Cysteine desulfurase (EC 2.8.1.7), SufS subfamily          2         inf 0.000580
  M013     SEED          Phage tail length tape-measure protein 1          2         inf 0.000580

Cola: 197 significant enrichments
module database                                                           term  n_overlap  odds_ratio      fdr
  M000     PFam                                                        PF00534          5   27.912482 0.000156
  M001     PFam                                                        PF00155          2   21.744444 0.008923
  M002     KEGG                                                         K01752          2         inf 0.000716
  M003     KEGG                                                         K03296          3  249.127660 0.000230
  M004     PFam                                                        PF00072          4    6.702609 0.008263
  M005     KEGG                                                         K06147          2         inf 0.000299
  M008     SEED                    Sucrose-6-phosphate hydrolase (EC 3.2.1.B3)          2         inf 0.000156
  M012     PFam                                                        PF13649          2   32.666667 0.005413
  M013     SEED Transcriptional regulator of rhamnose utilization, GntR family          3         inf 0.000039
  M016     SEED                                 ABC transporter efflux protein          2         inf 0.000716

WCS417: 250 significant enrichments
module database      term  n_overlap  odds_ratio          fdr
  M000     PFam   PF00361          2         inf 4.071644e-04
  M001     PFam   PF08443          2   94.978261 1.941792e-03
  M002  TIGRFam TIGR00254          9   38.142439 9.518412e-09
  M003     KEGG    K00114          2         inf 4.664208e-04
  M004     KEGG    K02483          3   92.893617 4.071644e-04
  M006     KEGG    K07264          2         inf 5.970382e-04
  M007     KEGG    K11073          2  489.555556 4.071644e-04
  M008     PFam   PF00528          4   12.008333 1.911009e-03
  M009     PFam   PF13673          4   15.742754 1.018165e-03
  M010     PFam   PF00590          5  385.263158 1.759091e-08

Caulo: 112 significant enrichments
module database      term  n_overlap  odds_ratio      fdr
  M002     PFam   PF00912          3   93.457143 0.000356
  M005     KEGG    K01847          2         inf 0.000563
  M006     PFam   PF00270          5   72.377778 0.000009
  M007     PFam   PF00392          2   78.357143 0.001226
  M008     KEGG    K02014          2   12.248120 0.017675
  M009     PFam   PF00528          3   23.070922 0.001275
  M010  TIGRFam TIGR01845          2  549.500000 0.000356
  M011     PFam   PF00482          2  135.875000 0.001275
  M014     KEGG    K01847          2         inf 0.000402
  M016     PFam   PF00994          2  470.714286 0.000356

SB2B: 230 significant enrichments
module database      term  n_overlap  odds_ratio      fdr
  M002     PFam   PF02769          2  154.850000 0.001104
  M003  TIGRFam TIGR00254          9   12.744371 0.000014
  M005     KEGG    K06213          2         inf 0.000923
  M006     PFam   PF00171          2   18.642424 0.010310
  M007  TIGRFam TIGR02532          4   72.257310 0.000037
  M008     PFam   PF00664          2   81.473684 0.001939
  M010     PFam   PF13476          3  132.042857 0.000268
  M011     PFam   PF00675          2   34.111111 0.005313
  M012     PFam   PF00034          3   93.393939 0.000401
  M013     KEGG    K00257          2         inf 0.001014

pseudo6_N2E2: 250 significant enrichments
module database                                                                                                                           term  n_overlap  odds_ratio          fdr
  M000     KEGG                                                                                                                         K00031          2         inf 5.123179e-04
  M001     KEGG                                                                                                                         K11901          2  340.000000 5.626637e-04
  M002  TIGRFam                                                                                                                      TIGR03075          2  211.750000 1.047846e-03
  M003     PFam                                                                                                                        PF00270          6   79.421875 6.850756e-08
  M005     PFam                                                                                                                        PF01584          3   34.044643 8.590395e-04
  M006     KEGG                                                                                                                         K02057          2  211.750000 1.047846e-03
  M009     SEED Lead, cadmium, zinc and mercury transporting ATPase (EC 3.6.3.3) (EC 3.6.3.5); Copper-translocating P-type ATPase (EC 3.6.3.4)          2  105.854167 1.480216e-03
  M010     PFam                                                                                                                        PF01012          2         inf 5.123179e-04
  M011     SEED                                             Phosphonate ABC transporter phosphate-binding periplasmic component (TC 3.A.1.9.1)          4   23.176201 4.547849e-04
  M012     PFam                                                                                                                        PF00460          6         inf 5.097308e-11

Dino: 169 significant enrichments
module database                                                                    term  n_overlap  odds_ratio      fdr
  M000     PFam                                                                 PF00270          5   62.281746 0.000007
  M001     PFam                                                                 PF00364          2   81.179487 0.001397
  M002     PFam                                                                 PF00950          2  217.586207 0.000788
  M003     PFam                                                                 PF02653          2   24.984127 0.006295
  M004     PFam                                                                 PF05175          3   49.994681 0.000653
  M005     SEED Transcriptional regulator, MocR family, putative Taurine regulator tauR          2         inf 0.000708
  M006     PFam                                                                 PF00124          2         inf 0.000708
  M007     PFam                                                                 PF01315          2  105.600000 0.001056
  M008     PFam                                                                 PF11150          2  301.238095 0.000708
  M009     KEGG                                                                  K02065          2         inf 0.000708

pseudo5_N2C3_1: 334 significant enrichments
module database                                                                                                                           term  n_overlap  odds_ratio          fdr
  M000     PFam                                                                                                                        PF00460          6         inf 6.011770e-11
  M001  TIGRFam                                                                                                                      TIGR01720          4         inf 5.212239e-07
  M002  TIGRFam                                                                                                                      TIGR00254         12   56.571429 1.710577e-13
  M003     SEED                                                                                           Histidine ammonia-lyase (EC 4.3.1.3)          2         inf 5.325410e-04
  M004     PFam                                                                                                                        PF01297          2  234.863636 6.605538e-04
  M005     PFam                                                                                                                        PF00528          4   13.363158 1.551053e-03
  M006     PFam                                                                                                                        PF01261          2   67.947368 1.921985e-03
  M007     SEED Lead, cadmium, zinc and mercury transporting ATPase (EC 3.6.3.3) (EC 3.6.3.5); Copper-translocating P-type ATPase (EC 3.6.3.4)          2         inf 5.325410e-04
  M008     PFam                                                                                                                        PF02653          2   44.102564 3.173173e-03
  M009     PFam                                                                                                                        PF13188          3  109.361702 2.506310e-04

Miya: 165 significant enrichments
module database    term  n_overlap  odds_ratio          fdr
  M000     PFam PF01061          2         inf 1.001823e-03
  M001     PFam PF00072          3   27.055556 2.799420e-03
  M002     PFam PF04879          2   55.777778 2.846125e-03
  M003     PFam PF08668          3   49.058824 1.001823e-03
  M004     PFam PF13439          4   35.869565 7.033269e-04
  M005     PFam PF10111          3   52.723404 1.025802e-03
  M006     PFam PF00384          4   24.989899 9.379348e-04
  M007     PFam PF00005          3    4.862069 4.334861e-02
  M008     PFam PF00460          7  201.779070 5.890312e-09
  M009     PFam PF02518          9    8.564516 2.366509e-04

Pedo557: 204 significant enrichments
module database                                                                             term  n_overlap  odds_ratio      fdr
  M000     PFam                                                                          PF10111          3   52.107143 0.000449
  M001     SEED                                   GlpG protein (membrane protein of glp regulon)          3         inf 0.000051
  M002     SEED                                                                             GldJ          2         inf 0.000550
  M003     SEED                                             Aerobactin siderophore receptor IutA          2         inf 0.000550
  M004     SEED Maltose/maltodextrin ABC transporter, substrate binding periplasmic protein MalE          2         inf 0.000140
  M005     PFam                                                                          PF04324          2         inf 0.000142
  M006     PFam                                                                          PF04450          2  182.166667 0.000999
  M008     SEED                                                   ABC transporter efflux protein          2         inf 0.000550
  M009     PFam                                                                          PF00155          4   29.163880 0.000314
  M010     PFam                                                                          PF13407          2   49.897727 0.002399

MR1: 191 significant enrichments
module database    term  n_overlap  odds_ratio          fdr
  M000     PFam PF00440          2   23.993590 6.658439e-03
  M002     PFam PF00171          2  107.514286 1.090595e-03
  M004     PFam PF00460          6         inf 7.968301e-10
  M006     KEGG  K01772          2         inf 5.280934e-04
  M007     KEGG  K07114          2   51.791667 3.027254e-03
  M008     PFam PF00854          2   77.708333 2.072377e-03
  M009     PFam PF02151          2         inf 6.819676e-04
  M010     KEGG  K02078          2         inf 6.819676e-04
  M012     KEGG  K06148          2  359.047619 5.091676e-04
  M013     PFam PF01145          2   42.409091 3.703192e-03

Keio: 176 significant enrichments
module database                         term  n_overlap  odds_ratio      fdr
  M000     PFam                      PF02518          3   28.653061 0.001307
  M001     PFam                      PF02378          2   21.528736 0.009528
  M002  TIGRFam                    TIGR01033          2         inf 0.000768
  M003  TIGRFam                    TIGR00790          2  155.750000 0.001701
  M004     PFam                      PF06411          2         inf 0.000768
  M005     PFam                      PF13407          2   19.726316 0.010320
  M006  TIGRFam                    TIGR02937          2  155.750000 0.001701
  M007     PFam                      PF01232          2   86.321839 0.001973
  M009     KEGG                       K00042          2         inf 0.000768
  M010     SEED Paraquat-inducible protein B          2  155.750000 0.001701

Korea: 121 significant enrichments
module database    term  n_overlap  odds_ratio      fdr
  M000     PFam PF13692          4   29.371429 0.000405
  M001     PFam PF00391          2  563.000000 0.000405
  M002     PFam PF01261          3  213.319149 0.000233
  M004     PFam PF13439          4   28.982609 0.000405
  M005     PFam PF13505          3   74.266667 0.000416
  M006     PFam PF00226          2  375.555556 0.000435
  M007     PFam PF01979          2   33.610000 0.005721
  M008     PFam PF12844          2   34.781250 0.006287
  M011     PFam PF04851          4  141.684211 0.000027
  M012     PFam PF07690          3   12.723214 0.006070

PV4: 199 significant enrichments
module database    term  n_overlap  odds_ratio      fdr
  M000     PFam PF01228          2         inf 0.001063
  M001     PFam PF13649          2    8.177778 0.037950
  M002     PFam PF00486          4   15.994565 0.001063
  M004     KEGG  K03808          2         inf 0.001063
  M006     PFam PF01551          2   30.781250 0.007133
  M007     KEGG  K02851          2         inf 0.001063
  M008     PFam PF07690          2    9.180124 0.031990
  M009     KEGG  K03722          2         inf 0.001063
  M010     PFam PF03176          3   29.660000 0.001232
  M011     PFam PF00595          3  206.651163 0.000728

pseudo1_N1B4: 112 significant enrichments
module database    term  n_overlap  odds_ratio      fdr
  M000     PFam PF02151          2         inf 0.000496
  M001     PFam PF00361          2   59.486111 0.002373
  M002     PFam PF02653          4   73.282051 0.000113
  M003     PFam PF13442          2   35.800000 0.003882
  M004     PFam PF08448          2   13.909091 0.015573
  M005     PFam PF00501          2   11.438503 0.021561
  M006     PFam PF01553          2   35.675000 0.004259
  M007     KEGG  K02168          2         inf 0.000496
  M008     PFam PF00549          2         inf 0.000514
  M010     PFam PF01842          4  186.260870 0.000031

acidovorax_3H11: 160 significant enrichments
module database                                                       term  n_overlap  odds_ratio      fdr
  M000     SEED     FIG000557: hypothetical protein co-occurring with RecR          2  489.500000 0.000835
  M001     SEED                  Filamentation induced by cAMP protein Fic          2  161.833333 0.001829
  M002     KEGG                                                     K00548          2         inf 0.000835
  M003     SEED                     transcriptional regulator, TetR family          2   55.078014 0.004515
  M004     PFam                                                    PF01738          2   78.727273 0.002669
  M005     PFam                                                    PF01418          2         inf 0.000835
  M006     PFam                                                    PF00561          2    8.606667 0.041045
  M007  TIGRFam                                                  TIGR02532          3         inf 0.000030
  M008     KEGG                                                     K01422          2         inf 0.000835
  M009     SEED Signaling protein with a acyltransferase and GGDEF domains          2         inf 0.000835

SynE: 60 significant enrichments
module database                                    term  n_overlap  odds_ratio      fdr
  M000     KEGG                                  K01090          2         inf 0.003765
  M001     KEGG                                  K02005          2  138.444444 0.003765
  M003     KEGG                                  K03086          2         inf 0.003765
  M004     PFam                                 PF00005          2   11.974194 0.024968
  M005     PFam                                 PF00528          2   11.272727 0.026186
  M006     PFam                                 PF14559          2   11.236364 0.026865
  M008  TIGRFam                               TIGR01184          2   54.231884 0.007311
  M009     PFam                                 PF00999          2   51.944444 0.007446
  M010     PFam                                 PF01022          2  117.437500 0.003765
  M013     SEED Two-component system response regulator          2         inf 0.003765

Methanococcus_JJ: 63 significant enrichments
module database                                                 term  n_overlap  odds_ratio      fdr
  M002     PFam                                              PF04055          2   11.140496 0.025667
  M003     SEED                                     Archaeal histone          2         inf 0.003011
  M006     PFam                                              PF02163          2         inf 0.003743
  M008     PFam                                              PF00148          4  115.826087 0.000539
  M009     KEGG                                               K01834          2         inf 0.003011
  M010     KEGG                                               K07321          2         inf 0.003011
  M011     KEGG                                               K00196          2         inf 0.003011
  M013     PFam                                              PF00753          3   17.959821 0.004434
  M016     SEED Iron transport Periplasmic binding protein precursor          2  124.363636 0.003011

BFirm: 145 significant enrichments
module database                                                                                    term  n_overlap  odds_ratio          fdr
  M000     PFam                                                                                 PF01094          2   17.081081 1.568878e-02
  M001     PFam                                                                                 PF00155          2   17.428571 1.555756e-02
  M002     PFam                                                                                 PF00034          5   37.236111 6.464733e-05
  M003     PFam                                                                                 PF00270          2   18.631944 1.417507e-02
  M004     PFam                                                                                 PF00067          2         inf 5.539283e-04
  M005     SEED Transcriptional regulator, GntR family domain / Aspartate aminotransferase (EC 2.6.1.1)          2   13.963542 2.037010e-02
  M006     PFam                                                                                 PF01012          2  540.500000 5.539283e-04
  M007     PFam                                                                                 PF00460          5  597.444444 3.172766e-08
  M008     SEED                                                 Glutathione S-transferase (EC 2.5.1.18)          2   25.590476 9.555127e-03
  M009     PFam                                                                                 PF12418          2  224.041667 1.285883e-03

Kang: 97 significant enrichments
module database                                      term  n_overlap  odds_ratio      fdr
  M001     PFam                                   PF02151          2         inf 0.001455
  M002     PFam                                   PF00534          2         inf 0.001455
  M003     SEED Isovaleryl-CoA dehydrogenase (EC 1.3.8.4)          2         inf 0.001634
  M004     PFam                                   PF02518          3   14.692500 0.004123
  M005     PFam                                   PF13443          2   27.083333 0.009161
  M006     PFam                                   PF00089          2         inf 0.001634
  M008     PFam                                   PF02151          2         inf 0.001455
  M009     PFam                                   PF00109          2   53.027027 0.004159
  M011     KEGG                                    K05568          3         inf 0.000120
  M013     KEGG                                    K04761          2         inf 0.001634

ANA3: 184 significant enrichments
module database                                              term  n_overlap  odds_ratio      fdr
  M000     KEGG                                            K02415          2         inf 0.000810
  M001     PFam                                           PF00326          2   11.554487 0.024017
  M002     PFam                                           PF03553          2   50.208333 0.004159
  M004     PFam                                           PF00270          2   11.332288 0.024017
  M005     PFam                                           PF00005          2    9.374026 0.031599
  M007     SEED Site-specific recombinase, phage integrase family          2         inf 0.000810
  M008     PFam                                           PF00270          3   10.933131 0.007542
  M010  TIGRFam                                         TIGR00177          3  230.872340 0.000412
  M011     SEED                 Alkaline phosphatase (EC 3.1.3.1)          2   95.421053 0.002200
  M013     PFam                                           PF04349          3         inf 0.000412

Cup4G11: 244 significant enrichments
module database                 term  n_overlap  odds_ratio      fdr
  M000     KEGG               K01142          2         inf 0.000362
  M001     SEED DnaK-related protein          2         inf 0.000362
  M002     PFam              PF13145          2   87.930556 0.001832
  M003  TIGRFam            TIGR02001          2         inf 0.000362
  M004     PFam              PF00271          5  117.185185 0.000003
  M005     KEGG               K00548          2         inf 0.000362
  M006     PFam              PF00171          2    7.693529 0.045049
  M007     KEGG               K03802          2         inf 0.000362
  M008     PFam              PF01077          2  263.875000 0.000762
  M009  TIGRFam            TIGR01783          2   37.660714 0.005124

pseudo13_GW456_L13: 191 significant enrichments
module database                                 term  n_overlap  odds_ratio      fdr
  M000     SEED Histidine ammonia-lyase (EC 4.3.1.3)          2         inf 0.001073
  M001     PFam                              PF14497          2   63.703704 0.004443
  M002     KEGG                               K03293          2   44.750000 0.005618
  M003     PFam                              PF00590          3   91.425532 0.001073
  M005     PFam                              PF00532          3   68.553191 0.001073
  M006     PFam                              PF02589          2         inf 0.001131
  M007     KEGG                               K10125          2  179.125000 0.001981
  M008     PFam                              PF00528          4    5.411765 0.018468
  M009     SEED            Ferrichrome-iron receptor          2   89.541667 0.003184
  M010  TIGRFam                            TIGR01726          3   17.090426 0.004485

Ponti: 171 significant enrichments
module database                                                    term  n_overlap  odds_ratio      fdr
  M000     PFam                                                 PF00884          2         inf 0.000907
  M002     PFam                                                 PF19335          4  316.000000 0.000014
  M005     SEED Probable Co/Zn/Cd efflux system membrane fusion protein          4   25.342657 0.000709
  M006     KEGG                                                  K01752          2         inf 0.000907
  M007     PFam                                                 PF07715          5   11.108025 0.000953
  M008     KEGG                                                  K06147          2  151.416667 0.001737
  M010     KEGG                                                  K04047          2         inf 0.000907
  M011     PFam                                                 PF02661          2   93.384615 0.002251
  M012     PFam                                                 PF00171          2   21.595238 0.010614
  M013     SEED                Magnesium and cobalt efflux protein CorC          3  231.957447 0.000265

import matplotlib.pyplot as plt

FIG_DIR = Path('../figures')

# Collect enrichment stats per organism
enrich_stats = []
for org_id in pilot_ids:
    ann_file = MODULE_DIR / f'{org_id}_module_annotations.csv'
    member_file = MODULE_DIR / f'{org_id}_gene_membership.csv'
    if not ann_file.exists() or ann_file.stat().st_size < 10 or not member_file.exists():
        continue
    ann = pd.read_csv(ann_file)
    if len(ann) == 0:
        continue
    membership = pd.read_csv(member_file, index_col=0)
    n_modules = len(membership.columns)
    sig = ann[ann['significant']]
    modules_annotated = sig['module'].nunique()
    
    # Count by database
    db_counts = sig.groupby('database')['module'].nunique()
    enrich_stats.append({
        'orgId': org_id,
        'n_modules': n_modules,
        'n_annotated': modules_annotated,
        'pct_annotated': 100 * modules_annotated / n_modules,
        'n_kegg': db_counts.get('KEGG', 0),
        'n_seed': db_counts.get('SEED', 0),
        'n_tigrfam': db_counts.get('TIGRFam', 0),
        'n_pfam': db_counts.get('PFam', 0),
    })

stats_df = pd.DataFrame(enrich_stats)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: % modules annotated per organism
ax = axes[0]
stats_df_sorted = stats_df.sort_values('pct_annotated', ascending=True)
ax.barh(range(len(stats_df_sorted)), stats_df_sorted['pct_annotated'], color='#2196F3')
ax.set_yticks(range(len(stats_df_sorted)))
ax.set_yticklabels(stats_df_sorted['orgId'], fontsize=7)
ax.set_xlabel('% modules with significant enrichment')
ax.set_title('Module Annotation Rate by Organism')
ax.axvline(stats_df['pct_annotated'].median(), color='red', linestyle='--', alpha=0.7,
           label=f"median={stats_df['pct_annotated'].median():.0f}%")
ax.legend(fontsize=8)

# Right: annotation database breakdown (stacked bar)
ax = axes[1]
orgs = stats_df.sort_values('n_annotated', ascending=True)['orgId'].tolist()
kegg_vals = [stats_df[stats_df['orgId']==o]['n_kegg'].values[0] for o in orgs]
seed_vals = [stats_df[stats_df['orgId']==o]['n_seed'].values[0] for o in orgs]
tigr_vals = [stats_df[stats_df['orgId']==o]['n_tigrfam'].values[0] for o in orgs]
pfam_vals = [stats_df[stats_df['orgId']==o]['n_pfam'].values[0] for o in orgs]
y = range(len(orgs))
ax.barh(y, kegg_vals, label='KEGG', color='#4CAF50')
ax.barh(y, seed_vals, left=kegg_vals, label='SEED', color='#FF9800')
ax.barh(y, tigr_vals, left=[k+s for k,s in zip(kegg_vals, seed_vals)], label='TIGRFam', color='#9C27B0')
ax.barh(y, pfam_vals, left=[k+s+t for k,s,t in zip(kegg_vals, seed_vals, tigr_vals)], label='PFam', color='#2196F3')
ax.set_yticks(y)
ax.set_yticklabels(orgs, fontsize=7)
ax.set_xlabel('Number of modules with enrichment')
ax.set_title('Enrichment Sources per Organism')
ax.legend(fontsize=8)

plt.tight_layout()
plt.savefig(FIG_DIR / 'enrichment_summary.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Mean annotation rate: {stats_df['pct_annotated'].mean():.1f}%")
print(f"Saved: figures/enrichment_summary.png")

Mean annotation rate: 79.5%
Saved: figures/enrichment_summary.png

04 Module Annotation

NB 04: Module Functional Annotation¶

Part 1: Extract Annotations from Spark¶

Part 2: Enrichment Analysis¶