04 Module Annotation
Jupyter notebook from the Pan-bacterial Fitness Modules via Independent Component Analysis project.
NB 04: Module Functional Annotation¶
Label each ICA module with biological function using enrichment analysis.
Part 1 (JupyterHub): Extract KEGG, SEED, domain, and specific phenotype annotations from Spark.
Part 2 (local): Fisher exact test enrichment for each module.
Run Part 1 on JupyterHub first, then Part 2 can run locally.
In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats as scipy_stats
from statsmodels.stats.multitest import multipletests
DATA_DIR = Path('../data')
ANNOT_DIR = DATA_DIR / 'annotations'
MODULE_DIR = DATA_DIR / 'modules'
ANNOT_DIR.mkdir(parents=True, exist_ok=True)
pilots = pd.read_csv(DATA_DIR / 'pilot_organisms.csv')
pilot_ids = pilots['orgId'].tolist()
print(f"Pilot organisms: {pilot_ids}")
Pilot organisms: ['DvH', 'Btheta', 'Methanococcus_S2', 'psRCH2', 'Putida', 'Phaeo', 'Marino', 'pseudo3_N2E3', 'Koxy', 'Cola', 'WCS417', 'Caulo', 'SB2B', 'pseudo6_N2E2', 'Dino', 'pseudo5_N2C3_1', 'Miya', 'Pedo557', 'MR1', 'Keio', 'Korea', 'PV4', 'pseudo1_N1B4', 'acidovorax_3H11', 'SynE', 'Methanococcus_JJ', 'BFirm', 'Kang', 'ANA3', 'Cup4G11', 'pseudo13_GW456_L13', 'Ponti']
Part 1: Extract Annotations from Spark¶
Run this section on JupyterHub.
In [2]:
# Initialize Spark (comment out if running Part 2 locally)
try:
spark = get_spark_session()
HAS_SPARK = True
print(f"Spark version: {spark.version}")
except Exception:
HAS_SPARK = False
print("No Spark available — running Part 2 only (local mode)")
Spark version: 4.0.1
In [3]:
if HAS_SPARK:
for org_id in pilot_ids:
# KEGG annotations (join through besthitkegg — keggmember has no orgId/locusId)
kegg_file = ANNOT_DIR / f'{org_id}_kegg.csv'
if not (kegg_file.exists() and kegg_file.stat().st_size > 0):
kegg = spark.sql(f"""
SELECT bk.locusId, km.kgroup, kd.desc as kgroup_desc,
ke.ecnum as ec
FROM kescience_fitnessbrowser.besthitkegg bk
JOIN kescience_fitnessbrowser.keggmember km
ON bk.keggOrg = km.keggOrg AND bk.keggId = km.keggId
LEFT JOIN kescience_fitnessbrowser.kgroupdesc kd
ON km.kgroup = kd.kgroup
LEFT JOIN kescience_fitnessbrowser.kgroupec ke
ON km.kgroup = ke.kgroup
WHERE bk.orgId = '{org_id}'
""").toPandas()
kegg.to_csv(kegg_file, index=False)
print(f"KEGG: {org_id} — {len(kegg)} annotations")
else:
print(f"CACHED: {org_id} KEGG")
# SEED annotations (seedclass has no subsystem columns — just use seed_desc)
seed_file = ANNOT_DIR / f'{org_id}_seed.csv'
if not (seed_file.exists() and seed_file.stat().st_size > 0):
seed = spark.sql(f"""
SELECT locusId, seed_desc
FROM kescience_fitnessbrowser.seedannotation
WHERE orgId = '{org_id}'
""").toPandas()
seed.to_csv(seed_file, index=False)
print(f"SEED: {org_id} — {len(seed)} annotations")
else:
print(f"CACHED: {org_id} SEED")
# Domain annotations
domain_file = ANNOT_DIR / f'{org_id}_domains.csv'
if not (domain_file.exists() and domain_file.stat().st_size > 0):
domains = spark.sql(f"""
SELECT locusId, domainDb, domainId, domainName,
definition, geneSymbol, ec
FROM kescience_fitnessbrowser.genedomain
WHERE orgId = '{org_id}'
""").toPandas()
domains.to_csv(domain_file, index=False)
print(f"Domains: {org_id} — {len(domains)} annotations")
else:
print(f"CACHED: {org_id} domains")
# Specific phenotypes
pheno_file = ANNOT_DIR / f'{org_id}_specific_phenotypes.csv'
if not (pheno_file.exists() and pheno_file.stat().st_size > 0):
pheno = spark.sql(f"""
SELECT sp.locusId, sp.expName,
e.expDesc, e.expGroup, e.condition_1
FROM kescience_fitnessbrowser.specificphenotype sp
JOIN kescience_fitnessbrowser.experiment e
ON sp.orgId = e.orgId AND sp.expName = e.expName
WHERE sp.orgId = '{org_id}'
""").toPandas()
pheno.to_csv(pheno_file, index=False)
print(f"Phenotypes: {org_id} — {len(pheno)} entries")
else:
print(f"CACHED: {org_id} phenotypes")
CACHED: DvH KEGG CACHED: DvH SEED CACHED: DvH domains CACHED: DvH phenotypes CACHED: Btheta KEGG CACHED: Btheta SEED CACHED: Btheta domains CACHED: Btheta phenotypes CACHED: Methanococcus_S2 KEGG CACHED: Methanococcus_S2 SEED CACHED: Methanococcus_S2 domains CACHED: Methanococcus_S2 phenotypes CACHED: psRCH2 KEGG CACHED: psRCH2 SEED CACHED: psRCH2 domains CACHED: psRCH2 phenotypes CACHED: Putida KEGG CACHED: Putida SEED CACHED: Putida domains CACHED: Putida phenotypes CACHED: Phaeo KEGG CACHED: Phaeo SEED CACHED: Phaeo domains CACHED: Phaeo phenotypes CACHED: Marino KEGG CACHED: Marino SEED CACHED: Marino domains CACHED: Marino phenotypes CACHED: pseudo3_N2E3 KEGG CACHED: pseudo3_N2E3 SEED CACHED: pseudo3_N2E3 domains CACHED: pseudo3_N2E3 phenotypes CACHED: Koxy KEGG CACHED: Koxy SEED CACHED: Koxy domains CACHED: Koxy phenotypes CACHED: Cola KEGG CACHED: Cola SEED CACHED: Cola domains CACHED: Cola phenotypes CACHED: WCS417 KEGG CACHED: WCS417 SEED CACHED: WCS417 domains CACHED: WCS417 phenotypes CACHED: Caulo KEGG CACHED: Caulo SEED CACHED: Caulo domains CACHED: Caulo phenotypes CACHED: SB2B KEGG CACHED: SB2B SEED CACHED: SB2B domains CACHED: SB2B phenotypes CACHED: pseudo6_N2E2 KEGG CACHED: pseudo6_N2E2 SEED CACHED: pseudo6_N2E2 domains CACHED: pseudo6_N2E2 phenotypes CACHED: Dino KEGG CACHED: Dino SEED CACHED: Dino domains CACHED: Dino phenotypes CACHED: pseudo5_N2C3_1 KEGG CACHED: pseudo5_N2C3_1 SEED CACHED: pseudo5_N2C3_1 domains CACHED: pseudo5_N2C3_1 phenotypes CACHED: Miya KEGG CACHED: Miya SEED CACHED: Miya domains CACHED: Miya phenotypes CACHED: Pedo557 KEGG CACHED: Pedo557 SEED CACHED: Pedo557 domains CACHED: Pedo557 phenotypes CACHED: MR1 KEGG CACHED: MR1 SEED CACHED: MR1 domains CACHED: MR1 phenotypes CACHED: Keio KEGG CACHED: Keio SEED CACHED: Keio domains CACHED: Keio phenotypes CACHED: Korea KEGG CACHED: Korea SEED CACHED: Korea domains CACHED: Korea phenotypes CACHED: PV4 KEGG CACHED: PV4 SEED CACHED: PV4 domains CACHED: PV4 phenotypes CACHED: pseudo1_N1B4 KEGG CACHED: pseudo1_N1B4 SEED CACHED: pseudo1_N1B4 domains CACHED: pseudo1_N1B4 phenotypes CACHED: acidovorax_3H11 KEGG CACHED: acidovorax_3H11 SEED CACHED: acidovorax_3H11 domains CACHED: acidovorax_3H11 phenotypes CACHED: SynE KEGG CACHED: SynE SEED CACHED: SynE domains CACHED: SynE phenotypes CACHED: Methanococcus_JJ KEGG CACHED: Methanococcus_JJ SEED CACHED: Methanococcus_JJ domains CACHED: Methanococcus_JJ phenotypes CACHED: BFirm KEGG CACHED: BFirm SEED CACHED: BFirm domains CACHED: BFirm phenotypes CACHED: Kang KEGG CACHED: Kang SEED CACHED: Kang domains CACHED: Kang phenotypes CACHED: ANA3 KEGG CACHED: ANA3 SEED CACHED: ANA3 domains CACHED: ANA3 phenotypes CACHED: Cup4G11 KEGG CACHED: Cup4G11 SEED CACHED: Cup4G11 domains CACHED: Cup4G11 phenotypes CACHED: pseudo13_GW456_L13 KEGG CACHED: pseudo13_GW456_L13 SEED CACHED: pseudo13_GW456_L13 domains CACHED: pseudo13_GW456_L13 phenotypes CACHED: Ponti KEGG CACHED: Ponti SEED CACHED: Ponti domains CACHED: Ponti phenotypes
Part 2: Enrichment Analysis¶
Fisher exact test for each annotation term vs module membership.
In [4]:
def enrichment_analysis(module_genes, all_genes, annotation_map, min_annotated=2):
"""Fisher exact test enrichment for a single module.
Parameters
----------
module_genes : set
Genes in the module.
all_genes : set
All genes in the organism.
annotation_map : dict
{term: set_of_genes} mapping.
min_annotated : int
Minimum annotated genes in module for testing.
Returns
-------
results : list of dict
Enrichment results per term.
"""
results = []
n_total = len(all_genes)
n_module = len(module_genes)
for term, term_genes in annotation_map.items():
term_genes = term_genes & all_genes # intersect with valid genes
overlap = module_genes & term_genes
if len(overlap) < min_annotated:
continue
# 2x2 contingency table
a = len(overlap) # in module AND annotated
b = len(module_genes - term_genes) # in module NOT annotated
c = len(term_genes - module_genes) # NOT in module but annotated
d = n_total - len(module_genes | term_genes) # neither
odds_ratio, p_value = scipy_stats.fisher_exact([[a, b], [c, d]],
alternative='greater')
results.append({
'term': term,
'n_overlap': a,
'n_module': n_module,
'n_term': len(term_genes),
'odds_ratio': odds_ratio,
'p_value': p_value
})
return results
In [5]:
for org_id in pilot_ids:
out_file = MODULE_DIR / f'{org_id}_module_annotations.csv'
cond_file = MODULE_DIR / f'{org_id}_module_conditions.csv'
if out_file.exists() and out_file.stat().st_size > 0:
print(f"CACHED: {org_id} annotations")
continue
print(f"\nAnnotating {org_id} modules...")
# Load membership
membership = pd.read_csv(MODULE_DIR / f'{org_id}_gene_membership.csv', index_col=0)
all_genes = set(membership.index.astype(str))
module_names = membership.columns.tolist()
# Load annotations and build term->gene maps
annotation_maps = {}
# KEGG
kegg_file = ANNOT_DIR / f'{org_id}_kegg.csv'
if kegg_file.exists():
kegg = pd.read_csv(kegg_file)
kegg_map = kegg.groupby('kgroup')['locusId'].apply(lambda x: set(x.astype(str))).to_dict()
annotation_maps['KEGG'] = kegg_map
# SEED (group by seed_desc — seedclass has no subsystem hierarchy)
seed_file = ANNOT_DIR / f'{org_id}_seed.csv'
if seed_file.exists():
seed = pd.read_csv(seed_file).dropna(subset=['seed_desc'])
if len(seed) > 0:
seed_map = seed.groupby('seed_desc')['locusId'].apply(
lambda x: set(x.astype(str))).to_dict()
# Keep terms with 2+ genes
annotation_maps['SEED'] = {k: v for k, v in seed_map.items() if len(v) >= 2}
# Domains (TIGRFam and PFam)
domain_file = ANNOT_DIR / f'{org_id}_domains.csv'
if domain_file.exists():
domains = pd.read_csv(domain_file)
domains['locusId'] = domains['locusId'].astype(str)
tigr = domains[domains['domainDb'] == 'TIGRFam']
if len(tigr) > 0:
tigr_map = tigr.groupby('domainId')['locusId'].apply(
lambda x: set(x.astype(str))).to_dict()
annotation_maps['TIGRFam'] = tigr_map
pfam = domains[domains['domainDb'] == 'PFam']
if len(pfam) > 0:
pfam_map = pfam.groupby('domainId')['locusId'].apply(
lambda x: set(x.astype(str))).to_dict()
annotation_maps['PFam'] = pfam_map
# Run enrichment for each module
all_results = []
for mod in module_names:
mod_genes = set(membership.index[membership[mod] == 1].astype(str))
if len(mod_genes) == 0:
continue
for db_name, term_map in annotation_maps.items():
results = enrichment_analysis(mod_genes, all_genes, term_map)
for r in results:
r['module'] = mod
r['database'] = db_name
all_results.extend(results)
if all_results:
enrich_df = pd.DataFrame(all_results)
# FDR correction
reject, fdr, _, _ = multipletests(enrich_df['p_value'], method='fdr_bh')
enrich_df['fdr'] = fdr
enrich_df['significant'] = reject
enrich_df = enrich_df.sort_values(['module', 'fdr'])
enrich_df.to_csv(out_file, index=False)
n_sig = enrich_df['significant'].sum()
n_modules_annotated = enrich_df[enrich_df['significant']]['module'].nunique()
print(f" {n_sig} significant enrichments across {n_modules_annotated} modules")
else:
print(f" No enrichments found")
# Map module activity to experiment conditions
profiles = pd.read_csv(MODULE_DIR / f'{org_id}_module_profiles.csv', index_col=0)
exp_meta = pd.read_csv(ANNOT_DIR / f'{org_id}_experiments.csv')
condition_results = []
for mod in profiles.index:
activity = profiles.loc[mod]
# Top 5 most activated experiments
top_activated = activity.abs().nlargest(5)
for exp_name, act_value in top_activated.items():
exp_info = exp_meta[exp_meta['expName'] == exp_name]
if len(exp_info) > 0:
condition_results.append({
'module': mod,
'expName': exp_name,
'activity': float(activity[exp_name]),
'abs_activity': float(act_value),
'expDesc': exp_info.iloc[0].get('expDesc', ''),
'expGroup': exp_info.iloc[0].get('expGroup', ''),
'condition_1': exp_info.iloc[0].get('condition_1', '')
})
if condition_results:
cond_df = pd.DataFrame(condition_results)
cond_df.to_csv(cond_file, index=False)
print(f" Saved condition mappings")
Annotating DvH modules...
133 significant enrichments across 31 modules Saved condition mappings Annotating Btheta modules...
134 significant enrichments across 24 modules Saved condition mappings Annotating Methanococcus_S2 modules...
40 significant enrichments across 10 modules Saved condition mappings Annotating psRCH2 modules...
177 significant enrichments across 35 modules Saved condition mappings Annotating Putida modules...
250 significant enrichments across 36 modules Saved condition mappings Annotating Phaeo modules...
170 significant enrichments across 32 modules Saved condition mappings Annotating Marino modules...
162 significant enrichments across 31 modules Saved condition mappings Annotating pseudo3_N2E3 modules...
307 significant enrichments across 34 modules Saved condition mappings Annotating Koxy modules...
173 significant enrichments across 34 modules Saved condition mappings Annotating Cola modules...
197 significant enrichments across 28 modules Saved condition mappings Annotating WCS417 modules...
250 significant enrichments across 32 modules Saved condition mappings Annotating Caulo modules...
112 significant enrichments across 24 modules Saved condition mappings Annotating SB2B modules...
230 significant enrichments across 31 modules Saved condition mappings Annotating pseudo6_N2E2 modules...
250 significant enrichments across 36 modules Saved condition mappings Annotating Dino modules...
169 significant enrichments across 27 modules Saved condition mappings Annotating pseudo5_N2C3_1 modules...
334 significant enrichments across 41 modules Saved condition mappings Annotating Miya modules...
165 significant enrichments across 26 modules Saved condition mappings Annotating Pedo557 modules...
204 significant enrichments across 31 modules Saved condition mappings Annotating MR1 modules...
191 significant enrichments across 36 modules Saved condition mappings Annotating Keio modules...
176 significant enrichments across 31 modules Saved condition mappings Annotating Korea modules...
121 significant enrichments across 19 modules Saved condition mappings Annotating PV4 modules...
199 significant enrichments across 25 modules Saved condition mappings Annotating pseudo1_N1B4 modules...
112 significant enrichments across 17 modules Saved condition mappings Annotating acidovorax_3H11 modules...
160 significant enrichments across 29 modules Saved condition mappings Annotating SynE modules...
60 significant enrichments across 14 modules Saved condition mappings Annotating Methanococcus_JJ modules...
63 significant enrichments across 9 modules Saved condition mappings Annotating BFirm modules...
145 significant enrichments across 27 modules Saved condition mappings Annotating Kang modules...
97 significant enrichments across 18 modules Saved condition mappings Annotating ANA3 modules...
184 significant enrichments across 31 modules Saved condition mappings Annotating Cup4G11 modules...
244 significant enrichments across 31 modules Saved condition mappings Annotating pseudo13_GW456_L13 modules...
191 significant enrichments across 33 modules Saved condition mappings Annotating Ponti modules...
171 significant enrichments across 27 modules Saved condition mappings
In [6]:
# Summary: top enrichments per organism
for org_id in pilot_ids:
ann_file = MODULE_DIR / f'{org_id}_module_annotations.csv'
if not ann_file.exists() or ann_file.stat().st_size < 10:
continue
ann = pd.read_csv(ann_file)
if len(ann) == 0:
continue
sig = ann[ann['significant']]
print(f"\n{org_id}: {len(sig)} significant enrichments")
# Show top enrichment per module
top = sig.groupby('module').first().reset_index()
if len(top) > 0:
print(top[['module', 'database', 'term', 'n_overlap', 'odds_ratio', 'fdr']].head(10).to_string(index=False))
DvH: 133 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF12399 2 455.166667 1.770892e-04 M002 PFam PF00590 4 725.600000 2.877661e-07 M003 PFam PF00072 4 27.220513 2.783908e-04 M006 PFam PF00994 3 340.500000 1.845777e-05 M007 PFam PF13186 3 291.642857 2.266124e-05 M008 PFam PF01061 2 inf 4.120635e-05 M010 PFam PF02653 2 303.333333 2.534944e-04 M011 PFam PF00460 7 437.906977 4.820529e-10 M012 TIGRFam TIGR01539 3 inf 3.966784e-06 M013 PFam PF02508 2 inf 1.770892e-04 Btheta: 134 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF01915 2 44.175824 3.270124e-03 M002 SEED putative glycosyltransferase yibD 2 576.857143 1.865003e-04 M003 SEED BexA, membrane protein 2 inf 1.510169e-04 M004 KEGG K03701 2 inf 1.510169e-04 M005 SEED Ferric siderophore transport system, periplasmic binding protein TonB 2 336.833333 2.242119e-04 M007 PFam PF00534 8 44.210526 2.153416e-08 M008 SEED Cardiolipin synthetase (EC 2.7.8.-) 2 298.148148 5.264009e-04 M012 PFam PF00370 2 385.047619 1.865003e-04 M014 PFam PF13522 2 inf 1.510169e-04 M015 SEED conserved hypothetical protein 3 207.982759 7.090474e-05 Methanococcus_S2: 40 significant enrichments module database term n_overlap odds_ratio fdr M000 TIGRFam TIGR02090 2 inf 0.000827 M010 KEGG K07079 2 223.636364 0.000827 M014 PFam PF04021 4 51.200000 0.000232 M015 PFam PF12838 3 43.023529 0.000827 M016 KEGG K07321 2 inf 0.000827 M018 PFam PF00072 2 1239.000000 0.000341 M019 PFam PF01472 2 205.833333 0.000827 M021 PFam PF00148 4 inf 0.000002 M022 PFam PF00753 3 73.560000 0.000534 M025 PFam PF04055 3 17.382775 0.002688 psRCH2: 177 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF05157 2 100.363636 1.366850e-03 M002 PFam PF00440 3 51.843750 4.219746e-04 M003 PFam PF02239 3 382.961538 6.637691e-05 M004 PFam PF16363 5 296.964286 8.255354e-08 M005 PFam PF00361 2 60.363636 2.022822e-03 M006 PFam PF01584 4 57.286957 1.049572e-04 M007 TIGRFam TIGR00786 2 45.777778 3.509183e-03 M008 PFam PF04324 2 333.500000 4.259970e-04 M009 PFam PF01144 2 184.833333 6.155129e-04 M012 PFam PF00072 5 7.015152 2.457249e-03 Putida: 250 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF03471 2 288.606061 3.728746e-04 M001 SEED Transcriptional regulator, LysR family 2 11.223810 2.290103e-02 M002 PFam PF01012 2 inf 1.971565e-04 M003 PFam PF00590 3 198.041667 3.799954e-05 M004 PFam PF00668 4 inf 3.362548e-07 M005 PFam PF00171 3 30.581897 8.592811e-04 M006 PFam PF13193 2 39.541667 3.639559e-03 M007 TIGRFam TIGR00177 2 278.882353 6.278973e-04 M008 TIGRFam TIGR01726 2 35.133333 4.198886e-03 M009 KEGG K03808 2 inf 1.498389e-04 Phaeo: 170 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF05138 2 440.285714 4.749231e-04 M001 PFam PF00994 3 256.333333 6.955303e-05 M002 SEED Pyrimidine ABC transporter, transmembrane component 1 2 inf 6.768074e-04 M004 PFam PF00460 5 inf 5.443611e-08 M005 PFam PF06568 2 171.222222 6.768074e-04 M006 PFam PF01979 2 85.555556 1.191026e-03 M007 PFam PF01571 2 61.540000 1.641622e-03 M008 PFam PF00512 2 30.670000 4.418259e-03 M009 PFam PF04321 2 102.666667 9.718327e-04 M011 PFam PF01471 3 153.950000 9.273318e-05 Marino: 162 significant enrichments module database term n_overlap odds_ratio fdr M001 PFam PF13538 2 329.545455 0.000648 M002 KEGG K10040 2 inf 0.000256 M003 PFam PF05138 2 inf 0.000561 M005 PFam PF00482 2 161.333333 0.000686 M006 KEGG K00951 2 inf 0.000581 M007 PFam PF13531 3 94.408696 0.000453 M008 SEED NADPH dependent preQ0 reductase (EC 1.7.1.13) 2 inf 0.000561 M009 PFam PF01565 2 346.476190 0.000581 M010 PFam PF00440 2 11.832237 0.019219 M011 SEED Magnesium and cobalt efflux protein CorC 2 180.350000 0.001111 pseudo3_N2E3: 307 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF08028 3 39.946524 5.175953e-04 M001 SEED FIG002188: hypothetical protein 2 inf 2.102014e-04 M003 SEED Sensory box histidine kinase/response regulator 2 589.176471 3.092783e-04 M004 KEGG K00673 2 118.928571 1.021151e-03 M006 KEGG K12979 3 inf 2.287327e-05 M007 PFam PF02653 2 61.641975 1.972041e-03 M009 PFam PF00905 2 69.097222 2.035097e-03 M010 TIGRFam TIGR00254 8 31.415873 1.749435e-07 M012 PFam PF01144 2 227.363636 5.273625e-04 M013 PFam PF00361 2 inf 4.587149e-04 Koxy: 173 significant enrichments module database term n_overlap odds_ratio fdr M001 PFam PF00696 3 58.123404 0.000477 M002 PFam PF00294 2 21.738095 0.010464 M003 KEGG K01607 2 228.250000 0.001025 M005 PFam PF00440 2 8.215580 0.037094 M006 TIGRFam TIGR01494 2 37.941667 0.005479 M008 PFam PF02518 5 17.352490 0.000370 M009 PFam PF07690 2 12.652482 0.025755 M010 KEGG K00068 2 inf 0.000403 M011 SEED Cysteine desulfurase (EC 2.8.1.7), SufS subfamily 2 inf 0.000580 M013 SEED Phage tail length tape-measure protein 1 2 inf 0.000580 Cola: 197 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF00534 5 27.912482 0.000156 M001 PFam PF00155 2 21.744444 0.008923 M002 KEGG K01752 2 inf 0.000716 M003 KEGG K03296 3 249.127660 0.000230 M004 PFam PF00072 4 6.702609 0.008263 M005 KEGG K06147 2 inf 0.000299 M008 SEED Sucrose-6-phosphate hydrolase (EC 3.2.1.B3) 2 inf 0.000156 M012 PFam PF13649 2 32.666667 0.005413 M013 SEED Transcriptional regulator of rhamnose utilization, GntR family 3 inf 0.000039 M016 SEED ABC transporter efflux protein 2 inf 0.000716 WCS417: 250 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF00361 2 inf 4.071644e-04 M001 PFam PF08443 2 94.978261 1.941792e-03 M002 TIGRFam TIGR00254 9 38.142439 9.518412e-09 M003 KEGG K00114 2 inf 4.664208e-04 M004 KEGG K02483 3 92.893617 4.071644e-04 M006 KEGG K07264 2 inf 5.970382e-04 M007 KEGG K11073 2 489.555556 4.071644e-04 M008 PFam PF00528 4 12.008333 1.911009e-03 M009 PFam PF13673 4 15.742754 1.018165e-03 M010 PFam PF00590 5 385.263158 1.759091e-08 Caulo: 112 significant enrichments module database term n_overlap odds_ratio fdr M002 PFam PF00912 3 93.457143 0.000356 M005 KEGG K01847 2 inf 0.000563 M006 PFam PF00270 5 72.377778 0.000009 M007 PFam PF00392 2 78.357143 0.001226 M008 KEGG K02014 2 12.248120 0.017675 M009 PFam PF00528 3 23.070922 0.001275 M010 TIGRFam TIGR01845 2 549.500000 0.000356 M011 PFam PF00482 2 135.875000 0.001275 M014 KEGG K01847 2 inf 0.000402 M016 PFam PF00994 2 470.714286 0.000356 SB2B: 230 significant enrichments module database term n_overlap odds_ratio fdr M002 PFam PF02769 2 154.850000 0.001104 M003 TIGRFam TIGR00254 9 12.744371 0.000014 M005 KEGG K06213 2 inf 0.000923 M006 PFam PF00171 2 18.642424 0.010310 M007 TIGRFam TIGR02532 4 72.257310 0.000037 M008 PFam PF00664 2 81.473684 0.001939 M010 PFam PF13476 3 132.042857 0.000268 M011 PFam PF00675 2 34.111111 0.005313 M012 PFam PF00034 3 93.393939 0.000401 M013 KEGG K00257 2 inf 0.001014 pseudo6_N2E2: 250 significant enrichments module database term n_overlap odds_ratio fdr M000 KEGG K00031 2 inf 5.123179e-04 M001 KEGG K11901 2 340.000000 5.626637e-04 M002 TIGRFam TIGR03075 2 211.750000 1.047846e-03 M003 PFam PF00270 6 79.421875 6.850756e-08 M005 PFam PF01584 3 34.044643 8.590395e-04 M006 KEGG K02057 2 211.750000 1.047846e-03 M009 SEED Lead, cadmium, zinc and mercury transporting ATPase (EC 3.6.3.3) (EC 3.6.3.5); Copper-translocating P-type ATPase (EC 3.6.3.4) 2 105.854167 1.480216e-03 M010 PFam PF01012 2 inf 5.123179e-04 M011 SEED Phosphonate ABC transporter phosphate-binding periplasmic component (TC 3.A.1.9.1) 4 23.176201 4.547849e-04 M012 PFam PF00460 6 inf 5.097308e-11 Dino: 169 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF00270 5 62.281746 0.000007 M001 PFam PF00364 2 81.179487 0.001397 M002 PFam PF00950 2 217.586207 0.000788 M003 PFam PF02653 2 24.984127 0.006295 M004 PFam PF05175 3 49.994681 0.000653 M005 SEED Transcriptional regulator, MocR family, putative Taurine regulator tauR 2 inf 0.000708 M006 PFam PF00124 2 inf 0.000708 M007 PFam PF01315 2 105.600000 0.001056 M008 PFam PF11150 2 301.238095 0.000708 M009 KEGG K02065 2 inf 0.000708 pseudo5_N2C3_1: 334 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF00460 6 inf 6.011770e-11 M001 TIGRFam TIGR01720 4 inf 5.212239e-07 M002 TIGRFam TIGR00254 12 56.571429 1.710577e-13 M003 SEED Histidine ammonia-lyase (EC 4.3.1.3) 2 inf 5.325410e-04 M004 PFam PF01297 2 234.863636 6.605538e-04 M005 PFam PF00528 4 13.363158 1.551053e-03 M006 PFam PF01261 2 67.947368 1.921985e-03 M007 SEED Lead, cadmium, zinc and mercury transporting ATPase (EC 3.6.3.3) (EC 3.6.3.5); Copper-translocating P-type ATPase (EC 3.6.3.4) 2 inf 5.325410e-04 M008 PFam PF02653 2 44.102564 3.173173e-03 M009 PFam PF13188 3 109.361702 2.506310e-04 Miya: 165 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF01061 2 inf 1.001823e-03 M001 PFam PF00072 3 27.055556 2.799420e-03 M002 PFam PF04879 2 55.777778 2.846125e-03 M003 PFam PF08668 3 49.058824 1.001823e-03 M004 PFam PF13439 4 35.869565 7.033269e-04 M005 PFam PF10111 3 52.723404 1.025802e-03 M006 PFam PF00384 4 24.989899 9.379348e-04 M007 PFam PF00005 3 4.862069 4.334861e-02 M008 PFam PF00460 7 201.779070 5.890312e-09 M009 PFam PF02518 9 8.564516 2.366509e-04 Pedo557: 204 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF10111 3 52.107143 0.000449 M001 SEED GlpG protein (membrane protein of glp regulon) 3 inf 0.000051 M002 SEED GldJ 2 inf 0.000550 M003 SEED Aerobactin siderophore receptor IutA 2 inf 0.000550 M004 SEED Maltose/maltodextrin ABC transporter, substrate binding periplasmic protein MalE 2 inf 0.000140 M005 PFam PF04324 2 inf 0.000142 M006 PFam PF04450 2 182.166667 0.000999 M008 SEED ABC transporter efflux protein 2 inf 0.000550 M009 PFam PF00155 4 29.163880 0.000314 M010 PFam PF13407 2 49.897727 0.002399 MR1: 191 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF00440 2 23.993590 6.658439e-03 M002 PFam PF00171 2 107.514286 1.090595e-03 M004 PFam PF00460 6 inf 7.968301e-10 M006 KEGG K01772 2 inf 5.280934e-04 M007 KEGG K07114 2 51.791667 3.027254e-03 M008 PFam PF00854 2 77.708333 2.072377e-03 M009 PFam PF02151 2 inf 6.819676e-04 M010 KEGG K02078 2 inf 6.819676e-04 M012 KEGG K06148 2 359.047619 5.091676e-04 M013 PFam PF01145 2 42.409091 3.703192e-03 Keio: 176 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF02518 3 28.653061 0.001307 M001 PFam PF02378 2 21.528736 0.009528 M002 TIGRFam TIGR01033 2 inf 0.000768 M003 TIGRFam TIGR00790 2 155.750000 0.001701 M004 PFam PF06411 2 inf 0.000768 M005 PFam PF13407 2 19.726316 0.010320 M006 TIGRFam TIGR02937 2 155.750000 0.001701 M007 PFam PF01232 2 86.321839 0.001973 M009 KEGG K00042 2 inf 0.000768 M010 SEED Paraquat-inducible protein B 2 155.750000 0.001701 Korea: 121 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF13692 4 29.371429 0.000405 M001 PFam PF00391 2 563.000000 0.000405 M002 PFam PF01261 3 213.319149 0.000233 M004 PFam PF13439 4 28.982609 0.000405 M005 PFam PF13505 3 74.266667 0.000416 M006 PFam PF00226 2 375.555556 0.000435 M007 PFam PF01979 2 33.610000 0.005721 M008 PFam PF12844 2 34.781250 0.006287 M011 PFam PF04851 4 141.684211 0.000027 M012 PFam PF07690 3 12.723214 0.006070 PV4: 199 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF01228 2 inf 0.001063 M001 PFam PF13649 2 8.177778 0.037950 M002 PFam PF00486 4 15.994565 0.001063 M004 KEGG K03808 2 inf 0.001063 M006 PFam PF01551 2 30.781250 0.007133 M007 KEGG K02851 2 inf 0.001063 M008 PFam PF07690 2 9.180124 0.031990 M009 KEGG K03722 2 inf 0.001063 M010 PFam PF03176 3 29.660000 0.001232 M011 PFam PF00595 3 206.651163 0.000728 pseudo1_N1B4: 112 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF02151 2 inf 0.000496 M001 PFam PF00361 2 59.486111 0.002373 M002 PFam PF02653 4 73.282051 0.000113 M003 PFam PF13442 2 35.800000 0.003882 M004 PFam PF08448 2 13.909091 0.015573 M005 PFam PF00501 2 11.438503 0.021561 M006 PFam PF01553 2 35.675000 0.004259 M007 KEGG K02168 2 inf 0.000496 M008 PFam PF00549 2 inf 0.000514 M010 PFam PF01842 4 186.260870 0.000031 acidovorax_3H11: 160 significant enrichments module database term n_overlap odds_ratio fdr M000 SEED FIG000557: hypothetical protein co-occurring with RecR 2 489.500000 0.000835 M001 SEED Filamentation induced by cAMP protein Fic 2 161.833333 0.001829 M002 KEGG K00548 2 inf 0.000835 M003 SEED transcriptional regulator, TetR family 2 55.078014 0.004515 M004 PFam PF01738 2 78.727273 0.002669 M005 PFam PF01418 2 inf 0.000835 M006 PFam PF00561 2 8.606667 0.041045 M007 TIGRFam TIGR02532 3 inf 0.000030 M008 KEGG K01422 2 inf 0.000835 M009 SEED Signaling protein with a acyltransferase and GGDEF domains 2 inf 0.000835
SynE: 60 significant enrichments module database term n_overlap odds_ratio fdr M000 KEGG K01090 2 inf 0.003765 M001 KEGG K02005 2 138.444444 0.003765 M003 KEGG K03086 2 inf 0.003765 M004 PFam PF00005 2 11.974194 0.024968 M005 PFam PF00528 2 11.272727 0.026186 M006 PFam PF14559 2 11.236364 0.026865 M008 TIGRFam TIGR01184 2 54.231884 0.007311 M009 PFam PF00999 2 51.944444 0.007446 M010 PFam PF01022 2 117.437500 0.003765 M013 SEED Two-component system response regulator 2 inf 0.003765 Methanococcus_JJ: 63 significant enrichments module database term n_overlap odds_ratio fdr M002 PFam PF04055 2 11.140496 0.025667 M003 SEED Archaeal histone 2 inf 0.003011 M006 PFam PF02163 2 inf 0.003743 M008 PFam PF00148 4 115.826087 0.000539 M009 KEGG K01834 2 inf 0.003011 M010 KEGG K07321 2 inf 0.003011 M011 KEGG K00196 2 inf 0.003011 M013 PFam PF00753 3 17.959821 0.004434 M016 SEED Iron transport Periplasmic binding protein precursor 2 124.363636 0.003011 BFirm: 145 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF01094 2 17.081081 1.568878e-02 M001 PFam PF00155 2 17.428571 1.555756e-02 M002 PFam PF00034 5 37.236111 6.464733e-05 M003 PFam PF00270 2 18.631944 1.417507e-02 M004 PFam PF00067 2 inf 5.539283e-04 M005 SEED Transcriptional regulator, GntR family domain / Aspartate aminotransferase (EC 2.6.1.1) 2 13.963542 2.037010e-02 M006 PFam PF01012 2 540.500000 5.539283e-04 M007 PFam PF00460 5 597.444444 3.172766e-08 M008 SEED Glutathione S-transferase (EC 2.5.1.18) 2 25.590476 9.555127e-03 M009 PFam PF12418 2 224.041667 1.285883e-03 Kang: 97 significant enrichments module database term n_overlap odds_ratio fdr M001 PFam PF02151 2 inf 0.001455 M002 PFam PF00534 2 inf 0.001455 M003 SEED Isovaleryl-CoA dehydrogenase (EC 1.3.8.4) 2 inf 0.001634 M004 PFam PF02518 3 14.692500 0.004123 M005 PFam PF13443 2 27.083333 0.009161 M006 PFam PF00089 2 inf 0.001634 M008 PFam PF02151 2 inf 0.001455 M009 PFam PF00109 2 53.027027 0.004159 M011 KEGG K05568 3 inf 0.000120 M013 KEGG K04761 2 inf 0.001634 ANA3: 184 significant enrichments module database term n_overlap odds_ratio fdr M000 KEGG K02415 2 inf 0.000810 M001 PFam PF00326 2 11.554487 0.024017 M002 PFam PF03553 2 50.208333 0.004159 M004 PFam PF00270 2 11.332288 0.024017 M005 PFam PF00005 2 9.374026 0.031599 M007 SEED Site-specific recombinase, phage integrase family 2 inf 0.000810 M008 PFam PF00270 3 10.933131 0.007542 M010 TIGRFam TIGR00177 3 230.872340 0.000412 M011 SEED Alkaline phosphatase (EC 3.1.3.1) 2 95.421053 0.002200 M013 PFam PF04349 3 inf 0.000412 Cup4G11: 244 significant enrichments module database term n_overlap odds_ratio fdr M000 KEGG K01142 2 inf 0.000362 M001 SEED DnaK-related protein 2 inf 0.000362 M002 PFam PF13145 2 87.930556 0.001832 M003 TIGRFam TIGR02001 2 inf 0.000362 M004 PFam PF00271 5 117.185185 0.000003 M005 KEGG K00548 2 inf 0.000362 M006 PFam PF00171 2 7.693529 0.045049 M007 KEGG K03802 2 inf 0.000362 M008 PFam PF01077 2 263.875000 0.000762 M009 TIGRFam TIGR01783 2 37.660714 0.005124 pseudo13_GW456_L13: 191 significant enrichments module database term n_overlap odds_ratio fdr M000 SEED Histidine ammonia-lyase (EC 4.3.1.3) 2 inf 0.001073 M001 PFam PF14497 2 63.703704 0.004443 M002 KEGG K03293 2 44.750000 0.005618 M003 PFam PF00590 3 91.425532 0.001073 M005 PFam PF00532 3 68.553191 0.001073 M006 PFam PF02589 2 inf 0.001131 M007 KEGG K10125 2 179.125000 0.001981 M008 PFam PF00528 4 5.411765 0.018468 M009 SEED Ferrichrome-iron receptor 2 89.541667 0.003184 M010 TIGRFam TIGR01726 3 17.090426 0.004485 Ponti: 171 significant enrichments module database term n_overlap odds_ratio fdr M000 PFam PF00884 2 inf 0.000907 M002 PFam PF19335 4 316.000000 0.000014 M005 SEED Probable Co/Zn/Cd efflux system membrane fusion protein 4 25.342657 0.000709 M006 KEGG K01752 2 inf 0.000907 M007 PFam PF07715 5 11.108025 0.000953 M008 KEGG K06147 2 151.416667 0.001737 M010 KEGG K04047 2 inf 0.000907 M011 PFam PF02661 2 93.384615 0.002251 M012 PFam PF00171 2 21.595238 0.010614 M013 SEED Magnesium and cobalt efflux protein CorC 3 231.957447 0.000265
In [7]:
import matplotlib.pyplot as plt
FIG_DIR = Path('../figures')
# Collect enrichment stats per organism
enrich_stats = []
for org_id in pilot_ids:
ann_file = MODULE_DIR / f'{org_id}_module_annotations.csv'
member_file = MODULE_DIR / f'{org_id}_gene_membership.csv'
if not ann_file.exists() or ann_file.stat().st_size < 10 or not member_file.exists():
continue
ann = pd.read_csv(ann_file)
if len(ann) == 0:
continue
membership = pd.read_csv(member_file, index_col=0)
n_modules = len(membership.columns)
sig = ann[ann['significant']]
modules_annotated = sig['module'].nunique()
# Count by database
db_counts = sig.groupby('database')['module'].nunique()
enrich_stats.append({
'orgId': org_id,
'n_modules': n_modules,
'n_annotated': modules_annotated,
'pct_annotated': 100 * modules_annotated / n_modules,
'n_kegg': db_counts.get('KEGG', 0),
'n_seed': db_counts.get('SEED', 0),
'n_tigrfam': db_counts.get('TIGRFam', 0),
'n_pfam': db_counts.get('PFam', 0),
})
stats_df = pd.DataFrame(enrich_stats)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left: % modules annotated per organism
ax = axes[0]
stats_df_sorted = stats_df.sort_values('pct_annotated', ascending=True)
ax.barh(range(len(stats_df_sorted)), stats_df_sorted['pct_annotated'], color='#2196F3')
ax.set_yticks(range(len(stats_df_sorted)))
ax.set_yticklabels(stats_df_sorted['orgId'], fontsize=7)
ax.set_xlabel('% modules with significant enrichment')
ax.set_title('Module Annotation Rate by Organism')
ax.axvline(stats_df['pct_annotated'].median(), color='red', linestyle='--', alpha=0.7,
label=f"median={stats_df['pct_annotated'].median():.0f}%")
ax.legend(fontsize=8)
# Right: annotation database breakdown (stacked bar)
ax = axes[1]
orgs = stats_df.sort_values('n_annotated', ascending=True)['orgId'].tolist()
kegg_vals = [stats_df[stats_df['orgId']==o]['n_kegg'].values[0] for o in orgs]
seed_vals = [stats_df[stats_df['orgId']==o]['n_seed'].values[0] for o in orgs]
tigr_vals = [stats_df[stats_df['orgId']==o]['n_tigrfam'].values[0] for o in orgs]
pfam_vals = [stats_df[stats_df['orgId']==o]['n_pfam'].values[0] for o in orgs]
y = range(len(orgs))
ax.barh(y, kegg_vals, label='KEGG', color='#4CAF50')
ax.barh(y, seed_vals, left=kegg_vals, label='SEED', color='#FF9800')
ax.barh(y, tigr_vals, left=[k+s for k,s in zip(kegg_vals, seed_vals)], label='TIGRFam', color='#9C27B0')
ax.barh(y, pfam_vals, left=[k+s+t for k,s,t in zip(kegg_vals, seed_vals, tigr_vals)], label='PFam', color='#2196F3')
ax.set_yticks(y)
ax.set_yticklabels(orgs, fontsize=7)
ax.set_xlabel('Number of modules with enrichment')
ax.set_title('Enrichment Sources per Organism')
ax.legend(fontsize=8)
plt.tight_layout()
plt.savefig(FIG_DIR / 'enrichment_summary.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Mean annotation rate: {stats_df['pct_annotated'].mean():.1f}%")
print(f"Saved: figures/enrichment_summary.png")
Mean annotation rate: 79.5% Saved: figures/enrichment_summary.png