import pandas as pd
import numpy as np
from pathlib import Path

DATA_DIR = Path('../data')
FM_DIR = Path('../../fitness_modules/data')
MODULE_DIR = FM_DIR / 'modules'
FAMILY_DIR = FM_DIR / 'module_families'

essential = pd.read_csv(DATA_DIR / 'all_essential_genes.tsv', sep='\t', low_memory=False)
essential['is_essential'] = essential['is_essential'].astype(str).str.strip().str.lower() == 'true'
essential['locusId'] = essential['locusId'].astype(str)

og_df = pd.read_csv(DATA_DIR / 'all_ortholog_groups.csv')
og_df['locusId'] = og_df['locusId'].astype(str)

print(f"Genes: {len(essential):,} ({essential['is_essential'].sum():,} essential)")
print(f"Ortholog groups: {og_df['OG_id'].nunique():,}")

Genes: 221,005 (41,059 essential)
Ortholog groups: 17,222

# Essential status lookup
ess_lookup = {}
for _, r in essential.iterrows():
    ess_lookup[(r['orgId'], r['locusId'])] = r['is_essential']

# Gene -> OG
gene_to_og = {}
for _, r in og_df.iterrows():
    gene_to_og[(r['orgId'], r['locusId'])] = r['OG_id']

# OG -> members
og_members = og_df.groupby('OG_id').apply(
    lambda g: list(zip(g['orgId'], g['locusId']))
).to_dict()

# Module membership
gene_to_modules = {}
module_orgs = []
for f in sorted(MODULE_DIR.glob('*_gene_membership.csv')):
    org = f.stem.replace('_gene_membership', '')
    module_orgs.append(org)
    membership = pd.read_csv(f, index_col=0)
    for mod in membership.columns:
        for gene in membership.index[membership[mod] == 1]:
            gene_to_modules.setdefault((org, str(gene)), []).append(mod)

# Module annotations
module_best_ann = {}
for org in module_orgs:
    ann_file = MODULE_DIR / f'{org}_module_annotations.csv'
    if not ann_file.exists() or ann_file.stat().st_size < 10:
        continue
    ann = pd.read_csv(ann_file)
    sig = ann[ann['significant'] == True]
    for mod, grp in sig.groupby('module'):
        best = grp.loc[grp['fdr'].idxmin()]
        module_best_ann[(org, mod)] = {'term': best['term'], 'db': best['database'], 'fdr': best['fdr']}

# Module families
mod_to_family = {}
for _, r in pd.read_csv(FAMILY_DIR / 'module_families.csv').iterrows():
    mod_to_family[(r['orgId'], r['module'])] = r['familyId']
fam_ann = {}
for _, r in pd.read_csv(FAMILY_DIR / 'family_annotations.csv').iterrows():
    fam_ann[r['familyId']] = {'consensus_term': r['consensus_term'], 'n_organisms': r['n_organisms']}

print(f"Module organisms: {len(module_orgs)}")
print(f"Genes in modules: {len(gene_to_modules):,}")
print(f"Annotated modules: {len(module_best_ann):,}")

Module organisms: 32
Genes in modules: 30,688
Annotated modules: 890

hyp_patterns = ['hypothetical', 'uncharacterized', 'unknown function', 'DUF']
ess_genes = essential[essential['is_essential']].copy()
ess_genes['is_hypothetical'] = ess_genes['desc'].apply(
    lambda d: any(p.lower() in str(d).lower() for p in hyp_patterns) if pd.notna(d) else True
)

og_lookup = og_df[['orgId', 'locusId', 'OG_id']].drop_duplicates()
ess_genes = ess_genes.merge(og_lookup, on=['orgId', 'locusId'], how='left')
ess_genes.rename(columns={'OG_id': 'og_id'}, inplace=True)

targets = ess_genes[ess_genes['is_hypothetical'] & ess_genes['og_id'].notna()]
print(f"Hypothetical essential genes: {ess_genes['is_hypothetical'].sum():,}")
print(f"  With orthologs (predictable): {len(targets):,}")
print(f"  Without orthologs (orphans): {ess_genes['is_hypothetical'].sum() - len(targets):,}")

Hypothetical essential genes: 8,297
  With orthologs (predictable): 3,912
  Without orthologs (orphans): 4,385

predictions = []
for i, (_, gene) in enumerate(targets.iterrows()):
    org, locus, og_id = gene['orgId'], gene['locusId'], gene['og_id']
    best_pred, best_conf = None, -1

    for m_org, m_locus in og_members.get(og_id, []):
        if m_org == org and m_locus == locus:
            continue
        if ess_lookup.get((m_org, m_locus), True):
            continue  # also essential — skip
        for mod in gene_to_modules.get((m_org, m_locus), []):
            ann = module_best_ann.get((m_org, mod))
            if ann is None:
                continue
            fam_id = mod_to_family.get((m_org, mod), '')
            fam_info = fam_ann.get(fam_id, {})
            conf = -np.log10(max(ann['fdr'], 1e-20))
            if fam_id and fam_info.get('n_organisms', 0) > 1:
                conf += np.log10(fam_info['n_organisms'])
            if conf > best_conf:
                best_conf = conf
                best_pred = {
                    'target_orgId': org, 'target_locusId': locus,
                    'target_desc': gene['desc'], 'OG_id': og_id,
                    'source_orgId': m_org, 'source_locusId': m_locus,
                    'source_module': mod, 'predicted_term': ann['term'],
                    'predicted_db': ann['db'], 'enrichment_fdr': ann['fdr'],
                    'familyId': fam_id,
                    'family_n_organisms': fam_info.get('n_organisms', 0),
                    'family_consensus': fam_info.get('consensus_term', ''),
                    'confidence': conf,
                }
    if best_pred:
        predictions.append(best_pred)

pred_df = pd.DataFrame(predictions).sort_values('confidence', ascending=False)
pred_df.to_csv(DATA_DIR / 'essential_predictions.tsv', sep='\t', index=False)
n_fam = (pred_df['familyId'] != '').sum()
print(f"Predictions: {len(pred_df):,} / {len(targets):,} targets ({len(pred_df)/len(targets)*100:.1f}%)")
print(f"  Family-backed: {n_fam:,}")
print(f"  Module-only: {len(pred_df) - n_fam:,}")

Predictions: 1,382 / 3,912 targets (35.3%)
  Family-backed: 1,382
  Module-only: 0

pred_df[['target_orgId', 'target_locusId', 'target_desc', 'predicted_term',
         'predicted_db', 'enrichment_fdr', 'familyId', 'confidence']].head(25)

pred_df.groupby('target_orgId').size().sort_values(ascending=False)

target_orgId
pseudo1_N1B4               90
PV4                        85
Smeli                      65
SyringaeB728a              64
psRCH2                     61
SyringaeB728a_mexBdelta    58
Burk376                    56
Putida                     52
WCS417                     51
BFirm                      46
ANA3                       37
MR1                        36
azobra                     36
Koxy                       35
Dino                       34
Btheta                     34
pseudo5_N2C3_1             33
pseudo13_GW456_L13         31
pseudo6_N2E2               29
PS                         28
pseudo3_N2E3               28
SB2B                       25
RalstoniaBSBF1503          23
RalstoniaUW163             23
HerbieS                    23
RalstoniaGMI1000           23
Keio                       22
Cup4G11                    22
Magneto                    20
Dyella79                   20
Miya                       18
SynE                       18
Ponti                      17
Pedo557                    17
acidovorax_3H11            16
RalstoniaPSI07             16
Phaeo                      14
Marino                     14
Korea                      14
DvH                        13
Ddia6719                    9
Cola                        8
Kang                        8
Dda3937                     3
Methanococcus_JJ            2
DdiaME23                    2
Methanococcus_S2            2
Caulo                       1
dtype: int64

n_ess = essential['is_essential'].sum()
n_hyp = ess_genes['is_hypothetical'].sum()
print(f"Essential genes: {n_ess:,}")
print(f"Hypothetical essentials: {n_hyp:,} ({n_hyp/n_ess*100:.1f}%)")
print(f"  Predictable (have orthologs): {len(targets):,}")
print(f"  Orphans (no orthologs): {n_hyp - len(targets):,}")
print(f"Predictions made: {len(pred_df):,} ({len(pred_df)/len(targets)*100:.1f}% of predictable)")

Essential genes: 41,059
Hypothetical essentials: 8,297 (20.2%)
  Predictable (have orthologs): 3,912
  Orphans (no orthologs): 4,385
Predictions made: 1,382 (35.3% of predictable)

	target_orgId	target_locusId	target_desc	predicted_term	predicted_db	enrichment_fdr	familyId	confidence
77	BFirm	BPHYT_RS31975	hypothetical protein	PF00356	PFam	6.285406e-18	F511	17.900637
147	Burk376	H281DRAFT_03269	protein of unknown function (DUF4154)	TIGR00254	TIGRFam	9.930740e-17	F691	16.480140
500	PS	Dsui_2865	hypothetical protein	TIGR00254	TIGRFam	9.930740e-17	F691	16.480140
1227	pseudo1_N1B4	Pf1N1B4_2091	Uncharacterized protein ImpI/VasC	TIGR00254	TIGRFam	9.930740e-17	F691	16.480140
711	RalstoniaBSBF1503	RALBFv3_RS19825	DUF4154 domain-containing protein	TIGR00254	TIGRFam	9.930740e-17	F691	16.480140
1100	azobra	AZOBR_RS21015	hypothetical protein	TIGR00254	TIGRFam	9.930740e-17	F691	16.480140
24	ANA3	7025431	hypothetical protein (RefSeq)	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
781	SB2B	6937165	hypothetical protein (RefSeq)	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
861	Smeli	SMa0193	hypothetical protein	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
1099	azobra	AZOBR_RS20460	hypothetical protein	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
1291	pseudo1_N1B4	Pf1N1B4_5990	FIG00998616: hypothetical protein	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
55	BFirm	BPHYT_RS13990	hypothetical protein	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
451	Marino	GFF2114	conserved hypothetical protein, membrane	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
536	PV4	5209158	hypothetical protein (RefSeq)	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
1134	psRCH2	GFF1633	Uncharacterized protein conserved in bacteria	TIGR00254	TIGRFam	1.710577e-13	F691	13.243979
992	SyringaeB728a_mexBdelta	Psyr_3475	Flagellar basal body rod protein:Protein of un...	PF00460	PFam	4.776180e-11	F070	11.497011
410	MR1	202309	conserved hypothetical protein (NCBI ptt file)	PF00460	PFam	4.776180e-11	F070	11.497011
1018	WCS417	GFF1467	hypothetical protein	PF00460	PFam	5.097308e-11	F070	11.468750
1330	pseudo5_N2C3_1	AO356_09745	hypothetical protein	PF00460	PFam	5.097308e-11	F070	11.468750
468	Miya	8500805	hypothetical protein (RefSeq)	PF00460	PFam	4.820529e-10	F070	10.492997
7	ANA3	7023975	hypothetical protein (RefSeq)	PF00460	PFam	7.968301e-10	F070	10.274726
38	BFirm	BPHYT_RS01190	hypothetical protein	PF00460	PFam	7.968301e-10	F070	10.274726
1103	azobra	AZOBR_RS26755	hypothetical protein	PF00460	PFam	7.968301e-10	F070	10.274726
972	SyringaeB728a_mexBdelta	Psyr_2112	Protein with unknown function DUF469	PF00460	PFam	7.968301e-10	F070	10.274726
1362	pseudo6_N2E2	Pf6N2E2_2342	FIG002060: uncharacterized protein YggL	PF00460	PFam	7.968301e-10	F070	10.274726

03 Function Prediction

NB03: Function Prediction for Hypothetical Essential Genes¶

Build lookup dictionaries¶

Identify hypothetical essential genes¶

Predict function via ortholog-module transfer¶

Top 25 predictions¶

Predictions per organism¶

Summary¶