import pandas as pd
import numpy as np
from pathlib import Path

DATA_DIR = Path('../data')
MODULE_DIR = DATA_DIR / 'modules'
ANNOT_DIR = DATA_DIR / 'annotations'
FAMILY_DIR = DATA_DIR / 'module_families'
PRED_DIR = DATA_DIR / 'predictions'
PRED_DIR.mkdir(parents=True, exist_ok=True)

pilots = pd.read_csv(DATA_DIR / 'pilot_organisms.csv')
pilot_ids = pilots['orgId'].tolist()
print(f"Pilot organisms: {pilot_ids}")

Pilot organisms: ['DvH', 'Btheta', 'Methanococcus_S2', 'psRCH2', 'Putida', 'Phaeo', 'Marino', 'pseudo3_N2E3', 'Koxy', 'Cola', 'WCS417', 'Caulo', 'SB2B', 'pseudo6_N2E2', 'Dino', 'pseudo5_N2C3_1', 'Miya', 'Pedo557', 'MR1', 'Keio', 'Korea', 'PV4', 'pseudo1_N1B4', 'acidovorax_3H11', 'SynE', 'Methanococcus_JJ', 'BFirm', 'Kang', 'ANA3', 'Cup4G11', 'pseudo13_GW456_L13', 'Ponti']

# Load module families and family annotations
families = pd.read_csv(FAMILY_DIR / 'module_families.csv')
fam_ann = pd.read_csv(FAMILY_DIR / 'family_annotations.csv')
print(f"Module families: {families['familyId'].nunique()}")
print(f"Annotated families: {(fam_ann['consensus_term'] != 'unannotated').sum()}")

Module families: 749
Annotated families: 145

unannotated_patterns = ['hypothetical', 'uncharacterized', 'unknown function',
                        'DUF', 'predicted protein', 'putative']

all_predictions = []

for org_id in pilot_ids:
    out_file = PRED_DIR / f'{org_id}_predictions.csv'
    if out_file.exists() and out_file.stat().st_size > 0:
        print(f"CACHED: {org_id}")
        preds = pd.read_csv(out_file)
        all_predictions.append(preds)
        continue
    
    print(f"\nPredicting for {org_id}...")
    
    # Load gene metadata
    genes = pd.read_csv(ANNOT_DIR / f'{org_id}_genes.csv')
    genes['locusId'] = genes['locusId'].astype(str)
    
    # Identify unannotated genes
    def is_unannotated(desc):
        if pd.isna(desc) or desc.strip() == '':
            return True
        desc_lower = desc.lower()
        return any(p in desc_lower for p in unannotated_patterns)
    
    genes['is_unannotated'] = genes['desc'].apply(is_unannotated)
    unannotated = genes[genes['is_unannotated']]
    print(f"  Unannotated genes: {len(unannotated)} / {len(genes)} ({len(unannotated)/len(genes)*100:.1f}%)")
    
    # Load module data
    weights = pd.read_csv(MODULE_DIR / f'{org_id}_gene_weights.csv', index_col=0)
    weights.index = weights.index.astype(str)
    membership = pd.read_csv(MODULE_DIR / f'{org_id}_gene_membership.csv', index_col=0)
    membership.index = membership.index.astype(str)
    
    # Load module annotations (skip if empty/missing)
    ann_file = MODULE_DIR / f'{org_id}_module_annotations.csv'
    if not ann_file.exists() or ann_file.stat().st_size < 10:
        print(f"  No module annotations — skipping")
        continue
    ann = pd.read_csv(ann_file)
    if len(ann) == 0:
        print(f"  Empty annotations — skipping")
        continue
    
    # Get top annotation per module
    sig_ann = ann[ann['significant']].sort_values('fdr')
    top_ann = sig_ann.groupby('module').first().reset_index()
    mod_to_ann = dict(zip(top_ann['module'], top_ann['term']))
    mod_to_fdr = dict(zip(top_ann['module'], top_ann['fdr']))
    mod_to_db = dict(zip(top_ann['module'], top_ann['database']))
    
    # Get family info for this organism's modules
    org_families = families[families['orgId'] == org_id]
    mod_to_family = dict(zip(org_families['module'], org_families['familyId']))
    fam_to_ann = dict(zip(fam_ann['familyId'], fam_ann['consensus_term']))
    fam_to_norgs = dict(zip(fam_ann['familyId'], fam_ann['n_organisms']))
    
    # Generate predictions
    preds = []
    for _, gene_row in unannotated.iterrows():
        locus = str(gene_row['locusId'])
        if locus not in membership.index:
            continue
        
        # Find modules this gene belongs to
        gene_modules = membership.columns[membership.loc[locus] == 1].tolist()
        if not gene_modules:
            continue
        
        for mod in gene_modules:
            gene_weight = abs(weights.loc[locus, mod])
            
            # Module-based prediction
            mod_term = mod_to_ann.get(mod)
            if not mod_term:
                continue
            
            mod_fdr = mod_to_fdr.get(mod, 1.0)
            mod_db = mod_to_db.get(mod, '')
            
            # Family-based prediction
            fam_id = mod_to_family.get(mod)
            fam_term = fam_to_ann.get(fam_id) if fam_id else None
            fam_n_orgs = fam_to_norgs.get(fam_id, 0) if fam_id else 0
            
            # Confidence score
            fdr_score = max(0, -np.log10(max(mod_fdr, 1e-300)))  # higher = better
            cross_org_bonus = fam_n_orgs / len(pilot_ids) if fam_n_orgs > 1 else 0
            confidence = gene_weight * fdr_score * (1 + cross_org_bonus)
            
            # Use family annotation if available and different
            if fam_term and fam_term != 'unannotated':
                prediction_source = 'family'
                predicted_function = fam_term
            else:
                prediction_source = 'module'
                predicted_function = mod_term
            
            preds.append({
                'orgId': org_id,
                'locusId': locus,
                'sysName': gene_row.get('sysName', ''),
                'original_desc': gene_row['desc'],
                'module': mod,
                'gene_weight': gene_weight,
                'predicted_function': predicted_function,
                'prediction_source': prediction_source,
                'annotation_db': mod_db,
                'enrichment_fdr': mod_fdr,
                'familyId': fam_id if fam_id else '',
                'family_n_organisms': fam_n_orgs,
                'confidence': confidence
            })
    
    preds_df = pd.DataFrame(preds)
    if len(preds_df) > 0:
        # Keep best prediction per gene (highest confidence)
        preds_df = preds_df.sort_values('confidence', ascending=False)
        preds_df = preds_df.drop_duplicates('locusId', keep='first')
        preds_df.to_csv(out_file, index=False)
        all_predictions.append(preds_df)
        print(f"  Predictions: {len(preds_df)} genes")
        print(f"  Family-backed: {(preds_df['prediction_source'] == 'family').sum()}")
        print(f"  Module-only: {(preds_df['prediction_source'] == 'module').sum()}")
    else:
        print(f"  No predictions generated")

Predicting for DvH...
  Unannotated genes: 1693 / 3668 (46.2%)

  Predictions: 178 genes
  Family-backed: 65
  Module-only: 113

Predicting for Btheta...
  Unannotated genes: 3009 / 4902 (61.4%)

  Predictions: 222 genes
  Family-backed: 77
  Module-only: 145

Predicting for Methanococcus_S2...
  Unannotated genes: 272 / 1793 (15.2%)
  Predictions: 14 genes
  Family-backed: 3
  Module-only: 11

Predicting for psRCH2...
  Unannotated genes: 1063 / 4336 (24.5%)

  Predictions: 131 genes
  Family-backed: 71
  Module-only: 60

Predicting for Putida...
  Unannotated genes: 2515 / 5661 (44.4%)

  Predictions: 255 genes
  Family-backed: 129
  Module-only: 126

Predicting for Phaeo...
  Unannotated genes: 1707 / 3944 (43.3%)

  Predictions: 322 genes
  Family-backed: 154
  Module-only: 168

Predicting for Marino...
  Unannotated genes: 989 / 4470 (22.1%)
  Predictions: 117 genes
  Family-backed: 74
  Module-only: 43

Predicting for pseudo3_N2E3...
  Unannotated genes: 1354 / 5854 (23.1%)

  Predictions: 170 genes
  Family-backed: 80
  Module-only: 90

Predicting for Koxy...
  Unannotated genes: 1047 / 5586 (18.7%)
  Predictions: 189 genes
  Family-backed: 26
  Module-only: 163

Predicting for Cola...

  Unannotated genes: 1632 / 4684 (34.8%)

  Predictions: 268 genes
  Family-backed: 113
  Module-only: 155

Predicting for WCS417...
  Unannotated genes: 968 / 5580 (17.3%)
  Predictions: 124 genes
  Family-backed: 67
  Module-only: 57

Predicting for Caulo...
  Unannotated genes: 818 / 3943 (20.7%)

  Predictions: 67 genes
  Family-backed: 28
  Module-only: 39

Predicting for SB2B...
  Unannotated genes: 1357 / 3785 (35.9%)

# Build ID → description lookup from all organisms' annotation files
id_to_desc = {}

for org_id in pilot_ids:
    # PFam/TIGRFam: domainId → domainName (PFam) or definition (TIGRFam)
    domain_file = ANNOT_DIR / f'{org_id}_domains.csv'
    if domain_file.exists():
        domains = pd.read_csv(domain_file)
        for _, row in domains.drop_duplicates('domainId').iterrows():
            did = row['domainId']
            if did in id_to_desc:
                continue
            if row['domainDb'] == 'TIGRFam' and pd.notna(row.get('definition')):
                id_to_desc[did] = row['definition']
            elif pd.notna(row.get('domainName')) and row['domainName'] != did:
                id_to_desc[did] = row['domainName']

    # KEGG: kgroup → kgroup_desc
    kegg_file = ANNOT_DIR / f'{org_id}_kegg.csv'
    if kegg_file.exists():
        kegg = pd.read_csv(kegg_file)
        for _, row in kegg.drop_duplicates('kgroup').iterrows():
            if row['kgroup'] not in id_to_desc and pd.notna(row.get('kgroup_desc')):
                id_to_desc[row['kgroup']] = row['kgroup_desc']

print(f"Built ID → description lookup: {len(id_to_desc)} entries")

# Apply to predictions
if all_predictions:
    combined = pd.concat(all_predictions, ignore_index=True)

    combined['predicted_function_desc'] = combined['predicted_function'].map(
        lambda x: id_to_desc.get(x, x)  # keep original if no mapping found
    )

    # Show resolution stats
    resolved = combined['predicted_function'] != combined['predicted_function_desc']
    print(f"Resolved: {resolved.sum()}/{len(combined)} predictions ({100*resolved.mean():.1f}%)")
    print(f"Unresolved (SEED terms already readable): {(~resolved).sum()}")

    # Save updated predictions
    combined.to_csv(PRED_DIR / 'all_predictions_summary.csv', index=False)

    # Also update per-organism files
    for org_id in combined['orgId'].unique():
        org_preds = combined[combined['orgId'] == org_id]
        org_preds.to_csv(PRED_DIR / f'{org_id}_predictions.csv', index=False)

    print(f"\nSaved updated predictions with descriptions")

Built ID → description lookup: 15155 entries
Resolved: 5621/6691 predictions (84.0%)
Unresolved (SEED terms already readable): 1070

Saved updated predictions with descriptions

print("=" * 60)
print("PREDICTION SUMMARY")
print("=" * 60)
print(f"Total predictions: {len(combined)}")
print(f"\nBy source:")
print(combined['prediction_source'].value_counts().to_string())
print(f"\nBy annotation database:")
print(combined['annotation_db'].value_counts().to_string())
print(f"\nConfidence: median={combined['confidence'].median():.2f}, "
      f"mean={combined['confidence'].mean():.2f}, "
      f"max={combined['confidence'].max():.2f}")

print(f"\nTop 15 highest-confidence predictions (with descriptions):")
top15 = combined.nlargest(15, 'confidence')
print(top15[['orgId', 'locusId', 'predicted_function_desc', 'confidence',
             'prediction_source']].to_string(index=False))

print(f"\nMost common predicted functions:")
print(combined['predicted_function_desc'].value_counts().head(15).to_string())

============================================================
PREDICTION SUMMARY
============================================================
Total predictions: 6691

By source:
prediction_source
module    4236
family    2455

By annotation database:
annotation_db
PFam       4397
SEED       1097
KEGG        761
TIGRFam     436

Confidence: median=1.32, mean=1.59, max=14.77

Top 15 highest-confidence predictions (with descriptions):
         orgId       locusId predicted_function_desc  confidence prediction_source
          Koxy BWI76_RS20270                    LacI   14.770167            family
  pseudo6_N2E2  Pf6N2E2_2710              Flg_bb_rod   12.419920            family
  pseudo6_N2E2  Pf6N2E2_2563              Flg_bb_rod   12.413871            family
  pseudo6_N2E2  Pf6N2E2_5963              Flg_bb_rod   12.063872            family
  pseudo6_N2E2  Pf6N2E2_5068              Flg_bb_rod   11.933264            family
  pseudo6_N2E2  Pf6N2E2_3827              Flg_bb_rod   11.908727            family
  pseudo6_N2E2  Pf6N2E2_5893              Flg_bb_rod   11.183882            family
pseudo5_N2C3_1   AO356_18320              Flg_bb_rod   10.694277            family
  pseudo6_N2E2  Pf6N2E2_4032                   GGDEF    9.877816            family
         BFirm BPHYT_RS19100              Flg_bb_rod    9.189315            family
          SB2B       6936921              Flg_bb_rod    9.163634            family
pseudo5_N2C3_1   AO356_20220              Flg_bb_rod    9.128664            family
        WCS417       GFF3924              Flg_bb_rod    9.017234            family
        WCS417       GFF3925              Flg_bb_rod    8.968367            family
  pseudo6_N2E2  Pf6N2E2_5277                   GGDEF    8.936912            family

Most common predicted functions:
predicted_function_desc
DEAD                                                                                  156
Flg_bb_rod                                                                            140
Response_reg                                                                          120
ACR_tran                                                                               95
BPD_transp_1                                                                           94
ABC_tran                                                                               77
Glyco_tranf_2_2                                                                        77
MoCF_biosynth                                                                          73
amino ABC transporter, permease protein, 3-TM region, His/Glu/Gln/Arg/opine family     68
Sulfatase                                                                              64
Proton_antipo_M                                                                        61
BPD_transp_2                                                                           59
ABC transporter efflux protein                                                         59
GDP_Man_Dehyd                                                                          56
GerE                                                                                   56

import matplotlib.pyplot as plt

FIG_DIR = Path('../figures')

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Left: predictions per organism
ax = axes[0]
org_counts = combined.groupby('orgId').size().sort_values(ascending=True)
colors_pred = ['#4CAF50' if combined[combined['orgId']==o]['prediction_source'].mode().values[0]=='family'
               else '#2196F3' for o in org_counts.index]
ax.barh(range(len(org_counts)), org_counts.values, color='#2196F3')
ax.set_yticks(range(len(org_counts)))
ax.set_yticklabels(org_counts.index, fontsize=7)
ax.set_xlabel('Number of predictions')
ax.set_title(f'Predictions per Organism (n={len(combined)})')

# Middle: confidence distribution
ax = axes[1]
family_conf = combined[combined['prediction_source'] == 'family']['confidence']
module_conf = combined[combined['prediction_source'] == 'module']['confidence']
ax.hist([family_conf, module_conf], bins=30, stacked=True,
        label=[f'Family-backed ({len(family_conf)})', f'Module-only ({len(module_conf)})'],
        color=['#4CAF50', '#FF9800'])
ax.set_xlabel('Confidence score')
ax.set_ylabel('Number of predictions')
ax.set_title('Prediction Confidence Distribution')
ax.legend(fontsize=9)

# Right: family-backed vs module-only pie
ax = axes[2]
source_counts = combined['prediction_source'].value_counts()
ax.pie(source_counts.values, labels=[f'Family-backed\n({source_counts.get("family", 0)})',
                                      f'Module-only\n({source_counts.get("module", 0)})'],
       colors=['#4CAF50', '#FF9800'], autopct='%1.0f%%', startangle=90)
ax.set_title('Prediction Source Breakdown')

plt.tight_layout()
plt.savefig(FIG_DIR / 'prediction_summary.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: figures/prediction_summary.png")

Saved: figures/prediction_summary.png

06 Function Prediction

NB 06: Function Prediction for Unannotated Genes¶

1. Identify Unannotated Genes¶

2. Resolve IDs to Human-Readable Descriptions¶

3. Combined Summary¶