06 Function Prediction
Jupyter notebook from the Pan-bacterial Fitness Modules via Independent Component Analysis project.
NB 06: Function Prediction for Unannotated Genes¶
Predict function for poorly annotated genes ("hypothetical protein", etc.) using module and module-family context.
Prediction sources:
- Module-based: function from module's top enrichment label
- Family-based: if module belongs to a conserved family, use family consensus
Confidence score = |gene_weight| × enrichment_significance × cross-organism_consistency
Run locally — no Spark needed.
In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
DATA_DIR = Path('../data')
MODULE_DIR = DATA_DIR / 'modules'
ANNOT_DIR = DATA_DIR / 'annotations'
FAMILY_DIR = DATA_DIR / 'module_families'
PRED_DIR = DATA_DIR / 'predictions'
PRED_DIR.mkdir(parents=True, exist_ok=True)
pilots = pd.read_csv(DATA_DIR / 'pilot_organisms.csv')
pilot_ids = pilots['orgId'].tolist()
print(f"Pilot organisms: {pilot_ids}")
Pilot organisms: ['DvH', 'Btheta', 'Methanococcus_S2', 'psRCH2', 'Putida', 'Phaeo', 'Marino', 'pseudo3_N2E3', 'Koxy', 'Cola', 'WCS417', 'Caulo', 'SB2B', 'pseudo6_N2E2', 'Dino', 'pseudo5_N2C3_1', 'Miya', 'Pedo557', 'MR1', 'Keio', 'Korea', 'PV4', 'pseudo1_N1B4', 'acidovorax_3H11', 'SynE', 'Methanococcus_JJ', 'BFirm', 'Kang', 'ANA3', 'Cup4G11', 'pseudo13_GW456_L13', 'Ponti']
In [2]:
# Load module families and family annotations
families = pd.read_csv(FAMILY_DIR / 'module_families.csv')
fam_ann = pd.read_csv(FAMILY_DIR / 'family_annotations.csv')
print(f"Module families: {families['familyId'].nunique()}")
print(f"Annotated families: {(fam_ann['consensus_term'] != 'unannotated').sum()}")
Module families: 749 Annotated families: 145
1. Identify Unannotated Genes¶
In [3]:
unannotated_patterns = ['hypothetical', 'uncharacterized', 'unknown function',
'DUF', 'predicted protein', 'putative']
all_predictions = []
for org_id in pilot_ids:
out_file = PRED_DIR / f'{org_id}_predictions.csv'
if out_file.exists() and out_file.stat().st_size > 0:
print(f"CACHED: {org_id}")
preds = pd.read_csv(out_file)
all_predictions.append(preds)
continue
print(f"\nPredicting for {org_id}...")
# Load gene metadata
genes = pd.read_csv(ANNOT_DIR / f'{org_id}_genes.csv')
genes['locusId'] = genes['locusId'].astype(str)
# Identify unannotated genes
def is_unannotated(desc):
if pd.isna(desc) or desc.strip() == '':
return True
desc_lower = desc.lower()
return any(p in desc_lower for p in unannotated_patterns)
genes['is_unannotated'] = genes['desc'].apply(is_unannotated)
unannotated = genes[genes['is_unannotated']]
print(f" Unannotated genes: {len(unannotated)} / {len(genes)} ({len(unannotated)/len(genes)*100:.1f}%)")
# Load module data
weights = pd.read_csv(MODULE_DIR / f'{org_id}_gene_weights.csv', index_col=0)
weights.index = weights.index.astype(str)
membership = pd.read_csv(MODULE_DIR / f'{org_id}_gene_membership.csv', index_col=0)
membership.index = membership.index.astype(str)
# Load module annotations (skip if empty/missing)
ann_file = MODULE_DIR / f'{org_id}_module_annotations.csv'
if not ann_file.exists() or ann_file.stat().st_size < 10:
print(f" No module annotations — skipping")
continue
ann = pd.read_csv(ann_file)
if len(ann) == 0:
print(f" Empty annotations — skipping")
continue
# Get top annotation per module
sig_ann = ann[ann['significant']].sort_values('fdr')
top_ann = sig_ann.groupby('module').first().reset_index()
mod_to_ann = dict(zip(top_ann['module'], top_ann['term']))
mod_to_fdr = dict(zip(top_ann['module'], top_ann['fdr']))
mod_to_db = dict(zip(top_ann['module'], top_ann['database']))
# Get family info for this organism's modules
org_families = families[families['orgId'] == org_id]
mod_to_family = dict(zip(org_families['module'], org_families['familyId']))
fam_to_ann = dict(zip(fam_ann['familyId'], fam_ann['consensus_term']))
fam_to_norgs = dict(zip(fam_ann['familyId'], fam_ann['n_organisms']))
# Generate predictions
preds = []
for _, gene_row in unannotated.iterrows():
locus = str(gene_row['locusId'])
if locus not in membership.index:
continue
# Find modules this gene belongs to
gene_modules = membership.columns[membership.loc[locus] == 1].tolist()
if not gene_modules:
continue
for mod in gene_modules:
gene_weight = abs(weights.loc[locus, mod])
# Module-based prediction
mod_term = mod_to_ann.get(mod)
if not mod_term:
continue
mod_fdr = mod_to_fdr.get(mod, 1.0)
mod_db = mod_to_db.get(mod, '')
# Family-based prediction
fam_id = mod_to_family.get(mod)
fam_term = fam_to_ann.get(fam_id) if fam_id else None
fam_n_orgs = fam_to_norgs.get(fam_id, 0) if fam_id else 0
# Confidence score
fdr_score = max(0, -np.log10(max(mod_fdr, 1e-300))) # higher = better
cross_org_bonus = fam_n_orgs / len(pilot_ids) if fam_n_orgs > 1 else 0
confidence = gene_weight * fdr_score * (1 + cross_org_bonus)
# Use family annotation if available and different
if fam_term and fam_term != 'unannotated':
prediction_source = 'family'
predicted_function = fam_term
else:
prediction_source = 'module'
predicted_function = mod_term
preds.append({
'orgId': org_id,
'locusId': locus,
'sysName': gene_row.get('sysName', ''),
'original_desc': gene_row['desc'],
'module': mod,
'gene_weight': gene_weight,
'predicted_function': predicted_function,
'prediction_source': prediction_source,
'annotation_db': mod_db,
'enrichment_fdr': mod_fdr,
'familyId': fam_id if fam_id else '',
'family_n_organisms': fam_n_orgs,
'confidence': confidence
})
preds_df = pd.DataFrame(preds)
if len(preds_df) > 0:
# Keep best prediction per gene (highest confidence)
preds_df = preds_df.sort_values('confidence', ascending=False)
preds_df = preds_df.drop_duplicates('locusId', keep='first')
preds_df.to_csv(out_file, index=False)
all_predictions.append(preds_df)
print(f" Predictions: {len(preds_df)} genes")
print(f" Family-backed: {(preds_df['prediction_source'] == 'family').sum()}")
print(f" Module-only: {(preds_df['prediction_source'] == 'module').sum()}")
else:
print(f" No predictions generated")
Predicting for DvH... Unannotated genes: 1693 / 3668 (46.2%)
Predictions: 178 genes Family-backed: 65 Module-only: 113 Predicting for Btheta... Unannotated genes: 3009 / 4902 (61.4%)
Predictions: 222 genes Family-backed: 77 Module-only: 145 Predicting for Methanococcus_S2... Unannotated genes: 272 / 1793 (15.2%) Predictions: 14 genes Family-backed: 3 Module-only: 11 Predicting for psRCH2... Unannotated genes: 1063 / 4336 (24.5%)
Predictions: 131 genes Family-backed: 71 Module-only: 60 Predicting for Putida... Unannotated genes: 2515 / 5661 (44.4%)
Predictions: 255 genes Family-backed: 129 Module-only: 126 Predicting for Phaeo... Unannotated genes: 1707 / 3944 (43.3%)
Predictions: 322 genes Family-backed: 154 Module-only: 168 Predicting for Marino... Unannotated genes: 989 / 4470 (22.1%) Predictions: 117 genes Family-backed: 74 Module-only: 43 Predicting for pseudo3_N2E3... Unannotated genes: 1354 / 5854 (23.1%)
Predictions: 170 genes Family-backed: 80 Module-only: 90 Predicting for Koxy... Unannotated genes: 1047 / 5586 (18.7%) Predictions: 189 genes Family-backed: 26 Module-only: 163 Predicting for Cola...
Unannotated genes: 1632 / 4684 (34.8%)
Predictions: 268 genes Family-backed: 113 Module-only: 155 Predicting for WCS417... Unannotated genes: 968 / 5580 (17.3%) Predictions: 124 genes Family-backed: 67 Module-only: 57 Predicting for Caulo... Unannotated genes: 818 / 3943 (20.7%)
Predictions: 67 genes Family-backed: 28 Module-only: 39 Predicting for SB2B... Unannotated genes: 1357 / 3785 (35.9%)
Predictions: 314 genes Family-backed: 174 Module-only: 140 Predicting for pseudo6_N2E2... Unannotated genes: 1445 / 6175 (23.4%)
Predictions: 185 genes Family-backed: 163 Module-only: 22 Predicting for Dino... Unannotated genes: 1535 / 4562 (33.6%) Predictions: 152 genes Family-backed: 91 Module-only: 61 Predicting for pseudo5_N2C3_1... Unannotated genes: 1188 / 6082 (19.5%)
Predictions: 189 genes Family-backed: 137 Module-only: 52 Predicting for Miya... Unannotated genes: 1236 / 3601 (34.3%) Predictions: 215 genes Family-backed: 62 Module-only: 153 Predicting for Pedo557... Unannotated genes: 2233 / 5089 (43.9%)
Predictions: 287 genes Family-backed: 103 Module-only: 184 Predicting for MR1... Unannotated genes: 2181 / 4745 (46.0%)
Predictions: 508 genes Family-backed: 197 Module-only: 311 Predicting for Keio... Unannotated genes: 1239 / 4610 (26.9%) Predictions: 214 genes Family-backed: 66 Module-only: 148 Predicting for Korea...
Unannotated genes: 1100 / 4245 (25.9%) Predictions: 96 genes Family-backed: 31 Module-only: 65 Predicting for PV4... Unannotated genes: 1085 / 3993 (27.2%)
Predictions: 202 genes Family-backed: 117 Module-only: 85 Predicting for pseudo1_N1B4... Unannotated genes: 1580 / 6058 (26.1%) Predictions: 108 genes Family-backed: 35 Module-only: 73 Predicting for acidovorax_3H11... Unannotated genes: 1269 / 5019 (25.3%)
Predictions: 227 genes Family-backed: 43 Module-only: 184 Predicting for SynE... Unannotated genes: 1135 / 2722 (41.7%) Predictions: 199 genes Family-backed: 26 Module-only: 173 Predicting for Methanococcus_JJ... Unannotated genes: 334 / 1861 (17.9%)
Predictions: 34 genes Family-backed: 13 Module-only: 21 Predicting for BFirm... Unannotated genes: 1526 / 7263 (21.0%)
Predictions: 171 genes Family-backed: 17 Module-only: 154 Predicting for Kang... Unannotated genes: 689 / 2513 (27.4%) Predictions: 176 genes Family-backed: 27 Module-only: 149 Predicting for ANA3... Unannotated genes: 1391 / 4552 (30.6%)
Predictions: 326 genes Family-backed: 61 Module-only: 265 Predicting for Cup4G11... Unannotated genes: 1732 / 7449 (23.3%)
Predictions: 293 genes Family-backed: 51 Module-only: 242 Predicting for pseudo13_GW456_L13... Unannotated genes: 1129 / 5243 (21.5%) Predictions: 299 genes Family-backed: 96 Module-only: 203 Predicting for Ponti...
Unannotated genes: 1678 / 4329 (38.8%)
Predictions: 439 genes Family-backed: 58 Module-only: 381
2. Resolve IDs to Human-Readable Descriptions¶
PFam/TIGRFam predictions use domain accession IDs (e.g., "PF00460"). Resolve these to readable names using the extracted domain annotation files. KEGG uses KO group IDs (e.g., "K01952") — resolve to descriptions. SEED terms are already human-readable.
In [4]:
# Build ID → description lookup from all organisms' annotation files
id_to_desc = {}
for org_id in pilot_ids:
# PFam/TIGRFam: domainId → domainName (PFam) or definition (TIGRFam)
domain_file = ANNOT_DIR / f'{org_id}_domains.csv'
if domain_file.exists():
domains = pd.read_csv(domain_file)
for _, row in domains.drop_duplicates('domainId').iterrows():
did = row['domainId']
if did in id_to_desc:
continue
if row['domainDb'] == 'TIGRFam' and pd.notna(row.get('definition')):
id_to_desc[did] = row['definition']
elif pd.notna(row.get('domainName')) and row['domainName'] != did:
id_to_desc[did] = row['domainName']
# KEGG: kgroup → kgroup_desc
kegg_file = ANNOT_DIR / f'{org_id}_kegg.csv'
if kegg_file.exists():
kegg = pd.read_csv(kegg_file)
for _, row in kegg.drop_duplicates('kgroup').iterrows():
if row['kgroup'] not in id_to_desc and pd.notna(row.get('kgroup_desc')):
id_to_desc[row['kgroup']] = row['kgroup_desc']
print(f"Built ID → description lookup: {len(id_to_desc)} entries")
# Apply to predictions
if all_predictions:
combined = pd.concat(all_predictions, ignore_index=True)
combined['predicted_function_desc'] = combined['predicted_function'].map(
lambda x: id_to_desc.get(x, x) # keep original if no mapping found
)
# Show resolution stats
resolved = combined['predicted_function'] != combined['predicted_function_desc']
print(f"Resolved: {resolved.sum()}/{len(combined)} predictions ({100*resolved.mean():.1f}%)")
print(f"Unresolved (SEED terms already readable): {(~resolved).sum()}")
# Save updated predictions
combined.to_csv(PRED_DIR / 'all_predictions_summary.csv', index=False)
# Also update per-organism files
for org_id in combined['orgId'].unique():
org_preds = combined[combined['orgId'] == org_id]
org_preds.to_csv(PRED_DIR / f'{org_id}_predictions.csv', index=False)
print(f"\nSaved updated predictions with descriptions")
Built ID → description lookup: 15155 entries Resolved: 5621/6691 predictions (84.0%) Unresolved (SEED terms already readable): 1070 Saved updated predictions with descriptions
3. Combined Summary¶
In [5]:
print("=" * 60)
print("PREDICTION SUMMARY")
print("=" * 60)
print(f"Total predictions: {len(combined)}")
print(f"\nBy source:")
print(combined['prediction_source'].value_counts().to_string())
print(f"\nBy annotation database:")
print(combined['annotation_db'].value_counts().to_string())
print(f"\nConfidence: median={combined['confidence'].median():.2f}, "
f"mean={combined['confidence'].mean():.2f}, "
f"max={combined['confidence'].max():.2f}")
print(f"\nTop 15 highest-confidence predictions (with descriptions):")
top15 = combined.nlargest(15, 'confidence')
print(top15[['orgId', 'locusId', 'predicted_function_desc', 'confidence',
'prediction_source']].to_string(index=False))
print(f"\nMost common predicted functions:")
print(combined['predicted_function_desc'].value_counts().head(15).to_string())
============================================================
PREDICTION SUMMARY
============================================================
Total predictions: 6691
By source:
prediction_source
module 4236
family 2455
By annotation database:
annotation_db
PFam 4397
SEED 1097
KEGG 761
TIGRFam 436
Confidence: median=1.32, mean=1.59, max=14.77
Top 15 highest-confidence predictions (with descriptions):
orgId locusId predicted_function_desc confidence prediction_source
Koxy BWI76_RS20270 LacI 14.770167 family
pseudo6_N2E2 Pf6N2E2_2710 Flg_bb_rod 12.419920 family
pseudo6_N2E2 Pf6N2E2_2563 Flg_bb_rod 12.413871 family
pseudo6_N2E2 Pf6N2E2_5963 Flg_bb_rod 12.063872 family
pseudo6_N2E2 Pf6N2E2_5068 Flg_bb_rod 11.933264 family
pseudo6_N2E2 Pf6N2E2_3827 Flg_bb_rod 11.908727 family
pseudo6_N2E2 Pf6N2E2_5893 Flg_bb_rod 11.183882 family
pseudo5_N2C3_1 AO356_18320 Flg_bb_rod 10.694277 family
pseudo6_N2E2 Pf6N2E2_4032 GGDEF 9.877816 family
BFirm BPHYT_RS19100 Flg_bb_rod 9.189315 family
SB2B 6936921 Flg_bb_rod 9.163634 family
pseudo5_N2C3_1 AO356_20220 Flg_bb_rod 9.128664 family
WCS417 GFF3924 Flg_bb_rod 9.017234 family
WCS417 GFF3925 Flg_bb_rod 8.968367 family
pseudo6_N2E2 Pf6N2E2_5277 GGDEF 8.936912 family
Most common predicted functions:
predicted_function_desc
DEAD 156
Flg_bb_rod 140
Response_reg 120
ACR_tran 95
BPD_transp_1 94
ABC_tran 77
Glyco_tranf_2_2 77
MoCF_biosynth 73
amino ABC transporter, permease protein, 3-TM region, His/Glu/Gln/Arg/opine family 68
Sulfatase 64
Proton_antipo_M 61
BPD_transp_2 59
ABC transporter efflux protein 59
GDP_Man_Dehyd 56
GerE 56
In [6]:
import matplotlib.pyplot as plt
FIG_DIR = Path('../figures')
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Left: predictions per organism
ax = axes[0]
org_counts = combined.groupby('orgId').size().sort_values(ascending=True)
colors_pred = ['#4CAF50' if combined[combined['orgId']==o]['prediction_source'].mode().values[0]=='family'
else '#2196F3' for o in org_counts.index]
ax.barh(range(len(org_counts)), org_counts.values, color='#2196F3')
ax.set_yticks(range(len(org_counts)))
ax.set_yticklabels(org_counts.index, fontsize=7)
ax.set_xlabel('Number of predictions')
ax.set_title(f'Predictions per Organism (n={len(combined)})')
# Middle: confidence distribution
ax = axes[1]
family_conf = combined[combined['prediction_source'] == 'family']['confidence']
module_conf = combined[combined['prediction_source'] == 'module']['confidence']
ax.hist([family_conf, module_conf], bins=30, stacked=True,
label=[f'Family-backed ({len(family_conf)})', f'Module-only ({len(module_conf)})'],
color=['#4CAF50', '#FF9800'])
ax.set_xlabel('Confidence score')
ax.set_ylabel('Number of predictions')
ax.set_title('Prediction Confidence Distribution')
ax.legend(fontsize=9)
# Right: family-backed vs module-only pie
ax = axes[2]
source_counts = combined['prediction_source'].value_counts()
ax.pie(source_counts.values, labels=[f'Family-backed\n({source_counts.get("family", 0)})',
f'Module-only\n({source_counts.get("module", 0)})'],
colors=['#4CAF50', '#FF9800'], autopct='%1.0f%%', startangle=90)
ax.set_title('Prediction Source Breakdown')
plt.tight_layout()
plt.savefig(FIG_DIR / 'prediction_summary.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: figures/prediction_summary.png")
Saved: figures/prediction_summary.png