import pandas as pd
import numpy as np
import os, re, warnings
from collections import Counter
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

warnings.filterwarnings("ignore", category=FutureWarning)
sns.set_style("whitegrid")
plt.rcParams.update({"figure.dpi": 150, "savefig.dpi": 150, "figure.figsize": (12, 7)})

# Path setup
if os.path.basename(os.getcwd()) == "notebooks":
    PROJECT_DIR = os.path.dirname(os.getcwd())
else:
    PROJECT_DIR = os.getcwd()

DATA_DIR = os.path.join(PROJECT_DIR, "data")
FIG_DIR = os.path.join(PROJECT_DIR, "figures")
os.makedirs(FIG_DIR, exist_ok=True)
print(f"Project dir: {PROJECT_DIR}")

Project dir: /home/aparkin/BERIL-research-observatory/projects/functional_dark_matter

# Load phylogenetic breadth (30,756 dark gene clusters → 11,774 root_ogs)
phylo = pd.read_csv(os.path.join(DATA_DIR, "phylogenetic_breadth.tsv"), sep="	")
print(f"Phylogenetic breadth: {len(phylo):,} clusters, {phylo['root_og'].nunique():,} unique root_ogs")
print(f"Columns: {list(phylo.columns)}")

# Load dark genes
dark_full = pd.read_csv(os.path.join(DATA_DIR, "dark_genes_integrated.tsv"), sep="	", low_memory=False)
dark = dark_full[dark_full["is_dark"] == True].copy()
print(f"Dark genes: {len(dark):,} (from {len(dark_full):,} total)")
print(f"Dark genes with gene_cluster_id: {dark['gene_cluster_id'].notna().sum():,}")
print(f"Dark genes with root_og (via phylo): {dark['gene_cluster_id'].isin(phylo['gene_cluster_id']).sum():,}")

# Load concordance
concordance = pd.read_csv(os.path.join(DATA_DIR, "concordance_detailed.tsv"), sep="	")
print(f"Concordance OGs: {len(concordance):,} (unique ogId: {concordance['ogId'].nunique()})")

# Load GapMind
gapmind = pd.read_csv(os.path.join(DATA_DIR, "gapmind_domain_matched.tsv"), sep="	", low_memory=False)
print(f"GapMind matches: {len(gapmind):,}")

# Load scoring (has s_tractability)
scored = pd.read_csv(os.path.join(DATA_DIR, "scoring_all_dark.tsv"), sep="	", low_memory=False)
print(f"Scored dark genes: {len(scored):,}")

Phylogenetic breadth: 30,756 clusters, 11,774 unique root_ogs
Columns: ['gene_cluster_id', 'root_og', 'narrowest_og', 'n_tax_levels', 'breadth_class', 'COG_category', 'EC', 'KEGG_ko', 'PFAMs', 'Description', 'n_species', 'n_clusters']

Dark genes: 57,011 (from 228,709 total)
Dark genes with gene_cluster_id: 39,532
Dark genes with root_og (via phylo): 32,791
Concordance OGs: 65 (unique ogId: 65)
GapMind matches: 42,239
Scored dark genes: 17,344

# Load pre-computed full pangenome distribution (from NB11b)
pg_dist_path = os.path.join(DATA_DIR, "og_pangenome_distribution.tsv")
result = pd.read_csv(pg_dist_path, sep="\t")
print(f"Loaded pangenome distribution: {len(result):,} root_ogs")
print(f"Species range: {result['n_species'].min()} - {result['n_species'].max():,}")
print(f"Phyla range: {result['n_phyla'].min()} - {result['n_phyla'].max()}")
print(f"\nSpecies distribution quantiles:")
print(result['n_species'].describe(percentiles=[.25, .5, .75, .9, .95, .99]))

# Load OG_id propagation mapping
prop_path = os.path.join(DATA_DIR, "og_id_root_propagation.tsv")
og_prop = pd.read_csv(prop_path, sep="\t")
print(f"\nOG_id propagation: {len(og_prop):,} OG_id → root_og mappings")

Loaded pangenome distribution: 11,774 root_ogs
Species range: 1 - 27,482
Phyla range: 1 - 142

Species distribution quantiles:
count    11774.000000
mean      2128.211993
std       4979.781327
min          1.000000
25%         43.000000
50%        135.000000
75%        799.000000
90%       7529.900000
95%      14236.900000
99%      24463.860000
max      27482.000000
Name: n_species, dtype: float64

OG_id propagation: 7,898 OG_id → root_og mappings

# Load pangenome distribution (use freshly computed or reload from disk)
pg_dist = result.copy()

# Merge with phylogenetic breadth to get COG categories
phylo_cog = phylo[["root_og", "COG_category"]].drop_duplicates("root_og")
pg_dist = pg_dist.merge(phylo_cog, on="root_og", how="left")

print(f"Pangenome distribution with COG: {len(pg_dist):,} root_ogs")
print(f"COG category X (mobilome): {(pg_dist['COG_category'] == 'X').sum()}")

Pangenome distribution with COG: 11,774 root_ogs
COG category X (mobilome): 0

def classify_taxonomic_tier(row):
    """Classify OG into taxonomic tier based on species distribution.
    
    Mobile detection: phylogenetic patchiness (distant phyla, few species per phylum)
    or COG category X (mobilome: prophages, transposons).
    """
    n_sp = row["n_species"]
    n_ph = row["n_phyla"]
    n_cl = row["n_classes"]
    n_or = row["n_orders"]
    n_fa = row["n_families"]
    n_ge = row["n_genera"]
    cog = str(row.get("COG_category", "-"))
    
    # Mobile detection: COG-X or patchy distribution (multi-phylum but few species per phylum)
    if cog == "X":
        return "mobile"
    if n_ph >= 2 and n_sp / n_ph <= 10:
        return "mobile"
    
    # Standard tier assignment (broadest first)
    if n_ph > 1:
        return "kingdom"
    if n_cl > 1:
        return "phylum"
    if n_or > 1:
        return "class"
    if n_fa > 1:
        return "order"
    if n_ge > 1:
        return "family"
    if n_sp > 1:
        return "genus"
    return "species"

pg_dist["taxonomic_tier"] = pg_dist.apply(classify_taxonomic_tier, axis=1)

print("Taxonomic tier distribution (pangenome OGs):")
tier_counts = pg_dist["taxonomic_tier"].value_counts()
for tier, n in tier_counts.items():
    print(f"  {tier:12s}: {n:6,} ({100*n/len(pg_dist):5.1f}%)")

Taxonomic tier distribution (pangenome OGs):
  kingdom     :  6,581 ( 55.9%)
  class       :  1,297 ( 11.0%)
  family      :  1,231 ( 10.5%)
  genus       :    812 (  6.9%)
  mobile      :    770 (  6.5%)
  phylum      :    563 (  4.8%)
  order       :    457 (  3.9%)
  species     :     63 (  0.5%)

# Fallback for dark genes without pangenome root_og:
# Use 48-organism OG from essential_genome project + hard-coded taxonomy

# Load 48-organism OGs
og_path = os.path.join(PROJECT_DIR, "..", "essential_genome", "data", "all_ortholog_groups.csv")
fb_ogs = pd.read_csv(og_path)
print(f"FB ortholog groups: {len(fb_ogs):,} rows, {fb_ogs['OG_id'].nunique():,} OGs")

# Hard-coded phylum for 48 FB organisms (from GTDB taxonomy)
ORG_PHYLUM = {
    # Pseudomonadota (Proteobacteria) — 37 organisms
    "Keio": "Pseudomonadota", "Koxy": "Pseudomonadota", "MR1": "Pseudomonadota",
    "ANA3": "Pseudomonadota", "PV4": "Pseudomonadota", "SB2B": "Pseudomonadota",
    "Marino": "Pseudomonadota", "DvH": "Pseudomonadota", "Miya": "Pseudomonadota",
    "Putida": "Pseudomonadota", "pseudo5_N2C3_1": "Pseudomonadota",
    "pseudo1_N1B4": "Pseudomonadota", "pseudo3_N2E3": "Pseudomonadota",
    "pseudo6_N2E2": "Pseudomonadota", "pseudo13_GW456_L13": "Pseudomonadota",
    "psRCH2": "Pseudomonadota", "SyringaeB728a": "Pseudomonadota",
    "SyringaeB728a_mexBdelta": "Pseudomonadota", "WCS417": "Pseudomonadota",
    "acidovorax_3H11": "Pseudomonadota", "BFirm": "Pseudomonadota",
    "Burk376": "Pseudomonadota", "Cup4G11": "Pseudomonadota",
    "RalstoniaBSBF1503": "Pseudomonadota", "RalstoniaGMI1000": "Pseudomonadota",
    "RalstoniaPSI07": "Pseudomonadota", "RalstoniaUW163": "Pseudomonadota",
    "HerbieS": "Pseudomonadota", "azobra": "Pseudomonadota",
    "Magneto": "Pseudomonadota", "Dino": "Pseudomonadota",
    "PS": "Pseudomonadota", "Smeli": "Pseudomonadota",
    "Caulo": "Pseudomonadota", "Phaeo": "Pseudomonadota",
    "Dda3937": "Pseudomonadota", "Ddia6719": "Pseudomonadota",
    "DdiaME23": "Pseudomonadota", "Dyella79": "Pseudomonadota",
    # Bacteroidota — 4 organisms
    "Btheta": "Bacteroidota", "Pedo557": "Bacteroidota",
    "Ponti": "Bacteroidota", "Cola": "Bacteroidota",
    # Cyanobacteriota — 1 organism
    "SynE": "Cyanobacteriota",
    # Euryarchaeota — 2 organisms  
    "Methanococcus_JJ": "Halobacteriota", "Methanococcus_S2": "Halobacteriota",
    # Spirochaetota — 0 in FB, others:
    "Korea": "Pseudomonadota",  # Sphingomonas
    "Kang": "Pseudomonadota",   # Kangiella
}

# Count organisms per OG with phylum info
fb_og_orgs = fb_ogs.merge(
    pd.DataFrame(list(ORG_PHYLUM.items()), columns=["orgId", "phylum"]),
    on="orgId", how="left"
)
fb_og_summary = fb_og_orgs.groupby("OG_id").agg(
    n_fb_orgs=("orgId", "nunique"),
    n_fb_phyla=("phylum", "nunique"),
).reset_index()
print(f"FB OG summary: {len(fb_og_summary):,} OGs")
print(f"Multi-phylum FB OGs: {(fb_og_summary['n_fb_phyla'] > 1).sum()}")

FB ortholog groups: 179,237 rows, 17,222 OGs
FB OG summary: 17,222 OGs
Multi-phylum FB OGs: 3109

# Build gene-level taxonomic tier assignment
# Map dark genes → root_og → pangenome tier
# For genes without root_og, try OG_id propagation, then FB OG fallback

# Step 1: root_og mapping via gene_cluster_id
gc_to_og = phylo[["gene_cluster_id", "root_og"]].drop_duplicates("gene_cluster_id")
dark_with_og = dark.merge(gc_to_og, on="gene_cluster_id", how="left")
print(f"Dark genes with root_og (direct): {dark_with_og['root_og'].notna().sum():,} / {len(dark):,}")

# Step 1b: OG_id propagation — for genes with OG_id but no root_og,
# propagate root_og from other genes in the same OG
needs_prop = dark_with_og["root_og"].isna() & dark_with_og["OG_id"].notna()
if os.path.exists(os.path.join(DATA_DIR, "og_id_root_propagation.tsv")):
    og_prop_map = pd.read_csv(os.path.join(DATA_DIR, "og_id_root_propagation.tsv"), sep="\t")
    prop_merge = dark_with_og.loc[needs_prop, ["OG_id"]].merge(og_prop_map, on="OG_id", how="left")
    n_prop = prop_merge["propagated_root_og"].notna().sum()
    dark_with_og.loc[needs_prop, "root_og"] = prop_merge["propagated_root_og"].values
    print(f"OG_id propagation: {n_prop:,} genes recovered")
else:
    print("No OG_id propagation file found, skipping")
print(f"Dark genes with root_og (after propagation): {dark_with_og['root_og'].notna().sum():,} / {len(dark):,}")

# Step 2: merge pangenome tier
og_tier = pg_dist[["root_og", "taxonomic_tier", "n_species", "n_phyla", "n_classes",
                     "n_orders", "n_families", "n_genera"]].copy()
dark_with_tier = dark_with_og.merge(og_tier, on="root_og", how="left")
print(f"Dark genes with pangenome tier: {dark_with_tier['taxonomic_tier'].notna().sum():,}")

# Step 3: Fallback for genes without pangenome tier — use FB OG
no_tier = dark_with_tier["taxonomic_tier"].isna()
print(f"Dark genes needing FB fallback: {no_tier.sum():,}")

# Map FB OG_id → tier (simplified: phylum info from 48 orgs)
def fb_tier(row, fb_og_summary):
    og_id = row.get("OG_id")
    if pd.isna(og_id) or og_id == "":
        return "species", 1, 1  # singleton
    match = fb_og_summary[fb_og_summary["OG_id"] == og_id]
    if len(match) == 0:
        return "species", 1, 1
    n_orgs = match.iloc[0]["n_fb_orgs"]
    n_phyla = match.iloc[0]["n_fb_phyla"]
    if n_phyla > 1:
        return "kingdom", n_orgs, n_phyla
    elif n_orgs > 1:
        return "genus", n_orgs, 1  # within one phylum but can't resolve further from 48 orgs
    else:
        return "species", 1, 1

# Apply fallback
fb_results = dark_with_tier[no_tier].apply(lambda r: fb_tier(r, fb_og_summary), axis=1, result_type="expand")
fb_results.columns = ["taxonomic_tier_fb", "n_species_fb", "n_phyla_fb"]

dark_with_tier.loc[no_tier, "taxonomic_tier"] = fb_results["taxonomic_tier_fb"].values
dark_with_tier.loc[no_tier, "n_species"] = fb_results["n_species_fb"].values
dark_with_tier.loc[no_tier, "n_phyla"] = fb_results["n_phyla_fb"].values

# Mark tier sources
dark_with_tier["tier_source"] = "pangenome"
dark_with_tier.loc[no_tier, "tier_source"] = "FB-only"

print(f"\nFinal tier distribution (all {len(dark_with_tier):,} dark genes):")
tier_dist = dark_with_tier["taxonomic_tier"].value_counts()
for tier, n in tier_dist.items():
    print(f"  {tier:12s}: {n:6,} ({100*n/len(dark_with_tier):5.1f}%)")
print(f"\nTier source: {dark_with_tier['tier_source'].value_counts().to_dict()}")

Dark genes with root_og (direct): 32,791 / 57,011

OG_id propagation: 5,206 genes recovered
Dark genes with root_og (after propagation): 37,997 / 57,011
Dark genes with pangenome tier: 37,997
Dark genes needing FB fallback: 19,014

Final tier distribution (all 57,011 dark genes):
  kingdom     : 29,330 ( 51.4%)
  species     : 17,005 ( 29.8%)
  genus       :  3,547 (  6.2%)
  class       :  2,262 (  4.0%)
  family      :  1,992 (  3.5%)
  mobile      :  1,062 (  1.9%)
  phylum      :  1,059 (  1.9%)
  order       :    754 (  1.3%)

Tier source: {'pangenome': 37997, 'FB-only': 19014}

# Prepare evidence indicators for hypothesis classification

# 1. Module prediction with EC/enzyme function
dark_with_tier["has_ec_prediction"] = dark_with_tier["module_prediction"].apply(
    lambda x: bool(re.search(r"EC\s*[\d\.]+|ase\b|synthase|transferase|kinase|reductase|oxidase|hydrolase|lyase|ligase|isomerase", 
                              str(x), re.IGNORECASE)) if pd.notna(x) else False
)

# 2. High concordance (>0.7)
high_conc_ogs = set(concordance[concordance["concordance_frac"] > 0.7]["ogId"].astype(str).values)
dark_with_tier["has_high_concordance"] = dark_with_tier["OG_id"].astype(str).isin(high_conc_ogs)

# 3. GapMind matches by confidence
gapmind_gene = gapmind.groupby(["orgId", "locusId"]).agg(
    best_confidence=("confidence_tier", "first"),  # already sorted by confidence
    n_gapmind_matches=("confidence_tier", "size"),
).reset_index()
# Mark high-confidence GapMind
gapmind_high = set(
    gapmind[gapmind["confidence_tier"] == "High"]
    .apply(lambda r: f"{r['orgId']}_{r['locusId']}", axis=1)
)
gapmind_any = set(
    gapmind.apply(lambda r: f"{r['orgId']}_{r['locusId']}", axis=1)
)
dark_with_tier["gene_key"] = dark_with_tier["orgId"] + "_" + dark_with_tier["locusId"].astype(str)
dark_with_tier["has_gapmind_high"] = dark_with_tier["gene_key"].isin(gapmind_high)
dark_with_tier["has_gapmind_any"] = dark_with_tier["gene_key"].isin(gapmind_any)

# 4. Domain analysis: named vs DUF-only
def is_duf_only(domain_names):
    if pd.isna(domain_names) or str(domain_names).strip() in ("", "nan"):
        return False  # no domains = not DUF-only
    names = str(domain_names).split(";")
    return all(re.match(r"^(DUF|UPF)\d+$", n.strip()) for n in names if n.strip())

def has_named_domain(domain_names):
    if pd.isna(domain_names) or str(domain_names).strip() in ("", "nan"):
        return False
    names = str(domain_names).split(";")
    return any(not re.match(r"^(DUF|UPF)\d+$", n.strip()) for n in names if n.strip())

dark_with_tier["is_duf_only"] = dark_with_tier["domain_names"].apply(is_duf_only)
dark_with_tier["has_named_domain"] = dark_with_tier["domain_names"].apply(has_named_domain)

# 5. Strong fitness
dark_with_tier["has_strong_fitness"] = dark_with_tier["max_abs_fit"].fillna(0) >= 2

# Summary
print("Evidence indicators:")
for col in ["has_ec_prediction", "has_high_concordance", "has_gapmind_high", "has_gapmind_any",
            "is_duf_only", "has_named_domain", "has_strong_fitness"]:
    print(f"  {col}: {dark_with_tier[col].sum():,}")

Evidence indicators:
  has_ec_prediction: 335
  has_high_concordance: 0
  has_gapmind_high: 442
  has_gapmind_any: 3,186
  is_duf_only: 8,578
  has_named_domain: 19,292
  has_strong_fitness: 7,787

def classify_hypothesis_status(row):
    """Three-tier hypothesis classification.
    
    Strong testable hypothesis: clear experimental path to function
    Weak lead: some evidence but not actionable without more work
    True knowledge gap: zero functional evidence
    """
    # Strong testable hypothesis (any of):
    if row["has_ec_prediction"]:
        return "strong_hypothesis"
    if row["has_high_concordance"]:
        return "strong_hypothesis"
    if row["has_gapmind_high"]:
        return "strong_hypothesis"
    if row["has_named_domain"] and row["has_strong_fitness"]:
        return "strong_hypothesis"
    
    # Weak lead (any of):
    if row["is_duf_only"]:
        return "weak_lead"
    if pd.notna(row["module_prediction"]) and str(row["module_prediction"]).strip() not in ("", "nan"):
        return "weak_lead"  # bare module prediction (PFam/TIGR/KEGG ID)
    if row["has_gapmind_any"]:
        return "weak_lead"
    if row["has_strong_fitness"]:
        return "weak_lead"  # fitness without annotation
    if row["has_named_domain"]:
        return "weak_lead"  # domain without fitness
    
    # True knowledge gap
    return "true_knowledge_gap"

dark_with_tier["hypothesis_status"] = dark_with_tier.apply(classify_hypothesis_status, axis=1)

print("Hypothesis status distribution:")
hyp_counts = dark_with_tier["hypothesis_status"].value_counts()
for status, n in hyp_counts.items():
    print(f"  {status:25s}: {n:6,} ({100*n/len(dark_with_tier):5.1f}%)")

# Cross-tabulation: tier × hypothesis
ct = pd.crosstab(dark_with_tier["taxonomic_tier"], dark_with_tier["hypothesis_status"])
print(f"\nTier × Hypothesis cross-tabulation:")
print(ct)

Hypothesis status distribution:
  weak_lead                : 29,926 ( 52.5%)
  true_knowledge_gap       : 23,654 ( 41.5%)
  strong_hypothesis        :  3,431 (  6.0%)

Tier × Hypothesis cross-tabulation:
hypothesis_status  strong_hypothesis  true_knowledge_gap  weak_lead
taxonomic_tier                                                     
class                             50                1250        962
family                            40                1339        613
genus                             79                2300       1168
kingdom                         2889                5260      21181
mobile                            22                 653        387
order                             15                 423        316
phylum                            22                 580        457
species                          314               11849       4842

# Conservation score: tier-adjusted + log-scaled species count
TIER_SCORE = {
    "kingdom": 7, "phylum": 6, "class": 5, "order": 4,
    "family": 3, "genus": 2, "species": 1, "mobile": 4  # mobile = interesting
}

# Ignorance score
IGNORANCE_SCORE = {
    "true_knowledge_gap": 3,
    "weak_lead": 2,
    "strong_hypothesis": 1
}

MAX_SPECIES = 27690  # pangenome total

def compute_importance(row):
    tier = row["taxonomic_tier"]
    n_sp = max(row.get("n_species", 1) or 1, 1)
    hypothesis = row["hypothesis_status"]
    
    tier_score = TIER_SCORE.get(tier, 1)
    # Within-tier modulation by log2(n_species) normalized to 0-1
    species_bonus = np.log2(n_sp) / np.log2(MAX_SPECIES) if n_sp > 1 else 0
    conservation = tier_score + species_bonus
    
    ignorance = IGNORANCE_SCORE.get(hypothesis, 1)
    
    return conservation * ignorance

dark_with_tier["conservation_score"] = dark_with_tier.apply(
    lambda r: TIER_SCORE.get(r["taxonomic_tier"], 1) + 
              (np.log2(max(r.get("n_species", 1) or 1, 1)) / np.log2(MAX_SPECIES) if max(r.get("n_species", 1) or 1, 1) > 1 else 0),
    axis=1
)
dark_with_tier["ignorance_score"] = dark_with_tier["hypothesis_status"].map(IGNORANCE_SCORE)
dark_with_tier["importance_score"] = dark_with_tier["conservation_score"] * dark_with_tier["ignorance_score"]

print("Importance score distribution:")
print(dark_with_tier["importance_score"].describe())
print(f"\nTop 10 importance scores:")
print(dark_with_tier.nlargest(10, "importance_score")[["orgId", "locusId", "taxonomic_tier", 
    "hypothesis_status", "conservation_score", "ignorance_score", "importance_score", "desc"]])

Importance score distribution:
count    57011.000000
mean        10.986832
std          6.781338
min          1.000000
25%          3.000000
50%         13.273989
75%         15.793883
max         23.997244
Name: importance_score, dtype: float64

Top 10 importance scores:
         orgId          locusId taxonomic_tier   hypothesis_status  \
8325   Burk376  H281DRAFT_00725        kingdom  true_knowledge_gap   
3261    azobra    AZOBR_RS15290        kingdom  true_knowledge_gap   
51639      PV4          5207961        kingdom  true_knowledge_gap   
1519      ANA3          7024652        kingdom  true_knowledge_gap   
3734    azobra    AZOBR_RS22455        kingdom  true_knowledge_gap   
4233    azobra    AZOBR_RS28995        kingdom  true_knowledge_gap   
5108    Btheta           350703        kingdom  true_knowledge_gap   
5109    Btheta           350706        kingdom  true_knowledge_gap   
6381    Btheta           353809        kingdom  true_knowledge_gap   
25501      DvH           208722        kingdom  true_knowledge_gap   

       conservation_score  ignorance_score  importance_score  \
8325             7.999081                3         23.997244   
3261             7.999067                3         23.997201   
51639            7.998771                3         23.996312   
1519             7.998538                3         23.995614   
3734             7.997855                3         23.993564   
4233             7.997855                3         23.993564   
5108             7.997855                3         23.993564   
5109             7.997855                3         23.993564   
6381             7.997855                3         23.993564   
25501            7.997855                3         23.993564   

                                                 desc  
8325                             hypothetical protein  
3261                             hypothetical protein  
51639                   hypothetical protein (RefSeq)  
1519                    hypothetical protein (RefSeq)  
3734                             hypothetical protein  
4233                             hypothetical protein  
5108             hypothetical protein (NCBI ptt file)  
5109   conserved hypothetical protein (NCBI ptt file)  
6381             hypothetical protein (NCBI ptt file)  
25501                     hypothetical protein (TIGR)

# Compute OG-level importance ranking
# For each root_og, take the max importance score and aggregate evidence
og_importance = dark_with_tier[dark_with_tier["root_og"].notna()].groupby("root_og").agg(
    n_genes=("locusId", "nunique"),
    n_organisms=("orgId", "nunique"),
    mean_importance=("importance_score", "mean"),
    max_importance=("importance_score", "max"),
    taxonomic_tier=("taxonomic_tier", "first"),
    hypothesis_status=("hypothesis_status", lambda x: x.value_counts().index[0]),  # mode
    n_species=("n_species", "first"),
    n_phyla=("n_phyla", "first"),
    mean_fitness=("max_abs_fit", lambda x: x.fillna(0).mean()),
    any_strong_fitness=("has_strong_fitness", "any"),
    any_named_domain=("has_named_domain", "any"),
    any_gapmind=("has_gapmind_any", "any"),
).reset_index()
og_importance = og_importance.sort_values("max_importance", ascending=False)

print(f"OG importance ranking: {len(og_importance):,} root_ogs")
print(f"\nTop 20 most important OGs (conservation × ignorance):")
print(og_importance.head(20)[["root_og", "n_genes", "n_organisms", "taxonomic_tier",
    "hypothesis_status", "n_species", "n_phyla", "max_importance"]].to_string(index=False))

# Save OG importance ranking
og_importance.to_csv(os.path.join(DATA_DIR, "og_importance_ranked.tsv"), sep="	", index=False)
print(f"\nSaved to data/og_importance_ranked.tsv")

OG importance ranking: 11,774 root_ogs

Top 20 most important OGs (conservation × ignorance):
root_og  n_genes  n_organisms taxonomic_tier  hypothesis_status  n_species  n_phyla  max_importance
COG0172        4            4        kingdom          weak_lead    27431.0    142.0       23.997244
COG0468        1            1        kingdom true_knowledge_gap    27427.0    142.0       23.997201
COG0484        4            2        kingdom          weak_lead    27344.0    139.0       23.996312
COG0443        2            1        kingdom true_knowledge_gap    27279.0    139.0       23.995614
COG0438       42           20        kingdom          weak_lead    27089.0    142.0       23.993564
COG0358       16           10        kingdom          weak_lead    27081.0    141.0       23.993478
COG0210        9            9        kingdom          weak_lead    26911.0    138.0       23.991631
COG0681        3            3        kingdom true_knowledge_gap    26736.0    133.0       23.989717
COG0563        7            7        kingdom          weak_lead    26732.0    136.0       23.989673
COG0454       20           12        kingdom          weak_lead    26688.0    138.0       23.989190
COG0456       49           24        kingdom          weak_lead    26620.0    140.0       23.988442
COG0745       18           11        kingdom          weak_lead    26612.0    137.0       23.988354
COG4974       11            9        kingdom          weak_lead    26475.0    138.0       23.986840
COG0272        1            1        kingdom true_knowledge_gap    26422.0    129.0       23.986252
COG0465        1            1        kingdom true_knowledge_gap    26332.0    131.0       23.985252
COG0697      150           28        kingdom          weak_lead    26219.0    130.0       23.983990
COG3118       12           10        kingdom          weak_lead    26194.0    137.0       23.983710
COG0593        5            4        kingdom          weak_lead    26136.0    131.0       23.983060
COG0707        3            3        kingdom true_knowledge_gap    26123.0    136.0       23.982914
COG1196       50           30        kingdom          weak_lead    25928.0    137.0       23.980717

Saved to data/og_importance_ranked.tsv

# Build evidence summary string
def evidence_summary(row):
    parts = []
    if row["has_ec_prediction"]:
        parts.append("EC-prediction")
    if row["has_high_concordance"]:
        parts.append("high-concordance")
    if row["has_gapmind_high"]:
        parts.append("GapMind-high")
    elif row["has_gapmind_any"]:
        parts.append("GapMind-low")
    if row["has_named_domain"]:
        parts.append("named-domain")
    elif row["is_duf_only"]:
        parts.append("DUF-only")
    if row["has_strong_fitness"]:
        parts.append("strong-fitness")
    if pd.notna(row["module_prediction"]) and str(row["module_prediction"]).strip() not in ("", "nan"):
        if not row["has_ec_prediction"]:
            parts.append("module-bare")
    return "; ".join(parts) if parts else "none"

dark_with_tier["evidence_summary"] = dark_with_tier.apply(evidence_summary, axis=1)

# Save per-gene classification
gene_classes = dark_with_tier[["orgId", "locusId", "gene_cluster_id", "root_og", "OG_id",
    "taxonomic_tier", "tier_source", "hypothesis_status",
    "conservation_score", "ignorance_score", "importance_score",
    "n_species", "n_phyla", "evidence_summary"]].copy()
gene_classes.to_csv(os.path.join(DATA_DIR, "dark_gene_classes.tsv"), sep="	", index=False)
print(f"Saved {len(gene_classes):,} rows to data/dark_gene_classes.tsv")

Saved 57,011 rows to data/dark_gene_classes.tsv

# Setup: tractability scores and genus mapping

CRISPRI_TRACTABILITY = {
    "Keio": 0.9, "BW25113": 0.9, "Deshi": 0.85,
    "MR1": 0.8, "SB2B": 0.7,
    "Putida": 0.8, "pseudo5_N2C3_1": 0.75,
    "pseudo1_N1B4": 0.7, "pseudo3_N2E2": 0.7,
    "pseudo13_GW456_L13": 0.65,
    "Pse": 0.7, "Pseu": 0.7,
    "BT": 0.75, "Bacteroides_theta": 0.75,
    "Smeli": 0.6, "Koxy": 0.65,
    "Marino": 0.5, "DvH": 0.5,
    "Geobacter": 0.45, "Caulobacter": 0.6,
    "Synpcc7942": 0.55, "Synpcc6803": 0.55,
    "psRCH2": 0.55, "Rleg": 0.5,
}
DEFAULT_CRISPRI = 0.3

# Genus mapping
org_map_path = os.path.join(PROJECT_DIR, "..", "conservation_vs_fitness", "data", "organism_mapping.tsv")
org_map = pd.read_csv(org_map_path, sep="	")
ORG_GENUS = dict(zip(org_map["orgId"], org_map["genus"]))
# Manual fallbacks for organisms not in mapping
ORG_GENUS.setdefault("SB2B", "Shewanella")
ORG_GENUS.setdefault("Magneto", "Magnetospirillum")
ORG_GENUS.setdefault("Cola", "Echinicola")
ORG_GENUS.setdefault("Kang", "Kangiella")

print(f"Tractability scores for {len(CRISPRI_TRACTABILITY)} organisms")
print(f"Genus mapping for {len(ORG_GENUS)} organisms")

Tractability scores for 24 organisms
Genus mapping for 48 organisms

# Define universe: high-importance OGs (top quartile OR kingdom/phylum-level gaps/leads)
importance_threshold = dark_with_tier["importance_score"].quantile(0.75)
print(f"Importance threshold (75th percentile): {importance_threshold:.2f}")

# Universe: genes above threshold
universe = dark_with_tier[
    (dark_with_tier["importance_score"] >= importance_threshold) |
    ((dark_with_tier["taxonomic_tier"].isin(["kingdom", "phylum"])) & 
     (dark_with_tier["hypothesis_status"].isin(["true_knowledge_gap", "weak_lead"])))
].copy()
print(f"Universe: {len(universe):,} dark genes (from {dark_with_tier['orgId'].nunique()} organisms)")
print(f"Universe organisms: {universe['orgId'].nunique()}")

# Ensure gene_key is set
universe["gene_key"] = universe["orgId"] + "_" + universe["locusId"].astype(str)

# Build org → gene_key sets
org_sets = {}
for org, grp in universe.groupby("orgId"):
    org_sets[org] = set(grp["gene_key"].values)

print(f"Organisms with universe genes: {len(org_sets)}")
for org in sorted(org_sets, key=lambda o: -len(org_sets[o]))[:10]:
    print(f"  {org}: {len(org_sets[org]):,} genes")

Importance threshold (75th percentile): 15.79
Universe: 28,584 dark genes (from 48 organisms)
Universe organisms: 48
Organisms with universe genes: 48
  Smeli: 1,630 genes
  Btheta: 1,382 genes
  Putida: 1,043 genes
  Ponti: 943 genes
  Cup4G11: 912 genes
  SyringaeB728a_mexBdelta: 882 genes
  SyringaeB728a: 833 genes
  BFirm: 809 genes
  MR1: 805 genes
  pseudo3_N2E3: 789 genes

# Greedy weighted set-cover
gene_priority = dict(zip(universe["gene_key"], universe["importance_score"]))
total_priority = sum(gene_priority.values())
target_coverage = 0.95 * total_priority

uncovered = set(universe["gene_key"].values)
covered_priority = 0.0
selected_genera = set()
covering_sequence = []

step = 0
while covered_priority < target_coverage and uncovered:
    best_org = None
    best_value = -1
    best_uncovered_genes = []
    best_sum_priority = 0

    for org, gene_set in org_sets.items():
        org_uncovered = [g for g in gene_set if g in uncovered]
        if not org_uncovered:
            continue
        
        sum_priority = sum(gene_priority[g] for g in org_uncovered)
        tractability = CRISPRI_TRACTABILITY.get(org, DEFAULT_CRISPRI)
        genus = ORG_GENUS.get(org, org)
        phylo_bonus = 0.5 if genus in selected_genera else 1.0
        
        value = sum_priority * tractability * phylo_bonus
        
        if value > best_value:
            best_value = value
            best_org = org
            best_uncovered_genes = org_uncovered
            best_sum_priority = sum_priority

    if best_org is None:
        break

    step += 1
    genus = ORG_GENUS.get(best_org, best_org)
    tractability = CRISPRI_TRACTABILITY.get(best_org, DEFAULT_CRISPRI)
    phylo_bonus = 0.5 if genus in selected_genera else 1.0

    for g in best_uncovered_genes:
        uncovered.discard(g)
    covered_priority += best_sum_priority
    selected_genera.add(genus)

    covering_sequence.append({
        "step": step,
        "orgId": best_org,
        "genus": genus,
        "tractability": tractability,
        "n_new_genes": len(best_uncovered_genes),
        "sum_importance": best_sum_priority,
        "objective_value": best_value,
        "cumulative_genes": len(universe) - len(uncovered),
        "cumulative_importance": covered_priority,
        "pct_covered": 100 * covered_priority / total_priority,
    })

covering_df = pd.DataFrame(covering_sequence)
print(f"Set-cover completed in {step} steps")
print(f"Coverage: {covered_priority:.1f} / {total_priority:.1f} ({100*covered_priority/total_priority:.1f}%)")
print(f"Unique genera: {len(selected_genera)}")
print(f"\nCovering sequence:")
print(covering_df[["step", "orgId", "genus", "tractability", "n_new_genes", 
    "pct_covered"]].to_string(index=False))

Set-cover completed in 42 steps
Coverage: 465067.0 / 486457.8 (95.6%)
Unique genera: 27

Covering sequence:
 step                   orgId            genus  tractability  n_new_genes  pct_covered
    1                   Smeli    Sinorhizobium          0.60         1630     5.585947
    2                  Putida      Pseudomonas          0.80         1043     9.207355
    3                     MR1       Shewanella          0.80          805    11.953610
    4                  Btheta      Bacteroides          0.30         1382    16.785541
    5                    Koxy       Klebsiella          0.65          568    18.692944
    6                    Keio      Escherichia          0.90          403    19.991505
    7                  Marino     Marinobacter          0.50          576    22.062402
    8                   Ponti      Pontibacter          0.30          943    25.329160
    9          pseudo5_N2C3_1      Pseudomonas          0.75          752    27.936467
   10                 Cup4G11      Cupriavidus          0.30          912    31.113805
   11                   BFirm     Burkholderia          0.30          809    33.984482
   12                     DvH    Desulfovibrio          0.50          440    35.547638
   13                    Dino  Dinoroseobacter          0.30          725    38.057620
   14                  azobra     Azospirillum          0.30          702    40.525789
   15            pseudo1_N1B4      Pseudomonas          0.70          594    42.621356
   16        RalstoniaGMI1000        Ralstonia          0.30          664    45.055333
   17                   Korea     Sphingomonas          0.30          637    47.371743
   18                    SB2B       Shewanella          0.70          576    49.324811
   19                 Burk376 Paraburkholderia          0.30          597    51.448946
   20                    SynE    Synechococcus          0.30          622    53.566320
   21                 Pedo557       Pedobacter          0.30          615    55.663890
   22                  psRCH2      Pseudomonas          0.55          622    57.829709
   23                   Phaeo      Phaeobacter          0.30          502    59.592594
   24                 HerbieS   Herbaspirillum          0.30          487    61.309991
   25      pseudo13_GW456_L13      Pseudomonas          0.65          455    62.876235
   26         acidovorax_3H11       Acidovorax          0.30          467    64.533552
   27                      PS     Dechlorosoma          0.30          445    66.150215
   28 SyringaeB728a_mexBdelta      Pseudomonas          0.30          882    69.235661
   29           SyringaeB728a      Pseudomonas          0.30          833    72.127633
   30            pseudo3_N2E3      Pseudomonas          0.30          789    74.870362
   31                    Cola       Echinicola          0.30          397    76.229986
   32                    ANA3       Shewanella          0.30          712    78.683078
   33                DdiaME23          Dickeya          0.30          334    79.890892
   34                   Caulo      Caulobacter          0.30          319    81.025880
   35          RalstoniaUW163        Ralstonia          0.30          596    83.216749
   36       RalstoniaBSBF1503        Ralstonia          0.30          586    85.364217
   37                     PV4       Shewanella          0.30          614    87.492135
   38                Dyella79           Dyella          0.30          310    88.549033
   39                  WCS417      Pseudomonas          0.30          598    90.659222
   40          RalstoniaPSI07        Ralstonia          0.30          564    92.724755
   41            pseudo6_N2E2      Pseudomonas          0.30          586    94.780169
   42                 Magneto Magnetospirillum          0.30          232    95.602746

# Assign genes to organisms in covering set order
gene_assignment = {}
for _, step_row in covering_df.iterrows():
    org = step_row["orgId"]
    for g in org_sets[org]:
        if g not in gene_assignment:
            gene_assignment[g] = org

# Add assignment to universe
universe["assigned_organism"] = universe["gene_key"].map(gene_assignment)
assigned = universe[universe["assigned_organism"].notna()].copy()
print(f"Genes assigned: {len(assigned):,} / {len(universe):,}")

# Per-organism breakdown by tier × hypothesis
for _, step_row in covering_df.head(10).iterrows():
    org = step_row["orgId"]
    org_genes = assigned[assigned["assigned_organism"] == org]
    ct = pd.crosstab(org_genes["taxonomic_tier"], org_genes["hypothesis_status"])
    print(f"\n{org} ({step_row['genus']}, step {int(step_row['step'])}): {len(org_genes)} genes")
    print(ct.to_string())

# Save covering set
covering_df.to_csv(os.path.join(DATA_DIR, "conservation_covering_set.tsv"), sep="	", index=False)
print(f"\nSaved covering set to data/conservation_covering_set.tsv")

Genes assigned: 27,325 / 28,584

Smeli (Sinorhizobium, step 1): 1630 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                              31          0
kingdom                           195       1354
phylum                             29         21

Putida (Pseudomonas, step 2): 1043 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                              50          0
kingdom                           172        797
phylum                             11         13

MR1 (Shewanella, step 3): 805 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                              26          0
kingdom                           105        657
phylum                              8          9

Btheta (Bacteroides, step 4): 1382 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                               6          0
kingdom                           267       1109

Koxy (Klebsiella, step 5): 568 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                               6          0
kingdom                            65        489
phylum                              1          7

Keio (Escherichia, step 6): 403 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                               2          0
kingdom                             9        388
phylum                              0          4

Marino (Marinobacter, step 7): 576 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                              16          0
kingdom                           143        405
phylum                             10          2

Ponti (Pontibacter, step 8): 943 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                              26          0
kingdom                           151        742
phylum                             10         14

pseudo5_N2C3_1 (Pseudomonas, step 9): 752 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                              45          0
kingdom                           118        568
phylum                             14          7

Cup4G11 (Cupriavidus, step 10): 912 genes
hypothesis_status  true_knowledge_gap  weak_lead
taxonomic_tier                                  
class                              25          0
kingdom                           153        695
phylum                             21         18

Saved covering set to data/conservation_covering_set.tsv

# Condition recommendation from NB09
KEYWORD_TO_CONDITION = {
    "transport": "carbon/nitrogen source panel",
    "permease": "carbon/nitrogen source panel",
    "iron": "iron limitation/excess",
    "sulfur": "sulfur source panel",
    "nitrogen": "nitrogen source panel",
    "flagell": "motility/soft agar",
    "chemotaxis": "motility/soft agar",
    "membrane": "membrane stress (SDS, EDTA)",
    "oxidase": "oxidative stress (H2O2, paraquat)",
    "reductase": "electron acceptor panel",
    "kinase": "signaling conditions (osmotic, pH)",
    "regulator": "regulatory screen (stress panel)",
    "ribosom": "translation stress (antibiotics)",
    "protease": "protein stress (heat, ethanol)",
    "secretion": "secretion assays",
    "atp": "energy metabolism panel",
    "nad": "redox conditions panel",
    "phospho": "phosphate limitation",
    "dehydrogenase": "electron donor panel",
    "synthase": "biosynthesis conditions",
}

def recommend_experiment(row):
    """Recommend experiment type based on available evidence."""
    org = row["orgId"]
    has_crispri = org in CRISPRI_TRACTABILITY
    is_essential = str(row.get("essentiality_class", "")) == "essential_all"
    has_fitness = row.get("has_strong_fitness", False)
    hypothesis = row["hypothesis_status"]
    
    if is_essential and has_crispri:
        return "CRISPRi knockdown"
    elif has_fitness:
        return "condition-specific TnSeq"
    elif hypothesis == "true_knowledge_gap":
        return "broad phenotypic screen"
    else:
        return "targeted TnSeq"

assigned["recommended_experiment"] = assigned.apply(recommend_experiment, axis=1)

# Recommended condition (from fitness data or keyword inference)
def recommend_condition(row):
    top_cond = str(row.get("top_condition_class", ""))
    if top_cond and top_cond not in ("", "nan", "None"):
        return top_cond
    desc = str(row.get("desc", "")).lower()
    for kw, cond in KEYWORD_TO_CONDITION.items():
        if kw in desc:
            return cond
    pred = str(row.get("module_prediction", ""))
    if pred and pred not in ("", "nan"):
        return f"module: {pred[:40]}"
    return "broad screen"

assigned["recommended_condition"] = assigned.apply(recommend_condition, axis=1)
print(f"Experiment types: {assigned['recommended_experiment'].value_counts().to_dict()}")
print(f"Top conditions: {assigned['recommended_condition'].value_counts().head(10).to_dict()}")

Experiment types: {'targeted TnSeq': 18906, 'broad phenotypic screen': 6622, 'condition-specific TnSeq': 1797}
Top conditions: {'broad screen': 20806, 'stress': 2747, 'carbon source': 1114, 'nitrogen source': 514, 'membrane stress (SDS, EDTA)': 315, 'motility': 195, 'pH': 73, 'carbon/nitrogen source panel': 46, 'in planta': 46, 'anaerobic': 42}

# Build per-organism experiment plan
experiment_plans = []

for _, step_row in covering_df.iterrows():
    org = step_row["orgId"]
    org_genes = assigned[assigned["assigned_organism"] == org]
    if len(org_genes) == 0:
        continue
    
    n_total = len(org_genes)
    tractability = CRISPRI_TRACTABILITY.get(org, DEFAULT_CRISPRI)
    
    # Tier × hypothesis counts
    tier_hyp = pd.crosstab(org_genes["taxonomic_tier"], org_genes["hypothesis_status"])
    
    # Key counts
    n_kingdom_gaps = tier_hyp.loc["kingdom", "true_knowledge_gap"] if "kingdom" in tier_hyp.index and "true_knowledge_gap" in tier_hyp.columns else 0
    n_phylum_gaps = tier_hyp.loc["phylum", "true_knowledge_gap"] if "phylum" in tier_hyp.index and "true_knowledge_gap" in tier_hyp.columns else 0
    n_kingdom_leads = tier_hyp.loc["kingdom", "weak_lead"] if "kingdom" in tier_hyp.index and "weak_lead" in tier_hyp.columns else 0
    n_phylum_leads = tier_hyp.loc["phylum", "weak_lead"] if "phylum" in tier_hyp.index and "weak_lead" in tier_hyp.columns else 0
    
    # Expected novel functions: true_knowledge_gap + weak_lead counts
    n_novel = org_genes[org_genes["hypothesis_status"].isin(["true_knowledge_gap", "weak_lead"])].shape[0]
    
    # Experiment summary
    exp_types = org_genes["recommended_experiment"].value_counts()
    exp_str = " + ".join(exp_types.index[:3])
    
    # Top conditions
    top_conds = org_genes["recommended_condition"].value_counts().head(3)
    cond_str = "; ".join(f"{c}({n})" for c, n in top_conds.items())
    
    experiment_plans.append({
        "priority": int(step_row["step"]),
        "orgId": org,
        "genus": step_row["genus"],
        "tractability": tractability,
        "n_OGs_covered": n_total,
        "n_kingdom_gaps": int(n_kingdom_gaps),
        "n_phylum_gaps": int(n_phylum_gaps),
        "n_kingdom_leads": int(n_kingdom_leads),
        "n_phylum_leads": int(n_phylum_leads),
        "expected_novel": n_novel,
        "experiment_type": exp_str,
        "top_conditions": cond_str,
        "pct_covered": step_row["pct_covered"],
    })

plan_df = pd.DataFrame(experiment_plans)
plan_df.to_csv(os.path.join(DATA_DIR, "conservation_experiment_plans.tsv"), sep="	", index=False)
print(f"Saved {len(plan_df)} organism plans to data/conservation_experiment_plans.tsv")
print(f"\nExperimental priority table:")
print(plan_df[["priority", "orgId", "tractability", "n_OGs_covered", 
    "n_kingdom_gaps", "n_phylum_gaps", "expected_novel", "experiment_type"]].head(15).to_string(index=False))

Saved 42 organism plans to data/conservation_experiment_plans.tsv

Experimental priority table:
 priority          orgId  tractability  n_OGs_covered  n_kingdom_gaps  n_phylum_gaps  expected_novel                                                     experiment_type
        1          Smeli          0.60           1630             195             29            1630 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
        2         Putida          0.80           1043             172             11            1043 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
        3            MR1          0.80            805             105              8             805 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
        4         Btheta          0.30           1382             267              0            1382 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
        5           Koxy          0.65            568              65              1             568 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
        6           Keio          0.90            403               9              0             403 targeted TnSeq + condition-specific TnSeq + broad phenotypic screen
        7         Marino          0.50            576             143             10             576 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
        8          Ponti          0.30            943             151             10             943 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
        9 pseudo5_N2C3_1          0.75            752             118             14             752 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
       10        Cup4G11          0.30            912             153             21             912 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
       11          BFirm          0.30            809             170             39             809 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
       12            DvH          0.50            440             100              8             440 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
       13           Dino          0.30            725             109             34             725 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
       14         azobra          0.30            702             126             18             702 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq
       15   pseudo1_N1B4          0.70            594             119             13             594 targeted TnSeq + broad phenotypic screen + condition-specific TnSeq

# fig32: FB organism taxonomy context — showing sparse sampling
# Phylum-colored bar chart of 48 organisms

org_phyla = pd.DataFrame(list(ORG_PHYLUM.items()), columns=["orgId", "phylum"])
phylum_counts = org_phyla["phylum"].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
colors = {"Pseudomonadota": "#1f77b4", "Bacteroidota": "#ff7f0e", 
          "Cyanobacteriota": "#2ca02c", "Halobacteriota": "#d62728"}
bars = ax.bar(phylum_counts.index, phylum_counts.values,
              color=[colors.get(p, "#999999") for p in phylum_counts.index])
ax.set_ylabel("Number of FB organisms")
ax.set_title("Fitness Browser Organism Sampling — Sparse Taxonomic Coverage\n(48 organisms dominated by Pseudomonadota)")
ax.set_xlabel("Phylum")
for bar, val in zip(bars, phylum_counts.values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
            str(val), ha="center", va="bottom", fontweight="bold")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "fig32_organism_taxonomy.png"), bbox_inches="tight")
plt.show()
print("Saved fig32_organism_taxonomy.png")

Saved fig32_organism_taxonomy.png

# fig33: Conservation tier distribution (stacked bars × hypothesis status)
tier_order = ["kingdom", "phylum", "class", "order", "family", "genus", "species", "mobile"]
hyp_order = ["strong_hypothesis", "weak_lead", "true_knowledge_gap"]
hyp_colors = {"strong_hypothesis": "#2ca02c", "weak_lead": "#ff7f0e", "true_knowledge_gap": "#d62728"}

ct = pd.crosstab(dark_with_tier["taxonomic_tier"], dark_with_tier["hypothesis_status"])
ct = ct.reindex(index=[t for t in tier_order if t in ct.index], 
                columns=[h for h in hyp_order if h in ct.columns], fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Absolute counts
ct.plot(kind="bar", stacked=True, ax=axes[0], 
        color=[hyp_colors[h] for h in ct.columns])
axes[0].set_title("Dark Gene Count by Conservation Tier × Hypothesis Status")
axes[0].set_ylabel("Number of dark genes")
axes[0].set_xlabel("Taxonomic tier")
axes[0].tick_params(axis="x", rotation=45)
axes[0].legend(title="Hypothesis status", loc="upper right")

# Normalized (%)
ct_norm = ct.div(ct.sum(axis=1), axis=0) * 100
ct_norm.plot(kind="bar", stacked=True, ax=axes[1],
             color=[hyp_colors[h] for h in ct_norm.columns])
axes[1].set_title("Proportion by Conservation Tier × Hypothesis Status")
axes[1].set_ylabel("Percentage")
axes[1].set_xlabel("Taxonomic tier")
axes[1].tick_params(axis="x", rotation=45)
axes[1].legend(title="Hypothesis status", loc="upper right")

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "fig33_conservation_tiers.png"), bbox_inches="tight")
plt.show()
print("Saved fig33_conservation_tiers.png")

Saved fig33_conservation_tiers.png

# fig34: 2D classification heatmap (tier × hypothesis, counts + percentages)
ct_full = pd.crosstab(dark_with_tier["taxonomic_tier"], dark_with_tier["hypothesis_status"])
ct_full = ct_full.reindex(index=[t for t in tier_order if t in ct_full.index],
                           columns=[h for h in hyp_order if h in ct_full.columns], fill_value=0)

# Annotations: count + percentage
total = ct_full.sum().sum()
annot = ct_full.copy().astype(str)
for i in ct_full.index:
    for j in ct_full.columns:
        v = ct_full.loc[i, j]
        pct = 100 * v / total
        annot.loc[i, j] = f"{v:,}\n({pct:.1f}%)"

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(ct_full, annot=annot, fmt="", cmap="YlOrRd", ax=ax,
            linewidths=0.5, linecolor="white",
            cbar_kws={"label": "Gene count"})
ax.set_title(f"Dark Gene Classification: Conservation Tier × Hypothesis Status\n(N = {total:,} dark genes)")
ax.set_ylabel("Conservation tier (pangenome)")
ax.set_xlabel("Hypothesis status")

# Highlight key cells (kingdom × true_knowledge_gap)
if "kingdom" in ct_full.index and "true_knowledge_gap" in ct_full.columns:
    row_idx = list(ct_full.index).index("kingdom")
    col_idx = list(ct_full.columns).index("true_knowledge_gap")
    ax.add_patch(plt.Rectangle((col_idx, row_idx), 1, 1, fill=False, 
                                edgecolor="blue", linewidth=3))

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "fig34_classification_heatmap.png"), bbox_inches="tight")
plt.show()
print("Saved fig34_classification_heatmap.png")

Saved fig34_classification_heatmap.png

# fig35: Top true knowledge gap OGs (most conserved, zero evidence)
true_gaps = og_importance[og_importance["hypothesis_status"] == "true_knowledge_gap"].head(25)
print(f"Top 25 true knowledge gap OGs (most conserved, zero evidence):")

fig, ax = plt.subplots(figsize=(14, 8))
y_pos = range(len(true_gaps))
bars = ax.barh(y_pos, true_gaps["n_species"].fillna(1).values, 
               color=["#d62728" if p > 1 else "#999999" for p in true_gaps["n_phyla"].fillna(1).values])

# Label with root_og and phyla count
labels = [f"{row['root_og']} ({int(row.get('n_phyla', 0) or 0)}p, {int(row['n_organisms'])}org)" 
          for _, row in true_gaps.iterrows()]
ax.set_yticks(y_pos)
ax.set_yticklabels(labels, fontsize=8)
ax.set_xlabel("Number of pangenome species")
ax.set_title("Top 25 True Knowledge Gap OGs — Most Conserved with Zero Functional Evidence\n(red = multi-phylum, label: root_og (phyla, FB organisms))")
ax.invert_yaxis()

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "fig35_top_knowledge_gaps.png"), bbox_inches="tight")
plt.show()
print("Saved fig35_top_knowledge_gaps.png")

Top 25 true knowledge gap OGs (most conserved, zero evidence):

Saved fig35_top_knowledge_gaps.png

# fig36: Covering set optimization curve
fig, ax1 = plt.subplots(figsize=(12, 6))

# Cumulative coverage
color1 = "#1f77b4"
ax1.plot(covering_df["step"], covering_df["pct_covered"], "o-", color=color1, linewidth=2)
ax1.set_xlabel("Organism added (step)")
ax1.set_ylabel("Cumulative coverage (%)", color=color1)
ax1.tick_params(axis="y", labelcolor=color1)
ax1.axhline(y=95, color=color1, linestyle="--", alpha=0.5, label="95% target")

# Marginal genes per step (right axis)
ax2 = ax1.twinx()
color2 = "#ff7f0e"
bars = ax2.bar(covering_df["step"], covering_df["n_new_genes"], alpha=0.4, color=color2)

# Color bars by tractability
for bar, tract in zip(bars, covering_df["tractability"]):
    bar.set_alpha(0.3 + 0.7 * tract)

ax2.set_ylabel("New genes covered (per step)", color=color2)
ax2.tick_params(axis="y", labelcolor=color2)

# Label top organisms
for _, row in covering_df.head(8).iterrows():
    ax1.annotate(row["orgId"], (row["step"], row["pct_covered"]),
                 textcoords="offset points", xytext=(5, 5), fontsize=7,
                 ha="left", rotation=20)

ax1.set_title("Conservation-Weighted Covering Set Optimization\n(bar opacity = tractability score)")
ax1.legend(loc="lower right")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "fig36_covering_set_curve.png"), bbox_inches="tight")
plt.show()
print("Saved fig36_covering_set_curve.png")

Saved fig36_covering_set_curve.png

# fig37: Per-organism experimental plan summary heatmap

# Build matrix: organisms × tier, gene count
plan_top = plan_df.head(20).copy()  # top 20 organisms

# Get tier counts per organism from assigned genes
org_tier_matrix = []
for _, row in plan_top.iterrows():
    org = row["orgId"]
    org_genes = assigned[assigned["assigned_organism"] == org]
    tier_counts = org_genes["taxonomic_tier"].value_counts()
    entry = {"orgId": org, "tractability": row["tractability"]}
    for t in tier_order:
        entry[t] = tier_counts.get(t, 0)
    org_tier_matrix.append(entry)

otm = pd.DataFrame(org_tier_matrix).set_index("orgId")
tract_col = otm.pop("tractability")
# Only keep tiers that have nonzero values
otm = otm[[t for t in tier_order if t in otm.columns and otm[t].sum() > 0]]

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(otm, annot=True, fmt="d", cmap="YlOrRd", ax=ax,
            linewidths=0.5, linecolor="white",
            cbar_kws={"label": "Gene count"})
ax.set_title("Per-Organism Dark Gene Coverage by Conservation Tier\n(top 20 organisms in covering set order)")
ax.set_ylabel("Organism (covering set order)")
ax.set_xlabel("Conservation tier")

# Add tractability as color bar on the right
for i, (org, tract) in enumerate(tract_col.items()):
    ax.text(len(otm.columns) + 0.3, i + 0.5, f"{tract:.1f}", va="center", fontsize=8)
ax.text(len(otm.columns) + 0.3, -0.5, "Tract.", va="center", fontsize=8, fontweight="bold")

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "fig37_experiment_plan_heatmap.png"), bbox_inches="tight")
plt.show()
print("Saved fig37_experiment_plan_heatmap.png")

Saved fig37_experiment_plan_heatmap.png

# Final summary statistics
print("=" * 70)
print("NB11 SUMMARY")
print("=" * 70)
print(f"\nDark genes classified: {len(dark_with_tier):,}")
print(f"OGs with pangenome distribution: {len(pg_dist):,}")
print(f"OGs ranked by importance: {len(og_importance):,}")
print(f"\nTaxonomic tier distribution:")
for tier, n in dark_with_tier["taxonomic_tier"].value_counts().items():
    print(f"  {tier:12s}: {n:6,} ({100*n/len(dark_with_tier):5.1f}%)")
print(f"\nHypothesis status:")
for hyp, n in dark_with_tier["hypothesis_status"].value_counts().items():
    print(f"  {hyp:25s}: {n:6,} ({100*n/len(dark_with_tier):5.1f}%)")
print(f"\nCovering set: {len(covering_df)} organisms for 95% priority coverage")
print(f"Expected novel functions (top organism): {plan_df.iloc[0]['expected_novel']}")
print(f"\nOutput files:")
for f in ["og_pangenome_distribution.tsv", "dark_gene_classes.tsv", 
          "og_importance_ranked.tsv", "conservation_covering_set.tsv",
          "conservation_experiment_plans.tsv"]:
    path = os.path.join(DATA_DIR, f)
    if os.path.exists(path):
        size = os.path.getsize(path)
        print(f"  {f}: {size:,} bytes")
print(f"\nFigures: fig32-fig37 saved to {FIG_DIR}")

======================================================================
NB11 SUMMARY
======================================================================

Dark genes classified: 57,011
OGs with pangenome distribution: 11,774
OGs ranked by importance: 11,774

Taxonomic tier distribution:
  kingdom     : 29,330 ( 51.4%)
  species     : 17,005 ( 29.8%)
  genus       :  3,547 (  6.2%)
  class       :  2,262 (  4.0%)
  family      :  1,992 (  3.5%)
  mobile      :  1,062 (  1.9%)
  phylum      :  1,059 (  1.9%)
  order       :    754 (  1.3%)

Hypothesis status:
  weak_lead                : 29,926 ( 52.5%)
  true_knowledge_gap       : 23,654 ( 41.5%)
  strong_hypothesis        :  3,431 (  6.0%)

Covering set: 42 organisms for 95% priority coverage
Expected novel functions (top organism): 1630

Output files:
  og_pangenome_distribution.tsv: 3,817,536 bytes
  dark_gene_classes.tsv: 7,280,299 bytes
  og_importance_ranked.tsv: 1,279,070 bytes
  conservation_covering_set.tsv: 4,800 bytes
  conservation_experiment_plans.tsv: 7,998 bytes

Figures: fig32-fig37 saved to /home/aparkin/BERIL-research-observatory/projects/functional_dark_matter/figures

11 Conservation Classes

NB11: Conservation × Dark Matter Classes with Experimental Prioritization¶

Section 1: Pangenome Species Distribution¶

Full Pangenome Distribution (pre-computed by NB11b)¶

Section 2: Taxonomic Tier Classification¶

Section 3: Hypothesis Status Classification¶

Section 4: Importance Score¶

Section 5: Minimum Covering Set (Conservation-Weighted)¶

Section 6: Per-Organism Experimental Plans¶

Section 7: Figures¶

Summary¶