import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
import glob

DATA_DIR = Path('../data')
DIAMOND_DIR = DATA_DIR / 'diamond_hits'
FIGURES_DIR = Path('../figures')
FIGURES_DIR.mkdir(exist_ok=True)

# Load organism mapping
mapping = pd.read_csv(DATA_DIR / 'organism_mapping.tsv', sep='\t')
print(f"Organism mapping: {len(mapping)} rows, {mapping['orgId'].nunique()} organisms")

Organism mapping: 498 rows, 44 organisms

# Parse all DIAMOND output files
# outfmt 6: qseqid sseqid pident length qlen slen evalue bitscore
diamond_cols = ['qseqid', 'sseqid', 'pident', 'length', 'qlen', 'slen', 'evalue', 'bitscore']

all_hits = []
per_org_stats = []

hit_files = sorted(DIAMOND_DIR.glob('*.tsv'))
print(f"DIAMOND result files found: {len(hit_files)}")

for f in hit_files:
    orgId = f.stem
    if f.stat().st_size == 0:
        per_org_stats.append({'orgId': orgId, 'n_hits': 0})
        continue
    
    df = pd.read_csv(f, sep='\t', header=None, names=diamond_cols)
    
    # Split qseqid (format: orgId:locusId)
    df[['hit_orgId', 'locusId']] = df['qseqid'].str.split(':', n=1, expand=True)
    df['gene_cluster_id'] = df['sseqid']
    
    all_hits.append(df)
    per_org_stats.append({'orgId': orgId, 'n_hits': len(df)})

hits = pd.concat(all_hits, ignore_index=True) if all_hits else pd.DataFrame()
print(f"Total DIAMOND hits: {len(hits):,}")
print(f"Organisms with hits: {hits['hit_orgId'].nunique() if len(hits) > 0 else 0}")

DIAMOND result files found: 44

Total DIAMOND hits: 192,563
Organisms with hits: 44

# Build gene_cluster_id -> clade_id lookup from species FASTAs
# Each species FASTA file has headers like >gene_cluster_id
cluster_to_clade = {}

species_fasta_dir = DATA_DIR / 'species_fastas'
for fasta_file in species_fasta_dir.glob('*.fasta'):
    clade_id = fasta_file.stem
    with open(fasta_file) as f:
        for line in f:
            if line.startswith('>'):
                cluster_id = line[1:].strip()
                cluster_to_clade[cluster_id] = clade_id

print(f"Built cluster->clade lookup: {len(cluster_to_clade):,} entries")

# Add clade ID to hits
hits['gtdb_species_clade_id'] = hits['gene_cluster_id'].map(cluster_to_clade)
unmapped = hits['gtdb_species_clade_id'].isna().sum()
if unmapped > 0:
    print(f"WARNING: {unmapped} hits without clade mapping")
else:
    print("All hits mapped to clades")

Built cluster->clade lookup: 2,209,907 entries

All hits mapped to clades

# Find multi-clade organisms
clades_per_org = mapping.groupby('orgId')['gtdb_species_clade_id'].nunique()
multi_clade_orgs = clades_per_org[clades_per_org > 1].index.tolist()

print(f"Multi-clade organisms: {len(multi_clade_orgs)}")

resolved_clades = {}  # orgId -> chosen clade
resolution_log = []

for orgId in sorted(hits['hit_orgId'].unique()):
    org_clades = mapping[mapping['orgId'] == orgId]['gtdb_species_clade_id'].unique()
    
    if len(org_clades) == 1:
        resolved_clades[orgId] = org_clades[0]
        continue
    
    # Count hits per clade for this organism
    org_hits = hits[hits['hit_orgId'] == orgId]
    hits_per_clade = org_hits.groupby('gtdb_species_clade_id').size().sort_values(ascending=False)
    
    best_clade = hits_per_clade.index[0]
    resolved_clades[orgId] = best_clade
    
    org_info = mapping[mapping['orgId'] == orgId].iloc[0]
    log_entry = {
        'orgId': orgId,
        'genus': org_info['genus'],
        'species': org_info['species'],
        'chosen_clade': best_clade,
        'chosen_hits': hits_per_clade.iloc[0],
        'n_candidate_clades': len(org_clades),
        'all_clades_hits': dict(hits_per_clade)
    }
    resolution_log.append(log_entry)
    
    print(f"\n  {orgId} ({org_info['genus']} {org_info['species']}):")
    for clade, count in hits_per_clade.items():
        marker = ' <-- CHOSEN' if clade == best_clade else ''
        print(f"    {clade}: {count} hits{marker}")

if not resolution_log:
    print("  No multi-clade organisms to resolve")

Multi-clade organisms: 14

  Burk376 (Paraburkholderia bryophila):
    s__Paraburkholderia_bryophila--RS_GCF_003269035.1: 1251 hits <-- CHOSEN
    s__Paraburkholderia_sp000745015--RS_GCF_013409255.1: 1045 hits

  Cup4G11 (Cupriavidus basilensis):
    s__Cupriavidus_basilensis--RS_GCF_008801925.2: 7187 hits <-- CHOSEN
    s__Cupriavidus_basilensis_B--RS_GCF_000876015.1: 20 hits

  Keio (Escherichia coli):
    s__Escherichia_coli_E--RS_GCF_011881725.1: 1083 hits <-- CHOSEN
    s__Escherichia_sp005843885--RS_GCF_005843885.1: 947 hits
    s__Escherichia_ruysiae--GB_GCA_021307345.1: 944 hits
    s__Escherichia_marmotae--RS_GCF_002900365.1: 528 hits
    s__Enterobacter_hormaechei_A--RS_GCF_001729745.1: 190 hits
    s__Citrobacter_freundii--RS_GCF_011064845.1: 139 hits
    s__Klebsiella_variicola--RS_GCF_000828055.2: 62 hits
    s__Enterobacter_roggenkampii--RS_GCF_001729805.1: 35 hits
    s__Klebsiella_quasipneumoniae--RS_GCF_020525925.1: 27 hits
    s__Citrobacter_murliniae--RS_GCF_001037495.1: 23 hits
    s__Phytobacter_ursingii--GB_GCA_901456055.1: 22 hits
    s__Klebsiella_aerogenes--RS_GCF_000215745.1: 21 hits
    s__Hafnia_alvei--RS_GCF_000735375.1: 6 hits

  Pedo557 (Pedobacter sp.):
    s__Pedobacter_sp018613605--RS_GCF_018613605.1: 1020 hits <-- CHOSEN
    s__Pedobacter_sp918698245--RS_GCF_918698245.1: 944 hits
    s__Pedobacter_sp011759615--RS_GCF_014692875.1: 881 hits
    s__Pedobacter_sp019317285--RS_GCF_019317285.1: 267 hits
    s__Pedobacter_sp003852395--RS_GCF_011303555.1: 101 hits
    s__Pedobacter_suwonensis--RS_GCF_900111825.1: 67 hits
    s__Pedobacter_agri--RS_GCF_000258495.1: 39 hits
    s__Pedobacter_nototheniae--RS_GCF_004335085.1: 12 hits
    s__Pedobacter_sp920984665--RS_GCF_920984665.1: 2 hits
    s__Daejeonella_sp002257025--GB_GCA_002257025.1: 1 hits

  RalstoniaBSBF1503 (Ralstonia solanacearum):
    s__Ralstonia_solanacearum--RS_GCF_002251695.1: 4213 hits <-- CHOSEN
    s__Ralstonia_pseudosolanacearum--RS_GCF_024925465.1: 159 hits
    s__Ralstonia_syzygii--GB_GCA_919592095.1: 153 hits

  RalstoniaPSI07 (Ralstonia solanacearum):
    s__Ralstonia_syzygii--GB_GCA_919592095.1: 4326 hits <-- CHOSEN
    s__Ralstonia_pseudosolanacearum--RS_GCF_024925465.1: 172 hits
    s__Ralstonia_solanacearum--RS_GCF_002251695.1: 83 hits

  RalstoniaUW163 (Ralstonia solanacearum):
    s__Ralstonia_solanacearum--RS_GCF_002251695.1: 4303 hits <-- CHOSEN
    s__Ralstonia_pseudosolanacearum--RS_GCF_024925465.1: 171 hits
    s__Ralstonia_syzygii--GB_GCA_919592095.1: 149 hits

  SyringaeB728a_mexBdelta (Pseudomonas syringae):
    s__Pseudomonas_E_syringae_M--RS_GCF_009176725.1: 3321 hits <-- CHOSEN
    s__Pseudomonas_E_syringae--RS_GCF_000507185.2: 614 hits
    s__Pseudomonas_E_cerasi--RS_GCF_900074915.1: 596 hits
    s__Pseudomonas_E_amygdali--RS_GCF_002699855.1: 284 hits
    s__Pseudomonas_E_caricapapayae--RS_GCF_001400735.1: 105 hits
    s__Pseudomonas_E_avellanae--RS_GCF_000444135.1: 86 hits
    s__Pseudomonas_E_cannabina--RS_GCF_900100365.1: 22 hits
    s__Pseudomonas_E_graminis_C--RS_GCF_002093745.1: 14 hits
    s__Pseudomonas_E_syringae_Q--RS_GCF_023278085.1: 9 hits

  azobra (Azospirillum brasilense):
    s__Azospirillum_baldaniorum--RS_GCF_003119195.2: 3928 hits <-- CHOSEN
    s__Azospirillum_brasilense--RS_GCF_001315015.1: 396 hits
    s__Azospirillum_brasilense_B--RS_GCF_003349955.1: 381 hits
    s__Azospirillum_brasilense_C--RS_GCF_007827815.1: 302 hits
    s__Azospirillum_sp003119115--RS_GCF_003119115.1: 213 hits
    s__Azospirillum_tabaci--RS_GCF_014596085.1: 208 hits

  pseudo13_GW456_L13 (Pseudomonas fluorescens):
    s__Pseudomonas_E_jessenii--RS_GCF_002236115.1: 3703 hits <-- CHOSEN
    s__Pseudomonas_E_fluorescens_BH--RS_GCF_902498135.1: 472 hits
    s__Pseudomonas_E_fluorescens_O--RS_GCF_000967965.1: 260 hits
    s__Pseudomonas_E_sp002303925--RS_GCF_002303925.1: 115 hits
    s__Pseudomonas_E_sp002113025--RS_GCF_002113025.1: 103 hits
    s__Pseudomonas_E_azerbaijanoccidens--RS_GCF_019145495.1: 86 hits
    s__Pseudomonas_E_sp009764265--RS_GCF_009764265.1: 38 hits
    s__Pseudomonas_E_fluorescens_A--RS_GCF_000802965.1: 37 hits
    s__Pseudomonas_E_alvandae--RS_GCF_019141525.1: 22 hits
    s__Pseudomonas_E_sp000282215--RS_GCF_000282215.1: 21 hits
    s__Pseudomonas_E_anatoliensis--RS_GCF_017976235.1: 21 hits
    s__Pseudomonas_E_alkylphenolica--RS_GCF_000746525.1: 14 hits
    s__Pseudomonas_E_reinekei--RS_GCF_001945365.1: 14 hits
    s__Pseudomonas_E_fluorescens_BI--RS_GCF_902497855.1: 12 hits
    s__Pseudomonas_E_mandelii--RS_GCF_900106065.1: 6 hits
    s__Pseudomonas_E_fluorescens_BK--RS_GCF_902497995.1: 6 hits
    s__Pseudomonas_E_asgharzadehiana--RS_GCF_019139815.1: 5 hits
    s__Pseudomonas_E_fluorescens--RS_GCF_900215245.1: 5 hits
    s__Pseudomonas_E_fluorescens_H--RS_GCF_000292795.1: 3 hits
    s__Pseudomonas_E_baetica--RS_GCF_002813455.1: 3 hits
    s__Pseudomonas_E_bijieensis--RS_GCF_013347965.1: 3 hits
    s__Pseudomonas_E_fluorescens_BT--RS_GCF_012974785.1: 3 hits
    s__Pseudomonas_E_sp002029345--RS_GCF_006094435.1: 2 hits
    s__Pseudomonas_E_siliginis--RS_GCF_019145195.1: 2 hits
    s__Pseudomonas_E_sp002000165--RS_GCF_024748695.1: 2 hits
    s__Pseudomonas_E_fluorescens_W--RS_GCF_000968015.1: 2 hits
    s__Pseudomonas_E_glycinae--RS_GCF_001594225.2: 2 hits
    s__Pseudomonas_E_paracarnis--RS_GCF_904063055.1: 2 hits
    s__Pseudomonas_E_fluorescens_AP--RS_GCF_013407925.2: 1 hits
    s__Pseudomonas_E_fluorescens_AA--RS_GCF_001307275.1: 1 hits
    s__Pseudomonas_E_azotoformans_A--RS_GCF_001579805.1: 1 hits
    s__Pseudomonas_E_fluorescens_AO--RS_GCF_000817895.1: 1 hits
    s__Pseudomonas_E_putida_A--RS_GCF_004683905.1: 1 hits
    s__Pseudomonas_E_helmanticensis--RS_GCF_000783395.1: 1 hits
    s__Pseudomonas_E_frederiksbergensis_A--RS_GCF_000802155.2: 1 hits
    s__Pseudomonas_E_fluorescens_BV--RS_GCF_001902145.1: 1 hits
    s__Pseudomonas_E_fluorescens_BJ--RS_GCF_902498065.1: 1 hits
    s__Pseudomonas_E_fluorescens_G--GB_GCA_902825215.1: 1 hits
    s__Pseudomonas_E_iridis--RS_GCF_017980685.1: 1 hits
    s__Pseudomonas_E_sp000282495--RS_GCF_000282495.1: 1 hits
    s__Pseudomonas_E_sivasensis--RS_GCF_013778505.1: 1 hits
    s__Pseudomonas_E_putida_F--RS_GCF_000800615.1: 1 hits
    s__Pseudomonas_E_sp002874965--RS_GCF_002874965.1: 1 hits
    s__Pseudomonas_E_sp002980155--RS_GCF_019212185.1: 1 hits
    s__Pseudomonas_E_sp004011755--RS_GCF_004011755.1: 1 hits
    s__Pseudomonas_E_sp900156465--RS_GCF_900156465.1: 1 hits
    s__Pseudomonas_E_vlassakiae--RS_GCF_014269035.2: 1 hits

  pseudo1_N1B4 (Pseudomonas fluorescens):
    s__Pseudomonas_E_sp000282495--RS_GCF_000282495.1: 5273 hits <-- CHOSEN
    s__Pseudomonas_E_fluorescens_BI--RS_GCF_902497855.1: 44 hits
    s__Pseudomonas_E_azerbaijanoccidens--RS_GCF_019145495.1: 35 hits
    s__Pseudomonas_E_alvandae--RS_GCF_019141525.1: 35 hits
    s__Pseudomonas_E_sp900187425--RS_GCF_900187425.1: 33 hits
    s__Pseudomonas_E_fluorescens_W--RS_GCF_000968015.1: 31 hits
    s__Pseudomonas_E_fluorescens_BJ--RS_GCF_902498065.1: 30 hits
    s__Pseudomonas_E_mandelii--RS_GCF_900106065.1: 28 hits
    s__Pseudomonas_E_fluorescens_A--RS_GCF_000802965.1: 21 hits
    s__Pseudomonas_E_fluorescens_E--RS_GCF_001307155.1: 18 hits
    s__Pseudomonas_E_fluorescens_BK--RS_GCF_902497995.1: 17 hits
    s__Pseudomonas_E_anatoliensis--RS_GCF_017976235.1: 16 hits
    s__Pseudomonas_E_fluorescens_BH--RS_GCF_902498135.1: 14 hits
    s__Pseudomonas_E_alkylphenolica--RS_GCF_000746525.1: 13 hits
    s__Pseudomonas_E_sp000282215--RS_GCF_000282215.1: 12 hits
    s__Pseudomonas_E_sp002000165--RS_GCF_024748695.1: 12 hits
    s__Pseudomonas_E_asgharzadehiana--RS_GCF_019139815.1: 11 hits
    s__Pseudomonas_E_sp002113025--RS_GCF_002113025.1: 7 hits
    s__Pseudomonas_E_jessenii--RS_GCF_002236115.1: 7 hits
    s__Pseudomonas_E_baetica--RS_GCF_002813455.1: 7 hits
    s__Pseudomonas_E_crudilactis--RS_GCF_013433315.1: 5 hits
    s__Pseudomonas_E_bijieensis--RS_GCF_013347965.1: 5 hits
    s__Pseudomonas_E_sp014851765--RS_GCF_014851765.1: 5 hits
    s__Pseudomonas_E_reinekei--RS_GCF_001945365.1: 5 hits
    s__Pseudomonas_E_fluorescens_O--RS_GCF_000967965.1: 4 hits
    s__Pseudomonas_E_sp002029345--RS_GCF_006094435.1: 4 hits
    s__Pseudomonas_E_fluorescens_AP--RS_GCF_013407925.2: 3 hits
    s__Pseudomonas_E_sp009764265--RS_GCF_009764265.1: 3 hits
    s__Pseudomonas_E_rhodesiae--RS_GCF_007858255.1: 3 hits
    s__Pseudomonas_E_sp002303925--RS_GCF_002303925.1: 3 hits
    s__Pseudomonas_E_azotoformans_A--RS_GCF_001579805.1: 2 hits
    s__Pseudomonas_E_fluorescens_H--RS_GCF_000292795.1: 2 hits
    s__Pseudomonas_E_reactans--RS_GCF_013385825.1: 2 hits
    s__Pseudomonas_E_fluorescens_BV--RS_GCF_001902145.1: 2 hits
    s__Pseudomonas_E_fluorescens_S--RS_GCF_000281895.1: 2 hits
    s__Pseudomonas_E_fluorescens_AN--RS_GCF_001708445.1: 2 hits
    s__Pseudomonas_E_cyclaminis--RS_GCF_015163715.1: 1 hits
    s__Pseudomonas_E_fluorescens_AA--RS_GCF_001307275.1: 1 hits
    s__Pseudomonas_E_fluorescens_AO--RS_GCF_000817895.1: 1 hits
    s__Pseudomonas_E_fluorescens_G--GB_GCA_902825215.1: 1 hits
    s__Pseudomonas_E_fluorescens_BT--RS_GCF_012974785.1: 1 hits
    s__Pseudomonas_E_pharyngis--RS_GCF_021602345.1: 1 hits
    s__Pseudomonas_E_parakoreensis--RS_GCF_021602155.1: 1 hits
    s__Pseudomonas_E_orientalis_B--RS_GCF_022807995.1: 1 hits
    s__Pseudomonas_E_marginalis_C--RS_GCF_008692035.1: 1 hits
    s__Pseudomonas_E_kilonensis_B--RS_GCF_009601685.2: 1 hits
    s__Pseudomonas_E_protegens--RS_GCF_000397205.1: 1 hits
    s__Pseudomonas_E_salomonii--RS_GCF_900107155.1: 1 hits
    s__Pseudomonas_E_shirazensis--RS_GCF_014268785.2: 1 hits
    s__Pseudomonas_E_siliginis--RS_GCF_019145195.1: 1 hits
    s__Pseudomonas_E_shahriarae--RS_GCF_014268455.2: 1 hits
    s__Pseudomonas_E_sp002980155--RS_GCF_019212185.1: 1 hits
    s__Pseudomonas_E_sp900187605--RS_GCF_900187605.1: 1 hits
    s__Pseudomonas_E_tensinigenes--RS_GCF_014268445.2: 1 hits
    s__Pseudomonas_E_vlassakiae--RS_GCF_014269035.2: 1 hits

  pseudo3_N2E3 (Pseudomonas fluorescens):
    s__Pseudomonas_E_fluorescens_E--RS_GCF_001307155.1: 5535 hits <-- CHOSEN
    s__Pseudomonas_E_alvandae--RS_GCF_019141525.1: 31 hits
    s__Pseudomonas_E_fluorescens_BJ--RS_GCF_902498065.1: 15 hits
    s__Pseudomonas_E_azerbaijanoccidens--RS_GCF_019145495.1: 14 hits
    s__Pseudomonas_E_alkylphenolica--RS_GCF_000746525.1: 11 hits
    s__Pseudomonas_E_asgharzadehiana--RS_GCF_019139815.1: 8 hits
    s__Pseudomonas_E_anatoliensis--RS_GCF_017976235.1: 7 hits
    s__Pseudomonas_E_fluorescens_AP--RS_GCF_013407925.2: 7 hits
    s__Pseudomonas_E_fluorescens_BH--RS_GCF_902498135.1: 7 hits
    s__Pseudomonas_E_fluorescens_BI--RS_GCF_902497855.1: 5 hits
    s__Pseudomonas_E_fluorescens_A--RS_GCF_000802965.1: 5 hits
    s__Pseudomonas_E_fluorescens_BT--RS_GCF_012974785.1: 5 hits
    s__Pseudomonas_E_sp000282495--RS_GCF_000282495.1: 4 hits
    s__Pseudomonas_E_sp002980155--RS_GCF_019212185.1: 4 hits
    s__Pseudomonas_E_fluorescens--RS_GCF_900215245.1: 3 hits
    s__Pseudomonas_E_azotoformans_A--RS_GCF_001579805.1: 3 hits
    s__Pseudomonas_E_fluorescens_W--RS_GCF_000968015.1: 3 hits
    s__Pseudomonas_E_sp900187425--RS_GCF_900187425.1: 3 hits
    s__Pseudomonas_E_bijieensis--RS_GCF_013347965.1: 3 hits
    s__Pseudomonas_E_fluorescens_BK--RS_GCF_902497995.1: 3 hits
    s__Pseudomonas_E_cyclaminis--RS_GCF_015163715.1: 3 hits
    s__Pseudomonas_E_reinekei--RS_GCF_001945365.1: 2 hits
    s__Pseudomonas_E_shahriarae--RS_GCF_014268455.2: 2 hits
    s__Pseudomonas_E_sp000282215--RS_GCF_000282215.1: 2 hits
    s__Pseudomonas_E_sp002029345--RS_GCF_006094435.1: 2 hits
    s__Pseudomonas_E_rhodesiae--RS_GCF_007858255.1: 2 hits
    s__Pseudomonas_E_putida_A--RS_GCF_004683905.1: 2 hits
    s__Pseudomonas_E_glycinae--RS_GCF_001594225.2: 2 hits
    s__Pseudomonas_E_sp900156465--RS_GCF_900156465.1: 2 hits
    s__Pseudomonas_E_mandelii--RS_GCF_900106065.1: 2 hits
    s__Pseudomonas_E_baetica--RS_GCF_002813455.1: 1 hits
    s__Pseudomonas_E_crudilactis--RS_GCF_013433315.1: 1 hits
    s__Pseudomonas_E_fluorescens_AN--RS_GCF_001708445.1: 1 hits
    s__Pseudomonas_E_fluorescens_AM--RS_GCF_002837185.1: 1 hits
    s__Pseudomonas_E_fluorescens_AA--RS_GCF_001307275.1: 1 hits
    s__Pseudomonas_E_paraglycinae--RS_GCF_021601625.1: 1 hits
    s__Pseudomonas_E_fluorescens_X--RS_GCF_000730425.1: 1 hits
    s__Pseudomonas_E_fluorescens_O--RS_GCF_000967965.1: 1 hits
    s__Pseudomonas_E_frederiksbergensis_A--RS_GCF_000802155.2: 1 hits
    s__Pseudomonas_E_kilonensis_B--RS_GCF_009601685.2: 1 hits
    s__Pseudomonas_E_pharyngis--RS_GCF_021602345.1: 1 hits
    s__Pseudomonas_E_siliginis--RS_GCF_019145195.1: 1 hits
    s__Pseudomonas_E_salomonii--RS_GCF_900107155.1: 1 hits
    s__Pseudomonas_E_sivasensis--RS_GCF_013778505.1: 1 hits
    s__Pseudomonas_E_sp002113025--RS_GCF_002113025.1: 1 hits
    s__Pseudomonas_E_sp002000165--RS_GCF_024748695.1: 1 hits
    s__Pseudomonas_E_sp004011755--RS_GCF_004011755.1: 1 hits
    s__Pseudomonas_E_vlassakiae--RS_GCF_014269035.2: 1 hits

  pseudo5_N2C3_1 (Pseudomonas fluorescens):
    s__Pseudomonas_E_fluorescens_AA--RS_GCF_001307275.1: 4796 hits <-- CHOSEN
    s__Pseudomonas_E_bijieensis--RS_GCF_013347965.1: 353 hits
    s__Pseudomonas_E_fluorescens_Q--RS_GCF_001623525.1: 289 hits
    s__Pseudomonas_E_alvandae--RS_GCF_019141525.1: 209 hits
    s__Pseudomonas_E_kilonensis_B--RS_GCF_009601685.2: 96 hits
    s__Pseudomonas_E_fluorescens_S--RS_GCF_000281895.1: 44 hits
    s__Pseudomonas_E_chlororaphis_E--RS_GCF_001023535.1: 38 hits
    s__Pseudomonas_E_sp900156465--RS_GCF_900156465.1: 26 hits
    s__Pseudomonas_E_tehranensis--RS_GCF_014268615.1: 23 hits
    s__Pseudomonas_E_frederiksbergensis_A--RS_GCF_000802155.2: 19 hits
    s__Pseudomonas_E_alkylphenolica--RS_GCF_000746525.1: 15 hits
    s__Pseudomonas_E_anatoliensis--RS_GCF_017976235.1: 10 hits
    s__Pseudomonas_E_azerbaijanoccidens--RS_GCF_019145495.1: 4 hits
    s__Pseudomonas_E_asgharzadehiana--RS_GCF_019139815.1: 3 hits
    s__Pseudomonas_E_rhodesiae--RS_GCF_007858255.1: 3 hits
    s__Pseudomonas_E_sp002113025--RS_GCF_002113025.1: 2 hits
    s__Pseudomonas_E_fluorescens_AP--RS_GCF_013407925.2: 2 hits
    s__Pseudomonas_E_fluorescens_BV--RS_GCF_001902145.1: 2 hits
    s__Pseudomonas_E_fluorescens_AN--RS_GCF_001708445.1: 2 hits
    s__Pseudomonas_E_cyclaminis--RS_GCF_015163715.1: 2 hits
    s__Pseudomonas_E_synxantha_A--RS_GCF_000263715.2: 2 hits
    s__Pseudomonas_E_marginalis_C--RS_GCF_008692035.1: 2 hits
    s__Pseudomonas_E_fluorescens_X--RS_GCF_000730425.1: 1 hits
    s__Pseudomonas_E_fluorescens_BI--RS_GCF_902497855.1: 1 hits
    s__Pseudomonas_E_baetica--RS_GCF_002813455.1: 1 hits
    s__Pseudomonas_E_fluorescens_A--RS_GCF_000802965.1: 1 hits
    s__Pseudomonas_E_fluorescens_BJ--RS_GCF_902498065.1: 1 hits
    s__Pseudomonas_E_fluorescens_BK--RS_GCF_902497995.1: 1 hits
    s__Pseudomonas_E_salomonii--RS_GCF_900107155.1: 1 hits
    s__Pseudomonas_E_pharyngis--RS_GCF_021602345.1: 1 hits
    s__Pseudomonas_E_lactis--RS_GCF_001439845.1: 1 hits
    s__Pseudomonas_E_marginalis--RS_GCF_001645105.1: 1 hits
    s__Pseudomonas_E_sp002000165--RS_GCF_024748695.1: 1 hits
    s__Pseudomonas_E_sp000282215--RS_GCF_000282215.1: 1 hits
    s__Pseudomonas_E_sivasensis--RS_GCF_013778505.1: 1 hits
    s__Pseudomonas_E_sp009764265--RS_GCF_009764265.1: 1 hits

  pseudo6_N2E2 (Pseudomonas fluorescens):
    s__Pseudomonas_E_fluorescens_Q--RS_GCF_001623525.1: 4702 hits <-- CHOSEN
    s__Pseudomonas_E_fluorescens_AA--RS_GCF_001307275.1: 376 hits
    s__Pseudomonas_E_bijieensis--RS_GCF_013347965.1: 312 hits
    s__Pseudomonas_E_alvandae--RS_GCF_019141525.1: 183 hits
    s__Pseudomonas_E_kilonensis_B--RS_GCF_009601685.2: 127 hits
    s__Pseudomonas_E_tehranensis--RS_GCF_014268615.1: 35 hits
    s__Pseudomonas_E_fluorescens_S--RS_GCF_000281895.1: 27 hits
    s__Pseudomonas_E_chlororaphis_E--RS_GCF_001023535.1: 21 hits
    s__Pseudomonas_E_alkylphenolica--RS_GCF_000746525.1: 17 hits
    s__Pseudomonas_E_frederiksbergensis_A--RS_GCF_000802155.2: 13 hits
    s__Pseudomonas_E_sp900156465--RS_GCF_900156465.1: 12 hits
    s__Pseudomonas_E_cyclaminis--RS_GCF_015163715.1: 7 hits
    s__Pseudomonas_E_fluorescens_BH--RS_GCF_902498135.1: 5 hits
    s__Pseudomonas_E_anatoliensis--RS_GCF_017976235.1: 5 hits
    s__Pseudomonas_E_azerbaijanoccidens--RS_GCF_019145495.1: 5 hits
    s__Pseudomonas_E_asgharzadehiana--RS_GCF_019139815.1: 4 hits
    s__Pseudomonas_E_paracarnis--RS_GCF_904063055.1: 4 hits
    s__Pseudomonas_E_fluorescens_E--RS_GCF_001307155.1: 3 hits
    s__Pseudomonas_E_crudilactis--RS_GCF_013433315.1: 2 hits
    s__Pseudomonas_E_sp002113025--RS_GCF_002113025.1: 2 hits
    s__Pseudomonas_E_sp900187425--RS_GCF_900187425.1: 2 hits
    s__Pseudomonas_E_sp000282495--RS_GCF_000282495.1: 2 hits
    s__Pseudomonas_E_fluorescens_BI--RS_GCF_902497855.1: 1 hits
    s__Pseudomonas_E_fluorescens_BT--RS_GCF_012974785.1: 1 hits
    s__Pseudomonas_E_fluorescens_AN--RS_GCF_001708445.1: 1 hits
    s__Pseudomonas_E_fluorescens_A--RS_GCF_000802965.1: 1 hits
    s__Pseudomonas_E_azotoformans_A--RS_GCF_001579805.1: 1 hits
    s__Pseudomonas_E_fluorescens_BK--RS_GCF_902497995.1: 1 hits
    s__Pseudomonas_E_fluorescens_O--RS_GCF_000967965.1: 1 hits
    s__Pseudomonas_E_rhodesiae--RS_GCF_007858255.1: 1 hits
    s__Pseudomonas_E_reactans--RS_GCF_013385825.1: 1 hits
    s__Pseudomonas_E_marginalis--RS_GCF_001645105.1: 1 hits
    s__Pseudomonas_E_sivasensis--RS_GCF_013778505.1: 1 hits
    s__Pseudomonas_E_sp000282215--RS_GCF_000282215.1: 1 hits
    s__Pseudomonas_E_sp014851765--RS_GCF_014851765.1: 1 hits
    s__Pseudomonas_E_vlassakiae--RS_GCF_014269035.2: 1 hits

# Filter hits to only keep the resolved clade per organism
before_count = len(hits)

hits['resolved_clade'] = hits['hit_orgId'].map(resolved_clades)
hits_filtered = hits[hits['gtdb_species_clade_id'] == hits['resolved_clade']].copy()

print(f"Hits before clade resolution: {before_count:,}")
print(f"Hits after clade resolution:  {len(hits_filtered):,}")
print(f"Removed: {before_count - len(hits_filtered):,} hits from non-chosen clades")

Hits before clade resolution: 192,563
Hits after clade resolution:  177,863
Removed: 14,700 hits from non-chosen clades

# Try to load cluster metadata if available
cluster_meta_path = DATA_DIR / 'cluster_metadata.tsv'

if cluster_meta_path.exists():
    cluster_meta = pd.read_csv(cluster_meta_path, sep='\t')
    print(f"Loaded cluster metadata: {len(cluster_meta):,} clusters")
else:
    print("No cluster_metadata.tsv found.")
    print("To generate it, run this on JupyterHub:")
    print()
    
    # Generate the query for the user
    unique_clades = list(set(resolved_clades.values()))
    clade_str = "','".join(unique_clades)
    print(f"""spark.sql(\"\"\"""")
    print(f"""    SELECT gc.gene_cluster_id, gc.gtdb_species_clade_id,""")
    print(f"""           gc.no_genomes, gc.no_genes,""")
    print(f"""           p.no_genomes as species_total_genomes""")
    print(f"""    FROM kbase_ke_pangenome.gene_cluster gc""")
    print(f"""    JOIN kbase_ke_pangenome.pangenome p""")
    print(f"""        ON gc.gtdb_species_clade_id = p.gtdb_species_clade_id""")
    print(f"""    WHERE gc.gtdb_species_clade_id IN ('{clade_str}')""")
    print(f"""\"\"\").toPandas().to_csv('data/cluster_metadata.tsv', sep='\\t', index=False)""")
    print()
    print("Then re-run this cell.")
    cluster_meta = None

Loaded cluster metadata: 2,209,907 clusters

if cluster_meta is not None:
    # is_core, is_auxiliary, is_singleton are already in the metadata
    print("Cluster classification:")
    print(f"  Core:      {cluster_meta['is_core'].sum():>10,}  "
          f"({cluster_meta['is_core'].mean()*100:.1f}%)")
    print(f"  Auxiliary:  {cluster_meta['is_auxiliary'].sum():>10,}  "
          f"({cluster_meta['is_auxiliary'].mean()*100:.1f}%)")
    print(f"  Singleton:  {cluster_meta['is_singleton'].sum():>10,}  "
          f"({cluster_meta['is_singleton'].mean()*100:.1f}%)")

    # Merge with hits
    hits_enriched = hits_filtered.merge(
        cluster_meta[['gene_cluster_id', 'is_core', 'is_auxiliary', 'is_singleton']],
        on='gene_cluster_id',
        how='left'
    )
    print(f"\nEnriched hits: {len(hits_enriched):,}")
    print(f"  with conservation data: {hits_enriched['is_core'].notna().sum():,}")
else:
    print("Skipping enrichment — no cluster metadata available")
    hits_enriched = hits_filtered.copy()
    hits_enriched['is_core'] = None
    hits_enriched['is_auxiliary'] = None
    hits_enriched['is_singleton'] = None

Cluster classification:
  Core:         682,965  (30.9%)
  Auxiliary:   1,526,942  (69.1%)
  Singleton:     909,096  (41.1%)

Enriched hits: 177,863
  with conservation data: 177,863

# Select and rename columns for final table
link_table = hits_enriched[[
    'hit_orgId', 'locusId', 'gene_cluster_id', 'resolved_clade',
    'pident', 'evalue', 'bitscore',
    'is_core', 'is_auxiliary', 'is_singleton'
]].rename(columns={
    'hit_orgId': 'orgId',
    'resolved_clade': 'gtdb_species_clade_id'
})

# Sort for readability
link_table = link_table.sort_values(['orgId', 'locusId']).reset_index(drop=True)

print(f"Final link table: {len(link_table):,} rows")
print(f"Unique organisms: {link_table['orgId'].nunique()}")
print(f"Unique gene clusters: {link_table['gene_cluster_id'].nunique():,}")
link_table.head(10)

Final link table: 177,863 rows
Unique organisms: 44
Unique gene clusters: 163,974

# Save link table
output_path = DATA_DIR / 'fb_pangenome_link.tsv'
link_table.to_csv(output_path, sep='\t', index=False)
print(f"Saved: {output_path}")
print(f"  {len(link_table):,} rows")

Saved: ../data/fb_pangenome_link.tsv
  177,863 rows

# Per-organism coverage: % of FB genes matched
# Count total FB genes per organism from the FB FASTA files
fb_fasta_dir = DATA_DIR / 'fb_fastas'

fb_gene_counts = {}
for f in fb_fasta_dir.glob('*.fasta'):
    orgId = f.stem
    n = sum(1 for line in open(f) if line.startswith('>'))
    fb_gene_counts[orgId] = n

# Build per-organism summary
matched_counts = link_table.groupby('orgId').size().to_dict()
median_pident = link_table.groupby('orgId')['pident'].median().to_dict()

summary_rows = []
for orgId in sorted(set(list(fb_gene_counts.keys()) + list(matched_counts.keys()))):
    n_total = fb_gene_counts.get(orgId, 0)
    n_matched = matched_counts.get(orgId, 0)
    pct = (n_matched / n_total * 100) if n_total > 0 else 0
    clade = resolved_clades.get(orgId, 'unmapped')
    summary_rows.append({
        'orgId': orgId,
        'n_genes': n_total,
        'n_matched': n_matched,
        'pct_matched': round(pct, 1),
        'median_pident': round(median_pident.get(orgId, 0), 1),
        'resolved_clade': clade
    })

summary = pd.DataFrame(summary_rows).sort_values('pct_matched', ascending=False)

print("=== PER-ORGANISM COVERAGE ===")
print(summary.to_string(index=False))

# Flag low-coverage organisms
low_cov = summary[summary['pct_matched'] < 80]
if len(low_cov) > 0:
    print(f"\nWARNING: {len(low_cov)} organisms with <80% coverage:")
    print(low_cov[['orgId', 'n_genes', 'n_matched', 'pct_matched']].to_string(index=False))

=== PER-ORGANISM COVERAGE ===
                  orgId  n_genes  n_matched  pct_matched  median_pident                                       resolved_clade
                 psRCH2     4265       4238         99.4          100.0     s__Stutzerimonas_stutzeri_AE--RS_GCF_000327065.1
                     PS     3443       3418         99.3           99.9               s__Azospira_oryzae--RS_GCF_004217225.1
                  Korea     4149       4116         99.2           99.7        s__Sphingomonas_koreensis--RS_GCF_002797435.1
                   ANA3     4360       4315         99.0           99.6        s__Shewanella_sp000203935--RS_GCF_000203935.1
                 WCS417     5506       5451         99.0          100.0          s__Pseudomonas_E_simiae--RS_GCF_900111895.1
          SyringaeB728a     5089       5031         98.9           99.7      s__Pseudomonas_E_syringae_M--RS_GCF_009176725.1
                   SynE     2669       2636         98.8          100.0       s__Synechococcus_elongatus--RS_GCF_000010065.1
                   Dino     4192       4140         98.8          100.0        s__Dinoroseobacter_shibae--RS_GCF_000018145.1
                  Phaeo     3875       3823         98.7           99.4          s__Phaeobacter_inhibens--RS_GCF_000473105.1
                HerbieS     4715       4645         98.5          100.0    s__Herbaspirillum_seropedicae--RS_GCF_001040945.1
                    PV4     3859       3800         98.5           99.5            s__Shewanella_loihica--RS_GCF_000016065.1
       Methanococcus_JJ     1796       1769         98.5           99.0     s__Methanococcus_maripaludis--RS_GCF_002945325.1
                  Smeli     6218       6123         98.5           99.8        s__Sinorhizobium_meliloti--RS_GCF_017876815.1
       Methanococcus_S2     1741       1714         98.4           98.9     s__Methanococcus_maripaludis--RS_GCF_002945325.1
               DdiaME23     4182       4115         98.4          100.0           s__Dickeya_dianthicola--RS_GCF_000365305.1
                 Putida     5563       5470         98.3           99.5      s__Pseudomonas_E_alloputida--RS_GCF_021282585.1
                 Btheta     4816       4727         98.2           99.9  s__Bacteroides_thetaiotaomicron--RS_GCF_000011065.1
                  Ponti     4220       4138         98.1          100.0        s__Pontibacter_actiniarum--RS_GCF_003585765.1
               Ddia6719     4110       4030         98.1           99.3           s__Dickeya_dianthicola--RS_GCF_000365305.1
                Dda3937     4213       4131         98.1           99.6              s__Dickeya_dadantii--RS_GCF_000406145.1
               Dyella79     4375       4281         97.9          100.0               s__Dyella_marensis--RS_GCF_900112865.1
                Cup4G11     7358       7187         97.7           99.1        s__Cupriavidus_basilensis--RS_GCF_008801925.2
                  BFirm     7182       7016         97.7          100.0 s__Paraburkholderia_phytofirmans--RS_GCF_000020125.1
       RalstoniaGMI1000     4956       4812         97.1           99.6  s__Ralstonia_pseudosolanacearum--RS_GCF_024925465.1
                    MR1     4467       4330         96.9          100.0         s__Shewanella_oneidensis--RS_GCF_000146165.2
           pseudo3_N2E3     5766       5535         96.0          100.0   s__Pseudomonas_E_fluorescens_E--RS_GCF_001307155.1
                 Marino     4410       4216         95.6          100.0        s__Marinobacter_adhaerens--RS_GCF_000166295.1
                  Caulo     3886       3698         95.2          100.0        s__Caulobacter_vibrioides--RS_GCF_002858865.1
        acidovorax_3H11     4964       4721         95.1          100.0        s__Acidovorax_sp001633105--RS_GCF_001633105.1
                    DvH     3403       3206         94.2          100.0 s__Nitratidesulfovibrio_vulgaris--RS_GCF_000195755.1
                   Koxy     5309       4965         93.5           97.9      s__Klebsiella_michiganensis--RS_GCF_002925905.1
         RalstoniaPSI07     4735       4326         91.4           99.6             s__Ralstonia_syzygii--GB_GCA_919592095.1
      RalstoniaBSBF1503     4659       4213         90.4           99.1        s__Ralstonia_solanacearum--RS_GCF_002251695.1
         RalstoniaUW163     4758       4303         90.4           99.2        s__Ralstonia_solanacearum--RS_GCF_002251695.1
           pseudo1_N1B4     5972       5273         88.3          100.0     s__Pseudomonas_E_sp000282495--RS_GCF_000282495.1
         pseudo5_N2C3_1     6000       4796         79.9           99.6  s__Pseudomonas_E_fluorescens_AA--RS_GCF_001307275.1
           pseudo6_N2E2     6094       4702         77.2           99.8   s__Pseudomonas_E_fluorescens_Q--RS_GCF_001623525.1
     pseudo13_GW456_L13     5152       3703         71.9           99.6        s__Pseudomonas_E_jessenii--RS_GCF_002236115.1
                 azobra     5488       3928         71.6          100.0      s__Azospirillum_baldaniorum--RS_GCF_003119195.2
SyringaeB728a_mexBdelta     5089       3321         65.3           99.7      s__Pseudomonas_E_syringae_M--RS_GCF_009176725.1
                   Keio     4146       1083         26.1           99.3            s__Escherichia_coli_E--RS_GCF_011881725.1
                Pedo557     4964       1020         20.5           96.6        s__Pedobacter_sp018613605--RS_GCF_018613605.1
                Burk376     6555       1251         19.1           94.9    s__Paraburkholderia_bryophila--RS_GCF_003269035.1
                   Miya     3180        147          4.6           92.2 s__Nitratidesulfovibrio_vulgaris--RS_GCF_000195755.1
                Magneto     4489          0          0.0            0.0                                             unmapped
                   Kang     2467          0          0.0            0.0                                             unmapped
                  Brev2     3184          0          0.0            0.0                                             unmapped
    Burkholderia_OAS925     6501          0          0.0            0.0                                             unmapped
   Bvulgatus_CL09T03C04     3806          0          0.0            0.0                                             unmapped
                   CL21     4638          0          0.0            0.0                                             unmapped
                   Cola     4625          0          0.0            0.0                                             unmapped
                 Bifido     1960          0          0.0            0.0                                             unmapped
      Lysobacter_OAE881     3702          0          0.0            0.0                                             unmapped
  Mucilaginibacter_YX36     4390          0          0.0            0.0                                             unmapped
               MycoTube     4029          0          0.0            0.0                                             unmapped
                   SB2B     3645          0          0.0            0.0                                             unmapped
      Variovorax_OAS795     5732          0          0.0            0.0                                             unmapped
                 Xantho     4617          0          0.0            0.0                                             unmapped
    rhodanobacter_10B01     3526          0          0.0            0.0                                             unmapped

WARNING: 24 organisms with <80% coverage:
                  orgId  n_genes  n_matched  pct_matched
         pseudo5_N2C3_1     6000       4796         79.9
           pseudo6_N2E2     6094       4702         77.2
     pseudo13_GW456_L13     5152       3703         71.9
                 azobra     5488       3928         71.6
SyringaeB728a_mexBdelta     5089       3321         65.3
                   Keio     4146       1083         26.1
                Pedo557     4964       1020         20.5
                Burk376     6555       1251         19.1
                   Miya     3180        147          4.6
                Magneto     4489          0          0.0
                   Kang     2467          0          0.0
                  Brev2     3184          0          0.0
    Burkholderia_OAS925     6501          0          0.0
   Bvulgatus_CL09T03C04     3806          0          0.0
                   CL21     4638          0          0.0
                   Cola     4625          0          0.0
                 Bifido     1960          0          0.0
      Lysobacter_OAE881     3702          0          0.0
  Mucilaginibacter_YX36     4390          0          0.0
               MycoTube     4029          0          0.0
                   SB2B     3645          0          0.0
      Variovorax_OAS795     5732          0          0.0
                 Xantho     4617          0          0.0
    rhodanobacter_10B01     3526          0          0.0

# Identity distribution per organism (expect peaks near 100%)
orgs_to_plot = link_table['orgId'].unique()
n_orgs = len(orgs_to_plot)
ncols = 6
nrows = (n_orgs + ncols - 1) // ncols

fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3 * nrows))
axes = axes.flatten() if n_orgs > 1 else [axes]

for idx, orgId in enumerate(sorted(orgs_to_plot)):
    org_data = link_table[link_table['orgId'] == orgId]
    axes[idx].hist(org_data['pident'], bins=30, range=(90, 100),
                   edgecolor='black', alpha=0.7, color='steelblue')
    axes[idx].set_title(f"{orgId}\n(n={len(org_data)}, med={org_data['pident'].median():.1f}%)",
                        fontsize=8)
    axes[idx].set_xlabel('Identity (%)', fontsize=7)
    axes[idx].set_ylabel('Count', fontsize=7)
    axes[idx].tick_params(labelsize=6)

# Hide unused subplots
for idx in range(n_orgs, len(axes)):
    axes[idx].set_visible(False)

plt.suptitle('Protein Identity Distributions (DIAMOND blastp, ≥90% filter)', fontsize=14)
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'identity_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

# Core/auxiliary/singleton breakdown per organism
if link_table['is_core'].notna().any():
    conservation_summary = link_table.groupby('orgId').agg(
        n_core=('is_core', 'sum'),
        n_auxiliary=('is_auxiliary', 'sum'),
        n_singleton=('is_singleton', 'sum'),
        n_total=('orgId', 'size')
    ).reset_index()
    
    conservation_summary['pct_core'] = (conservation_summary['n_core'] / 
                                         conservation_summary['n_total'] * 100).round(1)
    conservation_summary['pct_singleton'] = (conservation_summary['n_singleton'] / 
                                              conservation_summary['n_total'] * 100).round(1)
    
    print("=== CONSERVATION BREAKDOWN PER ORGANISM ===")
    print(conservation_summary.sort_values('orgId').to_string(index=False))
    
    # Stacked bar chart
    fig, ax = plt.subplots(figsize=(14, 6))
    cs = conservation_summary.sort_values('pct_core', ascending=True)
    
    ax.barh(cs['orgId'], cs['n_core'], label='Core', color='#2196F3')
    ax.barh(cs['orgId'], cs['n_auxiliary'], left=cs['n_core'], 
            label='Auxiliary', color='#FF9800')
    ax.barh(cs['orgId'], cs['n_singleton'], 
            left=cs['n_core'] + cs['n_auxiliary'],
            label='Singleton', color='#F44336')
    
    ax.set_xlabel('Number of matched genes')
    ax.set_title('Gene Conservation Classification per Organism')
    ax.legend()
    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'conservation_breakdown.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("No conservation data available — run cluster_metadata export first")

=== CONSERVATION BREAKDOWN PER ORGANISM ===
                  orgId  n_core  n_auxiliary  n_singleton  n_total  pct_core  pct_singleton
                   ANA3    3413          902          486     4315      79.1           11.3
                  BFirm    5284         1732         1176     7016      75.3           16.8
                 Btheta    3095         1632           98     4727      65.5            2.1
                Burk376    1234           17            7     1251      98.6            0.6
                  Caulo    3363          335            8     3698      90.9            0.2
                Cup4G11    4634         2553         1481     7187      64.5           20.6
                Dda3937    3392          739           86     4131      82.1            2.1
               Ddia6719    3263          767          383     4030      81.0            9.5
               DdiaME23    3490          625           20     4115      84.8            0.5
                   Dino    4140            0            0     4140     100.0            0.0
                    DvH    2623          583           40     3206      81.8            1.2
               Dyella79    4274            7            7     4281      99.8            0.2
                HerbieS    3993          652           20     4645      86.0            0.4
                   Keio    1055           28           28     1083      97.4            2.6
                  Korea    3479          637           10     4116      84.5            0.2
                   Koxy    4143          822          116     4965      83.4            2.3
                    MR1    4216          114          114     4330      97.4            2.6
                 Marino    3189         1027          107     4216      75.6            2.5
       Methanococcus_JJ    1420          349           67     1769      80.3            3.8
       Methanococcus_S2    1444          270           65     1714      84.2            3.8
                   Miya      99           48           46      147      67.3           31.3
                     PS    2731          687          251     3418      79.9            7.3
                    PV4    3332          468          155     3800      87.7            4.1
                Pedo557     894          126          126     1020      87.6           12.4
                  Phaeo    3315          508           15     3823      86.7            0.4
                  Ponti    4128           10           10     4138      99.8            0.2
                 Putida    4098         1372           31     5470      74.9            0.6
      RalstoniaBSBF1503    3449          764           47     4213      81.9            1.1
       RalstoniaGMI1000    3880          932           68     4812      80.6            1.4
         RalstoniaPSI07    3676          650          170     4326      85.0            3.9
         RalstoniaUW163    3436          867           56     4303      79.9            1.3
                  Smeli    4758         1365          135     6123      77.7            2.2
                   SynE    2583           53            4     2636      98.0            0.2
          SyringaeB728a    4308          723           89     5031      85.6            1.8
SyringaeB728a_mexBdelta    2855          466           66     3321      86.0            2.0
                 WCS417    5067          384          134     5451      93.0            2.5
        acidovorax_3H11    3553         1168         1168     4721      75.3           24.7
                 azobra    2772         1156          114     3928      70.6            2.9
                 psRCH2    2247         1991          346     4238      53.0            8.2
     pseudo13_GW456_L13    3088          615           13     3703      83.4            0.4
           pseudo1_N1B4    3903         1370          139     5273      74.0            2.6
           pseudo3_N2E3    5395          140           10     5535      97.5            0.2
         pseudo5_N2C3_1    3344         1452           12     4796      69.7            0.3
           pseudo6_N2E2    3766          936           50     4702      80.1            1.1

# Spot-check: known organisms
spot_checks = [
    ('Putida', 'PP_0001'),    # P. putida chromosome partitioning
    ('Putida', 'PP_0002'),    # P. putida
    ('MR1', 'SO_0001'),       # Shewanella oneidensis MR-1
    ('MR1', 'SO_0002'),       # Shewanella
]

print("=== SPOT CHECKS ===")
for orgId, locusId in spot_checks:
    hit = link_table[(link_table['orgId'] == orgId) & (link_table['locusId'] == locusId)]
    if len(hit) > 0:
        h = hit.iloc[0]
        core_status = 'core' if h.get('is_core') else ('singleton' if h.get('is_singleton') else 'auxiliary')
        print(f"  {orgId}:{locusId} -> {h['gene_cluster_id']} "
              f"(pident={h['pident']:.1f}%, {core_status})")
    else:
        print(f"  {orgId}:{locusId} -> NO MATCH")

=== SPOT CHECKS ===
  Putida:PP_0001 -> DOAB01000360.1_6 (pident=99.2%, core)
  Putida:PP_0002 -> NZ_NHBB01000001.1_48 (pident=100.0%, core)
  MR1:SO_0001 -> NO MATCH
  MR1:SO_0002 -> NO MATCH

# Note E. coli exclusion
ecoli_orgs = ['Keio']  # E. coli BW25113
for org in ecoli_orgs:
    in_mapping = org in mapping['orgId'].values
    in_results = org in link_table['orgId'].values if len(link_table) > 0 else False
    print(f"E. coli check ({org}):")
    print(f"  In organism mapping: {in_mapping}")
    print(f"  In link table: {in_results}")
    if not in_mapping:
        print(f"  Reason: main s__Escherichia_coli clade absent from pangenome")
        print(f"  (too large — would have thousands of genomes)")

E. coli check (Keio):
  In organism mapping: True
  In link table: True

print("=" * 60)
print("NB03 SUMMARY")
print("=" * 60)
print(f"Link table: {len(link_table):,} rows")
print(f"Organisms: {link_table['orgId'].nunique()}")
print(f"Gene clusters: {link_table['gene_cluster_id'].nunique():,}")
print(f"")
print(f"Coverage: {summary['pct_matched'].median():.1f}% median across organisms")
print(f"Identity: {link_table['pident'].median():.1f}% median across all hits")
print(f"")
high_cov = (summary['pct_matched'] >= 90).sum()
print(f"Organisms with ≥90% coverage: {high_cov}/{len(summary)}")
print(f"")
if link_table['is_core'].notna().any():
    print(f"Conservation: {link_table['is_core'].sum():,} core, "
          f"{link_table['is_auxiliary'].sum():,} auxiliary, "
          f"{link_table['is_singleton'].sum():,} singleton")
print(f"")
print(f"Output: {output_path}")
print("=" * 60)

# Promotion decision
if summary['pct_matched'].median() >= 90 and link_table['pident'].median() >= 95:
    print("\n** Quality sufficient for promotion to data/fitnessbrowser_link/ **")
    print("Run: cp data/fb_pangenome_link.tsv ../../data/fitnessbrowser_link/")
else:
    print("\n** Quality below threshold for promotion — review issues above **")

============================================================
NB03 SUMMARY
============================================================
Link table: 177,863 rows
Organisms: 44
Gene clusters: 163,974

Coverage: 94.2% median across organisms
Identity: 100.0% median across all hits

Organisms with ≥90% coverage: 34/59

Conservation: 145,821 core, 32,042 auxiliary, 7,574 singleton

Output: ../data/fb_pangenome_link.tsv
============================================================

** Quality sufficient for promotion to data/fitnessbrowser_link/ **
Run: cp data/fb_pangenome_link.tsv ../../data/fitnessbrowser_link/

03 Build Link Table

NB03: Build Link Table & QC¶

1. Load DIAMOND Results¶

2. Resolve Multi-Clade Ambiguities¶

3. Enrich with Gene Cluster Conservation Status¶

4. Build Final Link Table¶

5. Quality Control¶

6. Summary & Promotion Decision¶

	orgId	locusId	gene_cluster_id	gtdb_species_clade_id	pident	evalue	bitscore	is_core	is_auxiliary	is_singleton
0	ANA3	7022495	NC_008573.1_1	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	1.210000e-213	585.0	False	True	True
1	ANA3	7022496	NC_008573.1_2	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	2.580000e-50	149.0	False	True	True
2	ANA3	7022497	NC_008573.1_3	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	2.080000e-241	653.0	False	True	True
3	ANA3	7022498	NC_008573.1_4	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	5.440000e-66	191.0	False	True	True
4	ANA3	7022499	NC_008573.1_5	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	4.440000e-84	238.0	False	True	True
5	ANA3	7022500	NC_008573.1_6	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	0.000000e+00	1452.0	False	True	True
6	ANA3	7022501	NC_008573.1_7	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	1.730000e-153	423.0	False	True	True
7	ANA3	7022502	NC_008573.1_8	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	8.110000e-305	820.0	False	True	True
8	ANA3	7022503	NC_008573.1_9	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	1.520000e-92	261.0	False	True	True
9	ANA3	7022504	NZ_NAJR01000019.1_39	s__Shewanella_sp000203935--RS_GCF_000203935.1	99.5	2.860000e-271	732.0	False	True	False