import os
import re
from glob import glob

import pandas as pd
import numpy as np

# Robust repo root detection — works from the notebook dir, the project dir,
# the repo root, or any other directory located below the repo root.
def find_repo_root(start, marker=("projects", "amr_fitness_cost")):
    """Return the closest ancestor of *start* (inclusive) that contains *marker*.

    Parameters
    ----------
    start : str
        Directory from which the upward search begins.
    marker : tuple of str
        Path components that must exist below the repo root.

    Returns
    -------
    str or None
        Absolute path of the first directory (walking upward from *start*)
        for which ``os.path.join(directory, *marker)`` exists, or ``None``
        when the filesystem root is reached without a match.
    """
    current = os.path.abspath(start)
    while True:
        if os.path.exists(os.path.join(current, *marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:
            # dirname() of the filesystem root is itself: nothing left to try
            return None
        current = parent


_here = os.path.abspath("")
REPO = find_repo_root(_here)
if REPO is None:
    # Fallback kept from the original logic: assume the working directory is
    # <repo>/projects/<project>/notebooks, i.e. three levels below the root.
    REPO = os.path.abspath(os.path.join(_here, "..", "..", ".."))

PROJECT = os.path.join(REPO, "projects", "amr_fitness_cost")
DATA = os.path.join(PROJECT, "data")

# Prior project data
CVF_DATA = os.path.join(REPO, "projects", "conservation_vs_fitness", "data")
FM_DATA = os.path.join(REPO, "projects", "fitness_modules", "data")
EG_DATA = os.path.join(REPO, "projects", "essential_genome", "data")

print(f"REPO: {REPO}")
print(f"FB link exists: {os.path.exists(os.path.join(CVF_DATA, 'fb_pangenome_link.tsv'))}")

REPO: /home/psdehal/pangenome_science/BERIL-research-observatory
FB link exists: True

# Read the FB <-> pangenome link table and report its size and the
# core / auxiliary / singleton breakdown of the linked clusters.
fb_link_path = os.path.join(CVF_DATA, "fb_pangenome_link.tsv")
fb_link = pd.read_csv(fb_link_path, sep="\t")

core_flag = fb_link['is_core']
singleton_flag = fb_link['is_singleton']
# Count auxiliary rows that are not also flagged as singletons
auxiliary_only = fb_link['is_auxiliary'] & ~singleton_flag

print(f"FB-pangenome links: {len(fb_link):,}")
print(f"Unique clusters: {fb_link['gene_cluster_id'].nunique():,}")
print(f"Organisms: {fb_link['orgId'].nunique()}")
print("\nConservation breakdown:")
print(f"  Core: {core_flag.sum():,} ({core_flag.mean():.1%})")
print(f"  Auxiliary: {auxiliary_only.sum():,}")
print(f"  Singleton: {singleton_flag.sum():,}")
fb_link.head()

FB-pangenome links: 177,863
Unique clusters: 163,974
Organisms: 44

Conservation breakdown:
  Core: 145,821 (82.0%)
  Auxiliary: 24,468
  Singleton: 7,574

# Tier 1 source: bakta_amr hits restricted to FB-linked gene clusters
amr_raw = pd.read_csv(os.path.join(DATA, "bakta_amr_fb_clusters.csv"))
n_amr_hits = len(amr_raw)
n_amr_clusters = amr_raw['gene_cluster_id'].nunique()
print(f"bakta_amr hits in FB clusters: {n_amr_hits}")
print(f"Unique clusters: {n_amr_clusters}")

# Inner-join onto the FB link table so every AMR hit carries the
# orgId, locusId and conservation flags of the linked FB gene(s)
tier1 = pd.merge(fb_link, amr_raw, how="inner", on="gene_cluster_id")
n_tier1_genes = tier1.groupby(['orgId', 'locusId']).ngroups
print(f"\nTier 1 (bakta_amr x FB link): {len(tier1)} gene-organism pairs")
print(f"  Unique genes (orgId, locusId): {n_tier1_genes}")
print(f"  Unique clusters: {tier1['gene_cluster_id'].nunique()}")
print(f"  Organisms: {tier1['orgId'].nunique()}")

# The 20 most frequent AMR gene names
top_amr_genes = tier1['amr_gene'].value_counts().head(20)
print("\nTop AMR genes:")
print(top_amr_genes.to_string())

bakta_amr hits in FB clusters: 171
Unique clusters: 171

Tier 1 (bakta_amr x FB link): 178 gene-organism pairs
  Unique genes (orgId, locusId): 178
  Unique clusters: 171
  Organisms: 37

Top AMR genes:
amr_gene
ampC       14
bla        14
blaOXA     10
mexE       10
merA        6
merP        6
arsD        6
arsN2       6
catB        5
emhB        5
arsC        4
asr         4
silR        3
silA        3
merR        3
merT        3
merE        3
vat         3
aac(6')     3
blaCAR      3

# Define AMR class and mechanism mappings
# Based on AMRFinderPlus gene naming conventions
#
# NOTE: classify_amr_gene() returns the FIRST class (in dict order) with a
# matching pattern, so a specific pattern must be listed before any broader
# pattern that would also match the same gene name.

AMR_CLASS_PATTERNS = {
    # Antibiotic resistance
    'beta_lactam': [r'^bla', r'^ampC', r'^OXA', r'^CTX-M', r'^TEM', r'^SHV',
                    r'^KPC', r'^NDM', r'^VIM', r'^IMP'],
    'aminoglycoside': [r'^aac', r'^aph', r'^ant', r'^aad', r'^str[AB]',
                       r'^rmt[A-Z]', r'^armA'],
    'tetracycline': [r'^tet\(', r'^tet[A-Z]$', r'^otr'],
    'chloramphenicol': [r'^cat', r'^cml', r'^floR', r'^fexA'],
    'macrolide_lincosamide': [r'^erm', r'^msr', r'^mph', r'^mef',
                              r'^lnu', r'^lin'],
    'glycopeptide': [r'^van[A-Z]$'],
    'rifampin': [r'^arr', r'^rox'],
    'fluoroquinolone': [r'^qnr', r'^oqx', r'^qep'],
    'sulfonamide': [r'^sul[0-9]', r'^dfr'],
    'polymyxin': [r'^mcr'],
    'fosfomycin': [r'^fos[A-Z]'],
    # Efflux systems
    # efflux_mate is listed before efflux_rnd on purpose: the RND pattern
    # ^mdt[A-Z] would otherwise swallow ^mdt[KL] and make it unreachable.
    'efflux_mate': [r'^mdt[KL]', r'^norM', r'^dinF'],
    'efflux_rnd': [r'^acr[A-Z]', r'^mex[A-Z]', r'^ade[A-Z]', r'^opr[MN]',
                   r'^tolC', r'^emh[A-C]', r'^ttg[A-H]', r'^mdt[A-Z]',
                   r'^mux[A-C]'],
    'efflux_mfs': [r'^emrD', r'^emrB', r'^emr[AE]', r'^norA', r'^norB',
                   r'^lmr', r'^bmr'],
    # Metal resistance
    'mercury': [r'^mer[A-Z]'],
    'arsenic': [r'^ars[A-Z]'],
    'copper': [r'^cop[A-Z]', r'^cue[A-Z]', r'^cus[A-Z]', r'^pco[A-Z]'],
    'silver': [r'^sil[A-Z]'],
    'chromate': [r'^chr[A-Z]'],
    'tellurite': [r'^ter[A-Z]', r'^teh[AB]'],
}

AMR_MECHANISM_MAP = {
    'enzymatic_inactivation': ['beta_lactam', 'aminoglycoside', 'chloramphenicol',
                                'fosfomycin'],
    'efflux': ['efflux_rnd', 'efflux_mfs', 'efflux_mate', 'tetracycline'],
    'target_modification': ['macrolide_lincosamide', 'glycopeptide', 'rifampin'],
    'target_protection': [],  # tet(M), tet(O) etc — ribosomal protection
    'metal_resistance': ['mercury', 'arsenic', 'copper', 'silver', 'chromate', 'tellurite'],
    'other': ['fluoroquinolone', 'sulfonamide', 'polymyxin'],
}

# Invert mechanism map: AMR class -> mechanism
CLASS_TO_MECHANISM = {
    cls: mechanism
    for mechanism, classes in AMR_MECHANISM_MAP.items()
    for cls in classes
}

# Also classify by resistance type: antibiotic vs metal.
# Derived from the mechanism map so the two definitions cannot drift apart.
METAL_CLASSES = set(AMR_MECHANISM_MAP['metal_resistance'])


def classify_amr_gene(gene_name):
    """Classify an AMR gene name into class and mechanism.

    Parameters
    ----------
    gene_name : str or NaN/None
        Gene symbol, e.g. ``'ampC'`` or ``'tet(A)'``.

    Returns
    -------
    tuple of (str, str)
        ``(amr_class, amr_mechanism)``. The class is the first key of
        ``AMR_CLASS_PATTERNS`` (in dict order) having a pattern that matches
        ``gene_name`` case-insensitively; the mechanism comes from
        ``CLASS_TO_MECHANISM`` (``'other'`` when the class has no entry).
        Missing, empty or unmatched names give ``('unclassified', 'unknown')``.
    """
    if pd.isna(gene_name) or gene_name == '':
        return 'unclassified', 'unknown'
    for amr_class, patterns in AMR_CLASS_PATTERNS.items():
        if any(re.search(pattern, gene_name, re.IGNORECASE) for pattern in patterns):
            return amr_class, CLASS_TO_MECHANISM.get(amr_class, 'other')
    return 'unclassified', 'unknown'


def classify_amr_product(product):
    """Classify based on bakta product description.

    The description is lower-cased and tested against a fixed, ordered list
    of keyword rules; the first rule that matches decides the result.

    Parameters
    ----------
    product : str or NaN/None
        Free-text product description.

    Returns
    -------
    tuple of (str, str)
        ``(amr_class, amr_mechanism)``, or ``('unclassified', 'unknown')``
        when the description is missing, empty, or matches no rule.
    """
    if pd.isna(product) or product == '':
        return 'unclassified', 'unknown'
    text = product.lower()

    def mentions(*keywords):
        """Return True when any of *keywords* occurs in the description."""
        return any(keyword in text for keyword in keywords)

    if mentions('beta-lactamase'):
        return 'beta_lactam', 'enzymatic_inactivation'
    if mentions('aminoglycoside'):
        return 'aminoglycoside', 'enzymatic_inactivation'
    if mentions('chloramphenicol'):
        if mentions('acetyltransferase'):
            return 'chloramphenicol', 'enzymatic_inactivation'
        if mentions('efflux'):
            # An exporter is not an inactivating enzyme: report the mechanism
            # as efflux (previously mislabelled 'enzymatic_inactivation').
            return 'chloramphenicol', 'efflux'
        # Any other chloramphenicol description falls through to later rules
    if mentions('tetracycline') and mentions('efflux', 'resistance'):
        return 'tetracycline', 'efflux'
    if mentions('macrolide', 'erythromycin'):
        return 'macrolide_lincosamide', 'target_modification'
    if mentions('vancomycin'):
        return 'glycopeptide', 'target_modification'
    if mentions('multidrug') and mentions('efflux', 'resistance'):
        return 'efflux_rnd', 'efflux'
    if mentions('mercury') and mentions('resistance', 'reductase'):
        return 'mercury', 'metal_resistance'
    if mentions('arsenic', 'arsenate') and mentions('resistance', 'reductase'):
        return 'arsenic', 'metal_resistance'
    if mentions('copper') and mentions('resistance'):
        return 'copper', 'metal_resistance'
    if mentions('chromate') and mentions('resistance'):
        return 'chromate', 'metal_resistance'
    if mentions('antibiotic') and mentions('efflux', 'resistance'):
        return 'efflux_rnd', 'efflux'
    return 'unclassified', 'unknown'


# Classify Tier 1 genes by gene name
gene_calls = [classify_amr_gene(g) for g in tier1['amr_gene']]
tier1['amr_class'] = [amr_class for amr_class, _ in gene_calls]
tier1['amr_mechanism'] = [mechanism for _, mechanism in gene_calls]

# For unclassified, try product.
# The results are written back as plain lists, one column at a time.
# Assigning a DataFrame through .loc aligns on column labels, so a frame
# with columns 0/1 (as produced by apply(pd.Series)) would be written into
# 'amr_class'/'amr_mechanism' as NaN and the fallback would silently wipe
# the rows it was meant to rescue.
mask = tier1['amr_class'] == 'unclassified'
if mask.any():
    product_calls = [classify_amr_product(p) for p in tier1.loc[mask, 'amr_product']]
    tier1.loc[mask, 'amr_class'] = [amr_class for amr_class, _ in product_calls]
    tier1.loc[mask, 'amr_mechanism'] = [mechanism for _, mechanism in product_calls]

# Add resistance type (anything that is not a metal class counts as antibiotic)
tier1['resistance_type'] = tier1['amr_class'].apply(
    lambda c: 'metal' if c in METAL_CLASSES else 'antibiotic')

# Add intrinsic vs acquired proxy (core = likely intrinsic)
tier1['intrinsic_proxy'] = tier1['is_core'].apply(
    lambda c: 'intrinsic' if c else 'acquired')

tier1['amr_tier'] = 1

print("Tier 1 AMR class distribution:")
print(tier1['amr_class'].value_counts().to_string())
print("\nMechanism distribution:")
print(tier1['amr_mechanism'].value_counts().to_string())
print("\nResistance type:")
print(tier1['resistance_type'].value_counts().to_string())

Tier 1 AMR class distribution:
amr_class
beta_lactam        44
mercury            27
arsenic            22
efflux_rnd         22
silver             11
aminoglycoside      9
chloramphenicol     6
copper              5
rifampin            2
fosfomycin          1
fluoroquinolone     1
efflux_mfs          1
chromate            1

Mechanism distribution:
amr_mechanism
metal_resistance          66
enzymatic_inactivation    60
efflux                    23
target_modification        2
other                      1

Resistance type:
resistance_type
antibiotic    112
metal          66

# Tier 2 source: bakta annotations selected by AMR keyword
annot_amr = pd.read_csv(os.path.join(DATA, "bakta_annotations_amr_keywords.csv"))
print(f"AMR keyword annotation hits: {len(annot_amr)}")
print(f"Unique clusters: {annot_amr['gene_cluster_id'].nunique()}")

# Keep only clusters that Tier 1 has not already claimed
tier1_clusters = set(tier1['gene_cluster_id'].unique())
already_tier1 = annot_amr['gene_cluster_id'].isin(tier1_clusters)
annot_amr_new = annot_amr[~already_tier1].copy()
print(f"New clusters (not in Tier 1): {annot_amr_new['gene_cluster_id'].nunique()}")

# Attach orgId / locusId / conservation flags from the FB link table
tier2_annotations = annot_amr_new[['gene_cluster_id', 'gene', 'product']]
tier2_raw = pd.merge(fb_link, tier2_annotations, how='inner', on='gene_cluster_id')


def _product_call(product):
    """Wrap the product classification as a two-element Series."""
    return pd.Series(classify_amr_product(product))


def _resistance_type(amr_class):
    """Label metal classes 'metal'; everything else 'antibiotic'."""
    return 'metal' if amr_class in METAL_CLASSES else 'antibiotic'


def _intrinsic_proxy(is_core):
    """Use core-genome membership as a proxy for intrinsic resistance."""
    return 'intrinsic' if is_core else 'acquired'


# Tier 2 has no curated gene call, so classification relies on the product text
tier2_raw[['amr_class', 'amr_mechanism']] = tier2_raw['product'].apply(_product_call)
tier2_raw['resistance_type'] = tier2_raw['amr_class'].apply(_resistance_type)
tier2_raw['intrinsic_proxy'] = tier2_raw['is_core'].apply(_intrinsic_proxy)
tier2_raw['amr_tier'] = 2
# Mirror the Tier 1 column names so both tiers can be stacked later
tier2_raw['amr_gene'] = tier2_raw['gene']
tier2_raw['amr_product'] = tier2_raw['product']

# One row per (orgId, locusId); the first occurrence is the one kept
tier2 = tier2_raw.drop_duplicates(subset=['orgId', 'locusId']).copy()

print(f"\nTier 2: {len(tier2)} gene-organism pairs")
print(f"  Organisms: {tier2['orgId'].nunique()}")
print("\nTier 2 class distribution:")
print(tier2['amr_class'].value_counts().to_string())

AMR keyword annotation hits: 1179
Unique clusters: 1179
New clusters (not in Tier 1): 1083

Tier 2: 1174 gene-organism pairs
  Organisms: 43

Tier 2 class distribution:
amr_class
efflux_rnd               434
beta_lactam              359
unclassified             114
copper                    92
aminoglycoside            78
arsenic                   61
chromate                  11
chloramphenicol           10
tetracycline               9
macrolide_lincosamide      6

# Columns shared by both tiers in the combined table
common_cols = ['orgId', 'locusId', 'gene_cluster_id', 'gtdb_species_clade_id',
               'pident', 'evalue', 'bitscore', 'is_core', 'is_auxiliary', 'is_singleton',
               'amr_gene', 'amr_product', 'amr_class', 'amr_mechanism',
               'resistance_type', 'intrinsic_proxy', 'amr_tier']

# Any shared column a tier lacks is created and filled with None
for tier_frame in (tier1, tier2):
    for col in common_cols:
        if col not in tier_frame.columns:
            tier_frame[col] = None

# Stack Tier 1 on top of Tier 2
stacked_tiers = [tier1[common_cols], tier2[common_cols]]
amr_all = pd.concat(stacked_tiers, ignore_index=True)

# Deduplicate: sorting by tier puts Tier 1 first, so keep='first' prefers it
# whenever the same (orgId, locusId) shows up in both tiers
amr_all = amr_all.sort_values('amr_tier')
amr_all = amr_all.drop_duplicates(subset=['orgId', 'locusId'], keep='first')

tier_column = amr_all['amr_tier']
core_column = amr_all['is_core']
singleton_column = amr_all['is_singleton']
auxiliary_not_singleton = amr_all['is_auxiliary'] & ~singleton_column

print(f"Combined AMR genes: {len(amr_all)}")
print(f"  Tier 1: {(tier_column == 1).sum()}")
print(f"  Tier 2: {(tier_column == 2).sum()}")
print(f"  Organisms: {amr_all['orgId'].nunique()}")
print("\nConservation:")
print(f"  Core: {core_column.sum()} ({core_column.mean():.1%})")
print(f"  Auxiliary: {auxiliary_not_singleton.sum()}")
print(f"  Singleton: {singleton_column.sum()}")
print("\nResistance type:")
print(amr_all['resistance_type'].value_counts().to_string())
print("\nIntrinsic vs acquired:")
print(amr_all['intrinsic_proxy'].value_counts().to_string())

# Save
amr_genes_path = os.path.join(DATA, "amr_genes_fb.csv")
amr_all.to_csv(amr_genes_path, index=False)
print("\nSaved to data/amr_genes_fb.csv")

Combined AMR genes: 1352
  Tier 1: 178
  Tier 2: 1174
  Organisms: 43

Conservation:
  Core: 1082 (80.0%)
  Auxiliary: 205
  Singleton: 65

Resistance type:
resistance_type
antibiotic    1122
metal          230

Intrinsic vs acquired:
intrinsic_proxy
intrinsic    1082
acquired      270

Saved to data/amr_genes_fb.csv

# Collect the per-organism experiment metadata files (sorted for a stable order)
exp_files = sorted(glob(os.path.join(FM_DATA, "annotations", "*_experiments.csv")))
print(f"Found {len(exp_files)} experiment files")

# Each file name is "<orgId>_experiments.csv"; tag every row with that orgId
all_exps = [
    pd.read_csv(path).assign(
        orgId=os.path.basename(path).replace("_experiments.csv", ""))
    for path in exp_files
]

exps = pd.concat(all_exps, ignore_index=True)
print(f"Total experiments: {len(exps)} across {exps['orgId'].nunique()} organisms")

Found 32 experiment files
Total experiments: 6804 across 32 organisms

# Antibiotic compound patterns
# key = substring searched for (case-insensitively) in the experiment text,
# value = (canonical compound name, antibiotic class)
# NOTE(review): 'macrolide' is the only class label here that is not also a
# key of the AMR class taxonomy ('macrolide_lincosamide') — confirm whether
# downstream code matches experiments to AMR genes on this label.
ANTIBIOTIC_PATTERNS = {
    'ampicillin': ('ampicillin', 'beta_lactam'),
    'carbenicillin': ('carbenicillin', 'beta_lactam'),
    'cefoxitin': ('cefoxitin', 'beta_lactam'),
    'chloramphenicol': ('chloramphenicol', 'chloramphenicol'),
    'tetracycline': ('tetracycline', 'tetracycline'),
    'oxytetracycline': ('oxytetracycline', 'tetracycline'),
    'anhydrotetracycline': ('anhydrotetracycline', 'tetracycline'),
    'vancomycin': ('vancomycin', 'glycopeptide'),
    'rifampicin': ('rifampicin', 'rifampin'),
    'rifampin': ('rifampin', 'rifampin'),
    'gentamicin': ('gentamicin', 'aminoglycoside'),
    'spectinomycin': ('spectinomycin', 'aminoglycoside'),
    'kanamycin': ('kanamycin', 'aminoglycoside'),
    'streptomycin': ('streptomycin', 'aminoglycoside'),
    'erythromycin': ('erythromycin', 'macrolide'),
    'nalidixic': ('nalidixic_acid', 'fluoroquinolone'),
    'ciprofloxacin': ('ciprofloxacin', 'fluoroquinolone'),
    'polymyxin': ('polymyxin_b', 'polymyxin'),
    'trimethoprim': ('trimethoprim', 'sulfonamide'),
    'novobiocin': ('novobiocin', 'other_antibiotic'),
    'bacitracin': ('bacitracin', 'other_antibiotic'),
    'colistin': ('colistin', 'polymyxin'),
    'penicillin': ('penicillin', 'beta_lactam'),
}

# Metal compound patterns (for separating metal from other stress)
METAL_PATTERNS = [
    'cobalt', 'nickel', 'copper', 'zinc', 'aluminum', 'iron',
    'tungsten', 'molybdenum', 'chromium', 'uranium', 'selenium',
    'manganese', 'mercury', 'cadmium', 'arsenic', 'silver',
    'CoCl', 'NiCl', 'CuCl', 'CuSO', 'ZnCl', 'ZnSO', 'AlCl',
    'FeCl', 'FeSO', 'CrO', 'Na2CrO', 'Na2SeO', 'Na2WO', 'Na2MoO',
    'CdCl', 'AgNO', 'HgCl', 'Uranyl',
]


def _is_formula(pattern):
    """Return True for chemical-formula patterns such as 'CrO' or 'Na2CrO'.

    A pattern counts as a formula when it has an upper-case letter or a
    digit after its first character; plain words ('copper', 'Uranyl') do not.
    """
    return any(ch.isupper() or ch.isdigit() for ch in pattern[1:])


def _metal_regex(pattern):
    """Compile a case-insensitive regex for one METAL_PATTERNS entry.

    Plain substring search on lower-cased text gave false positives ('cro'
    in "sucrose", 'iron' in "environment"). A match may therefore not be
    preceded by a letter, and a formula may not be followed by one either.
    """
    body = r'(?<![A-Za-z])' + re.escape(pattern)
    if _is_formula(pattern):
        body += r'(?![A-Za-z])'
    return re.compile(body, re.IGNORECASE)


# Compiled once; classify_experiment() is applied to every experiment row
_METAL_REGEXES = [_metal_regex(p) for p in METAL_PATTERNS]


def _text_field(row, key):
    """Return ``row[key]`` lower-cased, or '' when it is missing or NaN."""
    value = row.get(key)
    return str(value).lower() if pd.notna(value) else ''


def classify_experiment(row):
    """Classify an experiment into categories.

    Parameters
    ----------
    row : mapping with ``.get`` (e.g. a pandas Series or dict)
        Read fields: ``condition_1``, ``expDesc``, ``expGroup``.

    Returns
    -------
    tuple of (str, str, str)
        ``(exp_category, antibiotic_name, antibiotic_class)``. The last two
        are empty strings unless the category is ``'antibiotic'``. Categories
        are tried in this order: antibiotic, metal_stress, carbon_nitrogen,
        standard, other_stress, other.
    """
    cond = _text_field(row, 'condition_1')
    desc = _text_field(row, 'expDesc')
    group = _text_field(row, 'expGroup')

    # Check for antibiotics. A pattern that is merely a substring of a longer
    # matching pattern is skipped, so "oxytetracycline" is not reported as
    # plain "tetracycline". Otherwise the first match in dict order wins.
    hits = [p.lower() for p in ANTIBIOTIC_PATTERNS
            if p.lower() in cond or p.lower() in desc]
    for pattern, (name, abx_class) in ANTIBIOTIC_PATTERNS.items():
        key = pattern.lower()
        if key not in hits:
            continue
        if any(key != other and key in other for other in hits):
            continue
        return 'antibiotic', name, abx_class

    # Check for metals
    if any(rx.search(cond) or rx.search(desc) for rx in _METAL_REGEXES):
        return 'metal_stress', '', ''

    # Standard/baseline conditions
    if group in ('carbon source', 'nitrogen source', 'nutrient'):
        return 'carbon_nitrogen', '', ''
    if cond == '' or cond == 'nan':
        return 'standard', '', ''
    if group == 'stress':
        return 'other_stress', '', ''

    return 'other', '', ''


# Run the classifier over every experiment row; the returned 3-tuples are
# expanded into three columns and appended to the experiment table
classifications = exps.apply(classify_experiment, axis=1, result_type='expand')
classifications.columns = ['exp_category', 'antibiotic_name', 'antibiotic_class']
exps_classified = pd.concat([exps, classifications], axis=1)

category_counts = exps_classified['exp_category'].value_counts()
print("Experiment categories:")
print(category_counts.to_string())

# Subset of experiments classified as antibiotic treatments
is_antibiotic = exps_classified['exp_category'] == 'antibiotic'
abx_exps = exps_classified[is_antibiotic]
print("\nAntibiotic experiments by compound:")
print(abx_exps['antibiotic_name'].value_counts().to_string())

# The 15 organisms with the most antibiotic experiments
abx_per_org = abx_exps.groupby('orgId').size().sort_values(ascending=False)
print("\nAntibiotic experiments by organism:")
print(abx_per_org.head(15).to_string())

# Save
classification_path = os.path.join(DATA, "experiment_classification.csv")
exps_classified.to_csv(classification_path, index=False)
print("\nSaved to data/experiment_classification.csv")

Experiment categories:
exp_category
carbon_nitrogen    2868
other_stress       1862
standard            727
other               457
metal_stress        447
antibiotic          443

Antibiotic experiments by compound:
antibiotic_name
tetracycline       69
chloramphenicol    51
polymyxin_b        51
spectinomycin      50
vancomycin         47
nalidixic_acid     44
bacitracin         39
carbenicillin      35
gentamicin         20
rifampicin         19
cefoxitin           9
trimethoprim        6
ciprofloxacin       3

Antibiotic experiments by organism:
orgId
Btheta     44
Miya       29
Caulo      26
Ponti      24
psRCH2     24
WCS417     22
Cola       21
Marino     21
SynE       21
Phaeo      19
Pedo557    18
Korea      17
PV4        16
Kang       16
DvH        15

Saved to data/experiment_classification.csv

# Which organisms have both AMR genes and fitness matrices?
# Matrix files are named "<orgId>_fitness_matrix.csv"
matrix_files = glob(os.path.join(FM_DATA, 'matrices', '*_fitness_matrix.csv'))
orgs_with_matrices = {
    os.path.basename(path).replace('_fitness_matrix.csv', '')
    for path in matrix_files
}
orgs_with_amr = set(amr_all['orgId'].unique())
orgs_with_abx = set(abx_exps['orgId'].unique())

orgs_amr_fitness = orgs_with_amr & orgs_with_matrices
orgs_amr_abx = orgs_with_amr & orgs_with_abx & orgs_with_matrices

print(f"Organisms with fitness matrices: {len(orgs_with_matrices)}")
print(f"Organisms with AMR genes: {len(orgs_with_amr)}")
print(f"Organisms with AMR + fitness: {len(orgs_amr_fitness)}")
print(f"Organisms with AMR + fitness + antibiotic experiments: {len(orgs_amr_abx)}")

print("\nAMR genes per organism (with fitness data):")
amr_fitness = amr_all[amr_all['orgId'].isin(orgs_amr_fitness)]

# Row masks that do not depend on the organism are built once, outside the loop
is_baseline_exp = exps_classified['exp_category'].isin(['standard', 'carbon_nitrogen'])
is_abx_exp = exps_classified['exp_category'] == 'antibiotic'

for org in sorted(orgs_amr_fitness):
    org_amr = amr_fitness[amr_fitness['orgId'] == org]
    t1 = (org_amr['amr_tier'] == 1).sum()
    t2 = (org_amr['amr_tier'] == 2).sum()
    in_org = exps_classified['orgId'] == org
    n_std = len(exps_classified[in_org & is_baseline_exp])
    n_abx = len(exps_classified[in_org & is_abx_exp])
    print(f"  {org:25s}  T1={t1:3d}  T2={t2:3d}  total={len(org_amr):4d}  "
          f"std_exps={n_std:3d}  abx_exps={n_abx:3d}")

Organisms with fitness matrices: 32
Organisms with AMR genes: 43
Organisms with AMR + fitness: 28
Organisms with AMR + fitness + antibiotic experiments: 26

AMR genes per organism (with fitness data):
  ANA3                       T1= 22  T2= 26  total=  48  std_exps= 58  abx_exps=  4
  BFirm                      T1=  4  T2= 49  total=  53  std_exps= 91  abx_exps=  3
  Btheta                     T1=  4  T2= 28  total=  32  std_exps=160  abx_exps= 44
  Caulo                      T1=  2  T2= 33  total=  35  std_exps=103  abx_exps= 26
  Cup4G11                    T1=  4  T2= 74  total=  78  std_exps= 72  abx_exps=  4
  Dino                       T1=  0  T2= 31  total=  31  std_exps= 88  abx_exps=  9
  DvH                        T1=  1  T2= 17  total=  18  std_exps=363  abx_exps= 15
  Keio                       T1=  0  T2=  9  total=   9  std_exps=102  abx_exps= 12
  Korea                      T1=  8  T2= 28  total=  36  std_exps= 70  abx_exps= 17
  Koxy                       T1= 10  T2= 38  total=  48  std_exps=163  abx_exps=  1
  MR1                        T1=  4  T2= 18  total=  22  std_exps= 93  abx_exps= 10
  Marino                     T1=  3  T2= 19  total=  22  std_exps=122  abx_exps= 21
  Methanococcus_JJ           T1=  0  T2=  2  total=   2  std_exps= 77  abx_exps=  0
  Methanococcus_S2           T1=  0  T2=  2  total=   2  std_exps=153  abx_exps=  0
  PV4                        T1=  3  T2= 18  total=  21  std_exps= 59  abx_exps= 16
  Pedo557                    T1=  0  T2=  1  total=   1  std_exps= 87  abx_exps= 18
  Phaeo                      T1=  1  T2= 16  total=  17  std_exps=163  abx_exps= 19
  Ponti                      T1=  2  T2= 30  total=  32  std_exps= 10  abx_exps= 24
  Putida                     T1= 12  T2= 36  total=  48  std_exps=260  abx_exps=  5
  SynE                       T1=  1  T2= 10  total=  11  std_exps= 31  abx_exps= 21
  WCS417                     T1=  3  T2= 31  total=  34  std_exps=128  abx_exps= 22
  acidovorax_3H11            T1=  6  T2= 31  total=  37  std_exps= 81  abx_exps=  8
  psRCH2                     T1=  2  T2= 31  total=  33  std_exps=164  abx_exps= 24
  pseudo13_GW456_L13         T1=  5  T2= 34  total=  39  std_exps= 78  abx_exps=  5
  pseudo1_N1B4               T1=  7  T2= 30  total=  37  std_exps= 93  abx_exps=  8
  pseudo3_N2E3               T1=  5  T2= 24  total=  29  std_exps=140  abx_exps= 12
  pseudo5_N2C3_1             T1=  4  T2= 27  total=  31  std_exps=122  abx_exps=  8
  pseudo6_N2E2               T1=  6  T2= 28  total=  34  std_exps=128  abx_exps= 14

# Spot-check: look for well-known AMR genes.
# Each name is searched as a case-insensitive substring of amr_gene, so
# short names such as 'bla' or 'cat' also match longer gene symbols.
known_amr = ['acrA', 'acrB', 'tolC', 'acrR', 'marA', 'marR',
              'ompF', 'ompC', 'bla', 'tet', 'cat', 'erm']

print("Spot-check of known AMR genes in the dataset:")
for gene in known_amr:
    gene_hit = amr_all['amr_gene'].str.contains(gene, case=False, na=False)
    matches = amr_all[gene_hit]
    if len(matches) == 0:
        print(f"  {gene:10s}: not found")
        continue
    orgs = matches['orgId'].unique()
    print(f"  {gene:10s}: {len(matches)} hits in {len(orgs)} organisms: {', '.join(sorted(orgs)[:5])}")

print("\nData assembly complete.")
print(f"  AMR genes: {len(amr_all)} ({(amr_all['amr_tier']==1).sum()} Tier 1, {(amr_all['amr_tier']==2).sum()} Tier 2)")
print(f"  Experiments classified: {len(exps_classified)}")
print("  Ready for NB02: fitness cost analysis")

Spot-check of known AMR genes in the dataset:
  acrA      : 50 hits in 22 organisms: BFirm, Btheta, Cup4G11, Dda3937, DvH
  acrB      : 57 hits in 24 organisms: ANA3, BFirm, Btheta, Burk376, Cup4G11
  tolC      : 14 hits in 8 organisms: Btheta, Koxy, Putida, RalstoniaBSBF1503, RalstoniaGMI1000
  acrR      : 6 hits in 6 organisms: ANA3, Dda3937, Ddia6719, DdiaME23, Koxy
  marA      : 3 hits in 3 organisms: Ddia6719, DdiaME23, Koxy
  marR      : 3 hits in 3 organisms: Dda3937, Keio, Koxy
  ompF      : not found
  ompC      : not found
  bla       : 30 hits in 25 organisms: ANA3, BFirm, Btheta, Caulo, Cup4G11
  tet       : 1 hits in 1 organisms: Cup4G11
  cat       : 8 hits in 6 organisms: Btheta, DvH, Korea, MR1, PV4
  erm       : not found

Data assembly complete.
  AMR genes: 1352 (178 Tier 1, 1174 Tier 2)
  Experiments classified: 6804
  Ready for NB02: fitness cost analysis

01 Data Assembly

NB01: Data Assembly

1. Load FB-pangenome link table

2. Tier 1: Strict bakta_amr hits

3. AMR Class Taxonomy

4. Tier 2: bakta_annotations keyword matches

5. Combine tiers

6. Experiment Classification

7. Summary: organisms with both AMR genes and fitness data

8. Spot-check known AMR genes

	orgId	locusId	gene_cluster_id	gtdb_species_clade_id	pident	evalue	bitscore	is_core	is_auxiliary	is_singleton
0	ANA3	7022495	NC_008573.1_1	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	1.210000e-213	585.0	False	True	True
1	ANA3	7022496	NC_008573.1_2	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	2.580000e-50	149.0	False	True	True
2	ANA3	7022497	NC_008573.1_3	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	2.080000e-241	653.0	False	True	True
3	ANA3	7022498	NC_008573.1_4	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	5.440000e-66	191.0	False	True	True
4	ANA3	7022499	NC_008573.1_5	s__Shewanella_sp000203935--RS_GCF_000203935.1	100.0	4.440000e-84	238.0	False	True	True