import os, sys, warnings
warnings.filterwarnings('ignore')

# Spark session — works on JupyterHub (no import needed) or local (needs proxy)
try:
    spark = get_spark_session()
except NameError:
    sys.path.insert(0, os.path.join(os.getcwd(), '..', '..', '..', 'scripts'))
    from get_spark_session import get_spark_session
    spark = get_spark_session()

spark.sql("SELECT 1 AS test").show()
print("Spark session ready.")

+----+
|test|
+----+
|   1|
+----+

Spark session ready.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from pathlib import Path

# Output directories
DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')
DATA_DIR.mkdir(exist_ok=True)
FIG_DIR.mkdir(exist_ok=True)

plt.rcParams.update({
    'figure.figsize': (12, 6),
    'figure.dpi': 150,
    'font.size': 11,
    'axes.titlesize': 13,
    'savefig.bbox': 'tight',
    'savefig.dpi': 150,
})

# Full scan of bakta_amr — small table (83K rows)
amr_raw = spark.sql("""
    SELECT * FROM kbase_ke_pangenome.bakta_amr
""").toPandas()

print(f"Total AMR hits: {len(amr_raw):,}")
print(f"Columns: {list(amr_raw.columns)}")
print(f"\nDistinct values:")
for col in amr_raw.columns:
    n = amr_raw[col].nunique()
    print(f"  {col}: {n:,} unique")
amr_raw.head(10)

Total AMR hits: 83,008
Columns: ['gene_cluster_id', 'amr_gene', 'amr_product', 'method', 'identity', 'query_cov', 'subject_cov', 'accession']

Distinct values:
  gene_cluster_id: 82,908 unique
  amr_gene: 1,939 unique
  amr_product: 2,079 unique
  method: 5 unique
  identity: 1,386 unique
  query_cov: 3,356 unique
  subject_cov: 3,370 unique
  accession: 3,126 unique

# Detection method breakdown
print("=== Detection Methods ===")
print(amr_raw['method'].value_counts().to_string())
print()

# AMR gene family distribution
print(f"\n=== Top 30 AMR Genes ===")
print(amr_raw['amr_gene'].value_counts().head(30).to_string())
print()

# AMR product distribution  
print(f"\n=== Top 30 AMR Products ===")
print(amr_raw['amr_product'].value_counts().head(30).to_string())

=== Detection Methods ===
method
HMM         42742
BLASTP      18886
EXACTP      10812
PARTIALP     8049
ALLELEP      2519


=== Top 30 AMR Genes ===
amr_gene
bla        6115
merA       4506
arsD       2611
merP       2222
vanR       1929
blaOXA     1811
arr        1607
cml        1302
ampC       1273
arsN1      1213
arsN2      1173
aac(6')    1112
merC        950
blaR1       922
vat         922
arsC        919
merE        904
erm         850
merF        820
catA        767
merR        732
merD        723
helR        689
merT        685
merB        677
vanT        652
vanG        636
aph(3')     632
rox         591
asr         579


=== Top 30 AMR Products ===
amr_product
mercury(II) reductase                                          4506
class A beta-lactamase                                         4054
arsenite efflux transporter metallochaperone ArsD              2611
mercury resistance system periplasmic binding protein MerP     2222
VanR-ABDEGLN family response regulator transcription factor    1675
NAD(+)--rifampin ADP-ribosyltransferase                        1555
class D beta-lactamase                                         1374
class C beta-lactamase                                         1224
arsenic resistance N-acetyltransferase ArsN2                   1173
aminoglycoside 6'-N-acetyltransferase                          1132
Cmx/CmrA family chloramphenicol efflux MFS transporter          972
organomercurial transporter MerC                                950
Vat family streptogramin A O-acetyltransferase                  922
broad-spectrum mercury transporter MerE                         904
subclass B3 metallo-beta-lactamase                              902
mercury resistance system transport protein MerF                820
glutaredoxin-dependent arsenate reductase                       818
BlaR1 family beta-lactam sensor/signal transducer               806
mercury resistance transcriptional regulator MerR               732
mercury resistance co-regulator MerD                            723
23S ribosomal RNA methyltransferase Erm                         705
arsinothricin resistance N-acetyltransferase ArsN1 family B     695
RNA polymerase recycling motor ATPase HelR                      689
mercuric transport protein MerT                                 685
organomercurial lyase MerB                                      677
VanT-G-like membrane-bound serine racemase                      646
APH(3') family aminoglycoside O-phosphotransferase              632
type A chloramphenicol O-acetyltransferase                      619
D-alanine--D-serine ligase VanG                                 619
rifampin monooxygenase                                          589

# Identity and coverage distributions
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

amr_raw['identity'].hist(bins=50, ax=axes[0], color='steelblue', edgecolor='white')
axes[0].set_xlabel('Sequence Identity (%)')
axes[0].set_ylabel('Count')
axes[0].set_title('AMRFinderPlus Identity Distribution')

amr_raw['query_cov'].hist(bins=50, ax=axes[1], color='darkorange', edgecolor='white')
axes[1].set_xlabel('Query Coverage (%)')
axes[1].set_title('Query Coverage Distribution')

amr_raw['subject_cov'].hist(bins=50, ax=axes[2], color='seagreen', edgecolor='white')
axes[2].set_xlabel('Subject Coverage (%)')
axes[2].set_title('Subject Coverage Distribution')

plt.suptitle('AMRFinderPlus Hit Quality Distributions (n={:,})'.format(len(amr_raw)), y=1.02)
plt.tight_layout()
plt.savefig(FIG_DIR / 'amr_hit_quality_distributions.png')
plt.show()

# Summary statistics for numeric columns
print("\n=== Numeric Column Summary ===")
print(amr_raw[['identity', 'query_cov', 'subject_cov']].describe().round(2))

=== Numeric Column Summary ===
       identity  query_cov  subject_cov
count  40266.00   40266.00     40266.00
mean       0.98       0.98         0.93
std        0.03       0.06         0.14
min        0.51       0.10         0.50
25%        0.97       1.00         0.99
50%        1.00       1.00         1.00
75%        1.00       1.00         1.00
max        1.00       1.18         1.00

# Join AMR → gene_cluster for conservation flags + species
# bakta_amr is small (83K) so we can use it as the driving table
# NOTE: is_singleton is a SUBSET of is_auxiliary (not mutually exclusive)
# We create a clean 3-way classification: core / auxiliary_non_singleton / singleton
amr_gc = spark.sql("""
    SELECT
        amr.gene_cluster_id,
        amr.amr_gene,
        amr.amr_product,
        amr.method,
        amr.identity,
        amr.query_cov,
        amr.subject_cov,
        amr.accession,
        gc.gtdb_species_clade_id,
        gc.is_core,
        gc.is_auxiliary,
        gc.is_singleton,
        gc.likelihood,
        CASE
            WHEN gc.is_core THEN 'Core'
            WHEN gc.is_singleton THEN 'Singleton'
            WHEN gc.is_auxiliary THEN 'Auxiliary'
            ELSE 'Unknown'
        END AS conservation_class
    FROM kbase_ke_pangenome.bakta_amr amr
    JOIN kbase_ke_pangenome.gene_cluster gc
        ON amr.gene_cluster_id = gc.gene_cluster_id
""").toPandas()

print(f"AMR clusters with gene_cluster info: {len(amr_gc):,}")
print(f"Distinct species: {amr_gc['gtdb_species_clade_id'].nunique():,}")
print(f"\nJoin coverage: {len(amr_gc)/len(amr_raw)*100:.1f}% of AMR hits matched")
print(f"\nConservation class distribution:")
print(amr_gc['conservation_class'].value_counts().to_string())
amr_gc.head()

AMR clusters with gene_cluster info: 83,008
Distinct species: 14,723

Join coverage: 100.0% of AMR hits matched

Conservation class distribution:
conservation_class
Singleton    29937
Auxiliary    27899
Core         25172

# Conservation class distribution — AMR vs pangenome baseline
# Using mutually exclusive classes: Core / Auxiliary (non-singleton) / Singleton
amr_cons_counts = amr_gc['conservation_class'].value_counts()
amr_conservation = pd.DataFrame({
    'class': ['Core', 'Auxiliary', 'Singleton'],
    'amr_count': [
        amr_cons_counts.get('Core', 0),
        amr_cons_counts.get('Auxiliary', 0),
        amr_cons_counts.get('Singleton', 0)
    ]
})
amr_conservation['amr_pct'] = amr_conservation['amr_count'] / amr_conservation['amr_count'].sum() * 100

# Pangenome baseline: core=62M, aux(total)=70M, singleton=50M out of 132M
# Auxiliary (non-singleton) = 70M - 50M = 20M
baseline_core = 62_062_686
baseline_aux_nonsing = 70_468_815 - 50_203_195  # auxiliary minus singleton
baseline_sing = 50_203_195
baseline_total = baseline_core + baseline_aux_nonsing + baseline_sing

amr_conservation['baseline_pct'] = [
    baseline_core / baseline_total * 100,
    baseline_aux_nonsing / baseline_total * 100,
    baseline_sing / baseline_total * 100
]

print("=== AMR Conservation vs Pangenome Baseline ===")
print("(Mutually exclusive: Core / Auxiliary-non-singleton / Singleton)")
print(amr_conservation.to_string(index=False))
print(f"\nTotal AMR clusters: {amr_conservation['amr_count'].sum():,}")

# Chi-squared test
from scipy import stats
observed = amr_conservation['amr_count'].values
expected_pcts = amr_conservation['baseline_pct'].values / 100
expected = expected_pcts * observed.sum()
chi2, p_val = stats.chisquare(observed, expected)
print(f"\nChi-squared test vs baseline: chi2={chi2:.1f}, p={p_val:.2e}")

=== AMR Conservation vs Pangenome Baseline ===
(Mutually exclusive: Core / Auxiliary-non-singleton / Singleton)
    class  amr_count   amr_pct  baseline_pct
     Core      25172 30.324788     46.828630
Auxiliary      27899 33.610013     15.291172
Singleton      29937 36.065199     37.880198

Total AMR clusters: 83,008

Chi-squared test vs baseline: chi2=23117.2, p=0.00e+00

# Visualization: AMR conservation vs baseline
fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(len(amr_conservation))
width = 0.35

bars1 = ax.bar(x - width/2, amr_conservation['amr_pct'], width, label='AMR Genes', color='#e74c3c')
bars2 = ax.bar(x + width/2, amr_conservation['baseline_pct'], width, label='Pangenome Baseline', color='#95a5a6')

ax.set_xlabel('Conservation Class')
ax.set_ylabel('Percentage (%)')
ax.set_title('AMR Gene Conservation vs Pangenome Baseline')
ax.set_xticks(x)
ax.set_xticklabels(amr_conservation['class'])
ax.legend()

# Add percentage labels
for bar in bars1:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
            f'{bar.get_height():.1f}%', ha='center', va='bottom', fontsize=10)
for bar in bars2:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
            f'{bar.get_height():.1f}%', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig(FIG_DIR / 'amr_conservation_vs_baseline.png')
plt.show()

# Classify AMR mechanisms from product descriptions
# These keywords are based on standard AMR mechanism categories (CARD ontology)
def classify_mechanism(product):
    if pd.isna(product):
        return 'Unknown'
    p = product.lower()
    # Efflux pumps
    if any(k in p for k in ['efflux', 'pump', 'transporter', 'exporter', 'permease']):
        return 'Efflux'
    # Beta-lactamases and enzymatic inactivation
    if any(k in p for k in ['beta-lactamase', 'lactamase', 'carbapenemase']):
        return 'Beta-lactamase'
    if any(k in p for k in ['acetyltransferase', 'phosphotransferase', 'nucleotidyltransferase',
                             'aminoglycoside-modifying', 'chloramphenicol acetyltransferase',
                             'inactivat', 'hydrolase']):
        return 'Enzymatic inactivation'
    # Target modification/protection
    if any(k in p for k in ['methyltransferase', 'ribosomal protection', '16s rrna methyltransferase',
                             'target protect', 'erm(', 'cfr(']):
        return 'Target modification'
    if any(k in p for k in ['mutant', 'mutation', 'altered']):
        return 'Target mutation'
    # Cell wall / membrane
    if any(k in p for k in ['penicillin-binding', 'pbp', 'vancomycin resistance',
                             'van', 'lipid a', 'mcr-']):
        return 'Cell wall modification'
    # Ribosomal
    if any(k in p for k in ['ribosom', '23s rrna', '16s rrna']):
        return 'Ribosomal'
    # Regulatory
    if any(k in p for k in ['regulator', 'repressor', 'activator', 'transcription']):
        return 'Regulatory'
    # Stress / general resistance
    if any(k in p for k in ['oxidoreductase', 'reductase', 'oxidase']):
        return 'Oxidoreductase'
    return 'Other/Unclassified'

amr_gc['mechanism'] = amr_gc['amr_product'].apply(classify_mechanism)

print("=== AMR Mechanism Classification ===")
mech_counts = amr_gc['mechanism'].value_counts()
for mech, count in mech_counts.items():
    print(f"  {mech}: {count:,} ({count/len(amr_gc)*100:.1f}%)")
print(f"\nTotal: {len(amr_gc):,}")

=== AMR Mechanism Classification ===
  Other/Unclassified: 18,448 (22.2%)
  Enzymatic inactivation: 15,755 (19.0%)
  Efflux: 14,338 (17.3%)
  Beta-lactamase: 12,824 (15.4%)
  Target modification: 6,955 (8.4%)
  Oxidoreductase: 6,852 (8.3%)
  Cell wall modification: 4,964 (6.0%)
  Regulatory: 2,848 (3.4%)
  Ribosomal: 24 (0.0%)

Total: 83,008

# Conservation breakdown by mechanism (using mutually exclusive classes)
mech_cons = amr_gc.groupby('mechanism').agg(
    total=('gene_cluster_id', 'count'),
    n_core=('conservation_class', lambda x: (x == 'Core').sum()),
    n_auxiliary=('conservation_class', lambda x: (x == 'Auxiliary').sum()),
    n_singleton=('conservation_class', lambda x: (x == 'Singleton').sum()),
    n_species=('gtdb_species_clade_id', 'nunique'),
    mean_identity=('identity', 'mean')
).reset_index()

mech_cons['pct_core'] = (mech_cons['n_core'] / mech_cons['total'] * 100).round(1)
mech_cons['pct_aux'] = (mech_cons['n_auxiliary'] / mech_cons['total'] * 100).round(1)
mech_cons['pct_sing'] = (mech_cons['n_singleton'] / mech_cons['total'] * 100).round(1)
mech_cons = mech_cons.sort_values('total', ascending=False)

print("=== Conservation by AMR Mechanism (mutually exclusive classes) ===")
print(mech_cons[['mechanism', 'total', 'n_species', 'pct_core', 'pct_aux', 'pct_sing', 'mean_identity']].to_string(index=False))

=== Conservation by AMR Mechanism (mutually exclusive classes) ===
             mechanism  total  n_species  pct_core  pct_aux  pct_sing  mean_identity
    Other/Unclassified  18448       5661      24.0     39.3      36.8       0.974350
Enzymatic inactivation  15755       6502      28.2     32.8      39.0       0.991705
                Efflux  14338       5187      30.9     34.6      34.5       0.973296
        Beta-lactamase  12824       7048      54.9     22.0      23.2       0.989255
   Target modification   6955       3353      19.6     35.9      44.5       0.989641
        Oxidoreductase   6852       3530      15.4     38.3      46.3       0.975365
Cell wall modification   4964       2027      45.2     22.5      32.3       0.967857
            Regulatory   2848        975       6.5     51.4      42.1       0.964266
             Ribosomal     24          9       0.0     75.0      25.0       0.986638

# Stacked bar: mechanism x conservation
fig, ax = plt.subplots(figsize=(12, 6))
mech_plot = mech_cons.sort_values('total', ascending=True).tail(10)  # top 10

ax.barh(mech_plot['mechanism'], mech_plot['pct_core'], color='#2ecc71', label='Core')
ax.barh(mech_plot['mechanism'], mech_plot['pct_aux'], left=mech_plot['pct_core'], color='#f39c12', label='Auxiliary')
ax.barh(mech_plot['mechanism'], mech_plot['pct_sing'],
        left=mech_plot['pct_core'] + mech_plot['pct_aux'], color='#e74c3c', label='Singleton')

ax.set_xlabel('Percentage (%)')
ax.set_title('Conservation Class by AMR Mechanism')
ax.legend(loc='lower right')

# Add count labels
for i, (_, row) in enumerate(mech_plot.iterrows()):
    ax.text(101, i, f'n={int(row["total"]):,}', va='center', fontsize=9)

ax.set_xlim(0, 115)
plt.tight_layout()
plt.savefig(FIG_DIR / 'amr_mechanism_conservation.png')
plt.show()

# Species-level AMR counts (using mutually exclusive conservation classes)
species_amr = amr_gc.groupby('gtdb_species_clade_id').agg(
    n_amr=('gene_cluster_id', 'count'),
    n_amr_genes=('amr_gene', 'nunique'),
    n_core_amr=('conservation_class', lambda x: (x == 'Core').sum()),
    n_aux_amr=('conservation_class', lambda x: (x == 'Auxiliary').sum()),
    n_sing_amr=('conservation_class', lambda x: (x == 'Singleton').sum()),
    n_mechanisms=('mechanism', 'nunique')
).reset_index()

print(f"Species with at least 1 AMR gene: {len(species_amr):,}")
print(f"Total species in pangenome: 27,690")
print(f"Coverage: {len(species_amr)/27690*100:.1f}%")
print(f"\n=== AMR Cluster Count per Species ===")
print(species_amr['n_amr'].describe().round(1))

# Distribution plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

species_amr['n_amr'].hist(bins=50, ax=axes[0], color='#e74c3c', edgecolor='white')
axes[0].set_xlabel('AMR Clusters per Species')
axes[0].set_ylabel('Number of Species')
axes[0].set_title(f'AMR Gene Count Distribution ({len(species_amr):,} species)')

species_amr['n_amr'].hist(bins=50, ax=axes[1], color='#e74c3c', edgecolor='white', log=True)
axes[1].set_xlabel('AMR Clusters per Species')
axes[1].set_ylabel('Number of Species (log)')
axes[1].set_title('AMR Gene Count Distribution (log scale)')

plt.tight_layout()
plt.savefig(FIG_DIR / 'amr_species_density_distribution.png')
plt.show()

# Top 20 AMR-rich species
print(f"\n=== Top 20 Species by AMR Cluster Count ===")
top_species = species_amr.nlargest(20, 'n_amr')
for _, row in top_species.iterrows():
    sp = row['gtdb_species_clade_id'].split('--')[0].replace('s__', '')
    print(f"  {sp}: {int(row['n_amr'])} AMR clusters ({int(row['n_core_amr'])} core, "
          f"{int(row['n_aux_amr'])} aux, {int(row['n_sing_amr'])} sing)")

Species with at least 1 AMR gene: 14,723
Total species in pangenome: 27,690
Coverage: 53.2%

=== AMR Cluster Count per Species ===
count    14723.0
mean         5.6
std         19.8
min          1.0
25%          1.0
50%          2.0
75%          5.0
max       1120.0
Name: n_amr, dtype: float64

=== Top 20 Species by AMR Cluster Count ===
  Klebsiella_pneumoniae: 1120 AMR clusters (7 core, 738 aux, 375 sing)
  Salmonella_enterica: 840 AMR clusters (11 core, 641 aux, 188 sing)
  Staphylococcus_aureus: 645 AMR clusters (9 core, 481 aux, 155 sing)
  Enterobacter_hormaechei_A: 540 AMR clusters (5 core, 372 aux, 163 sing)
  Pseudomonas_aeruginosa: 538 AMR clusters (8 core, 354 aux, 176 sing)
  Acinetobacter_baumannii: 382 AMR clusters (7 core, 246 aux, 129 sing)
  Klebsiella_quasipneumoniae: 380 AMR clusters (8 core, 258 aux, 114 sing)
  Citrobacter_freundii: 379 AMR clusters (5 core, 252 aux, 122 sing)
  Klebsiella_variicola: 301 AMR clusters (8 core, 194 aux, 99 sing)
  Klebsiella_michiganensis: 274 AMR clusters (5 core, 195 aux, 74 sing)
  Enterococcus_B_faecium: 261 AMR clusters (3 core, 184 aux, 74 sing)
  Enterobacter_roggenkampii: 255 AMR clusters (6 core, 181 aux, 68 sing)
  Enterobacter_kobei: 250 AMR clusters (11 core, 171 aux, 68 sing)
  Enterococcus_faecalis: 234 AMR clusters (1 core, 168 aux, 65 sing)
  Proteus_mirabilis: 229 AMR clusters (4 core, 134 aux, 91 sing)
  Enterobacter_cloacae: 225 AMR clusters (5 core, 150 aux, 70 sing)
  Klebsiella_aerogenes: 220 AMR clusters (10 core, 144 aux, 66 sing)
  Citrobacter_portucalensis: 216 AMR clusters (3 core, 150 aux, 63 sing)
  Klebsiella_oxytoca: 214 AMR clusters (10 core, 146 aux, 58 sing)
  Enterobacter_cloacae_M: 196 AMR clusters (7 core, 145 aux, 44 sing)

# Get taxonomy for all species
# gtdb_taxonomy_r214v1 joins on genome_id (not gtdb_species_clade_id)
# So we go: genome → gtdb_taxonomy, then deduplicate to species level
taxonomy = spark.sql("""
    SELECT g.gtdb_species_clade_id,
           t.domain, t.phylum, t.class, t.`order`, t.family, t.genus
    FROM kbase_ke_pangenome.genome g
    JOIN kbase_ke_pangenome.gtdb_taxonomy_r214v1 t
        ON g.genome_id = t.genome_id
""").toPandas()

# Deduplicate to species level (one row per species)
taxonomy_species = taxonomy.drop_duplicates('gtdb_species_clade_id')
print(f"Taxonomy records (species-level): {len(taxonomy_species):,}")

# Get pangenome stats (small table)
# Actual column names: no_core, no_aux_genome, no_singleton_gene_clusters, no_gene_clusters, no_genomes
pangenome_stats = spark.sql("""
    SELECT gtdb_species_clade_id, no_genomes, no_gene_clusters,
           no_core, no_aux_genome, no_singleton_gene_clusters
    FROM kbase_ke_pangenome.pangenome
""").toPandas()

# Rename for clarity in downstream analysis
pangenome_stats = pangenome_stats.rename(columns={
    'no_core': 'no_core_gene_clusters',
    'no_aux_genome': 'no_auxiliary_gene_clusters'
})

print(f"Pangenome stats: {len(pangenome_stats):,} species")

Taxonomy records (species-level): 27,690

Pangenome stats: 27,702 species

# Merge: species_amr + taxonomy + pangenome stats
species_full = species_amr.merge(taxonomy_species, on='gtdb_species_clade_id', how='left')
species_full = species_full.merge(pangenome_stats, on='gtdb_species_clade_id', how='left')

# AMR density = AMR clusters / total clusters
species_full['amr_density'] = species_full['n_amr'] / species_full['no_gene_clusters']
species_full['pct_core_amr'] = species_full['n_core_amr'] / species_full['n_amr'] * 100
species_full['openness'] = species_full['no_singleton_gene_clusters'] / species_full['no_gene_clusters']

print(f"Species with full data: {len(species_full):,}")
print(f"\n=== Phylum-level AMR Summary ===")
phylum_amr = species_full.groupby('phylum').agg(
    n_species_with_amr=('gtdb_species_clade_id', 'count'),
    total_amr=('n_amr', 'sum'),
    mean_amr=('n_amr', 'mean'),
    median_amr=('n_amr', 'median'),
    mean_amr_density=('amr_density', 'mean'),
    mean_pct_core=('pct_core_amr', 'mean')
).round(3).sort_values('total_amr', ascending=False)

# Add total species per phylum for context
total_species_by_phylum = taxonomy_species.groupby('phylum').size().rename('total_species')
phylum_amr = phylum_amr.join(total_species_by_phylum)
phylum_amr['pct_species_with_amr'] = (phylum_amr['n_species_with_amr'] / phylum_amr['total_species'] * 100).round(1)

print(phylum_amr.head(20).to_string())

Species with full data: 14,723

=== Phylum-level AMR Summary ===
                      n_species_with_amr  total_amr  mean_amr  median_amr  mean_amr_density  mean_pct_core  total_species  pct_species_with_amr
phylum                                                                                                                                         
p__Pseudomonadota                   4992      42904     8.595         3.0             0.001         52.784           7456                  67.0
p__Bacillota                        1401      12614     9.004         4.0             0.001         40.540           2146                  65.3
p__Actinomycetota                   2036       9027     4.434         3.0             0.001         59.354           3172                  64.2
p__Bacillota_A                      2254       8776     3.894         3.0             0.001         36.667           3917                  57.5
p__Bacteroidota                     1835       5458     2.974         2.0             0.001         46.082           3629                  50.6
p__Bacillota_C                       118        396     3.356         2.0             0.001         22.829            235                  50.2
p__Cyanobacteriota                   176        386     2.193         2.0             0.000         55.179            469                  37.5
p__Acidobacteriota                   175        373     2.131         1.0             0.000         71.077            283                  61.8
p__Campylobacterota                   96        294     3.062         1.0             0.001         56.033            271                  35.4
p__Chloroflexota                     176        291     1.653         1.0             0.000         58.454            457                  38.5
p__Verrucomicrobiota                 140        211     1.507         1.0             0.000         66.332            553                  25.3
p__Myxococcota                        86        189     2.198         2.0             0.000         75.678            157                  54.8
p__Spirochaetota                     117        171     1.462         1.0             0.000         31.481            296                  39.5
p__Gemmatimonadota                    74        164     2.216         1.0             0.000         55.025            102                  72.5
p__Bacillota_B                        40        138     3.450         2.0             0.001         38.716            101                  39.6
p__Desulfobacterota                   72        133     1.847         1.0             0.000         48.901            319                  22.6
p__Halobacteriota                     82        126     1.537         1.0             0.000         59.959            259                  31.7
p__Thermoplasmatota                   65        109     1.677         1.0             0.001         26.603            294                  22.1
p__Planctomycetota                    84        102     1.214         1.0             0.000         64.286            348                  24.1
p__Deinococcota                       41         98     2.390         2.0             0.001         65.579             52                  78.8

# Phylum-level visualization: AMR prevalence and density
fig, axes = plt.subplots(1, 2, figsize=(16, 7))

# Top 15 phyla by number of AMR-carrying species
top_phyla = phylum_amr.nlargest(15, 'n_species_with_amr')

# Panel 1: % species with AMR
top_phyla_sorted = top_phyla.sort_values('pct_species_with_amr')
colors = plt.cm.RdYlGn_r(top_phyla_sorted['pct_species_with_amr'] / top_phyla_sorted['pct_species_with_amr'].max())
axes[0].barh(top_phyla_sorted.index, top_phyla_sorted['pct_species_with_amr'], color=colors)
axes[0].set_xlabel('% Species with AMR Genes')
axes[0].set_title('AMR Prevalence by Phylum')
for i, (idx, row) in enumerate(top_phyla_sorted.iterrows()):
    axes[0].text(row['pct_species_with_amr'] + 0.5, i,
                f'{int(row["n_species_with_amr"])}/{int(row["total_species"])}',
                va='center', fontsize=8)

# Panel 2: Mean AMR clusters per species
top_phyla_sorted2 = top_phyla.sort_values('mean_amr')
colors2 = plt.cm.Reds(top_phyla_sorted2['mean_amr'] / top_phyla_sorted2['mean_amr'].max())
axes[1].barh(top_phyla_sorted2.index, top_phyla_sorted2['mean_amr'], color=colors2)
axes[1].set_xlabel('Mean AMR Clusters per Species')
axes[1].set_title('AMR Density by Phylum')

plt.suptitle('Taxonomic Distribution of AMR Genes', fontsize=14, y=1.01)
plt.tight_layout()
plt.savefig(FIG_DIR / 'amr_phylum_distribution.png')
plt.show()

# AMR count vs pangenome openness
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: AMR count vs openness
axes[0].scatter(species_full['openness'], species_full['n_amr'],
                alpha=0.3, s=10, color='#e74c3c')
axes[0].set_xlabel('Pangenome Openness (singleton fraction)')
axes[0].set_ylabel('AMR Clusters')
axes[0].set_title('AMR Count vs Openness')

# Correlation
from scipy.stats import spearmanr
rho, p = spearmanr(species_full['openness'].dropna(), species_full['n_amr'].dropna())
axes[0].text(0.05, 0.95, f'rho={rho:.3f}, p={p:.2e}',
             transform=axes[0].transAxes, fontsize=9, va='top')

# Panel 2: AMR count vs total gene clusters (pangenome size)
axes[1].scatter(species_full['no_gene_clusters'], species_full['n_amr'],
                alpha=0.3, s=10, color='#3498db')
axes[1].set_xlabel('Total Gene Clusters')
axes[1].set_ylabel('AMR Clusters')
axes[1].set_title('AMR Count vs Pangenome Size')
rho2, p2 = spearmanr(species_full['no_gene_clusters'].dropna(), species_full['n_amr'].dropna())
axes[1].text(0.05, 0.95, f'rho={rho2:.3f}, p={p2:.2e}',
             transform=axes[1].transAxes, fontsize=9, va='top')

# Panel 3: AMR count vs number of genomes
axes[2].scatter(species_full['no_genomes'], species_full['n_amr'],
                alpha=0.3, s=10, color='#9b59b6')
axes[2].set_xlabel('Number of Genomes')
axes[2].set_ylabel('AMR Clusters')
axes[2].set_title('AMR Count vs Genome Count')
rho3, p3 = spearmanr(species_full['no_genomes'].dropna(), species_full['n_amr'].dropna())
axes[2].text(0.05, 0.95, f'rho={rho3:.3f}, p={p3:.2e}',
             transform=axes[2].transAxes, fontsize=9, va='top')

plt.tight_layout()
plt.savefig(FIG_DIR / 'amr_vs_pangenome_structure.png')
plt.show()

# Get bakta annotations for AMR clusters
# Cannot use spark.createDataFrame (version mismatch), so use IN clause in chunks
amr_ids = amr_gc['gene_cluster_id'].unique().tolist()

def chunked_query(spark, ids, query_template, chunk_size=5000):
    """Run a query with IN clause in chunks to avoid size limits."""
    results = []
    for i in range(0, len(ids), chunk_size):
        chunk = ids[i:i+chunk_size]
        id_list = "','".join(chunk)
        query = query_template.format(id_list=f"'{id_list}'")
        results.append(spark.sql(query).toPandas())
    return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

# Actual bakta_annotations columns: gene, product, hypothetical, ec, go, cog_id, cog_category, kegg_orthology_id
amr_bakta = chunked_query(spark, amr_ids, """
    SELECT gene_cluster_id, product, gene, hypothetical,
           cog_category, ec, go, kegg_orthology_id
    FROM kbase_ke_pangenome.bakta_annotations
    WHERE gene_cluster_id IN ({id_list})
""")

print(f"Bakta annotations for AMR clusters: {len(amr_bakta):,}")
print(f"AMR clusters with bakta annotation: {amr_bakta['gene_cluster_id'].nunique():,} / {len(amr_ids):,}")

# Check annotation completeness
print(f"\n=== Annotation Field Completeness ===")
for col in ['product', 'gene', 'cog_category', 'ec', 'go', 'kegg_orthology_id']:
    non_null = amr_bakta[col].notna().sum()
    non_empty = (amr_bakta[col].notna() & (amr_bakta[col] != '') & (amr_bakta[col] != 'hypothetical protein')).sum()
    print(f"  {col}: {non_null:,} non-null, {non_empty:,} informative")

Bakta annotations for AMR clusters: 82,976
AMR clusters with bakta annotation: 82,908 / 82,908

=== Annotation Field Completeness ===
  product: 82,976 non-null, 82,976 informative
  gene: 82,940 non-null, 82,940 informative
  cog_category: 22,761 non-null, 22,761 informative
  ec: 34,594 non-null, 34,594 informative
  go: 51,660 non-null, 51,660 informative
  kegg_orthology_id: 37,284 non-null, 37,284 informative

# Get eggNOG annotations for AMR clusters (chunked IN clause)
amr_eggnog = chunked_query(spark, amr_ids, """
    SELECT query_name as gene_cluster_id,
           COG_category, Description, Preferred_name,
           KEGG_ko, KEGG_Pathway, GOs, PFAMs
    FROM kbase_ke_pangenome.eggnog_mapper_annotations
    WHERE query_name IN ({id_list})
""")

print(f"eggNOG annotations for AMR clusters: {len(amr_eggnog):,}")
print(f"AMR clusters with eggNOG: {amr_eggnog['gene_cluster_id'].nunique():,} / {len(amr_ids):,}")

# COG category distribution for AMR genes
print(f"\n=== COG Categories of AMR Genes ===")
cog_counts = amr_eggnog['COG_category'].value_counts().head(20)
print(cog_counts.to_string())

eggNOG annotations for AMR clusters: 77,156
AMR clusters with eggNOG: 77,156 / 82,908

=== COG Categories of AMR Genes ===
COG_category
S      16369
V      12637
J       7420
P       5487
M       4880
K       4758
C       4422
H       4003
EGP     3974
E       2659
T       2033
F       1290
O       1042
KT       977
CH       921
G        840
L        744
KTV      508
U        474
Q        359

# Get Pfam domains for AMR clusters (chunked IN clause)
# Actual columns: gene_cluster_id, pfam_id, pfam_name, start, stop, score, evalue, aa_cov, hmm_cov
amr_pfam = chunked_query(spark, amr_ids, """
    SELECT gene_cluster_id, pfam_id, pfam_name
    FROM kbase_ke_pangenome.bakta_pfam_domains
    WHERE gene_cluster_id IN ({id_list})
""")

print(f"Pfam domain hits for AMR clusters: {len(amr_pfam):,}")
print(f"AMR clusters with Pfam domains: {amr_pfam['gene_cluster_id'].nunique():,} / {len(amr_ids):,}")

print(f"\n=== Top 20 Pfam Domains in AMR Genes ===")
print(amr_pfam.groupby(['pfam_id', 'pfam_name']).size()
      .sort_values(ascending=False).head(20).to_string())

Pfam domain hits for AMR clusters: 0
AMR clusters with Pfam domains: 0 / 82,908

=== Top 20 Pfam Domains in AMR Genes ===
Series([], )

# Identify "annotation depth" tiers for AMR clusters
# Merge all annotation sources
amr_anno_depth = pd.DataFrame({'gene_cluster_id': amr_ids})

# Has bakta product (non-hypothetical)?
bakta_informative = set(amr_bakta.loc[
    (amr_bakta['product'].notna()) & 
    (~amr_bakta['product'].str.contains('hypothetical', case=False, na=True)),
    'gene_cluster_id'
].unique())

# Has eggNOG?
eggnog_annotated = set(amr_eggnog['gene_cluster_id'].unique())

# Has Pfam?
pfam_annotated = set(amr_pfam['gene_cluster_id'].unique())

amr_anno_depth['has_bakta_product'] = amr_anno_depth['gene_cluster_id'].isin(bakta_informative)
amr_anno_depth['has_eggnog'] = amr_anno_depth['gene_cluster_id'].isin(eggnog_annotated)
amr_anno_depth['has_pfam'] = amr_anno_depth['gene_cluster_id'].isin(pfam_annotated)
amr_anno_depth['n_sources'] = (amr_anno_depth['has_bakta_product'].astype(int) +
                                amr_anno_depth['has_eggnog'].astype(int) +
                                amr_anno_depth['has_pfam'].astype(int))

print("=== Annotation Depth of AMR Clusters ===")
print(f"Total AMR clusters: {len(amr_anno_depth):,}")
print(f"\nAnnotation sources:")
print(f"  Bakta product (non-hypothetical): {amr_anno_depth['has_bakta_product'].sum():,} ({amr_anno_depth['has_bakta_product'].mean()*100:.1f}%)")
print(f"  eggNOG: {amr_anno_depth['has_eggnog'].sum():,} ({amr_anno_depth['has_eggnog'].mean()*100:.1f}%)")
print(f"  Pfam: {amr_anno_depth['has_pfam'].sum():,} ({amr_anno_depth['has_pfam'].mean()*100:.1f}%)")
print(f"\nNumber of annotation sources:")
print(amr_anno_depth['n_sources'].value_counts().sort_index().to_string())

amr_only = amr_anno_depth[amr_anno_depth['n_sources'] == 0]
print(f"\n'AMR-only' clusters (no other annotation): {len(amr_only):,} ({len(amr_only)/len(amr_anno_depth)*100:.1f}%)")

=== Annotation Depth of AMR Clusters ===
Total AMR clusters: 82,908

Annotation sources:
  Bakta product (non-hypothetical): 82,908 (100.0%)
  eggNOG: 77,156 (93.1%)
  Pfam: 0 (0.0%)

Number of annotation sources:
n_sources
1     5752
2    77156

'AMR-only' clusters (no other annotation): 0 (0.0%)

# Save the core census dataset: AMR + conservation + species + mechanism + annotation depth
amr_census = amr_gc.merge(
    amr_anno_depth[['gene_cluster_id', 'has_bakta_product', 'has_eggnog', 'has_pfam', 'n_sources']],
    on='gene_cluster_id', how='left'
)

# Add taxonomy
amr_census = amr_census.merge(
    taxonomy_species[['gtdb_species_clade_id', 'phylum', 'class', 'order', 'family', 'genus']],
    on='gtdb_species_clade_id', how='left'
)

amr_census.to_csv(DATA_DIR / 'amr_census.csv', index=False)
print(f"Saved {len(amr_census):,} rows to data/amr_census.csv")
print(f"Columns: {list(amr_census.columns)}")

# Save species-level summary
species_full.to_csv(DATA_DIR / 'amr_species_summary.csv', index=False)
print(f"\nSaved {len(species_full):,} species to data/amr_species_summary.csv")

# Save phylum summary
phylum_amr.to_csv(DATA_DIR / 'amr_phylum_summary.csv')
print(f"Saved phylum summary to data/amr_phylum_summary.csv")

Saved 83,008 rows to data/amr_census.csv
Columns: ['gene_cluster_id', 'amr_gene', 'amr_product', 'method', 'identity', 'query_cov', 'subject_cov', 'accession', 'gtdb_species_clade_id', 'is_core', 'is_auxiliary', 'is_singleton', 'likelihood', 'conservation_class', 'mechanism', 'has_bakta_product', 'has_eggnog', 'has_pfam', 'n_sources', 'phylum', 'class', 'order', 'family', 'genus']

Saved 14,723 species to data/amr_species_summary.csv
Saved phylum summary to data/amr_phylum_summary.csv

	gene_cluster_id	amr_gene	amr_product	method	identity	query_cov	subject_cov	accession
0	NZ_CP021744.1_6105	rox	rifampin monooxygenase	HMM	NaN	NaN	NaN	NF033145.1
1	NZ_CP021744.1_6518	rox	rifampin monooxygenase	HMM	NaN	NaN	NaN	NF033145.1
2	NZ_CP021744.1_6758	bla	class A beta-lactamase	HMM	NaN	NaN	NaN	NF033103.1
3	NZ_CP097123.1_2672	cml	Cmx/CmrA family chloramphenicol efflux MFS tra...	HMM	NaN	NaN	NaN	NF033135.1
4	NZ_CP097123.1_3003	cpt	chloramphenicol phosphotransferase CPT	HMM	NaN	NaN	NaN	NF033114.1
5	NZ_CP097123.1_3739	ble	BLMA family bleomycin binding protein	HMM	NaN	NaN	NaN	NF033156.1
6	BMST01000005.1_428	bla	class A beta-lactamase	HMM	NaN	NaN	NaN	NF033103.1
7	NZ_BNBF01000004.1_2	cpt	chloramphenicol phosphotransferase CPT	HMM	NaN	NaN	NaN	NF033114.1
8	NZ_BNBF01000004.1_44	arr	NAD(+)--rifampin ADP-ribosyltransferase	HMM	NaN	NaN	NaN	NF033144.1
9	NZ_BNBF01000004.1_8	helR	RNA polymerase recycling motor ATPase HelR	HMM	NaN	NaN	NaN	NF041254.1

Metric	Value
Total AMR hits
Distinct AMR gene families
Distinct AMR products
Species with AMR
% species with AMR
Overall % core AMR
Overall % auxiliary AMR
Overall % singleton AMR
Baseline % core (all genes)	46.8%
Top mechanism
"AMR-only" clusters

01 Amr Census

NB01: AMR Data Census¶

1. Raw AMR Table: Overview¶

2. Join AMR to Gene Cluster — Conservation & Species Context¶

3. AMR Gene Families & Mechanism Classification¶

4. Species-Level AMR Density¶

5. Taxonomic Distribution¶

6. AMR vs Pangenome Openness¶

7. Functional Annotation of AMR Clusters¶

8. Save Census Data¶

9. Census Summary¶

	gene_cluster_id	amr_gene	amr_product	method	identity	query_cov	subject_cov	accession	gtdb_species_clade_id	is_core	is_auxiliary	is_singleton	likelihood	conservation_class
0	CAJTGB010000003.1_105	lsa	Lsa family ABC-F type ribosomal protection pro...	HMM	NaN	NaN	NaN	NF000167.2	s__Sporofaciens_sp910575835--GB_GCA_910575835.1	True	False	False	0.669497	Core
1	CAJTGB010000004.1_67	aadE	aminoglycoside 6-adenylyltransferase AadE	EXACTP	1.0	1.0	1.0	WP_001255868.1	s__Sporofaciens_sp910575835--GB_GCA_910575835.1	False	True	True	-5.362968	Singleton
2	CAJTGB010000005.1_167	vanR	VanR-ABDEGLN family response regulator transcr...	HMM	NaN	NaN	NaN	NF033117.2	s__Sporofaciens_sp910575835--GB_GCA_910575835.1	False	True	True	-5.362968	Singleton
3	CAJTGB010000005.1_169	vanG	D-alanine--D-serine ligase VanG	HMM	NaN	NaN	NaN	NF000091.3	s__Sporofaciens_sp910575835--GB_GCA_910575835.1	False	True	True	-5.362968	Singleton
4	CAJTGB010000005.1_172	vanT	VanT-G-like membrane-bound serine racemase	HMM	NaN	NaN	NaN	NF033131.1	s__Sporofaciens_sp910575835--GB_GCA_910575835.1	False	True	True	-5.362968	Singleton