import os
import warnings
from collections import Counter
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.stats.multitest import multipletests

warnings.filterwarnings('ignore', category=FutureWarning)

_here = os.path.abspath('')
if os.path.basename(_here) == 'notebooks':
    REPO = os.path.abspath(os.path.join(_here, '..', '..', '..'))
elif os.path.exists(os.path.join(_here, 'projects', 'amr_cofitness_networks')):
    REPO = _here
else:
    REPO = os.path.abspath(os.path.join(_here, '..', '..', '..'))

PROJECT = os.path.join(REPO, 'projects', 'amr_cofitness_networks')
DATA = os.path.join(PROJECT, 'data')
FIGURES = os.path.join(PROJECT, 'figures')
CVF_DATA = os.path.join(REPO, 'projects', 'conservation_vs_fitness', 'data')

# Load partners
partners = pd.read_csv(os.path.join(DATA, 'amr_cofitness_partners.csv'))
partners['amr_locusId'] = partners['amr_locusId'].astype(str)
partners['partner_locusId'] = partners['partner_locusId'].astype(str)
extra = partners[~partners['is_near_operon']].copy()

# Load FB link for locusId -> gene_cluster_id
fb_link = pd.read_csv(os.path.join(CVF_DATA, 'fb_pangenome_link.tsv'), sep='\t')
fb_link['locusId'] = fb_link['locusId'].astype(str)
locus_to_cluster = dict(zip(
    zip(fb_link['orgId'], fb_link['locusId']),
    fb_link['gene_cluster_id']
))

# Map partners to gene clusters
extra['partner_cluster_id'] = extra.apply(
    lambda r: locus_to_cluster.get((r['orgId'], r['partner_locusId'])), axis=1)

# Load InterProScan GO
go_annot = pd.read_csv(os.path.join(DATA, 'fb_interproscan_go.csv'))
cluster_go = go_annot.groupby('gene_cluster_id')['go_id'].apply(set).to_dict()

# Load bakta KEGG
bakta_kegg = pd.read_csv(os.path.join(DATA, 'fb_bakta_kegg.csv'))
cluster_kegg = dict(zip(bakta_kegg['gene_cluster_id'], bakta_kegg['kegg_orthology_id']))

# Known GO term descriptions
GO_DESCRIPTIONS = {
    'GO:0071973': 'flagellum-dependent motility',
    'GO:0044780': 'flagellum assembly',
    'GO:0009288': 'bacterial-type flagellum',
    'GO:0071978': 'flagellum-dependent swarming',
    'GO:0000105': 'histidine biosynthesis',
    'GO:0000162': 'tryptophan biosynthesis',
    'GO:0000160': 'phosphorelay signal transduction',
    'GO:0007165': 'signal transduction',
    'GO:0006935': 'chemotaxis',
    'GO:0000155': 'phosphorelay sensor kinase',
    'GO:0003774': 'motor activity',
    'GO:0016310': 'phosphorylation',
    'GO:0016020': 'membrane',
    'GO:0015074': 'DNA integration',
    'GO:0055085': 'transmembrane transport',
    'GO:0006355': 'regulation of transcription',
    'GO:0005524': 'ATP binding',
    'GO:0003677': 'DNA binding',
    'GO:0005886': 'plasma membrane',
    'GO:0006313': 'transposition, DNA-mediated',
}

print(f'Extra-operon partners: {len(extra):,}')
print(f'Mapped to clusters: {extra["partner_cluster_id"].notna().sum():,}')

Extra-operon partners: 179,375
Mapped to clusters: 171,051

# For each mechanism, which GO terms appear in support networks across organisms?
mechanisms = ['efflux', 'enzymatic_inactivation', 'metal_resistance']

# Focus on the key GO terms from NB03b (sig in >=1 organism)
key_go_terms = ['GO:0071973', 'GO:0044780', 'GO:0009288', 'GO:0071978',
                'GO:0000105', 'GO:0000162', 'GO:0000160', 'GO:0006935',
                'GO:0055085', 'GO:0006355', 'GO:0016020', 'GO:0007165',
                'GO:0016310', 'GO:0015074']

conservation_data = []

for mech in mechanisms:
    mech_extra = extra[extra['amr_mechanism'] == mech]
    
    for org in sorted(mech_extra['orgId'].unique()):
        org_partners = mech_extra[mech_extra['orgId'] == org]
        sup_clusters = set(org_partners['partner_cluster_id'].dropna())
        
        # Collect all GO terms in this organism's support network
        sup_go = set()
        for cid in sup_clusters:
            sup_go.update(cluster_go.get(cid, set()))
        
        for go_term in key_go_terms:
            conservation_data.append({
                'mechanism': mech,
                'orgId': org,
                'go_term': go_term,
                'go_desc': GO_DESCRIPTIONS.get(go_term, go_term),
                'present': go_term in sup_go,
            })

cons_df = pd.DataFrame(conservation_data)

# Pivot: GO term x mechanism -> % of organisms with that term in support network
for mech in mechanisms:
    mech_data = cons_df[cons_df['mechanism'] == mech]
    n_orgs = mech_data['orgId'].nunique()
    
    print(f'\n=== {mech} ({n_orgs} organisms) ===')
    for go_term in key_go_terms:
        go_data = mech_data[mech_data['go_term'] == go_term]
        n_present = go_data['present'].sum()
        pct = n_present / n_orgs * 100
        desc = GO_DESCRIPTIONS.get(go_term, '')
        if pct >= 25:  # Only show terms present in >=25% of organisms
            print(f'  {go_term} ({desc:35s}): {n_present:2d}/{n_orgs} ({pct:5.1f}%)')

=== efflux (28 organisms) ===
  GO:0071973 (flagellum-dependent motility       ): 16/28 ( 57.1%)
  GO:0044780 (flagellum assembly                 ): 16/28 ( 57.1%)
  GO:0009288 (bacterial-type flagellum           ): 15/28 ( 53.6%)
  GO:0071978 (flagellum-dependent swarming       ): 16/28 ( 57.1%)
  GO:0000105 (histidine biosynthesis             ): 19/28 ( 67.9%)
  GO:0000162 (tryptophan biosynthesis            ): 18/28 ( 64.3%)
  GO:0000160 (phosphorelay signal transduction   ): 26/28 ( 92.9%)
  GO:0006935 (chemotaxis                         ): 20/28 ( 71.4%)
  GO:0055085 (transmembrane transport            ): 28/28 (100.0%)
  GO:0006355 (regulation of transcription        ): 28/28 (100.0%)
  GO:0016020 (membrane                           ): 28/28 (100.0%)
  GO:0007165 (signal transduction                ): 28/28 (100.0%)
  GO:0016310 (phosphorylation                    ): 27/28 ( 96.4%)
  GO:0015074 (DNA integration                    ): 18/28 ( 64.3%)

=== enzymatic_inactivation (26 organisms) ===
  GO:0071973 (flagellum-dependent motility       ): 15/26 ( 57.7%)
  GO:0044780 (flagellum assembly                 ): 12/26 ( 46.2%)
  GO:0009288 (bacterial-type flagellum           ): 15/26 ( 57.7%)
  GO:0071978 (flagellum-dependent swarming       ): 16/26 ( 61.5%)
  GO:0000105 (histidine biosynthesis             ): 17/26 ( 65.4%)
  GO:0000162 (tryptophan biosynthesis            ): 19/26 ( 73.1%)
  GO:0000160 (phosphorelay signal transduction   ): 26/26 (100.0%)
  GO:0006935 (chemotaxis                         ): 24/26 ( 92.3%)
  GO:0055085 (transmembrane transport            ): 26/26 (100.0%)
  GO:0006355 (regulation of transcription        ): 26/26 (100.0%)
  GO:0016020 (membrane                           ): 26/26 (100.0%)
  GO:0007165 (signal transduction                ): 26/26 (100.0%)
  GO:0016310 (phosphorylation                    ): 26/26 (100.0%)
  GO:0015074 (DNA integration                    ): 23/26 ( 88.5%)

=== metal_resistance (23 organisms) ===
  GO:0071973 (flagellum-dependent motility       ): 13/23 ( 56.5%)
  GO:0044780 (flagellum assembly                 ): 12/23 ( 52.2%)
  GO:0009288 (bacterial-type flagellum           ): 14/23 ( 60.9%)
  GO:0071978 (flagellum-dependent swarming       ): 14/23 ( 60.9%)
  GO:0000105 (histidine biosynthesis             ):  7/23 ( 30.4%)
  GO:0000162 (tryptophan biosynthesis            ): 10/23 ( 43.5%)
  GO:0000160 (phosphorelay signal transduction   ): 21/23 ( 91.3%)
  GO:0006935 (chemotaxis                         ): 16/23 ( 69.6%)
  GO:0055085 (transmembrane transport            ): 21/23 ( 91.3%)
  GO:0006355 (regulation of transcription        ): 22/23 ( 95.7%)
  GO:0016020 (membrane                           ): 22/23 ( 95.7%)
  GO:0007165 (signal transduction                ): 20/23 ( 87.0%)
  GO:0016310 (phosphorylation                    ): 20/23 ( 87.0%)
  GO:0015074 (DNA integration                    ): 16/23 ( 69.6%)

# Conservation heatmap: GO term x mechanism (% organisms)
pivot = cons_df.groupby(['go_term', 'mechanism'])['present'].mean().unstack() * 100
# Add descriptions
pivot.index = [f"{idx} ({GO_DESCRIPTIONS.get(idx, '')[:30]})" for idx in pivot.index]

# Sort by max presence
pivot['max'] = pivot[mechanisms].max(axis=1)
pivot = pivot.sort_values('max', ascending=False).drop('max', axis=1)

# Only keep terms present in >25% for any mechanism
mask = (pivot > 25).any(axis=1)
pivot_filtered = pivot[mask]

fig, ax = plt.subplots(figsize=(8, max(5, len(pivot_filtered) * 0.4)))

sns.heatmap(pivot_filtered, cmap='YlOrRd', annot=True, fmt='.0f',
            cbar_kws={'label': '% of organisms with term in support network'},
            ax=ax, linewidths=0.5, linecolor='white', vmin=0, vmax=100)
ax.set_title('GO Term Conservation in AMR Support Networks by Mechanism')
ax.set_ylabel('')
ax.set_xticklabels(['Efflux', 'Enzymatic\nInactivation', 'Metal\nResistance'], rotation=0)
ax.tick_params(axis='y', labelsize=8)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES, 'go_conservation_heatmap.png'), dpi=150, bbox_inches='tight')
plt.show()
print('Saved to figures/go_conservation_heatmap.png')

Saved to figures/go_conservation_heatmap.png

# Test: are any GO terms significantly more prevalent in one mechanism's
# support networks than another?

specificity_results = []

for go_term in key_go_terms:
    contingency = []
    mech_pcts = {}
    for mech in mechanisms:
        mech_data = cons_df[(cons_df['mechanism'] == mech) & (cons_df['go_term'] == go_term)]
        n_yes = mech_data['present'].sum()
        n_no = len(mech_data) - n_yes
        contingency.append([n_yes, n_no])
        mech_pcts[mech] = n_yes / len(mech_data) * 100 if len(mech_data) > 0 else 0
    
    table = np.array(contingency)
    if table.sum() == 0 or table[:, 0].sum() < 3:
        continue
    
    chi2, p, dof, expected = stats.chi2_contingency(table)
    
    specificity_results.append({
        'go_term': go_term,
        'go_desc': GO_DESCRIPTIONS.get(go_term, ''),
        'efflux_pct': mech_pcts['efflux'],
        'enzymatic_pct': mech_pcts['enzymatic_inactivation'],
        'metal_pct': mech_pcts['metal_resistance'],
        'chi2': chi2,
        'p_value': p,
    })

spec_df = pd.DataFrame(specificity_results)
if len(spec_df) > 0:
    _, q, _, _ = multipletests(spec_df['p_value'], method='fdr_bh')
    spec_df['q_value'] = q
    spec_df['significant'] = q < 0.05
    
    print('Mechanism specificity of GO terms in support networks:')
    print(f'Tested: {len(spec_df)}, Significant (FDR<0.05): {spec_df["significant"].sum()}')
    print()
    for _, row in spec_df.sort_values('p_value').iterrows():
        sig = '*' if row['significant'] else ''
        print(f'  {row["go_term"]} ({row["go_desc"]:35s}): '
              f'eff={row["efflux_pct"]:5.1f}% enz={row["enzymatic_pct"]:5.1f}% '
              f'met={row["metal_pct"]:5.1f}%  p={row["p_value"]:.4g} q={row["q_value"]:.4g} {sig}')

Mechanism specificity of GO terms in support networks:
Tested: 14, Significant (FDR<0.05): 0

  GO:0000105 (histidine biosynthesis             ): eff= 67.9% enz= 65.4% met= 30.4%  p=0.01343 q=0.1793 
  GO:0007165 (signal transduction                ): eff=100.0% enz=100.0% met= 87.0%  p=0.02562 q=0.1793 
  GO:0055085 (transmembrane transport            ): eff=100.0% enz=100.0% met= 91.3%  p=0.08978 q=0.2175 
  GO:0006935 (chemotaxis                         ): eff= 71.4% enz= 92.3% met= 69.6%  p=0.09313 q=0.2175 
  GO:0000162 (tryptophan biosynthesis            ): eff= 64.3% enz= 73.1% met= 43.5%  p=0.0958 q=0.2175 
  GO:0016310 (phosphorylation                    ): eff= 96.4% enz=100.0% met= 87.0%  p=0.108 q=0.2175 
  GO:0015074 (DNA integration                    ): eff= 64.3% enz= 88.5% met= 69.6%  p=0.1087 q=0.2175 
  GO:0016020 (membrane                           ): eff=100.0% enz=100.0% met= 95.7%  p=0.3044 q=0.4631 
  GO:0006355 (regulation of transcription        ): eff=100.0% enz=100.0% met= 95.7%  p=0.3044 q=0.4631 
  GO:0000160 (phosphorelay signal transduction   ): eff= 92.9% enz=100.0% met= 91.3%  p=0.3308 q=0.4631 
  GO:0044780 (flagellum assembly                 ): eff= 57.1% enz= 46.2% met= 52.2%  p=0.7215 q=0.9183 
  GO:0009288 (bacterial-type flagellum           ): eff= 53.6% enz= 57.7% met= 60.9%  p=0.8696 q=0.9966 
  GO:0071978 (flagellum-dependent swarming       ): eff= 57.1% enz= 61.5% met= 60.9%  p=0.9391 q=0.9966 
  GO:0071973 (flagellum-dependent motility       ): eff= 57.1% enz= 57.7% met= 56.5%  p=0.9966 q=0.9966

# Map partners to bakta KEGG IDs for cross-organism comparison
extra['partner_bakta_kegg'] = extra['partner_cluster_id'].map(
    lambda cid: cluster_kegg.get(cid) if pd.notna(cid) else None)

with_kegg = extra[extra['partner_bakta_kegg'].notna()]
print(f'Partners with bakta KEGG: {len(with_kegg):,} / {len(extra):,} ({len(with_kegg)/len(extra)*100:.1f}%)')

# Top KEGG orthologs across mechanisms
for mech in mechanisms:
    mech_kegg = with_kegg[with_kegg['amr_mechanism'] == mech]
    n_orgs = mech_kegg['orgId'].nunique()
    
    # Organism prevalence per KEGG
    ko_orgs = mech_kegg.groupby('partner_bakta_kegg')['orgId'].nunique()
    conserved = ko_orgs[ko_orgs >= n_orgs * 0.5].sort_values(ascending=False)
    
    print(f'\n=== {mech} ({n_orgs} orgs): {len(conserved)} KEGG orthologs in >50% of orgs ===')
    for ko, n in conserved.head(10).items():
        print(f'  {ko}: {n}/{n_orgs} organisms')

Partners with bakta KEGG: 39,278 / 179,375 (21.9%)

=== efflux (28 orgs): 1 KEGG orthologs in >50% of orgs ===
  K01652: 16/28 organisms

=== enzymatic_inactivation (26 orgs): 4 KEGG orthologs in >50% of orgs ===
  K01524: 14/26 organisms
  K01626: 14/26 organisms
  K01652: 13/26 organisms
  K03566: 13/26 organisms

=== metal_resistance (23 orgs): 0 KEGG orthologs in >50% of orgs ===

# Cross-organism Jaccard using GO terms (richer vocabulary than KEGG)
jaccard_go = []

for mech in mechanisms:
    mech_extra = extra[extra['amr_mechanism'] == mech]
    orgs = sorted(mech_extra['orgId'].unique())
    
    # GO terms per organism's support network
    org_go_sets = {}
    for org in orgs:
        org_partners = mech_extra[mech_extra['orgId'] == org]
        sup_clusters = set(org_partners['partner_cluster_id'].dropna())
        go_set = set()
        for cid in sup_clusters:
            go_set.update(cluster_go.get(cid, set()))
        org_go_sets[org] = go_set
    
    for o1, o2 in combinations(orgs, 2):
        s1, s2 = org_go_sets[o1], org_go_sets[o2]
        if len(s1) == 0 or len(s2) == 0:
            continue
        j = len(s1 & s2) / len(s1 | s2)
        jaccard_go.append({
            'mechanism': mech, 'org1': o1, 'org2': o2,
            'jaccard': j, 'shared': len(s1 & s2), 'union': len(s1 | s2),
        })

jac_go_df = pd.DataFrame(jaccard_go)

# Cross-mechanism (within organism)
cross_go = []
for mech1, mech2 in combinations(mechanisms, 2):
    for org in extra['orgId'].unique():
        m1_extra = extra[(extra['amr_mechanism'] == mech1) & (extra['orgId'] == org)]
        m2_extra = extra[(extra['amr_mechanism'] == mech2) & (extra['orgId'] == org)]
        if len(m1_extra) == 0 or len(m2_extra) == 0:
            continue
        
        s1 = set()
        for cid in m1_extra['partner_cluster_id'].dropna():
            s1.update(cluster_go.get(cid, set()))
        s2 = set()
        for cid in m2_extra['partner_cluster_id'].dropna():
            s2.update(cluster_go.get(cid, set()))
        
        if len(s1) == 0 or len(s2) == 0:
            continue
        j = len(s1 & s2) / len(s1 | s2)
        cross_go.append({
            'comparison': f'{mech1} vs {mech2}', 'orgId': org, 'jaccard': j,
        })

cross_go_df = pd.DataFrame(cross_go)

print('=== GO-based Jaccard similarities ===')
print('\nWithin-mechanism (same mechanism, different organisms):')
for mech in mechanisms:
    sub = jac_go_df[jac_go_df['mechanism'] == mech]
    print(f'  {mech:30s}: mean J={sub["jaccard"].mean():.3f}, '
          f'median={sub["jaccard"].median():.3f}, N={len(sub)} pairs')

print(f'\nCross-mechanism (different mechanisms, same organism):')
for comp in sorted(cross_go_df['comparison'].unique()):
    sub = cross_go_df[cross_go_df['comparison'] == comp]
    print(f'  {comp:45s}: mean J={sub["jaccard"].mean():.3f}, N={len(sub)}')

print(f'\nOverall within-mechanism: {jac_go_df["jaccard"].mean():.3f}')
print(f'Overall cross-mechanism:  {cross_go_df["jaccard"].mean():.3f}')

if len(jac_go_df) > 5 and len(cross_go_df) > 5:
    u, p = stats.mannwhitneyu(cross_go_df['jaccard'], jac_go_df['jaccard'],
                               alternative='greater')
    print(f'Cross > Within: MWU p={p:.4g}')

=== GO-based Jaccard similarities ===

Within-mechanism (same mechanism, different organisms):
  efflux                        : mean J=0.202, median=0.198, N=378 pairs
  enzymatic_inactivation        : mean J=0.240, median=0.215, N=325 pairs
  metal_resistance              : mean J=0.170, median=0.170, N=253 pairs

Cross-mechanism (different mechanisms, same organism):
  efflux vs enzymatic_inactivation             : mean J=0.422, N=26
  efflux vs metal_resistance                   : mean J=0.329, N=23
  enzymatic_inactivation vs metal_resistance   : mean J=0.368, N=22

Overall within-mechanism: 0.207
Overall cross-mechanism:  0.375
Cross > Within: MWU p=4.284e-13

# Summary figure: Jaccard comparison
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left: within-mechanism Jaccard by mechanism
ax = axes[0]
sns.boxplot(data=jac_go_df, x='mechanism', y='jaccard', palette='Set2', ax=ax, showfliers=False)
ax.set_xlabel('AMR Mechanism')
ax.set_ylabel('Jaccard Similarity (GO terms)')
ax.set_title('Cross-Organism Support Network Similarity\n(same mechanism, different organisms)')
ax.tick_params(axis='x', rotation=20)

# Right: within-mech vs cross-mech
ax = axes[1]
compare_data = pd.DataFrame([
    {'type': 'Same mechanism\ndifferent organisms', 'jaccard': j}
    for j in jac_go_df['jaccard']
] + [
    {'type': 'Different mechanisms\nsame organism', 'jaccard': j}
    for j in cross_go_df['jaccard']
])
sns.boxplot(data=compare_data, x='type', y='jaccard', palette='Set1', ax=ax, showfliers=False)
ax.set_xlabel('')
ax.set_ylabel('Jaccard Similarity (GO terms)')
ax.set_title('Support Network Sharing:\nOrganism-Specific or Mechanism-Specific?')

plt.tight_layout()
plt.savefig(os.path.join(FIGURES, 'jaccard_go_comparison.png'), dpi=150, bbox_inches='tight')
plt.show()
print('Saved to figures/jaccard_go_comparison.png')

Saved to figures/jaccard_go_comparison.png

# Save and summarize
cons_df.to_csv(os.path.join(DATA, 'go_conservation_by_mechanism.csv'), index=False)
jac_go_df.to_csv(os.path.join(DATA, 'jaccard_go_within_mechanism.csv'), index=False)
cross_go_df.to_csv(os.path.join(DATA, 'jaccard_go_cross_mechanism.csv'), index=False)
if len(spec_df) > 0:
    spec_df.to_csv(os.path.join(DATA, 'go_mechanism_specificity.csv'), index=False)

print('=== NB04b Summary ===')
print(f'Annotation: InterProScan GO (60% partner coverage vs old KEGG ~25%)')
print(f'\nKey finding: support networks are ORGANISM-SPECIFIC, not mechanism-specific')
print(f'  Cross-mechanism Jaccard (same org): {cross_go_df["jaccard"].mean():.3f}')
print(f'  Within-mechanism Jaccard (diff org): {jac_go_df["jaccard"].mean():.3f}')
print(f'\nConserved across organisms for ALL mechanisms:')
print(f'  Flagellar motility, chemotaxis, amino acid biosynthesis')
if len(spec_df) > 0 and spec_df['significant'].any():
    print(f'\nMechanism-specific GO terms (FDR<0.05):')
    for _, row in spec_df[spec_df['significant']].iterrows():
        print(f'  {row["go_term"]}: {row["go_desc"]}')
else:
    print(f'\nNo mechanism-specific GO terms after FDR correction')

=== NB04b Summary ===
Annotation: InterProScan GO (60% partner coverage vs old KEGG ~25%)

Key finding: support networks are ORGANISM-SPECIFIC, not mechanism-specific
  Cross-mechanism Jaccard (same org): 0.375
  Within-mechanism Jaccard (diff org): 0.207

Conserved across organisms for ALL mechanisms:
  Flagellar motility, chemotaxis, amino acid biosynthesis

No mechanism-specific GO terms after FDR correction

04B Cross Organism Interproscan

NB04b: Cross-Organism Conservation with InterProScan Annotations¶

1. GO term recurrence across organisms by mechanism¶

2. Mechanism specificity of GO terms¶

3. Bakta KEGG orthology conservation (comparison)¶

4. Jaccard analysis with GO terms¶