import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
import os

MODULE_DIR = Path('../../fitness_modules/data/modules')
FAMILY_DIR = Path('../../fitness_modules/data/module_families')
CONS_DIR = Path('../../conservation_vs_fitness/data')
DATA_DIR = Path('../data')
FIGURES_DIR = Path('../figures')
FIGURES_DIR.mkdir(exist_ok=True)

# Load conservation link table
link = pd.read_csv(CONS_DIR / 'fb_pangenome_link.tsv', sep='\t')
link = link[link['orgId'] != 'Dyella79']

# Find overlapping organisms
mod_orgs = set(f.replace('_gene_membership.csv', '')
               for f in os.listdir(MODULE_DIR) if f.endswith('_gene_membership.csv'))
link_orgs = set(link['orgId'].unique())
overlap_orgs = sorted(mod_orgs & link_orgs)

print(f'Module organisms: {len(mod_orgs)}')
print(f'Link table organisms: {len(link_orgs)}')
print(f'Overlap: {len(overlap_orgs)}')

Module organisms: 32
Link table organisms: 43
Overlap: 29

# For each organism, melt module membership and join with conservation
all_module_genes = []

for orgId in overlap_orgs:
    mem = pd.read_csv(MODULE_DIR / f'{orgId}_gene_membership.csv')
    # Melt: one row per gene x module where membership = 1
    module_cols = [c for c in mem.columns if c.startswith('M')]
    melted = mem.melt(id_vars='locusId', value_vars=module_cols,
                      var_name='module', value_name='member')
    melted = melted[melted['member'] == 1].drop(columns='member')
    melted['orgId'] = orgId
    all_module_genes.append(melted)

module_genes = pd.concat(all_module_genes, ignore_index=True)
print(f'Module gene assignments: {len(module_genes):,} (gene x module pairs)')
print(f'Unique genes in modules: {module_genes.drop_duplicates(["orgId","locusId"]).shape[0]:,}')

# Join with conservation
module_genes['locusId'] = module_genes['locusId'].astype(str)
link['locusId'] = link['locusId'].astype(str)

mg = module_genes.merge(
    link[['orgId', 'locusId', 'is_core', 'is_auxiliary', 'is_singleton', 'gene_cluster_id']],
    on=['orgId', 'locusId'], how='left'
)
mg['is_mapped'] = mg['gene_cluster_id'].notna()
mg['conservation'] = 'unmapped'
mg.loc[mg['is_mapped'] & (mg['is_core'] == True), 'conservation'] = 'core'
mg.loc[mg['is_mapped'] & (mg['is_singleton'] == True), 'conservation'] = 'singleton'
mg.loc[mg['is_mapped'] & (mg['is_core'] != True) & (mg['is_singleton'] != True), 'conservation'] = 'auxiliary'

print(f'\nModule genes with conservation status:')
print(mg['conservation'].value_counts().to_string())

Module gene assignments: 30,897 (gene x module pairs)
Unique genes in modules: 27,670

Module genes with conservation status:
conservation
core         23099
unmapped      4213
auxiliary     2434
singleton     1151

# Per-module conservation composition
module_cons = mg.groupby(['orgId', 'module']).agg(
    n_genes=('locusId', 'size'),
    n_mapped=('is_mapped', 'sum'),
    n_core=('is_core', lambda x: (x == True).sum()),
    n_auxiliary=('is_auxiliary', lambda x: (x == True).sum()),
    n_singleton=('is_singleton', lambda x: (x == True).sum()),
).reset_index()

module_cons['pct_core'] = (module_cons['n_core'] / module_cons['n_mapped'] * 100).round(1)
module_cons['pct_core'] = module_cons['pct_core'].fillna(0)

# Organism baseline % core (all genes, not just module genes)
org_baseline = link.groupby('orgId')['is_core'].mean().reset_index()
org_baseline.columns = ['orgId', 'baseline_core_frac']
org_baseline['baseline_pct_core'] = (org_baseline['baseline_core_frac'] * 100).round(1)

module_cons = module_cons.merge(org_baseline, on='orgId', how='left')

print(f'Modules analyzed: {len(module_cons):,}')
print(f'\nModule % core summary:')
print(f'  Mean: {module_cons["pct_core"].mean():.1f}%')
print(f'  Median: {module_cons["pct_core"].median():.1f}%')
print(f'  Baseline mean: {module_cons["baseline_pct_core"].mean():.1f}%')

# Are module genes more core than non-module genes?
module_unique = mg.drop_duplicates(['orgId', 'locusId'])
mapped_module = module_unique[module_unique['is_mapped']]
in_module_core_n = (mapped_module['is_core'] == True).sum()
in_module_not_core_n = len(mapped_module) - in_module_core_n
in_module_core = in_module_core_n / len(mapped_module) * 100

# All genes (including non-module)
all_core_n = (link['is_core'] == True).sum()
all_not_core_n = len(link) - all_core_n
all_genes_core = all_core_n / len(link) * 100

# Non-module genes
non_module_core_n = all_core_n - in_module_core_n
non_module_not_core_n = all_not_core_n - in_module_not_core_n

# Statistical test: Fisher's exact (module vs non-module × core vs non-core)
table = [[in_module_core_n, in_module_not_core_n],
         [non_module_core_n, non_module_not_core_n]]
odds_ratio, p_value = stats.fisher_exact(table)

print(f'\n=== MODULE GENES vs ALL GENES ===')
print(f'  Module genes % core: {in_module_core:.1f}% ({in_module_core_n:,} / {len(mapped_module):,})')
print(f'  All genes % core:    {all_genes_core:.1f}%')
print(f'  Difference: {in_module_core - all_genes_core:+.1f} percentage points')
print(f'  Fisher exact test: OR={odds_ratio:.2f}, p={p_value:.2e}')

# Per-organism paired test
org_module_core = []
for orgId in overlap_orgs:
    org_mod = mapped_module[mapped_module['orgId'] == orgId]
    org_all = link[link['orgId'] == orgId]
    if len(org_mod) >= 10 and len(org_all) >= 10:
        org_module_core.append({
            'orgId': orgId,
            'module_pct_core': (org_mod['is_core'] == True).mean() * 100,
            'all_pct_core': (org_all['is_core'] == True).mean() * 100,
        })

org_paired = pd.DataFrame(org_module_core)
org_paired['diff'] = org_paired['module_pct_core'] - org_paired['all_pct_core']
wilcox_stat, wilcox_p = stats.wilcoxon(org_paired['diff'])
print(f'\n  Per-organism paired Wilcoxon: W={wilcox_stat:.0f}, p={wilcox_p:.2e}')
print(f'  Organisms with module > baseline: {(org_paired["diff"] > 0).sum()}/{len(org_paired)}')
print(f'  Mean per-organism difference: {org_paired["diff"].mean():+.1f} pp')

# Save
module_cons.to_csv(DATA_DIR / 'module_conservation.tsv', sep='\t', index=False)

Modules analyzed: 1,008

Module % core summary:
  Mean: 85.1%
  Median: 93.4%
  Baseline mean: 82.2%

=== MODULE GENES vs ALL GENES ===
  Module genes % core: 86.0% (20,568 / 23,907)
  All genes % core:    81.5%
  Difference: +4.5 percentage points
  Fisher exact test: OR=1.46, p=1.58e-87

  Per-organism paired Wilcoxon: W=59, p=1.04e-03
  Organisms with module > baseline: 22/29
  Mean per-organism difference: +3.3 pp

# Only modules with mapped genes
has_mapped = module_cons[module_cons['n_mapped'] >= 3]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of % core
axes[0].hist(has_mapped['pct_core'], bins=20, range=(0, 100),
             edgecolor='black', alpha=0.7, color='steelblue')
axes[0].axvline(x=has_mapped['baseline_pct_core'].mean(), color='red',
                linestyle='--', linewidth=2, label=f'Baseline ({has_mapped["baseline_pct_core"].mean():.0f}%)')
axes[0].set_xlabel('% Core genes in module')
axes[0].set_ylabel('Number of modules')
axes[0].set_title(f'Module Conservation Spectrum (n={len(has_mapped)})')
axes[0].legend()

# Classify modules
has_mapped = has_mapped.copy()
has_mapped['module_type'] = 'mixed'
has_mapped.loc[has_mapped['pct_core'] >= 90, 'module_type'] = 'core_module'
has_mapped.loc[has_mapped['pct_core'] < 50, 'module_type'] = 'accessory_module'

type_counts = has_mapped['module_type'].value_counts()
axes[1].bar(range(len(type_counts)), type_counts.values,
            color=['#2196F3', '#FF9800', '#F44336'])
axes[1].set_xticks(range(len(type_counts)))
axes[1].set_xticklabels(type_counts.index)
axes[1].set_ylabel('Number of modules')
axes[1].set_title('Module Classification')

for i, (cat, n) in enumerate(type_counts.items()):
    axes[1].text(i, n + 5, f'{n}\n({n/len(has_mapped)*100:.0f}%)', ha='center', fontsize=9)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'module_core_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print('=== MODULE CLASSIFICATION ===')
print(type_counts.to_string())

=== MODULE CLASSIFICATION ===
module_type
core_module         577
mixed               349
accessory_module     48

# Load module annotations
all_annot = []
for orgId in overlap_orgs:
    annot_file = MODULE_DIR / f'{orgId}_module_annotations.csv'
    if annot_file.exists():
        ann = pd.read_csv(annot_file)
        ann['orgId'] = orgId
        all_annot.append(ann)

if all_annot:
    annotations = pd.concat(all_annot, ignore_index=True)
    print(f'Module annotations loaded: {len(annotations):,} rows')

    # Get the best annotation per module (lowest FDR)
    best_annot = annotations.sort_values('fdr').drop_duplicates(['orgId', 'module'], keep='first')

    # Merge with module conservation
    mc_annot = has_mapped.merge(best_annot, on=['orgId', 'module'], how='left')

    # Resolve PFam/TIGRFam/KEGG IDs to descriptions using the annotation data itself
    # The SEED annotations already have readable names; for PFam/TIGRFam, the term IS the ID
    # Let's show the database + term together for clarity, and flag which are opaque IDs
    annotated = mc_annot[mc_annot['term'].notna()].copy()
    annotated['label'] = annotated['database'] + ': ' + annotated['term']
    print(f'Annotated modules: {len(annotated)} / {len(mc_annot)}')

    # Top terms in core modules vs accessory modules
    core_mods = annotated[annotated['module_type'] == 'core_module']
    acc_mods = annotated[annotated['module_type'] == 'accessory_module']

    print(f'\n=== TOP ANNOTATIONS IN CORE MODULES (n={len(core_mods)}) ===')
    if len(core_mods) > 0:
        print(core_mods['label'].value_counts().head(15).to_string())

    print(f'\n=== TOP ANNOTATIONS IN ACCESSORY MODULES (n={len(acc_mods)}) ===')
    if len(acc_mods) > 0:
        print(acc_mods['label'].value_counts().head(15).to_string())

    # Summary by database for each module type
    print(f'\n=== ANNOTATION DATABASE BY MODULE TYPE ===')
    db_by_type = pd.crosstab(annotated['module_type'], annotated['database'], normalize='index') * 100
    print(db_by_type.round(1).to_string())
    print(f'\nNote: PFam/TIGRFam/KEGG IDs are domain/ortholog identifiers.')
    print(f'SEED annotations are human-readable functional descriptions.')
else:
    print('No module annotation files found')

Module annotations loaded: 5,995 rows
Annotated modules: 803 / 974

=== TOP ANNOTATIONS IN CORE MODULES (n=477) ===
label
PFam: PF02653         11
PFam: PF00528         10
PFam: PF01012          8
PFam: PF00270          7
PFam: PF00460          7
PFam: PF00994          6
PFam: PF00072          6
PFam: PF00590          6
PFam: PF00361          6
PFam: PF02518          5
PFam: PF03471          5
TIGRFam: TIGR01726     5
PFam: PF00155          4
PFam: PF05138          4
PFam: PF00271          4

=== TOP ANNOTATIONS IN ACCESSORY MODULES (n=37) ===
label
PFam: PF13692                                     2
PFam: PF00122                                     2
TIGRFam: TIGR01539                                2
PFam: PF00589                                     1
PFam: PF13443                                     1
TIGRFam: TIGR04057                                1
KEGG: K03657                                      1
PFam: PF02668                                     1
TIGRFam: TIGR04259                                1
PFam: PF13186                                     1
PFam: PF13641                                     1
SEED: Phage tail length tape-measure protein 1    1
PFam: PF00419                                     1
PFam: PF00482                                     1
PFam: PF02472                                     1

=== ANNOTATION DATABASE BY MODULE TYPE ===
database          KEGG  PFam  SEED  TIGRFam
module_type                                
accessory_module   5.4  73.0   5.4     16.2
core_module       10.7  68.1  14.9      6.3
mixed              4.8  68.2  18.7      8.3

Note: PFam/TIGRFam/KEGG IDs are domain/ortholog identifiers.
SEED annotations are human-readable functional descriptions.

print('=' * 60)
print('NB01 SUMMARY: Module Conservation Profiles')
print('=' * 60)
print(f'Organisms: {len(overlap_orgs)}')
print(f'Module gene pairs: {len(module_genes):,}')
print(f'Modules analyzed: {len(has_mapped)}')
print(f'Module genes % core: {in_module_core:.1f}%')
print(f'All genes % core: {all_genes_core:.1f}%')
print(f'Module types: {dict(type_counts)}')
print('=' * 60)

============================================================
NB01 SUMMARY: Module Conservation Profiles
============================================================
Organisms: 29
Module gene pairs: 30,897
Modules analyzed: 974
Module genes % core: 86.0%
All genes % core: 81.5%
Module types: {'core_module': np.int64(577), 'mixed': np.int64(349), 'accessory_module': np.int64(48)}
============================================================

01 Module Conservation

NB01: Module Conservation Profiles¶

Step 1: Build Gene-Level Dataset¶

Step 2: Per-Module Conservation Composition¶

Step 3: Module Conservation Spectrum¶

Step 4: Module Function vs Conservation¶