import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.stats.multitest import multipletests
import statsmodels.formula.api as smf

warnings.filterwarnings('ignore')

_here = os.path.abspath('')
if os.path.basename(_here) == 'notebooks':
    REPO = os.path.abspath(os.path.join(_here, '..', '..', '..'))
else:
    REPO = os.path.abspath(os.path.join(_here, '..', '..', '..'))

PROJECT = os.path.join(REPO, 'projects', 'pgp_pangenome_ecology')
DATA = os.path.join(PROJECT, 'data')
FIGURES = os.path.join(PROJECT, 'figures')

species_pgp = pd.read_csv(os.path.join(DATA, 'species_pgp_matrix.csv'))
species_env = pd.read_csv(os.path.join(DATA, 'species_environment.csv'))

print(f'species_pgp: {len(species_pgp):,} species')
print(f'species_env: {len(species_env):,} species')
print(f'Env distribution:')
print(species_env['dominant_env'].value_counts().to_string())

species_pgp: 11,272 species
species_env: 27,690 species
Env distribution:
dominant_env
other               13631
marine_aquatic       6260
host_clinical        6162
soil_rhizosphere     1637

# Also load taxonomy for phylogenetic control
genome_env = pd.read_csv(os.path.join(DATA, 'genome_environment.csv'))

# Get one taxonomy record per species
tax_cols = ['gtdb_species_clade_id', 'phylum', 'family']
available_cols = [c for c in tax_cols if c in genome_env.columns]
if len(available_cols) > 1:
    species_tax = genome_env.drop_duplicates('gtdb_species_clade_id')[available_cols].copy()
else:
    species_tax = pd.DataFrame({'gtdb_species_clade_id': species_pgp['gtdb_species_clade_id'].unique()})
    species_tax['phylum'] = 'unknown'
    species_tax['family'] = 'unknown'

FOCAL_GENES = ['nifH', 'pqqC', 'acdS', 'ipdC', 'hcnC']

df = species_pgp.merge(species_env[['gtdb_species_clade_id', 'dominant_env', 'n_classified_genomes']],
                        on='gtdb_species_clade_id', how='inner')
df = df.merge(species_tax, on='gtdb_species_clade_id', how='left')

df['is_soil'] = (df['dominant_env'] == 'soil_rhizosphere').astype(int)

N_soil = df['is_soil'].sum()
N_other = (df['is_soil'] == 0).sum()
print(f'Merged: {len(df):,} species with env labels')
print(f'Soil/rhizosphere: {N_soil:,} | Other: {N_other:,}')

Merged: 11,272 species with env labels
Soil/rhizosphere: 1,039 | Other: 10,233

def enrichment_test(df, gene, env_col='is_soil'):
    """Fisher's exact test for gene enrichment in soil vs non-soil."""
    col = f'{gene}_present'
    soil = df[df[env_col] == 1]
    other = df[df[env_col] == 0]
    a = soil[col].sum()          # soil + gene
    b = (soil[col] == 0).sum()   # soil - gene
    c = other[col].sum()          # other + gene
    d = (other[col] == 0).sum()   # other - gene
    table = [[a, b], [c, d]]
    or_, p = stats.fisher_exact(table, alternative='two-sided')
    # Prevalence in each group
    prev_soil = a / len(soil) if len(soil) > 0 else np.nan
    prev_other = c / len(other) if len(other) > 0 else np.nan
    return {
        'gene': gene, 'n_soil_pos': a, 'n_soil_total': len(soil),
        'n_other_pos': c, 'n_other_total': len(other),
        'prev_soil': prev_soil, 'prev_other': prev_other,
        'odds_ratio': or_, 'p_value': p
    }


results = [enrichment_test(df, g) for g in FOCAL_GENES]
enrich_df = pd.DataFrame(results)

_, q_values, _, _ = multipletests(enrich_df['p_value'], method='fdr_bh')
enrich_df['q_value'] = q_values
enrich_df['log2_or'] = np.log2(enrich_df['odds_ratio'].clip(lower=1e-6))
enrich_df['significant'] = enrich_df['q_value'] < 0.05

print('Enrichment results (soil vs non-soil):')
print(enrich_df[['gene', 'prev_soil', 'prev_other', 'odds_ratio', 'p_value', 'q_value', 'significant']]
      .to_string(index=False))

Enrichment results (soil vs non-soil):
gene  prev_soil  prev_other  odds_ratio      p_value      q_value  significant
nifH   0.128970    0.197401    0.602014 3.278430e-08 5.464050e-08         True
pqqC   0.437921    0.211668    2.901698 1.122714e-53 2.806785e-53         True
acdS   0.157844    0.025994    7.022934 1.029988e-62 5.149939e-62         True
ipdC   0.015399    0.019349    0.792678 4.729621e-01 4.729621e-01        False
hcnC   0.112608    0.064204    1.849583 4.883991e-08 6.104988e-08         True

# Repeat enrichment test within each major phylum (>=50 species with env label)
phylum_results = []

if 'phylum' in df.columns and df['phylum'].notna().sum() > 0:
    phylum_counts = df['phylum'].value_counts()
    major_phyla = phylum_counts[phylum_counts >= 50].index.tolist()
    print(f'Major phyla (>=50 species): {len(major_phyla)}')
    for phylum in major_phyla:
        sub = df[df['phylum'] == phylum]
        if sub['is_soil'].sum() < 5 or (sub['is_soil'] == 0).sum() < 5:
            continue
        for g in FOCAL_GENES:
            r = enrichment_test(sub, g)
            r['phylum'] = phylum
            r['n_phylum_species'] = len(sub)
            phylum_results.append(r)
    
    if phylum_results:
        phylum_df = pd.DataFrame(phylum_results)
        _, q_phy, _, _ = multipletests(phylum_df['p_value'], method='fdr_bh')
        phylum_df['q_value'] = q_phy
        phylum_df['significant'] = phylum_df['q_value'] < 0.05
        print(f'\nPhylum-stratified tests: {len(phylum_df)}')
        sig_phy = phylum_df[phylum_df['significant'] & (phylum_df['odds_ratio'] > 1)]
        print(f'Significant soil-enriched within phyla: {len(sig_phy)}')
        print(sig_phy[['phylum', 'gene', 'odds_ratio', 'q_value']].head(10).to_string(index=False))
    else:
        phylum_df = pd.DataFrame()
        print('No qualifying phyla for stratified analysis')
else:
    phylum_df = pd.DataFrame()
    print('Taxonomy not available for phylogenetic control')

Major phyla (>=50 species): 18

Phylum-stratified tests: 55
Significant soil-enriched within phyla: 5
              phylum gene  odds_ratio      q_value
   p__Pseudomonadota pqqC    1.669370 1.505831e-05
   p__Pseudomonadota acdS    7.193467 4.054738e-54
   p__Pseudomonadota hcnC    1.696664 1.509670e-03
      p__Bacillota_A nifH         inf 2.512436e-04
p__Verrucomicrobiota pqqC  100.571429 5.412290e-18

# Logistic regression: PGP_gene_present ~ is_soil + C(phylum)
logit_results = []

if 'phylum' in df.columns and df['phylum'].notna().sum() > 0:
    # Use phyla with >=20 species
    common_phyla = df['phylum'].value_counts()
    df_logit = df[df['phylum'].isin(common_phyla[common_phyla >= 20].index)].copy()
    df_logit['phylum'] = df_logit['phylum'].astype('category')
    
    for g in FOCAL_GENES:
        try:
            model = smf.logit(f'{g}_present ~ is_soil + C(phylum)', data=df_logit)
            fit = model.fit(disp=0, maxiter=100)
            coef = fit.params['is_soil']
            p_val = fit.pvalues['is_soil']
            ci_low, ci_high = fit.conf_int().loc['is_soil']
            logit_results.append({
                'gene': g,
                'coef_is_soil': coef,
                'or_phylum_controlled': np.exp(coef),
                'ci_low': np.exp(ci_low),
                'ci_high': np.exp(ci_high),
                'p_value': p_val,
                'n_species': int(fit.nobs)
            })
        except Exception as e:
            print(f'Logit failed for {g}: {e}')
            logit_results.append({'gene': g, 'coef_is_soil': np.nan, 'or_phylum_controlled': np.nan,
                                   'ci_low': np.nan, 'ci_high': np.nan, 'p_value': np.nan, 'n_species': 0})
    
    logit_df = pd.DataFrame(logit_results)
    _, q_logit, _, _ = multipletests(logit_df['p_value'].fillna(1), method='fdr_bh')
    logit_df['q_value'] = q_logit
    print('Logistic regression results (phylum-controlled OR for soil enrichment):')
    print(logit_df[['gene', 'or_phylum_controlled', 'ci_low', 'ci_high', 'p_value', 'q_value']]
          .to_string(index=False))
else:
    logit_df = pd.DataFrame()
    print('Skipping logistic regression — taxonomy not available')

Logistic regression results (phylum-controlled OR for soil enrichment):
gene  or_phylum_controlled   ci_low  ci_high      p_value      q_value
nifH              0.991646 0.805790 1.220370 9.368546e-01 9.368546e-01
pqqC              1.930682 1.641704 2.270526 1.827777e-15 4.569442e-15
acdS              6.982148 5.541659 8.797076 4.757506e-61 2.378753e-60
ipdC              0.557957 0.332300 0.936852 2.733783e-02 3.417228e-02
hcnC              1.428231 1.150092 1.773634 1.257974e-03 2.096623e-03

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: prevalence comparison
ax = axes[0]
x = np.arange(len(FOCAL_GENES))
w = 0.35
ax.bar(x - w/2, enrich_df['prev_soil'] * 100, w, label='Soil/rhizosphere',
       color='#4CAF50', alpha=0.85)
ax.bar(x + w/2, enrich_df['prev_other'] * 100, w, label='Other environments',
       color='#90A4AE', alpha=0.85)
ax.set_xticks(x)
ax.set_xticklabels(FOCAL_GENES)
ax.set_ylabel('Prevalence (%)')
ax.set_title('PGP Gene Prevalence: Soil vs Other')
ax.legend()
# Mark significant genes
for i, (_, row) in enumerate(enrich_df.iterrows()):
    if row['significant']:
        ymax = max(row['prev_soil'], row['prev_other']) * 100
        ax.text(i, ymax * 1.05, '*', ha='center', fontsize=14)

# Right: log2(OR) with confidence indication
ax = axes[1]
colors = ['#4CAF50' if row['significant'] and row['odds_ratio'] > 1
          else '#F44336' if row['significant'] and row['odds_ratio'] < 1
          else '#90A4AE'
          for _, row in enrich_df.iterrows()]
bars = ax.bar(FOCAL_GENES, enrich_df['log2_or'], color=colors, alpha=0.85)
ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
ax.set_ylabel('log2(Odds Ratio)')
ax.set_title('Soil/Rhizosphere Enrichment\n(green=significant enrichment, grey=ns)')
for i, (_, row) in enumerate(enrich_df.iterrows()):
    ax.text(i, row['log2_or'] + (0.05 if row['log2_or'] >= 0 else -0.15),
            f'q={row["q_value"]:.2e}', ha='center', fontsize=7)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES, 'env_enrichment_barplot.png'), dpi=150, bbox_inches='tight')
plt.show()
print('Saved figures/env_enrichment_barplot.png')

Saved figures/env_enrichment_barplot.png

# Sensitivity: strict rhizosphere-only label
if 'soil_rhizosphere' in species_env['dominant_env'].values:
    # Check if genome_environment has isolation_source text for rhizosphere-strict
    strict_rhizo_pattern = r'rhizospher|root.nodule|nodule'
    if 'ncbi_isolation_source' in genome_env.columns:
        rhizo_genomes = genome_env[
            genome_env['ncbi_isolation_source'].str.contains(strict_rhizo_pattern, na=False, case=False)
        ]['gtdb_species_clade_id'].unique()
        df['is_rhizosphere_strict'] = df['gtdb_species_clade_id'].isin(rhizo_genomes).astype(int)
        print(f'Strict rhizosphere species: {df["is_rhizosphere_strict"].sum():,}')
        
        strict_results = [enrichment_test(df[df['is_soil'] == 1], g, 'is_rhizosphere_strict')
                          for g in FOCAL_GENES]
        strict_df = pd.DataFrame(strict_results)
        _, q_strict, _, _ = multipletests(strict_df['p_value'], method='fdr_bh')
        strict_df['q_value'] = q_strict
        print('\nSensitivity: rhizosphere-strict within soil/rhizosphere species:')
        print(strict_df[['gene', 'odds_ratio', 'q_value']].to_string(index=False))

# Save all enrichment results
enrich_df['test'] = 'soil_vs_other'
all_enrich = enrich_df.copy()
if not phylum_df.empty:
    phylum_df['test'] = 'phylum_stratified'
    all_enrich = pd.concat([all_enrich, phylum_df], ignore_index=True)
if not logit_df.empty:
    logit_df['test'] = 'logit_phylum_controlled'
    all_enrich = pd.concat([all_enrich, logit_df], ignore_index=True)

all_enrich.to_csv(os.path.join(DATA, 'env_enrichment_results.csv'), index=False)
print('\nSaved env_enrichment_results.csv')

Strict rhizosphere species: 589

Sensitivity: rhizosphere-strict within soil/rhizosphere species:
gene  odds_ratio      q_value
nifH    4.173571 4.237461e-13
pqqC    2.682366 1.367040e-11
acdS   10.590968 7.595005e-38
ipdC    0.404494 2.653880e-01
hcnC    1.867154 4.232764e-03

Saved env_enrichment_results.csv

print('=== H2 Summary: Environmental Enrichment ===')
sig_genes = enrich_df[enrich_df['significant'] & (enrich_df['odds_ratio'] > 1)]
print(f'Soil-enriched PGP genes (FDR q<0.05, OR>1): {len(sig_genes):,} / {len(FOCAL_GENES)}')
if len(sig_genes) > 0:
    for _, row in sig_genes.iterrows():
        print(f'  {row["gene"]:8s}: OR={row["odds_ratio"]:.2f}, q={row["q_value"]:.2e}')
h2_supported = len(sig_genes) >= 3
print(f'\nH2 supported (>=3 genes enriched): {h2_supported}')

=== H2 Summary: Environmental Enrichment ===
Soil-enriched PGP genes (FDR q<0.05, OR>1): 3 / 5
  pqqC    : OR=2.90, q=2.81e-53
  acdS    : OR=7.02, q=5.15e-62
  hcnC    : OR=1.85, q=6.10e-08

H2 supported (>=3 genes enriched): True

03 Environmental Selection

NB03: Environmental Selection (H2)¶

1. Merge PGP matrix with environment labels¶

2. Per-gene Fisher's exact enrichment test¶

3. Phylogenetic control within major phyla¶

4. Logistic regression with phylum fixed effects¶

5. Enrichment barplot¶