from pathlib import Path
import re
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from scipy.stats import linregress
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')
FIG_DIR.mkdir(parents=True, exist_ok=True)

geo = pd.read_csv(DATA_DIR / 'geochemistry_sample_matrix.tsv', sep='	')
community = pd.read_csv(DATA_DIR / 'community_taxon_counts.tsv', sep='	')
bridge = pd.read_csv(DATA_DIR / 'taxon_bridge.tsv', sep='	')
features = pd.read_csv(DATA_DIR / 'taxon_functional_features.tsv', sep='	')
meta = pd.read_csv(DATA_DIR / 'sample_location_metadata.tsv', sep='	')

for c in ['depth_meter', 'latitude_degree', 'longitude_degree']:
    if c in meta.columns:
        meta[c] = pd.to_numeric(meta[c], errors='coerce')

meta['location_prefix'] = meta['sdt_location_name'].astype(str).str.split('-').str[0].replace({'nan': np.nan})

print('geo:', geo.shape)
print('community:', community.shape)
print('bridge:', bridge.shape)
print('features:', features.shape)
print('meta:', meta.shape)
print('location_prefix levels:', meta['location_prefix'].nunique(dropna=True))

geo: (108, 49)
community: (41711, 5)
bridge: (8242, 7)
features: (3630, 5)
meta: (108, 8)
location_prefix levels: 5

metal_keywords = ['uranium', 'chromium', 'nickel', 'zinc', 'copper', 'cadmium', 'lead', 'arsenic', 'mercury']
metal_cols = [c for c in geo.columns if any(k in c.lower() for k in metal_keywords)]
if not metal_cols:
    raise RuntimeError('No contamination columns found in geochemistry_sample_matrix.tsv')

geo_model = geo[['sdt_sample_name'] + metal_cols].copy()
for c in metal_cols:
    geo_model[c] = pd.to_numeric(geo_model[c], errors='coerce')

zparts = []
for c in metal_cols:
    s = np.log1p(geo_model[c])
    std = s.std(ddof=0)
    z = (s - s.mean()) / (std if std else 1)
    zparts.append(z.rename(c + '_z'))

zmat = pd.concat(zparts, axis=1)
geo_model['contamination_index'] = zmat.mean(axis=1, skipna=True)
geo_model = geo_model[['sdt_sample_name', 'contamination_index']].dropna()
print('Samples with contamination_index:', len(geo_model))

Samples with contamination_index: 108

def parse_fraction_type(s: str) -> str:
    s = '' if pd.isna(s) else str(s)
    m = re.search(r'([0-9]+(?:\.[0-9]+)?)\s*micron filter', s.lower())
    if not m:
        return 'other_fraction'
    val = float(m.group(1))
    if abs(val - 0.2) < 1e-9:
        return '0.2_micron_filter'
    if abs(val - 10.0) < 1e-9:
        return '10_micron_filter'
    return f'{val:g}_micron_filter'

comm = community[['sdt_sample_name', 'sdt_community_name', 'genus', 'read_count']].copy()
comm['read_count'] = pd.to_numeric(comm['read_count'], errors='coerce').fillna(0)
comm = comm[comm['read_count'] > 0]
comm['community_fraction_type'] = comm['sdt_community_name'].map(parse_fraction_type)

# Primary analysis table: sample-level genus abundance collapsed across communities.
genus_counts = comm.groupby(['sdt_sample_name', 'genus'], as_index=False)['read_count'].sum()
totals = genus_counts.groupby('sdt_sample_name', as_index=False)['read_count'].sum().rename(columns={'read_count':'sample_total'})
genus_counts = genus_counts.merge(totals, on='sdt_sample_name', how='left')
genus_counts['rel_abundance'] = genus_counts['read_count'] / genus_counts['sample_total']

# Robustness table: retain community fraction types before sample-level collapse.
genus_counts_frac = comm.groupby(['sdt_sample_name', 'community_fraction_type', 'genus'], as_index=False)['read_count'].sum()
frac_totals = genus_counts_frac.groupby(['sdt_sample_name', 'community_fraction_type'], as_index=False)['read_count'].sum().rename(columns={'read_count': 'fraction_total'})
genus_counts_frac = genus_counts_frac.merge(frac_totals, on=['sdt_sample_name', 'community_fraction_type'], how='left')
genus_counts_frac['rel_abundance'] = genus_counts_frac['read_count'] / genus_counts_frac['fraction_total']

print('Sample-genus abundance rows (collapsed):', len(genus_counts))
print('Sample-fraction-genus rows (robustness):', len(genus_counts_frac))
print('Community fraction types:', comm['community_fraction_type'].value_counts().to_dict())

Sample-genus abundance rows (collapsed): 28001
Sample-fraction-genus rows (robustness): 39460
Community fraction types: {'10_micron_filter': 22532, '0.2_micron_filter': 19179}

def bh_fdr(pvals: np.ndarray) -> np.ndarray:
    pvals = np.asarray(pvals, dtype=float)
    qvals = np.full_like(pvals, np.nan, dtype=float)
    ok = np.isfinite(pvals)
    if ok.sum() == 0:
        return qvals

    pv = pvals[ok]
    m = len(pv)
    order = np.argsort(pv)
    ranked = pv[order]
    q = ranked * m / np.arange(1, m + 1)
    q = np.minimum.accumulate(q[::-1])[::-1]
    q = np.clip(q, 0, 1)

    out = np.empty_like(pv)
    out[order] = q
    qvals[ok] = out
    return qvals


def bootstrap_spearman_ci(x: np.ndarray, y: np.ndarray, n_boot: int = 400, seed: int = 42):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)
    if n < 10:
        return (np.nan, np.nan)

    rng = np.random.default_rng(seed)
    vals = []
    for _ in range(n_boot):
        idx = rng.integers(0, n, n)
        r, _ = spearmanr(x[idx], y[idx])
        if np.isfinite(r):
            vals.append(float(r))
    if len(vals) < max(20, n_boot // 5):
        return (np.nan, np.nan)
    return (float(np.quantile(vals, 0.025)), float(np.quantile(vals, 0.975)))


def bootstrap_beta_ci(formula: str, data: pd.DataFrame, coef: str = 'contamination_index', n_boot: int = 250, seed: int = 123):
    if len(data) < 20:
        return (np.nan, np.nan)

    rng = np.random.default_rng(seed)
    n = len(data)
    vals = []
    for _ in range(n_boot):
        idx = rng.integers(0, n, n)
        ds = data.iloc[idx]
        try:
            fit = smf.ols(formula, data=ds).fit()
            v = fit.params.get(coef, np.nan)
            if np.isfinite(v):
                vals.append(float(v))
        except Exception:
            continue
    if len(vals) < max(20, n_boot // 5):
        return (np.nan, np.nan)
    return (float(np.quantile(vals, 0.025)), float(np.quantile(vals, 0.975)))


if 'mapping_mode' not in features.columns:
    features['mapping_mode'] = 'relaxed_all_clades'

bridge_ok = bridge[bridge['mapping_tier'] == 'genus_exact'][['genus', 'genus_norm']].drop_duplicates()

site_scores_all = []
model_results_all = []
highcov_threshold = 0.25
primary_modes = {'strict_single_clade', 'relaxed_all_clades'}

for mapping_mode in sorted(features['mapping_mode'].dropna().unique()):
    feat_mode = features[features['mapping_mode'] == mapping_mode].copy()
    feat_wide = feat_mode.pivot_table(index='genus_norm', columns='feature_name', values='feature_value', aggfunc='mean').reset_index()

    feature_cols = ['cog_defense_fraction', 'cog_mobilome_fraction', 'cog_metabolism_fraction']

    # Collapsed sample-level features (main pipeline)
    genus_feat = genus_counts.merge(bridge_ok, on='genus', how='left').merge(feat_wide, on='genus_norm', how='left')
    for c in feature_cols:
        if c not in genus_feat.columns:
            genus_feat[c] = np.nan
        genus_feat[c] = pd.to_numeric(genus_feat[c], errors='coerce')

    genus_feat['has_feature_mapping'] = genus_feat[feature_cols].notna().any(axis=1)
    for c in feature_cols:
        genus_feat[c] = genus_feat[c].fillna(0.0)

    genus_feat['stress_function_score'] = 0.5 * genus_feat['cog_defense_fraction'] + 0.5 * genus_feat['cog_mobilome_fraction']
    for c in ['cog_defense_fraction', 'cog_mobilome_fraction', 'cog_metabolism_fraction', 'stress_function_score']:
        genus_feat[c + '_weighted'] = genus_feat['rel_abundance'] * genus_feat[c]

    site_scores = genus_feat.groupby('sdt_sample_name', as_index=False)[
        ['cog_defense_fraction_weighted', 'cog_mobilome_fraction_weighted', 'cog_metabolism_fraction_weighted', 'stress_function_score_weighted']
    ].sum()

    coverage = genus_feat.groupby('sdt_sample_name', as_index=False)['rel_abundance'].sum().rename(columns={'rel_abundance': 'total_abundance'})
    mapped_cov = genus_feat[genus_feat['has_feature_mapping']].groupby('sdt_sample_name', as_index=False)['rel_abundance'].sum().rename(columns={'rel_abundance': 'mapped_abundance_fraction'})
    coverage = coverage.merge(mapped_cov, on='sdt_sample_name', how='left').fillna({'mapped_abundance_fraction': 0.0})
    coverage['mapped_abundance_fraction'] = coverage['mapped_abundance_fraction'] / coverage['total_abundance'].replace(0, np.nan)
    coverage['mapped_abundance_fraction'] = coverage['mapped_abundance_fraction'].fillna(0.0)
    coverage['unmapped_abundance_fraction'] = 1.0 - coverage['mapped_abundance_fraction']

    site_scores = site_scores.rename(columns={
        'cog_defense_fraction_weighted': 'site_defense_score',
        'cog_mobilome_fraction_weighted': 'site_mobilome_score',
        'cog_metabolism_fraction_weighted': 'site_metabolism_score',
        'stress_function_score_weighted': 'site_stress_score'
    }).merge(coverage[['sdt_sample_name', 'mapped_abundance_fraction', 'unmapped_abundance_fraction']], on='sdt_sample_name', how='left')
    site_scores['mapping_mode'] = mapping_mode

    # Fraction-retaining robustness table
    genus_feat_frac = genus_counts_frac.merge(bridge_ok, on='genus', how='left').merge(feat_wide, on='genus_norm', how='left')
    for c in feature_cols:
        if c not in genus_feat_frac.columns:
            genus_feat_frac[c] = np.nan
        genus_feat_frac[c] = pd.to_numeric(genus_feat_frac[c], errors='coerce')

    genus_feat_frac['has_feature_mapping'] = genus_feat_frac[feature_cols].notna().any(axis=1)
    for c in feature_cols:
        genus_feat_frac[c] = genus_feat_frac[c].fillna(0.0)

    genus_feat_frac['stress_function_score'] = 0.5 * genus_feat_frac['cog_defense_fraction'] + 0.5 * genus_feat_frac['cog_mobilome_fraction']
    for c in ['cog_defense_fraction', 'cog_mobilome_fraction', 'cog_metabolism_fraction', 'stress_function_score']:
        genus_feat_frac[c + '_weighted'] = genus_feat_frac['rel_abundance'] * genus_feat_frac[c]

    frac_scores = genus_feat_frac.groupby(['sdt_sample_name', 'community_fraction_type'], as_index=False)[
        ['cog_defense_fraction_weighted', 'cog_mobilome_fraction_weighted', 'cog_metabolism_fraction_weighted', 'stress_function_score_weighted']
    ].sum()

    frac_cov = genus_feat_frac.groupby(['sdt_sample_name', 'community_fraction_type'], as_index=False)['rel_abundance'].sum().rename(columns={'rel_abundance': 'total_abundance'})
    frac_mapped_cov = genus_feat_frac[genus_feat_frac['has_feature_mapping']].groupby(['sdt_sample_name', 'community_fraction_type'], as_index=False)['rel_abundance'].sum().rename(columns={'rel_abundance': 'mapped_abundance_fraction'})
    frac_cov = frac_cov.merge(frac_mapped_cov, on=['sdt_sample_name', 'community_fraction_type'], how='left').fillna({'mapped_abundance_fraction': 0.0})
    frac_cov['mapped_abundance_fraction'] = frac_cov['mapped_abundance_fraction'] / frac_cov['total_abundance'].replace(0, np.nan)
    frac_cov['mapped_abundance_fraction'] = frac_cov['mapped_abundance_fraction'].fillna(0.0)

    frac_scores = frac_scores.rename(columns={
        'cog_defense_fraction_weighted': 'site_defense_score',
        'cog_mobilome_fraction_weighted': 'site_mobilome_score',
        'cog_metabolism_fraction_weighted': 'site_metabolism_score',
        'stress_function_score_weighted': 'site_stress_score'
    }).merge(
        frac_cov[['sdt_sample_name', 'community_fraction_type', 'mapped_abundance_fraction']],
        on=['sdt_sample_name', 'community_fraction_type'],
        how='left'
    )

    model_df = site_scores.merge(geo_model, on='sdt_sample_name', how='inner').merge(
        meta[['sdt_sample_name', 'depth_meter', 'latitude_degree', 'longitude_degree', 'location_prefix']],
        on='sdt_sample_name', how='left'
    ).dropna(subset=['contamination_index'])

    model_frac_df = frac_scores.merge(geo_model, on='sdt_sample_name', how='inner').dropna(subset=['contamination_index'])

    rows = []
    for y in ['site_defense_score', 'site_mobilome_score', 'site_metabolism_score', 'site_stress_score']:
        d = model_df[['contamination_index', y]].dropna()

        row = {
            'mapping_mode': mapping_mode,
            'outcome': y,
            'analysis_tier': 'confirmatory' if (y == 'site_defense_score' and mapping_mode in primary_modes) else 'exploratory',
            'status': 'ok',
            'n_samples': len(d),
            'spearman_rho': np.nan,
            'spearman_rho_ci_low': np.nan,
            'spearman_rho_ci_high': np.nan,
            'spearman_p': np.nan,
            'permutation_p': np.nan,
            'ols_beta_contamination': np.nan,
            'ols_p_contamination': np.nan,
            'adj_ols_beta_contamination': np.nan,
            'adj_ols_beta_ci_low': np.nan,
            'adj_ols_beta_ci_high': np.nan,
            'adj_ols_p_contamination': np.nan,
            'adj_ols_n': np.nan,
            'adj_cov_beta_contamination': np.nan,
            'adj_cov_beta_ci_low': np.nan,
            'adj_cov_beta_ci_high': np.nan,
            'adj_cov_p_contamination': np.nan,
            'adj_cov_n': np.nan,
            'adj_site_beta_contamination': np.nan,
            'adj_site_p_contamination': np.nan,
            'adj_site_n': np.nan,
            'adj_frac_beta_contamination': np.nan,
            'adj_frac_beta_ci_low': np.nan,
            'adj_frac_beta_ci_high': np.nan,
            'adj_frac_p_contamination': np.nan,
            'adj_frac_n': np.nan,
            'frac_0_2_n': np.nan,
            'frac_0_2_spearman_rho': np.nan,
            'frac_0_2_spearman_p': np.nan,
            'frac_10_n': np.nan,
            'frac_10_spearman_rho': np.nan,
            'frac_10_spearman_p': np.nan,
            'highcov_threshold': highcov_threshold,
            'highcov_n': np.nan,
            'highcov_spearman_rho': np.nan,
            'highcov_spearman_p': np.nan,
        }

        if len(d) < 10:
            row['status'] = 'insufficient_samples'
            rows.append(row)
            continue

        if d[y].nunique(dropna=True) < 2 or float(np.nanstd(d[y].to_numpy())) == 0.0:
            row['status'] = 'constant_feature'
            rows.append(row)
            continue

        rho, p_spear = spearmanr(d['contamination_index'], d[y])
        if np.isnan(rho):
            row['status'] = 'invalid_spearman'
            rows.append(row)
            continue

        lr = linregress(d['contamination_index'], d[y])
        row['spearman_rho'] = float(rho)
        ci_low, ci_high = bootstrap_spearman_ci(d['contamination_index'].to_numpy(), d[y].to_numpy())
        row['spearman_rho_ci_low'] = ci_low
        row['spearman_rho_ci_high'] = ci_high
        row['spearman_p'] = float(p_spear)
        row['ols_beta_contamination'] = float(lr.slope)
        row['ols_p_contamination'] = float(lr.pvalue)

        obs = abs(float(rho))
        perms = 500
        gt = 0
        arr_x = d['contamination_index'].to_numpy()
        arr_y = d[y].to_numpy()
        rng = np.random.default_rng(42)
        for _ in range(perms):
            r, _ = spearmanr(arr_x, rng.permutation(arr_y))
            if not np.isnan(r) and abs(float(r)) >= obs:
                gt += 1
        row['permutation_p'] = (gt + 1) / (perms + 1)

        d_adj = model_df[[y, 'contamination_index', 'depth_meter', 'latitude_degree', 'longitude_degree']].dropna()
        if len(d_adj) >= 20 and d_adj[y].nunique(dropna=True) > 1:
            try:
                fml = f"{y} ~ contamination_index + depth_meter + latitude_degree + longitude_degree"
                fit = smf.ols(fml, data=d_adj).fit()
                row['adj_ols_beta_contamination'] = float(fit.params.get('contamination_index', np.nan))
                ci_l, ci_h = bootstrap_beta_ci(fml, d_adj)
                row['adj_ols_beta_ci_low'] = ci_l
                row['adj_ols_beta_ci_high'] = ci_h
                row['adj_ols_p_contamination'] = float(fit.pvalues.get('contamination_index', np.nan))
                row['adj_ols_n'] = int(len(d_adj))
            except Exception:
                pass

        d_cov = model_df[[y, 'contamination_index', 'depth_meter', 'latitude_degree', 'longitude_degree', 'mapped_abundance_fraction']].dropna()
        if len(d_cov) >= 20 and d_cov[y].nunique(dropna=True) > 1:
            try:
                fml = f"{y} ~ contamination_index + depth_meter + latitude_degree + longitude_degree + mapped_abundance_fraction"
                fit_cov = smf.ols(fml, data=d_cov).fit()
                row['adj_cov_beta_contamination'] = float(fit_cov.params.get('contamination_index', np.nan))
                ci_l, ci_h = bootstrap_beta_ci(fml, d_cov)
                row['adj_cov_beta_ci_low'] = ci_l
                row['adj_cov_beta_ci_high'] = ci_h
                row['adj_cov_p_contamination'] = float(fit_cov.pvalues.get('contamination_index', np.nan))
                row['adj_cov_n'] = int(len(d_cov))
            except Exception:
                pass

        d_site = model_df[[y, 'contamination_index', 'depth_meter', 'location_prefix']].dropna()
        if len(d_site) >= 30 and d_site[y].nunique(dropna=True) > 1 and d_site['location_prefix'].nunique(dropna=True) > 1:
            try:
                fit_site = smf.ols(f"{y} ~ contamination_index + depth_meter + C(location_prefix)", data=d_site).fit()
                row['adj_site_beta_contamination'] = float(fit_site.params.get('contamination_index', np.nan))
                row['adj_site_p_contamination'] = float(fit_site.pvalues.get('contamination_index', np.nan))
                row['adj_site_n'] = int(len(d_site))
            except Exception:
                pass

        d_frac = model_frac_df[[y, 'contamination_index', 'mapped_abundance_fraction', 'community_fraction_type']].dropna()
        if len(d_frac) >= 20 and d_frac[y].nunique(dropna=True) > 1 and d_frac['community_fraction_type'].nunique(dropna=True) > 1:
            try:
                fml = f"{y} ~ contamination_index + mapped_abundance_fraction + C(community_fraction_type)"
                fit_frac = smf.ols(fml, data=d_frac).fit()
                row['adj_frac_beta_contamination'] = float(fit_frac.params.get('contamination_index', np.nan))
                ci_l, ci_h = bootstrap_beta_ci(fml, d_frac)
                row['adj_frac_beta_ci_low'] = ci_l
                row['adj_frac_beta_ci_high'] = ci_h
                row['adj_frac_p_contamination'] = float(fit_frac.pvalues.get('contamination_index', np.nan))
                row['adj_frac_n'] = int(len(d_frac))
            except Exception:
                pass

        for frac_label, prefix in [('0.2_micron_filter', 'frac_0_2'), ('10_micron_filter', 'frac_10')]:
            d_sub = model_frac_df.loc[model_frac_df['community_fraction_type'] == frac_label, ['contamination_index', y]].dropna()
            row[f'{prefix}_n'] = int(len(d_sub))
            if len(d_sub) >= 10 and d_sub[y].nunique(dropna=True) > 1:
                r_sub, p_sub = spearmanr(d_sub['contamination_index'], d_sub[y])
                row[f'{prefix}_spearman_rho'] = float(r_sub)
                row[f'{prefix}_spearman_p'] = float(p_sub)

        d_hi = model_df.loc[model_df['mapped_abundance_fraction'] >= highcov_threshold, ['contamination_index', y]].dropna()
        row['highcov_n'] = int(len(d_hi))
        if len(d_hi) >= 10 and d_hi[y].nunique(dropna=True) > 1:
            r_hi, p_hi = spearmanr(d_hi['contamination_index'], d_hi[y])
            row['highcov_spearman_rho'] = float(r_hi)
            row['highcov_spearman_p'] = float(p_hi)

        rows.append(row)

    site_scores_all.append(site_scores)
    model_results_all.append(pd.DataFrame(rows))

site_scores_all = pd.concat(site_scores_all, ignore_index=True)
model_results = pd.concat(model_results_all, ignore_index=True)

# Global FDR across all reported p-values
p_cols = [c for c in model_results.columns if ('_p' in c and not c.endswith('_fdr_q'))]
entries = []
for ridx in model_results.index:
    for col in p_cols:
        val = model_results.at[ridx, col]
        if pd.notna(val):
            entries.append((ridx, col, float(val)))

for col in p_cols:
    model_results[col + '_fdr_q'] = np.nan

if entries:
    all_p = np.array([x[2] for x in entries], dtype=float)
    all_q = bh_fdr(all_p)
    for (ridx, col, _), q in zip(entries, all_q):
        model_results.at[ridx, col + '_fdr_q'] = float(q)

q_cols = [c for c in model_results.columns if c.endswith('_fdr_q')]
model_results['min_p'] = model_results[p_cols].min(axis=1, skipna=True)
model_results['min_fdr_q'] = model_results[q_cols].min(axis=1, skipna=True)

if len(model_results):
    model_results = model_results.sort_values(['analysis_tier', 'mapping_mode', 'status', 'spearman_p'], na_position='last')

# Model-family sample counts table
families = {
    'base_spearman': 'n_samples',
    'adj_ols': 'adj_ols_n',
    'adj_cov': 'adj_cov_n',
    'adj_site': 'adj_site_n',
    'adj_frac': 'adj_frac_n',
    'highcov_subset': 'highcov_n',
    'frac_0_2_subset': 'frac_0_2_n',
    'frac_10_subset': 'frac_10_n',
}
count_rows = []
for _, r in model_results.iterrows():
    for fam, col in families.items():
        count_rows.append({
            'mapping_mode': r['mapping_mode'],
            'outcome': r['outcome'],
            'analysis_tier': r['analysis_tier'],
            'family': fam,
            'n_used': r.get(col, np.nan)
        })
model_family_counts = pd.DataFrame(count_rows)

# Contamination-index sensitivity (confirmatory endpoint only)
idx_variants = {}
idx_variants['composite_all_metals'] = geo_model[['sdt_sample_name', 'contamination_index']].rename(columns={'contamination_index': 'contam_idx'})

uranium_cols = [c for c in metal_cols if 'uranium' in c.lower()]
if uranium_cols:
    u = np.log1p(pd.to_numeric(geo[uranium_cols[0]], errors='coerce'))
    us = (u - u.mean()) / (u.std(ddof=0) if u.std(ddof=0) else 1)
    idx_variants['uranium_only'] = pd.DataFrame({'sdt_sample_name': geo['sdt_sample_name'], 'contam_idx': us}).dropna()

metal_z = zmat.copy()
var_rank = metal_z.var(axis=0, skipna=True).sort_values(ascending=False)
topk_cols = var_rank.head(min(3, len(var_rank))).index.tolist()
if topk_cols:
    idx_variants['top3_var_metals'] = pd.DataFrame({
        'sdt_sample_name': geo['sdt_sample_name'],
        'contam_idx': metal_z[topk_cols].mean(axis=1, skipna=True)
    }).dropna()

mz = metal_z.copy().fillna(metal_z.mean())
if mz.shape[1] >= 1:
    X = mz.to_numpy(dtype=float)
    X = X - X.mean(axis=0, keepdims=True)
    try:
        u, s, vt = np.linalg.svd(X, full_matrices=False)
        pc1 = u[:, 0] * s[0]
        idx_variants['pca1_metals'] = pd.DataFrame({'sdt_sample_name': geo['sdt_sample_name'], 'contam_idx': pc1})
    except Exception:
        pass

idx_sens_rows = []
for mapping_mode in sorted(site_scores_all['mapping_mode'].dropna().unique()):
    if mapping_mode not in primary_modes:
        continue
    defense = site_scores_all[site_scores_all['mapping_mode'] == mapping_mode][['sdt_sample_name', 'site_defense_score']]
    for variant, idxdf in idx_variants.items():
        d = defense.merge(idxdf, on='sdt_sample_name', how='inner').dropna()
        row = {
            'mapping_mode': mapping_mode,
            'index_variant': variant,
            'n_samples': len(d),
            'spearman_rho': np.nan,
            'spearman_rho_ci_low': np.nan,
            'spearman_rho_ci_high': np.nan,
            'spearman_p': np.nan,
        }
        if len(d) >= 10 and d['site_defense_score'].nunique(dropna=True) > 1:
            r, p = spearmanr(d['contam_idx'], d['site_defense_score'])
            row['spearman_rho'] = float(r)
            ci_l, ci_h = bootstrap_spearman_ci(d['contam_idx'].to_numpy(), d['site_defense_score'].to_numpy())
            row['spearman_rho_ci_low'] = ci_l
            row['spearman_rho_ci_high'] = ci_h
            row['spearman_p'] = float(p)
        idx_sens_rows.append(row)

index_sensitivity = pd.DataFrame(idx_sens_rows)
if len(index_sensitivity):
    index_sensitivity['spearman_p_fdr_q'] = bh_fdr(index_sensitivity['spearman_p'].to_numpy(dtype=float))

# Mapped-coverage decile diagnostics (defense endpoint)
decile_rows = []
for mapping_mode in sorted(site_scores_all['mapping_mode'].dropna().unique()):
    d = site_scores_all[site_scores_all['mapping_mode'] == mapping_mode][['sdt_sample_name', 'site_defense_score', 'mapped_abundance_fraction']].merge(
        geo_model[['sdt_sample_name', 'contamination_index']], on='sdt_sample_name', how='inner'
    ).dropna()
    if len(d) < 20:
        continue
    try:
        d = d.assign(mapped_cov_decile=pd.qcut(d['mapped_abundance_fraction'], 10, labels=False, duplicates='drop'))
    except Exception:
        continue
    for dec, g in d.groupby('mapped_cov_decile'):
        row = {
            'mapping_mode': mapping_mode,
            'mapped_cov_decile': int(dec) if pd.notna(dec) else np.nan,
            'n_samples': int(len(g)),
            'mapped_abundance_fraction_mean': float(g['mapped_abundance_fraction'].mean()),
            'spearman_rho': np.nan,
            'spearman_p': np.nan,
        }
        if len(g) >= 8 and g['site_defense_score'].nunique(dropna=True) > 1:
            rr, pp = spearmanr(g['contamination_index'], g['site_defense_score'])
            row['spearman_rho'] = float(rr)
            row['spearman_p'] = float(pp)
        decile_rows.append(row)

mapped_cov_deciles = pd.DataFrame(decile_rows)
if len(mapped_cov_deciles):
    mapped_cov_deciles['spearman_p_fdr_q'] = bh_fdr(mapped_cov_deciles['spearman_p'].to_numpy(dtype=float))

print('Site-score rows:', len(site_scores_all))
print('Model rows:', len(model_results))
print('Mapped-abundance summary by mode:')
print(site_scores_all.groupby('mapping_mode')['mapped_abundance_fraction'].describe().to_string())
print('High-coverage threshold:', highcov_threshold)
print('P-value columns with global FDR q-values:', p_cols)
print('Index sensitivity rows:', len(index_sensitivity))
print('Coverage-decile rows:', len(mapped_cov_deciles))
print('Model-family count rows:', len(model_family_counts))
model_results

Site-score rows: 324
Model rows: 12
Mapped-abundance summary by mode:
                            count      mean       std       min       25%       50%       75%       max
mapping_mode                                                                                           
relaxed_all_clades          108.0  0.342996  0.183007  0.031131  0.206741  0.303975  0.467964  0.853528
species_proxy_unique_genus  108.0  0.031014  0.054763  0.000196  0.010052  0.020619  0.036425  0.542533
strict_single_clade         108.0  0.342996  0.183007  0.031131  0.206741  0.303975  0.467964  0.853528
High-coverage threshold: 0.25
P-value columns with global FDR q-values: ['spearman_p', 'permutation_p', 'ols_p_contamination', 'adj_ols_p_contamination', 'adj_cov_p_contamination', 'adj_site_p_contamination', 'adj_frac_p_contamination', 'frac_0_2_spearman_p', 'frac_10_spearman_p', 'highcov_spearman_p']
Index sensitivity rows: 8
Coverage-decile rows: 30
Model-family count rows: 96

# Site-score and model generation moved into previous cell (mapping-mode loop).

# Statistical testing moved into previous cell (mapping-mode loop).

site_scores_all.to_csv(DATA_DIR / 'site_functional_scores.tsv', sep='	', index=False)
model_results.to_csv(DATA_DIR / 'model_results.tsv', sep='	', index=False)
index_sensitivity.to_csv(DATA_DIR / 'contamination_index_sensitivity.tsv', sep='	', index=False)
mapped_cov_deciles.to_csv(DATA_DIR / 'mapped_coverage_deciles.tsv', sep='	', index=False)
model_family_counts.to_csv(DATA_DIR / 'model_family_sample_counts.tsv', sep='	', index=False)

# Diagnostic 1: contamination index distribution
plt.figure(figsize=(6, 4))
plt.hist(geo_model['contamination_index'].dropna(), bins=20)
plt.xlabel('Contamination index (z-score composite)')
plt.ylabel('Sample count')
plt.title('Distribution of contamination index')
plt.tight_layout()
plt.savefig(FIG_DIR / 'contamination_index_distribution.png', dpi=160)
plt.close()

# Diagnostic 2: mapping coverage by mode
genera_total = bridge[['genus']].drop_duplicates().shape[0]
genera_mapped = bridge[bridge['mapping_tier'] == 'genus_exact'][['genus']].drop_duplicates().shape[0]
by_mode = (features[['mapping_mode', 'genus_norm']].drop_duplicates().groupby('mapping_mode')['genus_norm'].nunique())

plt.figure(figsize=(6, 4))
labels = ['enigma_total_genera', 'mapped_genera'] + [f"features_{m}" for m in by_mode.index.tolist()]
vals = [genera_total, genera_mapped] + by_mode.tolist()
plt.bar(labels, vals)
plt.xticks(rotation=20, ha='right')
plt.ylabel('Count')
plt.title('Taxonomy mapping / feature coverage')
plt.tight_layout()
plt.savefig(FIG_DIR / 'mapping_coverage_by_mode.png', dpi=160)
plt.close()

# Main association figure by mapping mode
modes = sorted(site_scores_all['mapping_mode'].dropna().unique())
fig, axes = plt.subplots(1, len(modes), figsize=(6 * max(1, len(modes)), 4), sharey=True)
if len(modes) == 1:
    axes = [axes]

for ax, mode in zip(axes, modes):
    md = site_scores_all[site_scores_all['mapping_mode'] == mode].merge(geo_model, on='sdt_sample_name', how='inner').dropna()
    ax.scatter(md['contamination_index'], md['site_stress_score'], s=16, alpha=0.7)
    if len(md) >= 3:
        m, b = np.polyfit(md['contamination_index'], md['site_stress_score'], 1)
        xs = np.linspace(md['contamination_index'].min(), md['contamination_index'].max(), 100)
        ax.plot(xs, m * xs + b)
    ax.set_title(mode)
    ax.set_xlabel('Contamination index')
axes[0].set_ylabel('Site stress functional score')
fig.suptitle('Contamination vs stress score by mapping mode')
fig.tight_layout()
fig.savefig(FIG_DIR / 'contamination_vs_functional_score.png', dpi=160)
plt.close(fig)

print('Saved:')
print(' -', (DATA_DIR / 'site_functional_scores.tsv').resolve())
print(' -', (DATA_DIR / 'model_results.tsv').resolve())
print(' -', (DATA_DIR / 'contamination_index_sensitivity.tsv').resolve())
print(' -', (DATA_DIR / 'mapped_coverage_deciles.tsv').resolve())
print(' -', (DATA_DIR / 'model_family_sample_counts.tsv').resolve())
print(' -', (FIG_DIR / 'contamination_vs_functional_score.png').resolve())
print(' -', (FIG_DIR / 'contamination_index_distribution.png').resolve())
print(' -', (FIG_DIR / 'mapping_coverage_by_mode.png').resolve())
print('\nTop model rows:')
print(model_results.head(12).to_string(index=False) if len(model_results) else 'No model rows produced')
print('\nConfirmatory index sensitivity rows:')
print(index_sensitivity.to_string(index=False) if len(index_sensitivity) else 'No index sensitivity rows')

Saved:
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/data/site_functional_scores.tsv
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/data/model_results.tsv
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/data/contamination_index_sensitivity.tsv
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/data/mapped_coverage_deciles.tsv
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/data/model_family_sample_counts.tsv
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/figures/contamination_vs_functional_score.png
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/figures/contamination_index_distribution.png
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/figures/mapping_coverage_by_mode.png

Top model rows:
              mapping_mode               outcome analysis_tier status  n_samples  spearman_rho  spearman_rho_ci_low  spearman_rho_ci_high  spearman_p  permutation_p  ols_beta_contamination  ols_p_contamination  adj_ols_beta_contamination  adj_ols_beta_ci_low  adj_ols_beta_ci_high  adj_ols_p_contamination  adj_ols_n  adj_cov_beta_contamination  adj_cov_beta_ci_low  adj_cov_beta_ci_high  adj_cov_p_contamination  adj_cov_n  adj_site_beta_contamination  adj_site_p_contamination  adj_site_n  adj_frac_beta_contamination  adj_frac_beta_ci_low  adj_frac_beta_ci_high  adj_frac_p_contamination  adj_frac_n  frac_0_2_n  frac_0_2_spearman_rho  frac_0_2_spearman_p  frac_10_n  frac_10_spearman_rho  frac_10_spearman_p  highcov_threshold  highcov_n  highcov_spearman_rho  highcov_spearman_p  spearman_p_fdr_q  permutation_p_fdr_q  ols_p_contamination_fdr_q  adj_ols_p_contamination_fdr_q  adj_cov_p_contamination_fdr_q  adj_site_p_contamination_fdr_q  adj_frac_p_contamination_fdr_q  frac_0_2_spearman_p_fdr_q  frac_10_spearman_p_fdr_q  highcov_spearman_p_fdr_q    min_p  min_fdr_q
        relaxed_all_clades    site_defense_score  confirmatory     ok        108      0.058685            -0.128230              0.250245    0.546311       0.530938                0.000946             0.068083                    0.001073            -0.000438              0.003664                 0.038552        108                    0.000751         2.236492e-04              0.001779                 0.000398        108                     0.001203                  0.045082         108                     0.000607              0.000213               0.001221                  0.001444         212         106              -0.029166             0.766631        106              0.012621            0.897831               0.25         68              0.280032            0.020730          0.862427             0.862427                   0.403092                       0.354630                       0.046162                        0.354630                        0.083765                   0.975506                  0.975734                  0.300578 0.000398   0.046162
       strict_single_clade    site_defense_score  confirmatory     ok        108      0.068240            -0.110762              0.252974    0.482840       0.463074                0.000888             0.127177                    0.001009            -0.000635              0.003622                 0.084181        108                    0.000640         1.685966e-04              0.001538                 0.003540        108                     0.001297                  0.055504         108                     0.000566              0.000214               0.001113                  0.005480         212         106              -0.027433             0.780136        106              0.025841            0.792600               0.25         68              0.311181            0.009796          0.848627             0.826409                   0.430143                       0.403092                       0.130181                        0.378736                        0.130181                   0.975506                  0.975734                  0.189382 0.003540   0.130181
        relaxed_all_clades   site_mobilome_score   exploratory     ok        108     -0.018453            -0.184271              0.166175    0.849655       0.836327                0.000287             0.865651                    0.000761            -0.002836              0.006000                 0.652728        108                   -0.000342        -1.364619e-03              0.000325                 0.506979        108                     0.002202                  0.257318         108                    -0.000225             -0.000936               0.000325                  0.643337         212         106              -0.074550             0.447557        106             -0.051263            0.601758               0.25         68              0.185556            0.129785          0.975734             0.975734                   0.975734                       0.901386                       0.856802                        0.671361                        0.900467                   0.824074                  0.894923                  0.430143 0.129785   0.430143
        relaxed_all_clades site_metabolism_score   exploratory     ok        108     -0.006450            -0.183459              0.188108    0.947181       0.944112                0.002382             0.761656                    0.004583            -0.012561              0.028914                 0.559485        108                   -0.000757        -2.197109e-03              0.001416                 0.375465        108                     0.009611                  0.284450         108                    -0.000388             -0.001453               0.001112                  0.634365         212         106              -0.072927             0.457529        106             -0.045359            0.644299               0.25         68              0.192350            0.116074          0.975734             0.975734                   0.975506                       0.865337                       0.774324                        0.700863                        0.900467                   0.826409                  0.900467                  0.408018 0.116074   0.408018
        relaxed_all_clades     site_stress_score   exploratory     ok        108      0.000934            -0.165851              0.187886    0.992349       0.984032                0.000616             0.572974                    0.000917            -0.001619              0.004645                 0.400512        108                    0.000204        -2.205891e-04              0.000606                 0.525209        108                     0.001703                  0.176033         108                     0.000191             -0.000229               0.000591                  0.540363         212         106              -0.064745             0.509649        106             -0.034224            0.727627               0.25         68              0.217468            0.074843          0.992349             0.992349                   0.874540                       0.774324                       0.862427                        0.537364                        0.862427                   0.856802                  0.948769                  0.403092 0.074843   0.403092
species_proxy_unique_genus    site_defense_score   exploratory     ok        108      0.168691            -0.005029              0.351323    0.080947       0.075848                0.000179             0.227121                    0.000155             0.000005              0.000476                 0.308858        108                    0.000022         1.906993e-06              0.000059                 0.290012        108                    -0.000011                  0.947098         108                     0.000035              0.000016               0.000069                  0.048915         212         106               0.202338             0.037520        106              0.123279            0.208033               0.25          1                   NaN                 NaN          0.403092             0.403092                   0.642587                       0.716550                       0.700863                        0.975734                        0.354630                   0.354630                  0.603296                       NaN 0.037520   0.354630
species_proxy_unique_genus   site_mobilome_score   exploratory     ok        108      0.157183            -0.015505              0.330853    0.104248       0.097804                0.000482             0.273870                    0.000449            -0.000001              0.001346                 0.322545        108                    0.000050        -1.163060e-05              0.000113                 0.092259        108                    -0.000058                  0.907366         108                     0.000040              0.000003               0.000079                  0.081224         212         106               0.193178             0.047251        106              0.094894            0.333248               0.25          1                   NaN                 NaN          0.403092             0.403092                   0.690628                       0.719525                       0.403092                        0.975734                        0.403092                   0.354630                  0.727746                       NaN 0.047251   0.354630
species_proxy_unique_genus     site_stress_score   exploratory     ok        108      0.154953            -0.019326              0.329397    0.109316       0.101796                0.000331             0.260305                    0.000302             0.000005              0.000910                 0.317954        108                    0.000036         9.620980e-07              0.000074                 0.036247        108                    -0.000035                  0.917421         108                     0.000038              0.000015               0.000067                  0.005611         212         106               0.200423             0.039402        106              0.101192            0.302011               0.25          1                   NaN                 NaN          0.408018             0.403092                   0.671361                       0.719525                       0.354630                        0.975734                        0.130181                   0.354630                  0.714966                       NaN 0.005611   0.130181
species_proxy_unique_genus site_metabolism_score   exploratory     ok        108      0.152219            -0.010797              0.324520    0.115793       0.099800                0.002452             0.338778                    0.002231            -0.000105              0.007709                 0.397271        108                   -0.000092        -2.991098e-04              0.000252                 0.398051        108                    -0.000619                  0.831392         108                    -0.000098             -0.000248               0.000127                  0.260442         212         106               0.225221             0.020276        106              0.088979            0.364389               0.25          1                   NaN                 NaN          0.408018             0.403092                   0.727746                       0.774324                       0.774324                        0.975734                        0.671361                   0.300578                  0.768529                       NaN 0.020276   0.300578
       strict_single_clade site_metabolism_score   exploratory     ok        108     -0.014662            -0.186744              0.178184    0.880288       0.868263                0.000466             0.950499                    0.002674            -0.012635              0.024162                 0.720587        108                   -0.002354        -5.849028e-03              0.000993                 0.093372        108                     0.007482                  0.381496         108                    -0.001895             -0.004247               0.000083                  0.143141         212         106              -0.079930             0.415369        106             -0.052019            0.596405               0.25         68              0.175745            0.151701          0.975734             0.975734                   0.975734                       0.948769                       0.403092                        0.774324                        0.461232                   0.789882                  0.894923                  0.475603 0.093372   0.403092
       strict_single_clade     site_stress_score   exploratory     ok        108      0.013919            -0.155106              0.192222    0.886313       0.870259                0.000690             0.673136                    0.000985            -0.002745              0.006258                 0.550169        108                   -0.000014        -1.243660e-03              0.000929                 0.985703        108                     0.002487                  0.190120         108                     0.000225             -0.000704               0.001242                  0.782087         212         106              -0.067335             0.492829        106             -0.034184            0.727935               0.25         68              0.248883            0.040695          0.975734             0.975734                   0.918633                       0.862427                       0.992349                        0.565486                        0.975506                   0.853256                  0.948769                  0.354630 0.040695   0.354630
       strict_single_clade   site_mobilome_score   exploratory     ok        108     -0.007459            -0.169651              0.171033    0.938926       0.938124                0.000492             0.856817                    0.000961            -0.004833              0.009190                 0.726901        108                   -0.000667        -3.220076e-03              0.000861                 0.631781        108                     0.003677                  0.244578         108                    -0.000117             -0.001934               0.001692                  0.936300         212         106              -0.076192             0.437587        106             -0.047777            0.626727               0.25         68              0.220865            0.070303          0.975734             0.975734                   0.975734                       0.948769                       0.900467                        0.671361                        0.975734                   0.818711                  0.900467                  0.403092 0.070303   0.403092

Confirmatory index sensitivity rows:
       mapping_mode        index_variant  n_samples  spearman_rho  spearman_rho_ci_low  spearman_rho_ci_high  spearman_p  spearman_p_fdr_q
 relaxed_all_clades composite_all_metals        108      0.058685            -0.128230              0.250245    0.546311          0.546311
 relaxed_all_clades         uranium_only        108     -0.097926            -0.294196              0.108525    0.313329          0.546311
 relaxed_all_clades      top3_var_metals        108      0.079387            -0.107038              0.284737    0.414098          0.546311
 relaxed_all_clades          pca1_metals        108      0.082264            -0.085891              0.273366    0.397330          0.546311
strict_single_clade composite_all_metals        108      0.068240            -0.110762              0.252974    0.482840          0.546311
strict_single_clade         uranium_only        108     -0.121648            -0.317446              0.078924    0.209786          0.546311
strict_single_clade      top3_var_metals        108      0.077967            -0.112999              0.279743    0.422522          0.546311
strict_single_clade          pca1_metals        108      0.090895            -0.078812              0.266110    0.349501          0.546311

03 Contamination Functional Models

NB03: Contamination vs Functional Potential Models¶

Modeling Assumptions and Sensitivity¶

	mapping_mode	outcome	analysis_tier	status	n_samples	spearman_rho	spearman_rho_ci_low	spearman_rho_ci_high	spearman_p	permutation_p	...	ols_p_contamination_fdr_q	adj_ols_p_contamination_fdr_q	adj_cov_p_contamination_fdr_q	adj_site_p_contamination_fdr_q	adj_frac_p_contamination_fdr_q	frac_0_2_spearman_p_fdr_q	frac_10_spearman_p_fdr_q	highcov_spearman_p_fdr_q	min_p	min_fdr_q
0	relaxed_all_clades	site_defense_score	confirmatory	ok	108	0.058685	-0.128230	0.250245	0.546311	0.530938	...	0.403092	0.354630	0.046162	0.354630	0.083765	0.975506	0.975734	0.300578	0.000398	0.046162
8	strict_single_clade	site_defense_score	confirmatory	ok	108	0.068240	-0.110762	0.252974	0.482840	0.463074	...	0.430143	0.403092	0.130181	0.378736	0.130181	0.975506	0.975734	0.189382	0.003540	0.130181
1	relaxed_all_clades	site_mobilome_score	exploratory	ok	108	-0.018453	-0.184271	0.166175	0.849655	0.836327	...	0.975734	0.901386	0.856802	0.671361	0.900467	0.824074	0.894923	0.430143	0.129785	0.430143
2	relaxed_all_clades	site_metabolism_score	exploratory	ok	108	-0.006450	-0.183459	0.188108	0.947181	0.944112	...	0.975506	0.865337	0.774324	0.700863	0.900467	0.826409	0.900467	0.408018	0.116074	0.408018
3	relaxed_all_clades	site_stress_score	exploratory	ok	108	0.000934	-0.165851	0.187886	0.992349	0.984032	...	0.874540	0.774324	0.862427	0.537364	0.862427	0.856802	0.948769	0.403092	0.074843	0.403092
4	species_proxy_unique_genus	site_defense_score	exploratory	ok	108	0.168691	-0.005029	0.351323	0.080947	0.075848	...	0.642587	0.716550	0.700863	0.975734	0.354630	0.354630	0.603296	NaN	0.037520	0.354630
5	species_proxy_unique_genus	site_mobilome_score	exploratory	ok	108	0.157183	-0.015505	0.330853	0.104248	0.097804	...	0.690628	0.719525	0.403092	0.975734	0.403092	0.354630	0.727746	NaN	0.047251	0.354630
7	species_proxy_unique_genus	site_stress_score	exploratory	ok	108	0.154953	-0.019326	0.329397	0.109316	0.101796	...	0.671361	0.719525	0.354630	0.975734	0.130181	0.354630	0.714966	NaN	0.005611	0.130181
6	species_proxy_unique_genus	site_metabolism_score	exploratory	ok	108	0.152219	-0.010797	0.324520	0.115793	0.099800	...	0.727746	0.774324	0.774324	0.975734	0.671361	0.300578	0.768529	NaN	0.020276	0.300578
10	strict_single_clade	site_metabolism_score	exploratory	ok	108	-0.014662	-0.186744	0.178184	0.880288	0.868263	...	0.975734	0.948769	0.403092	0.774324	0.461232	0.789882	0.894923	0.475603	0.093372	0.403092
11	strict_single_clade	site_stress_score	exploratory	ok	108	0.013919	-0.155106	0.192222	0.886313	0.870259	...	0.918633	0.862427	0.992349	0.565486	0.975506	0.853256	0.948769	0.354630	0.040695	0.354630
9	strict_single_clade	site_mobilome_score	exploratory	ok	108	-0.007459	-0.169651	0.171033	0.938926	0.938124	...	0.975734	0.948769	0.900467	0.671361	0.975734	0.818711	0.900467	0.403092	0.070303	0.403092