import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
from pathlib import Path
import sys
import warnings
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 10)
plt.rcParams['font.size'] = 11

PROJECT_ROOT = Path('/home/cjneely/repos/BERIL-research-observatory/projects/metabolic_capability_dependency')
DATA_DIR = PROJECT_ROOT / 'data'
FIG_DIR = PROJECT_ROOT / 'figures'
FIG_DIR.mkdir(exist_ok=True)

sys.path.insert(0, str(PROJECT_ROOT / 'src'))
from pathway_utils import build_pathway_matrix, compute_pathway_heterogeneity

print(f'Data directory: {DATA_DIR}')
print(f'Figures directory: {FIG_DIR}')

Data directory: /home/cjneely/repos/BERIL-research-observatory/projects/metabolic_capability_dependency/data
Figures directory: /home/cjneely/repos/BERIL-research-observatory/projects/metabolic_capability_dependency/figures

gapmind_path = DATA_DIR / 'gapmind_genome_pathways.csv'
if not gapmind_path.exists():
    raise FileNotFoundError(f'GapMind data not found at {gapmind_path}. Run NB01 first.')

print('Loading gapmind_genome_pathways.csv...')
gapmind = pd.read_csv(gapmind_path, dtype={'genome_id': str, 'species': str,
                                            'pathway': str, 'best_score': np.int8,
                                            'is_complete': np.int8})
print(f'Loaded {len(gapmind):,} records')
print(f'  Genomes: {gapmind["genome_id"].nunique():,}')
print(f'  Species: {gapmind["species"].nunique():,}')
print(f'  Pathways: {gapmind["pathway"].nunique()}')

species_summary = pd.read_csv(DATA_DIR / 'gapmind_species_summary.csv')
print(f'\nSpecies summary: {len(species_summary):,} species')

Loading gapmind_genome_pathways.csv...
Loaded 23,424,480 records
  Genomes: 292,806
  Species: 27,690
  Pathways: 80

Species summary: 27,690 species

MIN_GENOMES = 50
MIN_VARIABLE_PATHWAYS = 15
MAX_SPECIES = 10  # analyze top N species

# Pre-filter species with enough genomes
large_species = species_summary[species_summary['n_genomes'] >= MIN_GENOMES].copy()
print(f'Species with ≥{MIN_GENOMES} genomes: {len(large_species):,}')

# For each species, compute how many pathways are variable (10-90% completion)
variable_pathway_counts = []

for _, row in large_species.iterrows():
    sp = row['species']
    sp_data = gapmind[gapmind['species'] == sp]
    if len(sp_data) == 0:
        continue

    pathway_rates = sp_data.groupby('pathway')['is_complete'].mean()
    n_variable = ((pathway_rates >= 0.10) & (pathway_rates <= 0.90)).sum()

    variable_pathway_counts.append({
        'species': sp,
        'n_genomes': row['n_genomes'],
        'n_variable_pathways': int(n_variable),
    })

heterogeneity_df = pd.DataFrame(variable_pathway_counts).sort_values(
    'n_genomes', ascending=False
)

eligible = heterogeneity_df[heterogeneity_df['n_variable_pathways'] >= MIN_VARIABLE_PATHWAYS]
print(f'Species with ≥{MIN_VARIABLE_PATHWAYS} variable pathways: {len(eligible):,}')

target_species = eligible.head(MAX_SPECIES)['species'].tolist()
print(f'\nTarget species for ecotype analysis ({len(target_species)}):')
for sp in target_species:
    row = eligible[eligible['species'] == sp].iloc[0]
    print(f'  {sp}: {int(row["n_genomes"]):,} genomes, '
          f'{int(row["n_variable_pathways"])} variable pathways')

Species with ≥50 genomes: 457
Species with ≥15 variable pathways: 11

Target species for ecotype analysis (10):
  s__Salmonella_enterica--RS_GCF_000006945.2: 11,396 genomes, 53 variable pathways
  s__Stutzerimonas_stutzeri--RS_GCF_000219605.1: 149 genomes, 19 variable pathways
  s__Pelagibacter_sp003209915--GB_GCA_003209915.1: 79 genomes, 23 variable pathways
  s__Prochlorococcus_A_sp000635495--GB_GCA_000635495.1: 74 genomes, 22 variable pathways
  s__Acetatifactor_intestinalis--RS_GCF_009695995.1: 59 genomes, 15 variable pathways
  s__Alteromonas_macleodii--RS_GCF_000172635.2: 56 genomes, 19 variable pathways
  s__Ruminococcus_E_sp900316555--GB_GCA_900316555.1: 53 genomes, 16 variable pathways
  s__Phenylobacterium_sp020401865--GB_GCA_020401865.1: 51 genomes, 21 variable pathways
  s__PALSA-747_sp903897155--GB_GCA_903897155.1: 51 genomes, 22 variable pathways
  s__Limivicinus_sp900317375--GB_GCA_900317375.1: 50 genomes, 21 variable pathways

try:
    spark = get_spark_session()
    SPARK_AVAILABLE = True
    print('Spark session available — will query environment metadata')
except Exception as e:
    SPARK_AVAILABLE = False
    print(f'Spark not available ({e}) — skipping environment metadata')

Spark session available — will query environment metadata

env_metadata = None

if SPARK_AVAILABLE:
    # Discover available environment-related columns
    # gtdb_metadata uses `accession` (e.g. GB_GCA_000123.1) not genome_id
    schema_df = spark.sql('DESCRIBE kbase_ke_pangenome.gtdb_metadata').toPandas()
    available_cols = set(schema_df['col_name'].str.lower().tolist())

    print(f'Total columns in gtdb_metadata: {len(available_cols)}')

    # Preferred env columns; take whatever exists
    ENV_COL_CANDIDATES = [
        'ncbi_isolation_source',
        'ncbi_biosample_accession',
        'ncbi_host',
        'ncbi_country',
        'ncbi_lat_lon',
        'ncbi_biosample',
        'ncbi_isolate',
        'ncbi_wgs_master',
    ]
    env_cols = [c for c in ENV_COL_CANDIDATES if c in available_cols]
    print(f'Available env columns: {env_cols if env_cols else "none — will skip env test"}')

    if not env_cols:
        print('No NCBI environment columns found in gtdb_metadata — skipping env metadata query.')
    else:
        # Get all genome IDs for the target species
        target_genomes = gapmind[
            gapmind['species'].isin(target_species)
        ]['genome_id'].unique().tolist()
        print(f'\nQuerying metadata for {len(target_genomes):,} genomes from {len(target_species)} species...')

        # GapMind genome_ids are GCA_XXXX.Y / GCF_XXXX.Y (no prefix).
        # gtdb_metadata.accession is GB_GCA_XXXX.Y / RS_GCF_XXXX.Y — strip prefix to match.
        genome_list_df = spark.createDataFrame(
            pd.DataFrame({'genome_id': target_genomes})
        )
        genome_list_df.createOrReplaceTempView('target_genomes_view')

        select_cols = ', '.join([f'm.{c}' for c in env_cols])
        env_metadata_spark = spark.sql(f"""
            SELECT
                REGEXP_REPLACE(m.accession, '^(GB_|RS_)', '') AS genome_id,
                {select_cols}
            FROM kbase_ke_pangenome.gtdb_metadata m
            INNER JOIN target_genomes_view t
                ON REGEXP_REPLACE(m.accession, '^(GB_|RS_)', '') = t.genome_id
        """)

        env_metadata = env_metadata_spark.toPandas()
        print(f'Retrieved metadata for {len(env_metadata):,} genomes')
        print(env_metadata.head(5).to_string())

        env_metadata.to_csv(DATA_DIR / 'genome_env_metadata.csv', index=False)
        print(f'Saved: {DATA_DIR}/genome_env_metadata.csv')

else:
    env_path = DATA_DIR / 'genome_env_metadata.csv'
    if env_path.exists():
        env_metadata = pd.read_csv(env_path)
        print(f'Loaded cached environment metadata: {len(env_metadata):,} records')
    else:
        print('No environment metadata available — clustering will proceed without it')

Total columns in gtdb_metadata: 110
Available env columns: ['ncbi_isolation_source', 'ncbi_country', 'ncbi_lat_lon', 'ncbi_biosample', 'ncbi_isolate', 'ncbi_wgs_master']

Querying metadata for 12,018 genomes from 10 species...
Retrieved metadata for 12,018 genomes
         genome_id    ncbi_isolation_source ncbi_country ncbi_lat_lon ncbi_biosample ncbi_isolate ncbi_wgs_master
0  GCF_000312805.2                     none    Australia         none   SAMN02225276         none  AMEH00000000.2
1  GCF_000757955.1                    stool      USA: NY         none   SAMN01942271         none  JRLP00000000.1
2  GCF_001364375.1                    blood     Cambodia         none   SAMEA2148542         none  CEZR00000000.1
3  GCF_000748005.1             Cumin Powder        India         none   SAMN01816226         none  JRDA00000000.1
4  GCF_000330005.1  chicken (Gallus gallus)        China         none   SAMN01041153         none  ALHC00000000.1
Saved: /home/cjneely/repos/BERIL-research-observatory/projects/metabolic_capability_dependency/data/genome_env_metadata.csv

from pathway_utils import compute_pathway_heterogeneity, categorize_pathway

# Compute heterogeneity for each (species, pathway) pair
hetero_records = []
for sp in target_species:
    sp_data = gapmind[gapmind['species'] == sp]
    matrix = build_pathway_matrix(gapmind, sp)
    h = compute_pathway_heterogeneity(matrix)
    for pw, score in h.items():
        hetero_records.append({
            'species': sp,
            'pathway': pw,
            'heterogeneity': score,
            'completion_rate': matrix[pw].mean(),
            'pathway_category': categorize_pathway(pw),
        })

hetero_df = pd.DataFrame(hetero_records)

# Mean heterogeneity per pathway across all target species
mean_hetero = (
    hetero_df.groupby(['pathway', 'pathway_category'])['heterogeneity']
    .mean()
    .reset_index()
    .sort_values('heterogeneity', ascending=False)
)

print('Top 20 most heterogeneous pathways (mean across target species):')
print(mean_hetero.head(20).to_string(index=False))

hetero_df.to_csv(DATA_DIR / 'pathway_heterogeneity.csv', index=False)
print(f'\nSaved: {DATA_DIR}/pathway_heterogeneity.csv')

# Figure: top 30 heterogeneous pathways
top30 = mean_hetero.head(30)
palette = {'amino_acid': '#e74c3c', 'carbon': '#3498db', 'cofactor': '#2ecc71', 'other': '#95a5a6'}
colors = top30['pathway_category'].map(palette)

fig, ax = plt.subplots(figsize=(10, 8))
bars = ax.barh(range(len(top30)), top30['heterogeneity'].values, color=colors)
ax.set_yticks(range(len(top30)))
ax.set_yticklabels(top30['pathway'].values, fontsize=9)
ax.set_xlabel('Mean Pathway Heterogeneity (4·p·(1-p))', fontsize=11)
ax.set_title('Most Variable Pathways Across Target Species', fontsize=13)

# Legend
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor=c, label=l) for l, c in palette.items()]
ax.legend(handles=legend_elements, loc='lower right', fontsize=9)

plt.tight_layout()
plt.savefig(FIG_DIR / 'nb05_pathway_heterogeneity.png', dpi=150, bbox_inches='tight')
plt.close()
print('Saved: figures/nb05_pathway_heterogeneity.png')

Top 20 most heterogeneous pathways (mean across target species):
          pathway pathway_category  heterogeneity
              val       amino_acid       0.600931
              leu       amino_acid       0.600931
              trp       amino_acid       0.510638
              lys       amino_acid       0.485343
              thr       amino_acid       0.485343
              cys       amino_acid       0.471149
      deoxyribose           carbon       0.447398
              asn       amino_acid       0.435358
              pro       amino_acid       0.430322
       tryptophan       amino_acid       0.408835
              ile       amino_acid       0.404941
              arg       amino_acid       0.400691
4-hydroxybenzoate           carbon       0.390611
              met       amino_acid       0.384979
              ser       amino_acid       0.377891
          ethanol           carbon       0.367788
    phenylalanine       amino_acid       0.329311
           fucose           carbon       0.318993
        thymidine           carbon       0.316237
     deoxyinosine           carbon       0.316237

Saved: /home/cjneely/repos/BERIL-research-observatory/projects/metabolic_capability_dependency/data/pathway_heterogeneity.csv
Saved: figures/nb05_pathway_heterogeneity.png

def cluster_species(
    gapmind_data: pd.DataFrame,
    species_name: str,
    min_heterogeneity: float = 0.05,
    k_range: range = range(2, 7)
) -> dict:
    """Cluster genomes for one species by metabolic pathway profiles."""
    matrix = build_pathway_matrix(gapmind_data, species_name)
    if matrix.shape[0] < 10:
        return None

    # Keep only variable pathways
    hetero = compute_pathway_heterogeneity(matrix)
    informative_pathways = hetero[hetero >= min_heterogeneity].index.tolist()
    if len(informative_pathways) < 5:
        print(f'  {species_name}: too few variable pathways ({len(informative_pathways)})')
        return None

    matrix_var = matrix[informative_pathways]

    # Jaccard distance
    try:
        dist_vec = pdist(matrix_var.values, metric='jaccard')
    except Exception as e:
        print(f'  {species_name}: distance computation failed — {e}')
        return None

    dist_sq = squareform(dist_vec)

    # Hierarchical clustering
    Z = linkage(dist_vec, method='average')

    # Select optimal k by silhouette score (skip if <20 genomes)
    best_k, best_sil, best_labels = 2, -1, None
    if matrix_var.shape[0] >= 20:
        for k in k_range:
            labels = fcluster(Z, t=k, criterion='maxclust')
            if len(np.unique(labels)) < 2:
                continue
            try:
                sil = silhouette_score(dist_sq, labels, metric='precomputed')
                if sil > best_sil:
                    best_sil, best_k, best_labels = sil, k, labels
            except Exception:
                pass
    if best_labels is None:
        best_labels = fcluster(Z, t=best_k, criterion='maxclust')

    return {
        'species': species_name,
        'matrix': matrix_var,
        'linkage': Z,
        'dist_sq': dist_sq,
        'cluster_labels': best_labels,
        'n_clusters': best_k,
        'silhouette': best_sil,
        'n_genomes': matrix_var.shape[0],
        'n_pathways': len(informative_pathways),
    }


print('Clustering functions defined.')

Clustering functions defined.

cluster_results = {}

for sp in target_species:
    print(f'\nClustering: {sp}')
    result = cluster_species(gapmind, sp)
    if result is None:
        print(f'  Skipped')
        continue
    cluster_results[sp] = result
    print(f'  Genomes: {result["n_genomes"]:,}, '
          f'variable pathways: {result["n_pathways"]}, '
          f'k={result["n_clusters"]}, '
          f'silhouette={result["silhouette"]:.3f}')

print(f'\nSuccessfully clustered {len(cluster_results)} species')

Clustering: s__Salmonella_enterica--RS_GCF_000006945.2
  Genomes: 11,396, variable pathways: 54, k=6, silhouette=0.894

Clustering: s__Stutzerimonas_stutzeri--RS_GCF_000219605.1
  Genomes: 149, variable pathways: 45, k=2, silhouette=0.780

Clustering: s__Pelagibacter_sp003209915--GB_GCA_003209915.1
  Genomes: 79, variable pathways: 25, k=2, silhouette=0.430

Clustering: s__Prochlorococcus_A_sp000635495--GB_GCA_000635495.1
  Genomes: 74, variable pathways: 23, k=2, silhouette=0.429

Clustering: s__Acetatifactor_intestinalis--RS_GCF_009695995.1
  Genomes: 59, variable pathways: 35, k=6, silhouette=0.657

Clustering: s__Alteromonas_macleodii--RS_GCF_000172635.2
  Genomes: 56, variable pathways: 50, k=2, silhouette=0.738

Clustering: s__Ruminococcus_E_sp900316555--GB_GCA_900316555.1
  Genomes: 53, variable pathways: 30, k=2, silhouette=0.609

Clustering: s__Phenylobacterium_sp020401865--GB_GCA_020401865.1
  Genomes: 51, variable pathways: 41, k=2, silhouette=0.544

Clustering: s__PALSA-747_sp903897155--GB_GCA_903897155.1
  Genomes: 51, variable pathways: 29, k=2, silhouette=0.349

Clustering: s__Limivicinus_sp900317375--GB_GCA_900317375.1
  Genomes: 50, variable pathways: 38, k=2, silhouette=0.528

Successfully clustered 10 species

CLUSTER_COLORS = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c']

for sp, res in cluster_results.items():
    matrix = res['matrix']
    labels = res['cluster_labels']

    # Cluster color row
    row_colors = pd.Series(
        [CLUSTER_COLORS[(l - 1) % len(CLUSTER_COLORS)] for l in labels],
        index=matrix.index,
        name='Cluster'
    )

    # Downsample if too many genomes for a readable heatmap
    if matrix.shape[0] > 500:
        # Sample proportionally from each cluster
        sample_idx = []
        for cl in np.unique(labels):
            cl_idx = matrix.index[labels == cl]
            n_samp = max(50, int(500 * (len(cl_idx) / matrix.shape[0])))
            sample_idx.extend(np.random.choice(cl_idx, min(n_samp, len(cl_idx)), replace=False))
        matrix_plot = matrix.loc[sample_idx]
        row_colors_plot = row_colors.loc[sample_idx]
        title_suffix = f' (n={len(sample_idx)} sampled)'
    else:
        matrix_plot = matrix
        row_colors_plot = row_colors
        title_suffix = f' (n={matrix.shape[0]})'

    # Short species label for filename
    sp_short = sp.replace('s__', '').split('--')[0].replace(' ', '_')[:40]

    try:
        g = sns.clustermap(
            matrix_plot,
            method='average',
            metric='jaccard',
            row_colors=row_colors_plot,
            cmap='Blues',
            xticklabels=True,
            yticklabels=False,
            figsize=(max(12, matrix_plot.shape[1] * 0.3), min(20, max(8, matrix_plot.shape[0] * 0.02 + 4))),
            cbar_kws={'label': 'Pathway Complete'},
        )
        g.fig.suptitle(
            f'Metabolic Profiles — {sp_short}{title_suffix}\n'
            f'k={res["n_clusters"]} clusters, silhouette={res["silhouette"]:.3f}',
            y=1.01, fontsize=11
        )
        g.ax_heatmap.set_xlabel('Pathway', fontsize=9)
        plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=90, fontsize=7)

        out_path = FIG_DIR / f'nb05_heatmap_{sp_short}.png'
        g.savefig(out_path, dpi=120, bbox_inches='tight')
        plt.close()
        print(f'Saved: figures/nb05_heatmap_{sp_short}.png')
    except Exception as e:
        print(f'  Heatmap failed for {sp_short}: {e}')

Saved: figures/nb05_heatmap_Salmonella_enterica.png
Saved: figures/nb05_heatmap_Stutzerimonas_stutzeri.png
Saved: figures/nb05_heatmap_Pelagibacter_sp003209915.png
Saved: figures/nb05_heatmap_Prochlorococcus_A_sp000635495.png
Saved: figures/nb05_heatmap_Acetatifactor_intestinalis.png
Saved: figures/nb05_heatmap_Alteromonas_macleodii.png
Saved: figures/nb05_heatmap_Ruminococcus_E_sp900316555.png
Saved: figures/nb05_heatmap_Phenylobacterium_sp020401865.png
Saved: figures/nb05_heatmap_PALSA-747_sp903897155.png
Saved: figures/nb05_heatmap_Limivicinus_sp900317375.png

fig, axes = plt.subplots(
    2, min(5, (len(cluster_results) + 1) // 2),
    figsize=(20, 8),
    squeeze=False
)
axes_flat = axes.flatten()

pca_results_all = []

for idx, (sp, res) in enumerate(cluster_results.items()):
    if idx >= len(axes_flat):
        break

    matrix = res['matrix']
    labels = res['cluster_labels']
    ax = axes_flat[idx]

    pca = PCA(n_components=2, random_state=42)
    coords = pca.fit_transform(matrix.values)

    for cl in np.unique(labels):
        mask = labels == cl
        color = CLUSTER_COLORS[(cl - 1) % len(CLUSTER_COLORS)]
        ax.scatter(coords[mask, 0], coords[mask, 1], c=color,
                   s=10, alpha=0.6, linewidths=0, label=f'Cluster {cl}')

    ax.legend(title='Cluster', fontsize=6, title_fontsize=6,
              markerscale=2, loc='best', framealpha=0.7)

    sp_short = sp.replace('s__', '').split('--')[0][:30]
    ax.set_title(f'{sp_short}\n(k={res["n_clusters"]}, sil={res["silhouette"]:.2f})', fontsize=8)
    ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})', fontsize=7)
    ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})', fontsize=7)
    ax.tick_params(labelsize=6)

    # Store PCA results for ecotype table
    for i, (genome, label) in enumerate(zip(matrix.index, labels)):
        pca_results_all.append({
            'species': sp,
            'genome_id': genome,
            'cluster_id': int(label),
            'pc1': coords[i, 0],
            'pc2': coords[i, 1],
        })

# Hide unused axes
for ax in axes_flat[len(cluster_results):]:
    ax.set_visible(False)

plt.suptitle('PCA of Metabolic Pathway Profiles by Species', fontsize=12, y=1.01)
plt.tight_layout()
plt.savefig(FIG_DIR / 'nb05_pca_all_species.png', dpi=150, bbox_inches='tight')
plt.close()
print('Saved: figures/nb05_pca_all_species.png')

Saved: figures/nb05_pca_all_species.png

from scipy.stats import chi2_contingency

ecotypes_df = pd.DataFrame(pca_results_all)

# Determine which isolation-source column is available
ISOLATION_COL = None
if env_metadata is not None and len(env_metadata) > 0:
    for candidate in ['ncbi_isolation_source', 'ncbi_isolate', 'ncbi_host']:
        if candidate in env_metadata.columns:
            ISOLATION_COL = candidate
            break

if env_metadata is not None and len(env_metadata) > 0 and ISOLATION_COL is not None:
    # Merge cluster assignments with environment data
    ecotypes_env = ecotypes_df.merge(
        env_metadata[['genome_id', ISOLATION_COL]].dropna(),
        on='genome_id',
        how='left'
    )

    print(f'Using isolation column: {ISOLATION_COL}')
    print('Cluster × isolation source associations:')
    for sp in ecotypes_env['species'].unique():
        sp_data = ecotypes_env[
            (ecotypes_env['species'] == sp) &
            (ecotypes_env[ISOLATION_COL].notna())
        ]
        if len(sp_data) < 20:
            continue

        # Harmonize isolation source into broad categories
        def harmonize_source(s):
            s = str(s).lower()
            if any(x in s for x in ['blood', 'wound', 'urine', 'sputum', 'lung', 'clinical', 'patient', 'hospital']):
                return 'clinical'
            elif any(x in s for x in ['soil', 'sediment', 'ground']):
                return 'soil/sediment'
            elif any(x in s for x in ['water', 'river', 'lake', 'ocean', 'marine', 'sea']):
                return 'water'
            elif any(x in s for x in ['food', 'chicken', 'beef', 'pork', 'dairy', 'milk']):
                return 'food/animal'
            elif any(x in s for x in ['plant', 'root', 'leaf', 'rhizosphere']):
                return 'plant'
            else:
                return 'other'

        sp_data = sp_data.copy()
        sp_data['source_category'] = sp_data[ISOLATION_COL].apply(harmonize_source)

        contingency = pd.crosstab(sp_data['cluster_id'], sp_data['source_category'])
        if contingency.shape[0] < 2 or contingency.shape[1] < 2:
            continue

        chi2, p, dof, _ = chi2_contingency(contingency)
        sp_short = sp.replace('s__', '').split('--')[0][:30]
        print(f'  {sp_short}: χ²={chi2:.1f}, df={dof}, p={p:.4f}')
        print(contingency.to_string())
        print()

    ecotypes_df = ecotypes_df.merge(
        ecotypes_env[['genome_id', ISOLATION_COL]].drop_duplicates(),
        on='genome_id', how='left'
    )
else:
    if env_metadata is None or len(env_metadata) == 0:
        print('No environment metadata available — skipping association test.')
    else:
        print(f'No usable isolation-source column found in env_metadata '
              f'(columns: {list(env_metadata.columns)}) — skipping association test.')
    ecotypes_df['ncbi_isolation_source'] = np.nan

Using isolation column: ncbi_isolation_source
Cluster × isolation source associations:
  Salmonella_enterica: χ²=1570.2, df=25, p=0.0000
source_category  clinical  food/animal  other  plant  soil/sediment  water
cluster_id                                                                
1                      50           89    269      0              5     34
2                       3            0     34      0              0      0
3                     197            1     44      0              0      0
4                     880         1253   6504      6            441    399
5                      22            9     79      0              1      4
6                      20          111    904      0             14     20

  Stutzerimonas_stutzeri: χ²=0.6, df=5, p=0.9877
source_category  clinical  food/animal  other  plant  soil/sediment  water
cluster_id                                                                
1                      29            1     92      1             13     12
2                       0            0      1      0              0      0

  Alteromonas_macleodii: χ²=0.0, df=1, p=1.0000
source_category  other  water
cluster_id                   
1                   26     29
2                    0      1

  Phenylobacterium_sp020401865: χ²=12.2, df=1, p=0.0005
source_category  other  water
cluster_id                   
1                    0     50
2                    1      0

from scipy.stats import chi2_contingency

signature_records = []

for sp, res in cluster_results.items():
    matrix = res['matrix']
    labels = res['cluster_labels']

    for pathway in matrix.columns:
        # Completion rate per cluster
        rates = {}
        for cl in np.unique(labels):
            cl_genomes = matrix.index[labels == cl]
            rates[cl] = matrix.loc[cl_genomes, pathway].mean()

        # Chi-square: is pathway completion associated with cluster?
        contingency = pd.crosstab(
            labels,
            matrix[pathway].values
        )
        if contingency.shape[0] >= 2 and contingency.shape[1] >= 2:
            try:
                chi2, p, _, _ = chi2_contingency(contingency)
            except Exception:
                chi2, p = np.nan, np.nan
        else:
            chi2, p = np.nan, np.nan

        max_rate = max(rates.values())
        min_rate = min(rates.values())

        signature_records.append({
            'species': sp,
            'pathway': pathway,
            'chi2': chi2,
            'p_value': p,
            'rate_range': max_rate - min_rate,
            'max_cluster_rate': max_rate,
            'min_cluster_rate': min_rate,
        })

sig_df = pd.DataFrame(signature_records).sort_values(['species', 'p_value'])

# Bonferroni correction within species
sig_df['p_bonf'] = sig_df.groupby('species')['p_value'].transform(
    lambda x: (x * len(x)).clip(upper=1.0)
)

print('Top pathway signatures per species (Bonferroni p < 0.05):')
for sp in sig_df['species'].unique():
    sp_sig = sig_df[(sig_df['species'] == sp) & (sig_df['p_bonf'] < 0.05)].head(10)
    sp_short = sp.replace('s__', '').split('--')[0][:40]
    print(f'\n  {sp_short} ({len(sp_sig)} significant pathways):')
    if len(sp_sig) > 0:
        print(sp_sig[['pathway', 'chi2', 'p_bonf', 'rate_range']].to_string(index=False))

Top pathway signatures per species (Bonferroni p < 0.05):

  Acetatifactor_intestinalis (10 significant pathways):
          pathway  chi2       p_bonf  rate_range
4-hydroxybenzoate  59.0 6.844908e-10         1.0
     deoxyinosine  59.0 6.844908e-10         1.0
      deoxyribose  59.0 6.844908e-10         1.0
          ethanol  59.0 6.844908e-10         1.0
              lys  59.0 6.844908e-10         1.0
              met  59.0 6.844908e-10         1.0
              thr  59.0 6.844908e-10         1.0
        thymidine  59.0 6.844908e-10         1.0
       tryptophan  59.0 6.844908e-10         1.0
       cellobiose  59.0 6.844908e-10         1.0

  Alteromonas_macleodii (1 significant pathways):
pathway      chi2   p_bonf  rate_range
sucrose 13.495537 0.011957         1.0

  Limivicinus_sp900317375 (0 significant pathways):

  PALSA-747_sp903897155 (1 significant pathways):
pathway      chi2   p_bonf  rate_range
    gly 17.962213 0.000653    0.979592

  Pelagibacter_sp003209915 (0 significant pathways):

  Phenylobacterium_sp020401865 (0 significant pathways):

  Prochlorococcus_A_sp000635495 (1 significant pathways):
pathway      chi2   p_bonf  rate_range
    met 12.344669 0.010172    0.944444

  Ruminococcus_E_sp900316555 (1 significant pathways):
   pathway      chi2   p_bonf  rate_range
chorismate 12.745285 0.010707         1.0

  Salmonella_enterica (10 significant pathways):
          pathway         chi2  p_bonf  rate_range
   2-oxoglutarate  9440.367340     0.0    0.972377
4-hydroxybenzoate 10985.934407     0.0    0.993780
        D-alanine 10005.555024     0.0    0.977227
        D-lactate 10004.274193     0.0    0.977333
         D-serine 10011.341278     0.0    0.977333
        L-lactate  9992.708336     0.0    0.977122
         L-malate  9988.228861     0.0    0.976911
              NAG  9845.720166     0.0    0.974275
          acetate  9801.001658     0.0    0.979424
          alanine 10005.555024     0.0    0.977227

  Stutzerimonas_stutzeri (3 significant pathways):
      pathway      chi2   p_bonf  rate_range
 deoxyinosine 17.999201 0.000994    0.993243
phenylalanine 17.999201 0.000994    0.993243
    thymidine 17.999201 0.000994    0.993243

# Cluster assignment table
ecotypes_df.to_csv(DATA_DIR / 'metabolic_ecotypes.csv', index=False)
print(f'Saved: {DATA_DIR}/metabolic_ecotypes.csv ({len(ecotypes_df):,} genomes)')

# Pathway signatures
sig_df.to_csv(DATA_DIR / 'pathway_cluster_signatures.csv', index=False)
print(f'Saved: {DATA_DIR}/pathway_cluster_signatures.csv')

# Cluster summary stats
cluster_summary = []
for sp, res in cluster_results.items():
    cluster_summary.append({
        'species': sp,
        'n_genomes': res['n_genomes'],
        'n_variable_pathways': res['n_pathways'],
        'n_clusters': res['n_clusters'],
        'silhouette_score': round(res['silhouette'], 4),
    })

summary_df = pd.DataFrame(cluster_summary).sort_values('silhouette_score', ascending=False)
print('\nCluster analysis summary:')
print(summary_df.to_string(index=False))

summary_df.to_csv(DATA_DIR / 'ecotype_cluster_summary.csv', index=False)

print('\n=== NB05 Complete ===')
print(f'Species analyzed: {len(cluster_results)}')
print(f'Total genomes clustered: {ecotypes_df["genome_id"].nunique():,}')
good_clusters = summary_df[summary_df['silhouette_score'] > 0.2]
print(f'Species with clear clustering (silhouette > 0.2): {len(good_clusters)}')

Saved: /home/cjneely/repos/BERIL-research-observatory/projects/metabolic_capability_dependency/data/metabolic_ecotypes.csv (12,018 genomes)
Saved: /home/cjneely/repos/BERIL-research-observatory/projects/metabolic_capability_dependency/data/pathway_cluster_signatures.csv

Cluster analysis summary:
                                             species  n_genomes  n_variable_pathways  n_clusters  silhouette_score
          s__Salmonella_enterica--RS_GCF_000006945.2      11396                   54           6            0.8937
       s__Stutzerimonas_stutzeri--RS_GCF_000219605.1        149                   45           2            0.7797
        s__Alteromonas_macleodii--RS_GCF_000172635.2         56                   50           2            0.7379
   s__Acetatifactor_intestinalis--RS_GCF_009695995.1         59                   35           6            0.6568
   s__Ruminococcus_E_sp900316555--GB_GCA_900316555.1         53                   30           2            0.6089
 s__Phenylobacterium_sp020401865--GB_GCA_020401865.1         51                   41           2            0.5439
      s__Limivicinus_sp900317375--GB_GCA_900317375.1         50                   38           2            0.5275
     s__Pelagibacter_sp003209915--GB_GCA_003209915.1         79                   25           2            0.4295
s__Prochlorococcus_A_sp000635495--GB_GCA_000635495.1         74                   23           2            0.4288
        s__PALSA-747_sp903897155--GB_GCA_903897155.1         51                   29           2            0.3495

=== NB05 Complete ===
Species analyzed: 10
Total genomes clustered: 12,018
Species with clear clustering (silhouette > 0.2): 10

05 Metabolic Ecotypes

NB05: Metabolic Ecotypes Within Species¶

1. Load GapMind Data¶

2. Select Target Species¶

3. (Optional) Query Environment Metadata via Spark¶

4. Pathway Heterogeneity Overview¶

5. Hierarchical Clustering of Metabolic Profiles¶

6. Clustered Heatmap Figures¶

7. PCA of Pathway Profiles¶

8. Environment–Cluster Association¶

9. Pathway Signatures per Cluster¶

10. Save Results¶

Completion¶