spark = get_spark_session()

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

PROJECT_DIR = os.path.expanduser('~/BERIL-research-observatory/projects/phb_granule_ecology')
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
FIG_DIR = os.path.join(PROJECT_DIR, 'figures')

# Load all previous results
gene_clusters = pd.read_csv(os.path.join(DATA_DIR, 'phb_gene_clusters.tsv'), sep='\t')
tax_phb = pd.read_csv(os.path.join(DATA_DIR, 'phb_by_taxonomy.tsv'), sep='\t')
order_stats = pd.read_csv(os.path.join(DATA_DIR, 'phb_by_order.tsv'), sep='\t')
species_env = pd.read_csv(os.path.join(DATA_DIR, 'species_environment.tsv'), sep='\t')
nmdc_phb = pd.read_csv(os.path.join(DATA_DIR, 'nmdc_phb_prevalence.tsv'), sep='\t')

print(f'Gene clusters: {len(gene_clusters):,}')
print(f'Species with taxonomy: {len(tax_phb):,}')
print(f'Orders: {len(order_stats):,}')
print(f'Species with environment: {len(species_env):,}')
print(f'NMDC samples: {len(nmdc_phb):,}')

Gene clusters: 118,513
Species with taxonomy: 27,690
Orders: 1,058
Species with environment: 27,690
NMDC samples: 6,365

# Family-level PHB enrichment
family_stats = tax_phb.groupby(['gtdb_phylum', 'gtdb_class', 'gtdb_order', 'gtdb_family']).agg(
    n_species=('gtdb_species_clade_id', 'count'),
    n_phaC=('has_phaC', 'sum'),
    pct_phaC=('has_phaC', lambda x: x.mean() * 100),
    n_complete=('has_complete_pathway', 'sum'),
    pct_complete=('has_complete_pathway', lambda x: x.mean() * 100),
).round(1).reset_index()

print(f'Total families: {len(family_stats):,}')
print(f'Families with >=10 species: {(family_stats["n_species"] >= 10).sum()}')

# Identify phyla with variable PHB prevalence (mixed clades)
phylum_var = family_stats[family_stats['n_species'] >= 10].groupby('gtdb_phylum').agg(
    n_families=('gtdb_family', 'count'),
    mean_pct_phaC=('pct_phaC', 'mean'),
    std_pct_phaC=('pct_phaC', 'std'),
    min_pct_phaC=('pct_phaC', 'min'),
    max_pct_phaC=('pct_phaC', 'max'),
).round(1)

phylum_var['range_pct_phaC'] = phylum_var['max_pct_phaC'] - phylum_var['min_pct_phaC']

print('\nPhyla with high within-phylum variability in PHB prevalence (range > 30%, >=3 families):')
variable_phyla = phylum_var[(phylum_var['range_pct_phaC'] > 30) & (phylum_var['n_families'] >= 3)]
variable_phyla.sort_values('range_pct_phaC', ascending=False)

Total families: 2,464
Families with >=10 species: 404

Phyla with high within-phylum variability in PHB prevalence (range > 30%, >=3 families):

# Within each variable phylum, identify enriched and depleted families
# Use Fisher's exact test: family PHB rate vs rest-of-phylum rate

enrichment_results = []
for phylum in variable_phyla.index:
    phylum_data = family_stats[(family_stats['gtdb_phylum'] == phylum) & 
                                (family_stats['n_species'] >= 10)]
    
    total_in_phylum = phylum_data['n_species'].sum()
    total_phaC_in_phylum = phylum_data['n_phaC'].sum()
    
    for _, fam in phylum_data.iterrows():
        # Fisher's exact test: family vs rest of phylum
        a = int(fam['n_phaC'])  # phaC+ in family
        b = int(fam['n_species'] - fam['n_phaC'])  # phaC- in family
        c = int(total_phaC_in_phylum - fam['n_phaC'])  # phaC+ outside family
        d = int((total_in_phylum - fam['n_species']) - (total_phaC_in_phylum - fam['n_phaC']))  # phaC- outside
        
        if d < 0:
            d = 0  # Edge case protection
        
        odds_ratio, p_val = stats.fisher_exact([[a, b], [c, d]])
        
        enrichment_results.append({
            'gtdb_phylum': phylum,
            'gtdb_family': fam['gtdb_family'],
            'n_species': int(fam['n_species']),
            'pct_phaC': fam['pct_phaC'],
            'phylum_pct_phaC': total_phaC_in_phylum / total_in_phylum * 100,
            'odds_ratio': odds_ratio,
            'p_value': p_val,
            'enrichment': 'enriched' if odds_ratio > 1 and p_val < 0.05 else 
                          ('depleted' if odds_ratio < 1 and p_val < 0.05 else 'ns'),
        })

enrichment_df = pd.DataFrame(enrichment_results)

# Apply Bonferroni correction
enrichment_df['p_corrected'] = enrichment_df['p_value'] * len(enrichment_df)
enrichment_df['p_corrected'] = enrichment_df['p_corrected'].clip(upper=1.0)
enrichment_df['enrichment_corrected'] = 'ns'
enrichment_df.loc[(enrichment_df['odds_ratio'] > 1) & (enrichment_df['p_corrected'] < 0.05), 
                   'enrichment_corrected'] = 'enriched'
enrichment_df.loc[(enrichment_df['odds_ratio'] < 1) & (enrichment_df['p_corrected'] < 0.05), 
                   'enrichment_corrected'] = 'depleted'

print('Subclade enrichment summary (Bonferroni-corrected):')
print(enrichment_df['enrichment_corrected'].value_counts())
print('\nEnriched families:')
enrichment_df[enrichment_df['enrichment_corrected'] == 'enriched'].sort_values('odds_ratio', ascending=False)

Subclade enrichment summary (Bonferroni-corrected):
enrichment_corrected
ns          145
depleted     62
enriched     41
Name: count, dtype: int64

Enriched families:

# Depleted families
print('Depleted families:')
enrichment_df[enrichment_df['enrichment_corrected'] == 'depleted'].sort_values('odds_ratio')

Depleted families:

# Do enriched/depleted families differ in environment?
# Merge enrichment status with environment data

# Get family-level primary environment from species_env
env_merged = species_env.merge(
    tax_phb[['gtdb_species_clade_id', 'gtdb_phylum', 'gtdb_family']], 
    on='gtdb_species_clade_id', how='inner'
)

family_env = env_merged.groupby('gtdb_family').agg(
    primary_env=('primary_env', lambda x: x.value_counts().index[0]),
    primary_variability=('primary_variability', lambda x: x.value_counts().index[0]),
    pct_high_var=('pct_high_var', 'mean'),
    n_env_categories=('n_env_categories', 'mean'),
).reset_index()

enrichment_env = enrichment_df.merge(family_env, on='gtdb_family', how='left')

print('Environment distribution of enriched vs depleted families:')
for status in ['enriched', 'depleted', 'ns']:
    subset = enrichment_env[enrichment_env['enrichment_corrected'] == status]
    if len(subset) > 0:
        print(f'\n{status.upper()} (n={len(subset)}):')
        if 'primary_env' in subset.columns:
            print(subset['primary_env'].value_counts().to_string())

Environment distribution of enriched vs depleted families:

ENRICHED (n=41):
primary_env
other_unknown            30
freshwater                5
wastewater_engineered     3
marine                    2
plant_associated          1

DEPLETED (n=62):
primary_env
other_unknown        26
marine               18
freshwater            6
animal_associated     5
human_clinical        5
human_associated      2

NS (n=145):
primary_env
other_unknown            63
freshwater               24
marine                   22
animal_associated        12
wastewater_engineered     8
soil                      5
human_clinical            5
human_associated          3
sediment                  2
plant_associated          1

# Classify phaC gene clusters by PHA synthase class
# Extract PFam domains from gene_clusters data
phac_clusters = gene_clusters[gene_clusters['KEGG_ko'].str.contains('K03821', na=False)].copy()
print(f'phaC gene clusters: {len(phac_clusters):,}')

# Check PFam column
print(f'\nPFam coverage: {phac_clusters["PFAMs"].notna().sum():,} / {len(phac_clusters):,}')
print('\nTop PFam annotations for phaC:')
pfam_counts = phac_clusters['PFAMs'].value_counts().head(20)
print(pfam_counts)

phaC gene clusters: 11,792

PFam coverage: 11,792 / 11,792

Top PFam annotations for phaC:
PFAMs
Abhydrolase_1,PhaC_N                                                  3932
PhaC_N                                                                2890
Abhydrolase_1                                                         1850
PHBC_N,PhaC_N                                                         1742
AMP-binding,Abhydrolase_1,Abhydrolase_6,PHB_depo_C                     392
Abhydrolase_1,Abhydrolase_6                                            258
Abhydrolase_6                                                          139
Abhydrolase_1,Abhydrolase_6,PHB_depo_C                                 108
Abhydrolase_1,HHH_5                                                     86
Abhydrolase_1,PHBC_N,PhaC_N                                             75
Abhydrolase_3,PAE                                                       72
AMP-binding,Abhydrolase_1                                               43
Abhydrolase_3,DUF1460,Peptidase_S9                                      37
Abhydrolase_1,PHB_depo_C                                                36
Abhydrolase_1,DUF2048                                                   29
AMP-binding,Abhydrolase_1,PHB_depo_C                                    29
-                                                                       27
Complex1_51K,DUF4157,DUF930,FeS,Fer4_10,Fer4_21,Fer4_8,RnfC_N,SLBB      19
Hydrolase_4                                                             13
Abhydrolase_1,DUF3141                                                    7
Name: count, dtype: int64

# Classify PHA synthase class based on PFam domains
def classify_pha_class(pfams_str):
    if pd.isna(pfams_str):
        return 'unknown'
    pfams = str(pfams_str).upper()
    has_class_I_II = 'PF00561' in pfams  # Alpha/beta hydrolase fold
    has_class_III_IV = 'PF07167' in pfams  # PhaC_N domain
    
    if has_class_I_II and has_class_III_IV:
        return 'mixed_domains'
    elif has_class_I_II:
        return 'class_I_II'  # Single-subunit (Cupriavidus/Pseudomonas type)
    elif has_class_III_IV:
        return 'class_III_IV'  # Two-subunit (Allochromatium/Bacillus type)
    else:
        return 'other_pfam'

phac_clusters['pha_class'] = phac_clusters['PFAMs'].apply(classify_pha_class)
print('PHA synthase class distribution:')
print(phac_clusters['pha_class'].value_counts())

PHA synthase class distribution:
pha_class
other_pfam    11792
Name: count, dtype: int64

# PHA class distribution across phyla
phac_with_tax = phac_clusters.merge(
    tax_phb[['gtdb_species_clade_id', 'gtdb_phylum', 'gtdb_class', 'gtdb_order']],
    on='gtdb_species_clade_id', how='left'
)

# Aggregate to species level (take most common class if multiple phaC clusters)
species_pha_class = phac_with_tax.groupby(['gtdb_species_clade_id', 'gtdb_phylum']).agg(
    pha_class=('pha_class', lambda x: x.value_counts().index[0]),
    n_phac_clusters=('gene_cluster_id', 'count'),
).reset_index()

# Cross-tab: phylum × PHA class
class_by_phylum = pd.crosstab(species_pha_class['gtdb_phylum'], species_pha_class['pha_class'])
# Filter to phyla with >= 10 phaC+ species
class_by_phylum = class_by_phylum[class_by_phylum.sum(axis=1) >= 10]
class_by_phylum_pct = class_by_phylum.div(class_by_phylum.sum(axis=1), axis=0) * 100

print('PHA synthase class by phylum (% of phaC+ species):')
class_by_phylum_pct.round(1)

PHA synthase class by phylum (% of phaC+ species):

# PHA class by environment type
phac_env = species_pha_class.merge(
    species_env[['gtdb_species_clade_id', 'primary_env', 'primary_variability']],
    on='gtdb_species_clade_id', how='inner'
)

class_by_env = pd.crosstab(phac_env['primary_env'], phac_env['pha_class'])
class_by_env = class_by_env[class_by_env.sum(axis=1) >= 10]
class_by_env_pct = class_by_env.div(class_by_env.sum(axis=1), axis=0) * 100

print('PHA synthase class by environment type (% of phaC+ species):')
class_by_env_pct.round(1)

PHA synthase class by environment type (% of phaC+ species):

# Identify discordant species at the family level
# PHB+ species in families with <20% phaC prevalence (potential HGT acquisition)
# PHB- species in families with >80% phaC prevalence (potential HGT loss)

fam_phaC = tax_phb.groupby('gtdb_family').agg(
    n_species=('gtdb_species_clade_id', 'count'),
    pct_phaC=('has_phaC', lambda x: x.mean() * 100),
).reset_index()

species_with_fam = tax_phb.merge(fam_phaC[['gtdb_family', 'pct_phaC']].rename(
    columns={'pct_phaC': 'family_pct_phaC'}), on='gtdb_family', how='left')

# PHB+ in PHB-poor families
hgt_candidates_gain = species_with_fam[
    (species_with_fam['has_phaC'] == 1) & 
    (species_with_fam['family_pct_phaC'] < 20)
]

# PHB- in PHB-rich families  
hgt_candidates_loss = species_with_fam[
    (species_with_fam['has_phaC'] == 0) & 
    (species_with_fam['family_pct_phaC'] > 80)
]

print(f'Potential HGT acquisition (phaC+ in <20% family): {len(hgt_candidates_gain):,} species')
print(f'Potential HGT loss (phaC- in >80% family): {len(hgt_candidates_loss):,} species')

Potential HGT acquisition (phaC+ in <20% family): 311 species
Potential HGT loss (phaC- in >80% family): 278 species

# Examine HGT acquisition candidates — which families?
if len(hgt_candidates_gain) > 0:
    print('Top families with potential HGT-acquired phaC:')
    hgt_fam = hgt_candidates_gain.groupby(['gtdb_phylum', 'gtdb_family']).agg(
        n_phaC_discordant=('gtdb_species_clade_id', 'count'),
        family_pct_phaC=('family_pct_phaC', 'first'),
    ).sort_values('n_phaC_discordant', ascending=False)
    print(hgt_fam.head(20))

# phaC core vs accessory in discordant species
if len(hgt_candidates_gain) > 0:
    print('\nphaC core vs accessory in potential HGT acquisitions:')
    print(hgt_candidates_gain['phaC_is_core'].value_counts())
    print(hgt_candidates_gain['phaC_is_aux'].value_counts())
    n_aux = hgt_candidates_gain['phaC_is_aux'].sum()
    n_gain = len(hgt_candidates_gain)
    pct_aux = n_aux / n_gain * 100 if n_gain > 0 else 0
    print(f'\n% accessory phaC in discordant species: {pct_aux:.1f}%')
    
    # Compare to overall phaC core/aux rate
    all_phac = tax_phb[tax_phb['has_phaC'] == 1]
    n_aux_all = all_phac['phaC_is_aux'].sum()
    pct_aux_all = n_aux_all / len(all_phac) * 100 if len(all_phac) > 0 else 0
    print(f'% accessory phaC overall: {pct_aux_all:.1f}%')
    print('(Higher accessory rate in discordant species supports HGT hypothesis)')

Top families with potential HGT-acquired phaC:
                                     n_phaC_discordant  family_pct_phaC
gtdb_phylum     gtdb_family                                            
Bacillota_A     Lachnospiraceae                     38         3.127572
Bacteroidota    Chitinophagaceae                    22        12.429379
Pseudomonadota  Pelagibacteraceae                   13         5.263158
                Enterobacteriaceae                  11         2.330508
Bacillota       Planococcaceae                      10        12.820513
                Erysipelotrichaceae                  8         6.451613
Actinomycetota  Actinomycetaceae                     8         6.956522
                RAAP-2                               8        11.111111
Pseudomonadota  Halieaceae                           8        19.047619
                Shewanellaceae                       7        10.606061
Cyanobacteriota Nostocaceae                          7        10.769231
Pseudomonadota  Neisseriaceae                        7         8.860759
Actinomycetota  Streptomycetaceae                    7         1.794872
Pseudomonadota  Gallionellaceae                      5        17.241379
Acidobacteriota Acidobacteriaceae                    4         8.695652
Bacillota_A     Oscillospiraceae                     4         0.950119
                Ruminococcaceae                      4         1.052632
Actinomycetota  TK06                                 4        18.181818
Pseudomonadota  Thiobacillaceae                      4        12.500000
                Methylophilaceae                     4         8.163265

phaC core vs accessory in potential HGT acquisitions:
phaC_is_core
False    176
True     135
Name: count, dtype: int64
phaC_is_aux
True     187
False    124
Name: count, dtype: int64

% accessory phaC in discordant species: 60.1%
% accessory phaC overall: 32.3%
(Higher accessory rate in discordant species supports HGT hypothesis)

# Load NMDC taxonomy features and map taxon IDs to genus names
# taxonomy_features columns are numeric taxon IDs, not genus names
tax_features = spark.sql("""
    SELECT * FROM nmdc_arkin.taxonomy_features
""").toPandas()

tax_dim = spark.sql("""
    SELECT * FROM nmdc_arkin.taxonomy_dim
""").toPandas()

print(f'NMDC taxonomy features: {tax_features.shape[0]} samples x {tax_features.shape[1]} columns')
print(f'taxonomy_dim: {tax_dim.shape[0]:,} entries')

taxon_cols = [c for c in tax_features.columns if c != 'sample_id']

# --- Two-tier NCBI taxid → GTDB genus mapping (same as NB04) ---

# Tier 1: gtdb_metadata bridge (NCBI taxid → GTDB genus)
print('\n=== Tier 1: Mapping via gtdb_metadata ===')
ncbi_to_gtdb = spark.sql("""
    SELECT DISTINCT
        CAST(m.ncbi_species_taxid AS INT) as ncbi_taxid,
        REGEXP_EXTRACT(m.gtdb_taxonomy, 'g__([^;]+)', 1) AS gtdb_genus
    FROM kbase_ke_pangenome.gtdb_metadata m
    WHERE m.gtdb_taxonomy IS NOT NULL
      AND REGEXP_EXTRACT(m.gtdb_taxonomy, 'g__([^;]+)', 1) != ''
    UNION
    SELECT DISTINCT
        CAST(m.ncbi_taxid AS INT) as ncbi_taxid,
        REGEXP_EXTRACT(m.gtdb_taxonomy, 'g__([^;]+)', 1) AS gtdb_genus
    FROM kbase_ke_pangenome.gtdb_metadata m
    WHERE m.gtdb_taxonomy IS NOT NULL
      AND REGEXP_EXTRACT(m.gtdb_taxonomy, 'g__([^;]+)', 1) != ''
""").toPandas()

ncbi_genus_counts = ncbi_to_gtdb.groupby('ncbi_taxid')['gtdb_genus'].agg(
    lambda x: x.value_counts().index[0]
).to_dict()

taxid_to_genus = {}
tier1_hits = 0
for col_id in taxon_cols:
    try:
        tid = int(col_id)
    except (ValueError, TypeError):
        continue
    if tid in ncbi_genus_counts:
        taxid_to_genus[col_id] = ncbi_genus_counts[tid]
        tier1_hits += 1

print(f'Tier 1 matches: {tier1_hits}/{len(taxon_cols)}')

# Tier 2: Fallback via taxonomy_dim genus names
print('\n=== Tier 2: Fallback via taxonomy_dim ===')
gtdb_genus_set = set(
    tax_phb['gtdb_genus'].dropna().str.replace('g__', '', regex=False).str.strip().str.lower()
)

tier2_hits = 0
for col_id in taxon_cols:
    if col_id in taxid_to_genus:
        continue
    try:
        tid = int(col_id)
    except (ValueError, TypeError):
        continue
    matches = tax_dim[tax_dim['taxid'] == tid]
    if len(matches) > 0:
        genus = str(matches.iloc[0]['genus']).strip()
        if genus and genus.lower() != 'unclassified' and genus.lower() != 'nan':
            if genus.lower() in gtdb_genus_set:
                taxid_to_genus[col_id] = genus
                tier2_hits += 1

print(f'Tier 2 matches: {tier2_hits} additional')
print(f'Total mapped: {len(taxid_to_genus)}/{len(taxon_cols)}')

# Get genus-level PHB prevalence from pangenome
genus_phb_nb5 = tax_phb.groupby('gtdb_genus').agg(
    n_species=('gtdb_species_clade_id', 'count'),
    pct_phaC=('has_phaC', lambda x: x.mean()),
).reset_index()
genus_phb_nb5['genus_clean'] = genus_phb_nb5['gtdb_genus'].str.replace('g__', '', regex=False).str.strip()

# Compute genus-level mean abundance across NMDC samples
genus_mean_abundance = {}
for col_id in taxon_cols:
    genus = taxid_to_genus.get(col_id, None)
    if genus:
        vals = pd.to_numeric(tax_features[col_id], errors='coerce')
        mean_val = vals.mean()
        if mean_val > 0:
            genus_lower = genus.lower()
            genus_mean_abundance[genus_lower] = genus_mean_abundance.get(genus_lower, 0) + mean_val

print(f'Genera with non-zero mean abundance in NMDC: {len(genus_mean_abundance):,}')

NMDC taxonomy features: 6365 samples x 3493 columns
taxonomy_dim: 2,594,787 entries

=== Tier 1: Mapping via gtdb_metadata ===

Tier 1 matches: 2336/3492

=== Tier 2: Fallback via taxonomy_dim ===

Tier 2 matches: 678 additional
Total mapped: 3014/3492

Genera with non-zero mean abundance in NMDC: 693

# Match NMDC genera to pangenome genera and compare PHB prevalence vs abundance
genus_phb_lower = dict(zip(genus_phb_nb5['genus_clean'].str.lower(), genus_phb_nb5['pct_phaC']))
genus_n_species = dict(zip(genus_phb_nb5['genus_clean'].str.lower(), genus_phb_nb5['n_species']))

cross_val = []
for genus, mean_abund in genus_mean_abundance.items():
    if genus in genus_phb_lower:
        cross_val.append({
            'genus': genus,
            'pangenome_pct_phaC': genus_phb_lower[genus],
            'pangenome_n_species': genus_n_species.get(genus, 0),
            'nmdc_mean_abundance': mean_abund,
        })

cross_val_df = pd.DataFrame(cross_val)
print(f'Genera matched between pangenome and NMDC: {len(cross_val_df)}')

if len(cross_val_df) > 10:
    # Classify genera as PHB-high (>50% phaC) or PHB-low (<50%)
    cross_val_df['phb_group'] = np.where(cross_val_df['pangenome_pct_phaC'] >= 0.5, 
                                          'PHB-high (>=50%)', 'PHB-low (<50%)')
    
    # Are PHB-high genera more abundant in NMDC metagenomes?
    high = cross_val_df[cross_val_df['phb_group'] == 'PHB-high (>=50%)']['nmdc_mean_abundance']
    low = cross_val_df[cross_val_df['phb_group'] == 'PHB-low (<50%)']['nmdc_mean_abundance']
    
    u_stat, p_val = stats.mannwhitneyu(high, low, alternative='two-sided')
    print(f'\nNMDC abundance: PHB-high genera vs PHB-low genera')
    print(f'  PHB-high: median={high.median():.6f}, n={len(high)}')
    print(f'  PHB-low:  median={low.median():.6f}, n={len(low)}')
    print(f'  Mann-Whitney U: p={p_val:.2e}')
    
    # Show top PHB-high genera by NMDC abundance
    print(f'\nTop 15 PHB-high genera by NMDC abundance:')
    top_high = cross_val_df[cross_val_df['phb_group'] == 'PHB-high (>=50%)'].nlargest(15, 'nmdc_mean_abundance')
    print(top_high[['genus', 'pangenome_pct_phaC', 'pangenome_n_species', 'nmdc_mean_abundance']].to_string(index=False))
else:
    print('Insufficient matched genera for cross-validation.')

Genera matched between pangenome and NMDC: 693

NMDC abundance: PHB-high genera vs PHB-low genera
  PHB-high: median=0.000000, n=205
  PHB-low:  median=0.000000, n=488
  Mann-Whitney U: p=8.41e-22

Top 15 PHB-high genera by NMDC abundance:
           genus  pangenome_pct_phaC  pangenome_n_species  nmdc_mean_abundance
   mycobacterium            1.000000                  186         1.790032e-15
   pseudomonas_e            0.917085                  398         9.829268e-16
     cupriavidus            1.000000                   28         7.189152e-16
    burkholderia            1.000000                   50         6.430049e-16
methylobacterium            1.000000                   49         5.838395e-16
      bordetella            1.000000                    8         4.643924e-16
    azospirillum            1.000000                   17         4.130413e-16
   sinorhizobium            1.000000                   13         3.951801e-16
       rhizobium            1.000000                   88         3.516433e-16
   amycolatopsis            0.862069                   29         3.393637e-16
   mesorhizobium            1.000000                   72         3.181535e-16
        gordonia            1.000000                   27         3.125718e-16
          afipia            1.000000                    9         3.125718e-16
      myxococcus            1.000000                   12         2.902453e-16
    sphingomonas            1.000000                   73         2.768493e-16

# Figure 1: PHB enrichment heatmap — order-level within top phyla
# Select phyla with >=5 orders that have >=10 species each

order_data = tax_phb.groupby(['gtdb_phylum', 'gtdb_order']).agg(
    n_species=('gtdb_species_clade_id', 'count'),
    pct_phaC=('has_phaC', lambda x: x.mean() * 100),
).reset_index()

order_data = order_data[order_data['n_species'] >= 10]

# Phyla with enough orders
phyla_with_orders = order_data.groupby('gtdb_phylum').filter(lambda x: len(x) >= 4)
top_phyla = phyla_with_orders.groupby('gtdb_phylum')['n_species'].sum().nlargest(6).index

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for i, phylum in enumerate(top_phyla):
    ax = axes[i]
    pdata = order_data[order_data['gtdb_phylum'] == phylum].sort_values('pct_phaC', ascending=True)
    
    # Clean order names
    labels = [o.replace('o__', '') for o in pdata['gtdb_order']]
    colors = ['#F44336' if p > 50 else '#FF9800' if p > 20 else '#2196F3' 
              for p in pdata['pct_phaC']]
    
    ax.barh(range(len(pdata)), pdata['pct_phaC'], color=colors, alpha=0.8)
    ax.set_yticks(range(len(pdata)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_xlabel('% phaC+')
    ax.set_title(phylum.replace('p__', ''), fontsize=10)
    ax.set_xlim(0, 100)

plt.suptitle('PHB Prevalence by Order Within Major Phyla', fontsize=13, y=1.01)
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'phb_enrichment_heatmap.png'), dpi=150, bbox_inches='tight')
plt.show()

# Figure 2: Cross-validation — pangenome PHB prevalence vs NMDC abundance
if len(cross_val_df) > 10:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    
    # Scatter: pangenome phaC % vs NMDC mean abundance
    ax = axes[0]
    ax.scatter(cross_val_df['pangenome_pct_phaC'] * 100, 
               np.log10(cross_val_df['nmdc_mean_abundance'] + 1e-8),
               alpha=0.3, s=15, color='#2196F3')
    ax.set_xlabel('Pangenome phaC prevalence (% of species in genus)')
    ax.set_ylabel('log10(NMDC mean abundance)')
    ax.set_title('Pangenome PHB Prevalence vs\nNMDC Metagenomic Abundance')
    
    # Box: NMDC abundance of PHB-high vs PHB-low genera
    ax = axes[1]
    sns.boxplot(data=cross_val_df, x='phb_group', 
                y=np.log10(cross_val_df['nmdc_mean_abundance'] + 1e-8), 
                ax=ax, palette={'PHB-high (>=50%)': '#4CAF50', 'PHB-low (<50%)': '#9E9E9E'})
    ax.set_xlabel('Pangenome PHB group')
    ax.set_ylabel('log10(NMDC mean abundance)')
    ax.set_title(f'NMDC Abundance by PHB Group\n(Mann-Whitney p={p_val:.2e})')
    
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, 'pangenome_vs_metagenome.png'), dpi=150, bbox_inches='tight')
    plt.show()
else:
    print('Insufficient matched genera for cross-validation plot.')

/tmp/ipykernel_17588/1154049313.py:16: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=cross_val_df, x='phb_group',

# Save all results
enrichment_df.to_csv(os.path.join(DATA_DIR, 'subclade_enrichment.tsv'), sep='\t', index=False)

phac_class_data = species_pha_class.merge(
    species_env[['gtdb_species_clade_id', 'primary_env', 'primary_variability']],
    on='gtdb_species_clade_id', how='left'
)
phac_class_data.to_csv(os.path.join(DATA_DIR, 'phaC_class_distribution.tsv'), sep='\t', index=False)

if len(cross_val_df) > 0:
    cross_val_df.to_csv(os.path.join(DATA_DIR, 'pangenome_vs_metagenome.tsv'), sep='\t', index=False)

print('Saved:')
print(f'  subclade_enrichment.tsv: {len(enrichment_df)} families')
print(f'  phaC_class_distribution.tsv: {len(phac_class_data)} species')
print(f'  pangenome_vs_metagenome.tsv: {len(cross_val_df)} genera')

Saved:
  subclade_enrichment.tsv: 248 families
  phaC_class_distribution.tsv: 6067 species
  pangenome_vs_metagenome.tsv: 693 genera

	n_families	mean_pct_phaC	std_pct_phaC	min_pct_phaC	max_pct_phaC	range_pct_phaC
gtdb_phylum
Bacillota	37	9.9	22.8	0.0	100.0	100.0
Pseudomonadota	94	47.9	41.5	0.0	100.0	100.0
Chloroflexota	13	17.2	31.8	0.0	100.0	100.0
Cyanobacteriota	9	18.5	31.4	0.0	100.0	100.0
Thermoproteota	4	47.3	54.8	0.0	100.0	100.0
Halobacteriota	10	28.5	37.5	0.0	90.0	90.0
Desulfobacterota	6	24.6	27.9	0.0	75.0	75.0
Actinomycetota	35	14.6	22.6	0.0	74.3	74.3
Bacteroidota	40	2.6	9.0	0.0	46.2	46.2

	gtdb_phylum	gtdb_family	n_species	pct_phaC	phylum_pct_phaC	odds_ratio	p_value	enrichment	p_corrected	enrichment_corrected
41	Bacillota	Bacillaceae_G	52	100.0	7.416625	inf	1.354664e-63	enriched	3.359567e-61	enriched
152	Pseudomonadota	Caulobacteraceae	83	100.0	62.000596	inf	7.150114e-18	enriched	1.773228e-15	enriched
153	Pseudomonadota	Hyphomonadaceae	25	100.0	62.000596	inf	9.293479e-06	enriched	2.304783e-03	enriched
171	Pseudomonadota	Stappiaceae	23	100.0	62.000596	inf	3.131122e-05	enriched	7.765184e-03	enriched
169	Pseudomonadota	Hyphomicrobiaceae	24	100.0	62.000596	inf	1.687862e-05	enriched	4.185899e-03	enriched
172	Pseudomonadota	Xanthobacteraceae	150	100.0	62.000596	inf	3.773477e-32	enriched	9.358223e-30	enriched
151	Pseudomonadota	Azospirillaceae	25	100.0	62.000596	inf	9.293479e-06	enriched	2.304783e-03	enriched
127	Cyanobacteriota	Microcystaceae	14	100.0	7.552083	inf	5.669549e-18	enriched	1.406048e-15	enriched
117	Chloroflexota	Chloroflexaceae	10	100.0	11.637931	inf	8.249858e-11	enriched	2.045965e-08	enriched
192	Pseudomonadota	Chromobacteriaceae	28	100.0	62.000596	inf	1.746215e-06	enriched	4.330612e-04	enriched
203	Pseudomonadota	Competibacteraceae	19	100.0	62.000596	inf	1.594610e-04	enriched	3.954633e-02	enriched
198	Pseudomonadota	Rhodocyclaceae	147	99.3	62.000596	92.700723	2.150923e-29	enriched	5.334289e-27	enriched
167	Pseudomonadota	Beijerinckiaceae	128	99.2	62.000596	80.256944	1.950457e-25	enriched	4.837132e-23	enriched
189	Pseudomonadota	Burkholderiaceae_B	360	98.9	62.000596	59.559558	9.225162e-70	enriched	2.287840e-67	enriched
42	Bacillota	DSM-1321	14	78.6	7.416625	49.340580	7.867773e-11	enriched	1.951208e-08	enriched
104	Bacteroidota	UKL13-3	13	46.2	1.927261	48.183673	6.195599e-08	enriched	1.536509e-05	enriched
91	Bacteroidota	Saprospiraceae	75	32.0	1.927261	38.439628	6.206275e-25	enriched	1.539156e-22	enriched
182	Pseudomonadota	Sphingomonadaceae	274	98.2	62.000596	35.184370	9.757786e-50	enriched	2.419931e-47	enriched
139	Desulfobacterota	Smithellaceae	16	75.0	15.789474	31.000000	1.734866e-08	enriched	4.302468e-06	enriched
21	Actinomycetota	Mycobacteriaceae	481	72.6	18.832711	29.006325	1.239127e-190	enriched	3.073034e-188	enriched
141	Halobacteriota	Haloarculaceae	30	90.0	34.123223	27.200000	9.214221e-12	enriched	2.285127e-09	enriched
245	Thermoproteota	Nitrosopumilaceae	55	89.1	61.224490	23.757576	8.304361e-11	enriched	2.059482e-08	enriched
120	Chloroflexota	Tepidiformaceae	10	70.0	11.637931	23.566667	1.275060e-05	enriched	3.162149e-03	enriched
43	Bacillota	DSM-18226	53	58.5	7.416625	21.948382	5.158348e-23	enriched	1.279270e-20	enriched
144	Halobacteriota	Natrialbaceae	26	88.5	34.123223	21.278912	1.328294e-09	enriched	3.294170e-07	enriched
241	Pseudomonadota	Rhodanobacteraceae	52	96.2	62.000596	15.496471	9.616731e-09	enriched	2.384949e-06	enriched
187	Pseudomonadota	Burkholderiaceae	333	95.5	62.000596	13.986149	4.267357e-49	enriched	1.058305e-46	enriched
26	Actinomycetota	Nocardioidaceae	70	74.3	18.832711	13.634637	2.096830e-24	enriched	5.200138e-22	enriched
215	Pseudomonadota	Legionellaceae	64	95.3	62.000596	12.632666	7.824283e-10	enriched	1.940422e-07	enriched
173	Pseudomonadota	Rhodobacteraceae	415	94.5	62.000596	11.428677	1.342438e-56	enriched	3.329246e-54	enriched
90	Bacteroidota	Chitinophagaceae	177	12.4	1.927261	10.645161	2.742955e-13	enriched	6.802528e-11	enriched
243	Pseudomonadota	Xanthomonadaceae	207	94.2	62.000596	10.400164	1.393533e-27	enriched	3.455961e-25	enriched
229	Pseudomonadota	Oleiphilaceae	50	94.0	62.000596	9.700227	2.385833e-07	enriched	5.916866e-05	enriched
158	Pseudomonadota	Micavibrionaceae	32	93.8	62.000596	9.252846	6.785811e-05	enriched	1.682881e-02	enriched
190	Pseudomonadota	Burkholderiaceae_C	93	93.5	62.000596	9.055378	2.709309e-12	enriched	6.719087e-10	enriched
22	Actinomycetota	Pseudonocardiaceae	94	54.3	18.832711	5.527824	5.360020e-15	enriched	1.329285e-12	enriched
39	Bacillota	Bacillaceae	42	28.6	7.416625	5.343066	2.820579e-05	enriched	6.995036e-03	enriched
225	Pseudomonadota	Halomonadaceae	95	87.4	62.000596	4.305099	4.842302e-08	enriched	1.200891e-05	enriched
170	Pseudomonadota	Rhizobiaceae	413	86.4	62.000596	4.180135	1.188161e-29	enriched	2.946640e-27	enriched
232	Pseudomonadota	Pseudomonadaceae	498	84.1	62.000596	3.502775	4.242306e-29	enriched	1.052092e-26	enriched
212	Pseudomonadota	Vibrionaceae	173	82.1	62.000596	2.871313	1.046626e-08	enriched	2.595634e-06	enriched

	gtdb_phylum	gtdb_family	n_species	pct_phaC	phylum_pct_phaC	odds_ratio	p_value	enrichment	p_corrected	enrichment_corrected
11	Actinomycetota	Bifidobacteriaceae	98	0.0	18.832711	0.000000	1.311330e-09	depleted	3.252098e-07	depleted
13	Actinomycetota	Cellulomonadaceae	57	0.0	18.832711	0.000000	8.803206e-06	depleted	2.183195e-03	depleted
16	Actinomycetota	Microbacteriaceae	330	0.0	18.832711	0.000000	2.378544e-32	depleted	5.898788e-30	depleted
20	Actinomycetota	Micromonosporaceae	86	0.0	18.832711	0.000000	2.210428e-08	depleted	5.481862e-06	depleted
66	Bacillota	Paenibacillaceae	126	0.0	7.416625	0.000000	6.890711e-05	depleted	1.708896e-02	depleted
...	...	...	...	...	...	...	...	...	...	...
230	Pseudomonadota	Porticoccaceae	56	23.2	62.000596	0.182737	4.670705e-09	depleted	1.158335e-06	depleted
208	Pseudomonadota	Alteromonadaceae	152	27.0	62.000596	0.218680	9.502769e-19	depleted	2.356687e-16	depleted
188	Pseudomonadota	Burkholderiaceae_A	79	31.6	62.000596	0.279413	5.904650e-08	depleted	1.464353e-05	depleted
223	Pseudomonadota	HTCC2089	61	36.1	62.000596	0.342252	4.900087e-05	depleted	1.215222e-02	depleted
150	Pseudomonadota	Acetobacteraceae	152	46.1	62.000596	0.515034	6.274416e-05	depleted	1.556055e-02	depleted

pha_class	other_pfam
gtdb_phylum
Acidobacteriota	100.0
Actinomycetota	100.0
Bacillota	100.0
Bacillota_A	100.0
Bacillota_B	100.0
Bacteroidota	100.0
Chloroflexota	100.0
Cyanobacteriota	100.0
Desulfobacterota	100.0
Eremiobacterota	100.0
Halobacteriota	100.0
Methylomirabilota	100.0
Myxococcota	100.0
Nitrospirota	100.0
Planctomycetota	100.0
Pseudomonadota	100.0
SAR324	100.0
Thermoproteota	100.0

05 Subclade Enrichment

NB05: Subclade Enrichment and Cross-Validation¶

Part 1: Subclade Enrichment Analysis¶

Part 2: PHA Synthase Class Distribution¶

Part 3: HGT Signal Detection¶

Part 4: Cross-Validation — Pangenome vs NMDC Metagenome¶

Summary¶

Key Findings (to be filled after execution)¶

Interpretation¶

pha_class	other_pfam
primary_env
animal_associated	100.0
food	100.0
freshwater	100.0
human_associated	100.0
human_clinical	100.0
marine	100.0
other_unknown	100.0
plant_associated	100.0
sediment	100.0
soil	100.0
wastewater_engineered	100.0