import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.multitest import multipletests

DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data') if os.path.basename(os.getcwd()) == 'notebooks' else 'data'
FIG_DIR = os.path.join(os.path.dirname(os.getcwd()), 'figures') if os.path.basename(os.getcwd()) == 'notebooks' else 'figures'
os.makedirs(FIG_DIR, exist_ok=True)

pathway_df = pd.read_csv(os.path.join(DATA_DIR, 'carbon_pathway_scores.csv'))
species_df = pd.read_csv(os.path.join(DATA_DIR, 'pseudomonas_species.csv'))
species_env = pd.read_csv(os.path.join(DATA_DIR, 'species_lifestyle.csv'))

print(f'Loaded: {len(pathway_df):,} pathway scores, {len(species_df)} species, {len(species_env)} with lifestyle')

Loaded: 789,012 pathway scores, 433 species, 387 with lifestyle

# Binary: is pathway complete/likely_complete in this genome?
pathway_df['is_complete'] = (pathway_df['best_score'] >= 4).astype(int)

# Species-level: fraction of genomes with pathway complete
species_pathway = pathway_df.groupby(['clade_name', 'pathway'])['is_complete'].mean().reset_index()
species_pathway.columns = ['gtdb_species_clade_id', 'pathway', 'frac_complete']

# Pivot to species x pathway matrix
pathway_matrix = species_pathway.pivot(
    index='gtdb_species_clade_id', columns='pathway', values='frac_complete'
).fillna(0)

print(f'Pathway matrix: {pathway_matrix.shape[0]} species x {pathway_matrix.shape[1]} pathways')

# Total pathway richness per species (count of pathways complete in >50% of genomes)
pathway_richness = (pathway_matrix > 0.5).sum(axis=1)
pathway_richness.name = 'pathway_richness'
print(f'\nPathway richness (>50% genomes): mean={pathway_richness.mean():.1f}, range={pathway_richness.min()}-{pathway_richness.max()}')

Pathway matrix: 433 species x 62 pathways

Pathway richness (>50% genomes): mean=54.6, range=27-61

# Save species-level pathway profiles
pathway_matrix.to_csv(os.path.join(DATA_DIR, 'species_pathway_profiles.csv'))
print(f'Saved species_pathway_profiles.csv')

Saved species_pathway_profiles.csv

# Merge with subgenus info
pathway_with_meta = pathway_matrix.reset_index().merge(
    species_df[['gtdb_species_clade_id', 'gtdb_subgenus', 'no_genomes']],
    on='gtdb_species_clade_id'
)

# Focus on the two main groups with sufficient data
main_groups = pathway_with_meta[
    pathway_with_meta['gtdb_subgenus'].isin(['Pseudomonas', 'Pseudomonas_E'])
].copy()

# Filter to species with >= 5 genomes for reliable pathway estimates
main_groups = main_groups[main_groups['no_genomes'] >= 5]

pathways = [c for c in pathway_matrix.columns]

# Mean pathway completeness by subgenus
subgenus_means = main_groups.groupby('gtdb_subgenus')[pathways].mean()

print('Mean pathway completeness by subgenus (species with >= 5 genomes):')
print(f'  Pseudomonas s.s. (aeruginosa): {len(main_groups[main_groups.gtdb_subgenus=="Pseudomonas"])} species')
print(f'  Pseudomonas_E (fluorescens/putida): {len(main_groups[main_groups.gtdb_subgenus=="Pseudomonas_E"])} species')

# Difference: Pseudomonas_E - Pseudomonas s.s.
diff = subgenus_means.loc['Pseudomonas_E'] - subgenus_means.loc['Pseudomonas']
diff_sorted = diff.sort_values()

print(f'\nPathways MORE complete in Pseudomonas_E (free-living) vs Pseudomonas (aeruginosa):')
print(diff_sorted.tail(15).to_string())
print(f'\nPathways LESS complete in Pseudomonas_E vs Pseudomonas:')
print(diff_sorted.head(10).to_string())

Mean pathway completeness by subgenus (species with >= 5 genomes):
  Pseudomonas s.s. (aeruginosa): 7 species
  Pseudomonas_E (fluorescens/putida): 189 species

Pathways MORE complete in Pseudomonas_E (free-living) vs Pseudomonas (aeruginosa):
galactose        0.125212
lactose          0.125333
trehalose        0.125415
glucuronate      0.203727
NAG              0.207009
D-serine         0.255973
xylitol          0.256851
glucosamine      0.319226
sorbitol         0.515063
mannitol         0.515906
myoinositol      0.587679
galacturonate    0.598473
arabinose        0.626347
ribose           0.641801
xylose           0.743703

Pathways LESS complete in Pseudomonas_E vs Pseudomonas:
rhamnose         -0.254575
fucose           -0.216920
acetate          -0.026117
L-lactate        -0.025776
fructose         -0.007463
phenylacetate    -0.005582
sucrose          -0.004100
2-oxoglutarate   -0.003750
L-malate         -0.002656
pyruvate         -0.001933

# Statistical test: Mann-Whitney U for each pathway between subgenera
results = []
ps_data = main_groups[main_groups['gtdb_subgenus'] == 'Pseudomonas']
pe_data = main_groups[main_groups['gtdb_subgenus'] == 'Pseudomonas_E']

for pathway in pathways:
    u_stat, p_val = stats.mannwhitneyu(
        ps_data[pathway].values,
        pe_data[pathway].values,
        alternative='two-sided'
    )
    results.append({
        'pathway': pathway,
        'mean_aeruginosa': ps_data[pathway].mean(),
        'mean_fluorescens': pe_data[pathway].mean(),
        'diff': pe_data[pathway].mean() - ps_data[pathway].mean(),
        'U_statistic': u_stat,
        'p_value': p_val
    })

results_df = pd.DataFrame(results)

# FDR correction
reject, q_values, _, _ = multipletests(results_df['p_value'], method='fdr_bh')
results_df['q_value'] = q_values
results_df['significant'] = reject

results_df = results_df.sort_values('diff')
print(f'Pathways significantly different (FDR < 0.05): {results_df.significant.sum()} / {len(results_df)}')
print(f'\nSignificant pathways (sorted by difference):')
sig = results_df[results_df['significant']]
print(sig[['pathway', 'mean_aeruginosa', 'mean_fluorescens', 'diff', 'q_value']].to_string(index=False))

results_df.to_csv(os.path.join(DATA_DIR, 'subgenus_pathway_comparison.csv'), index=False)

Pathways significantly different (FDR < 0.05): 43 / 62

Significant pathways (sorted by difference):
          pathway  mean_aeruginosa  mean_fluorescens      diff  q_value
         L-malate         0.995837          0.993181 -0.002656 0.034141
          ethanol         0.999979          1.000000  0.000021 0.000001
        aspartate         0.995837          0.995922  0.000085 0.010746
        threonine         0.999641          1.000000  0.000359 0.000001
4-hydroxybenzoate         0.999620          1.000000  0.000380 0.000001
      deoxyribose         0.999620          1.000000  0.000380 0.000001
       tryptophan         0.999620          1.000000  0.000380 0.000001
       putrescine         0.999577          1.000000  0.000423 0.000001
     deoxyinosine         0.999535          0.999983  0.000448 0.000999
        thymidine         0.999535          0.999983  0.000448 0.000999
          proline         0.995837          0.998347  0.002510 0.010746
           valine         0.995921          0.998457  0.002535 0.010746
       isoleucine         0.995921          0.998457  0.002535 0.010746
           lysine         0.995879          0.998942  0.003063 0.000999
         fumarate         0.996302          0.999471  0.003169 0.000999
         tyrosine         0.995858          0.999102  0.003244 0.010746
          leucine         0.995879          0.999339  0.003460 0.000999
    phenylalanine         0.995858          0.999339  0.003481 0.000999
        glutamate         0.995858          0.999669  0.003811 0.000999
       asparagine         0.995858          0.999669  0.003811 0.000999
        histidine         0.995879          1.000000  0.004121 0.000001
       citrulline         0.995879          1.000000  0.004121 0.000001
         arginine         0.995879          1.000000  0.004121 0.000001
        succinate         0.995858          0.999983  0.004125 0.000999
           serine         0.995858          1.000000  0.004142 0.000001
        D-alanine         0.995858          1.000000  0.004142 0.000001
          citrate         0.995858          1.000000  0.004142 0.000001
        gluconate         0.936666          0.981096  0.044430 0.000184
          glucose         0.852050          0.976324  0.124275 0.049957
       cellobiose         0.852747          0.977586  0.124839 0.030014
          maltose         0.852050          0.976995  0.124945 0.040065
        galactose         0.851057          0.976268  0.125212 0.047536
          lactose         0.852050          0.977383  0.125333 0.039490
        trehalose         0.852050          0.977465  0.125415 0.030490
      glucuronate         0.748363          0.952090  0.203727 0.000020
      glucosamine         0.369759          0.688986  0.319226 0.029248
         sorbitol         0.258947          0.774010  0.515063 0.000632
         mannitol         0.258989          0.774895  0.515906 0.000626
      myoinositol         0.000000          0.587679  0.587679 0.003289
    galacturonate         0.285714          0.884187  0.598473 0.000238
        arabinose         0.000021          0.626368  0.626347 0.004058
           ribose         0.278550          0.920351  0.641801 0.000001
           xylose         0.000021          0.743724  0.743703 0.000208

# Among Pseudomonas_E species, compare free-living vs host-associated vs plant-associated
pe_with_lifestyle = pathway_with_meta.merge(
    species_env[['gtdb_species_clade_id', 'majority_lifestyle', 'majority_environment']],
    on='gtdb_species_clade_id',
    how='inner'
)

pe_e_only = pe_with_lifestyle[
    (pe_with_lifestyle['gtdb_subgenus'] == 'Pseudomonas_E') &
    (pe_with_lifestyle['no_genomes'] >= 5) &
    (pe_with_lifestyle['majority_lifestyle'] != 'unknown')
]

print(f'Pseudomonas_E species with lifestyle and >= 5 genomes: {len(pe_e_only)}')
print(pe_e_only['majority_lifestyle'].value_counts().to_string())

# Compare free-living vs plant-associated within Pseudomonas_E
for lifestyle in pe_e_only['majority_lifestyle'].unique():
    sub = pe_e_only[pe_e_only['majority_lifestyle'] == lifestyle]
    richness = (sub[pathways] > 0.5).sum(axis=1)
    print(f'\n{lifestyle}: {len(sub)} species, mean pathway richness = {richness.mean():.1f}')

Pseudomonas_E species with lifestyle and >= 5 genomes: 184
majority_lifestyle
free_living         87
host_associated     58
plant_associated    30
food_associated      9

host_associated: 58 species, mean pathway richness = 55.2

plant_associated: 30 species, mean pathway richness = 56.7

free_living: 87 species, mean pathway richness = 56.1

food_associated: 9 species, mean pathway richness = 55.7

# Heatmap of pathway completeness for species with >= 10 genomes
heatmap_data = pathway_with_meta[
    pathway_with_meta['no_genomes'] >= 10
].copy()

# Sort by subgenus then by pathway richness
heatmap_data['pathway_richness'] = (heatmap_data[pathways] > 0.5).sum(axis=1)
heatmap_data = heatmap_data.sort_values(['gtdb_subgenus', 'pathway_richness'], ascending=[True, False])

# Create heatmap
fig, ax = plt.subplots(figsize=(20, max(8, len(heatmap_data) * 0.15)))

# Labels
labels = []
for _, row in heatmap_data.iterrows():
    sp = row['gtdb_species_clade_id'].split('--')[0].replace('s__', '').replace('_', ' ')
    labels.append(f"{sp} (n={int(row['no_genomes'])})")

matrix = heatmap_data[pathways].values

sns.heatmap(matrix, xticklabels=pathways, yticklabels=labels,
            cmap='YlOrRd', vmin=0, vmax=1, ax=ax,
            cbar_kws={'label': 'Fraction of genomes with pathway complete'})
ax.set_xlabel('Carbon pathway')
ax.set_ylabel('Species')
ax.set_title('GapMind Carbon Pathway Completeness Across Pseudomonas Species (>=10 genomes)')
plt.xticks(rotation=45, ha='right', fontsize=8)
plt.yticks(fontsize=6)
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'pathway_heatmap.png'), dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved {FIG_DIR}/pathway_heatmap.png')

Saved /home/mamillerpa/BERIL-research-observatory/projects/pseudomonas_carbon_ecology/figures/pathway_heatmap.png

# Bar plot: pathways lost in host-associated vs free-living
sig_results = results_df[results_df['significant']].copy()

if len(sig_results) > 0:
    fig, ax = plt.subplots(figsize=(10, max(4, len(sig_results) * 0.35)))
    colors = ['#e74c3c' if d < 0 else '#2ecc71' for d in sig_results['diff']]
    ax.barh(sig_results['pathway'], sig_results['diff'], color=colors)
    ax.axvline(0, color='black', linewidth=0.5)
    ax.set_xlabel('Difference in completeness\n(Pseudomonas_E - Pseudomonas s.s.)')
    ax.set_title('Significant pathway differences (FDR < 0.05)\nRed = higher in aeruginosa group; Green = higher in fluorescens/putida group')
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, 'pathway_loss_barplot.png'), dpi=150, bbox_inches='tight')
    plt.show()
    print(f'Saved {FIG_DIR}/pathway_loss_barplot.png')
else:
    print('No significant pathway differences found.')

Saved /home/mamillerpa/BERIL-research-observatory/projects/pseudomonas_carbon_ecology/figures/pathway_loss_barplot.png

03 Pathway Lifestyle Analysis

03 - Carbon Pathway Profiles by Lifestyle¶

1. Species-Level Pathway Completeness Profiles¶

2. Pathway Profiles by GTDB Subgenus¶

3. Pathway Profiles by Lifestyle (within Pseudomonas_E)¶

4. Heatmap: Pathway Completeness Across Species¶

Summary¶