spark = get_spark_session()

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

PROJECT_DIR = os.path.expanduser('~/BERIL-research-observatory/projects/phb_granule_ecology')
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
FIG_DIR = os.path.join(PROJECT_DIR, 'figures')

# Load NB01 results
species_phb = pd.read_csv(os.path.join(DATA_DIR, 'phb_species_summary.tsv'), sep='\t')
print(f'Loaded {len(species_phb):,} species with PHB genes')

Loaded 19,496 species with PHB genes

# Get full taxonomy for all species
# Parse taxonomy ranks directly from GTDB_taxonomy string in gtdb_species_clade
# Format: d__Domain;p__Phylum;c__Class;o__Order;f__Family;g__Genus;s__Species
# This avoids the broken join to gtdb_taxonomy_r214v1 (representative_genome_id != gtdb_taxonomy_id)
taxonomy = spark.sql("""
    SELECT sc.gtdb_species_clade_id,
           sc.GTDB_taxonomy,
           REGEXP_EXTRACT(sc.GTDB_taxonomy, 'p__([^;]+)', 1) AS gtdb_phylum,
           REGEXP_EXTRACT(sc.GTDB_taxonomy, 'c__([^;]+)', 1) AS gtdb_class,
           REGEXP_EXTRACT(sc.GTDB_taxonomy, 'o__([^;]+)', 1) AS gtdb_order,
           REGEXP_EXTRACT(sc.GTDB_taxonomy, 'f__([^;]+)', 1) AS gtdb_family,
           REGEXP_EXTRACT(sc.GTDB_taxonomy, 'g__([^;]+)', 1) AS gtdb_genus,
           p.no_genomes, p.no_core, p.no_aux_genome, p.no_gene_clusters
    FROM kbase_ke_pangenome.gtdb_species_clade sc
    JOIN kbase_ke_pangenome.pangenome p
        ON sc.gtdb_species_clade_id = p.gtdb_species_clade_id
""").toPandas()

print(f'Total species with taxonomy: {len(taxonomy):,}')
print(f'Phyla: {taxonomy["gtdb_phylum"].nunique()}')
print(f'Sample taxonomy:')
taxonomy[['gtdb_species_clade_id', 'gtdb_phylum', 'gtdb_class', 'gtdb_order']].head(3)

Total species with taxonomy: 27,690
Phyla: 142
Sample taxonomy:

# Merge PHB status with taxonomy
# Species not in species_phb are PHB-absent
tax_phb = taxonomy.merge(species_phb[['gtdb_species_clade_id', 'phb_status', 'phb_genes_str', 
                                       'phaC_is_core', 'phaC_is_aux']],
                          on='gtdb_species_clade_id', how='left')
tax_phb['phb_status'] = tax_phb['phb_status'].fillna('absent')
# Cast to int to avoid numpy boolean arithmetic issues in downstream aggregations
tax_phb['has_phaC'] = tax_phb['phb_status'].isin(['complete', 'synthase_only']).astype(int)
tax_phb['has_complete_pathway'] = (tax_phb['phb_status'] == 'complete').astype(int)

print('PHB pathway status across all species:')
print(tax_phb['phb_status'].value_counts())
print(f'\nOverall phaC prevalence: {tax_phb["has_phaC"].mean()*100:.1f}%')
print(f'Overall complete pathway: {tax_phb["has_complete_pathway"].mean()*100:.1f}%')

PHB pathway status across all species:
phb_status
precursors_only    12869
absent              8199
complete            6005
accessory_only       555
synthase_only         62
Name: count, dtype: int64

Overall phaC prevalence: 21.9%
Overall complete pathway: 21.7%

# PHB prevalence by phylum
phylum_stats = tax_phb.groupby('gtdb_phylum').agg(
    n_species=('gtdb_species_clade_id', 'count'),
    n_phaC=('has_phaC', 'sum'),
    pct_phaC=('has_phaC', lambda x: x.mean() * 100),
    n_complete=('has_complete_pathway', 'sum'),
    pct_complete=('has_complete_pathway', lambda x: x.mean() * 100),
).round(1).sort_values('n_species', ascending=False)

print('PHB prevalence by phylum (>50 species):')
phylum_stats[phylum_stats['n_species'] >= 50]

PHB prevalence by phylum (>50 species):

# Figure 1: PHB prevalence by phylum (top 20 phyla by species count)
top_phyla = phylum_stats.head(20).sort_values('pct_phaC', ascending=True)

fig, ax = plt.subplots(figsize=(10, 8))
bars = ax.barh(range(len(top_phyla)), top_phyla['pct_complete'], 
               label='Complete (phaC+phaA/B)', color='#2196F3', alpha=0.8)
ax.barh(range(len(top_phyla)), top_phyla['pct_phaC'] - top_phyla['pct_complete'],
        left=top_phyla['pct_complete'],
        label='Synthase only (phaC)', color='#90CAF9', alpha=0.8)

ax.set_yticks(range(len(top_phyla)))
ax.set_yticklabels([f"{p} (n={int(n)})" for p, n in 
                     zip(top_phyla.index, top_phyla['n_species'])])
ax.set_xlabel('% of species with PHB pathway')
ax.set_title('PHB Pathway Prevalence Across Bacterial Phyla')
ax.legend(loc='lower right')
ax.set_xlim(0, 100)

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'phb_prevalence_by_phylum.png'), dpi=150, bbox_inches='tight')
plt.show()
print('Saved phb_prevalence_by_phylum.png')

Saved phb_prevalence_by_phylum.png

# PHB prevalence by order (finer resolution)
order_stats = tax_phb.groupby(['gtdb_phylum', 'gtdb_class', 'gtdb_order']).agg(
    n_species=('gtdb_species_clade_id', 'count'),
    n_phaC=('has_phaC', 'sum'),
    pct_phaC=('has_phaC', lambda x: x.mean() * 100),
    n_complete=('has_complete_pathway', 'sum'),
    pct_complete=('has_complete_pathway', lambda x: x.mean() * 100),
).round(1).sort_values('n_species', ascending=False)

print(f'Orders with >20 species and >50% phaC prevalence:')
enriched = order_stats[(order_stats['n_species'] >= 20) & (order_stats['pct_phaC'] >= 50)]
enriched.sort_values('pct_phaC', ascending=False)

Orders with >20 species and >50% phaC prevalence:

# Orders depleted for PHB
print(f'Orders with >20 species and <10% phaC prevalence:')
depleted = order_stats[(order_stats['n_species'] >= 20) & (order_stats['pct_phaC'] < 10)]
depleted.sort_values('pct_phaC', ascending=True)

Orders with >20 species and <10% phaC prevalence:

# Figure 2: phaC core vs accessory across phyla
phac_species = tax_phb[tax_phb['has_phaC'] == 1].copy()
phac_species['phaC_status'] = 'unknown'
phac_species.loc[phac_species['phaC_is_core'] == 1, 'phaC_status'] = 'core'
phac_species.loc[phac_species['phaC_is_aux'] == 1, 'phaC_status'] = 'accessory'

phac_by_phylum = phac_species.groupby(['gtdb_phylum', 'phaC_status']).size().unstack(fill_value=0)
phac_by_phylum = phac_by_phylum.loc[phac_by_phylum.sum(axis=1) >= 10]
phac_by_phylum_pct = phac_by_phylum.div(phac_by_phylum.sum(axis=1), axis=0) * 100

fig, ax = plt.subplots(figsize=(10, 6))
phac_by_phylum_pct.sort_values('core', ascending=True).plot.barh(
    stacked=True, ax=ax, color={'core': '#4CAF50', 'accessory': '#FF9800', 'unknown': '#9E9E9E'}
)
ax.set_xlabel('% of phaC-carrying species')
ax.set_title('phaC Gene: Core vs Accessory Status by Phylum')
ax.legend(title='phaC status')

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'phb_core_vs_accessory.png'), dpi=150, bbox_inches='tight')
plt.show()

# Figure 3: Pathway completeness distribution
status_order = ['complete', 'synthase_only', 'precursors_only', 'accessory_only', 'absent']
status_colors = ['#2196F3', '#90CAF9', '#FFB74D', '#FFCC80', '#E0E0E0']

fig, ax = plt.subplots(figsize=(8, 5))
counts = [len(tax_phb[tax_phb['phb_status'] == s]) for s in status_order]
bars = ax.bar(status_order, counts, color=status_colors)

for bar, count in zip(bars, counts):
    ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 100,
            f'{count:,}\n({count/len(tax_phb)*100:.1f}%)',
            ha='center', va='bottom', fontsize=9)

ax.set_ylabel('Number of species')
ax.set_title('PHB Pathway Completeness Across 27K Bacterial Species')
ax.set_xticklabels([s.replace('_', '\n') for s in status_order])

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'phb_pathway_completeness.png'), dpi=150, bbox_inches='tight')
plt.show()

/tmp/ipykernel_11910/2888123116.py:16: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels([s.replace('_', '\n') for s in status_order])

# Save taxonomy-level results
tax_phb.to_csv(os.path.join(DATA_DIR, 'phb_by_taxonomy.tsv'), sep='\t', index=False)
order_stats.to_csv(os.path.join(DATA_DIR, 'phb_by_order.tsv'), sep='\t')
print(f'Saved taxonomy data: {len(tax_phb):,} species, {len(order_stats):,} orders')

Saved taxonomy data: 27,690 species, 1,058 orders

	gtdb_species_clade_id	gtdb_phylum	gtdb_class	gtdb_order
0	s__Klebsiella_pneumoniae--RS_GCF_000742135.1	Pseudomonadota	Gammaproteobacteria	Enterobacterales
1	s__Staphylococcus_aureus--RS_GCF_001027105.1	Bacillota	Bacilli	Staphylococcales
2	s__Salmonella_enterica--RS_GCF_000006945.2	Pseudomonadota	Gammaproteobacteria	Enterobacterales

	n_species	n_phaC	pct_phaC	n_complete	pct_complete
gtdb_phylum
Pseudomonadota	7456	4544	60.9	4539	60.9
Bacillota_A	3917	71	1.8	56	1.4
Bacteroidota	3629	78	2.1	75	2.1
Actinomycetota	3172	587	18.5	578	18.2
Bacillota	2146	187	8.7	175	8.2
Patescibacteria	981	4	0.4	2	0.2
Verrucomicrobiota	553	6	1.1	3	0.5
Cyanobacteriota	469	43	9.2	42	9.0
Chloroflexota	457	56	12.3	56	12.3
Planctomycetota	348	14	4.0	14	4.0
Desulfobacterota	319	70	21.9	68	21.3
Spirochaetota	296	4	1.4	4	1.4
Thermoplasmatota	294	6	2.0	6	2.0
Acidobacteriota	283	25	8.8	24	8.5
Campylobacterota	271	0	0.0	0	0.0
Halobacteriota	259	89	34.4	89	34.4
Bacillota_C	235	6	2.6	4	1.7
Thermoproteota	233	65	27.9	65	27.9
Myxococcota	157	83	52.9	83	52.9
Nanoarchaeota	135	0	0.0	0	0.0
Nitrospirota	115	15	13.0	14	12.2
Gemmatimonadota	102	0	0.0	0	0.0
Bacillota_B	101	11	10.9	11	10.9
Marinisomatota	90	0	0.0	0	0.0
Omnitrophota	80	0	0.0	0	0.0
Methanobacteriota	74	0	0.0	0	0.0
Bdellovibrionota	71	5	7.0	5	7.0
Elusimicrobiota	70	1	1.4	1	1.4
Desulfobacterota_F	68	3	4.4	3	4.4
Fusobacteriota	59	3	5.1	0	0.0
Synergistota	58	1	1.7	0	0.0
Micrarchaeota	57	0	0.0	0	0.0
Deinococcota	52	3	5.8	3	5.8
Thermotogota	51	1	2.0	0	0.0

			n_species	n_phaC	pct_phaC	n_complete	pct_complete
gtdb_phylum	gtdb_class	gtdb_order
Pseudomonadota	Alphaproteobacteria	Azospirillales	25	25	100.0	25	100.0
		Rhodospirillales	20	20	100.0	20	100.0
		Caulobacterales	142	141	99.3	141	99.3
		Sphingomonadales	292	280	95.9	280	95.9
	Gammaproteobacteria	Legionellales	64	61	95.3	61	95.3
	Alphaproteobacteria	Rhodobacterales	415	392	94.5	392	94.5
	Alphaproteobacteria	Micavibrionales	35	33	94.3	33	94.3
	Gammaproteobacteria	Xanthomonadales	289	266	92.0	266	92.0
	Alphaproteobacteria	Rhizobiales	813	718	88.3	718	88.3
Eremiobacterota	Eremiobacteria	Baltobacterales	20	16	80.0	16	80.0
Thermoproteota	Nitrososphaeria	Nitrososphaerales	84	65	77.4	65	77.4
Pseudomonadota	Gammaproteobacteria	Burkholderiales	1379	1059	76.8	1056	76.6
Chloroflexota	Chloroflexia	Chloroflexales	21	15	71.4	15	71.4
Pseudomonadota	Alphaproteobacteria	Rhodospirillales_A	47	33	70.2	33	70.2
Pseudomonadota	Gammaproteobacteria	Chromatiales	58	40	69.0	40	69.0
Myxococcota	Polyangia	Polyangiales	34	23	67.6	23	67.6
Pseudomonadota	Gammaproteobacteria	Pseudomonadales	1188	801	67.4	801	67.4
Pseudomonadota	Gammaproteobacteria	Arenicellales	32	20	62.5	20	62.5
Halobacteriota	Halobacteria	Halobacteriales	120	74	61.7	74	61.7
Actinomycetota	Actinomycetia	Mycobacteriales	689	408	59.2	408	59.2
Pseudomonadota	Gammaproteobacteria	Steroidobacterales	44	26	59.1	26	59.1
Myxococcota	Myxococcia	Myxococcales	41	24	58.5	24	58.5
Desulfobacterota	Syntrophia	Syntrophales	45	25	55.6	25	55.6

			n_species	n_phaC	pct_phaC	n_complete	pct_complete
gtdb_phylum	gtdb_class	gtdb_order
Bacillota	Bacilli	RF39	251	0	0.0	0	0.0
Patescibacteria	Paceibacteria	UBA9983_A	208	0	0.0	0	0.0
Cyanobacteriota	Cyanobacteriia	PCC-6307	167	0	0.0	0	0.0
Campylobacterota	Campylobacteria	Campylobacterales	265	0	0.0	0	0.0
Bacillota	Bacilli	Paenibacillales	149	0	0.0	0	0.0
...	...	...	...	...	...	...	...
Deinococcota	Deinococci	Deinococcales	52	3	5.8	3	5.8
Desulfobacterota_F	Desulfuromonadia	Geobacterales	33	2	6.1	2	6.1
Bacillota_B	Desulfotomaculia	Desulfotomaculales	24	2	8.3	2	8.3
Desulfobacterota	Desulfobulbia	Desulfobulbales	45	4	8.9	4	8.9
Bacillota_D	Dethiobacteria	DTU022	21	2	9.5	2	9.5

02 Phylogenetic Mapping

NB02: PHB Pathway Distribution Across the Bacterial Tree¶

Summary¶

Key Findings (to be filled after execution)¶

Next Notebook (NB03)¶