# Initialize Spark session
# On BERDL JupyterHub — no import needed (injected into kernel)
# NOTE: get_spark_session is neither defined nor imported in this notebook;
# per the line above it is expected to be supplied by the kernel environment.
spark = get_spark_session()
# Printing the version doubles as a check that the session object is usable.
print(f'Spark session active: {spark.version}')

Spark session active: 4.0.1

import os
import pandas as pd
import numpy as np

# Project layout: everything lives under one project directory in the user's
# home; tabular outputs go to data/, plots to figures/.
PROJECT_DIR = os.path.expanduser('~/BERIL-research-observatory/projects/phb_granule_ecology')
DATA_DIR, FIG_DIR = (os.path.join(PROJECT_DIR, sub) for sub in ('data', 'figures'))

# Create both output directories up front (no error if they already exist).
for out_dir in (DATA_DIR, FIG_DIR):
    os.makedirs(out_dir, exist_ok=True)

print(f'Project dir: {PROJECT_DIR}')

Project dir: /home/aparkin/BERIL-research-observatory/projects/phb_granule_ecology

# PHB pathway markers, keyed by KEGG KO identifier.
# Each value has the form '<gene> - <role>'; later cells split on ' - ' to
# recover the gene name, and iterate the dict in this insertion order.
#
# NOTE(review): the KO distribution printed later in this notebook reports
# 0 clusters for K18080, and the description cross-check lists phasin / PhaR
# annotations with no KO ('-'). Confirm that K14205 (phaP) and K18080 (phaR)
# are the intended KOs against the KEGG database before relying on them.
PHB_KOS = {
    'K03821': 'phaC - PHA synthase (committed step)',
    'K00023': 'phaB - acetoacetyl-CoA reductase',
    'K00626': 'phaA - beta-ketothiolase',
    'K05973': 'phaZ - PHB depolymerase',
    'K14205': 'phaP - phasin (granule protein)',
    'K18080': 'phaR - PHB transcriptional regulator',
}

# Lower-case description keywords used as a cross-check on the KO search.
# They are matched against LOWER(Description), so they must stay lower-case.
PHB_KEYWORDS = [
    'polyhydroxybutyrate',
    'polyhydroxyalkanoate',
    'phasin',
    'pha synthase',
    'phb synthase',
    'poly-beta-hydroxybutyrate',
    'acetoacetyl-coa reductase',
]

print('PHB pathway markers defined')
print('\n'.join(f'  {ko_id}: {label}' for ko_id, label in PHB_KOS.items()))

PHB pathway markers defined
  K03821: phaC - PHA synthase (committed step)
  K00023: phaB - acetoacetyl-CoA reductase
  K00626: phaA - beta-ketothiolase
  K05973: phaZ - PHB depolymerase
  K14205: phaP - phasin (granule protein)
  K18080: phaR - PHB transcriptional regulator

# Query 1: Find all gene clusters with PHB-related KEGG KOs
# Join eggnog_mapper_annotations with gene_cluster to get species and core/aux status

# Build the KO filter from PHB_KOS so the query cannot drift out of sync with
# the marker definitions. Substring (LIKE) matching is kept from the original
# query: KEGG_ko values carry a 'ko:' prefix (see the description-search
# output) and presumably may list several KOs per cluster — TODO confirm.
ko_conditions = '\n       OR '.join(
    f"ann.KEGG_ko LIKE '%{ko}%'" for ko in PHB_KOS
)

phb_clusters_df = spark.sql(f"""
    SELECT gc.gtdb_species_clade_id,
           gc.gene_cluster_id,
           gc.is_core,
           gc.is_auxiliary,
           gc.is_singleton,
           ann.KEGG_ko,
           ann.COG_category,
           ann.EC,
           ann.PFAMs,
           ann.Description
    FROM kbase_ke_pangenome.gene_cluster gc
    JOIN kbase_ke_pangenome.eggnog_mapper_annotations ann
        ON gc.gene_cluster_id = ann.query_name
    WHERE {ko_conditions}
""")

# Cache for reuse: the result is counted here and converted to pandas later.
phb_clusters_df.cache()
n_clusters = phb_clusters_df.count()
print(f'Found {n_clusters:,} PHB-related gene clusters across all species')

Found 118,513 PHB-related gene clusters across all species

# Pull the PHB cluster table into pandas (expected to be small enough: one row
# per gene cluster per species) and report its size.
phb_pd = phb_clusters_df.toPandas()
n_unique_species = phb_pd['gtdb_species_clade_id'].nunique()
print(f'Shape: {phb_pd.shape}')
print(f'\nUnique species: {n_unique_species:,}')
print('\nKEGG KO distribution:')

# For every PHB KO, count the clusters whose KEGG_ko value mentions it and the
# number of distinct species those clusters belong to.
for ko, desc in PHB_KOS.items():
    hits = phb_pd[phb_pd['KEGG_ko'].str.contains(ko, na=False)]
    gene = desc.split(" - ")[0]
    n_hit_species = hits['gtdb_species_clade_id'].nunique()
    print(f'  {ko} ({gene}): {len(hits):,} clusters in {n_hit_species:,} species')

Shape: (118513, 10)

Unique species: 19,496

KEGG KO distribution:
  K03821 (phaC): 11,792 clusters in 6,067 species
  K00023 (phaB): 9,617 clusters in 6,977 species
  K00626 (phaA): 86,318 clusters in 17,969 species
  K05973 (phaZ): 4,656 clusters in 3,151 species
  K14205 (phaP): 6,130 clusters in 4,571 species
  K18080 (phaR): 0 clusters in 0 species

# Query 2: description-based cross-check.
# Finds annotations whose free-text Description mentions a PHB keyword, even
# when the KO assignment is different or missing.

# One case-insensitive LIKE clause per keyword, OR-ed together.
like_clauses = [f"LOWER(ann.Description) LIKE '%{kw}%'" for kw in PHB_KEYWORDS]
desc_conditions = ' OR '.join(like_clauses)

# Group identical annotations and count clusters / distinct species for each.
phb_desc_df = spark.sql(f"""
    SELECT ann.Description, ann.KEGG_ko, ann.COG_category, ann.PFAMs,
           COUNT(*) as n_clusters,
           COUNT(DISTINCT gc.gtdb_species_clade_id) as n_species
    FROM kbase_ke_pangenome.gene_cluster gc
    JOIN kbase_ke_pangenome.eggnog_mapper_annotations ann
        ON gc.gene_cluster_id = ann.query_name
    WHERE {desc_conditions}
    GROUP BY ann.Description, ann.KEGG_ko, ann.COG_category, ann.PFAMs
    ORDER BY n_clusters DESC
""")

phb_desc_pd = phb_desc_df.toPandas()
n_desc_clusters = phb_desc_pd["n_clusters"].sum()
print(f'Description-based search found {len(phb_desc_pd)} distinct annotation groups')
print(f'Total clusters: {n_desc_clusters:,}')
print('\nTop annotations:')
phb_desc_pd.head(20)

Description-based search found 56 distinct annotation groups
Total clusters: 17,324

Top annotations:

# Assign each gene cluster a PHB gene label based on which KO it matches
# Priority: phaC > phaZ > phaP > phaR > phaB > phaA (most to least specific)

def assign_phb_gene(kegg_ko):
    """Map a KEGG_ko annotation value to a PHB gene label.

    The value is searched for each PHB KO identifier as a substring, in the
    fixed order phaC, phaZ, phaP, phaR, phaB, phaA. The first identifier found
    decides the label, so a value that lists several PHB KOs gets the label
    that comes earliest in that order.

    Args:
        kegg_ko: Value from the KEGG_ko column; converted with ``str()``
            before matching.

    Returns:
        The gene label, or ``'unknown'`` when the value is missing
        (``pd.isna``) or mentions none of the PHB KOs.
    """
    if pd.isna(kegg_ko):
        return 'unknown'

    # Order matters: earlier entries win when several KOs are present.
    ko_priority = (
        ('K03821', 'phaC'),
        ('K05973', 'phaZ'),
        ('K14205', 'phaP'),
        ('K18080', 'phaR'),
        ('K00023', 'phaB'),
        ('K00626', 'phaA'),
    )
    annotation = str(kegg_ko)
    return next(
        (gene for ko_id, gene in ko_priority if ko_id in annotation),
        'unknown',
    )

# Label every cluster row, then show how many rows received each label.
phb_pd['phb_gene'] = phb_pd['KEGG_ko'].apply(assign_phb_gene)
gene_counts = phb_pd['phb_gene'].value_counts()
print('PHB gene assignments:')
print(gene_counts)

PHB gene assignments:
phb_gene
phaA    86318
phaC    11792
phaB     9617
phaP     6130
phaZ     4656
Name: count, dtype: int64

# Per PHB gene: cluster count, species count, and the share of clusters
# flagged core / auxiliary / singleton.
def _pct_flagged(flags):
    """Percentage of entries in `flags` that equal 1."""
    return (flags == 1).mean() * 100


status_summary = (
    phb_pd.groupby('phb_gene')
    .agg(
        n_clusters=('gene_cluster_id', 'count'),
        n_species=('gtdb_species_clade_id', 'nunique'),
        pct_core=('is_core', _pct_flagged),
        pct_aux=('is_auxiliary', _pct_flagged),
        pct_singleton=('is_singleton', _pct_flagged),
    )
    .round(1)
)

print('PHB gene clusters — core/accessory/singleton status:')
status_summary

PHB gene clusters — core/accessory/singleton status:

# Save gene cluster data
# Writes the full per-cluster table (including the phb_gene label column, if it
# has been added by this point) as a tab-separated file without the row index.
out_path = os.path.join(DATA_DIR, 'phb_gene_clusters.tsv')
phb_pd.to_csv(out_path, sep='\t', index=False)
print(f'Saved {len(phb_pd):,} rows to {out_path}')

Saved 118,513 rows to /home/aparkin/BERIL-research-observatory/projects/phb_granule_ecology/data/phb_gene_clusters.tsv

# Collapse to one row per species, holding the set of PHB gene labels found
# among that species' clusters.
species_genes = (
    phb_pd.groupby('gtdb_species_clade_id')['phb_gene']
    .apply(set)
    .reset_index()
    .set_axis(['gtdb_species_clade_id', 'phb_genes_present'], axis=1)
)

def classify_phb_status(gene_set):
    """Classify a species' PHB pathway status from its set of gene labels.

    Args:
        gene_set: Collection of PHB gene labels (e.g. ``{'phaC', 'phaA'}``);
            only membership tests are performed on it.

    Returns:
        ``'complete'`` if phaC is present together with phaA or phaB,
        ``'synthase_only'`` if phaC is present without phaA/phaB,
        ``'precursors_only'`` if phaA or phaB is present without phaC,
        ``'accessory_only'`` otherwise. Note that the last case also covers
        an empty collection or one holding only other labels.
    """
    has_synthase = 'phaC' in gene_set
    has_precursor = 'phaA' in gene_set or 'phaB' in gene_set

    if has_synthase:
        return 'complete' if has_precursor else 'synthase_only'
    if has_precursor:
        return 'precursors_only'
    return 'accessory_only'  # none of phaA/phaB/phaC present

# Derive the pathway status and a stable, comma-joined string form of each
# species' gene set (sorted so equal sets give equal strings).
gene_sets = species_genes['phb_genes_present']
species_genes['phb_status'] = [classify_phb_status(genes) for genes in gene_sets]
species_genes['phb_genes_str'] = [','.join(sorted(genes)) for genes in gene_sets]

status_counts = species_genes['phb_status'].value_counts()
print('Species PHB pathway status:')
print(status_counts)
print(f'\nTotal species with any PHB gene: {len(species_genes):,}')

Species PHB pathway status:
phb_status
precursors_only    12871
complete            6005
accessory_only       558
synthase_only         62
Name: count, dtype: int64

Total species with any PHB gene: 19,496

# Total species = row count of the pangenome table; species absent from
# species_genes are those with no PHB-related cluster at all.
# NOTE(review): assumes the pangenome table has one row per species — confirm.
count_row = spark.sql("""
    SELECT COUNT(*) as n FROM kbase_ke_pangenome.pangenome
""").collect()[0]
total_species = count_row['n']

n_phb_any = len(species_genes)
n_phb_absent = total_species - n_phb_any
pct_any = n_phb_any / total_species * 100
pct_absent = n_phb_absent / total_species * 100

print(f'Total species in pangenome: {total_species:,}')
print(f'Species with any PHB gene: {n_phb_any:,} ({pct_any:.1f}%)')
print(f'Species with no PHB genes: {n_phb_absent:,} ({pct_absent:.1f}%)')

Total species in pangenome: 27,702
Species with any PHB gene: 19,496 (70.4%)
Species with no PHB genes: 8,206 (29.6%)

# Get phaC core/accessory status per species (is the synthase itself core or accessory?)
# 'max' over the per-cluster flags (presumably 0/1 or boolean — TODO confirm)
# marks a species as core (or accessory) when at least one of its phaC
# clusters carries that flag, so a species can be marked as both.
phac_status = phb_pd[phb_pd['phb_gene'] == 'phaC'].groupby('gtdb_species_clade_id').agg(
    n_phaC_clusters=('gene_cluster_id', 'count'),
    phaC_is_core=('is_core', 'max'),
    phaC_is_aux=('is_auxiliary', 'max'),
).reset_index()

# Merge with species-level summary (left join: species without phaC get NaN
# in the phaC columns)
species_summary = species_genes.merge(phac_status, on='gtdb_species_clade_id', how='left')

print('\nAmong species with phaC:')
phac_species = species_summary[species_summary['phb_status'].isin(['complete', 'synthase_only'])]
has_core_phac = phac_species['phaC_is_core'] == 1
has_aux_phac = phac_species['phaC_is_aux'] == 1
print(f'  Total: {len(phac_species):,}')
print(f'  phaC is core: {has_core_phac.sum():,}')
print(f'  phaC is accessory: {has_aux_phac.sum():,}')
# The two counts above are not a partition: a species with several phaC
# clusters can have both a core and an accessory one. Report the overlap so
# the numbers are not read as mutually exclusive.
print(f'  phaC in both core and accessory clusters: {(has_core_phac & has_aux_phac).sum():,}')

Among species with phaC:
  Total: 6,067
  phaC is core: 5,371
  phaC is accessory: 1,959

# Save species summary
# Writes the per-species table as a tab-separated file without the row index.
# NOTE: out_path is reused from the earlier gene-cluster save and is
# overwritten here.
out_path = os.path.join(DATA_DIR, 'phb_species_summary.tsv')
species_summary.to_csv(out_path, sep='\t', index=False)
print(f'Saved {len(species_summary):,} species to {out_path}')

Saved 19,496 species to /home/aparkin/BERIL-research-observatory/projects/phb_granule_ecology/data/phb_species_summary.tsv

# List all tables in nmdc_arkin
# The resulting frame is reused later (via its 'tableName' column) to look for
# annotation-like tables.
nmdc_tables = spark.sql("SHOW TABLES IN nmdc_arkin").toPandas()
print(f'NMDC tables: {len(nmdc_tables)}')
# Bare expression: displays the frame as the cell output.
nmdc_tables

NMDC tables: 63

# Check trait_features columns — do any relate to PHB/PHA?
import re

# Keywords are matched against the START of each word in the column name, not
# as free substrings: plain substring matching flagged unrelated columns
# ('pha' inside 'aliphatic', 'carbon' inside 'hydrocarbon').
_TRAIT_KEYWORDS = ('pha', 'phb', 'poly', 'granule', 'storage', 'carbon', 'ferment')


def _is_phb_related(col_name):
    """Return True if any word of `col_name` starts with a PHB-related keyword.

    Words are the runs of letters/digits in the lower-cased name, so separators
    such as ':' and '_' split 'functional_group:fermentation' into
    'functional', 'group', 'fermentation'.
    """
    words = re.split(r'[^a-z0-9]+', col_name.lower())
    return any(word.startswith(_TRAIT_KEYWORDS) for word in words)


trait_schema = spark.sql("DESCRIBE nmdc_arkin.trait_features").toPandas()
print('trait_features columns:')
for _, row in trait_schema.iterrows():
    col = row['col_name']
    # Same column layout as before: '  *** name' for hits, six spaces otherwise.
    marker = '  ***' if _is_phb_related(col) else '     '
    print(f'{marker} {col} ({row["data_type"]})')

trait_features columns:
      cell_shape (double)
      oxygen_preference (double)
      functional_group:aerobic_chemoheterotrophy (double)
  *** functional_group:fermentation (double)
      functional_group:nitrate_denitrification (double)
      functional_group:nitrate_respiration (double)
      functional_group:nitrogen_fixation (double)
      functional_group:dark_thiosulfate_oxidation (double)
      functional_group:nitrate_reduction (double)
      functional_group:aromatic_compound_degradation (double)
  *** functional_group:aromatic_hydrocarbon_degradation (double)
      functional_group:arsenite_oxidation_energy_yielding (double)
      functional_group:cellulolysis (double)
      functional_group:dark_hydrogen_oxidation (double)
      functional_group:denitrification (double)
      functional_group:human_pathogens_all (double)
      functional_group:human_pathogens_pneumonia (double)
      functional_group:human_pathogens_septicemia (double)
      functional_group:invertebrate_parasites (double)
      functional_group:ligninolysis (double)
      functional_group:manganese_oxidation (double)
      functional_group:methanol_oxidation (double)
      functional_group:methanotrophy (double)
      functional_group:nitrite_respiration (double)
      functional_group:nitrous_oxide_denitrification (double)
      functional_group:oil_bioremediation (double)
      functional_group:plant_pathogen (double)
      functional_group:plastic_degradation (double)
      functional_group:ureolysis (double)
      functional_group:xylanolysis (double)
      functional_group:aerobic_nitrite_oxidation (double)
      functional_group:animal_parasites_or_symbionts (double)
      functional_group:chitinolysis (double)
      functional_group:knallgas_bacteria (double)
      functional_group:arsenite_oxidation_detoxification (double)
      functional_group:dark_sulfide_oxidation (double)
      functional_group:aerobic_anoxygenic_phototrophy (double)
      functional_group:nitrate_ammonification (double)
      functional_group:arsenate_detoxification (double)
  *** functional_group:aliphatic_non_methane_hydrocarbon_degradation (double)
      functional_group:dark_oxidation_of_sulfur_compounds (double)
      functional_group:dark_sulfur_oxidation (double)
      functional_group:iron_respiration (double)
      functional_group:human_pathogens_nosocomia (double)
      functional_group:anoxygenic_photoautotrophy_Fe_oxidizing (double)
      functional_group:anoxygenic_photoautotrophy_H2_oxidizing (double)
      functional_group:anoxygenic_photoautotrophy_S_oxidizing (double)
      functional_group:dark_iron_oxidation (double)
      functional_group:photoheterotrophy (double)
      cell_length (double)
      cell_width (double)
      functional_group:human_pathogens_gastroenteritis (double)
      functional_group:sulfite_respiration (double)
      functional_group:thiosulfate_respiration (double)
      functional_group:nitrite_denitrification (double)
      GC_content (double)
      gram_stain (double)
      motility (double)
      pH_optimum (double)
      pH_range (double)
      pigment_production (double)
      temperature_optimum (double)
      temperature_range (double)
      functional_group:methylotrophy (double)
      salt_optimum (double)
      salt_range (double)
      functional_group:dark_sulfite_oxidation (double)
      functional_group:anoxygenic_photoautotrophy (double)
  *** functional_group:hydrocarbon_degradation (double)
      spore_formation (double)
      functional_group:fumarate_respiration (double)
      functional_group:manganese_respiration (double)
      functional_group:sulfur_respiration (double)
      functional_group:human_pathogens_meningitis (double)
      functional_group:aerobic_ammonia_oxidation (double)
      functional_group:sulfate_respiration (double)
      functional_group:arsenate_respiration (double)
      functional_group:human_gut (double)
      functional_group:nitrite_ammonification (double)
      functional_group:chlorate_reducers (double)
      functional_group:dissimilatory_arsenite_oxidation (double)
      functional_group:reductive_acetogenesis (double)
      functional_group:methanogenesis_using_formate (double)
      functional_group:human_pathogens_diarrhea (double)
      functional_group:human_associated (double)
      functional_group:fish_parasites (double)
      functional_group:acetoclastic_methanogenesis (double)
      functional_group:methanogenesis_by_CO2_reduction_with_H2 (double)
      functional_group:methanogenesis_by_reduction_of_methyl_compounds_with_H2 (double)
      functional_group:hydrogenotrophic_methanogenesis (double)
      functional_group:methanogenesis (double)
      sample_id (string)

# Inspect the abiotic_features schema to see which environmental measurements
# are available; print one line per column with its data type.
abiotic_schema = spark.sql("DESCRIBE nmdc_arkin.abiotic_features").toPandas()
print(f'abiotic_features columns ({len(abiotic_schema)}):')
for col_name, data_type in zip(abiotic_schema['col_name'], abiotic_schema['data_type']):
    print(f'  {col_name} ({data_type})')

abiotic_features columns (22):
  sample_id (string)
  annotations_ammonium_has_numeric_value (double)
  annotations_ammonium_nitrogen_has_numeric_value (double)
  annotations_calcium_has_numeric_value (double)
  annotations_carb_nitro_ratio_has_numeric_value (double)
  annotations_chlorophyll_has_numeric_value (double)
  annotations_conduc_has_numeric_value (double)
  annotations_depth_has_maximum_numeric_value (double)
  annotations_depth_has_minimum_numeric_value (double)
  annotations_depth_has_numeric_value (double)
  annotations_diss_org_carb_has_numeric_value (double)
  annotations_diss_oxygen_has_numeric_value (double)
  annotations_magnesium_has_numeric_value (double)
  annotations_manganese_has_numeric_value (double)
  annotations_ph (double)
  annotations_potassium_has_numeric_value (double)
  annotations_samp_size_has_numeric_value (double)
  annotations_soluble_react_phosp_has_numeric_value (double)
  annotations_temp_has_numeric_value (double)
  annotations_tot_nitro_content_has_numeric_value (double)
  annotations_tot_org_carb_has_numeric_value (double)
  annotations_tot_phosp_has_numeric_value (double)

# Check study_table — what NMDC studies are available?
# Loads the whole study table into pandas for browsing.
studies = spark.sql("SELECT * FROM nmdc_arkin.study_table").toPandas()
print(f'NMDC studies: {len(studies)}')
# Bare expression: displays the frame as the cell output.
studies

NMDC studies: 48

# Size up taxonomy_features: row count (reported as samples) and number of
# schema columns, then preview the first columns.
tax_count_row = spark.sql("SELECT COUNT(*) as n FROM nmdc_arkin.taxonomy_features").collect()[0]
tax_count = tax_count_row['n']
tax_schema = spark.sql("DESCRIBE nmdc_arkin.taxonomy_features").toPandas()
n_tax_columns = len(tax_schema)
print(f'taxonomy_features: {tax_count:,} samples, {n_tax_columns} columns')
print('\nFirst 10 columns:')
tax_schema.head(10)

taxonomy_features: 6,365 samples, 3493 columns

First 10 columns:

# Look for per-sample functional annotation tables we might have missed.
# A table is a candidate when its name contains (case-insensitively) any of:
# 'annot', 'ko', 'kegg', 'function', 'gene', 'contig'.
name_pattern = 'annot|ko|kegg|function|gene|contig'
is_candidate = nmdc_tables['tableName'].str.contains(name_pattern, case=False, na=False)
annotation_tables = nmdc_tables[is_candidate]

print('Potential per-sample annotation tables:')
for tname in annotation_tables['tableName']:
    # Best-effort row count: a failure on one table is reported and the scan
    # continues with the next one.
    try:
        cnt = spark.sql(f"SELECT COUNT(*) as n FROM nmdc_arkin.{tname}").collect()[0]['n']
        print(f'  {tname}: {cnt:,} rows')
    except Exception as e:
        print(f'  {tname}: error - {e}')

Potential per-sample annotation tables:
  annotation_terms_unified: 67,353 rows
  kegg_ko_module: 2,814 rows
  kegg_ko_pathway: 15,617 rows
  kegg_ko_terms: 8,104 rows
  kegg_module_terms: 370 rows
  kegg_pathway_terms: 306 rows
  contig_taxonomy: 3,981,010,222 rows
  contig_taxonomy_backup: 9,009,525,315 rows
  annotation_crossrefs: 46,861 rows
  annotation_hierarchies_unified: 75,181 rows

# Verify that our PHB KOs exist in the NMDC KEGG KO reference table.
# phb_ko_list holds the KO ids joined by "', '"; the surrounding quotes in the
# query complete the SQL IN-list.
phb_ko_list = "', '".join(PHB_KOS)
ko_terms_query = f"""
    SELECT * FROM nmdc_arkin.kegg_ko_terms 
    WHERE ko_id IN ('{phb_ko_list}')
"""
nmdc_kos = spark.sql(ko_terms_query).toPandas()
print('PHB KEGG KOs in NMDC reference:')
nmdc_kos

# Inspect the metabolomics_gold schema, then search it for
# 3-hydroxybutyrate-related compounds.
metab_schema = spark.sql("DESCRIBE nmdc_arkin.metabolomics_gold").toPandas()
schema_view = metab_schema[['col_name', 'data_type']]
print('metabolomics_gold columns:')
print(schema_view.to_string(index=False))

# The main compound column is "Compound Name" (space and capitals), so it has
# to be backtick-quoted in SQL. Other name columns referenced in the query:
# "name", "Traditional Name", "Common Name".
name_col = '`Compound Name`'
hb_query = f"""
    SELECT DISTINCT {name_col}, `Common Name`, `Traditional Name`, name, kegg, smiles
    FROM nmdc_arkin.metabolomics_gold
    WHERE LOWER({name_col}) LIKE '%hydroxybutyrat%'
       OR LOWER({name_col}) LIKE '%phb%'
       OR LOWER(name) LIKE '%hydroxybutyrat%'
       OR LOWER(`Common Name`) LIKE '%hydroxybutyrat%'
    LIMIT 20
"""
hb_metabolites = spark.sql(hb_query).toPandas()
print('\n3-hydroxybutyrate-related metabolites in NMDC:')
print(hb_metabolites)

# Check ncbi_env harmonized_name categories (for pangenome environment data)
# Counts rows per harmonized_name, most frequent first.
env_categories = spark.sql("""
    SELECT harmonized_name, COUNT(*) as n
    FROM kbase_ke_pangenome.ncbi_env
    GROUP BY harmonized_name
    ORDER BY n DESC
""").toPandas()
print('NCBI environment metadata categories:')
# Bare expression: displays the frame as the cell output.
env_categories

	Description	KEGG_ko	COG_category	PFAMs	n_clusters	n_species
0	Phasin protein	-	S	Phasin_2	5484	2793
1	Poly-beta-hydroxybutyrate polymerase (PhaC) N-...	ko:K03821	I	Abhydrolase_1,PhaC_N	1441	1185
2	TIGRFAM phasin family protein	-	S	Phasin_2	1142	753
3	Acetoacetyl-CoA reductase	ko:K00023	IQ	adh_short_C2	1108	1066
4	Polyhydroxyalkanoate synthesis repressor PhaR	-	S	PHB_acc,PHB_acc_N	814	798
5	Poly-beta-hydroxybutyrate polymerase (PhaC) N-...	ko:K03821	I	PhaC_N	768	668
6	Poly(hydroxyalcanoate) granule associated prot...	-	S	Phasin	737	673
7	Acetoacetyl-CoA reductase	ko:K00023	IQ	adh_short,adh_short_C2	718	544
8	Poly-beta-hydroxybutyrate polymerase N terminal	ko:K03821	I	PHBC_N,PhaC_N	556	450
9	Polyhydroxyalkanoate synthesis repressor	-	S	PHB_acc,PHB_acc_N	448	443
10	Poly-beta-hydroxybutyrate polymerase	ko:K03821	I	PHBC_N,PhaC_N	416	355
11	Poly-beta-hydroxybutyrate polymerase	ko:K03821	I	Abhydrolase_1,PhaC_N	349	334
12	Poly-beta-hydroxybutyrate polymerase (PhaC) N-...	ko:K03821	I	Abhydrolase_1	348	325
13	TIGRFAM polyhydroxyalkanoate depolymerase, int...	ko:K05973	I	PHB_depo_C	336	309
14	poly-beta-hydroxybutyrate polymerase	ko:K03821	I	Abhydrolase_1,PhaC_N	242	228
15	Polyhydroxyalkanoate depolymerase, intracellular	ko:K05973	I	PHB_depo_C	235	231
16	Phasin, PhaP	-	S	Phasin_2	185	175
17	phasin family	-	S	Phasin_2	178	177
18	Polyhydroxyalkanoate synthesis repressor PhaR	ko:K01654	M	NeuB,SAF	155	142
19	Poly-beta-hydroxybutyrate	ko:K03821	I	PhaC_N	144	134

	n_clusters	n_species	pct_core	pct_aux	pct_singleton
phb_gene
phaA	86318	17969	62.2	37.8	26.3
phaB	9617	6977	68.0	32.0	21.0
phaC	11792	6067	70.7	29.3	19.4
phaP	6130	4571	73.3	26.7	16.7
phaZ	4656	3151	80.4	19.6	12.8

	study_id	name	description	ecosystem	ecosystem_category	ecosystem_type	ecosystem_subtype	specific_ecosystem	principal_investigator_has_raw_value	principal_investigator_profile_image_url	...	part_of	principal_investigator_email	study_image	insdc_bioproject_identifiers	homepage_website	gnps_task_identifiers	jgi_portal_study_identifiers	notes	emsl_project_identifiers	alternative_names
0	nmdc:sty-11-8fb6t785	Deep subsurface shale carbon reservoir microbi...	This project aims to improve the understanding...	Environmental	Terrestrial	Deep subsurface	Unclassified	Unclassified	Kelly Wrighton	https://portal.nersc.gov/project/m3408/profile...	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	nmdc:sty-11-33fbta56	Peatland microbial communities from Minnesota,...	This study is part of the Spruce and Peatland ...	Environmental	Aquatic	Freshwater	Wetlands	Unclassified	Christopher Schadt	https://portal.nersc.gov/project/m3408/profile...	...	["nmdc:sty-11-cytnjc39"]	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	nmdc:sty-11-aygzgv51	Riverbed sediment microbial communities from t...	This research project aimed to understand how ...	Environmental	Aquatic	Freshwater	River	Sediment	James Stegen	https://portal.nersc.gov/project/m3408/profile...	...	["nmdc:sty-11-x4aawf73", "nmdc:sty-11-xcbexm97"]	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	nmdc:sty-11-34xj1150	National Ecological Observatory Network: soil ...	This study contains the quality-controlled lab...	NaN	NaN	NaN	NaN	NaN	Kate Thibault	https://portal.nersc.gov/project/m3408/profile...	...	["nmdc:sty-11-nxrz9m96"]	kthibault@battelleecology.org	[{"url": "https://portal.nersc.gov/project/m34...	["bioproject:PRJNA406974", "bioproject:PRJNA10...	["https://www.neonscience.org/"]	NaN	NaN	NaN	NaN	NaN
4	nmdc:sty-11-076c9980	Lab enrichment of tropical soil microbial comm...	This study is part of the Microbes Persist: Sy...	Environmental	Terrestrial	Soil	Unclassified	Forest Soil	Jennifer Pett-Ridge	https://portal.nersc.gov/project/m3408/profile...	...	["nmdc:sty-11-msexsy29"]	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	nmdc:sty-11-t91cwb40	Determining the genomic basis for interactions...	The goal of this work is to develop the knowle...	NaN	NaN	NaN	NaN	NaN	Michelle O'Malley	https://chemengr.ucsb.edu/sites/default/files/...	...	NaN	momalley@engineering.ucsb.edu	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
6	nmdc:sty-11-5bgrvr62	Freshwater microbial communities from Lake Men...	The goal of this study is to examine long-term...	NaN	NaN	NaN	NaN	NaN	Katherine McMahon	https://portal.nersc.gov/project/m3408/profile...	...	NaN	tmcmahon@cae.wisc.edu	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
7	nmdc:sty-11-5tgfr349	Freshwater microbial communities from rivers f...	Streams and rivers represent key functioning u...	Environmental	Aquatic	Freshwater	River	Unclassified	Kelly Wrighton	https://portal.nersc.gov/project/m3408/profile...	...	["nmdc:sty-11-x4aawf73", "nmdc:sty-11-xcbexm97"]	kwrighton@gmail.com	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
8	nmdc:sty-11-dcqce727	Bulk soil microbial communities from the East ...	This research project aimed to understand how ...	Environmental	Terrestrial	Soil	Meadow	Bulk soil	Eoin Brodie	https://portal.nersc.gov/project/m3408/profile...	...	["nmdc:sty-11-2zhqs261"]	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
9	nmdc:sty-11-1t150432	Populus root and rhizosphere microbial communi...	This study is part of the Plant-Microbe Interf...	Host-associated	Plants	Unclassified	Unclassified	Unclassified	Mitchel J. Doktycz	https://portal.nersc.gov/project/m3408/profile...	...	["nmdc:sty-11-f1he1955"]	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10	nmdc:sty-11-zs2syx06	Meadow Soil Metagenomes from the Angelo Coast ...	Assembled metagenomes for 60 soil samples acro...	NaN	NaN	NaN	NaN	NaN	Jillian F. Banfield	https://ourenvironment.berkeley.edu/sites/oure...	...	["nmdc:sty-11-abkzcd11"]	jbanfield@berkeley.edu	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
11	nmdc:sty-11-r2h77870	Roots, rhizosphere and bulk soil microbial com...	The goal of this Bio-Scales Pilot Project stud...	Environmental	Terrestrial	Soil	Unclassified	Unclassified	Mitchel J. Doktycz	https://portal.nersc.gov/project/m3408/profile...	...	NaN	doktyczmj@ornl.gov	NaN	NaN	NaN	["gnps.task:4b848c342a4f4abc871bdf8a09a60807",...	["jgi.proposal:507130"]	NaN	NaN	NaN
12	nmdc:sty-11-28tm5d36	1000 Soils Research Campaign	The 1,000 Soils Pilot is composed of crowdsour...	NaN	NaN	NaN	NaN	NaN	Emily Graham	https://portal.nersc.gov/project/m3408/profile...	...	NaN	emily.graham@pnnl.gov	[{"url": "https://portal.nersc.gov/project/m34...	["bioproject:PRJNA1260013"]	["https://www.emsl.pnnl.gov/project/60141"]	NaN	["jgi.proposal:508306"]	NaN	NaN	NaN
13	nmdc:sty-11-547rwq94	NaN	The Earth Microbiome Project (EMP) is a massiv...	NaN	NaN	NaN	NaN	NaN	Jack A. Gilbert, Janet K. Jansson, Rob Knight	NaN	...	NaN	janet.jansson@pnnl.gov	[{"url": "https://portal.nersc.gov/project/m34...	NaN	["https://earthmicrobiome.org"]	NaN	NaN	NaN	NaN	NaN
14	nmdc:sty-11-hht5sb92	National Ecological Observatory Network: surfa...	This study contains the quality-controlled lab...	NaN	NaN	NaN	NaN	NaN	Kate Thibault	NaN	...	["nmdc:sty-11-nxrz9m96"]	kthibault@battelleecology.org	[{"url": "https://portal.nersc.gov/project/m34...	["bioproject:PRJNA406977"]	["https://www.neonscience.org/"]	NaN	NaN	NaN	NaN	NaN
15	nmdc:sty-11-pzmd0x14	National Ecological Observatory Network: benth...	This study contains the primary field and qual...	NaN	NaN	NaN	NaN	NaN	Kate Thibault	NaN	...	["nmdc:sty-11-nxrz9m96"]	kthibault@battelleecology.org	[{"url": "https://portal.nersc.gov/project/m34...	["bioproject:PRJNA406976"]	["https://www.neonscience.org/"]	NaN	NaN	NaN	NaN	NaN
16	nmdc:sty-11-db67n062	Geochemical controls on soil resiliency to car...	This research project aimed to understand perm...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/profile...	...	NaN	ralybrand@ucdavis.edu	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
17	nmdc:sty-11-8xdqsn54	Coupling spectral techniques; Molecular charac...	This User proposal will facilitate the complet...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/profile...	...	NaN	sanclements@battelle.org	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
18	nmdc:sty-11-hdd4bf83	Colonization resistance against Candida	Identify metabolic pathways in the microbiota ...	NaN	NaN	NaN	NaN	NaN	Andreas Baumler	https://health.ucdavis.edu/medmicro/includes/i...	...	NaN	ajbaumler@ucdavis.edu	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
19	nmdc:sty-11-2zhqs261	NaN	Climate change, extreme weather, land-use chan...	NaN	NaN	NaN	NaN	NaN	Eoin Brodie	https://portal.nersc.gov/project/m3408/profile...	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
20	nmdc:sty-11-xcbexm97	NaN	The Worldwide Hydrobiogeochemistry Observation...	NaN	NaN	NaN	NaN	NaN	Amy Goldman, James C. Stegen	NaN	...	NaN	NaN	[{"url": "https://www.pnnl.gov/sites/default/f...	NaN	["https://www.pnnl.gov/projects/WHONDRS"]	NaN	NaN	NaN	NaN	NaN
21	nmdc:sty-11-x4aawf73	NaN	The Pacific Northwest National Laboratory (PNN...	NaN	NaN	NaN	NaN	NaN	Timothy Scheibe	https://www.pnnl.gov/sites/default/files/style...	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
22	nmdc:sty-11-f1he1955	NaN	The goal of the Plant-Microbe Interfaces SFA i...	NaN	NaN	NaN	NaN	NaN	Mitch Doktycz	https://www.ornl.gov/sites/default/files/style...	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
23	nmdc:sty-11-cytnjc39	NaN	The Terrestrial Ecosystem Science SFA supports...	NaN	NaN	NaN	NaN	NaN	Paul J. Hanson	https://www.ornl.gov/sites/default/files/style...	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
24	nmdc:sty-11-msexsy29	NaN	The LLNL Soil Microbiome Science Focus Area (S...	NaN	NaN	NaN	NaN	NaN	Jennifer Pett-Ridge	https://portal.nersc.gov/project/m3408/profile...	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25	nmdc:sty-11-nxrz9m96	NaN	The National Science Foundation's National Eco...	NaN	NaN	NaN	NaN	NaN	Kate Thibault	NaN	...	NaN	NaN	[{"url": "https://portal.nersc.gov/project/m34...	NaN	["https://www.neonscience.org/"]	NaN	NaN	NaN	NaN	NaN
26	nmdc:sty-11-e4yb9z58	Seasonal activities of the phyllosphere microb...	Understanding the interactions between plants ...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	["nmdc:sty-11-y1kdd163"]	shade.ashley@gmail.com	NaN	NaN	NaN	NaN	["jgi.proposal:503249"]	NaN	NaN	NaN
27	nmdc:sty-11-abkzcd11	NaN	We aim to understand and control the microscop...	NaN	NaN	NaN	NaN	NaN	Trent R. Northen	https://biosciences.lbl.gov/wp-content/uploads...	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
28	nmdc:sty-11-ev70y104	EcoFAB 2.0 Root Microbiome Ring Trial	Leveraging microbiomes to promote soil health ...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	NaN	trnorthen@lbl.gov	NaN	["bioproject:PRJNA1151037"]	NaN	["gnps.task:2ccbf82840724c99a2acc2c9e512a302"]	NaN	The submission contains MassIVE dataset with r...	NaN	NaN
29	nmdc:sty-11-8ws97026	Molecular mechanisms underlying changes in the...	Global climate changes is transforming our soi...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	NaN	jlb@umass.edu	NaN	NaN	NaN	NaN	["jgi.proposal:503125"]	NaN	["emsl.project:49483"]	NaN
30	nmdc:sty-11-fkbnah04	Freshwater microbial communities from oligotro...	Study of oligotrophic, dystrophic, and eutroph...	NaN	NaN	NaN	NaN	NaN	Katherine McMahon	NaN	...	NaN	tmcmahon@engr.wisc.edu	NaN	NaN	NaN	NaN	["jgi.proposal:1977"]	NaN	NaN	NaN
31	nmdc:sty-11-prtb4s28	Arabidopsis, maize, boechera and miscanthus rh...	Plant associated metagenomes--Microbial commun...	NaN	NaN	NaN	NaN	NaN	Jeff Dangl	NaN	...	NaN	dangl@email.unc.edu	NaN	NaN	NaN	NaN	["jgi.proposal:584"]	NaN	NaN	NaN
32	nmdc:sty-11-dwsv7q78	Microbial regulation of soil water repellency ...	We investigated the direct effect of microbial...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	NaN	marie.kroeger7@gmail.com	NaN	NaN	NaN	NaN	["jgi.proposal:506588"]	NaN	NaN	NaN
33	nmdc:sty-11-y1kdd163	Great Lakes Bioenergy Research Center (GLBRC)	GLBRC is developing science and technological ...	NaN	NaN	NaN	NaN	NaN	Timothy Donohue	NaN	...	NaN	NaN	[{"type": "nmdc:ImageValue", "url": "https://p...	NaN	["https://www.glbrc.org"]	NaN	NaN	NaN	NaN	NaN
34	nmdc:sty-11-3cmn1g53	Center for Advanced Bioenergy and Bioproducts ...	CABBI is integrating recent advances in agrono...	NaN	NaN	NaN	NaN	NaN	Andrew Leakey	NaN	...	NaN	NaN	[{"type": "nmdc:ImageValue", "url": "https://p...	NaN	["https://cabbi.bio/"]	NaN	NaN	NaN	NaN	NaN
35	nmdc:sty-11-cssvjy19	Center for Bioenergy Innovation (CBI)	CBI is accelerating the domestication of bioen...	NaN	NaN	NaN	NaN	NaN	Jerry Tuskan	NaN	...	NaN	NaN	[{"type": "nmdc:ImageValue", "url": "https://p...	NaN	["https://cbi.ornl.gov/"]	NaN	NaN	NaN	NaN	NaN
36	nmdc:sty-11-ggfd7z76	Joint BioEnergy Institute (JBEI)	JBEI is using the latest tools in molecular bi...	NaN	NaN	NaN	NaN	NaN	Jay Keasling	NaN	...	NaN	NaN	[{"type": "nmdc:ImageValue", "url": "https://p...	NaN	["https://www.jbei.org"]	NaN	NaN	NaN	NaN	NaN
37	nmdc:sty-11-srtxhh77	MONet	The Molecular Observation Network (MONet) is a...	NaN	NaN	NaN	NaN	NaN	John Bargar	NaN	...	NaN	NaN	[{"type": "nmdc:ImageValue", "url": "https://p...	NaN	["https://www.emsl.pnnl.gov/monet"]	NaN	NaN	NaN	NaN	NaN
38	nmdc:sty-11-kjs8av65	COMPASS - Field, Measurements, and Experiments	COMPASS-FME is developing a predictive underst...	NaN	NaN	NaN	NaN	NaN	Vanessa Bailey	NaN	...	NaN	NaN	[{"type": "nmdc:ImageValue", "url": "https://p...	NaN	["https://compass.pnnl.gov/FME/COMPASSFME"]	NaN	NaN	NaN	NaN	NaN
39	nmdc:sty-11-wbc14h22	Switchgrass cropping systems affect soil carbo...	Understanding how the rhizosphere microbiome i...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	["nmdc:sty-11-y1kdd163"]	m.friesen@wsu.edu	NaN	["bioproject:PRJNA733109", "bioproject:PRJNA73...	NaN	NaN	NaN	NaN	NaN	NaN
40	nmdc:sty-11-vh2hty57	The Impact of Stand Age and Fertilization on t...	Yield of the perennial grass Miscanthus × giga...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	["nmdc:sty-11-3cmn1g53"]	adina@iastate.edu	NaN	["bioproject:PRJNA601860"]	NaN	NaN	NaN	NaN	NaN	NaN
41	nmdc:sty-11-kfvmk798	Chronic drought differentially alters the belo...	Populus trichocarpa is an ecologically importa...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	["nmdc:sty-11-cssvjy19"]	creggerma@ornl.gov	NaN	["bioproject:PRJNA784967"]	NaN	NaN	NaN	NaN	NaN	NaN
42	nmdc:sty-11-n7mtj961	Impact of modulating bioenergy traits on the s...	This dataset comprises 16S amplicon sequences ...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	["nmdc:sty-11-ggfd7z76"]	hscheller@lbl.gov	NaN	["bioproject:PRJNA1205755"]	NaN	NaN	NaN	NaN	NaN	["Impact of modulating bioenergy traits on the...
43	nmdc:sty-11-46aje659	Panicgrass rhizosphere soil microbial communit...	Evaluating effects of drought stress on Panicu...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	NaN	esther.singer13@gmail.com	NaN	NaN	NaN	NaN	["jgi.proposal:504603"]	NaN	NaN	NaN
44	nmdc:sty-11-h1m9nj62	Seawater and ice microbial communities from Ar...	The goal of this study is to enhance understan...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/submiss...	...	NaN	t.mock@uea.ac.uk	NaN	NaN	NaN	NaN	["jgi.proposal:505419"]	NaN	NaN	NaN
45	nmdc:sty-11-rh9tya90	Effects of warming and drought on the microbia...	Northern peatlands store carbon as thick layer...	NaN	NaN	NaN	NaN	NaN	NaN	https://portal.nersc.gov/project/m3408/profile...	...	NaN	joel.kostka@biology.gatech.edu	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
46	nmdc:sty-11-b0ykqz91	2014 Lake Erie Harmful Algal Bloom	Lake Erie experiences annual harmful algal blo...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	gdick@umich.edu	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
47	nmdc:sty-11-j05pc998	Soil microbial communities from watershed of U...	In mountainous systems with seasonal snow-cove...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	["nmdc:sty-11-2zhqs261"]	elbrodie@lbl.gov	NaN	NaN	NaN	NaN	["jgi.proposal:505800"]	NaN	NaN	NaN

01 Phb Gene Discovery

NB01: PHB Gene Discovery and Data Exploration¶

PHB Pathway Markers¶

Part 1: PHB Gene Discovery in the Pangenome¶

Part 2: Species-Level PHB Pathway Completeness¶

Part 3: NMDC Schema Exploration¶

Summary and Next Steps¶

Pangenome Results¶

NMDC Data Availability¶

Next Notebook (NB02)¶

Gene	Function	KEGG KO	Specificity
phaC	PHA synthase (committed step)	K03821	PHB-specific
phaP	Phasin (granule protein)	K14205	PHB-specific
phaR	PHB transcriptional regulator	K18080	PHB-specific
phaZ	PHB depolymerase	K05973	PHB-specific
phaA	Beta-ketothiolase	K00626	Shared with fatty acid metabolism
phaB	Acetoacetyl-CoA reductase	K00023	Shared with SDR family

	namespace	tableName	isTemporary
0	nmdc_arkin	annotation_terms_unified	False
1	nmdc_arkin	cog_categories	False
2	nmdc_arkin	cog_hierarchy_flat	False
3	nmdc_arkin	ec_hierarchy_flat	False
4	nmdc_arkin	ec_hierarchy_graph	False
...	...	...	...
58	nmdc_arkin	trait_unified	False
59	nmdc_arkin	annotation_hierarchies_unified	False
60	nmdc_arkin	abiotic_features	False
61	nmdc_arkin	metabolomics_gold	False
62	nmdc_arkin	metatranscriptomics_gold	False

	col_name	data_type	comment
0	7	double	NaN
1	11	double	NaN
2	33	double	NaN
3	34	double	NaN
4	35	double	NaN
5	41	double	NaN
6	43	double	NaN
7	48	double	NaN
8	52	double	NaN
9	56	double	NaN