import pandas as pd

# Session handle used by every Spark SQL query in the cells below.
spark = get_spark_session()

# KEGG KO identifiers for the PHB genes of interest, each mapped to a
# human-readable "gene - product" label.
PHB_KOS = {
    "K03821": "phaC - PHA synthase (committed step)",
    "K00023": "phaB - acetoacetyl-CoA reductase",
    "K00626": "phaA - beta-ketothiolase",
    "K05973": "phaZ - PHB depolymerase",
    "K14205": "phaP - phasin (granule protein)",
    "K18080": "phaR - PHB transcriptional regulator",
}

print("Ready")

Ready

# Cell 23 fix: look up our PHB KO identifiers in the NMDC KEGG KO reference table
# Iterating the dict yields its keys. Joining on "', '" produces the inner part
# of a quoted SQL list; the outermost quotes come from the query template.
phb_ko_list = "', '".join(PHB_KOS)
ko_lookup_sql = f"""
    SELECT * FROM nmdc_arkin.kegg_ko_terms 
    WHERE ko_id IN ('{phb_ko_list}')
"""
nmdc_kos = spark.sql(ko_lookup_sql).toPandas()
print('PHB KEGG KOs in NMDC reference:')
# Bare expression so the notebook renders the result table.
nmdc_kos

PHB KEGG KOs in NMDC reference:

# Cell 24 fix: Check metabolomics_gold schema and search for 3-hydroxybutyrate
#
# The previous version searched only the first column whose name contained a
# keyword such as "name". For this table that column is `file_name`, so the
# search ran against file names and came back empty. It also interpolated the
# column name unquoted, which cannot work for columns containing spaces
# (e.g. "Compound Name"). This version searches every column that looks like
# a metabolite/compound name and backtick-quotes each identifier.
metab_schema = spark.sql("DESCRIBE nmdc_arkin.metabolomics_gold").toPandas()
print('metabolomics_gold columns:')
print(metab_schema[['col_name', 'data_type']].to_string(index=False))

# A column is a candidate when its name contains one of these keywords ...
NAME_KEYWORDS = ('name', 'compound', 'metabolite', 'label')
# ... unless it describes the file, sample or reference database,
NON_METABOLITE_KEYWORDS = ('file', 'sample', 'database')
# ... or is an identifier column such as "Kegg Compound ID".
ID_SUFFIXES = (' id', '_id')
# '%hydroxybutyr%' also matches every '%hydroxybutyrate%' hit, so the
# narrower pattern is not repeated.
SEARCH_PATTERNS = ('%hydroxybutyr%', '%phb%')


def _is_metabolite_name_column(col_name):
    """Return True if *col_name* plausibly holds a metabolite/compound name."""
    lowered = col_name.strip().lower()
    # DESCRIBE can emit blank rows and '#'-prefixed section headers
    # (e.g. "# col_name") for partitioned tables; those are not columns.
    if not lowered or lowered.startswith('#'):
        return False
    if not any(kw in lowered for kw in NAME_KEYWORDS):
        return False
    if any(kw in lowered for kw in NON_METABOLITE_KEYWORDS):
        return False
    return not lowered.endswith(ID_SUFFIXES)


def _quote_identifier(col_name):
    """Backtick-quote *col_name* so Spark SQL accepts spaces and symbols."""
    return '`' + col_name.replace('`', '``') + '`'


# dict.fromkeys drops repeated names while keeping schema order.
name_cols = list(dict.fromkeys(
    c for c in metab_schema['col_name'] if _is_metabolite_name_column(c)
))
print(f'\nPotential name columns: {name_cols}')

if name_cols:
    # Still defined in case later cells reference it; it now points at a
    # real metabolite-name column instead of `file_name`.
    name_col = name_cols[0]

    # One LIKE test per (column, pattern) pair, OR-ed together.
    where_clause = '\n           OR '.join(
        f"LOWER({_quote_identifier(col)}) LIKE '{pattern}'"
        for col in name_cols
        for pattern in SEARCH_PATTERNS
    )
    hb_metabolites = spark.sql(f"""
        SELECT DISTINCT *
        FROM nmdc_arkin.metabolomics_gold
        WHERE {where_clause}
        LIMIT 20
    """).toPandas()
    print(f'\n3-hydroxybutyrate-related metabolites in NMDC (searched {name_cols}):')
    print(hb_metabolites)
else:
    print('\nNo obvious name column found. Showing sample rows:')
    sample = spark.sql("SELECT * FROM nmdc_arkin.metabolomics_gold LIMIT 3").toPandas()
    print(sample)

metabolomics_gold columns:
                                    col_name data_type
                                     file_id    string
                                   file_name    string
                                  feature_id    string
                            Apex Scan Number    double
                                        Area    double
Associated Mass Features after Deconvolution    string
                              Calculated m/z    double
                            Confidence Score    double
                            Dispersity Index    double
                          Entropy Similarity    double
                                   Intensity    double
                                 Ion Formula    string
                                    Ion Type    string
          Is Largest Ion after Deconvolution   boolean
                     Isotopologue Similarity    double
                           Isotopologue Type    string
             Library mzs in Query (fraction)    double
                                MS2 Spectrum    string
                             Mass Feature ID    bigint
                           Molecular Formula    string
                Monoisotopic Mass Feature ID    double
                                 Persistence    double
                                    Polarity    string
                        Retention Time (min)    double
                                 Sample Name    string
                 Spectra with Annotation (n)    double
                              Tailing Factor    double
                                       chebi    double
                               database_name    string
                                  final_scan    bigint
                                       inchi    string
                                    inchikey    string
                                        kegg    string
                                         m/z    double
                             m/z Error (ppm)    double
                             m/z Error Score    double
                                        name    string
                                 noise_score    double
                             noise_score_max    double
                             noise_score_min    double
                 normalized_dispersity_index    double
                                   ref_ms_id    string
                                      smiles    string
                                  start_scan    bigint
                                   Peak Area    double
                            Traditional Name    string
                   Spectral Similarity Score    string
                            Similarity Score    string
                          Retention Time Ref    string
                       Retention Index Score    string
                            Kegg Compound ID    string
                         Retention index Ref    string
                             Retention index    string
                     Half Height Width (min)    string
                               Compound Name    string
                                  IUPAC Name    string
                                   Inchi Key    string
                              Retention Time    string
                                 Peak Height    double
                              Derivatization    string
                                    Chebi ID    string
                                  Peak Index    string
                                 Common Name    string

Potential name columns: ['file_name', 'Sample Name', 'database_name', 'name', 'Traditional Name', 'Kegg Compound ID', 'Compound Name', 'IUPAC Name', 'Common Name']

3-hydroxybutyrate-related metabolites in NMDC (using file_name):
Empty DataFrame
Columns: [file_id, file_name, feature_id, Apex Scan Number, Area, Associated Mass Features after Deconvolution, Calculated m/z, Confidence Score, Dispersity Index, Entropy Similarity, Intensity, Ion Formula, Ion Type, Is Largest Ion after Deconvolution, Isotopologue Similarity, Isotopologue Type, Library mzs in Query (fraction), MS2 Spectrum, Mass Feature ID, Molecular Formula, Monoisotopic Mass Feature ID, Persistence, Polarity, Retention Time (min), Sample Name, Spectra with Annotation (n), Tailing Factor, chebi, database_name, final_scan, inchi, inchikey, kegg, m/z, m/z Error (ppm), m/z Error Score, name, noise_score, noise_score_max, noise_score_min, normalized_dispersity_index, ref_ms_id, smiles, start_scan, Peak Area, Traditional Name, Spectral Similarity Score, Similarity Score, Retention Time Ref, Retention Index Score, Kegg Compound ID, Retention index Ref, Retention index, Half Height Width (min), Compound Name, IUPAC Name, Inchi Key, Retention Time, Peak Height, Derivatization, Chebi ID, Peak Index, Common Name]
Index: []

[0 rows x 63 columns]

# Cell 25: Count ncbi_env rows per harmonized_name, most frequent first
env_category_sql = """
    SELECT harmonized_name, COUNT(*) as n
    FROM kbase_ke_pangenome.ncbi_env
    GROUP BY harmonized_name
    ORDER BY n DESC
"""
env_categories = spark.sql(env_category_sql).toPandas()
print('NCBI environment metadata categories:')
# Bare expression so the notebook renders the result table.
env_categories

NCBI environment metadata categories:

	harmonized_name	n
0	NaN	1600369
1	collection_date	273042
2	geo_loc_name	272707
3	isolation_source	245435
4	strain	205650
...	...	...
329	turbidity	1
330	link_class_info	1
331	microbial_biomass_meth	1
332	standing_water_regm	1
333	drug_usage	1

01B Fix Remaining Cells

NB01b: Run remaining cells from NB01

	ko_id	name	description	category
0	K00023
1	K00626
2	K03821
3	K05973
4	K14205