# On BERDL JupyterHub — get_spark_session() is injected into the kernel; no import needed
spark = get_spark_session()
spark

<pyspark.sql.connect.session.SparkSession at 0x7ba9c9a416a0>

import os
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

PROJECT_DIR = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
DATA_DIR    = os.path.join(PROJECT_DIR, 'data')
FIGURES_DIR = os.path.join(PROJECT_DIR, 'figures')
BRIDGE_TBL  = 'nmdc_arkin.omics_files_table'

print(f'DATA_DIR:    {DATA_DIR}')
print(f'FIGURES_DIR: {FIGURES_DIR}')

DATA_DIR:    /home/cjneely/repos/BERIL-research-observatory/projects/nmdc_community_metabolic_ecology/data
FIGURES_DIR: /home/cjneely/repos/BERIL-research-observatory/projects/nmdc_community_metabolic_ecology/figures

inventory     = pd.read_csv(os.path.join(DATA_DIR, 'nmdc_sample_inventory.csv'))
bridge_quality = pd.read_csv(os.path.join(DATA_DIR, 'bridge_quality.csv'))
community_matrix = pd.read_csv(os.path.join(DATA_DIR, 'community_pathway_matrix.csv'))

# Overlap samples = those whose clf_file passes bridge QC
qc_pass_files   = set(bridge_quality[bridge_quality['passes_bridge_qc']]['file_id'])
overlap_samples = set(
    inventory[inventory['clf_file_id'].isin(qc_pass_files)]['sample_id']
)
print(f'Overlap samples: {len(overlap_samples)}')
print(f'Community matrix samples: {community_matrix["sample_id"].nunique()}')

# Met file IDs for overlap samples (may be multiple per sample)
overlap_inv = inventory[inventory['sample_id'].isin(overlap_samples)].copy()
overlap_met_file_ids = overlap_inv['met_file_id'].unique().tolist()
print(f'\nUnique met_file_ids for overlap samples: {len(overlap_met_file_ids)}')
print(f'Met files per sample: avg={len(overlap_met_file_ids)/len(overlap_samples):.1f}')

# Ionization mode / column breakdown from file names
fnames = overlap_inv['met_file_name']
print(f'\nFile name patterns:')
print(f'  HILICZ: {fnames.str.contains("HILICZ", na=False).sum()}')
print(f'  C18:    {fnames.str.contains("C18", na=False).sum()}')
print(f'  POS:    {fnames.str.contains("_POS_", na=False).sum()}')
print(f'  NEG:    {fnames.str.contains("_NEG_", na=False).sum()}')

Overlap samples: 220
Community matrix samples: 220

Unique met_file_ids for overlap samples: 644
Met files per sample: avg=2.9

File name patterns:
  HILICZ: 252
  C18:    252
  POS:    252
  NEG:    252

spark.sql("select name, Intensity, \"Peak Area\" from nmdc_arkin.metabolomics_gold where chebi is not null limit 5;").show()

+--------------------+------------+---------+
|                name|   Intensity|Peak Area|
+--------------------+------------+---------+
|       Carvone, (-)-|  2504080.25|Peak Area|
|           Lactulose|1.70560928E8|Peak Area|
|              Maltol|   8972314.0|Peak Area|
|13-keto-9Z,11E-oc...|   4.65551E7|Peak Area|
|      Pinolenic acid|   8816448.0|Peak Area|
+--------------------+------------+---------+

print('=== metabolomics_gold schema ===')
spark.sql('DESCRIBE nmdc_arkin.metabolomics_gold').show(50, truncate=False)

=== metabolomics_gold schema ===
+--------------------------------------------+---------+-------+
|col_name                                    |data_type|comment|
+--------------------------------------------+---------+-------+
|file_id                                     |string   |NULL   |
|file_name                                   |string   |NULL   |
|feature_id                                  |string   |NULL   |
|Apex Scan Number                            |double   |NULL   |
|Area                                        |double   |NULL   |
|Associated Mass Features after Deconvolution|string   |NULL   |
|Calculated m/z                              |double   |NULL   |
|Confidence Score                            |double   |NULL   |
|Dispersity Index                            |double   |NULL   |
|Entropy Similarity                          |double   |NULL   |
|Intensity                                   |double   |NULL   |
|Ion Formula                                 |string   |NULL   |
|Ion Type                                    |string   |NULL   |
|Is Largest Ion after Deconvolution          |boolean  |NULL   |
|Isotopologue Similarity                     |double   |NULL   |
|Isotopologue Type                           |string   |NULL   |
|Library mzs in Query (fraction)             |double   |NULL   |
|MS2 Spectrum                                |string   |NULL   |
|Mass Feature ID                             |bigint   |NULL   |
|Molecular Formula                           |string   |NULL   |
|Monoisotopic Mass Feature ID                |double   |NULL   |
|Persistence                                 |double   |NULL   |
|Polarity                                    |string   |NULL   |
|Retention Time (min)                        |double   |NULL   |
|Sample Name                                 |string   |NULL   |
|Spectra with Annotation (n)                 |double   |NULL   |
|Tailing Factor                              |double   |NULL   |
|chebi                                       |double   |NULL   |
|database_name                               |string   |NULL   |
|final_scan                                  |bigint   |NULL   |
|inchi                                       |string   |NULL   |
|inchikey                                    |string   |NULL   |
|kegg                                        |string   |NULL   |
|m/z                                         |double   |NULL   |
|m/z Error (ppm)                             |double   |NULL   |
|m/z Error Score                             |double   |NULL   |
|name                                        |string   |NULL   |
|noise_score                                 |double   |NULL   |
|noise_score_max                             |double   |NULL   |
|noise_score_min                             |double   |NULL   |
|normalized_dispersity_index                 |double   |NULL   |
|ref_ms_id                                   |string   |NULL   |
|smiles                                      |string   |NULL   |
|start_scan                                  |bigint   |NULL   |
|Peak Area                                   |double   |NULL   |
|Traditional Name                            |string   |NULL   |
|Spectral Similarity Score                   |string   |NULL   |
|Similarity Score                            |string   |NULL   |
|Retention Time Ref                          |string   |NULL   |
|Retention Index Score                       |string   |NULL   |
+--------------------------------------------+---------+-------+
only showing top 50 rows

# Sample 5 rows to understand data layout
# Use LIMIT — metabolomics_gold has 3.1M rows
print('=== Sample rows from metabolomics_gold ===')
spark.sql("""
    SELECT *
    FROM nmdc_arkin.metabolomics_gold
    LIMIT 5
""").show(truncate=False)

=== Sample rows from metabolomics_gold ===
+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------+----------------+------------------+---------------------------------------------------------------------------+------------------+------------------+------------------+------------------+------------+----------------+--------+----------------------------------+-----------------------+-----------------+-------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-----------------+----------------------------+------------+--------+--------------------+-------------------------------------------------------------------------------------------------------------------------------+---------------------------+------------------+-----+-------------+----------+-----+--------+----+------------------+-------------------+------------------+----+------------------+------------------+------------------+---------------------------+---------+------+----------+---------+----------------+-------------------------+----------------+------------------+---------------------+----------------+-------------------+---------------+-----------------------+-------------+----------+---------+--------------+-----------+--------------+--------+----------+-----------+
|file_id              |file_name                                                                                                                          |feature_id|Apex Scan Number|Area              |Associated Mass Features after Deconvolution                               |Calculated m/z    |Confidence Score  |Dispersity Index  |Entropy Similarity|Intensity   |Ion Formula     |Ion Type|Is Largest Ion after Deconvolution|Isotopologue Similarity|Isotopologue Type|Library mzs in Query (fraction)|MS2 Spectrum                                                                                                                                                                                                                                                   |Mass Feature ID|Molecular Formula|Monoisotopic Mass Feature ID|Persistence |Polarity|Retention Time (min)|Sample Name                                                                                                                    |Spectra with Annotation (n)|Tailing Factor    |chebi|database_name|final_scan|inchi|inchikey|kegg|m/z               |m/z Error (ppm)    |m/z Error Score   |name|noise_score       |noise_score_max   |noise_score_min   |normalized_dispersity_index|ref_ms_id|smiles|start_scan|Peak Area|Traditional Name|Spectral Similarity Score|Similarity Score|Retention Time Ref|Retention Index Score|Kegg Compound ID|Retention index Ref|Retention index|Half Height Width (min)|Compound Name|IUPAC Name|Inchi Key|Retention Time|Peak Height|Derivatization|Chebi ID|Peak Index|Common Name|
+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------+----------------+------------------+---------------------------------------------------------------------------+------------------+------------------+------------------+------------------+------------+----------------+--------+----------------------------------+-----------------------+-----------------+-------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-----------------+----------------------------+------------+--------+--------------------+-------------------------------------------------------------------------------------------------------------------------------+---------------------------+------------------+-----+-------------+----------+-----+--------+----+------------------+-------------------+------------------+----+------------------+------------------+------------------+---------------------------+---------+------+----------+---------+----------------+-------------------------+----------------+------------------+---------------------+----------------+-------------------+---------------+-----------------------+-------------+----------+---------+--------------+-----------+--------------+--------+----------+-----------+
|nmdc:dobj-12-c6668e59|20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722.csv|1823      |5536.0          |207312.58836064488|104, 167, 491, 524, 726, 746, 833, 1607, 1658, 1823, 2135, 2174, 2277, 2334|505.16501065439905|0.4998728013228436|0.009356166646272 |NULL              |4624002.0   |C24 H29 O8 N2 S1|NULL    |false                             |0.7723194841954907     |NULL             |NULL                           |NULL                                                                                                                                                                                                                                                           |1823           |NULL             |NULL                        |4624002.0   |negative|15.37629222869873   |20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722|NULL                       |0.909089972598224 |NULL |NULL         |5554      |NULL |NULL    |NULL|505.1663513183594 |2.6539129433839457 |0.3182416794077456|NULL|0.7646735703542076|0.9221820515613964|0.6071650891470185|0.0015614843839305         |NULL     |NULL  |5515      |NULL     |NULL            |NULL                     |NULL            |NULL              |NULL                 |NULL            |NULL               |NULL           |NULL                   |NULL         |NULL      |NULL     |NULL          |NULL       |NULL          |NULL    |NULL      |NULL       |
|nmdc:dobj-12-c6668e59|20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722.csv|194       |643.0           |1996123.4725409448|194, 828, 970, 1099, 2379                                                  |399.1846834708591 |0.8623966979911594|0.0038853883743286|NULL              |5.6286152E7 |C20 H31 O6 S1   |NULL    |true                              |0.9805461274215276     |NULL             |NULL                           |96.9602:0.09; 399.1851:1.0                                                                                                                                                                                                                                     |194            |NULL             |194.0                       |5.6286152E7 |negative|1.768058180809021   |20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722|NULL                       |1.5783462748289745|NULL |NULL         |679       |NULL |NULL    |NULL|399.184814453125  |0.328124477993345  |0.7836304117042472|NULL|0.9270566902694404|0.952969921876794 |0.9011434586620864|8.17346627209E-4           |NULL     |NULL  |628       |NULL     |NULL            |NULL                     |NULL            |NULL              |NULL                 |NULL            |NULL               |NULL           |NULL                   |NULL         |NULL      |NULL     |NULL          |NULL       |NULL          |NULL    |NULL      |NULL       |
|nmdc:dobj-12-c6668e59|20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722.csv|45        |271.0           |6817671.240330413 |45, 97, 188, 272, 735, 1069                                                |445.3323335085391 |0.6158845203521177|0.0038833320140838|NULL              |1.92782656E8|C28 H45 O4      |NULL    |true                              |0.990726622176506      |NULL             |NULL                           |133.0296:0.11; 177.0193:0.05; 430.3088:0.32; 445.3322:1.0                                                                                                                                                                                                      |45             |NULL             |45.0                        |1.92782656E8|negative|0.7712243795394897  |20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722|NULL                       |0.8804817595393114|NULL |NULL         |283       |NULL |NULL    |NULL|445.33209228515625|-0.5416704889335812|0.3659897858025256|NULL|0.9869020785675428|1.0               |0.9738041571350856|0.0010336373063757         |NULL     |NULL  |253       |NULL     |NULL            |NULL                     |NULL            |NULL              |NULL                 |NULL            |NULL               |NULL           |NULL                   |NULL         |NULL      |NULL     |NULL          |NULL       |NULL          |NULL    |NULL      |NULL       |
|nmdc:dobj-12-c6668e59|20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722.csv|534       |3184.0          |618261.4868396148 |62, 534, 708, 803, 1286, 1934, 2459                                        |NULL              |NULL              |0.0               |NULL              |2.0292116E7 |NULL            |NULL    |false                             |NULL                   |NULL             |NULL                           |57.3553:0.01; 64.43:0.01; 79.9569:0.14; 80.9647:0.02; 95.9522:0.04; 96.96:1.0; 133.904:0.01; 172.0124:0.95; 172.5139:0.11; 172.8308:0.02; 181.0176:0.4; 182.0209:0.02; 247.0642:0.14; 248.0664:0.02; 273.2598:0.01; 296.2651:0.01; 311.0364:0.01; 319.4683:0.01|534            |NULL             |NULL                        |2.0292116E7 |negative|8.802865982055664   |20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722|NULL                       |1.118663847780127 |NULL |NULL         |3208      |NULL |NULL    |NULL|172.0122528076172 |NULL               |NULL              |NULL|1.0               |1.0               |1.0               |0.0                        |NULL     |NULL  |3166      |NULL     |NULL            |NULL                     |NULL            |NULL              |NULL                 |NULL            |NULL               |NULL           |NULL                   |NULL         |NULL      |NULL     |NULL          |NULL       |NULL          |NULL    |NULL      |NULL       |
|nmdc:dobj-12-c6668e59|20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722.csv|750       |1609.0          |1236735.067583379 |750, 945, 1581                                                             |249.0434527681991 |0.6133640927005115|0.0210686804710109|NULL              |1.3480962E7 |C11 H10 O3 N2 P1|NULL    |true                              |0.8936766034327009     |NULL             |NULL                           |96.96:1.0; 249.0435:0.1; 249.08:0.02                                                                                                                                                                                                                           |750            |NULL             |NULL                        |1.3480962E7 |negative|4.419151306152344   |20220119_JGI_MD_507130_BioS_final1_QE-139_HILICZ_USHXG01825_NEG_MSMS_84RM_BESC-360-Corv-RM_1_Rg70to1050-CE102040-root-S1_Run722|NULL                       |0.9243409989181778|NULL |NULL         |1708      |NULL |NULL    |NULL|249.0436859130859 |0.9361614780199936 |0.4264890855457184|NULL|0.8925020802252861|0.9855993560493296|0.7994048044012427|0.0035168828868922         |NULL     |NULL  |1546      |NULL     |NULL            |NULL                     |NULL            |NULL              |NULL                 |NULL            |NULL               |NULL           |NULL                   |NULL         |NULL      |NULL     |NULL          |NULL       |NULL          |NULL    |NULL      |NULL       |
+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------+----------------+------------------+---------------------------------------------------------------------------+------------------+------------------+------------------+------------------+------------+----------------+--------+----------------------------------+-----------------------+-----------------+-------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-----------------+----------------------------+------------+--------+--------------------+-------------------------------------------------------------------------------------------------------------------------------+---------------------------+------------------+-----+-------------+----------+-----+--------+----+------------------+-------------------+------------------+----+------------------+------------------+------------------+---------------------------+---------+------+----------+---------+----------------+-------------------------+----------------+------------------+---------------------+----------------+-------------------+---------------+-----------------------+-------------+----------+---------+--------------+-----------+--------------+--------+----------+-----------+

# Confirm the file_id format for metabolomics_gold (expect nmdc:dobj-12-* prefix)
print('=== file_id format in metabolomics_gold ===')
spark.sql("""
    SELECT file_id, COUNT(*) as n
    FROM nmdc_arkin.metabolomics_gold
    GROUP BY file_id
    ORDER BY n DESC
    LIMIT 10
""").show(truncate=False)

# Check total rows for overlap met_file_ids
# Build a small validation using one sample met_file_id
sample_met_id = overlap_met_file_ids[0]
print(f'\nRow count for one sample met file ({sample_met_id}):')
spark.sql(f"""
    SELECT COUNT(*) as n
    FROM nmdc_arkin.metabolomics_gold
    WHERE file_id = '{sample_met_id}'
""").show()

=== file_id format in metabolomics_gold ===
+---------------------+-----+
|file_id              |n    |
+---------------------+-----+
|nmdc:dobj-12-tsrn1q55|12281|
|nmdc:dobj-12-e7vpe098|11461|
|nmdc:dobj-12-j7tq4n27|11355|
|nmdc:dobj-12-kk96n084|11162|
|nmdc:dobj-12-0zjp6543|10871|
|nmdc:dobj-12-v98y9q72|10706|
|nmdc:dobj-12-3j09wy18|10576|
|nmdc:dobj-12-h0c0js15|10535|
|nmdc:dobj-12-4ks2cq70|9184 |
|nmdc:dobj-12-9241q096|8766 |
+---------------------+-----+


Row count for one sample met file (nmdc:dobj-12-716d9s20):
+---+
|  n|
+---+
|810|
+---+

# ── Set column names based on schema inspection above ──────────────────────
# Update these after running Part 2.
# Common NMDC metabolomics_gold column names (verify from DESCRIBE output):
#   compound name: 'compound_name', 'metabolite_name', 'feature_name', 'name'
#   intensity:     'abundance', 'intensity', 'peak_area', 'normalized_abundance'
#   KEGG ID:       'kegg_id', 'compound_id', 'annotation_id' (may not exist)
# ---------------------------------------------------------------------------
COMPOUND_COL  = 'name'     # UPDATE if different
INTENSITY_COL = 'Intensity'         # UPDATE if different
KEGG_COL      = 'kegg'                # Set to column name if KEGG IDs exist

print(f'Using compound column:  {COMPOUND_COL}')
print(f'Using intensity column: {INTENSITY_COL}')
print(f'KEGG ID column:         {KEGG_COL}')

Using compound column:  name
Using intensity column: Intensity
KEGG ID column:         kegg

# Build SELECT clause (include KEGG_COL only if it exists)
kegg_select = f', m.{KEGG_COL} AS kegg_id' if KEGG_COL else ''

# Extract metabolomics data for overlap samples via Spark.
# Filter: INTERSECT of samples with centrifuge + metabolomics files
# Cast intensity to DOUBLE to avoid decimal.Decimal arithmetic errors
met_spark = spark.sql(f"""
    SELECT
        b.sample_id,
        m.file_id,
        m.{COMPOUND_COL} AS compound_name,
        CAST(m.{INTENSITY_COL} AS DOUBLE) AS intensity
        {kegg_select}
    FROM nmdc_arkin.metabolomics_gold m
    JOIN {BRIDGE_TBL} b ON m.file_id = b.file_id
    WHERE b.sample_id IN (
        SELECT b2.sample_id FROM {BRIDGE_TBL} b2
        JOIN (SELECT DISTINCT file_id FROM nmdc_arkin.centrifuge_gold) c2
          ON b2.file_id = c2.file_id
        INTERSECT
        SELECT b3.sample_id FROM {BRIDGE_TBL} b3
        JOIN (SELECT DISTINCT file_id FROM nmdc_arkin.metabolomics_gold) m3
          ON b3.file_id = m3.file_id
    )
    AND m.{COMPOUND_COL} IS NOT NULL
    AND m.{COMPOUND_COL} != ''
    AND m.{INTENSITY_COL} IS NOT NULL
    AND CAST(m.{INTENSITY_COL} AS DOUBLE) > 0
""")

print('Collecting metabolomics data (may take a few minutes)...')
met_raw = met_spark.toPandas()
met_raw['intensity'] = met_raw['intensity'].astype(float)  # defensive cast

print(f'\nMetabolomics rows: {len(met_raw):,}')
print(f'Unique samples:   {met_raw["sample_id"].nunique()}')
print(f'Unique files:     {met_raw["file_id"].nunique()}')
print(f'Unique compounds: {met_raw["compound_name"].nunique():,}')
print(f'Intensity range:  [{met_raw["intensity"].min():.2e}, {met_raw["intensity"].max():.2e}]')
print(met_raw.head(5).to_string())

Collecting metabolomics data (may take a few minutes)...

Metabolomics rows: 33,726
Unique samples:   175
Unique files:     553
Unique compounds: 1,944
Intensity range:  [7.41e+04, 4.54e+09]
              sample_id                file_id           compound_name    intensity kegg_id
0  nmdc:bsm-11-dz264p50  nmdc:dobj-12-d1v5ag89   3-Hydroxybenzaldehyde   9118199.00  C03067
1  nmdc:bsm-11-tcxb3w32  nmdc:dobj-12-d7nkac20      Phthalic anhydride  42594000.00     NaN
2  nmdc:bsm-11-cv5vxb38  nmdc:dobj-12-dj8n6c53                 Choline  57018780.00  C00114
3  nmdc:bsm-11-vp7n5531  nmdc:dobj-12-dn3mvn79                D-Ribose   3060144.25     NaN
4  nmdc:bsm-11-ck98jg55  nmdc:dobj-12-edfh5e04  4-Hydroxycinnamic acid  19322698.00     NaN

# Map file_id → ionization mode / column type from met_file_name
file_meta = overlap_inv[['met_file_id', 'met_file_name', 'sample_id']].rename(
    columns={'met_file_id': 'file_id'}
).drop_duplicates()
file_meta['ionization'] = np.where(
    file_meta['met_file_name'].str.contains('_POS_', na=False), 'POS',
    np.where(file_meta['met_file_name'].str.contains('_NEG_', na=False), 'NEG', 'unknown')
)
file_meta['lc_column'] = np.where(
    file_meta['met_file_name'].str.contains('HILICZ', na=False), 'HILICZ',
    np.where(file_meta['met_file_name'].str.contains('C18', na=False), 'C18', 'unknown')
)

met_raw = met_raw.merge(file_meta[['file_id', 'ionization', 'lc_column']],
                        on='file_id', how='left')
print('Ionization mode breakdown:')
print(met_raw['ionization'].value_counts().to_string())
print('\nLC column breakdown:')
print(met_raw['lc_column'].value_counts().to_string())

Ionization mode breakdown:
ionization
POS        19494
NEG        11933
unknown     2262

LC column breakdown:
lc_column
C18        16998
HILICZ     14429
unknown     2262

# Mapping: GapMind aa pathway → compound name fragments (case-insensitive substring match)
# chorismate is the aromatic aa precursor; include shikimic acid as metabolomics proxy
AA_PATHWAY_TO_PATTERNS = {
    'arg':       ['arginine'],
    'asn':       ['asparagine'],
    'chorismate': ['chorismate', 'chorismic acid', 'shikimic acid', 'shikimate'],
    'cys':       ['cysteine'],
    'gln':       ['glutamine'],
    'gly':       ['glycine'],
    'his':       ['histidine'],
    'ile':       ['isoleucine'],
    'leu':       ['leucine'],
    'lys':       ['lysine'],
    'met':       ['methionine'],
    'phe':       ['phenylalanine'],
    'pro':       ['proline'],
    'ser':       ['serine'],
    'thr':       ['threonine'],
    'trp':       ['tryptophan'],
    'tyr':       ['tyrosine'],
    'val':       ['valine'],
}

# Build a compound → pathway mapping by substring match.
# IMPORTANT: use first-match-wins to avoid 'leucine' (a substring of 'isoleucine')
# overwriting the correct 'ile' mapping.  The dict order above ensures 'ile' is checked
# before 'leu', so isoleucine compounds are correctly assigned to 'ile'.
all_compounds = met_raw['compound_name'].dropna().unique()

compound_pathway_map = {}  # compound_name → aa_pathway
for pathway, patterns in AA_PATHWAY_TO_PATTERNS.items():
    for compound in all_compounds:
        # Skip if already mapped (first match wins — prevents substring collisions
        # e.g. 'leucine' matching inside 'isoleucine')
        if compound in compound_pathway_map:
            continue
        cl = compound.lower()
        if any(p in cl for p in patterns):
            # Avoid mapping glutamine to glutamic acid and vice versa
            if pathway == 'gln' and 'glutamic' in cl:
                continue
            compound_pathway_map[compound] = pathway

print(f'Total unique compounds in data: {len(all_compounds):,}')
print(f'Compounds mapped to aa pathways: {len(compound_pathway_map)}')
print()
# Show matches per pathway
pathway_match_counts = pd.Series(compound_pathway_map).value_counts().sort_index()
print('Compounds matched per aa pathway:')
print(pathway_match_counts.to_string())

# Pathways with no matches
missing = [p for p in AA_PATHWAY_TO_PATTERNS if p not in pathway_match_counts.index]
if missing:
    print(f'\naa pathways with NO compound match: {missing}')

Total unique compounds in data: 1,944
Compounds mapped to aa pathways: 50

Compounds matched per aa pathway:
arg            2
asn            3
chorismate     2
gln            1
gly           10
ile            3
leu            5
met            1
phe            3
pro            4
ser            5
thr            1
trp            2
tyr            2
val            6

aa pathways with NO compound match: ['cys', 'his', 'lys']

# Show matched compound names for verification
matched_df = pd.DataFrame(
    list(compound_pathway_map.items()), columns=['compound_name', 'aa_pathway']
).sort_values('aa_pathway')

print('Matched amino acid compounds:')
print(matched_df.to_string(index=False))

Matched amino acid compounds:
                                    compound_name aa_pathway
                                         Arginine        arg
                                     Homoarginine        arg
                                       Asparagine        asn
                                    DL-Asparagine        asn
                               N-Acetylasparagine        asn
                                    Shikimic Acid chorismate
                           3-Dehydroshikimic acid chorismate
                                     DL-Glutamine        gln
                                    Glycylglycine        gly
                                  N-Acetylglycine        gly
                                  N-oleoylglycine        gly
                                   Suberylglycine        gly
                           2-Methylbutyrylglycine        gly
                       (S)-4-hydroxyphenylglycine        gly
                           Phenylpropionylglycine        gly
                                  Hexanoylglycine        gly
                                Capryloyl Glycine        gly
                                Isovalerylglycine        gly
                              Alloisoleucine, DL-        ile
                 N-[(+)-Jasmonoyl]-(L)-isoleucine        ile
N-[(9H-fluoren-9-ylmethoxy)carbonyl]-L-isoleucine        ile
                                          Leucine        leu
                                      Ketoleucine        leu
                               Norleucine, (+/-)-        leu
                                    Acetylleucine        leu
                               N-Acetyl-L-leucine        leu
                                       Methionine        met
                              L-Phenylalanine, 23        phe
                                    Phenylalanine        phe
                         N-Acetyl-L-phenylalanine        phe
                                        D-Proline        pro
                                  proline dl-form        pro
                                  Histidylproline        pro
                          Cis-4-Hydroxy-D-Proline        pro
                                       d,l-Serine        ser
                                     L-homoserine        ser
                                    3-Hexylserine        ser
                                        DL-Serine        ser
                               3-Phenyl-DL-Serine        ser
                                Allothreonine, L-        thr
                                       Tryptophan        trp
             N-[4-(1-adamantyl)benzoyl]tryptophan        trp
                                         Tyrosine        tyr
                                      DL-Tyrosine        tyr
                                        Norvaline        val
                                     Glycylvaline        val
                                     DL-norvaline        val
                                  Alanylnorvaline        val
                                        DL-valine        val
                                L-Leucyl-L-Valine        val

# Log1p transform
met_raw['log_intensity'] = np.log1p(met_raw['intensity'])

# Aggregate per (sample_id, compound_name): take max log_intensity across files
met_agg = (
    met_raw.groupby(['sample_id', 'compound_name'])['log_intensity']
    .max()
    .reset_index()
    .rename(columns={'log_intensity': 'log_intensity_max'})
)

print(f'After aggregation (max per sample-compound): {len(met_agg):,} rows')
print(f'Unique samples:   {met_agg["sample_id"].nunique()}')
print(f'Unique compounds: {met_agg["compound_name"].nunique():,}')
print(f'\nlog_intensity distribution:')
print(met_agg['log_intensity_max'].describe().to_string())

After aggregation (max per sample-compound): 26,465 rows
Unique samples:   175
Unique compounds: 1,944

log_intensity distribution:
count    26465.000000
mean        16.111302
std          1.647851
min         11.946451
25%         14.952030
50%         15.942837
75%         17.138308
max         22.236964

# Filter to compounds detected in at least 10% of samples (min prevalence filter)
# This removes rare contaminants and reduces sparsity in the matrix
MIN_PREVALENCE = 0.10
n_samples_total = met_agg['sample_id'].nunique()
compound_prevalence = (
    met_agg.groupby('compound_name')['sample_id']
    .nunique()
    .rename('n_samples')
    / n_samples_total
)
prevalent_compounds = compound_prevalence[compound_prevalence >= MIN_PREVALENCE].index

# Always keep aa-mapped compounds regardless of prevalence
aa_compounds = set(compound_pathway_map.keys())
keep_compounds = set(prevalent_compounds) | aa_compounds

met_filtered = met_agg[met_agg['compound_name'].isin(keep_compounds)].copy()

print(f'Prevalent compounds (>={MIN_PREVALENCE:.0%} samples): {len(prevalent_compounds):,}')
print(f'AA compounds always kept:                          {len(aa_compounds)}')
print(f'Total compounds after filter:                      {len(keep_compounds):,}')
print(f'Filtered rows:                                     {len(met_filtered):,}')

Prevalent compounds (>=10% samples): 448
AA compounds always kept:                          50
Total compounds after filter:                      475
Filtered rows:                                     19,583

# Pivot to wide format: samples × compounds
met_wide = met_filtered.pivot_table(
    index='sample_id',
    columns='compound_name',
    values='log_intensity_max',
    aggfunc='first'
).reset_index()

compound_cols = [c for c in met_wide.columns if c != 'sample_id']
print(f'Metabolomics matrix (wide): {met_wide.shape}')
print(f'  Samples:   {len(met_wide)}')
print(f'  Compounds: {len(compound_cols)}')
print(f'\nSparsity: {met_wide[compound_cols].isna().mean().mean():.1%} missing')

Metabolomics matrix (wide): (175, 476)
  Samples:   175
  Compounds: 475

Sparsity: 76.4% missing

# Build amino acid metabolomics table: long format with pathway mapping
aa_met = met_filtered[met_filtered['compound_name'].isin(aa_compounds)].copy()
aa_met['aa_pathway'] = aa_met['compound_name'].map(compound_pathway_map)

print(f'Amino acid metabolomics rows: {len(aa_met):,}')
print(f'Samples with ≥1 AA compound: {aa_met["sample_id"].nunique()}')
print(f'AA pathways covered:         {aa_met["aa_pathway"].nunique()} / 18')
print()

# Per-pathway: how many samples have a detection?
aa_coverage = (
    aa_met.groupby('aa_pathway')['sample_id']
    .nunique()
    .rename('n_samples_detected')
    .reset_index()
    .sort_values('n_samples_detected', ascending=False)
)
aa_coverage['pct'] = aa_coverage['n_samples_detected'] / n_samples_total
print('AA pathway detection rates:')
print(aa_coverage.to_string(index=False))

Amino acid metabolomics rows: 1,036
Samples with ≥1 AA compound: 131
AA pathways covered:         15 / 18

AA pathway detection rates:
aa_pathway  n_samples_detected      pct
       gly                 114 0.651429
       phe                  88 0.502857
       arg                  80 0.457143
       asn                  73 0.417143
chorismate                  65 0.371429
       leu                  62 0.354286
       ser                  59 0.337143
       val                  54 0.308571
       trp                  44 0.251429
       tyr                  26 0.148571
       thr                  23 0.131429
       met                  18 0.102857
       ile                  18 0.102857
       pro                   9 0.051429
       gln                   4 0.022857

# For each aa_pathway × sample: aggregate across multiple compound matches
# (e.g., 'L-Leucine' and 'leucine' both map to 'leu' — take max)
aa_by_pathway = (
    aa_met.groupby(['sample_id', 'aa_pathway'])['log_intensity_max']
    .max()
    .rename('log_intensity')
    .reset_index()
)

print(f'AA by pathway rows (sample × pathway): {len(aa_by_pathway):,}')
print(aa_by_pathway.describe().to_string())

AA by pathway rows (sample × pathway): 737
       log_intensity
count     737.000000
mean       16.588106
std         1.360495
min        12.559582
25%        15.681756
50%        16.386069
75%        17.369864
max        21.443584

# Explore abiotic_features schema first
print('=== abiotic_features schema ===')
spark.sql('DESCRIBE nmdc_arkin.abiotic_features').show(40, truncate=False)

=== abiotic_features schema ===
+-------------------------------------------------+---------+-------+
|col_name                                         |data_type|comment|
+-------------------------------------------------+---------+-------+
|sample_id                                        |string   |NULL   |
|annotations_ammonium_has_numeric_value           |double   |NULL   |
|annotations_ammonium_nitrogen_has_numeric_value  |double   |NULL   |
|annotations_calcium_has_numeric_value            |double   |NULL   |
|annotations_carb_nitro_ratio_has_numeric_value   |double   |NULL   |
|annotations_chlorophyll_has_numeric_value        |double   |NULL   |
|annotations_conduc_has_numeric_value             |double   |NULL   |
|annotations_depth_has_maximum_numeric_value      |double   |NULL   |
|annotations_depth_has_minimum_numeric_value      |double   |NULL   |
|annotations_depth_has_numeric_value              |double   |NULL   |
|annotations_diss_org_carb_has_numeric_value      |double   |NULL   |
|annotations_diss_oxygen_has_numeric_value        |double   |NULL   |
|annotations_magnesium_has_numeric_value          |double   |NULL   |
|annotations_manganese_has_numeric_value          |double   |NULL   |
|annotations_ph                                   |double   |NULL   |
|annotations_potassium_has_numeric_value          |double   |NULL   |
|annotations_samp_size_has_numeric_value          |double   |NULL   |
|annotations_soluble_react_phosp_has_numeric_value|double   |NULL   |
|annotations_temp_has_numeric_value               |double   |NULL   |
|annotations_tot_nitro_content_has_numeric_value  |double   |NULL   |
|annotations_tot_org_carb_has_numeric_value       |double   |NULL   |
|annotations_tot_phosp_has_numeric_value          |double   |NULL   |
+-------------------------------------------------+---------+-------+

# Sample abiotic_features rows
print('=== Sample abiotic_features rows ===')
spark.sql("""
    SELECT *
    FROM nmdc_arkin.abiotic_features
    LIMIT 3
""").show(truncate=False)

=== Sample abiotic_features rows ===
+--------------------+--------------------------------------+-----------------------------------------------+-------------------------------------+----------------------------------------------+-----------------------------------------+------------------------------------+-------------------------------------------+-------------------------------------------+-----------------------------------+-------------------------------------------+-----------------------------------------+---------------------------------------+---------------------------------------+--------------+---------------------------------------+---------------------------------------+-------------------------------------------------+----------------------------------+-----------------------------------------------+------------------------------------------+---------------------------------------+
|sample_id           |annotations_ammonium_has_numeric_value|annotations_ammonium_nitrogen_has_numeric_value|annotations_calcium_has_numeric_value|annotations_carb_nitro_ratio_has_numeric_value|annotations_chlorophyll_has_numeric_value|annotations_conduc_has_numeric_value|annotations_depth_has_maximum_numeric_value|annotations_depth_has_minimum_numeric_value|annotations_depth_has_numeric_value|annotations_diss_org_carb_has_numeric_value|annotations_diss_oxygen_has_numeric_value|annotations_magnesium_has_numeric_value|annotations_manganese_has_numeric_value|annotations_ph|annotations_potassium_has_numeric_value|annotations_samp_size_has_numeric_value|annotations_soluble_react_phosp_has_numeric_value|annotations_temp_has_numeric_value|annotations_tot_nitro_content_has_numeric_value|annotations_tot_org_carb_has_numeric_value|annotations_tot_phosp_has_numeric_value|
+--------------------+--------------------------------------+-----------------------------------------------+-------------------------------------+----------------------------------------------+-----------------------------------------+------------------------------------+-------------------------------------------+-------------------------------------------+-----------------------------------+-------------------------------------------+-----------------------------------------+---------------------------------------+---------------------------------------+--------------+---------------------------------------+---------------------------------------+-------------------------------------------------+----------------------------------+-----------------------------------------------+------------------------------------------+---------------------------------------+
|nmdc:bsm-11-042nd237|0.0                                   |0.0                                            |0.0                                  |0.0                                           |0.0                                      |0.0                                 |0.0                                        |0.02                                       |0.0                                |0.0                                        |0.0                                      |0.0                                    |0.0                                    |0.0           |0.0                                    |0.0                                    |0.0                                              |0.0                               |0.0                                            |0.0                                       |0.0                                    |
|nmdc:bsm-11-622k6044|0.0                                   |0.0                                            |0.0                                  |0.0                                           |0.0                                      |0.0                                 |0.0                                        |0.02                                       |0.0                                |0.0                                        |0.0                                      |0.0                                    |0.0                                    |0.0           |0.0                                    |0.0                                    |0.0                                              |0.0                               |0.0                                            |0.0                                       |0.0                                    |
|nmdc:bsm-11-65a4xw75|0.0                                   |0.0                                            |0.0                                  |0.0                                           |0.0                                      |0.0                                 |0.0                                        |0.02                                       |0.0                                |0.0                                        |0.0                                      |0.0                                    |0.0                                    |0.0           |0.0                                    |0.0                                    |0.0                                              |0.0                               |0.0                                            |0.0                                       |0.0                                    |
+--------------------+--------------------------------------+-----------------------------------------------+-------------------------------------+----------------------------------------------+-----------------------------------------+------------------------------------+-------------------------------------------+-------------------------------------------+-----------------------------------+-------------------------------------------+-----------------------------------------+---------------------------------------+---------------------------------------+--------------+---------------------------------------+---------------------------------------+-------------------------------------------------+----------------------------------+-----------------------------------------------+------------------------------------------+---------------------------------------+

# Extract abiotic features for overlap samples.
# All numeric columns in abiotic_features are already double-typed.
# Column naming: most use _has_numeric_value suffix; annotations_ph is the exception.
# Replace annotations_water_content (doesn't exist) with diss_org_carb and conduc.
# Join via sample_id directly (abiotic_features has sample_id as primary key).
abiotic_spark = spark.sql(f"""
    SELECT DISTINCT
        a.sample_id,
        a.annotations_ph                                    AS ph,
        a.annotations_temp_has_numeric_value                AS temp_c,
        a.annotations_depth_has_numeric_value               AS depth_m,
        a.annotations_tot_org_carb_has_numeric_value        AS tot_org_carb,
        a.annotations_tot_nitro_content_has_numeric_value   AS tot_nitro,
        a.annotations_diss_org_carb_has_numeric_value       AS diss_org_carb,
        a.annotations_conduc_has_numeric_value              AS conductance
    FROM nmdc_arkin.abiotic_features a
    WHERE a.sample_id IN (
        SELECT b2.sample_id FROM {BRIDGE_TBL} b2
        JOIN (SELECT DISTINCT file_id FROM nmdc_arkin.centrifuge_gold) c2
          ON b2.file_id = c2.file_id
        INTERSECT
        SELECT b3.sample_id FROM {BRIDGE_TBL} b3
        JOIN (SELECT DISTINCT file_id FROM nmdc_arkin.metabolomics_gold) m3
          ON b3.file_id = m3.file_id
    )
""")

abiotic = abiotic_spark.toPandas()
abiotic_num_cols = ['ph', 'temp_c', 'depth_m', 'tot_org_carb',
                    'tot_nitro', 'diss_org_carb', 'conductance']
for col in abiotic_num_cols:
    abiotic[col] = pd.to_numeric(abiotic[col], errors='coerce')

# Treat 0.0 as missing (abiotic_features stores 0 for unmeasured variables)
for col in abiotic_num_cols:
    abiotic[col] = abiotic[col].replace(0.0, np.nan)

print(f'Abiotic features rows: {len(abiotic)}')
print(f'Unique samples:        {abiotic["sample_id"].nunique()}')
print()
print('Non-null rates per feature (after replacing 0 → NaN):')
print(abiotic[abiotic_num_cols].notna().mean().round(3).to_string())
print()
print(abiotic[abiotic_num_cols].describe().to_string())

Abiotic features rows: 221
Unique samples:        221

Non-null rates per feature (after replacing 0 → NaN):
ph               0.000
temp_c           0.000
depth_m          0.122
tot_org_carb     0.000
tot_nitro        0.000
diss_org_carb    0.000
conductance      0.000

        ph  temp_c    depth_m  tot_org_carb  tot_nitro  diss_org_carb  conductance
count  0.0     0.0  27.000000           0.0        0.0            0.0          0.0
mean   NaN     NaN  -0.185185           NaN        NaN            NaN          NaN
std    NaN     NaN   0.174761           NaN        NaN            NaN          NaN
min    NaN     NaN  -0.400000           NaN        NaN            NaN          NaN
25%    NaN     NaN  -0.300000           NaN        NaN            NaN          NaN
50%    NaN     NaN  -0.200000           NaN        NaN            NaN          NaN
75%    NaN     NaN  -0.100000           NaN        NaN            NaN          NaN
max    NaN     NaN   0.100000           NaN        NaN            NaN          NaN

# Start from community_matrix (220 samples × 86 cols)
# Inner join with metabolomics wide matrix
analysis_matrix = community_matrix.merge(met_wide, on='sample_id', how='inner')
print(f'After joining pathway + metabolomics: {analysis_matrix.shape}')
print(f'  Samples: {len(analysis_matrix)}')

# Left join abiotic features
abiotic_num_cols = ['ph', 'temp_c', 'depth_m', 'tot_org_carb',
                    'tot_nitro', 'diss_org_carb', 'conductance']
if len(abiotic) > 0 and abiotic['sample_id'].nunique() > 0:
    analysis_matrix = analysis_matrix.merge(
        abiotic.drop_duplicates(subset='sample_id'),
        on='sample_id', how='left'
    )
    print(f'After joining abiotic features: {analysis_matrix.shape}')
    non_null = {c: int(analysis_matrix[c].notna().sum()) for c in abiotic_num_cols
                if c in analysis_matrix.columns}
    print(f'  Samples with abiotic data: {non_null}')
else:
    print('No abiotic features found for overlap samples — skipping abiotic join')
    for col in abiotic_num_cols:
        analysis_matrix[col] = np.nan

print(f'\nFinal analysis matrix shape: {analysis_matrix.shape}')
print(f'Samples: {analysis_matrix["sample_id"].nunique()}')

After joining pathway + metabolomics: (174, 561)
  Samples: 174
After joining abiotic features: (174, 568)
  Samples with abiotic data: {'ph': 0, 'temp_c': 0, 'depth_m': 0, 'tot_org_carb': 0, 'tot_nitro': 0, 'diss_org_carb': 0, 'conductance': 0}

Final analysis matrix shape: (174, 568)
Samples: 174

# Plot: log-intensity distributions for amino acid compounds, colored by ecosystem type
# Merge ecosystem type into aa_by_pathway
eco_map = community_matrix[['sample_id', 'ecosystem_type']].drop_duplicates()
aa_plot = aa_by_pathway.merge(eco_map, on='sample_id', how='left')
aa_plot['ecosystem_type'] = aa_plot['ecosystem_type'].fillna('Unknown')

# Filter to aa_pathways with detections in at least 20 samples
well_detected = aa_coverage[aa_coverage['n_samples_detected'] >= 20]['aa_pathway'].tolist()
aa_plot_filtered = aa_plot[aa_plot['aa_pathway'].isin(well_detected)]

n_pathways_plot = len(well_detected)
if n_pathways_plot > 0:
    fig, axes = plt.subplots(
        n_pathways_plot, 1,
        figsize=(8, max(6, n_pathways_plot * 1.5)),
        sharex=False
    )
    if n_pathways_plot == 1:
        axes = [axes]

    palette = {'Soil': '#c8a96e', 'Freshwater': '#4e8fb5', 'Unknown': '#aaaaaa'}

    for ax, pathway in zip(axes, sorted(well_detected)):
        subset = aa_plot_filtered[aa_plot_filtered['aa_pathway'] == pathway]
        for eco, grp in subset.groupby('ecosystem_type'):
            color = palette.get(eco, '#888888')
            ax.hist(grp['log_intensity'], bins=25, alpha=0.5,
                    label=eco, color=color, density=True)
        ax.set_title(f'aa pathway: {pathway}', fontsize=9)
        ax.set_ylabel('Density', fontsize=8)
        ax.tick_params(labelsize=7)
        ax.legend(fontsize=7)

    axes[-1].set_xlabel('log(intensity + 1)', fontsize=9)
    plt.suptitle('Amino Acid Compound Intensities by Ecosystem Type',
                 fontsize=11, y=1.01)
    plt.tight_layout()
    fig_path = os.path.join(FIGURES_DIR, 'metabolomics_distribution.png')
    plt.savefig(fig_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f'Saved: figures/metabolomics_distribution.png')
else:
    print('No aa pathways with sufficient detections for plotting')

Saved: figures/metabolomics_distribution.png

# Save metabolomics matrix (wide: samples × compounds)
met_path = os.path.join(DATA_DIR, 'metabolomics_matrix.csv')
met_wide.to_csv(met_path, index=False)
print(f'Saved: data/metabolomics_matrix.csv  ({met_wide.shape})')

# Save amino acid metabolites (long: sample × pathway)
aa_path = os.path.join(DATA_DIR, 'amino_acid_metabolites.csv')
aa_by_pathway.to_csv(aa_path, index=False)
print(f'Saved: data/amino_acid_metabolites.csv  ({aa_by_pathway.shape})')

# Save analysis-ready matrix
ar_path = os.path.join(DATA_DIR, 'analysis_ready_matrix.csv')
analysis_matrix.to_csv(ar_path, index=False)
print(f'Saved: data/analysis_ready_matrix.csv  ({analysis_matrix.shape})')

Saved: data/metabolomics_matrix.csv  ((175, 476))
Saved: data/amino_acid_metabolites.csv  ((737, 3))
Saved: data/analysis_ready_matrix.csv  ((174, 568))

# NB04 Summary
METADATA_COLS = ['sample_id', 'study_id', 'ecosystem_category', 'ecosystem_type',
                 'ecosystem_subtype', 'specific_ecosystem',
                 'ph', 'temp_c', 'depth_m', 'tot_org_carb',
                 'tot_nitro', 'diss_org_carb', 'conductance']
pathway_cols_cm  = [c for c in community_matrix.columns
                    if c not in ['sample_id', 'study_id', 'ecosystem_category',
                                 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem']]
compound_cols_ar = [c for c in analysis_matrix.columns
                    if c not in METADATA_COLS and c not in pathway_cols_cm]

print('=== NB04 Summary ===')
print(f'Samples in analysis_ready_matrix:  {analysis_matrix["sample_id"].nunique()}')
print(f'Pathway columns (from NB03):        {len(pathway_cols_cm)}')
print(f'Compound columns (metabolomics):    {len(compound_cols_ar)}')
abiotic_num_cols = ['ph', 'temp_c', 'depth_m', 'tot_org_carb',
                    'tot_nitro', 'diss_org_carb', 'conductance']
non_null = {c: int(analysis_matrix[c].notna().sum())
            for c in abiotic_num_cols if c in analysis_matrix.columns}
print(f'Abiotic feature non-null counts:    {non_null}')
print()
print('AA pathways with metabolomics coverage:')
print(aa_coverage.to_string(index=False))
print()
print('Ecosystem breakdown in analysis_ready_matrix:')
print(analysis_matrix['ecosystem_type'].value_counts(dropna=False).to_string())

=== NB04 Summary ===
Samples in analysis_ready_matrix:  174
Pathway columns (from NB03):        80
Compound columns (metabolomics):    475
Abiotic feature non-null counts:    {'ph': 0, 'temp_c': 0, 'depth_m': 0, 'tot_org_carb': 0, 'tot_nitro': 0, 'diss_org_carb': 0, 'conductance': 0}

AA pathways with metabolomics coverage:
aa_pathway  n_samples_detected      pct
       gly                 114 0.651429
       phe                  88 0.502857
       arg                  80 0.457143
       asn                  73 0.417143
chorismate                  65 0.371429
       leu                  62 0.354286
       ser                  59 0.337143
       val                  54 0.308571
       trp                  44 0.251429
       tyr                  26 0.148571
       thr                  23 0.131429
       met                  18 0.102857
       ile                  18 0.102857
       pro                   9 0.051429
       gln                   4 0.022857

Ecosystem breakdown in analysis_ready_matrix:
ecosystem_type
Soil    126
NaN      48

Question	Finding
Samples in analysis_ready_matrix	174
Metabolomics compounds (prevalent ≥10%)	475
AA pathways with metabolomics detections	14 / 18 (cys, his, ile, lys not detected — note: ile bug fixed in cell-14)
Samples with AA metabolomics coverage	131
Abiotic features available	All NaN for overlap samples (0.0 → NaN replacement applied)
Ecosystem split in analysis matrix	Soil: 126, Unknown: 48 (Freshwater lost in inner join — no met files)

04 Metabolomics Processing

NB04: Metabolomics Processing¶

Purpose¶

Inputs¶

Outputs¶

Part 1: Load NB02/NB03 Outputs and Identify Met Files¶

Part 2: Explore `metabolomics_gold` Schema¶

Part 3: Extract Metabolomics Data for Overlap Samples¶

Part 4: Compound Annotation and Amino Acid Mapping¶

Part 5: Normalization and Per-Sample Aggregation¶

Part 6: Amino Acid Metabolomics Subset¶

Part 7: Abiotic Features¶

Part 9: Metabolomics Distribution Figure¶

Part 10: Save Outputs¶

Summary and Decisions for NB05¶

04 Metabolomics Processing

NB04: Metabolomics Processing¶

Purpose¶

Inputs¶

Outputs¶

Part 1: Load NB02/NB03 Outputs and Identify Met Files¶

Part 2: Explore metabolomics_gold Schema¶

Part 3: Extract Metabolomics Data for Overlap Samples¶

Part 4: Compound Annotation and Amino Acid Mapping¶

Part 5: Normalization and Per-Sample Aggregation¶

Part 6: Amino Acid Metabolomics Subset¶

Part 7: Abiotic Features¶

Part 9: Metabolomics Distribution Figure¶

Part 10: Save Outputs¶

Summary and Decisions for NB05¶

Part 2: Explore `metabolomics_gold` Schema¶