01 Data Extraction
Jupyter notebook from the AlphaEarth Embeddings, Geography & Environment Explorer project.
NB01: Data Extraction — AlphaEarth Embeddings + Environment Labels¶
Requires BERDL JupyterHub — get_spark_session() is only available in JupyterHub kernels.
Background¶
The BERDL pangenome database includes AlphaEarth environmental embeddings — 64-dimensional vectors derived from satellite imagery at each genome's sampling location. These embeddings encode environmental context (climate, land use, vegetation, etc.) but their relationship to traditional environment metadata has not been systematically characterized.
This notebook extracts and joins three data layers for downstream interactive exploration:
- AlphaEarth embeddings (alphaearth_embeddings_all_years) — 83K genomes with 64-dim vectors, cleaned lat/lon, and taxonomy
- NCBI environment metadata (ncbi_env) — free-text environment labels in Entity-Attribute-Value format
- Coverage statistics — which genomes have which metadata fields populated
Key considerations¶
- AlphaEarth coverage is only 28.4% of all 293K genomes — biased toward genomes with valid lat/lon metadata
- ncbi_env is an EAV table (multiple rows per genome) — we pivot it into one row per genome
- The isolation_source field is free text with thousands of unique values — harmonization happens in NB02
Outputs¶
All saved to ../data/:
| File | Description |
|---|---|
| alphaearth_with_env.csv | Merged embeddings + pivoted env labels (one row per genome) |
| coverage_stats.csv | Per-attribute population rates |
| ncbi_env_attribute_counts.csv | Full inventory of all 334 harmonized_name values |
| isolation_source_raw_counts.csv | Raw value frequencies for harmonization in NB02 |
import os
import pandas as pd
# get_spark_session() is injected into JupyterHub kernels — no import needed
spark = get_spark_session()
DATA_DIR = '../data'
os.makedirs(DATA_DIR, exist_ok=True)
print('Spark session initialized')
print(f'Output directory: {os.path.abspath(DATA_DIR)}')
Spark session initialized Output directory: /home/psdehal/pangenome_science/BERIL-research-observatory/projects/env_embedding_explorer/data
1. Extract AlphaEarth Embeddings¶
The alphaearth_embeddings_all_years table contains one row per genome with:
- 64 embedding dimensions (A00–A63): satellite-derived environmental vectors, values in [-0.54, 0.54]
- Cleaned coordinates: cleaned_lat, cleaned_lon — parsed and validated from NCBI metadata
- Taxonomy: full GTDB hierarchy from domain to species
- Biosample links: ncbi_biosample_accession_id for joining to ncbi_env
At 83K rows, this is small enough to collect entirely to the driver node.
ae_df = spark.sql("""
SELECT *
FROM kbase_ke_pangenome.alphaearth_embeddings_all_years
""").toPandas()
emb_cols = [c for c in ae_df.columns if c.startswith('A') and c[1:].isdigit()]
print(f'AlphaEarth embeddings: {len(ae_df):,} genomes')
print(f'Embedding dimensions: {len(emb_cols)}')
print(f'Lat/lon non-null: {ae_df["cleaned_lat"].notna().sum():,} / {len(ae_df):,}')
print(f'Year range: {ae_df["cleaned_year"].min()} – {ae_df["cleaned_year"].max()}')
print(f'\nTaxonomy coverage:')
for col in ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']:
if col in ae_df.columns:
print(f' {col}: {ae_df[col].nunique():,} unique')
AlphaEarth embeddings: 83,287 genomes
Embedding dimensions: 64
Lat/lon non-null: 83,286 / 83,287
Year range: 1905 – 2022

Taxonomy coverage:
  domain: 2 unique
  phylum: 135 unique
  class: 334 unique
  order: 953 unique
  family: 2,122 unique
  genus: 6,293 unique
  species: 15,046 unique
ae_df.head(3)
| genome_id | ncbi_biosample_accession_id | ncbi_bioproject | domain | phylum | class | order | family | genus | species | ... | A54 | A55 | A56 | A57 | A58 | A59 | A60 | A61 | A62 | A63 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RS_GCF_018722605.1 | SAMN18928743 | PRJNA224116 | d__Bacteria | p__Pseudomonadota | c__Gammaproteobacteria | o__Methylococcales | f__Methylomonadaceae | g__Methylovulum | s__Methylovulum psychrotolerans | ... | 0.124567 | 0.124567 | 0.088827 | -0.038447 | 0.003937 | 0.019931 | -0.024606 | 0.012057 | 0.015748 | 0.029773 |
| 1 | GB_GCA_019417585.1 | SAMN19225087 | PRJNA730993 | d__Bacteria | p__Bacillota_A | c__Clostridia | o__Lachnospirales | f__Lachnospiraceae | g__UBA3402 | s__UBA3402 sp003478355 | ... | 0.186082 | -0.267958 | 0.147697 | 0.108512 | -0.027128 | 0.002215 | 0.066990 | -0.103406 | -0.244152 | 0.041584 |
| 2 | GB_GCA_024640595.1 | SAMN19289924 | PRJNA668816 | d__Bacteria | p__Bacteroidota | c__Bacteroidia | o__Flavobacteriales | f__Flavobacteriaceae | g__MAG-120531 | s__MAG-120531 sp018500065 | ... | -0.038447 | -0.244152 | -0.000984 | -0.004983 | -0.124567 | 0.108512 | 0.051734 | -0.059116 | 0.098424 | 0.000554 |
3 rows × 77 columns
2. Inventory NCBI Environment Attributes¶
The ncbi_env table uses an Entity-Attribute-Value (EAV) format — each row is one attribute for one biosample. Before we pivot, let's see what attributes exist and how many genomes have each.
This inventory helps us decide which attributes to pivot into columns (we want the most-populated ones) and reveals the overall metadata landscape.
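As a toy illustration of the EAV-to-wide reshaping (synthetic rows for illustration only; the real pivot runs in Spark in Section 3), the same transformation can be sketched in pandas:

```python
import pandas as pd

# Synthetic EAV rows: one (accession, attribute, value) triple per row
eav = pd.DataFrame({
    'accession': ['SAMN1', 'SAMN1', 'SAMN2'],
    'harmonized_name': ['isolation_source', 'host', 'isolation_source'],
    'content': ['soil', 'Homo sapiens', 'seawater'],
})

# Pivot to one row per accession; aggfunc='first' keeps one value
# when a biosample repeats the same attribute
wide = eav.pivot_table(index='accession', columns='harmonized_name',
                       values='content', aggfunc='first').reset_index()
print(wide)
```

Attributes a biosample never reported come out as NaN, which is exactly the sparsity pattern the coverage statistics in Section 5 quantify.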
attr_counts = spark.sql("""
SELECT harmonized_name,
COUNT(*) as n_rows,
COUNT(DISTINCT accession) as n_genomes
FROM kbase_ke_pangenome.ncbi_env
GROUP BY harmonized_name
ORDER BY n_genomes DESC
""").toPandas()
print(f'Total distinct harmonized_name values: {len(attr_counts)}')
print(f'\nTop 30 attributes by number of genomes:')
attr_counts.head(30)
Total distinct harmonized_name values: 334 Top 30 attributes by number of genomes:
| harmonized_name | n_rows | n_genomes | |
|---|---|---|---|
| 0 | collection_date | 273042 | 272680 |
| 1 | geo_loc_name | 272707 | 271943 |
| 2 | isolation_source | 245435 | 245344 |
| 3 | strain | 205650 | 204424 |
| 4 | host | 170593 | 170542 |
| 5 | NaN | 1600369 | 159029 |
| 6 | lat_lon | 137438 | 137438 |
| 7 | sample_type | 112681 | 111887 |
| 8 | env_broad_scale | 88423 | 88251 |
| 9 | env_local_scale | 79814 | 79794 |
| 10 | env_medium | 79635 | 79613 |
| 11 | collected_by | 79233 | 79233 |
| 12 | sample_name | 78793 | 78793 |
| 13 | isolate | 64407 | 64406 |
| 14 | host_disease | 52879 | 52701 |
| 15 | metagenome_source | 50133 | 50133 |
| 16 | project_name | 41136 | 41136 |
| 17 | ref_biomaterial | 36479 | 36478 |
| 18 | investigation_type | 31845 | 31845 |
| 19 | derived_from | 30283 | 30283 |
| 20 | host_health_state | 30164 | 29616 |
| 21 | serovar | 28635 | 28635 |
| 22 | isol_growth_condt | 28028 | 28027 |
| 23 | num_replicons | 27608 | 27607 |
| 24 | depth | 20091 | 20088 |
| 25 | sub_species | 16262 | 16262 |
| 26 | culture_collection | 13928 | 13788 |
| 27 | biomaterial_provider | 11369 | 11369 |
| 28 | temp | 9259 | 9259 |
| 29 | genotype | 8504 | 8501 |
The most-populated attributes are collection_date (273K genomes), geo_loc_name (272K), and isolation_source (245K). The ENVO ontology fields (env_broad_scale, env_local_scale, env_medium) cover ~80-88K genomes each — roughly matching the AlphaEarth coverage, which makes sense since both require geographic metadata.
We'll pivot the most useful attributes into columns for our analysis.
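As an alternative to hand-picking, the pivot candidates could be chosen programmatically by thresholding the inventory. A sketch on a synthetic slice of attr_counts (the 20K cutoff is an arbitrary illustration, not the notebook's actual rule):

```python
import pandas as pd

# Synthetic slice of the attribute inventory (the real one is attr_counts above);
# the None row mirrors the NaN harmonized_name seen in the table
attr_counts = pd.DataFrame({
    'harmonized_name': ['collection_date', 'isolation_source', 'temp', None],
    'n_genomes': [272680, 245344, 9259, 159029],
})

# Keep named attributes populated for at least 20K genomes
candidates = attr_counts[
    attr_counts['harmonized_name'].notna() & (attr_counts['n_genomes'] >= 20000)
]['harmonized_name'].tolist()
print(candidates)  # → ['collection_date', 'isolation_source']
```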
attr_counts.to_csv(os.path.join(DATA_DIR, 'ncbi_env_attribute_counts.csv'), index=False)
print(f'Saved ncbi_env_attribute_counts.csv ({len(attr_counts)} attributes)')
Saved ncbi_env_attribute_counts.csv (334 attributes)
3. Pivot NCBI Environment Labels for AlphaEarth Genomes¶
We join ncbi_env to our AlphaEarth genomes via ncbi_biosample_accession_id and pivot selected attributes from rows into columns.
The approach:
- Register AlphaEarth biosample IDs as a Spark temp view for efficient joining
- Use MAX(CASE WHEN ...) to pivot each attribute into its own column
- This gives us one row per genome with all environment fields as columns
# Register AlphaEarth biosample IDs as temp view for Spark join
biosample_ids = ae_df['ncbi_biosample_accession_id'].dropna().unique().tolist()
print(f'AlphaEarth genomes with biosample IDs: {len(biosample_ids):,}')
biosample_sdf = spark.createDataFrame(
[(b,) for b in biosample_ids],
['accession']
)
biosample_sdf.createOrReplaceTempView('ae_biosamples')
AlphaEarth genomes with biosample IDs: 83,103
# Pivot key environment attributes into columns
env_pivot = spark.sql("""
SELECT ne.accession,
MAX(CASE WHEN ne.harmonized_name = 'isolation_source' THEN ne.content END) as isolation_source,
MAX(CASE WHEN ne.harmonized_name = 'geo_loc_name' THEN ne.content END) as geo_loc_name,
MAX(CASE WHEN ne.harmonized_name = 'env_broad_scale' THEN ne.content END) as env_broad_scale,
MAX(CASE WHEN ne.harmonized_name = 'env_local_scale' THEN ne.content END) as env_local_scale,
MAX(CASE WHEN ne.harmonized_name = 'env_medium' THEN ne.content END) as env_medium,
MAX(CASE WHEN ne.harmonized_name = 'host' THEN ne.content END) as host,
MAX(CASE WHEN ne.harmonized_name = 'collection_date' THEN ne.content END) as collection_date,
MAX(CASE WHEN ne.harmonized_name = 'lat_lon' THEN ne.content END) as lat_lon,
MAX(CASE WHEN ne.harmonized_name = 'depth' THEN ne.content END) as depth,
MAX(CASE WHEN ne.harmonized_name = 'altitude' THEN ne.content END) as altitude,
MAX(CASE WHEN ne.harmonized_name = 'temp' THEN ne.content END) as temp
FROM kbase_ke_pangenome.ncbi_env ne
JOIN ae_biosamples ab ON ne.accession = ab.accession
GROUP BY ne.accession
""").toPandas()
print(f'Environment labels pivoted for {len(env_pivot):,} genomes')
print(f'\nAttribute population rates:')
for col in env_pivot.columns[1:]:
n = env_pivot[col].notna().sum()
pct = 100 * n / len(env_pivot) if len(env_pivot) > 0 else 0
print(f' {col}: {n:,} ({pct:.1f}%)')
Environment labels pivoted for 83,103 genomes

Attribute population rates:
  isolation_source: 76,230 (91.7%)
  geo_loc_name: 83,089 (100.0%)
  env_broad_scale: 34,778 (41.8%)
  env_local_scale: 31,542 (38.0%)
  env_medium: 31,498 (37.9%)
  host: 53,026 (63.8%)
  collection_date: 83,103 (100.0%)
  lat_lon: 83,098 (100.0%)
  depth: 15,300 (18.4%)
  altitude: 4,382 (5.3%)
  temp: 1,917 (2.3%)
Nearly all AlphaEarth genomes have geo_loc_name and collection_date (100%), and 92% have isolation_source. The ENVO ontology fields (env_broad_scale, env_local_scale, env_medium) cover about 38-42% — these may be cleaner than free-text isolation_source but have lower coverage.
4. Merge Embeddings with Environment Labels¶
Join on ncbi_biosample_accession_id (left join to keep all AlphaEarth genomes, even those without env labels).
merged = ae_df.merge(
env_pivot,
left_on='ncbi_biosample_accession_id',
right_on='accession',
how='left'
)
if 'accession' in merged.columns:
merged = merged.drop(columns=['accession'])
print(f'Merged dataset: {len(merged):,} genomes')
print(f' With isolation_source: {merged["isolation_source"].notna().sum():,}')
print(f' Without: {merged["isolation_source"].isna().sum():,}')
Merged dataset: 83,287 genomes With isolation_source: 76,347 Without: 6,940
5. Coverage Statistics¶
Compute per-attribute population rates and boolean flags. These flags will be used in NB02 for UpSet-style intersection plots.
flag_cols = {
'has_latlon': merged['cleaned_lat'].notna() & merged['cleaned_lon'].notna(),
'has_isolation_source': merged['isolation_source'].notna(),
'has_env_broad_scale': merged['env_broad_scale'].notna(),
'has_env_local_scale': merged['env_local_scale'].notna(),
'has_env_medium': merged['env_medium'].notna(),
'has_host': merged['host'].notna(),
'has_geo_loc_name': merged['geo_loc_name'].notna(),
}
for name, flag in flag_cols.items():
merged[name] = flag
coverage = pd.DataFrame([
{'attribute': name.replace('has_', ''), 'n_genomes': int(flag.sum()),
'pct_of_alphaearth': round(100 * flag.mean(), 1)}
for name, flag in flag_cols.items()
])
print('Coverage of AlphaEarth genomes:')
coverage
Coverage of AlphaEarth genomes:
| attribute | n_genomes | pct_of_alphaearth | |
|---|---|---|---|
| 0 | latlon | 83286 | 100.0 |
| 1 | isolation_source | 76347 | 91.7 |
| 2 | env_broad_scale | 34858 | 41.9 |
| 3 | env_local_scale | 31621 | 38.0 |
| 4 | env_medium | 31577 | 37.9 |
| 5 | host | 53176 | 63.8 |
| 6 | geo_loc_name | 83273 | 100.0 |
coverage.to_csv(os.path.join(DATA_DIR, 'coverage_stats.csv'), index=False)
print('Saved coverage_stats.csv')
Saved coverage_stats.csv
6. Isolation Source Value Counts¶
The isolation_source field is free text entered by submitters — thousands of unique values that need harmonization. We save the raw value counts so NB02 can build a keyword-based mapping to broad categories (Soil, Marine, Human gut, etc.).
iso_counts = (
merged.loc[merged['isolation_source'].notna(), 'isolation_source']
.value_counts()
.reset_index()
)
iso_counts.columns = ['isolation_source', 'count']
print(f'Unique isolation_source values: {len(iso_counts):,}')
print(f'\nTop 30 values:')
iso_counts.head(30)
Unique isolation_source values: 5,776 Top 30 values:
| isolation_source | count | |
|---|---|---|
| 0 | feces | 5833 |
| 1 | blood | 4506 |
| 2 | sputum | 3713 |
| 3 | stool | 2663 |
| 4 | missing | 2318 |
| 5 | patient | 1758 |
| 6 | hypoxic seawater | 1665 |
| 7 | urine | 1601 |
| 8 | soil | 1279 |
| 9 | permafrost active layer soil | 1095 |
| 10 | hypersaline soda lake sediment | 997 |
| 11 | groundwater | 986 |
| 12 | stool sample | 984 |
| 13 | acid mine drainage sediment | 887 |
| 14 | infant feces | 644 |
| 15 | freshwater | 618 |
| 16 | root nodule | 591 |
| 17 | rumen | 469 |
| 18 | rectal swab | 438 |
| 19 | Unknown | 437 |
| 20 | Aspo HRL | 435 |
| 21 | feces extracted directly from colon | 429 |
| 22 | wound | 386 |
| 23 | saliva | 380 |
| 24 | aquifer sediment | 375 |
| 25 | activated sludge from Hong Kong Shatin wastewa... | 347 |
| 26 | Microcystis colony from freshwater lake | 323 |
| 27 | fecal material | 313 |
| 28 | water | 302 |
| 29 | sputum or oropharyngeal | 298 |
The top values are dominated by clinical samples (feces, blood, sputum, stool, urine), reflecting the strong bias toward pathogen genomes in NCBI. Environmental samples (soil, seawater, groundwater) are present but less common. The missing and Unknown values will need to be grouped with the unclassified category.
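A minimal sketch of the kind of keyword-based mapping NB02 will build (the categories, keywords, and first-match-wins rule here are illustrative, not the final harmonization):

```python
import pandas as pd

# Illustrative keyword → category rules; first matching rule wins
RULES = [
    ('Human gut', ('feces', 'stool', 'fecal', 'rectal')),
    ('Clinical', ('blood', 'sputum', 'urine', 'wound', 'saliva', 'patient')),
    ('Soil', ('soil', 'permafrost', 'rhizosphere')),
    ('Marine', ('seawater', 'marine', 'ocean')),
]

def harmonize(value):
    """Map a free-text isolation_source to a broad category (toy rules)."""
    if pd.isna(value) or value.strip().lower() in ('missing', 'unknown'):
        return 'Unclassified'
    v = value.lower()
    for category, keywords in RULES:
        if any(k in v for k in keywords):
            return category
    return 'Other'

print(harmonize('infant feces'))      # → 'Human gut'
print(harmonize('hypoxic seawater'))  # → 'Marine'
print(harmonize('missing'))           # → 'Unclassified'
```

Applied to the iso_counts table via `merged['isolation_source'].map(harmonize)`, this collapses thousands of raw strings into a handful of broad bins, with the missing/Unknown placeholders routed to Unclassified as noted above.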
iso_counts.to_csv(os.path.join(DATA_DIR, 'isolation_source_raw_counts.csv'), index=False)
print('Saved isolation_source_raw_counts.csv')
Saved isolation_source_raw_counts.csv
7. Save Merged Dataset¶
out_path = os.path.join(DATA_DIR, 'alphaearth_with_env.csv')
merged.to_csv(out_path, index=False)
print(f'Saved alphaearth_with_env.csv')
print(f' Rows: {len(merged):,}')
print(f' Columns: {len(merged.columns)}')
print(f' File size: {os.path.getsize(out_path) / 1e6:.1f} MB')
Saved alphaearth_with_env.csv
  Rows: 83,287
  Columns: 95
  File size: 90.6 MB
8. Sanity Checks¶
Quick validation that the data looks reasonable before passing to NB02.
emb_cols = [f'A{i:02d}' for i in range(64)]
emb_stats = merged[emb_cols].describe().T
print('Embedding dimensions (A00–A63):')
print(f' Value range: [{emb_stats["min"].min():.3f}, {emb_stats["max"].max():.3f}]')
print(f' Mean of means: {emb_stats["mean"].mean():.3f}')
print(f' Mean of stds: {emb_stats["std"].mean():.3f}')
print(f' Any NaN: {merged[emb_cols].isna().any().any()}')
print(f' Genomes with any NaN embedding: {merged[emb_cols].isna().any(axis=1).sum():,}')
print(f'\nGeographic extent:')
print(f' Latitude: [{merged["cleaned_lat"].min():.2f}, {merged["cleaned_lat"].max():.2f}]')
print(f' Longitude: [{merged["cleaned_lon"].min():.2f}, {merged["cleaned_lon"].max():.2f}]')
print(f'\nTop 10 phyla (of {merged["phylum"].nunique()} total):')
merged['phylum'].value_counts().head(10)
Embedding dimensions (A00–A63):
  Value range: [-0.544, 0.544]
  Mean of means: -0.008
  Mean of stds: 0.109
  Any NaN: True
  Genomes with any NaN embedding: 3,838

Geographic extent:
  Latitude: [-85.00, 84.48]
  Longitude: [-178.47, 179.58]

Top 10 phyla (of 135 total):
phylum
p__Pseudomonadota       34686
p__Bacillota            12142
p__Actinomycetota        7640
p__Bacteroidota          6238
p__Bacillota_A           5975
p__Patescibacteria       2212
p__Campylobacterota      2080
p__Cyanobacteriota       1131
p__Chloroflexota          983
p__Verrucomicrobiota      801
Name: count, dtype: int64
Summary¶
We extracted 83,287 genomes with 64-dimensional AlphaEarth environmental embeddings and joined them with NCBI environment metadata. Key observations:
- Nearly all genomes have lat/lon coordinates (99.99%) and geographic location names (100%)
- 91.7% have an isolation_source label (5,776 unique raw values — needs harmonization)
- 38–42% have structured ENVO ontology terms (env_broad_scale, env_local_scale, env_medium)
- 63.8% have a host field — reflecting the clinical sample bias
- Some genomes (~3,838) have NaN in embedding dimensions — these will be filtered in NB02
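The NaN-embedding filter planned for NB02 amounts to a one-line dropna over the embedding columns. A toy sketch with two embedding columns (synthetic data; the real frame uses A00–A63):

```python
import numpy as np
import pandas as pd

# Toy frame with two embedding columns; g2 has a NaN embedding value
emb_cols = ['A00', 'A01']
df = pd.DataFrame({'genome_id': ['g1', 'g2', 'g3'],
                   'A00': [0.1, np.nan, -0.2],
                   'A01': [0.3, 0.2, 0.5]})

# Drop rows with NaN in any embedding dimension
clean = df.dropna(subset=emb_cols)
print(len(clean))  # → 2
```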
Next: 02_interactive_exploration.ipynb for coordinate QC, environment harmonization, UMAP visualization, and geographic analysis.
print('=== Data extraction complete ===')
print(f'\nOutput files in {os.path.abspath(DATA_DIR)}:')
for f in sorted(os.listdir(DATA_DIR)):
if f.endswith('.csv'):
size = os.path.getsize(os.path.join(DATA_DIR, f)) / 1e6
print(f' {f} ({size:.1f} MB)')
=== Data extraction complete ===

Output files in /home/psdehal/pangenome_science/BERIL-research-observatory/projects/env_embedding_explorer/data:
  alphaearth_with_env.csv (90.6 MB)
  coverage_stats.csv (0.0 MB)
  isolation_source_raw_counts.csv (0.2 MB)
  ncbi_env_attribute_counts.csv (0.0 MB)
  umap_coords.csv (3.1 MB)