# Imports
import pandas as pd
import numpy as np
from pathlib import Path

# Spark Connect (uses local proxy to BERDL cluster)
from get_spark_session import get_spark_session

print("✅ Imports successful")

✅ Imports successful

# Create Spark session
spark = get_spark_session()

print(f"✅ Spark session created")
print(f"   Version: {spark.version}")

# Verify database access
databases = [row.namespace for row in spark.sql("SHOW DATABASES").collect()]
print(f"\n✅ Connected to BERDL")
print(f"   Databases: {len(databases)}")
print(f"   Pangenome: {'kbase_ke_pangenome' in databases}")
print(f"   Biochemistry: {'kbase_msd_biochemistry' in databases}")

✅ Spark session created
   Version: 4.0.1

✅ Connected to BERDL
   Databases: 77
   Pangenome: True
   Biochemistry: True

# List biochemistry tables
print("Biochemistry tables:")
spark.sql("SHOW TABLES IN kbase_msd_biochemistry").show()

Biochemistry tables:

+--------------------+-------------------+-----------+
|           namespace|          tableName|isTemporary|
+--------------------+-------------------+-----------+
|kbase_msd_biochem...|           molecule|      false|
|kbase_msd_biochem...|           reaction|      false|
|kbase_msd_biochem...|            reagent|      false|
|kbase_msd_biochem...|reaction_similarity|      false|
|kbase_msd_biochem...|          structure|      false|
+--------------------+-------------------+-----------+

# Reaction table schema
print("Reaction table schema:")
spark.sql("DESCRIBE kbase_msd_biochemistry.reaction").show(truncate=False)

Reaction table schema:

+-------------+---------+-------+
|col_name     |data_type|comment|
+-------------+---------+-------+
|abbreviation |string   |NULL   |
|deltag       |float    |NULL   |
|deltagerr    |float    |NULL   |
|id           |string   |NULL   |
|is_transport |boolean  |NULL   |
|name         |string   |NULL   |
|reversibility|string   |NULL   |
|source       |string   |NULL   |
|status       |string   |NULL   |
+-------------+---------+-------+

# Sample reactions - check 'source' field for EC references
print("Sample reactions (checking 'source' field for EC numbers):")
spark.sql("""
    SELECT id, name, source, abbreviation
    FROM kbase_msd_biochemistry.reaction
    WHERE source IS NOT NULL
    LIMIT 20
""").show(truncate=False)

Sample reactions (checking 'source' field for EC numbers):

+----------------------+--------------------------------------------------------------------------------------------------------------------+----------------+--------------------------------------------------------------------------------------------------------------------+
|id                    |name                                                                                                                |source          |abbreviation                                                                                                        |
+----------------------+--------------------------------------------------------------------------------------------------------------------+----------------+--------------------------------------------------------------------------------------------------------------------+
|seed.reaction:rxn40243|L-asparagine,2-oxoglutarate:oxygen oxidoreductase (3-hydroxylating)                                                 |Primary Database|L-asparagine,2-oxoglutarate:oxygen oxidoreductase (3-hydroxylating)                                                 |
|seed.reaction:rxn40244|S-adenosyl-L-methionine:gibberellin A9 O-methyltransferase                                                          |Primary Database|S-adenosyl-L-methionine:gibberellin A9 O-methyltransferase                                                          |
|seed.reaction:rxn40245|S-adenosyl-L-methionine:gibberellin A4 O-methyltransferase                                                          |Primary Database|S-adenosyl-L-methionine:gibberellin A4 O-methyltransferase                                                          |
|seed.reaction:rxn40246|S-adenosyl-L-methionine:anthranilate O-methyltransferase                                                            |Primary Database|S-adenosyl-L-methionine:anthranilate O-methyltransferase                                                            |
|seed.reaction:rxn40247|3-dimethylallyl-4-hydroxybenzoate:3-amino-4,7-dihydroxycoumarin ligase (AMP-forming)                                |Primary Database|3-dimethylallyl-4-hydroxybenzoate:3-amino-4,7-dihydroxycoumarin ligase (AMP-forming)                                |
|seed.reaction:rxn40248|S-adenosyl-L-methionine:8-demethylnovobiocic acid C8-methyltransferase                                              |Primary Database|S-adenosyl-L-methionine:8-demethylnovobiocic acid C8-methyltransferase                                              |
|seed.reaction:rxn40249|S-adenosyl-L-methionine:L-histidine-[translation elongation factor 2] 2-[(3S)-3-amino-3-carboxypropyl]transferase   |Primary Database|S-adenosyl-L-methionine:L-histidine-[translation elongation factor 2] 2-[(3S)-3-amino-3-carboxypropyl]transferase   |
|seed.reaction:rxn40250|dimethylallyl-diphosphate:brevianamide-F tert-dimethylallyl-C-2-transferase                                         |Primary Database|dimethylallyl-diphosphate:brevianamide-F tert-dimethylallyl-C-2-transferase                                         |
|seed.reaction:rxn40251|dimethylallyl-diphosphate:12alpha,13alpha-dihydroxyfumitremorgin C dimethylallyl-N-1-transferase                    |Primary Database|dimethylallyl-diphosphate:12alpha,13alpha-dihydroxyfumitremorgin C dimethylallyl-N-1-transferase                    |
|seed.reaction:rxn40252|NULL                                                                                                                |Primary Database|NULL                                                                                                                |
|seed.reaction:rxn40253|NULL                                                                                                                |Primary Database|NULL                                                                                                                |
|seed.reaction:rxn40254|GDP-4-amino-4,6-dideoxy-alpha-D-mannose:2-oxoglutarate aminotransferase                                             |Primary Database|GDP-4-amino-4,6-dideoxy-alpha-D-mannose:2-oxoglutarate aminotransferase                                             |
|seed.reaction:rxn40255|O-acetyl-L-serine:hydroxyurea 2-amino-2-carboxyethyltransferase                                                     |Primary Database|O-acetyl-L-serine:hydroxyurea 2-amino-2-carboxyethyltransferase                                                     |
|seed.reaction:rxn40256|ATP:GTP adenylyltransferase (cyclizing)                                                                             |Primary Database|ATP:GTP adenylyltransferase (cyclizing)                                                                             |
|seed.reaction:rxn40257|ATP:L-threonyl,bicarbonate adenylyltransferase                                                                      |Primary Database|ATP:L-threonyl,bicarbonate adenylyltransferase                                                                      |
|seed.reaction:rxn40258|CDP-2,3-bis-(O-phytanyl)-sn-glycerol:1L-myo-inositol 1-phosphate 1-sn-archaetidyltransferase                        |Primary Database|CDP-2,3-bis-(O-phytanyl)-sn-glycerol:1L-myo-inositol 1-phosphate 1-sn-archaetidyltransferase                        |
|seed.reaction:rxn40259|UDP-N-acetyl-alpha-D-galactosamine:ditrans,octacis-undecaprenyl phosphate N-acetyl-D-galactosaminephosphotransferase|Primary Database|UDP-N-acetyl-alpha-D-galactosamine:ditrans,octacis-undecaprenyl phosphate N-acetyl-D-galactosaminephosphotransferase|
|seed.reaction:rxn40260|N(omega)-hydroxy-L-arginine amidinohydrolase                                                                        |Primary Database|N(omega)-hydroxy-L-arginine amidinohydrolase                                                                        |
|seed.reaction:rxn40261|protoheme,hydrogen-donor:oxygen oxidoreductase (delta-methene-oxidizing, hydroxylating)                             |Primary Database|protoheme,hydrogen-donor:oxygen oxidoreductase (delta-methene-oxidizing, hydroxylating)                             |
|seed.reaction:rxn40262|2-amino-4,5-dihydroxy-6-oxo-7-(phosphonooxy)heptanoate hydro-lyase (cyclizing, 3-amino-4-hydroxybenzoate-forming)   |Primary Database|2-amino-4,5-dihydroxy-6-oxo-7-(phosphonooxy)heptanoate hydro-lyase (cyclizing, 3-amino-4-hydroxybenzoate-forming)   |
+----------------------+--------------------------------------------------------------------------------------------------------------------+----------------+--------------------------------------------------------------------------------------------------------------------+

# Check if 'source' contains EC numbers
print("Checking if 'source' field contains EC pattern (e.g., 'EC:1.2.3.4'):")
ec_in_source = spark.sql("""
    SELECT id, name, source
    FROM kbase_msd_biochemistry.reaction
    WHERE source LIKE '%EC:%' OR source LIKE '%ec:%'
    LIMIT 10
""")
print(f"Reactions with EC in source: {ec_in_source.count()}")
ec_in_source.show(truncate=False)

Checking if 'source' field contains EC pattern (e.g., 'EC:1.2.3.4'):

Reactions with EC in source: 0

+---+----+------+
|id |name|source|
+---+----+------+
+---+----+------+

# Count total EC annotations
total_ec = spark.sql("""
    SELECT COUNT(*) as count
    FROM kbase_ke_pangenome.eggnog_mapper_annotations
    WHERE EC IS NOT NULL AND EC != '-'
""").collect()[0]['count']

print(f"Total gene clusters with EC annotations: {total_ec:,}")

Total gene clusters with EC annotations: 25,962,995

# Extract all EC annotations
print("Extracting EC annotations from eggNOG...")
eggnog_ec = spark.sql("""
    SELECT
        query_name as gene_cluster_id,
        EC as ec_number,
        Description,
        Preferred_name,
        COG_category
    FROM kbase_ke_pangenome.eggnog_mapper_annotations
    WHERE EC IS NOT NULL AND EC != '-'
""")

ec_count = eggnog_ec.count()
print(f"✅ Extracted {ec_count:,} EC annotations")

# Sample
print("\nSample EC annotations:")
eggnog_ec.limit(10).show(truncate=False)

Extracting EC annotations from eggNOG...

✅ Extracted 25,962,995 EC annotations

Sample EC annotations:

+-----------------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------+
|gene_cluster_id        |ec_number         |Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |Preferred_name|COG_category|
+-----------------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------+
|NZ_PELQ01000313.1_1    |3.6.4.12          |-                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |-             |-           |
|NZ_PELQ01000248.1_2    |5.4.99.13,5.4.99.2|Methylmalonyl-CoA mutase                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |-             |I           |
|NZ_PELP01000373.1_1    |3.7.1.22          |Involved in the cleavage of the C1-C2 bond of 3D- (3,5 4)-trihydroxycyclohexane-1,2-dione (THcHDO) to yield 5-deoxy- glucuronate (5DG)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |iolD          |E           |
|DYZN01000261.1_13      |2.1.3.15,6.4.1.3  |Acetyl-CoA carboxylase, carboxyltransferase component subunits alpha and beta                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |-             |I           |
|DDKL01000188.1_6       |2.1.1.34          |Catalyzes the 2'-O methylation of guanosine at position 18 in tRNA                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |trmH          |J           |
|JADJND010000001.1_938  |3.2.2.27          |Alternative locus ID                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |-             |L           |
|NZ_CAJGZC010000010.1_93|1.6.5.3           |NDH-1 shuttles electrons from NADH, via FMN and iron- sulfur (Fe-S) centers, to quinones in the respiratory chain. The immediate electron acceptor for the enzyme in this species is believed to be ubiquinone. Couples the redox reaction to proton translocation (for every two electrons transferred, four hydrogen ions are translocated across the cytoplasmic membrane), and thus conserves the redox energy in a proton gradient                                                                                                                                                                                                                                           |nuoK          |C           |
|CADAMK010000075.1_1    |2.4.2.9           |Catalyzes the conversion of uracil and 5-phospho-alpha- D-ribose 1-diphosphate (PRPP) to UMP and diphosphate                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |upp           |F           |
|NZ_BIMD01000036.1_13   |6.1.1.15          |Catalyzes the attachment of proline to tRNA(Pro) in a two-step reaction proline is first activated by ATP to form Pro- AMP and then transferred to the acceptor end of tRNA(Pro). As ProRS can inadvertently accommodate and process non-cognate amino acids such as alanine and cysteine, to avoid such errors it has two additional distinct editing activities against alanine. One activity is designated as 'pretransfer' editing and involves the tRNA(Pro)-independent hydrolysis of activated Ala-AMP. The other activity is designated 'posttransfer' editing and involves deacylation of mischarged Ala-tRNA(Pro). The misacylated Cys- tRNA(Pro) is not edited by ProRS|proS          |J           |
|NZ_JYJH01000021.1_85   |1.6.5.3           |NDH-1 shuttles electrons from NADH, via FMN and iron- sulfur (Fe-S) centers, to quinones in the respiratory chain                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |nuoA2         |C           |
+-----------------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------+

# Get unique EC numbers
unique_ecs = spark.sql("""
    SELECT DISTINCT EC as ec_number
    FROM kbase_ke_pangenome.eggnog_mapper_annotations
    WHERE EC IS NOT NULL AND EC != '-'
""")

unique_count = unique_ecs.count()
print(f"Unique EC numbers: {unique_count:,}")

# Sample
print("\nSample unique ECs:")
unique_ecs.limit(20).show(truncate=False)

Unique EC numbers: 8,914

Sample unique ECs:

+------------------+
|ec_number         |
+------------------+
|1.1.1.1,1.1.1.284 |
|2.7.8.5           |
|6.2.1.4,6.2.1.5   |
|4.2.1.2           |
|2.7.1.25,2.7.7.4  |
|2.1.1.35          |
|1.3.7.7           |
|1.1.2.3           |
|5.1.1.1,6.3.2.10  |
|3.6.4.12,5.99.1.3 |
|1.1.1.288         |
|2.7.3.2,2.7.3.3   |
|2.7.1.175         |
|6.2.1.1,6.2.1.17  |
|2.7.4.10,2.7.4.3  |
|3.2.1.25          |
|3.6.1.57          |
|1.1.1.373,1.1.1.61|
|2.7.13.1          |
|2.1.1.17,2.1.1.71 |
+------------------+

# Try matching EC numbers in 'source' field
# Note: This is exploratory - schema may need different approach

print("Attempting to match EC numbers to reactions via 'source' field...")
print("(This may not work if EC mappings are stored differently)\n")

# Sample a few EC numbers
sample_ecs = unique_ecs.limit(10).toPandas()['ec_number'].tolist()
print(f"Testing with sample ECs: {sample_ecs[:5]}")

# Build OR conditions for EC pattern matching
ec_conditions = " OR ".join([f"source LIKE '%{ec}%'" for ec in sample_ecs])

test_matches = spark.sql(f"""
    SELECT id, name, source
    FROM kbase_msd_biochemistry.reaction
    WHERE {ec_conditions}
""")

match_count = test_matches.count()
print(f"\nMatches found: {match_count}")

if match_count > 0:
    print("\nSample matches:")
    test_matches.show(truncate=False)
else:
    print("\n⚠️  No matches found via 'source' field.")
    print("   EC → reaction mapping may use different mechanism.")
    print("   Need to explore biochemistry schema further.")

Attempting to match EC numbers to reactions via 'source' field...
(This may not work if EC mappings are stored differently)

Testing with sample ECs: ['6.3.5.6,6.3.5.7', '2.1.1.77', '2.8.4.4', '2.7.7.13', '6.3.5.5']

Matches found: 0

⚠️  No matches found via 'source' field.
   EC → reaction mapping may use different mechanism.
   Need to explore biochemistry schema further.

# Create data directory
data_dir = Path("../data")
data_dir.mkdir(exist_ok=True)

# Save EC annotations (sample for now - full set is large)
print("Saving EC annotations...")
ec_sample = eggnog_ec.limit(10000).toPandas()
ec_sample_file = data_dir / "eggnog_ec_annotations_sample.tsv"
ec_sample.to_csv(ec_sample_file, sep='\t', index=False)
print(f"✅ Saved {len(ec_sample):,} EC annotations to {ec_sample_file}")

# Save unique EC numbers
print("\nSaving unique EC numbers...")
unique_ecs_pd = unique_ecs.toPandas()
unique_ecs_file = data_dir / "unique_ec_numbers.tsv"
unique_ecs_pd.to_csv(unique_ecs_file, sep='\t', index=False)
print(f"✅ Saved {len(unique_ecs_pd):,} unique EC numbers to {unique_ecs_file}")

Saving EC annotations...

✅ Saved 10,000 EC annotations to ../data/eggnog_ec_annotations_sample.tsv

Saving unique EC numbers...

✅ Saved 8,914 unique EC numbers to ../data/unique_ec_numbers.tsv

# Clean up
spark.stop()
print("✅ Spark session closed")

✅ Spark session closed

01 Extract Essential Reactions

Extract Essential Metabolic Reactions¶

Initialize Spark Session¶

Step 1: Explore Biochemistry Schema¶

Step 2: Extract EC Numbers from eggNOG Annotations¶

Step 3: Map EC Numbers to Reactions¶

Step 4: Save Intermediate Results¶

Summary & Next Steps¶