import sqlite3
import pandas as pd
from pathlib import Path

from berdl_notebook_utils.setup_spark_session import get_spark_session

spark = get_spark_session()
print(f'Spark version: {spark.version}')

# Connect to user-provided ADP1 database
DB_PATH = Path('../user_data/berdl_tables.db')
conn = sqlite3.connect(DB_PATH)
print(f'ADP1 database: {DB_PATH}')

Spark version: 4.0.1
ADP1 database: ../user_data/berdl_tables.db

# Get genome IDs from ADP1 database
genomes = pd.read_sql('SELECT id FROM genome', conn)
berdl_ids = [g for g in genomes['id'].tolist() if g.startswith(('RS_GCF_', 'GB_GCA_'))]
print(f'ADP1 database genome IDs (BERDL format): {len(berdl_ids)}')

# Query BERDL pangenome for these genome IDs
id_list = ','.join([f"'{g}'" for g in berdl_ids])
pangenome_genomes = spark.sql(f"""
    SELECT genome_id, gtdb_species_clade_id
    FROM kbase_ke_pangenome.genome
    WHERE genome_id IN ({id_list})
""").toPandas()

matched = set(pangenome_genomes['genome_id'].tolist())
unmatched = [g for g in berdl_ids if g not in matched]

print(f'Matched in BERDL pangenome: {len(matched)}/{len(berdl_ids)}')
print()
print('Matched genomes with species clade:')
display(pangenome_genomes)

if unmatched:
    print(f'\nUnmatched: {unmatched}')

ADP1 database genome IDs (BERDL format): 13

Matched in BERDL pangenome: 13/13

Matched genomes with species clade:

# What species clade do these genomes belong to?
if len(pangenome_genomes) > 0:
    clade_ids = pangenome_genomes['gtdb_species_clade_id'].unique()
    print(f'Species clades represented: {len(clade_ids)}')
    for cid in clade_ids:
        count = (pangenome_genomes['gtdb_species_clade_id'] == cid).sum()
        # Get species name
        sp = spark.sql(f"""
            SELECT GTDB_species, no_genomes, no_core, no_aux_genome
            FROM kbase_ke_pangenome.pangenome p
            JOIN kbase_ke_pangenome.gtdb_species_clade s
              ON p.gtdb_species_clade_id = s.gtdb_species_clade_id
            WHERE p.gtdb_species_clade_id = '{cid}'
        """).toPandas()
        if len(sp) > 0:
            row = sp.iloc[0]
            print(f'  {cid}')
            print(f'    Species: {row.GTDB_species}')
            print(f'    Pangenome: {row.no_genomes} genomes, {row.no_core} core, {row.no_aux_genome} accessory')
            print(f'    ADP1 DB genomes in this clade: {count}')

Species clades represented: 1

  s__Acinetobacter_baylyi--RS_GCF_000368685.1
    Species: s__Acinetobacter_baylyi
    Pangenome: 13 genomes, 3207 core, 1684 accessory
    ADP1 DB genomes in this clade: 13

# Get unique reaction IDs from ADP1 database
adp1_rxns = pd.read_sql('SELECT DISTINCT reaction_id FROM genome_reactions', conn)
adp1_rxn_ids = adp1_rxns['reaction_id'].tolist()
print(f'ADP1 reaction IDs: {len(adp1_rxn_ids)}')

# Query BERDL biochemistry - get total count first
berdl_rxn_count = spark.sql(
    'SELECT COUNT(*) as n FROM kbase_msd_biochemistry.reaction'
).collect()[0].n
print(f'BERDL total reactions: {berdl_rxn_count}')

# Check matches in batches (IN clause limit)
batch_size = 100
all_matched = []
for i in range(0, len(adp1_rxn_ids), batch_size):
    batch = adp1_rxn_ids[i:i+batch_size]
    # Reaction IDs in biochemistry use seed.reaction: prefix
    prefixed = [f"'seed.reaction:{r}'" for r in batch]
    id_list = ','.join(prefixed)
    matched = spark.sql(f"""
        SELECT id, name
        FROM kbase_msd_biochemistry.reaction
        WHERE id IN ({id_list})
    """).toPandas()
    all_matched.append(matched)

matched_rxns = pd.concat(all_matched, ignore_index=True)
print(f'\nMatched in BERDL biochemistry: {len(matched_rxns)}/{len(adp1_rxn_ids)} ({len(matched_rxns)/len(adp1_rxn_ids)*100:.1f}%)')
print(f'\nSample matched reactions:')
display(matched_rxns.head(10))

ADP1 reaction IDs: 1330

BERDL total reactions: 56012

Matched in BERDL biochemistry: 1210/1330 (91.0%)

Sample matched reactions:

# Get unique compound IDs from gene_phenotypes
adp1_cpds = pd.read_sql(
    "SELECT DISTINCT phenotype_id FROM gene_phenotypes WHERE phenotype_id LIKE 'cpd%'", 
    conn
)
adp1_cpd_ids = adp1_cpds['phenotype_id'].tolist()
print(f'ADP1 compound IDs: {len(adp1_cpd_ids)}')

# Query BERDL biochemistry molecules (compounds are in the molecule table)
prefixed = [f"'seed.compound:{c}'" for c in adp1_cpd_ids]
id_list = ','.join(prefixed)
matched_cpds = spark.sql(f"""
    SELECT id, name, formula, mass
    FROM kbase_msd_biochemistry.molecule
    WHERE id IN ({id_list})
    ORDER BY id
""").toPandas()

print(f'Matched in BERDL biochemistry: {len(matched_cpds)}/{len(adp1_cpd_ids)} ({len(matched_cpds)/len(adp1_cpd_ids)*100:.1f}%)')
print(f'\nSample matched compounds:')
display(matched_cpds.head(15))

ADP1 compound IDs: 230

Matched in BERDL biochemistry: 230/230 (100.0%)

Sample matched compounds:

# Get ADP1 pangenome cluster IDs and species clade
adp1_clusters = pd.read_sql(
    'SELECT DISTINCT pangenome_cluster_id FROM genome_features WHERE pangenome_cluster_id IS NOT NULL',
    conn
)
adp1_cluster_ids = adp1_clusters['pangenome_cluster_id'].tolist()
print(f'ADP1 pangenome cluster IDs: {len(adp1_cluster_ids)}')
print(f'Sample: {adp1_cluster_ids[:3]}')

# These clusters are species-specific. Find the species clade first.
if len(pangenome_genomes) > 0:
    clade_id = pangenome_genomes['gtdb_species_clade_id'].iloc[0]
    print(f'\nSpecies clade: {clade_id}')
    
    # Check a sample of cluster IDs against BERDL
    sample_ids = adp1_cluster_ids[:20]
    id_list = ','.join([f"'{c}'" for c in sample_ids])
    matched_clusters = spark.sql(f"""
        SELECT gene_cluster_id, is_core, is_auxiliary, is_singleton
        FROM kbase_ke_pangenome.gene_cluster
        WHERE gtdb_species_clade_id = '{clade_id}'
          AND gene_cluster_id IN ({id_list})
    """).toPandas()
    
    print(f'Sample match: {len(matched_clusters)}/{len(sample_ids)} cluster IDs found in BERDL')
    if len(matched_clusters) > 0:
        display(matched_clusters.head(10))
    else:
        print('No matches — cluster ID format may differ between user DB and BERDL pangenome.')
        # Show what BERDL cluster IDs look like for this species
        berdl_sample = spark.sql(f"""
            SELECT gene_cluster_id, is_core
            FROM kbase_ke_pangenome.gene_cluster
            WHERE gtdb_species_clade_id = '{clade_id}'
            LIMIT 5
        """).toPandas()
        print(f'\nBERDL cluster IDs for this species look like:')
        display(berdl_sample)
        print(f'\nADP1 cluster IDs look like: {adp1_cluster_ids[:5]}')
        print('These use different naming conventions — cross-reference requires mapping via sequences or annotations.')

ADP1 pangenome cluster IDs: 2538
Sample: ['NHSXFYEX_mmseqsCluster_0450', 'NHSXFYEX_mmseqsCluster_1660', 'NHSXFYEX_mmseqsCluster_3861']

Species clade: s__Acinetobacter_baylyi--RS_GCF_000368685.1

Sample match: 0/20 cluster IDs found in BERDL
No matches — cluster ID format may differ between user DB and BERDL pangenome.

BERDL cluster IDs for this species look like:

ADP1 cluster IDs look like: ['NHSXFYEX_mmseqsCluster_0450', 'NHSXFYEX_mmseqsCluster_1660', 'NHSXFYEX_mmseqsCluster_3861', 'NHSXFYEX_mmseqsCluster_0362', 'NHSXFYEX_mmseqsCluster_0465']
These use different naming conventions — cross-reference requires mapping via sequences or annotations.

# Search for Acinetobacter in fitness browser
fb_organisms = spark.sql("""
    SELECT orgId, genus, species, strain, taxonomyId
    FROM kescience_fitnessbrowser.organism
    WHERE genus LIKE '%Acinetobacter%'
       OR species LIKE '%baylyi%'
""").toPandas()

if len(fb_organisms) > 0:
    print(f'Acinetobacter in Fitness Browser: {len(fb_organisms)} organisms')
    display(fb_organisms)
else:
    print('No Acinetobacter found in Fitness Browser.')
    print('The ADP1 mutant growth data in the user database is from a different source.')

No Acinetobacter found in Fitness Browser.
The ADP1 mutant growth data in the user database is from a different source.

# Check PhageFoundry Acinetobacter tables
pf_tables = spark.sql(
    'SHOW TABLES IN phagefoundry_acinetobacter_genome_browser'
).toPandas()
print(f'PhageFoundry Acinetobacter tables: {len(pf_tables)}')
print(pf_tables['tableName'].tolist()[:15])

# Check if they have a genome/strain table
for table_name in ['genome', 'strain', 'organism', 'assembly']:
    if table_name in pf_tables['tableName'].values:
        sample = spark.sql(f"""
            SELECT * FROM phagefoundry_acinetobacter_genome_browser.{table_name}
            LIMIT 3
        """).toPandas()
        print(f'\n=== {table_name} (sample) ===')
        print(f'Columns: {list(sample.columns)}')
        display(sample)

PhageFoundry Acinetobacter tables: 37
['browser_genome', 'browser_annotation', 'browser_cazy_family', 'browser_cog_class', 'browser_contig', 'browser_ec_number', 'browser_eggnog_description', 'browser_gene', 'browser_genome_tags', 'browser_go_term', 'browser_kegg_ortholog', 'browser_kegg_pathway', 'browser_kegg_reaction', 'browser_operon', 'browser_ortholog_group']

# Search for ADP1 or baylyi in PhageFoundry
for table_name in pf_tables['tableName'].tolist():
    try:
        schema = spark.sql(f'DESCRIBE phagefoundry_acinetobacter_genome_browser.{table_name}').toPandas()
        text_cols = schema[schema['data_type'].isin(['string', 'varchar'])]['col_name'].tolist()
        for col in text_cols[:3]:  # check first 3 text columns
            result = spark.sql(f"""
                SELECT COUNT(*) as n
                FROM phagefoundry_acinetobacter_genome_browser.{table_name}
                WHERE {col} LIKE '%baylyi%' OR {col} LIKE '%ADP1%'
            """).collect()[0].n
            if result > 0:
                print(f'Found ADP1/baylyi in {table_name}.{col}: {result} rows')
                sample = spark.sql(f"""
                    SELECT *
                    FROM phagefoundry_acinetobacter_genome_browser.{table_name}
                    WHERE {col} LIKE '%baylyi%' OR {col} LIKE '%ADP1%'
                    LIMIT 3
                """).toPandas()
                display(sample)
                break
    except Exception as e:
        continue

# Build summary table
summary = []

summary.append({
    'Connection': 'Genome IDs → Pangenome',
    'ADP1 Count': len(berdl_ids),
    'BERDL Matches': len(pangenome_genomes),
    'Match %': f'{len(pangenome_genomes)/len(berdl_ids)*100:.0f}%' if berdl_ids else 'N/A',
    'Status': 'Strong' if len(pangenome_genomes) > 0 else 'None',
})

summary.append({
    'Connection': 'Reactions → Biochemistry',
    'ADP1 Count': len(adp1_rxn_ids),
    'BERDL Matches': len(matched_rxns),
    'Match %': f'{len(matched_rxns)/len(adp1_rxn_ids)*100:.0f}%',
    'Status': 'Strong' if len(matched_rxns)/len(adp1_rxn_ids) > 0.5 else 'Partial',
})

summary.append({
    'Connection': 'Compounds → Biochemistry',
    'ADP1 Count': len(adp1_cpd_ids),
    'BERDL Matches': len(matched_cpds),
    'Match %': f'{len(matched_cpds)/len(adp1_cpd_ids)*100:.0f}%',
    'Status': 'Strong' if len(matched_cpds)/len(adp1_cpd_ids) > 0.5 else 'Partial',
})

summary.append({
    'Connection': 'Cluster IDs → Pangenome',
    'ADP1 Count': len(adp1_cluster_ids),
    'BERDL Matches': len(matched_clusters),
    'Match %': f'{len(matched_clusters)}/20 sample' if len(matched_clusters) > 0 else '0/20 sample',
    'Status': 'Direct match' if len(matched_clusters) > 0 else 'ID format differs',
})

summary.append({
    'Connection': 'ADP1 → Fitness Browser',
    'ADP1 Count': 1,
    'BERDL Matches': len(fb_organisms),
    'Match %': 'Yes' if len(fb_organisms) > 0 else 'No',
    'Status': 'Available' if len(fb_organisms) > 0 else 'Not in FB',
})

summary_df = pd.DataFrame(summary)
print('=== BERDL Connection Scan Summary ===')
print()
display(summary_df)

# Save summary
summary_df.to_csv('../data/berdl_connection_summary.csv', index=False)
print('\nSaved: data/berdl_connection_summary.csv')

=== BERDL Connection Scan Summary ===

Saved: data/berdl_connection_summary.csv

conn.close()
spark.stop()
print('Connections closed.')

Connections closed.

	id	name
0	seed.reaction:rxn00001	diphosphate phosphohydrolase
1	seed.reaction:rxn00107	ATP:2'-deoxy-5-hydroxymethylcytidine-5'-diphos...
2	seed.reaction:rxn00165	L-serine ammonia-lyase
3	seed.reaction:rxn00172	Acetate:CoA ligase (ADP-forming)
4	seed.reaction:rxn00192	acetyl-CoA:L-glutamate N-acetyltransferase
5	seed.reaction:rxn00199	oxalosuccinate carboxy-lyase (2-oxoglutarate-f...
6	seed.reaction:rxn00208	Pyridoxamine-5'-phosphate:oxygen oxidoreductas...
7	seed.reaction:rxn00211	UDP-glucose:NAD+ 6-oxidoreductase
8	seed.reaction:rxn00262	L-Aspartic acid:oxygen oxidoreductase (deamina...
9	seed.reaction:rxn00337	ATP:L-aspartate 4-phosphotransferase

	id	name	formula	mass
0	seed.compound:cpd00020	Pyruvate	C3H3O3	87.0
1	seed.compound:cpd00023	L-Glutamate	C5H8NO4	146.0
2	seed.compound:cpd00024	2-Oxoglutarate	C5H4O5	144.0
3	seed.compound:cpd00027	D-Glucose	C6H12O6	180.0
4	seed.compound:cpd00029	Acetate	C2H3O2	59.0
5	seed.compound:cpd00033	Glycine	C2H5NO2	75.0
6	seed.compound:cpd00035	L-Alanine	C3H7NO2	89.0
7	seed.compound:cpd00036	Succinate	C4H4O4	116.0
8	seed.compound:cpd00039	L-Lysine	C6H15N2O2	147.0
9	seed.compound:cpd00040	Glyoxalate	C2HO3	73.0
10	seed.compound:cpd00041	L-Aspartate	C4H6NO4	132.0
11	seed.compound:cpd00047	Formate	CHO2	45.0
12	seed.compound:cpd00051	L-Arginine	C6H15N4O2	175.0
13	seed.compound:cpd00053	L-Glutamine	C5H10N2O3	146.0
14	seed.compound:cpd00054	L-Serine	C3H7NO3	105.0

02 Berdl Connection Scan

02 — BERDL Connection Scan¶

1. Genome IDs → Pangenome¶

2. Reaction IDs → Biochemistry¶

3. Compound IDs → Biochemistry¶

4. Pangenome Clusters¶

5. Fitness Browser — ADP1¶

6. PhageFoundry — Acinetobacter¶

7. Connection Summary¶

	genome_id	gtdb_species_clade_id
0	GB_GCA_002694305.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
1	RS_GCF_000046845.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
2	RS_GCF_000302115.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
3	RS_GCF_000368685.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
4	RS_GCF_000621045.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
5	RS_GCF_001485005.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
6	RS_GCF_010577805.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
7	RS_GCF_010577855.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
8	RS_GCF_010577875.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
9	RS_GCF_010577895.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
10	RS_GCF_010577925.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
11	RS_GCF_010577955.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1
12	RS_GCF_900465415.1	s__Acinetobacter_baylyi--RS_GCF_000368685.1

	gene_cluster_id	is_core
0	NZ_MPVY01000010.1_125	True
1	NZ_MPVY01000010.1_126	True
2	NZ_MPVY01000010.1_156	True
3	NZ_MPVY01000010.1_157	True
4	NZ_MPVY01000010.1_158	False

	Connection	ADP1 Count	BERDL Matches	Match %	Status
0	Genome IDs → Pangenome	13	13	100%	Strong
1	Reactions → Biochemistry	1330	1210	91%	Strong
2	Compounds → Biochemistry	230	230	100%	Strong
3	Cluster IDs → Pangenome	2538	0	0/20 sample	ID format differs
4	ADP1 → Fitness Browser	1	0	No	Not in FB