01 Extract Snipe Domains
Jupyter notebook from the project *SNIPE Defense System: Prevalence and Taxonomic Distribution in the BERDL Pangenome*.
01 — Extract SNIPE Domain Hits from BERDL Pangenome¶
Requires: BERDL REST API access (KBASE_AUTH_TOKEN)
This notebook queries the BERDL pangenome for gene clusters carrying the two signature domains of the SNIPE antiphage defense system:
- DUF4041 (PF13250 / IPR025280) — binds phage tape measure proteins and incoming DNA
- Mug113 (PF13455, GIY-YIG clan CL0418) — nuclease that cleaves phage DNA
Note: The original paper (Saxton et al. 2026) cites "PF13291" for DUF4041 and
"PF01541" for GIY-YIG, but these are incorrect. PF13291 is ACT_4 (IPR002912), and
PF01541 is canonical GIY-YIG (restriction enzymes). The correct accessions are
PF13250 and PF13455 respectively. The eggNOG PFAMs column stores domain names
(e.g., "DUF4041"), not accessions, so we search by name.
We also identify clusters carrying both domains in the same annotation, which are candidate SNIPE genes.
Reference: Saxton et al. 2026, Nature. DOI: 10.1038/s41586-026-10207-1
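Since the whole search hinges on name matching inside the comma-separated PFAMs field, here is a minimal, self-contained sketch of that matching against toy PFAMs strings (the strings and counts below are illustrative, not BERDL data):

```python
# Toy PFAMs strings in the eggNOG format: comma-separated domain names.
pfam_strings = [
    "DUF4041,MUG113,T5orf172",
    "DUF4041,T5orf172",
    "GIY-YIG,HNH",
    "ACT_4",
]

def has_domain(pfams: str, name: str) -> bool:
    """Match a domain by exact name within a comma-separated PFAMs field.

    Exact token matching avoids the substring false positives that a raw
    SQL LIKE '%name%' could produce (e.g. a query for 'DUF404' matching
    'DUF4041').
    """
    return name in (d.strip() for d in pfams.split(","))

duf_hits = [s for s in pfam_strings if has_domain(s, "DUF4041")]
print(len(duf_hits))  # 2
```

The notebook's SQL uses `LIKE '%DUF4041%'`, which is fine for these particular names; exact token matching is the stricter local equivalent.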
import os
import requests
import pandas as pd
import json
import time
from pathlib import Path
DATA_DIR = Path("../data")
DATA_DIR.mkdir(exist_ok=True)
API_URL = "https://hub.berdl.kbase.us/apis/mcp/delta/tables/query"
TOKEN = os.environ.get("KBASE_AUTH_TOKEN") or os.environ.get("KBASE_TOKEN")
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Content-Type": "application/json"}
def query_berdl(sql, limit=1000, offset=0, retries=3, backoff=10):
    """Execute a SQL query against the BERDL REST API with retry."""
    payload = {"query": sql, "limit": limit, "offset": offset}
    for attempt in range(retries):
        try:
            resp = requests.post(API_URL, headers=HEADERS, json=payload, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return pd.DataFrame(data["result"]), data.get("pagination", {})
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code
            if status in (408, 502, 503, 504) and attempt < retries - 1:
                wait = backoff * (attempt + 1)
                print(f" retry {attempt+1}/{retries} after {status}, waiting {wait}s...")
                time.sleep(wait)
            else:
                raise
def query_all(sql, limit=1000):
    """Paginate through a query using API-level offset."""
    frames = []
    offset = 0
    while True:
        df, pag = query_berdl(sql, limit=limit, offset=offset)
        frames.append(df)
        total = pag.get("total_count", len(df))
        has_more = pag.get("has_more", False)
        print(f" ... {offset + len(df)}/{total} rows")
        if not has_more or len(df) < limit:
            break
        offset += limit
    result = pd.concat(frames, ignore_index=True)
    print(f"Total rows: {len(result)}")
    return result
def q(sql, limit=1000):
    """Convenience wrapper returning just the DataFrame."""
    df, _ = query_berdl(sql, limit=limit)
    return df
print(f"Token available: {bool(TOKEN)}")
Token available: True
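The stopping rule in `query_all` (break when `has_more` is false or a short page arrives) can be exercised against a stub that stands in for `query_berdl`; this sketch never touches the real API:

```python
import pandas as pd

# Stub standing in for query_berdl: serves a fixed 25-row "table" in pages.
ROWS = [{"id": i} for i in range(25)]

def stub_query(limit=10, offset=0):
    page = ROWS[offset:offset + limit]
    pagination = {"total_count": len(ROWS),
                  "has_more": offset + len(page) < len(ROWS)}
    return pd.DataFrame(page), pagination

def paginate(limit=10):
    frames, offset = [], 0
    while True:
        df, pag = stub_query(limit=limit, offset=offset)
        frames.append(df)
        # Stop on a short page or when the source says nothing remains.
        if not pag.get("has_more", False) or len(df) < limit:
            break
        offset += limit
    return pd.concat(frames, ignore_index=True)

result = paginate(limit=10)
print(len(result))  # 25
```

With `limit=10` the stub serves pages of 10, 10, and 5 rows; the short final page triggers the break, so the loop never issues an extra empty request.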
Step 1: Count DUF4041 and GIY-YIG hits¶
The PFAMs column stores domain family names (e.g., DUF4041, GIY-YIG),
not Pfam accessions (e.g., PF13250), so we search by name.
Important discovery: DUF4041 and GIY-YIG never co-occur in the same PFAMs string (0 hits for both). Either eggNOG assigns the two domains to separate gene clusters, or only one domain per SNIPE protein is detected. We therefore use DUF4041 as the primary SNIPE marker (it is far more specific than the widespread GIY-YIG superfamily) and check for species-level co-occurrence separately.
# Count gene clusters with DUF4041
count_duf4041 = q("SELECT COUNT(*) AS n FROM kbase_ke_pangenome.eggnog_mapper_annotations WHERE PFAMs LIKE '%DUF4041%'")
print(f"DUF4041 hits: {count_duf4041['n'].iloc[0]}")
# Count gene clusters with GIY-YIG
count_giy = q("SELECT COUNT(*) AS n FROM kbase_ke_pangenome.eggnog_mapper_annotations WHERE PFAMs LIKE '%GIY-YIG%'")
print(f"GIY-YIG hits: {count_giy['n'].iloc[0]}")
# Count gene clusters with BOTH domains in same annotation (expected: 0)
count_both = q("SELECT COUNT(*) AS n FROM kbase_ke_pangenome.eggnog_mapper_annotations WHERE PFAMs LIKE '%DUF4041%' AND PFAMs LIKE '%GIY-YIG%'")
print(f"Both DUF4041 + GIY-YIG (same annotation): {count_both['n'].iloc[0]}")
# Check common domain co-occurrences with DUF4041
print("\nDUF4041 co-occurring domains (sample):")
sample_pfams = q("SELECT DISTINCT PFAMs FROM kbase_ke_pangenome.eggnog_mapper_annotations WHERE PFAMs LIKE '%DUF4041%'")
from collections import Counter
domain_counts = Counter()
for pfam_str in sample_pfams['PFAMs']:
    for d in pfam_str.split(','):
        d = d.strip()
        if d and d != 'DUF4041':
            domain_counts[d] += 1
for domain, count in domain_counts.most_common(10):
    print(f" {domain}: {count}")
DUF4041 hits: 2962
GIY-YIG hits: 74686
Both DUF4041 + GIY-YIG (same annotation): 0

DUF4041 co-occurring domains (sample):
 T5orf172: 10
 MUG113: 5
 HATPase_c: 3
 Response_reg: 3
 PSCyt2: 2
 PSD1: 2
 GGDEF: 2
 HisKA: 2
 dCache_1: 2
 7TMR-DISMED2: 2
Step 2: Extract DUF4041 clusters with pangenome status¶
Important count note: The API returns 2,962 distinct DUF4041 annotations
(unique query_name values). After joining to gene_cluster, this expands to
4,572 rows because the same gene_cluster_id can appear in multiple species
pangenomes. In BERDL, gene clusters are species-specific: if the same gene appears
in 3 species, it gets 3 rows in gene_cluster (one per gtdb_species_clade_id).
This is expected — the 4,572 figure counts species-level occurrences of DUF4041
clusters, while the 2,962 figure counts unique eggNOG annotations.
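The annotation-to-species expansion described above is just a one-to-many left join; a toy sketch with hypothetical cluster IDs shows the row count growing the same way:

```python
import pandas as pd

# Toy annotations: one row per eggNOG query_name (hypothetical IDs).
ann = pd.DataFrame({"gene_cluster_id": ["gcA", "gcB", "gcC"]})

# Toy gene_cluster table: rows are species-specific, so the same
# gene_cluster_id can appear once per gtdb_species_clade_id.
gc = pd.DataFrame({
    "gene_cluster_id": ["gcA", "gcA", "gcA", "gcB", "gcC"],
    "gtdb_species_clade_id": ["sp1", "sp2", "sp3", "sp1", "sp4"],
})

# Left join: gcA matches three species rows, so 3 annotations become 5 rows.
merged = ann.merge(gc, on="gene_cluster_id", how="left")
print(len(ann), "annotations ->", len(merged), "species-level rows")
```

This mirrors the 2,962 → 4,572 expansion: the merged row count is a species-level occurrence count, not a count of unique annotations.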
# Step 2a: Get all DUF4041 annotations (API pagination, ~3 pages of 1000)
duf4041_ann_sql = """
SELECT query_name AS gene_cluster_id,
PFAMs,
Description,
Preferred_name,
COG_category,
GOs
FROM kbase_ke_pangenome.eggnog_mapper_annotations
WHERE PFAMs LIKE '%DUF4041%'
"""
df_duf4041_ann = query_all(duf4041_ann_sql)
print(f"DUF4041 annotations: {len(df_duf4041_ann)}")
df_duf4041_ann.head()
... 1000/2962 rows
... 2000/2962 rows
 ... 2962/2962 rows
Total rows: 2962
DUF4041 annotations: 2962
| | gene_cluster_id | PFAMs | Description | Preferred_name | COG_category | GOs |
|---|---|---|---|---|---|---|
| 0 | JAFKIX010000002.1_1677 | DUF4041,MUG113,T5orf172 | T5orf172 | - | T | - |
| 1 | CAJLIF010000005.1_29 | DUF4041,T5orf172 | T5orf172 | - | J | - |
| 2 | NZ_JXMW01000010.1_84 | DUF4041,T5orf172 | T5orf172 | - | S | - |
| 3 | NZ_WTKV01000012.1_71 | DUF4041 | T5orf172 | - | M | - |
| 4 | NZ_PKNN01000035.1_23 | DUF4041,T5orf172 | T5orf172 | - | D | - |
# Step 2b: Look up gene_cluster info (species, core/accessory) in batches
cluster_ids = df_duf4041_ann['gene_cluster_id'].tolist()
gc_frames = []
batch_size = 100  # smaller batches to be gentle on API
for i in range(0, len(cluster_ids), batch_size):
    batch = cluster_ids[i:i+batch_size]
    id_list = ", ".join(f"'{cid}'" for cid in batch)
    sql = f"""
    SELECT gene_cluster_id, gtdb_species_clade_id, is_core, is_auxiliary, is_singleton
    FROM kbase_ke_pangenome.gene_cluster
    WHERE gene_cluster_id IN ({id_list})
    """
    df_batch = q(sql)
    gc_frames.append(df_batch)
    if i > 0:
        time.sleep(1)  # rate limit
df_gc = pd.concat(gc_frames, ignore_index=True)
print(f"Gene cluster info retrieved: {len(df_gc)}")
# Merge annotations with gene_cluster info
df_duf4041 = df_duf4041_ann.merge(df_gc, on='gene_cluster_id', how='left')
print(f"DUF4041 clusters with species info: {df_duf4041['gtdb_species_clade_id'].notna().sum()}/{len(df_duf4041)}")
print(f"Unique species: {df_duf4041['gtdb_species_clade_id'].nunique()}")
df_duf4041.head()
Gene cluster info retrieved: 2928
DUF4041 clusters with species info: 4538/4572
Unique species: 1696
| | gene_cluster_id | PFAMs | Description | Preferred_name | COG_category | GOs | gtdb_species_clade_id | is_core | is_auxiliary | is_singleton |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | JAFKIX010000002.1_1677 | DUF4041,MUG113,T5orf172 | T5orf172 | - | T | - | s__Protaetiibacter_sp001898835--GB_GCA_0018988... | True | False | False |
| 1 | CAJLIF010000005.1_29 | DUF4041,T5orf172 | T5orf172 | - | J | - | s__Anaerobutyricum_hallii--RS_GCF_000173975.1 | False | True | False |
| 2 | NZ_JXMW01000010.1_84 | DUF4041,T5orf172 | T5orf172 | - | S | - | s__Methanobrevibacter_C_arboriphilus--RS_GCF_0... | False | True | True |
| 3 | NZ_WTKV01000012.1_71 | DUF4041 | T5orf172 | - | M | - | s__Pseudoalteromonas_arctica--RS_GCF_000238395.3 | False | True | True |
| 4 | NZ_PKNN01000035.1_23 | DUF4041,T5orf172 | T5orf172 | - | D | - | s__Chryseobacterium_scophthalmum--RS_GCF_90014... | True | False | False |
Step 3: Extract GIY-YIG clusters with pangenome status¶
# GIY-YIG has ~75K hits — too many to pull all via REST API
giy_count = q("SELECT COUNT(*) AS n FROM kbase_ke_pangenome.eggnog_mapper_annotations WHERE PFAMs LIKE '%GIY-YIG%'")
print(f"Total GIY-YIG annotations: {giy_count['n'].iloc[0]}")
# Get a sample of GIY-YIG cluster IDs (limited by API max 1000)
print("\nFetching sample of GIY-YIG cluster IDs...")
df_giy_ids = q("""
SELECT DISTINCT query_name AS gene_cluster_id
FROM kbase_ke_pangenome.eggnog_mapper_annotations
WHERE PFAMs LIKE '%GIY-YIG%'
""")
print(f"GIY-YIG cluster IDs (sample): {len(df_giy_ids)}")
# Batch-lookup species for these clusters
giy_cluster_ids = df_giy_ids['gene_cluster_id'].tolist()
giy_gc_frames = []
batch_size = 200
for i in range(0, len(giy_cluster_ids), batch_size):
    batch = giy_cluster_ids[i:i+batch_size]
    id_list = ", ".join(f"'{cid}'" for cid in batch)
    sql = f"""
    SELECT gene_cluster_id, gtdb_species_clade_id
    FROM kbase_ke_pangenome.gene_cluster
    WHERE gene_cluster_id IN ({id_list})
    """
    df_batch = q(sql)
    giy_gc_frames.append(df_batch)
df_giy_species_raw = pd.concat(giy_gc_frames, ignore_index=True)
giy_species_counts = df_giy_species_raw.groupby('gtdb_species_clade_id').size().reset_index(name='giy_cluster_count')
giy_species_counts = giy_species_counts.sort_values('giy_cluster_count', ascending=False)
print(f"\nGIY-YIG species (from {len(giy_cluster_ids)} clusters): {len(giy_species_counts)}")
print(f"Top species by GIY-YIG count:")
print(giy_species_counts.head(10))
Total GIY-YIG annotations: 74686

Fetching sample of GIY-YIG cluster IDs...
GIY-YIG cluster IDs (sample): 1000
GIY-YIG species (from 1000 clusters): 728
Top species by GIY-YIG count:
gtdb_species_clade_id giy_cluster_count
369 s__Marinirhabdus_sp002375495--GB_GCA_002375495.1 10
654 s__UBA2144_sp018818235--GB_GCA_018818235.1 9
3 s__2-01-FULL-40-42_sp001789965--GB_GCA_0017899... 8
73 s__Bradyrhizobium_elkanii--RS_GCF_023278185.1 8
508 s__Qipengyuania_sp001542855--RS_GCF_001542855.1 6
62 s__Balneola_sp001650905--RS_GCF_001650905.1 5
6 s__2-01-FULL-45-33_sp001778595--GB_GCA_0017785... 5
74 s__Bradyrhizobium_pachyrhizi--RS_GCF_001189245.1 5
111 s__CAJBVG01_sp903958995--GB_GCA_903958995.1 5
207 s__Flavobacterium_salmonis--RS_GCF_903819435.1 5
Step 4: Species-level co-occurrence of DUF4041 and GIY-YIG¶
Since DUF4041 and GIY-YIG never co-occur in the same eggNOG PFAMs annotation (they likely reside on different gene clusters, or the eggNOG pipeline reports only one domain per protein), we instead look for species that carry both — i.e., species with at least one DUF4041 cluster AND at least one GIY-YIG cluster. We treat DUF4041 as the primary SNIPE marker since it is far more specific (2,962 vs. 74,686 clusters).
# Species-level co-occurrence: DUF4041 species that also have GIY-YIG
duf4041_species = set(df_duf4041['gtdb_species_clade_id'].dropna().unique())
giy_species = set(giy_species_counts['gtdb_species_clade_id'].unique())
both_species = duf4041_species & giy_species
duf_only = duf4041_species - giy_species
giy_only = giy_species - duf4041_species
print(f"Species with DUF4041: {len(duf4041_species)}")
print(f"Species with GIY-YIG (sampled): {len(giy_species)}")
print(f"Species with BOTH (species-level): {len(both_species)}")
print(f"Species with DUF4041 only: {len(duf_only)}")
print(f"Species with GIY-YIG only: {len(giy_only)}")
# DUF4041 is our primary SNIPE marker — use it for downstream analysis
# Tag which DUF4041-bearing species also have GIY-YIG
df_duf4041['species_has_giy_yig'] = df_duf4041['gtdb_species_clade_id'].isin(giy_species)
print(f"\nDUF4041 clusters in species that also have GIY-YIG: {df_duf4041['species_has_giy_yig'].sum()}/{len(df_duf4041)}")
# For the "putative SNIPE" dataset, use all DUF4041 clusters
# (DUF4041 is already highly specific to SNIPE-like systems)
df_snipe = df_duf4041.copy()
print(f"\nUsing DUF4041 as primary SNIPE marker: {len(df_snipe)} clusters, {df_snipe['gtdb_species_clade_id'].nunique()} species")
Species with DUF4041: 1696
Species with GIY-YIG (sampled): 728
Species with BOTH (species-level): 47
Species with DUF4041 only: 1649
Species with GIY-YIG only: 681

DUF4041 clusters in species that also have GIY-YIG: 187/4572

Using DUF4041 as primary SNIPE marker: 4572 clusters, 1696 species
Step 5: Get taxonomy for SNIPE-bearing species¶
# Get taxonomy for DUF4041-bearing species
# NOTE: BERDL API rejects queries containing '--' (SQL comment metacharacter),
# and all species clade IDs contain '--' (e.g., s__Genus_species--RS_GCF_XXX).
# Workaround: query ALL species taxonomy (no WHERE filter on clade ID), then join locally.
# NOTE: JOIN must be on genome_id (not gtdb_taxonomy_id — format differs between tables).
snipe_clade_ids = list(df_duf4041['gtdb_species_clade_id'].dropna().unique())
print(f"Need taxonomy for {len(snipe_clade_ids)} DUF4041-bearing species")
print("Fetching all species taxonomy (unfiltered) and joining locally...")
# Get all species-level taxonomy — paginate through ~45K species
all_tax_sql = """
SELECT DISTINCT g.gtdb_species_clade_id,
t.domain, t.phylum, t.`class`, t.`order`, t.family, t.genus, t.species
FROM kbase_ke_pangenome.genome g
JOIN kbase_ke_pangenome.gtdb_taxonomy_r214v1 t
ON g.genome_id = t.genome_id
"""
df_all_tax = query_all(all_tax_sql)
print(f"\nTotal species with taxonomy: {len(df_all_tax)}")
# Filter to SNIPE species
df_snipe_tax = df_all_tax[df_all_tax['gtdb_species_clade_id'].isin(snipe_clade_ids)].copy()
df_snipe_tax = df_snipe_tax.drop_duplicates(subset='gtdb_species_clade_id')
print(f"SNIPE species matched: {len(df_snipe_tax)}/{len(snipe_clade_ids)}")
if len(df_snipe_tax) > 0:
    print(f"Unique phyla: {df_snipe_tax['phylum'].nunique()}")
    print(f"\nPhylum breakdown:")
    print(df_snipe_tax['phylum'].value_counts().head(20))
else:
    print("WARNING: No taxonomy matches found!")
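The fetch-all-then-filter workaround above reduces to a local `isin` filter. A toy sketch with hypothetical clade IDs (not real BERDL values) shows why the local join sidesteps the `--` problem entirely:

```python
import pandas as pd

# Hypothetical clade IDs: every ID contains '--', which the API rejects
# when embedded in SQL text (it reads as a comment metacharacter).
wanted = ["s__GenusA_sp1--RS_GCF_001", "s__GenusB_sp2--GB_GCA_002"]
assert all("--" in cid for cid in wanted)  # cannot go into a WHERE clause

# Toy "fetch everything" result: filter locally instead of in SQL.
all_tax = pd.DataFrame({
    "gtdb_species_clade_id": [
        "s__GenusA_sp1--RS_GCF_001",
        "s__GenusB_sp2--GB_GCA_002",
        "s__GenusC_sp3--RS_GCF_003",
    ],
    "phylum": ["p__X", "p__Y", "p__X"],
})
subset = all_tax[all_tax["gtdb_species_clade_id"].isin(wanted)]
print(len(subset))  # 2
```

The cost is transferring the full taxonomy table once (~27K rows here), which is cheap compared to fighting the API's SQL restrictions.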
Need taxonomy for 1696 DUF4041-bearing species
Fetching all species taxonomy (unfiltered) and joining locally...
 ... 1000/27690 rows
 ... 2000/27690 rows
 ...
 ... 27000/27690 rows
 ... 27690/27690 rows
Total rows: 27690

Total species with taxonomy: 27690
SNIPE species matched: 1696/1696
Unique phyla: 33

Phylum breakdown:
phylum
p__Pseudomonadota       556
p__Actinomycetota       334
p__Bacillota_A          275
p__Bacillota            206
p__Bacteroidota         114
p__Nitrospirota          45
p__Cyanobacteriota       28
p__Planctomycetota       22
p__Bacillota_C           19
p__Verrucomicrobiota     19
p__Myxococcota           14
p__Chloroflexota         10
p__Spirochaetota          8
p__Methanobacteriota      7
p__Acidobacteriota        5
p__Desulfobacterota       4
p__Campylobacterota       4
p__Synergistota           4
p__Fusobacteriota         3
p__Halobacteriota         3
Name: count, dtype: int64
Step 6: Pangenome status summary¶
if len(df_snipe) > 0:
    # Convert boolean columns (values may arrive as 'true'/'false' strings)
    for col in ['is_core', 'is_auxiliary', 'is_singleton']:
        df_snipe[col] = df_snipe[col].astype(str).str.lower().map({'true': True, 'false': False})
    print("Pangenome status of putative SNIPE gene clusters:")
    print(f" Core: {df_snipe['is_core'].sum()} ({100*df_snipe['is_core'].mean():.1f}%)")
    print(f" Accessory: {df_snipe['is_auxiliary'].sum()} ({100*df_snipe['is_auxiliary'].mean():.1f}%)")
    print(f" Singleton: {df_snipe['is_singleton'].sum()} ({100*df_snipe['is_singleton'].mean():.1f}%)")
    print(f"\nDescription samples:")
    for desc in df_snipe['Description'].dropna().unique()[:10]:
        print(f" - {desc}")
else:
    print("No putative SNIPE clusters found with both domains.")
    print("Will analyze DUF4041 and GIY-YIG separately in downstream notebooks.")
Pangenome status of putative SNIPE gene clusters:
 Core: 604 (13.3%)
 Accessory: 3934 (86.7%)
 Singleton: 2566 (56.5%)

Description samples:
 - T5orf172
 - Domain of unknown function (DUF4041)
 - Protein of unknown function (DUF1549)
 - Meiotically up-regulated gene 113
 - nuclear chromosome segregation
 - Histidine kinase
 - seryl-tRNA aminoacylation
 - Psort location Cytoplasmic, score 7.50
 - extracellular polysaccharide biosynthetic process
 - translation initiation factor activity
Step 7: Save extracted data¶
df_duf4041.to_csv(DATA_DIR / "duf4041_clusters.csv", index=False)
giy_species_counts.to_csv(DATA_DIR / "giy_yig_species.csv", index=False)
df_snipe.to_csv(DATA_DIR / "snipe_both_domains.csv", index=False)
if len(df_snipe_tax) > 0:
    df_snipe_tax.to_csv(DATA_DIR / "snipe_taxonomy.csv", index=False)
if len(df_all_tax) > 0:
    df_all_tax.to_csv(DATA_DIR / "all_species_taxonomy.csv", index=False)
print("Saved:")
print(f" duf4041_clusters.csv — {len(df_duf4041)} rows")
print(f" giy_yig_species.csv — {len(giy_species_counts)} rows")
print(f" snipe_both_domains.csv — {len(df_snipe)} rows (DUF4041 = primary SNIPE marker)")
print(f" snipe_taxonomy.csv — {len(df_snipe_tax)} rows")
print(f" all_species_taxonomy.csv — {len(df_all_tax)} rows")
Saved:
 duf4041_clusters.csv — 4572 rows
 giy_yig_species.csv — 728 rows
 snipe_both_domains.csv — 4572 rows (DUF4041 = primary SNIPE marker)
 snipe_taxonomy.csv — 1696 rows
 all_species_taxonomy.csv — 27690 rows