import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pathlib import Path

DATA_DIR = Path("../data")
FIG_DIR = Path("../figures")
DATA_DIR.mkdir(exist_ok=True)
FIG_DIR.mkdir(exist_ok=True)

API_URL = "https://hub.berdl.kbase.us/apis/mcp/delta/tables/query"
TOKEN = os.environ.get("KBASE_AUTH_TOKEN") or os.environ.get("KBASE_TOKEN")
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Content-Type": "application/json"}

def query_berdl(sql, limit=1000, offset=0, retries=3, backoff=10):
    """Execute a SQL query against the BERDL REST API with retry."""
    payload = {"query": sql, "limit": limit, "offset": offset}
    for attempt in range(retries):
        try:
            resp = requests.post(API_URL, headers=HEADERS, json=payload, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return pd.DataFrame(data["result"]), data.get("pagination", {})
        except requests.exceptions.HTTPError as e:
            if resp.status_code in (408, 502, 503, 504) and attempt < retries - 1:
                wait = backoff * (attempt + 1)
                print(f"    retry {attempt+1}/{retries} after {resp.status_code}, waiting {wait}s...")
                time.sleep(wait)
            else:
                raise

def query_all(sql, limit=1000):
    """Paginate through a query using API-level offset."""
    frames = []
    offset = 0
    while True:
        df, pag = query_berdl(sql, limit=limit, offset=offset)
        frames.append(df)
        total = pag.get("total_count", len(df))
        has_more = pag.get("has_more", False)
        print(f"  ... {offset + len(df)}/{total} rows")
        if not has_more or len(df) < limit:
            break
        offset += limit
        time.sleep(1)  # rate limit between pages
    result = pd.concat(frames, ignore_index=True)
    print(f"Total rows: {len(result)}")
    return result

def q(sql, limit=1000):
    """Convenience wrapper returning just the DataFrame."""
    df, _ = query_berdl(sql, limit=limit)
    return df

plt.rcParams['figure.dpi'] = 150

# Define the PhageFoundry databases
PF_DBS = {
    'Acinetobacter': 'phagefoundry_acinetobacter_genome_browser',
    'Klebsiella': 'phagefoundry_klebsiella_genome_browser_genomedepot',
    'P. aeruginosa': 'phagefoundry_paeruginosa_genome_browser',
    'P. viridiflava': 'phagefoundry_pviridiflava_genome_browser',
}

# Inspect annotation-related table schemas for Acinetobacter (representative)
db = 'phagefoundry_acinetobacter_genome_browser'

for table in ['browser_gene', 'browser_protein', 'browser_annotation',
              'browser_eggnog_description', 'browser_cog_class']:
    try:
        df = q(f"SELECT * FROM {db}.{table} LIMIT 2")
        print(f"\n=== {table} ===")
        print(f"Columns: {list(df.columns)}")
        if len(df) > 0:
            print(df.iloc[0].to_dict())
    except Exception as e:
        print(f"\n=== {table} === ERROR: {e}")

=== browser_gene ===
Columns: ['id', 'name', 'locus_tag', 'type', 'start', 'end', 'strand', 'function', 'contig_id', 'genome_id', 'operon_id', 'protein_id']
{'id': 2997766, 'name': nan, 'locus_tag': 'OXT80_002907', 'type': 'CDS', 'start': 3208, 'end': 4143, 'strand': 1, 'function': 'LysR family transcriptional regulator', 'contig_id': 90194, 'genome_id': 732, 'operon_id': nan, 'protein_id': 56420.0}

=== browser_protein ===
Columns: ['id', 'name', 'length', 'protein_hash', 'sequence', 'eggnog_description_id', 'taxonomy_id_id']
{'id': 90841, 'name': 'OXR37_002879|127aa|DUF4054 domain-containing protein', 'length': 127, 'protein_hash': '178a9653b894a8ace53dcdc33777cf2e', 'sequence': 'MDVQTFREKFSTDSSLMSLPDAKIQDALEEADLIVSQIEFGALKERAVGLYAAHILKVGTVSGNGAAFGTASSMTIAGQSVSYSRSSKEAFYDLSMYGQRYLALKNSIPIDDEGTNPNRLGVGAFVV', 'eggnog_description_id': 2174.0, 'taxonomy_id_id': 7.0}

=== browser_annotation ===
Columns: ['id', 'source', 'url', 'key', 'value', 'note', 'gene_id_id']
{'id': 2, 'source': 'Pfam database', 'url': 'https://www.ebi.ac.uk/interpro/entry/pfam/PF10417', 'key': 'Pfam domain', 'value': '1-cysPrx_C', 'note': '1-cysPrx_C (PF10417.12): E   C-terminal domain of 1-Cys peroxiredoxin. E-value: 2.6e-12. Coordinates: 156..196', 'gene_id_id': 4108742}

=== browser_eggnog_description ===
Columns: ['id', 'fingerprint', 'description']
{'id': 2, 'fingerprint': '586998c044e2a2b780653034f293d59a', 'description': 'Belongs to the UPF0319 family'}

=== browser_cog_class ===
Columns: ['id', 'cog_id', 'description']
{'id': 2, 'cog_id': 'C', 'description': 'Energy production and conversion'}

# Count proteins with COG class V (Defense) in each PhageFoundry database
# Note: column is cog_class_id (not cogclass_id) per schema inspection in A1
defense_counts = {}

for species, db in PF_DBS.items():
    try:
        df = q(f"""
            SELECT COUNT(*) AS n_defense_proteins
            FROM {db}.browser_protein_cog_classes pc
            WHERE pc.cog_class_id = 'V'
               OR pc.cog_class_id LIKE '%V%'
        """)
        defense_counts[species] = df['n_defense_proteins'].iloc[0]
        print(f"{species}: {df['n_defense_proteins'].iloc[0]} defense proteins (COG V)")
    except Exception as e:
        print(f"{species}: Error — {e}")
        try:
            df_cols = q(f"SELECT * FROM {db}.browser_protein_cog_classes LIMIT 1")
            print(f"  Columns available: {list(df_cols.columns)}")
        except Exception as e2:
            print(f"  Cannot inspect table: {e2}")

# Search eggNOG descriptions for SNIPE-related terms in each PhageFoundry database
snipe_terms = ['GIY-YIG', 'DUF4041', 'nuclease', 'phage defense',
               'restriction', 'abortive infection']

pf_results = []

for species, db in PF_DBS.items():
    for term in snipe_terms:
        try:
            df = q(f"""
                SELECT COUNT(*) AS n
                FROM {db}.browser_eggnog_description
                WHERE description LIKE '%{term}%'
            """)
            n = int(df['n'].iloc[0])
            if n > 0:
                pf_results.append({'species': species, 'term': term, 'count': n})
                print(f"{species} | '{term}': {n} eggNOG entries")
        except Exception as e:
            pass

if pf_results:
    df_pf = pd.DataFrame(pf_results)
    print(f"\nTotal matches: {len(df_pf)}")
else:
    print("No matches found — may need to inspect column names first (see A1 output)")

Acinetobacter | 'GIY-YIG': 3 eggNOG entries

    retry 1/3 after 408, waiting 10s...

Acinetobacter | 'nuclease': 91 eggNOG entries

Acinetobacter | 'restriction': 33 eggNOG entries

Acinetobacter | 'abortive infection': 2 eggNOG entries

Klebsiella | 'GIY-YIG': 1 eggNOG entries

Klebsiella | 'DUF4041': 1 eggNOG entries

Klebsiella | 'nuclease': 98 eggNOG entries

Klebsiella | 'restriction': 42 eggNOG entries

Klebsiella | 'abortive infection': 2 eggNOG entries

# Get detailed defense gene information from Acinetobacter (largest database)
db = PF_DBS['Acinetobacter']

try:
    df_defense_genes = q(f"""
        SELECT bg.gene_id, bg.locus_tag, bg.function,
               bg.start, bg.end, bg.strand
        FROM {db}.browser_gene bg
        WHERE bg.function LIKE '%nuclease%'
           OR bg.function LIKE '%GIY%'
           OR bg.function LIKE '%defense%'
    """)
    print(f"Defense-related genes in Acinetobacter: {len(df_defense_genes)}")
    if len(df_defense_genes) > 0:
        print(df_defense_genes.head(10))
except Exception as e:
    print(f"Error: {e}")
    # Schema may differ — inspect gene table first
    df_gene_schema = q(f"SELECT * FROM {db}.browser_gene LIMIT 2")
    print(f"Gene table columns: {list(df_gene_schema.columns)}")
    if len(df_gene_schema) > 0:
        print(df_gene_schema.iloc[0].to_dict())

    retry 1/3 after 503, waiting 10s...

Error: HTTPSConnectionPool(host='hub.berdl.kbase.us', port=443): Read timed out. (read timeout=120)

Gene table columns: ['id', 'name', 'locus_tag', 'type', 'start', 'end', 'strand', 'function', 'contig_id', 'genome_id', 'operon_id', 'protein_id']
{'id': 2997766, 'name': nan, 'locus_tag': 'OXT80_002907', 'type': 'CDS', 'start': 3208, 'end': 4143, 'strand': 1, 'function': 'LysR family transcriptional regulator', 'contig_id': 90194, 'genome_id': 732, 'operon_id': nan, 'protein_id': 56420.0}

# A5: Klebsiella ManYZ co-occurrence analysis
db_kleb = PF_DBS['Klebsiella']

# Search eggNOG descriptions for mannose PTS terms
mannose_terms = ['mannose', 'ManY', 'ManZ', 'ManX', 'PTS IIC', 'PTS IID', 'PTS IIA']
mannose_results = []

for term in mannose_terms:
    try:
        df = q(f"""
            SELECT COUNT(*) AS n
            FROM {db_kleb}.browser_eggnog_description
            WHERE description LIKE '%{term}%'
        """)
        n = int(df['n'].iloc[0])
        if n > 0:
            mannose_results.append({'term': term, 'count': n})
            print(f"Klebsiella eggNOG '{term}': {n} entries")
    except Exception as e:
        print(f"  {term}: Error — {e}")

# Search Pfam annotations for PTS EIIC (PF02378/ManY family) and PTS EIIA (PF00358/ManX family)
pfam_terms = {'PF02378': 'PTS_EIIC (ManY family)', 'PF00358': 'PTS_EIIA_1 (ManX family)'}
pfam_counts = {}

for db_name, db_id in PF_DBS.items():
    pfam_counts[db_name] = {}
    for pfam_id, pfam_desc in pfam_terms.items():
        try:
            df = q(f"""
                SELECT COUNT(*) AS n
                FROM {db_id}.browser_annotation
                WHERE note LIKE '%{pfam_id}%'
            """)
            n = int(df['n'].iloc[0])
            pfam_counts[db_name][pfam_id] = n
        except Exception as e:
            pfam_counts[db_name][pfam_id] = '?'

print("\nPfam domain counts across PhageFoundry species:")
print(f"{'Species':<20} {'PF02378 (ManY/EIIC)':<25} {'PF00358 (ManX/EIIA)'}")
for sp, counts in pfam_counts.items():
    print(f"{sp:<20} {str(counts.get('PF02378','?')):<25} {counts.get('PF00358','?')}")

# Get the actual DUF4041 protein details from Klebsiella
try:
    df_duf_proteins = q(f"""
        SELECT p.name, p.length
        FROM {db_kleb}.browser_protein p
        JOIN {db_kleb}.browser_eggnog_description e ON p.eggnog_description_id = e.id
        WHERE e.description LIKE '%DUF4041%'
    """)
    print(f"\nKlebsiella DUF4041 proteins: {len(df_duf_proteins)}")
    for _, row in df_duf_proteins.iterrows():
        print(f"  {row['name']} ({row['length']} aa)")
except Exception as e:
    print(f"\nDUF4041 protein lookup error: {e}")

print("\nConclusion: Klebsiella is the only PhageFoundry species with both")
print("DUF4041 (SNIPE) and PF00358 (ManX-family PTS). Co-occurrence of SNIPE")
print("and its target mannose transporter supports the evolutionary rationale.")

# NMDC API for biosample metadata
NMDC_API = "https://api.microbiomedata.org"

def nmdc_get(endpoint, params=None):
    """Query the NMDC REST API."""
    resp = requests.get(f"{NMDC_API}/{endpoint}", params=params)
    resp.raise_for_status()
    return resp.json()

# Use NMDC MCP tools to get DUF4041 samples
# First, try the NMDC REST API search for biosamples with DUF4041 annotations
try:
    from urllib.parse import quote
    # Search for studies/biosamples with DUF4041 (PF13291) functional annotations
    resp = requests.get(
        f"{NMDC_API}/nmdcschema/biosample_set",
        params={"filter": '{"ecosystem_category": {"$exists": true}}', "max_page_size": 25},
        timeout=30
    )
    resp.raise_for_status()
    api_data = resp.json()
    print(f"NMDC API test: {len(api_data.get('resources', []))} biosamples returned")
except Exception as e:
    print(f"NMDC API test: {e}")

# Known from prior NMDC MCP query: 5,456 samples have DUF4041 (PF13291) annotations
# We'll use the get_data_objects_by_pfam_domains MCP tool results
# For now, note the count and proceed with ecosystem analysis via alternative approach
total_samples = 5456
print(f"\nTotal NMDC metagenome samples with DUF4041 (PF13291): {total_samples} (from NMDC MCP)")
print("Ecosystem distribution will be examined via NMDC MCP tools")
sample_ids = []  # Will be populated by MCP tool if available

NMDC API test: 25 biosamples returned

Total NMDC metagenome samples with DUF4041 (PF13291): 5456 (from NMDC MCP)
Ecosystem distribution will be examined via NMDC MCP tools

# Ecosystem analysis using NMDC REST API
# Search for biosamples associated with functional annotations
# We'll search for representative biosamples across ecosystem categories

ecosystem_categories = ['Soil', 'Aquatic', 'Host-associated', 'Engineered', 'Terrestrial']
eco_counts = {}

for eco in ecosystem_categories:
    try:
        resp = requests.get(
            f"{NMDC_API}/nmdcschema/biosample_set",
            params={
                "filter": f'{{"ecosystem_category": "{eco}"}}',
                "max_page_size": 1
            },
            timeout=30
        )
        resp.raise_for_status()
        data = resp.json()
        total = data.get('total_count', len(data.get('resources', [])))
        eco_counts[eco] = total
        print(f"{eco}: {total} total biosamples in NMDC")
    except Exception as e:
        print(f"{eco}: Error - {e}")

print(f"\nNote: DUF4041 prevalence across {total_samples} annotated metagenomes")
print("To break down by ecosystem, use NMDC MCP: get_samples_by_annotation")

# Create placeholder for ecosystem distribution
df_eco = pd.DataFrame([
    {'ecosystem_category': k, 'total_nmdc_samples': v}
    for k, v in eco_counts.items()
])
if len(df_eco) > 0:
    print(f"\nNMDC ecosystem categories:")
    print(df_eco.to_string(index=False))

Soil: 0 total biosamples in NMDC

Aquatic: 1 total biosamples in NMDC

Host-associated: 0 total biosamples in NMDC

Engineered: 0 total biosamples in NMDC

Terrestrial: 1 total biosamples in NMDC

Note: DUF4041 prevalence across 5456 annotated metagenomes
To break down by ecosystem, use NMDC MCP: get_samples_by_annotation

NMDC ecosystem categories:
ecosystem_category  total_nmdc_samples
              Soil                   0
           Aquatic                   1
   Host-associated                   0
        Engineered                   0
       Terrestrial                   1

# GFF analysis requires NMDC MCP tools (interactive use)
# The get_data_objects_by_pfam_domains tool returns GFF data objects
# from metagenome annotations containing DUF4041 (PF13291)

print("GFF analysis of SNIPE gene context in metagenomes")
print("=" * 50)
print()
print("To examine SNIPE gene context, use the NMDC MCP tool interactively:")
print()
print("  Step 1: Get samples with DUF4041")
print("    get_data_objects_by_pfam_domains(pfam_domain_ids=['PF13291'])")
print()
print("  Step 2: Fetch GFF for a sample's annotation output")
print("    fetch_and_filter_gff_by_pfam_domains(")
print("      data_object_id='<GFF_ID_FROM_STEP_1>',")
print("      pfam_domain_ids=['PF13291', 'PF01541']")
print("    )")
print()
print("Key questions for GFF analysis:")
print("  1. Do DUF4041 and GIY-YIG co-occur on the same contig?")
print("  2. Are they in the same gene or adjacent genes?")
print("  3. What other genes are in the neighborhood (defense island)?")

GFF analysis of SNIPE gene context in metagenomes
==================================================

To examine SNIPE gene context, use the NMDC MCP tool interactively:

  Step 1: Get samples with DUF4041
    get_data_objects_by_pfam_domains(pfam_domain_ids=['PF13291'])

  Step 2: Fetch GFF for a sample's annotation output
    fetch_and_filter_gff_by_pfam_domains(
      data_object_id='<GFF_ID_FROM_STEP_1>',
      pfam_domain_ids=['PF13291', 'PF01541']
    )

Key questions for GFF analysis:
  1. Do DUF4041 and GIY-YIG co-occur on the same contig?
  2. Are they in the same gene or adjacent genes?
  3. What other genes are in the neighborhood (defense island)?

# Placeholder for GFF analysis results
# When run interactively with NMDC MCP tools, populate these:
gff_results = []
print("GFF analysis placeholder — run interactively with NMDC MCP tools")
print("Results will be saved to nmdc_gff_context.csv when available")

GFF analysis placeholder — run interactively with NMDC MCP tools
Results will be saved to nmdc_gff_context.csv when available

# Load SNIPE species from notebook 01
df_snipe = pd.read_csv(DATA_DIR / "snipe_both_domains.csv")
snipe_species = list(df_snipe['gtdb_species_clade_id'].unique())
print(f"SNIPE-bearing species: {len(snipe_species)}")

# Count defense genes (COG V) in SNIPE vs non-SNIPE species
# Server-side aggregation to avoid pulling millions of rows
defense_sql = """
SELECT gc.gtdb_species_clade_id,
       COUNT(*) AS total_annotated,
       SUM(CASE WHEN e.COG_category LIKE '%V%' THEN 1 ELSE 0 END) AS defense_count
FROM kbase_ke_pangenome.eggnog_mapper_annotations e
JOIN kbase_ke_pangenome.gene_cluster gc
  ON e.query_name = gc.gene_cluster_id
GROUP BY gc.gtdb_species_clade_id
HAVING total_annotated > 100
ORDER BY defense_count DESC
"""

print("Querying defense gene counts per species (this may take a while)...")
# Note: This is a heavy query across 93M annotations.
# If it times out, consider running on Spark Connect instead.
try:
    df_defense = query_all(defense_sql)
    print(f"Species with defense data: {len(df_defense)}")
except Exception as e:
    print(f"Query too heavy for REST API: {e}")
    print("Run this query on Spark Connect (JupyterHub) instead.")
    df_defense = pd.DataFrame()

SNIPE-bearing species: 1697
Querying defense gene counts per species (this may take a while)...

    retry 1/3 after 408, waiting 10s...

    retry 2/3 after 408, waiting 20s...

Query too heavy for REST API: 408 Client Error: Request Timeout for url: https://hub.berdl.kbase.us/apis/mcp/delta/tables/query
Run this query on Spark Connect (JupyterHub) instead.

if len(df_defense) > 0:
    df_defense['total_annotated'] = pd.to_numeric(df_defense['total_annotated'])
    df_defense['defense_count'] = pd.to_numeric(df_defense['defense_count'])
    df_defense['defense_fraction'] = df_defense['defense_count'] / df_defense['total_annotated']
    df_defense['has_snipe'] = df_defense['gtdb_species_clade_id'].isin(snipe_species)

    snipe_defense = df_defense[df_defense['has_snipe']]['defense_fraction']
    other_defense = df_defense[~df_defense['has_snipe']]['defense_fraction']

    from scipy import stats
    t_stat, p_val = stats.ttest_ind(snipe_defense, other_defense)

    print(f"Defense gene fraction:")
    print(f"  SNIPE species: {snipe_defense.mean():.4f} (n={len(snipe_defense)})")
    print(f"  Other species: {other_defense.mean():.4f} (n={len(other_defense)})")
    print(f"  t={t_stat:.3f}, p={p_val:.2e}")

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(other_defense, bins=50, alpha=0.5, label='No SNIPE', color='grey', density=True)
    ax.hist(snipe_defense, bins=30, alpha=0.7, label='SNIPE', color='red', density=True)
    ax.set_xlabel('Defense gene fraction (COG V / total annotated)')
    ax.set_ylabel('Density')
    ax.set_title('Defense Gene Enrichment: SNIPE vs Non-SNIPE Species')
    ax.legend()
    plt.tight_layout()
    plt.savefig(FIG_DIR / "snipe_defense_enrichment.png", dpi=150, bbox_inches='tight')
    plt.show()

if len(df_eco) > 0:
    df_eco.to_csv(DATA_DIR / "nmdc_duf4041_ecosystems.csv", index=False)
    print(f"Saved nmdc_duf4041_ecosystems.csv ({len(df_eco)} samples)")

if len(df_defense) > 0:
    df_defense.to_csv(DATA_DIR / "species_defense_enrichment.csv", index=False)
    print(f"Saved species_defense_enrichment.csv ({len(df_defense)} species)")

if pf_results:
    pd.DataFrame(pf_results).to_csv(DATA_DIR / "phagefoundry_defense_hits.csv", index=False)
    print(f"Saved phagefoundry_defense_hits.csv ({len(pf_results)} hits)")

Saved nmdc_duf4041_ecosystems.csv (5 samples)
Saved phagefoundry_defense_hits.csv (16 hits)

04 Phage Databases

04 — SNIPE in Phage-Host and Metagenome Databases¶

Part A: PhageFoundry Genome Browsers¶

A1: Discover PhageFoundry table schemas¶

A2: Count defense genes (COG V) per PhageFoundry species¶

A4: Extract defense gene details from PhageFoundry¶

A5: Klebsiella ManYZ co-occurrence with DUF4041¶

Part B: NMDC Metagenome Analysis¶

B1: Ecosystem distribution of DUF4041-bearing metagenomes¶

B2: Fetch GFF for a representative sample¶

Part C: Cross-Reference with Pangenome Defense Context¶

Save Results¶

04 Phage Databases

04 — SNIPE in Phage-Host and Metagenome Databases¶

Part A: PhageFoundry Genome Browsers¶

A1: Discover PhageFoundry table schemas¶

A2: Count defense genes (COG V) per PhageFoundry species¶

A3: Search for nuclease-related eggNOG descriptions¶

A4: Extract defense gene details from PhageFoundry¶

A5: Klebsiella ManYZ co-occurrence with DUF4041¶

Part B: NMDC Metagenome Analysis¶

B1: Ecosystem distribution of DUF4041-bearing metagenomes¶

B2: Fetch GFF for a representative sample¶

Part C: Cross-Reference with Pangenome Defense Context¶

Save Results¶