import pandas as pd
import numpy as np
from pathlib import Path

# Directory that receives this notebook's derived tables; created if absent.
DATA_DIR = Path('../data')
DATA_DIR.mkdir(exist_ok=True)

# Experiment metadata, read from the copy kept under the fitness_modules project.
exp_df = pd.read_csv('../../fitness_modules/data/annotations/DvH_experiments.csv')

# Orientation printout: table size, schema, group sizes, condition_1 coverage.
print(f"Total experiments: {len(exp_df)}")
print(f"Columns: {exp_df.columns.tolist()}", end="\n\n")
print("expGroup distribution:")
print(exp_df['expGroup'].value_counts(), end="\n\n")
print(f"Unique condition_1 values: {exp_df['condition_1'].nunique()}")
print(f"Missing condition_1: {exp_df['condition_1'].isna().sum()}")

Total experiments: 757
Columns: ['expName', 'expDesc', 'expGroup', 'condition_1', 'media', 'cor12', 'mad12', 'nMapped']

expGroup distribution:
expGroup
stress                 268
nutrient               202
respiratory growth     145
nitrogen source        125
fermentative growth     17
Name: count, dtype: int64

Unique condition_1 values: 91
Missing condition_1: 99

# Classification mapping: condition_1 substring -> category.
# classify_condition() lower-cases condition_1 and walks this list in order;
# the FIRST pattern found as a substring wins. Consequently a pattern that
# contains another pattern (e.g. 'persulfate' contains 'sulfate') must be
# listed BEFORE the shorter one, otherwise the longer rule can never fire.
CLASSIFICATION_RULES = [
    # Field-core: DvH primary metabolism
    ('d,l-lactate', 'field-core'),  # before 'l-lactate', which it contains
    ('d-lactate', 'field-core'),
    ('l-lactate', 'field-core'),
    ('formate', 'field-core'),
    ('pyruvate', 'field-core'),
    ('fumarate', 'field-core'),

    # Field-stress: FRC contaminants and environmental stresses
    # 'persulfate' must come before the 'sulfate' rule further down.
    ('uranyl', 'field-stress'),
    ('mercury', 'field-stress'),
    ('chromate', 'field-stress'),
    ('nitrate', 'field-stress'),
    ('peroxynitrite', 'field-stress'),  # before 'nitrite', which it contains
    ('nitrite', 'field-stress'),
    # NOTE(review): N2 is dinitrogen, not oxygen; the earlier comment here
    # read "oxygen stress (anaerobe!)". Category kept as-is -- confirm that
    # 'field-stress' is really the intended class for N2 experiments.
    ('n2', 'field-stress'),
    ('deta/no', 'field-stress'),  # NO donor
    ('persulfate', 'field-stress'),  # oxidizing agent, NOT sulfate metabolism
    ('perchlorate', 'field-stress'),  # before 'chlorate', which it contains
    ('chlorite', 'field-stress'),
    ('chlorate', 'field-stress'),
    ('fluorophosphate', 'field-stress'),

    # Heavy metals (environmental)
    # 'zinc' must come before 'sulfate' so that zinc sulfate lands here.
    ('zinc', 'heavy-metals'),
    ('cobalt', 'heavy-metals'),
    ('nickel', 'heavy-metals'),
    ('copper', 'heavy-metals'),
    ('manganese', 'heavy-metals'),
    ('selenate', 'heavy-metals'),
    ('molybdate', 'heavy-metals'),
    ('tungstate', 'heavy-metals'),
    ('aluminum', 'heavy-metals'),

    # Field-core: sulfate and H2 (after specific compounds containing sulfate/zinc)
    ('sulfate', 'field-core'),
    ('h2', 'field-core'),

    # Lab-antibiotic
    ('tetracycline', 'lab-antibiotic'),
    ('chloramphenicol', 'lab-antibiotic'),
    ('spectinomycin', 'lab-antibiotic'),
    ('cefoxitin', 'lab-antibiotic'),
    ('piperacillin', 'lab-antibiotic'),
    ('antimycin', 'lab-antibiotic'),
    ('heptyl', 'lab-antibiotic'),  # HQNO - respiratory inhibitor
    ('fccp', 'lab-antibiotic'),  # uncoupler

    # Lab-other: reagents, misc
    ('dimethyl sulfoxide', 'lab-other'),
    ('polyethylene glycol', 'lab-other'),
    ('formamide', 'lab-other'),
    ('casamino acids', 'lab-other'),
    ('sucrose treatment', 'lab-other'),
    ('furfuraldehyde', 'lab-other'),
    ('tetramethylethylenediamine', 'lab-other'),
    ('rubidium', 'lab-other'),
    ('sodium chloride', 'lab-other'),  # osmotic stress
    ('dipyridyl', 'lab-other'),  # iron chelator
    ('bathophenanthroline', 'lab-other'),  # iron chelator
    ('ethanol', 'lab-other'),
    ('pantothenate', 'lab-other'),
    ('glycerol', 'lab-other'),
]

# Substrings that mark a condition as 'lab-nutrient'. classify_condition()
# consults these two lists only after CLASSIFICATION_RULES found no match.

# Amino acids (and close relatives such as homoserine).
AMINO_ACIDS = [
    'serine',
    'valine',
    'methionine',
    'cysteine',
    'tryptophan',
    'isoleucine',
    'phenylalanine',
    'threonine',
    'glutamine',
    'lysine',
    'asparagine',
    'proline',
    'glycine',
    'alanine',
    'arginine',
    'leucine',
    'tyrosine',
    'histidine',
    'homoserine',
    'glutamic',
    'aspartic',
]

# Remaining nutrients that are not amino acids.
OTHER_NUTRIENTS = [
    'ammonium',
    'urea',
    'putrescine',
    'spermidine',
    'thymine',
    'cytosine',
    'isocitric',
    'succinate',
    'citric',
    'malic',
    'butyrate',
    'oxobutyric',
    'oxaloacetic',
    'ketoglutaric',
    'oxobutanoic',
    'adenosyl',
]

# Report how many patterns each lookup table holds.
print(
    f"Classification rules defined: {len(CLASSIFICATION_RULES)} specific rules\n"
    f"Amino acids: {len(AMINO_ACIDS)}\n"
    f"Other nutrients: {len(OTHER_NUTRIENTS)}"
)

Classification rules defined: 52 specific rules
Amino acids: 21
Other nutrients: 16

def classify_condition(condition_1, exp_group):
    """Map one experiment to a condition category.

    Args:
        condition_1: The experiment's ``condition_1`` value. May be missing
            (anything ``pd.isna`` reports as NA); otherwise it is lower-cased
            and searched for known substrings.
        exp_group: The experiment's ``expGroup`` value, used when
            ``condition_1`` is missing or matches no known pattern.

    Returns:
        The category string: the category of the first matching entry in
        CLASSIFICATION_RULES, else 'lab-nutrient' for an AMINO_ACIDS or
        OTHER_NUTRIENTS match, else a category derived from ``exp_group``
        ('field-core', 'lab-nutrient' or 'lab-other').
    """
    growth_groups = ('respiratory growth', 'fermentative growth')

    # No condition recorded: the experiment group is the only signal.
    # NOTE(review): a missing condition in the 'nutrient' group ends up in
    # 'lab-other' here, whereas the pattern-miss fallback at the bottom maps
    # 'nutrient' to 'lab-nutrient' -- confirm this asymmetry is intended.
    if pd.isna(condition_1):
        if exp_group in growth_groups:
            return 'field-core'
        return 'lab-nutrient' if exp_group == 'nitrogen source' else 'lab-other'

    text = condition_1.lower()

    # Explicit rules take precedence; list order decides between overlaps.
    hits = [category for pattern, category in CLASSIFICATION_RULES if pattern in text]
    if hits:
        return hits[0]

    # Amino acids and the remaining nutrients share a single category.
    names_amino_acid = any(name in text for name in AMINO_ACIDS)
    if names_amino_acid or any(name in text for name in OTHER_NUTRIENTS):
        return 'lab-nutrient'

    # Nothing recognised in the condition text: fall back to the group.
    if exp_group in growth_groups:
        return 'field-core'
    if exp_group in ('nutrient', 'nitrogen source'):
        return 'lab-nutrient'
    return 'lab-other'


# Apply classification.
# Each (condition_1, expGroup) pair is classified directly instead of going
# through DataFrame.apply(axis=1), which builds a Series for every row and
# does not yield a single column when the frame is empty. The resulting
# list is assigned positionally, one category per row.
exp_df['category'] = [
    classify_condition(condition, group)
    for condition, group in zip(exp_df['condition_1'], exp_df['expGroup'])
]

print("Classification results:")
print(exp_df['category'].value_counts())
print()
print(f"Total classified: {len(exp_df)}")

Classification results:
category
lab-nutrient      237
field-core        204
lab-other         140
field-stress       78
heavy-metals       55
lab-antibiotic     43
Name: count, dtype: int64

Total classified: 757

# List, for every category in a fixed display order, how many experiments it
# holds and which condition_1 values make it up (missing shown as '(none)').
banner = '=' * 60
for category_name in (
    'field-core',
    'field-stress',
    'heavy-metals',
    'lab-nutrient',
    'lab-antibiotic',
    'lab-other',
):
    members = exp_df.loc[exp_df['category'] == category_name]
    print(f"\n{banner}")
    print(f"{category_name}: {len(members)} experiments")
    print(banner)
    # One line per distinct condition, most frequent first.
    per_condition = members['condition_1'].fillna('(none)').value_counts()
    for condition, count in per_condition.items():
        print(f"  {condition}: {count}")

============================================================
field-core: 204 experiments
============================================================
  Sodium Formate: 67
  Sodium pyruvate: 65
  (none): 33
  Sodium D,L-Lactate: 18
  H2: 9
  Sodium Fumarate dibasic: 3
  Sodium sulfate: 3
  Sodium D-Lactate: 3
  Sodium L-Lactate: 3

============================================================
field-stress: 78 experiments
============================================================
  Sodium nitrite: 15
  N2: 9
  Uranyl acetate: 6
  Sodium perchlorate monohydrate: 6
  Sodium Chlorite: 6
  Sodium Chlorate: 6
  DETA/NO: 6
  Sodium nitrate: 5
  mercury (II) chloride: 5
  Sodium Persulfate: 3
  ammonium persulfate: 3
  peroxynitrite: 3
  Sodium Fluorophosphate: 3
  Sodium Chromate: 2

============================================================
heavy-metals: 55 experiments
============================================================
  Aluminum chloride hydrate: 9
  Cobalt chloride hexahydrate: 9
  Sodium molybdate: 8
  Nickel (II) chloride hexahydrate: 6
  Manganese (II) chloride tetrahydrate: 6
  Zinc sulfate heptahydrate: 6
  Sodium tungstate dihydrate: 5
  copper (II) chloride dihydrate: 3
  Sodium selenate: 3

============================================================
lab-nutrient: 237 experiments
============================================================
  L-Serine: 26
  L-Valine (HPLC purified): 15
  L-Methionine: 14
  L-Cysteine: 11
  L-Tryptophan: 9
  S-adenosyl Homocysteine: 9
  L-Isoleucine: 9
  Ammonium chloride: 8
  L-Phenylalanine: 8
  L-Threonine: 7
  Putrescine Dihydrochloride: 6
  Urea: 6
  L-Glutamine: 6
  L-Lysine: 6
  L-Asparagine: 6
  L-Valine: 6
  L-Proline: 6
  Glycine: 6
  L-Alanine: 6
  L-Arginine: 6
  L-Leucine: 6
  spermidine: 5
  L-Glutamic acid monopotassium salt monohydrate: 5
  L-tyrosine: 3
  Thymine: 3
  DL-Isocitric acid trisodium salt hydrate: 3
  Sodium succinate dibasic hexahydrate: 3
  Citric Acid: 3
  L-Malic acid disodium salt monohydrate: 3
  Sodium butyrate: 3
  2-oxobutyric acid sodium salt: 3
  Cytosine: 3
  Oxaloacetic acid: 3
  a-Ketoglutaric acid disodium salt hydrate: 3
  L-Aspartic Acid: 3
  3-Methyl-2-oxobutanoic acid sodium salt: 3
  L-Homoserine: 3
  L-Histidine: 3

============================================================
lab-antibiotic: 43 experiments
============================================================
  2-n-Heptyl-4-hydroxyquinoline N-oxide: 9
  Antimycin A: 9
  Spectinomycin dihydrochloride pentahydrate: 6
  FCCP: 6
  Piperacillin sodium salt: 4
  Cefoxitin sodium salt: 3
  Chloramphenicol: 3
  Tetracycline hydrochloride: 3

============================================================
lab-other: 140 experiments
============================================================
  (none): 66
  casamino acids: 9
  Dimethyl Sulfoxide: 8
  sucrose treatment: 6
  Glycerol: 6
  Polyethylene glycol: 6
  calcium pantothenate: 6
  Formamide: 6
  N,N,N′,N′-Tetramethylethylenediamine: 6
  rubidium chloride: 6
  Sodium Chloride: 6
  2-Furfuraldehyde: 3
  2,2'-Dipyridyl: 2
  Ethanol: 2
  Bathophenanthrolinedisulfonic acid disodium salt hydrate: 2

# Sanity check: count experiments for every (category, expGroup) pair,
# with row/column totals added by margins=True.
ct = pd.crosstab(
    index=exp_df['category'],
    columns=exp_df['expGroup'],
    margins=True,
)
print("Category x expGroup cross-tabulation:")
ct

Category x expGroup cross-tabulation:

# Collapse the six categories into two: field vs lab.
FIELD_CATEGORIES = {'field-core', 'field-stress', 'heavy-metals'}
# Listed for reference only: any category outside FIELD_CATEGORIES is
# labelled 'lab' below, whether or not it appears in this set.
LAB_CATEGORIES = {'lab-nutrient', 'lab-antibiotic', 'lab-other'}

is_field = exp_df['category'].isin(FIELD_CATEGORIES)
exp_df['broad_category'] = is_field.map({True: 'field', False: 'lab'})

print("Broad classification:")
print(exp_df['broad_category'].value_counts(), end="\n\n")
print(f"Field experiments: {(exp_df['broad_category'] == 'field').sum()}")
print(f"Lab experiments: {(exp_df['broad_category'] == 'lab').sum()}")

Broad classification:
broad_category
lab      420
field    337
Name: count, dtype: int64

Field experiments: 337
Lab experiments: 420

# Write the annotated experiment table (original columns plus the two
# classification columns) to DATA_DIR, without the row index.
out_file = DATA_DIR / 'experiment_classification.csv'
exp_df.to_csv(out_file, index=False)
print(f"Saved: {out_file} ({len(exp_df)} experiments)", end="\n\n")

print("Columns saved:")
for column_name in exp_df.columns:
    print(f"  {column_name}")

Saved: ../data/experiment_classification.csv (757 experiments)

Columns saved:
  expName
  expDesc
  expGroup
  condition_1
  media
  cor12
  mad12
  nMapped
  category
  broad_category

# Final summary: experiment counts and percentage shares, first per
# category and then per broad (field/lab) category.
rule = "=" * 60
total = len(exp_df)

print(rule)
print("CONDITION CLASSIFICATION SUMMARY")
print(rule)
print(f"Total experiments: {total}")
for heading, column in (
    ("By category:", 'category'),
    ("By broad category:", 'broad_category'),
):
    print()
    print(heading)
    for label, count in exp_df[column].value_counts().items():
        share = count / total * 100
        print(f"  {label:20s}: {count:4d} ({share:5.1f}%)")
print(rule)

============================================================
CONDITION CLASSIFICATION SUMMARY
============================================================
Total experiments: 757

By category:
  lab-nutrient        :  237 ( 31.3%)
  field-core          :  204 ( 26.9%)
  lab-other           :  140 ( 18.5%)
  field-stress        :   78 ( 10.3%)
  heavy-metals        :   55 (  7.3%)
  lab-antibiotic      :   43 (  5.7%)

By broad category:
  lab                 :  420 ( 55.5%)
  field               :  337 ( 44.5%)
============================================================

02 Condition Classification

NB 02: Condition Classification

1. Define Classification Rules

2. Apply Classification

3. Validate Classification

4. Create Broader Field vs Lab Grouping

5. Save Classification

expGroup	fermentative growth	nitrogen source	nutrient	respiratory growth	stress	All
category
field-core	17	0	6	145	36	204
field-stress	0	12	0	0	66	78
heavy-metals	0	0	0	0	55	55
lab-antibiotic	0	0	0	0	43	43
lab-nutrient	0	98	130	0	9	237
lab-other	0	15	66	0	59	140
All	17	125	202	145	268	757