import pandas as pd
import numpy as np
import os
import re
from pathlib import Path

# Paths
# All locations are relative to the current working directory, so this code
# must be run from the directory the relative layout was written for.
PROJ = Path('..')  # one level above the working directory
REPO = PROJ / '..'  # two levels above the working directory
EXP_DIR = REPO / 'fitness_modules' / 'data' / 'annotations'  # globbed for *_experiments.csv
MAT_DIR = REPO / 'fitness_modules' / 'data' / 'matrices'  # holds {org}_fitness_matrix.csv files
METAL_ATLAS = REPO / 'metal_fitness_atlas' / 'data'  # source of metal_experiments.csv
DATA_OUT = PROJ / 'data'  # every CSV written below lands here
# Create the output directory if it is missing (non-recursive: PROJ must exist).
DATA_OUT.mkdir(exist_ok=True)

# Scan all experiment annotation files for Sodium Chloride experiments.
#
# Each salt is described once as (salt_type, condition_1 pattern, expDesc
# concentration pattern) so both salts share a single extraction loop.
# Rubidium chloride is collected as well (DvH has these — another chloride
# control) and is distinguished from NaCl only by the 'salt_type' column.
salt_specs = [
    (
        'NaCl',
        r'^Sodium Chloride$|^sodium chloride$',  # pure NaCl only (whole-string match)
        re.compile(r'Chloride\s+([\d.]+)\s*mM'),
    ),
    (
        'RbCl',
        r'rubidium chloride',
        re.compile(r'Rubidium Chloride\s+([\d.]+)\s*mM'),
    ),
]

# Fixed column list so that an empty scan still yields a frame with the
# expected columns instead of a column-less DataFrame.
record_columns = [
    'orgId', 'expName', 'expDesc', 'condition_1',
    'salt_type', 'chloride_conc_mM', 'in_matrix',
]

nacl_records = []

for f in sorted(EXP_DIR.glob('*_experiments.csv')):
    org = f.stem.replace('_experiments', '')
    edf = pd.read_csv(f)
    condition = edf['condition_1'].astype(str)

    # NaCl rows are appended before RbCl rows for each organism.
    for salt_type, condition_pattern, conc_pattern in salt_specs:
        mask = condition.str.contains(
            condition_pattern, case=False, na=False, regex=True
        )

        for _, row in edf[mask].iterrows():
            # Extract concentration from expDesc; NaN when it cannot be parsed
            desc = str(row.get('expDesc', ''))
            conc_match = conc_pattern.search(desc)
            conc_mm = float(conc_match.group(1)) if conc_match else np.nan

            nacl_records.append({
                'orgId': org,
                'expName': row['expName'],
                'expDesc': desc,
                'condition_1': row['condition_1'],
                'salt_type': salt_type,
                'chloride_conc_mM': conc_mm,
                'in_matrix': True,  # placeholder; checked against the matrices afterwards
            })

nacl_df = pd.DataFrame(nacl_records, columns=record_columns)

# Verify experiments are in cached matrices.
# An experiment counts as "in matrix" when its expName is a column header of
# the organism's cached fitness matrix; organisms without a matrix file get
# False for all of their experiments.
for org, org_rows in nacl_df.groupby('orgId', sort=False):
    mat_file = MAT_DIR / f'{org}_fitness_matrix.csv'
    if not mat_file.exists():
        nacl_df.loc[org_rows.index, 'in_matrix'] = False
        continue
    # nrows=0 reads only the header line, which is all that is needed here
    mat_cols = set(pd.read_csv(mat_file, nrows=0).columns)
    nacl_df.loc[org_rows.index, 'in_matrix'] = org_rows['expName'].isin(mat_cols)

def _conc_range_label(conc):
    """Format one organism's concentrations as 'min-max mM', or 'unknown' if all are NaN."""
    if not conc.notna().any():
        return 'unknown'
    return f"{conc.min():.0f}-{conc.max():.0f} mM"


# Note: these counts cover every row of nacl_df, i.e. both salt_type values
# (NaCl and RbCl), even though the labels say "NaCl".
n_in_matrix = nacl_df["in_matrix"].sum()
n_organisms = nacl_df["orgId"].nunique()
print(f'Total NaCl experiments found: {len(nacl_df)}')
print(f'In cached matrices: {n_in_matrix}')
print(f'Organisms: {n_organisms}')
print()
print('Per-organism summary:')

# Only experiments present in a cached matrix are summarised.
confirmed = nacl_df.loc[nacl_df['in_matrix']]
summary = (
    confirmed
    .groupby('orgId')
    .agg(
        n_exps=('expName', 'count'),
        conc_range=('chloride_conc_mM', _conc_range_label),
    )
    .reset_index()
)
print(summary.to_string(index=False))

Total NaCl experiments found: 71
In cached matrices: 71
Organisms: 25

Per-organism summary:
             orgId  n_exps   conc_range
              ANA3       1   500-500 mM
             BFirm       1   200-200 mM
            Btheta       1   350-350 mM
             Caulo       2   100-100 mM
              Cola       4  750-1000 mM
           Cup4G11       2   300-300 mM
              Dino       2   600-600 mM
               DvH      12    25-125 mM
              Kang       1   500-500 mM
              Keio       1   750-750 mM
             Korea       2   100-200 mM
              Koxy       2 1000-1000 mM
               MR1       2   250-350 mM
            Marino       1 1000-1000 mM
              Miya       2    62-125 mM
           Pedo557       2   300-300 mM
             Phaeo       4   600-800 mM
             Ponti       4   500-750 mM
              SB2B       3   500-700 mM
              SynE      12     0-250 mM
            psRCH2       3   200-400 mM
pseudo13_GW456_L13       1   400-400 mM
      pseudo1_N1B4       2   200-300 mM
      pseudo3_N2E3       1   500-500 mM
      pseudo6_N2E2       3   500-600 mM

# Save NaCl experiments (all rows, including those not found in a matrix)
nacl_csv = DATA_OUT / 'nacl_experiments.csv'
nacl_df.to_csv(nacl_csv, index=False)
print(f'Saved {len(nacl_df)} NaCl experiments to data/nacl_experiments.csv')
# Cell output: preview of the first 20 experiments confirmed in a matrix
nacl_df.loc[nacl_df['in_matrix']].head(20)

Saved 71 NaCl experiments to data/nacl_experiments.csv

# For each organism with NaCl experiments in the matrix,
# extract the NaCl fitness columns and compute per-gene summaries.
#
# The summaries are computed with row-wise (axis=1) reductions on the whole
# NaCl sub-matrix rather than one `.loc` lookup per gene. NaN entries are
# skipped by mean/min/max/count and compare False against the thresholds,
# which matches dropping them per gene. Genes with no NaCl measurement at
# all are excluded.
nacl_in_matrix = nacl_df[nacl_df['in_matrix'] & (nacl_df['salt_type'] == 'NaCl')]

fitness_columns = [
    'orgId', 'locusId', 'nacl_mean_fit', 'nacl_min_fit', 'nacl_max_fit',
    'n_nacl_exps', 'n_sick', 'n_beneficial',
]
important_columns = [
    'orgId', 'locusId', 'nacl_mean_fit', 'nacl_min_fit', 'n_nacl_exps', 'n_sick',
]

fitness_frames = []

for org in sorted(nacl_in_matrix['orgId'].unique()):
    mat_file = MAT_DIR / f'{org}_fitness_matrix.csv'
    mat = pd.read_csv(mat_file, index_col=0)

    # Get NaCl experiment columns for this organism
    org_nacl = nacl_in_matrix[nacl_in_matrix['orgId'] == org]
    nacl_cols = [c for c in org_nacl['expName'] if c in mat.columns]

    if not nacl_cols:
        continue

    nacl_mat = mat[nacl_cols]

    # Plain arrays (not Series) are used so rows are matched by position;
    # this stays correct even if the matrix index has repeated locus ids.
    org_fitness = pd.DataFrame({
        'orgId': org,
        'locusId': nacl_mat.index.to_numpy(),
        'nacl_mean_fit': nacl_mat.mean(axis=1).to_numpy(),
        'nacl_min_fit': nacl_mat.min(axis=1).to_numpy(),
        'nacl_max_fit': nacl_mat.max(axis=1).to_numpy(),
        'n_nacl_exps': nacl_mat.count(axis=1).to_numpy(),
        # Note: matrices have fit values, no t-scores
        'n_sick': (nacl_mat < -1).sum(axis=1).to_numpy(),
        'n_beneficial': (nacl_mat > 1).sum(axis=1).to_numpy(),
    })

    # Drop genes without a single non-missing NaCl fitness value
    org_fitness = org_fitness[org_fitness['n_nacl_exps'] > 0]
    if org_fitness.empty:
        continue
    fitness_frames.append(org_fitness)

if fitness_frames:
    nacl_fitness = pd.concat(fitness_frames, ignore_index=True)
else:
    # Keep the expected columns even when nothing was extracted
    nacl_fitness = pd.DataFrame(columns=fitness_columns)

# NaCl-important: mean fit < -1 OR at least 1 sick experiment
is_important = (nacl_fitness['nacl_mean_fit'] < -1) | (nacl_fitness['n_sick'] >= 1)
nacl_important = nacl_fitness.loc[is_important, important_columns].reset_index(drop=True)

print(f'Total gene-NaCl fitness records: {len(nacl_fitness)}')
print(f'NaCl-important genes: {len(nacl_important)} ({100*len(nacl_important)/len(nacl_fitness):.1f}%)')
print()
print('Per-organism NaCl-important genes:')

# Denominator: number of genes with a NaCl fitness record, per organism
total_genes = (
    nacl_fitness
    .groupby('orgId')['locusId']
    .count()
    .rename('n_total')
    .reset_index()
)

# Numerator and mean fitness of the important genes, joined to the totals.
# The merge keeps only organisms that have at least one important gene.
imp_summary = (
    nacl_important
    .groupby('orgId')
    .agg(
        n_important=('locusId', 'count'),
        mean_fit=('nacl_mean_fit', 'mean'),
    )
    .reset_index()
    .merge(total_genes, on='orgId')
)
imp_summary['pct_important'] = 100 * imp_summary['n_important'] / imp_summary['n_total']

ranked = imp_summary.sort_values('pct_important', ascending=False)
print(ranked.to_string(index=False))

Total gene-NaCl fitness records: 94908
NaCl-important genes: 4648 (4.9%)

Per-organism NaCl-important genes:
             orgId  n_important  mean_fit  n_total  pct_important
              SynE          620 -0.615163     1899      32.648763
            psRCH2          350 -0.915132     3349      10.450881
              ANA3          342 -2.127316     3668       9.323882
             Ponti          340 -1.320575     3685       9.226594
             Caulo          266 -0.907665     3312       8.031401
              SB2B          205 -1.133591     3121       6.568408
      pseudo6_N2E2          302 -1.034750     5133       5.883499
             Phaeo          164 -0.868245     3099       5.292030
            Btheta          182 -1.594476     4055       4.488286
           Pedo557          191 -2.526817     4423       4.318336
              Keio          162 -1.701762     3789       4.275534
              Kang           80 -1.509847     2003       3.994009
              Cola          155 -1.032217     3954       3.920081
           Cup4G11          248 -1.312871     6384       3.884712
               DvH          105 -0.869730     2741       3.830719
              Miya           96 -1.723044     2531       3.792967
               MR1          136 -1.445324     3782       3.595981
      pseudo1_N1B4          139 -0.954906     4336       3.205720
      pseudo3_N2E3          122 -1.674120     5028       2.426412
              Koxy           99 -1.314036     4608       2.148438
             Korea           70 -1.456059     3393       2.063071
              Dino           65 -1.296740     3187       2.039536
             BFirm           91 -1.786010     5428       1.676492
pseudo13_GW456_L13           72 -1.822338     4350       1.655172
            Marino           46 -1.756951     3650       1.260274

# Save the per-gene summary and the important-gene subset as CSV
outputs = (
    (nacl_fitness, 'nacl_fitness_summary.csv'),
    (nacl_important, 'nacl_important_genes.csv'),
)
for frame, filename in outputs:
    frame.to_csv(DATA_OUT / filename, index=False)
print(f'Saved {len(nacl_fitness)} fitness records and {len(nacl_important)} important genes')

Saved 94908 fitness records and 4648 important genes

# Load metal experiments from the metal fitness atlas
# (the code below reads its condition_1, metal_element, concentration,
# expName and orgId columns)
metal_exps = pd.read_csv(METAL_ATLAS / 'metal_experiments.csv')

# Determine counter ion and valence for each compound
def classify_counter_ion(row):
    """Classify one metal experiment by counter ion and effective chloride.

    The compound name in ``row['condition_1']`` is matched case-insensitively
    by substring. Chloride salts get ``valence * concentration`` as their
    effective chloride value (NaN when the concentration is missing); every
    other recognised compound gets 0; unrecognised compounds are labelled
    ``'media_component'`` with NaN.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'condition_1'
            and 'metal_element' keys and an optional 'concentration'
            (read via ``row.get``).

    Returns:
        pd.Series with entries 'counter_ion' and 'effective_cl_mM'.
    """
    name = str(row['condition_1']).lower()
    element = row['metal_element']
    dose = row.get('concentration', np.nan)

    def _result(label, chloride):
        return pd.Series({'counter_ion': label, 'effective_cl_mM': chloride})

    # Chloride is tested first, so it wins over any later keyword.
    if 'chloride' in name:
        # Determine valence: Al is 3+, all others are 2+
        charge = 3 if element == 'Aluminum' else 2
        if pd.notna(dose):
            return _result('chloride', charge * dose)
        return _result('chloride', np.nan)

    # Compounds that contribute no chloride; the first matching rule wins.
    chloride_free_rules = (
        (('sulfate',), 'sulfate'),
        (('acetate',), 'acetate'),
        (('pyrithione',), 'pyrithione'),
        # Oxyanions: the metal IS the anion
        (('chromat', 'dichromat', 'molybdat', 'selenat', 'tungstat'),
         'oxyanion (Na+ counter cation)'),
        (('cisplatin',), 'cisplatin'),
    )
    for keywords, label in chloride_free_rules:
        if any(keyword in name for keyword in keywords):
            return _result(label, 0)

    # Media components (pyruvate, formate, serine, etc.) — multi-condition exps
    return _result('media_component', np.nan)

# Classify every metal experiment row and append the two derived columns
# ('counter_ion', 'effective_cl_mM') next to the original ones.
cl_info = metal_exps.apply(classify_counter_ion, axis=1)
metal_exps_cl = pd.concat([metal_exps, cl_info], axis=1)

# Summary
counter_ion_counts = metal_exps_cl['counter_ion'].value_counts()
print('Counter ion distribution:')
print(counter_ion_counts.to_string())
print()

# Effective Cl- by metal (for chloride salts only)
is_chloride_salt = metal_exps_cl['counter_ion'].eq('chloride')
cl_salts = metal_exps_cl.loc[is_chloride_salt]
print('Effective Cl⁻ concentrations by metal (chloride salts):')
cl_stats = {
    'n_exps': ('expName', 'count'),
    'min_cl': ('effective_cl_mM', 'min'),
    'max_cl': ('effective_cl_mM', 'max'),
    'mean_cl': ('effective_cl_mM', 'mean'),
}
cl_summary = (
    cl_salts
    .groupby('metal_element')
    .agg(**cl_stats)
    .sort_values('max_cl', ascending=False)
)
print(cl_summary.to_string())

Counter ion distribution:
counter_ion
chloride                         317
media_component                   96
cisplatin                         67
sulfate                           35
oxyanion (Na+ counter cation)     21
pyrithione                        14
acetate                            9

Effective Cl⁻ concentrations by metal (chloride salts):
               n_exps     min_cl  max_cl     mean_cl
metal_element                                       
Cobalt             89   0.010000   500.0   26.224898
Manganese           6  20.000000   200.0  110.000000
Aluminum           48   0.234375    30.0    8.485742
Mercury             5  20.000000    20.0   20.000000
Iron               36   0.400000    10.0    5.133333
Copper             51   0.100000     6.0    1.914375
Nickel             79   0.020000     4.0    1.598947
Cadmium             3        NaN     NaN         NaN

# Save the metal experiments together with their counter-ion classification
effective_cl_csv = DATA_OUT / 'effective_chloride_concentrations.csv'
metal_exps_cl.to_csv(effective_cl_csv, index=False)
print(f'Saved {len(metal_exps_cl)} records with effective Cl- concentrations')

Saved 559 records with effective Cl- concentrations

# Organisms with NaCl in matrix
nacl_orgs = set(nacl_in_matrix['orgId'].unique())

# Organisms with metal chloride experiments (and cached matrices)
metal_cl_orgs = set(cl_salts['orgId'].unique())
orgs_with_matrices = {
    f.stem.replace('_fitness_matrix', '')
    for f in MAT_DIR.glob('*_fitness_matrix.csv')
}
metal_cl_with_matrix = metal_cl_orgs.intersection(orgs_with_matrices)

# Overlap: organisms with BOTH NaCl and metal chloride in cached matrices
testable_orgs = nacl_orgs.intersection(metal_cl_with_matrix)

print(f'Organisms with NaCl experiments in matrix: {len(nacl_orgs)}')
print(f'  {sorted(nacl_orgs)}')
print(f'\nOrganisms with metal chloride experiments + matrix: {len(metal_cl_with_matrix)}')
print(f'  {sorted(metal_cl_with_matrix)}')
print(f'\nTestable organisms (both NaCl and metal-Cl): {len(testable_orgs)}')
print(f'  {sorted(testable_orgs)}')
print()

# For each testable organism, show NaCl and metal data available
print('Detailed coverage for testable organisms:')
# NaCl experiment count per organism; every testable organism has an entry
nacl_counts = nacl_in_matrix['orgId'].value_counts()
for org in sorted(testable_orgs):
    n_nacl = nacl_counts[org]
    org_metals = cl_salts.loc[cl_salts['orgId'] == org]
    metals = sorted(org_metals['metal_element'].unique())
    max_cl = org_metals['effective_cl_mM'].max()
    print(f'  {org}: {n_nacl} NaCl exps, metals={metals}, max Cl⁻={max_cl:.0f} mM')

Organisms with NaCl experiments in matrix: 25
  ['ANA3', 'BFirm', 'Btheta', 'Caulo', 'Cola', 'Cup4G11', 'Dino', 'DvH', 'Kang', 'Keio', 'Korea', 'Koxy', 'MR1', 'Marino', 'Miya', 'Pedo557', 'Phaeo', 'Ponti', 'SB2B', 'SynE', 'psRCH2', 'pseudo13_GW456_L13', 'pseudo1_N1B4', 'pseudo3_N2E3', 'pseudo6_N2E2']

Organisms with metal chloride experiments + matrix: 31
  ['ANA3', 'BFirm', 'Btheta', 'Caulo', 'Cola', 'Cup4G11', 'Dino', 'DvH', 'Kang', 'Keio', 'Korea', 'Koxy', 'MR1', 'Marino', 'Methanococcus_JJ', 'Methanococcus_S2', 'Miya', 'PV4', 'Pedo557', 'Phaeo', 'Ponti', 'SB2B', 'SynE', 'WCS417', 'acidovorax_3H11', 'psRCH2', 'pseudo13_GW456_L13', 'pseudo1_N1B4', 'pseudo3_N2E3', 'pseudo5_N2C3_1', 'pseudo6_N2E2']

Testable organisms (both NaCl and metal-Cl): 25
  ['ANA3', 'BFirm', 'Btheta', 'Caulo', 'Cola', 'Cup4G11', 'Dino', 'DvH', 'Kang', 'Keio', 'Korea', 'Koxy', 'MR1', 'Marino', 'Miya', 'Pedo557', 'Phaeo', 'Ponti', 'SB2B', 'SynE', 'psRCH2', 'pseudo13_GW456_L13', 'pseudo1_N1B4', 'pseudo3_N2E3', 'pseudo6_N2E2']

Detailed coverage for testable organisms:
  ANA3: 1 NaCl exps, metals=['Aluminum', 'Cobalt', 'Nickel'], max Cl⁻=15 mM
  BFirm: 1 NaCl exps, metals=['Cobalt', 'Nickel'], max Cl⁻=2 mM
  Btheta: 1 NaCl exps, metals=['Cobalt', 'Copper', 'Nickel'], max Cl⁻=2 mM
  Caulo: 2 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=4 mM
  Cola: 4 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=6 mM
  Cup4G11: 2 NaCl exps, metals=['Cobalt', 'Copper', 'Nickel'], max Cl⁻=1 mM
  Dino: 2 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=2 mM
  DvH: 6 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Manganese', 'Mercury', 'Nickel'], max Cl⁻=500 mM
  Kang: 1 NaCl exps, metals=['Cobalt', 'Copper', 'Nickel'], max Cl⁻=2 mM
  Keio: 1 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=15 mM
  Korea: 2 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=12 mM
  Koxy: 2 NaCl exps, metals=['Cobalt', 'Copper', 'Nickel'], max Cl⁻=6 mM
  MR1: 2 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=12 mM
  Marino: 1 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=3 mM
  Miya: 2 NaCl exps, metals=['Aluminum'], max Cl⁻=30 mM
  Pedo557: 2 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=3 mM
  Phaeo: 4 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=4 mM
  Ponti: 4 NaCl exps, metals=['Cobalt', 'Copper', 'Nickel'], max Cl⁻=4 mM
  SB2B: 3 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=8 mM
  SynE: 12 NaCl exps, metals=['Aluminum'], max Cl⁻=1 mM
  psRCH2: 3 NaCl exps, metals=['Aluminum', 'Cadmium', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=2 mM
  pseudo13_GW456_L13: 1 NaCl exps, metals=['Aluminum', 'Cobalt', 'Nickel'], max Cl⁻=15 mM
  pseudo1_N1B4: 2 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper'], max Cl⁻=8 mM
  pseudo3_N2E3: 1 NaCl exps, metals=['Cobalt', 'Copper', 'Nickel'], max Cl⁻=5 mM
  pseudo6_N2E2: 3 NaCl exps, metals=['Aluminum', 'Cobalt', 'Copper', 'Nickel'], max Cl⁻=12 mM

01 NaCl Identification

NB01: NaCl Experiment Identification and Fitness Extraction

1. Identify NaCl Experiments Across All Organisms

2. Extract NaCl Fitness Profiles and Identify NaCl-Important Genes

3. Compute Effective Chloride Concentrations for Metal Experiments

4. Identify Organisms with Both NaCl and Metal Chloride Data

Summary

	orgId	expName	expDesc	condition_1	salt_type	chloride_conc_mM	in_matrix
0	ANA3	set3IT059	Chloride 500 mM	Sodium Chloride	NaCl	500.0	True
1	BFirm	set1IT041	Chloride 200 mM	Sodium Chloride	NaCl	200.0	True
2	Btheta	set3IT072	BHIS with Chloride 350 mM	Sodium Chloride	NaCl	350.0	True
3	Caulo	set3IT042	PYE with Chloride 100 mM	Sodium Chloride	NaCl	100.0	True
4	Caulo	set4IT016	PYE with Chloride 100 mM	Sodium Chloride	NaCl	100.0	True
5	Cola	set1IT046	m.b. Chloride 750 mM	Sodium Chloride	NaCl	750.0	True
6	Cola	set1IT047	m.b. Chloride 1000 mM	Sodium Chloride	NaCl	1000.0	True
7	Cola	set1IT077	m.b. Chloride 750 mM	Sodium Chloride	NaCl	750.0	True
8	Cola	set1IT078	m.b. Chloride 1000 mM	Sodium Chloride	NaCl	1000.0	True
9	Cup4G11	set1IT018	R2A with Chloride 300 mM	Sodium Chloride	NaCl	300.0	True
10	Cup4G11	set4IT074	R2A with Chloride 300 mM	Sodium Chloride	NaCl	300.0	True
11	Dino	set4IT017	m.b. Chloride 600 mM	Sodium Chloride	NaCl	600.0	True
12	Dino	set4IT035	m.b. Chloride 600 mM	Sodium Chloride	NaCl	600.0	True
13	DvH	set2S277	MoYLS4 with Chloride 125 mM	Sodium Chloride	NaCl	125.0	True
14	DvH	set2S278	MoYLS4 with Chloride 125 mM	Sodium Chloride	NaCl	125.0	True
15	DvH	set2S279	MoYLS4 with Chloride 125 mM	Sodium Chloride	NaCl	125.0	True
16	DvH	set2S280	MoYLS4 with Chloride 62.5 mM	Sodium Chloride	NaCl	62.5	True
17	DvH	set2S281	MoYLS4 with Chloride 62.5 mM	Sodium Chloride	NaCl	62.5	True
18	DvH	set2S282	MoYLS4 with Chloride 62.5 mM	Sodium Chloride	NaCl	62.5	True
19	DvH	set15IT043	MoYLS4 with Rubidium Chloride 25mM	rubidium chloride	RbCl	25.0	True