import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import os

DATA_DIR = '../data'
FIG_DIR = '../figures'

df = pd.read_csv(os.path.join(DATA_DIR, 'growth_matrix_complete.csv'))

GROWTH_COLS = [
    'mutant_growth_acetate', 'mutant_growth_asparagine',
    'mutant_growth_butanediol', 'mutant_growth_glucarate',
    'mutant_growth_glucose', 'mutant_growth_lactate',
    'mutant_growth_quinate', 'mutant_growth_urea'
]
CONDITION_NAMES = [c.replace('mutant_growth_', '') for c in GROWTH_COLS]

print(f'Loaded {len(df):,} genes')

Loaded 2,034 genes

# Z-score per condition
X = df[GROWTH_COLS].values
scaler = StandardScaler()
X_z = scaler.fit_transform(X)

# Compute specificity scores for each gene × condition
specificity = np.zeros_like(X_z)
for i in range(8):
    abs_z = np.abs(X_z)
    other_cols = [j for j in range(8) if j != i]
    mean_other = abs_z[:, other_cols].mean(axis=1)
    specificity[:, i] = abs_z[:, i] - mean_other

# For each gene, find the most specific condition
df['most_specific_condition'] = [CONDITION_NAMES[i] for i in np.argmax(specificity, axis=1)]
df['max_specificity'] = specificity.max(axis=1)
df['most_specific_zscore'] = [X_z[g, np.argmax(specificity[g])] for g in range(len(df))]
df['specificity_direction'] = np.where(df['most_specific_zscore'] < 0, 'defect', 'advantage')

# Add per-condition specificity scores
for i, name in enumerate(CONDITION_NAMES):
    df[f'specificity_{name}'] = specificity[:, i]
    df[f'zscore_{name}'] = X_z[:, i]

print(f'Specificity scores computed for {len(df):,} genes × 8 conditions')
print(f'\nMost specific condition distribution:')
print(df['most_specific_condition'].value_counts().to_string())

Specificity scores computed for 2,034 genes × 8 conditions

Most specific condition distribution:
most_specific_condition
acetate       451
glucarate     322
butanediol    304
lactate       294
asparagine    277
urea          248
glucose        70
quinate        68

# Distribution of specificity scores
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df['max_specificity'], bins=50, edgecolor='black', linewidth=0.3, alpha=0.7)
ax.axvline(1.0, color='red', linestyle='--', label='Specificity = 1.0')
ax.axvline(2.0, color='darkred', linestyle='--', label='Specificity = 2.0')
ax.set_xlabel('Max Condition Specificity Score', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Distribution of Gene Condition-Specificity', fontsize=14, fontweight='bold')
ax.legend()

n_spec1 = (df['max_specificity'] >= 1.0).sum()
n_spec2 = (df['max_specificity'] >= 2.0).sum()
ax.text(0.95, 0.95, f'Score ≥ 1.0: {n_spec1} genes\nScore ≥ 2.0: {n_spec2} genes',
        transform=ax.transAxes, ha='right', va='top', fontsize=11,
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'specificity_distribution.png'), dpi=150, bbox_inches='tight')
plt.show()
print('Saved: figures/specificity_distribution.png')

Saved: figures/specificity_distribution.png

# Top 20 condition-specific genes per condition (by defect — negative z-score)
print('=== Top 20 Condition-Specific Genes (growth defect) per Carbon Source ===')

for i, name in enumerate(CONDITION_NAMES):
    col_spec = f'specificity_{name}'
    col_z = f'zscore_{name}'
    
    # Genes with high specificity AND negative z-score (defect)
    candidates = df[df[col_z] < -1.0].nlargest(20, col_spec)
    
    if len(candidates) == 0:
        print(f'\n{name.upper()}: No condition-specific defect genes (z < -1.0)')
        continue
    
    print(f'\n{name.upper()} (n={len(candidates)}):')
    for _, row in candidates.head(10).iterrows():
        func = row['rast_function'][:55] if pd.notna(row['rast_function']) else 'Hypothetical'
        locus = row['old_locus_tag'] if pd.notna(row['old_locus_tag']) else row['feature_id']
        gene = f" ({row['gene_names']})" if pd.notna(row.get('gene_names')) else ''
        print(f'  {locus:15s}{gene:12s} z={row[col_z]:>6.2f} spec={row[col_spec]:>5.2f}  {func}')

=== Top 20 Condition-Specific Genes (growth defect) per Carbon Source ===

ACETATE (n=20):
  ACIAD3465                   z= -4.85 spec= 4.42  Two-component system sensor histidine kinase
  ACIAD0335       (fadB)      z= -4.80 spec= 4.37  Enoyl-CoA hydratase (EC 4.2.1.17);Delta(3)-cis-delta(2)
  ACIAD3522                   z= -4.80 spec= 4.30  NADH-FMN oxidoreductase
  ACIAD2318                   z= -4.66 spec= 4.03  Bis(5'-nucleosyl)-tetraphosphatase (asymmetrical) (EC 3
  ACIAD1135       (nudC)      z= -4.93 spec= 3.62  NADH pyrophosphatase (EC 3.6.1.22), decaps 5'-NAD modif
  ACIAD3469       (citB)      z= -4.72 spec= 3.46  Two-component transcriptional response regulator, LuxR 
  ACIAD2335                   z= -4.95 spec= 3.04  Malate synthase G (EC 2.3.3.9)
  ACIAD0043                   z= -3.65 spec= 2.84  Similar to phosphoglycolate phosphatase, clustered with
  ACIAD1115       (lon)       z= -3.30 spec= 2.47  ATP-dependent protease La (EC 3.4.21.53) Type I
  ACIAD2284                   z= -2.88 spec= 2.34  16S rRNA (uracil(1498)-N(3))-methyltransferase (EC 2.1.

ASPARAGINE (n=20):
  ACIAD1744                   z= -9.60 spec= 9.27  Aspartate ammonia-lyase (EC 4.3.1.1)
  ACIAD2088                   z= -8.90 spec= 8.47  L-asparaginase (EC 3.5.1.1)
  ACIAD1648                   z= -4.78 spec= 3.25  Leucine-responsive regulatory protein, regulator for le
  ACIAD0091                   z= -6.79 spec= 3.05  Glycosyl transferase, group 1
  ACIAD2207       (epmB)      z= -6.30 spec= 2.84  Lysine 2,3-aminomutase (EC 5.4.3.2)
  ACIAD2202                   z= -3.50 spec= 2.82  DUF58 domain-containing protein
  ACIAD0090                   z= -6.64 spec= 2.50  Glycosyl transferase, group 1 family protein
  ACIAD1509       (cntD)      z= -2.92 spec= 2.35  ABC transporter, ATP-binding protein (cluster 5, nickel
  ACIAD2841                   z= -2.72 spec= 2.24  Transcription regulator [contains diacylglycerol kinase
  ACIAD0912                   z= -3.03 spec= 2.23  Twitching motility protein PilT

BUTANEDIOL (n=20):
  ACIAD1019                   z= -5.82 spec= 5.47  Dihydrolipoamide acetyltransferase component (E2) of ac
  ACIAD2084                   z= -5.83 spec= 5.46  Transcriptional regulator, AraC family
  ACIAD1014                   z= -5.83 spec= 5.37  Transcriptional regulator, ArsR family
  ACIAD0922                   z= -5.65 spec= 5.20  Glycine oxidase ThiO (EC 1.4.3.19)
  ACIAD3071       (cysM)      z= -5.65 spec= 5.07  Cysteine synthase B (EC 2.5.1.47)
  ACIAD0808                   z= -5.83 spec= 5.05  Intracellular protease
  ACIAD2236                   z= -5.63 spec= 4.92  Mutator MutT protein (7,8-dihydro-8-oxoguanine-triphosp
  ACIAD1135       (nudC)      z= -5.98 spec= 4.82  NADH pyrophosphatase (EC 3.6.1.22), decaps 5'-NAD modif
  ACIAD1021                   z= -5.07 spec= 4.75  2,3-butanediol dehydrogenase, R-alcohol forming, (R)- a
  ACIAD0772       (apbC)      z= -5.90 spec= 4.66  [4Fe-4S] cluster assembly scaffold protein Mrp (=ApbC)

GLUCARATE (n=20):
  ACIAD0128       (gudD)      z=-11.28 spec=10.92  Glucarate dehydratase (EC 4.2.1.40)
  ACIAD0127                   z=-11.27 spec=10.52  D-glucarate transporter
  ACIAD0131                   z=-11.42 spec=10.48  2,5-dioxovalerate dehydrogenase (EC 1.2.1.26)
  ACIAD2417       (prmC)      z= -8.97 spec= 8.09  Peptide chain release factor N(5)-glutamine methyltrans
  ACIAD0244       (glnK)      z= -9.71 spec= 7.45  Nitrogen regulatory protein P-II, GlnK
  ACIAD0598                   z= -6.83 spec= 5.61  DUF1022 domain-containing protein
  ACIAD2263       (rodA)      z= -2.91 spec= 2.33  Rod shape-determining protein RodA
  ACIAD0266       (xerD)      z= -2.90 spec= 2.15  Site-specific tyrosine recombinase XerD
  ACIAD0144       (parA)      z= -2.40 spec= 2.05  ParA-like protein
  ACIAD0476                   z= -2.42 spec= 2.02  L-asparaginase I, cytoplasmic (EC 3.5.1.1)

GLUCOSE (n=20):
  ACIAD0543       (eda)       z= -7.98 spec= 7.54  4-hydroxy-2-oxoglutarate aldolase (EC 4.1.3.16);2-dehyd
  ACIAD0544       (gntT)      z= -7.84 spec= 7.50  Gluconate transporter family protein
  ACIAD0545                   z= -7.86 spec= 7.47  Gluconokinase (EC 2.7.1.12)
  ACIAD2507       (pqqE)      z= -7.82 spec= 7.37  Coenzyme PQQ synthesis protein E
  ACIAD2755       (acnD)      z= -7.78 spec= 7.04  2-methylcitrate dehydratase (2-methyl-trans-aconitate f
  ACIAD2983                   z= -7.96 spec= 7.01  Glucose dehydrogenase, PQQ-dependent (EC 1.1.5.2)
  ACIAD0546       (adhE)      z= -7.80 spec= 6.98  Glyceraldehyde-3-phosphate dehydrogenase, putative
  ACIAD0962       (ntrR)      z= -7.42 spec= 6.86  Nudix-related transcriptional regulator NrtR
  ACIAD1038                   z= -7.55 spec= 6.85  Hydrolase, HAD superfamily
  ACIAD0521       (tatC)      z= -7.13 spec= 6.52  Twin-arginine translocation protein TatC

LACTATE (n=20):
  ACIAD0107       (lldR)      z=-10.66 spec= 9.79  Lactate-responsive regulator LldR in Enterobacteria, Gn
  ACIAD1484       (lptC)      z=-10.56 spec= 9.55  Lipopolysaccharide export system protein LptC
  ACIAD2427       (cyoC)      z= -9.03 spec= 8.14  Cytochrome O ubiquinol oxidase subunit III (EC 1.10.3.-
  ACIAD0727       (bfmS)      z=-10.53 spec= 5.90  Two-component system sensor histidine kinase
  ACIAD2456       (ubiC)      z= -6.09 spec= 5.25  Chorismate--pyruvate lyase (EC 4.1.3.40)
  ACIAD0847       (zipA)      z= -3.97 spec= 3.43  Cell division protein ZipA
  ACIAD0598                   z= -4.90 spec= 3.41  DUF1022 domain-containing protein
  ACIAD2428       (cyoD)      z= -4.30 spec= 3.28  Cytochrome O ubiquinol oxidase subunit IV (EC 1.10.3.-)
  ACIAD3500       (recJ)      z= -3.09 spec= 2.49  Single-stranded-DNA-specific exonuclease RecJ
  ACIAD2628                   z= -3.02 spec= 2.31  hypothetical protein

QUINATE (n=20):
  ACIAD1710       (pcaC)      z= -8.66 spec= 8.02  4-carboxymuconolactone decarboxylase (EC 4.1.1.44)
  ACIAD1712       (pcaG)      z= -8.62 spec= 7.99  Protocatechuate 3,4-dioxygenase alpha chain (EC 1.13.11
  ACIAD1716       (quiA)      z= -8.38 spec= 7.94  Quinate/shikimate dehydrogenase [Pyrroloquinoline-quino
  ACIAD1713       (quiB)      z= -8.64 spec= 7.82  3-dehydroquinate dehydratase I (EC 4.2.1.10)
  ACIAD1711       (pcaH)      z= -8.47 spec= 7.82  Protocatechuate 3,4-dioxygenase beta chain (EC 1.13.11.
  ACIAD1707       (pcaB)      z= -8.51 spec= 7.71  3-carboxy-cis,cis-muconate cycloisomerase (EC 5.5.1.2)
  ACIAD2505       (pqqC)      z= -8.39 spec= 6.93  Pyrroloquinoline-quinone synthase (EC 1.3.3.11)
  ACIAD2506       (pqqD)      z= -8.39 spec= 6.70  Coenzyme PQQ synthesis protein D
  ACIAD1590       (exbD)      z= -7.08 spec= 6.33  Biopolymer transport protein ExbD/TolR
  ACIAD3137       (yajQ)      z= -7.06 spec= 6.14  UPF0234 protein Yitk

UREA (n=20):
  ACIAD1093       (ureE)      z= -7.87 spec= 7.40  Urease accessory protein UreE
  ACIAD1095       (ureG)      z= -7.76 spec= 7.22  Urease accessory protein UreG
  ACIAD1094       (ureF)      z= -7.65 spec= 7.20  Urease accessory protein UreF
  ACIAD1091       (ureC)      z= -7.55 spec= 7.19  Urease alpha subunit (EC 3.5.1.5)
  ACIAD1090       (ureB)      z= -7.61 spec= 7.12  Urease beta subunit (EC 3.5.1.5)
  ACIAD1089       (ureA)      z= -7.51 spec= 6.96  Urease gamma subunit (EC 3.5.1.5)
  ACIAD1088       (ureD)      z= -7.65 spec= 6.85  Urease accessory protein UreD
  ACIAD0148       (dedA)      z= -5.47 spec= 4.65  DedA protein
  ACIAD1549                   z= -4.83 spec= 4.12  Short-chain dehydrogenase, associated with 2-hydroxychr
  ACIAD0352       (ruvX)      z= -5.76 spec= 3.93  Putative pre-16S rRNA nuclease YqgF

# Quinate-specific genes
quinate_specific = df[(df['specificity_quinate'] > 0.5) & (df['zscore_quinate'] < -1.0)]
quinate_specific = quinate_specific.sort_values('specificity_quinate', ascending=False)

print(f'Quinate-specific genes (spec>0.5, z<-1): {len(quinate_specific)}')
print()
print('=== Quinate-Specific Genes ===')
for _, row in quinate_specific.iterrows():
    func = row['rast_function'][:60] if pd.notna(row['rast_function']) else 'Hypothetical'
    locus = row['old_locus_tag'] if pd.notna(row['old_locus_tag']) else row['feature_id']
    print(f'  {locus:15s} z={row["zscore_quinate"]:>6.2f} spec={row["specificity_quinate"]:>5.2f}  {func}')

# Check if aromatic degradation pathway genes are enriched
aromatic_keywords = ['quinate', 'shikimate', 'protocatechuate', 'benzoate', 'catechol',
                     'aromatic', 'pca', 'pob', 'van', 'hca', 'ben', 'sal']
quinate_aromatic = quinate_specific[quinate_specific['rast_function'].str.lower().str.contains(
    '|'.join(aromatic_keywords), na=False
)]
print(f'\nOf these, aromatic degradation related: {len(quinate_aromatic)}')

Quinate-specific genes (spec>0.5, z<-1): 51

=== Quinate-Specific Genes ===
  ACIAD1710       z= -8.66 spec= 8.02  4-carboxymuconolactone decarboxylase (EC 4.1.1.44)
  ACIAD1712       z= -8.62 spec= 7.99  Protocatechuate 3,4-dioxygenase alpha chain (EC 1.13.11.3)
  ACIAD1716       z= -8.38 spec= 7.94  Quinate/shikimate dehydrogenase [Pyrroloquinoline-quinone] (
  ACIAD1713       z= -8.64 spec= 7.82  3-dehydroquinate dehydratase I (EC 4.2.1.10)
  ACIAD1711       z= -8.47 spec= 7.82  Protocatechuate 3,4-dioxygenase beta chain (EC 1.13.11.3)
  ACIAD1707       z= -8.51 spec= 7.71  3-carboxy-cis,cis-muconate cycloisomerase (EC 5.5.1.2)
  ACIAD2505       z= -8.39 spec= 6.93  Pyrroloquinoline-quinone synthase (EC 1.3.3.11)
  ACIAD2506       z= -8.39 spec= 6.70  Coenzyme PQQ synthesis protein D
  ACIAD1590       z= -7.08 spec= 6.33  Biopolymer transport protein ExbD/TolR
  ACIAD3137       z= -7.06 spec= 6.14  UPF0234 protein Yitk
  ACIAD0739       z= -6.82 spec= 6.10  NADH-ubiquinone oxidoreductase chain J (EC 1.6.5.3)
  ACIAD2176       z= -6.90 spec= 6.07  DUF2280 domain-containing protein
  ACIAD2124       z= -6.82 spec= 6.07  Uncharacterized siderophore S biosynthesis protein, AcsD-lik
  ACIAD2741       z= -7.03 spec= 6.05  Phage capsid and scaffold
  ACIAD0735       z= -6.97 spec= 6.03  NADH-ubiquinone oxidoreductase chain F (EC 1.6.5.3)
  ACIAD0742       z= -7.00 spec= 5.99  NADH-ubiquinone oxidoreductase chain M (EC 1.6.5.3)
  ACIAD0737       z= -6.69 spec= 5.91  NADH-ubiquinone oxidoreductase chain H (EC 1.6.5.3)
  ACIAD0743       z= -6.84 spec= 5.91  NADH-ubiquinone oxidoreductase chain N (EC 1.6.5.3)
  ACIAD0736       z= -6.76 spec= 5.86  NADH-ubiquinone oxidoreductase chain G (EC 1.6.5.3)
  ACIAD1818       z= -7.04 spec= 5.82  3-methylmercaptopropionyl-CoA ligase (EC 6.2.1.44) of DmdB2 
  ACIAD0734       z= -6.91 spec= 5.81  NADH-ubiquinone oxidoreductase chain E (EC 1.6.5.3)
  ACIAD2066       z= -6.85 spec= 5.80  Coenzyme F420-dependent N5,N10-methylene tetrahydromethanopt
  ACIAD0730       z= -6.80 spec= 5.80  NADH ubiquinone oxidoreductase chain A (EC 1.6.5.3)
  ACIAD0740       z= -6.58 spec= 5.77  NADH-ubiquinone oxidoreductase chain K (EC 1.6.5.3)
  ACIAD0738       z= -6.64 spec= 5.72  NADH-ubiquinone oxidoreductase chain I (EC 1.6.5.3)
  ACIAD1648       z= -6.61 spec= 5.34  Leucine-responsive regulatory protein, regulator for leucine
  ACIAD0090       z= -7.10 spec= 3.03  Glycosyl transferase, group 1 family protein
  ACIAD0091       z= -5.50 spec= 1.58  Glycosyl transferase, group 1
  ACIAD0214       z= -1.92 spec= 1.02  TonB-dependent receptor
  ACIAD0778       z= -1.39 spec= 0.97  hypothetical protein
  ACIAD1708       z= -1.81 spec= 0.91  Beta-ketoadipate enol-lactone hydrolase (EC 3.1.1.24)
  ACIAD0347       z= -1.44 spec= 0.88  Transcriptional regulator, IclR family
  ACIAD0444       z= -1.53 spec= 0.81  Transcriptional regulator, DeoR family
  ACIAD1879       z= -1.76 spec= 0.77  S-(hydroxymethyl)glutathione dehydrogenase (EC 1.1.1.284)
  ACIAD3129       z= -1.39 spec= 0.72  Transcriptional regulator, LysR family
  ACIAD0750       z= -1.30 spec= 0.71  Small-conductance mechanosensitive channel
  ACIAD0966       z= -1.07 spec= 0.71  Arginine exporter protein ArgO
  ACIAD0323       z= -1.78 spec= 0.69  LysR-family transcriptional regulator YjiE
  ACIAD0230       z= -1.13 spec= 0.68  UPF0053 inner membrane protein YoaE
  ACIAD3176       z= -1.22 spec= 0.67  Osmosensitive K+ channel histidine kinase KdpD
  ACIAD0294       z= -1.13 spec= 0.65  General secretion pathway protein D
  ACIAD3641       z= -1.25 spec= 0.64  Sodium-dependent transporter, SNF family
  ACIAD0526       z= -1.39 spec= 0.63  Nucleoside 5-triphosphatase RdgB (dHAPTP, dITP, XTP-specific
  ACIAD0198       z= -1.00 spec= 0.62  ABC transporter, substrate-binding protein (cluster 11, ribo
  ACIAD2540       z= -1.38 spec= 0.60  hypothetical protein
  ACIAD0403       z= -1.27 spec= 0.55  Magnesium and cobalt transport protein CorA
  ACIAD0473       z= -1.01 spec= 0.54  Transcriptional regulator, AraC family
  ACIAD2213       z= -1.34 spec= 0.53  Lactoylglutathione lyase (EC 4.4.1.5)
  ACIAD0049       z= -1.39 spec= 0.51  Linoleoyl-CoA desaturase (EC 1.14.19.3)
  ACIAD2049       z= -1.05 spec= 0.51  Ferrichrome-iron receptor
  ACIAD0419       z= -1.36 spec= 0.51  DNA gyrase inhibitor YacG

Of these, aromatic degradation related: 4

# Urea-specific genes
urea_specific = df[(df['specificity_urea'] > 0.5) & (df['zscore_urea'] < -1.0)]
urea_specific = urea_specific.sort_values('specificity_urea', ascending=False)

print(f'Urea-specific genes (spec>0.5, z<-1): {len(urea_specific)}')
print()
if len(urea_specific) > 0:
    print('=== Top 20 Urea-Specific Genes ===')
    for _, row in urea_specific.head(20).iterrows():
        func = row['rast_function'][:60] if pd.notna(row['rast_function']) else 'Hypothetical'
        locus = row['old_locus_tag'] if pd.notna(row['old_locus_tag']) else row['feature_id']
        print(f'  {locus:15s} z={row["zscore_urea"]:>6.2f} spec={row["specificity_urea"]:>5.2f}  {func}')
else:
    print('Note: Urea has very low variance — almost all genes show severe defects.')
    print('This makes condition-specificity less meaningful for urea.')
    print(f'Urea std of z-scores: {df["zscore_urea"].std():.3f}')

Urea-specific genes (spec>0.5, z<-1): 138

=== Top 20 Urea-Specific Genes ===
  ACIAD1093       z= -7.87 spec= 7.40  Urease accessory protein UreE
  ACIAD1095       z= -7.76 spec= 7.22  Urease accessory protein UreG
  ACIAD1094       z= -7.65 spec= 7.20  Urease accessory protein UreF
  ACIAD1091       z= -7.55 spec= 7.19  Urease alpha subunit (EC 3.5.1.5)
  ACIAD1090       z= -7.61 spec= 7.12  Urease beta subunit (EC 3.5.1.5)
  ACIAD1089       z= -7.51 spec= 6.96  Urease gamma subunit (EC 3.5.1.5)
  ACIAD1088       z= -7.65 spec= 6.85  Urease accessory protein UreD
  ACIAD0148       z= -5.47 spec= 4.65  DedA protein
  ACIAD1549       z= -4.83 spec= 4.12  Short-chain dehydrogenase, associated with 2-hydroxychromene
  ACIAD0352       z= -5.76 spec= 3.93  Putative pre-16S rRNA nuclease YqgF
  ACIAD0110       z= -6.68 spec= 3.78  K30 capsule biosynthesis cluster, partial sequence
  ACIAD3582       z= -3.38 spec= 3.13  Uncharacterized protein YbbK
  ACIAD1671       z= -3.35 spec= 2.77  histidine-type phosphatase
  ACIAD1830       z= -3.19 spec= 2.77  putative LuxA
  ACIAD3119       z= -3.39 spec= 2.75  Ribosome small subunit biogenesis RbfA-release protein RsgA
  ACIAD2287       z= -3.69 spec= 2.65  NADP-dependent malic enzyme (EC 1.1.1.40)
  ACIAD3418       z= -3.06 spec= 2.55  MloA
  ACIAD2995       z= -2.97 spec= 2.55  SAM-dependent methyltransferase
  ACIAD0538       z= -3.60 spec= 2.53  Fumarate hydratase class I, aerobic (EC 4.2.1.2)
  ACIAD2581       z= -3.76 spec= 2.48  Ribonuclease III (EC 3.1.26.3)

# Get top 10 most condition-specific genes per condition
top_genes = []
for name in CONDITION_NAMES:
    col_spec = f'specificity_{name}'
    top = df.nlargest(10, col_spec)
    top_genes.extend(top.index.tolist())

top_genes = list(set(top_genes))  # deduplicate
df_top = df.loc[top_genes]

# Z-score heatmap for these genes
z_cols = [f'zscore_{n}' for n in CONDITION_NAMES]
z_matrix = df_top[z_cols].rename(
    columns={f'zscore_{n}': n.capitalize() for n in CONDITION_NAMES}
)

# Label rows with locus tag or feature_id
row_labels = []
for _, row in df_top.iterrows():
    locus = row['old_locus_tag'] if pd.notna(row['old_locus_tag']) else row['feature_id']
    cond = row['most_specific_condition']
    row_labels.append(f'{locus} ({cond})')

z_matrix.index = row_labels

g = sns.clustermap(z_matrix, cmap='RdBu_r', center=0, vmin=-5, vmax=5,
                   figsize=(10, max(8, len(top_genes) * 0.25)),
                   row_cluster=True, col_cluster=False,
                   linewidths=0.3, yticklabels=True)
g.fig.suptitle(f'Z-Score Profiles of Top Condition-Specific Genes (n={len(top_genes)})',
               fontsize=14, fontweight='bold', y=1.02)
plt.savefig(os.path.join(FIG_DIR, 'condition_specific_heatmap.png'), dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: figures/condition_specific_heatmap.png')

Saved: figures/condition_specific_heatmap.png

# Save all genes with specificity scores
df.to_csv(os.path.join(DATA_DIR, 'condition_specific_genes.csv'), index=False)
print(f'Saved: data/condition_specific_genes.csv ({len(df):,} rows)')

print(f'\n=== NB04 Summary ===')
print(f'Genes analyzed: {len(df):,}')
print(f'Genes with specificity ≥ 1.0: {(df["max_specificity"] >= 1.0).sum()}')
print(f'Genes with specificity ≥ 2.0: {(df["max_specificity"] >= 2.0).sum()}')
print(f'\nCondition with most specific genes:')
for name in CONDITION_NAMES:
    n = ((df['most_specific_condition'] == name) & (df['max_specificity'] >= 1.0)).sum()
    if n > 0:
        print(f'  {name:15s} {n:>3} genes')

Saved: data/condition_specific_genes.csv (2,034 rows)

=== NB04 Summary ===
Genes analyzed: 2,034
Genes with specificity ≥ 1.0: 625
Genes with specificity ≥ 2.0: 160

Condition with most specific genes:
  acetate          80 genes
  asparagine      120 genes
  butanediol       72 genes
  glucarate        76 genes
  glucose          48 genes
  lactate          90 genes
  quinate          27 genes
  urea            112 genes

04 Condition Specific

NB04: Condition-Specific Genes¶

1. Condition Specificity Score¶

2. Top Condition-Specific Genes per Carbon Source¶

3. Quinate-Specific Genes — Cross-Reference with Prior Project¶

4. Urea-Specific Genes — Nitrogen Metabolism¶

5. Condition-Specificity Heatmap — Top Genes¶

6. Save Results¶