import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

PROJ = Path('..')
REPO = PROJ / '..'
MAT_DIR = REPO / 'fitness_modules' / 'data' / 'matrices'
METAL_ATLAS = REPO / 'metal_fitness_atlas' / 'data'
ESSENTIAL = REPO / 'essential_genome' / 'data'
DATA_OUT = PROJ / 'data'
FIG_OUT = PROJ / 'figures'

# Load DvH data
metal_imp = pd.read_csv(METAL_ATLAS / 'metal_important_genes.csv')
nacl_imp = pd.read_csv(DATA_OUT / 'nacl_important_genes.csv')
metal_exps = pd.read_csv(METAL_ATLAS / 'metal_experiments.csv')

# DvH metal-important genes
dvh_metal = metal_imp[metal_imp['orgId'] == 'DvH'].copy()
dvh_nacl_set = set(nacl_imp[nacl_imp['orgId'] == 'DvH']['locusId'])

# Classify each gene × metal record
dvh_metal['is_nacl_important'] = dvh_metal['locusId'].isin(dvh_nacl_set)
dvh_metal['gene_class'] = dvh_metal['is_nacl_important'].map(
    {True: 'shared_stress', False: 'metal_specific'}
)

print(f'DvH metal-important records: {len(dvh_metal)}')
print(f'Unique genes: {dvh_metal["locusId"].nunique()}')
print()
print('Classification:')
print(dvh_metal['gene_class'].value_counts())
print()

# Per-metal classification
metal_class = dvh_metal.groupby(['metal_element', 'gene_class']).size().unstack(fill_value=0)
metal_class['total'] = metal_class.sum(axis=1)
metal_class['pct_shared'] = 100 * metal_class.get('shared_stress', 0) / metal_class['total']
print('Per-metal classification (DvH):')
print(metal_class.sort_values('pct_shared', ascending=False).to_string())

DvH metal-important records: 1366
Unique genes: 495

Classification:
gene_class
metal_specific    1064
shared_stress      302
Name: count, dtype: int64

Per-metal classification (DvH):
gene_class     metal_specific  shared_stress  total  pct_shared
metal_element                                                  
Copper                      7             24     31   77.419355
Cobalt                     13             20     33   60.606061
Manganese                  14             19     33   57.575758
Zinc                       27             27     54   50.000000
Nickel                     28             25     53   47.169811
Iron                        6              3      9   33.333333
Uranium                    45             18     63   28.571429
Selenium                  101             37    138   26.811594
Mercury                    82             25    107   23.364486
Chromium                   98             26    124   20.967742
Aluminum                   92             17    109   15.596330
Tungsten                  273             33    306   10.784314
Molybdenum                278             28    306    9.150327

# Classify at the gene level (across all metals)
# A gene is "shared_stress" if it's NaCl-important; "metal_specific" otherwise
dvh_genes = dvh_metal.groupby('locusId').agg(
    n_metals=('metal_element', 'nunique'),
    metals=('metal_element', lambda x: ','.join(sorted(x.unique()))),
    is_nacl=('is_nacl_important', 'first')  # same for all records of a gene
).reset_index()
dvh_genes['gene_class'] = dvh_genes['is_nacl'].map(
    {True: 'shared_stress', False: 'metal_specific'}
)

print(f'Unique DvH metal-important genes: {len(dvh_genes)}')
print(dvh_genes['gene_class'].value_counts())
print()
print('Metal breadth by class:')
for cls in ['shared_stress', 'metal_specific']:
    sub = dvh_genes[dvh_genes['gene_class'] == cls]
    print(f'  {cls}: n={len(sub)}, mean metals={sub["n_metals"].mean():.1f}, '
          f'median={sub["n_metals"].median():.0f}')

Unique DvH metal-important genes: 495
gene_class
metal_specific    422
shared_stress      73
Name: count, dtype: int64

Metal breadth by class:
  shared_stress: n=73, mean metals=4.1, median=3
  metal_specific: n=422, mean metals=2.5, median=2

# Load SEED annotations for DvH
seed_file = ESSENTIAL / 'all_seed_annotations.tsv'
if seed_file.exists():
    seed = pd.read_csv(seed_file, sep='\t')
    dvh_seed = seed[seed['orgId'] == 'DvH'][['locusId', 'seed_desc']].drop_duplicates('locusId')
    
    dvh_genes = dvh_genes.merge(dvh_seed, on='locusId', how='left')
    
    # Annotated fraction by class
    for cls in ['shared_stress', 'metal_specific']:
        sub = dvh_genes[dvh_genes['gene_class'] == cls]
        annotated = sub['seed_desc'].notna().sum()
        print(f'{cls}: {annotated}/{len(sub)} annotated ({100*annotated/len(sub):.1f}%)')
    
    print()
    print('Top SEED annotations in shared_stress genes:')
    shared = dvh_genes[dvh_genes['gene_class'] == 'shared_stress']
    print(shared['seed_desc'].value_counts().head(15).to_string())
    
    print()
    print('Top SEED annotations in metal_specific genes:')
    specific = dvh_genes[dvh_genes['gene_class'] == 'metal_specific']
    print(specific['seed_desc'].value_counts().head(15).to_string())
else:
    print('SEED annotations not available; skipping functional enrichment')

shared_stress: 57/73 annotated (78.1%)
metal_specific: 382/422 annotated (90.5%)

Top SEED annotations in shared_stress genes:
seed_desc
Translation elongation factor LepA                                             1
Shikimate kinase I (EC 2.7.1.71)                                               1
ATP synthase protein I                                                         1
Menaquinone via futalosine polyprenyltransferase (MenA homolog)                1
Transmembrane component NikQ of energizing module of nickel ECF transporter    1
Substrate-specific component NikM of nickel ECF transporter                    1
Ribonuclease P protein component (EC 3.1.26.5)                                 1
response regulator                                                             1
Nucleoside triphosphate pyrophosphohydrolase MazG (EC 3.6.1.8)                 1
COG1399 protein, clustered with ribosomal protein L32p                         1
ATP-dependent Clp protease proteolytic subunit (EC 3.4.21.92)                  1
Cell division initiation protein DivIVA                                        1
Acetolactate synthase small subunit (EC 2.2.1.6)                               1
GTP-binding and nucleic acid-binding protein YchF                              1
FIG137478: Hypothetical protein                                                1

Top SEED annotations in metal_specific genes:
seed_desc
membrane protein, putative                                               5
Sensory box histidine kinase/response regulator                          3
response regulator                                                       3
Glutamate synthase [NADPH] large chain (EC 1.4.1.13)                     2
Ni/Fe-hydrogenase 2 B-type cytochrome subunit                            2
Fe-S-cluster-containing hydrogenase components 1                         2
dTDP-4-dehydrorhamnose 3,5-epimerase (EC 5.1.3.13)                       2
sigma-54 dependent transcriptional regulator/response regulator          2
Response regulator                                                       2
Aspartate aminotransferase (EC 2.6.1.1)                                  2
FIG00603085: hypothetical protein                                        2
ATP-dependent protease La (EC 3.4.21.53) Type I                          2
COG1180: Radical SAM, Pyruvate-formate lyase-activating enzyme like      2
Flagellar hook protein FlgE                                              2
Nitrogenase (molybdenum-iron)-specific transcriptional regulator NifA    2

# Extend classification to ALL organisms with both NaCl and metal data
overlap = pd.read_csv(DATA_OUT / 'metal_nacl_overlap.csv')
nacl_all = pd.read_csv(DATA_OUT / 'nacl_important_genes.csv')

all_class_records = []

for org in metal_imp['orgId'].unique():
    nacl_genes_org = set(nacl_all[nacl_all['orgId'] == org]['locusId'])
    if not nacl_genes_org:
        continue
    
    org_metal = metal_imp[metal_imp['orgId'] == org]
    for _, row in org_metal.iterrows():
        all_class_records.append({
            'orgId': row['orgId'],
            'locusId': row['locusId'],
            'metal_element': row['metal_element'],
            'gene_class': 'shared_stress' if row['locusId'] in nacl_genes_org else 'metal_specific'
        })

all_classified = pd.DataFrame(all_class_records)
print(f'Cross-organism classification: {len(all_classified)} gene × metal records')
print(all_classified['gene_class'].value_counts())
print()

# Per-organism summary
org_class = all_classified.groupby(['orgId', 'gene_class']).size().unstack(fill_value=0)
org_class['total'] = org_class.sum(axis=1)
org_class['pct_shared'] = 100 * org_class.get('shared_stress', 0) / org_class['total']
print('Per-organism classification:')
print(org_class.sort_values('pct_shared', ascending=False).to_string())

Cross-organism classification: 10821 gene × metal records
gene_class
metal_specific    6517
shared_stress     4304
Name: count, dtype: int64

Per-organism classification:
gene_class          metal_specific  shared_stress  total  pct_shared
orgId                                                               
SynE                            73            565    638   88.557994
Korea                          132            187    319   58.620690
Phaeo                          140            181    321   56.386293
Pedo557                        488            551   1039   53.031761
Ponti                          299            318    617   51.539708
Cola                           191            190    381   49.868766
psRCH2                         572            483   1055   45.781991
pseudo6_N2E2                   213            159    372   42.741935
Caulo                          507            362    869   41.657077
Cup4G11                        525            300    825   36.363636
pseudo13_GW456_L13             190            108    298   36.241611
pseudo3_N2E3                   302            157    459   34.204793
BFirm                          174             72    246   29.268293
Kang                           241             78    319   24.451411
Marino                         342             99    441   22.448980
DvH                           1064            302   1366   22.108346
Koxy                           350             91    441   20.634921
pseudo1_N1B4                   181             32    213   15.023474
Btheta                         533             69    602   11.461794

# Save classifications
all_classified.to_csv(DATA_OUT / 'gene_classification_shared_vs_metal.csv', index=False)
dvh_genes.to_csv(DATA_OUT / 'dvh_gene_classification.csv', index=False)
print(f'Saved {len(all_classified)} classified records and {len(dvh_genes)} DvH gene classifications')

Saved 10821 classified records and 495 DvH gene classifications

# Figure: Stacked bar — per-metal fraction shared vs metal-specific
per_metal = all_classified.groupby(['metal_element', 'gene_class']).size().unstack(fill_value=0)
per_metal_pct = per_metal.div(per_metal.sum(axis=1), axis=0)
per_metal_pct = per_metal_pct.sort_values('shared_stress', ascending=False)

fig, ax = plt.subplots(figsize=(10, 5))
per_metal_pct.plot(kind='bar', stacked=True, ax=ax, 
                    color=['#d62728', '#2ca02c'],
                    edgecolor='black', linewidth=0.5)
ax.set_ylabel('Fraction of metal-important genes', fontsize=11)
ax.set_xlabel('Metal', fontsize=11)
ax.set_title('Gene Classification: Shared-Stress vs Metal-Specific\n(across all organisms)', fontsize=13)
ax.legend(['Shared with NaCl stress', 'Metal-specific'], fontsize=10)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.axhline(0.5, color='gray', linestyle=':', alpha=0.5)

# Add total gene counts on top
totals = per_metal.sum(axis=1)
for i, (metal, total) in enumerate(totals.items()):
    ax.text(i, 1.02, f'n={total}', ha='center', fontsize=8)

plt.tight_layout()
fig.savefig(FIG_OUT / 'gene_classification_by_metal.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved to figures/gene_classification_by_metal.png')

Saved to figures/gene_classification_by_metal.png

03 Profile Decomposition

NB03: Profile Correlation and Gene Classification¶

1. Classify DvH Metal-Important Genes¶

2. Functional Enrichment: Shared-Stress vs Metal-Specific¶

3. Cross-Organism Gene Classification¶

4. Visualization¶

Summary¶