import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data') if os.path.basename(os.getcwd()) == 'notebooks' else 'data'
FIG_DIR = os.path.join(os.path.dirname(os.getcwd()), 'figures') if os.path.basename(os.getcwd()) == 'notebooks' else 'figures'
os.makedirs(FIG_DIR, exist_ok=True)

pathway_matrix = pd.read_csv(os.path.join(DATA_DIR, 'species_pathway_profiles.csv'), index_col=0)
species_env = pd.read_csv(os.path.join(DATA_DIR, 'species_lifestyle.csv'))
species_df = pd.read_csv(os.path.join(DATA_DIR, 'pseudomonas_species.csv'))

print(f'Loaded: {pathway_matrix.shape[0]} species x {pathway_matrix.shape[1]} pathways')

Loaded: 433 species x 62 pathways

# Merge pathway profiles with environment data
merged = pathway_matrix.reset_index().merge(
    species_env[['gtdb_species_clade_id', 'majority_environment', 'majority_lifestyle',
                 'n_genomes_classified', 'pct_majority']],
    on='gtdb_species_clade_id'
).merge(
    species_df[['gtdb_species_clade_id', 'no_genomes', 'gtdb_subgenus']],
    on='gtdb_species_clade_id'
)

# Filter: free-living species with >= 5 genomes and >= 60% majority agreement
free_living = merged[
    (merged['majority_lifestyle'].isin(['free_living', 'plant_associated'])) &
    (merged['no_genomes'] >= 5) &
    (merged['pct_majority'] >= 60)
].copy()

print(f'Free-living/plant-associated species (>= 5 genomes, >= 60% agreement): {len(free_living)}')
print(f'\nEnvironment distribution:')
print(free_living['majority_environment'].value_counts().to_string())

Free-living/plant-associated species (>= 5 genomes, >= 60% agreement): 54

Environment distribution:
majority_environment
plant_surface    19
freshwater       13
soil             13
rhizosphere       6
industrial        2
marine            1

pathways = pathway_matrix.columns.tolist()
X = free_living[pathways].values

# PCA
pca = PCA(n_components=min(10, X.shape[1]))
X_pca = pca.fit_transform(X)

print(f'PCA explained variance:')
for i, var in enumerate(pca.explained_variance_ratio_[:5]):
    print(f'  PC{i+1}: {var:.1%}')
print(f'  Total (5 PCs): {pca.explained_variance_ratio_[:5].sum():.1%}')

# Plot PC1 vs PC2 colored by environment
fig, ax = plt.subplots(figsize=(10, 8))
env_colors = {
    'soil': '#8B4513', 'rhizosphere': '#228B22', 'freshwater': '#4169E1',
    'marine': '#00CED1', 'plant_surface': '#32CD32', 'industrial': '#808080'
}

for env in free_living['majority_environment'].unique():
    mask = free_living['majority_environment'] == env
    color = env_colors.get(env, '#999999')
    ax.scatter(X_pca[mask.values, 0], X_pca[mask.values, 1],
              label=f'{env} (n={mask.sum()})', alpha=0.7, s=50, color=color)

ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
ax.set_title('PCA of Carbon Pathway Profiles — Free-Living Pseudomonas Species')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'pathway_pca_by_environment.png'), dpi=150, bbox_inches='tight')
plt.show()

PCA explained variance:
  PC1: 31.2%
  PC2: 17.9%
  PC3: 11.2%
  PC4: 7.7%
  PC5: 6.8%
  Total (5 PCs): 74.9%

from scipy.spatial.distance import pdist, squareform

# Compute pairwise Bray-Curtis-like distance (Euclidean on fraction data)
D = squareform(pdist(X, metric='euclidean'))

# Simple PERMANOVA-like test: compare within-group vs between-group distances
env_labels = free_living['majority_environment'].values
unique_envs = [e for e in np.unique(env_labels) if (env_labels == e).sum() >= 3]

within_dists = []
between_dists = []
n = len(env_labels)
for i in range(n):
    for j in range(i+1, n):
        if env_labels[i] in unique_envs and env_labels[j] in unique_envs:
            if env_labels[i] == env_labels[j]:
                within_dists.append(D[i, j])
            else:
                between_dists.append(D[i, j])

print(f'Within-group mean distance: {np.mean(within_dists):.3f} (n={len(within_dists)})')
print(f'Between-group mean distance: {np.mean(between_dists):.3f} (n={len(between_dists)})')

# Permutation test
observed_ratio = np.mean(between_dists) / np.mean(within_dists)
n_perm = 999
perm_ratios = []
for _ in range(n_perm):
    perm_labels = np.random.permutation(env_labels)
    w, b = [], []
    for i in range(n):
        for j in range(i+1, n):
            if perm_labels[i] in unique_envs and perm_labels[j] in unique_envs:
                if perm_labels[i] == perm_labels[j]:
                    w.append(D[i, j])
                else:
                    b.append(D[i, j])
    if len(w) > 0:
        perm_ratios.append(np.mean(b) / np.mean(w))

p_value = (np.sum(np.array(perm_ratios) >= observed_ratio) + 1) / (n_perm + 1)
print(f'\nPERMANOVA-like test:')
print(f'  Observed B/W ratio: {observed_ratio:.3f}')
print(f'  Permutation p-value: {p_value:.4f}')

Within-group mean distance: 1.890 (n=342)
Between-group mean distance: 2.054 (n=933)

PERMANOVA-like test:
  Observed B/W ratio: 1.087
  Permutation p-value: 0.0060

# Only use environments with >= 5 species for classification
env_counts = free_living['majority_environment'].value_counts()
valid_envs = env_counts[env_counts >= 5].index.tolist()
clf_data = free_living[free_living['majority_environment'].isin(valid_envs)].copy().reset_index(drop=True)

if len(valid_envs) >= 2 and len(clf_data) >= 10:
    X_clf = clf_data[pathways].values.copy()
    y_clf = np.array(clf_data['majority_environment'].values)

    # Stratified cross-validation
    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    min_class_size = min(pd.Series(y_clf).value_counts())
    n_splits = min(5, min_class_size)

    if n_splits >= 2:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        scores = cross_val_score(rf, X_clf, y_clf, cv=cv, scoring='balanced_accuracy')
        print(f'Random Forest ({len(valid_envs)} classes, {len(clf_data)} species):')
        print(f'  Cross-val balanced accuracy: {scores.mean():.3f} +/- {scores.std():.3f}')
        print(f'  Chance level: {1/len(valid_envs):.3f}')

        # Fit on all data for feature importance
        rf.fit(X_clf, y_clf)
        importances = pd.Series(rf.feature_importances_, index=pathways).sort_values(ascending=False)
        print(f'\nTop 15 discriminating pathways:')
        print(importances.head(15).to_string())

        # Feature importance plot
        fig, ax = plt.subplots(figsize=(10, 6))
        importances.head(20).plot(kind='barh', ax=ax, color='steelblue')
        ax.set_xlabel('Feature importance (Gini)')
        ax.set_title('Top 20 Carbon Pathways Discriminating Environment Types')
        ax.invert_yaxis()
        plt.tight_layout()
        plt.savefig(os.path.join(FIG_DIR, 'rf_importance.png'), dpi=150, bbox_inches='tight')
        plt.show()
    else:
        print(f'Insufficient samples per class for cross-validation (min class size = {min_class_size})')
        print('Falling back to leave-one-out or simple train/test split not attempted.')
        print('This suggests fine-grained environment types have too few species for predictive modeling.')
else:
    print(f'Insufficient environment categories ({len(valid_envs)}) or species ({len(clf_data)}) for classification.')
    print('This suggests carbon pathway profiles may not strongly predict fine-grained environment type among free-living species.')

Random Forest (4 classes, 51 species):
  Cross-val balanced accuracy: 0.408 +/- 0.169
  Chance level: 0.250

Top 15 discriminating pathways:
D-serine         0.131708
arabinose        0.094083
rhamnose         0.086000
fucose           0.084542
xylose           0.069821
NAG              0.069084
xylitol          0.067386
mannose          0.050920
mannitol         0.048859
myoinositol      0.045657
sorbitol         0.045537
glucosamine      0.035278
propionate       0.021892
ribose           0.019198
galacturonate    0.017819

# All species with lifestyle assignments
all_with_lifestyle = pathway_matrix.reset_index().merge(
    species_env[['gtdb_species_clade_id', 'majority_lifestyle']],
    on='gtdb_species_clade_id'
).merge(
    species_df[['gtdb_species_clade_id', 'no_genomes']],
    on='gtdb_species_clade_id'
)

all_with_lifestyle = all_with_lifestyle[
    (all_with_lifestyle['no_genomes'] >= 5) &
    (all_with_lifestyle['majority_lifestyle'] != 'unknown')
]

# PCA colored by lifestyle
X_all = all_with_lifestyle[pathways].values
pca_all = PCA(n_components=5)
X_pca_all = pca_all.fit_transform(X_all)

fig, ax = plt.subplots(figsize=(10, 8))
lifestyle_colors = {
    'free_living': '#2ecc71', 'host_associated': '#e74c3c',
    'plant_associated': '#f39c12', 'food_associated': '#9b59b6'
}

for lifestyle in all_with_lifestyle['majority_lifestyle'].unique():
    mask = all_with_lifestyle['majority_lifestyle'] == lifestyle
    color = lifestyle_colors.get(lifestyle, '#999999')
    ax.scatter(X_pca_all[mask.values, 0], X_pca_all[mask.values, 1],
              label=f'{lifestyle} (n={mask.sum()})', alpha=0.7, s=50, color=color)

ax.set_xlabel(f'PC1 ({pca_all.explained_variance_ratio_[0]:.1%})')
ax.set_ylabel(f'PC2 ({pca_all.explained_variance_ratio_[1]:.1%})')
ax.set_title('PCA of Carbon Pathway Profiles — All Pseudomonas by Lifestyle')
ax.legend()
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'pathway_pca_by_lifestyle.png'), dpi=150, bbox_inches='tight')
plt.show()

# Pathway richness by lifestyle
all_with_lifestyle['pathway_richness'] = (all_with_lifestyle[pathways] > 0.5).sum(axis=1)

fig, ax = plt.subplots(figsize=(8, 5))
order = ['free_living', 'plant_associated', 'food_associated', 'host_associated']
order = [o for o in order if o in all_with_lifestyle['majority_lifestyle'].values]
sns.boxplot(data=all_with_lifestyle, x='majority_lifestyle', y='pathway_richness',
            order=order, ax=ax, palette=[lifestyle_colors.get(o, '#999') for o in order])
ax.set_xlabel('Lifestyle')
ax.set_ylabel('Carbon pathway richness\n(pathways complete in >50% of genomes)')
ax.set_title('Carbon Pathway Richness by Lifestyle')

# Add stats
for lifestyle in order:
    sub = all_with_lifestyle[all_with_lifestyle['majority_lifestyle'] == lifestyle]
    print(f'{lifestyle}: n={len(sub)}, median richness={sub["pathway_richness"].median():.0f}, mean={sub["pathway_richness"].mean():.1f}')

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'pathway_richness_by_lifestyle.png'), dpi=150, bbox_inches='tight')
plt.show()

free_living: n=91, median richness=57, mean=55.9
plant_associated: n=31, median richness=57, mean=56.8
food_associated: n=9, median richness=55, mean=55.7
host_associated: n=66, median richness=55, mean=54.5

/tmp/ipykernel_12822/4201764123.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=all_with_lifestyle, x='majority_lifestyle', y='pathway_richness',

04 Ecology Prediction

04 - Predicting Ecology from Carbon Profiles¶

1. Subset to Free-Living Species with Clear Environment Classification¶

2. PCA of Carbon Pathway Profiles¶

3. PERMANOVA: Do Environment Categories Differ in Pathway Space?¶

4. Random Forest: Predict Environment from Pathway Profile¶

5. Broader Comparison: All Lifestyles¶

Summary¶