04 Ecology Prediction
Jupyter notebook from the Carbon Source Utilization Predicts Ecology and Lifestyle in Pseudomonas project.
04 - Predicting Ecology from Carbon Profiles¶
Runs locally (uses cached CSVs from NB01-03)
Tests H1a: Can carbon source utilization profiles predict the soil ecosystem type from which free-living Pseudomonas strains were isolated?
In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data') if os.path.basename(os.getcwd()) == 'notebooks' else 'data'
FIG_DIR = os.path.join(os.path.dirname(os.getcwd()), 'figures') if os.path.basename(os.getcwd()) == 'notebooks' else 'figures'
os.makedirs(FIG_DIR, exist_ok=True)
pathway_matrix = pd.read_csv(os.path.join(DATA_DIR, 'species_pathway_profiles.csv'), index_col=0)
species_env = pd.read_csv(os.path.join(DATA_DIR, 'species_lifestyle.csv'))
species_df = pd.read_csv(os.path.join(DATA_DIR, 'pseudomonas_species.csv'))
print(f'Loaded: {pathway_matrix.shape[0]} species x {pathway_matrix.shape[1]} pathways')
Loaded: 433 species x 62 pathways
1. Subset to Free-Living Species with Clear Environment Classification¶
In [2]:
# Merge pathway profiles with environment data
merged = pathway_matrix.reset_index().merge(
species_env[['gtdb_species_clade_id', 'majority_environment', 'majority_lifestyle',
'n_genomes_classified', 'pct_majority']],
on='gtdb_species_clade_id'
).merge(
species_df[['gtdb_species_clade_id', 'no_genomes', 'gtdb_subgenus']],
on='gtdb_species_clade_id'
)
# Filter: free-living species with >= 5 genomes and >= 60% majority agreement
free_living = merged[
(merged['majority_lifestyle'].isin(['free_living', 'plant_associated'])) &
(merged['no_genomes'] >= 5) &
(merged['pct_majority'] >= 60)
].copy()
print(f'Free-living/plant-associated species (>= 5 genomes, >= 60% agreement): {len(free_living)}')
print(f'\nEnvironment distribution:')
print(free_living['majority_environment'].value_counts().to_string())
Free-living/plant-associated species (>= 5 genomes, >= 60% agreement): 54 Environment distribution: majority_environment plant_surface 19 freshwater 13 soil 13 rhizosphere 6 industrial 2 marine 1
2. PCA of Carbon Pathway Profiles¶
In [3]:
pathways = pathway_matrix.columns.tolist()
X = free_living[pathways].values
# PCA
pca = PCA(n_components=min(10, X.shape[1]))
X_pca = pca.fit_transform(X)
print(f'PCA explained variance:')
for i, var in enumerate(pca.explained_variance_ratio_[:5]):
print(f' PC{i+1}: {var:.1%}')
print(f' Total (5 PCs): {pca.explained_variance_ratio_[:5].sum():.1%}')
# Plot PC1 vs PC2 colored by environment
fig, ax = plt.subplots(figsize=(10, 8))
env_colors = {
'soil': '#8B4513', 'rhizosphere': '#228B22', 'freshwater': '#4169E1',
'marine': '#00CED1', 'plant_surface': '#32CD32', 'industrial': '#808080'
}
for env in free_living['majority_environment'].unique():
mask = free_living['majority_environment'] == env
color = env_colors.get(env, '#999999')
ax.scatter(X_pca[mask.values, 0], X_pca[mask.values, 1],
label=f'{env} (n={mask.sum()})', alpha=0.7, s=50, color=color)
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
ax.set_title('PCA of Carbon Pathway Profiles — Free-Living Pseudomonas Species')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'pathway_pca_by_environment.png'), dpi=150, bbox_inches='tight')
plt.show()
PCA explained variance: PC1: 31.2% PC2: 17.9% PC3: 11.2% PC4: 7.7% PC5: 6.8% Total (5 PCs): 74.9%
3. PERMANOVA: Do Environment Categories Differ in Pathway Space?¶
In [4]:
from scipy.spatial.distance import pdist, squareform
# Compute pairwise Bray-Curtis-like distance (Euclidean on fraction data)
D = squareform(pdist(X, metric='euclidean'))
# Simple PERMANOVA-like test: compare within-group vs between-group distances
env_labels = free_living['majority_environment'].values
unique_envs = [e for e in np.unique(env_labels) if (env_labels == e).sum() >= 3]
within_dists = []
between_dists = []
n = len(env_labels)
for i in range(n):
for j in range(i+1, n):
if env_labels[i] in unique_envs and env_labels[j] in unique_envs:
if env_labels[i] == env_labels[j]:
within_dists.append(D[i, j])
else:
between_dists.append(D[i, j])
print(f'Within-group mean distance: {np.mean(within_dists):.3f} (n={len(within_dists)})')
print(f'Between-group mean distance: {np.mean(between_dists):.3f} (n={len(between_dists)})')
# Permutation test
observed_ratio = np.mean(between_dists) / np.mean(within_dists)
n_perm = 999
perm_ratios = []
for _ in range(n_perm):
perm_labels = np.random.permutation(env_labels)
w, b = [], []
for i in range(n):
for j in range(i+1, n):
if perm_labels[i] in unique_envs and perm_labels[j] in unique_envs:
if perm_labels[i] == perm_labels[j]:
w.append(D[i, j])
else:
b.append(D[i, j])
if len(w) > 0:
perm_ratios.append(np.mean(b) / np.mean(w))
p_value = (np.sum(np.array(perm_ratios) >= observed_ratio) + 1) / (n_perm + 1)
print(f'\nPERMANOVA-like test:')
print(f' Observed B/W ratio: {observed_ratio:.3f}')
print(f' Permutation p-value: {p_value:.4f}')
Within-group mean distance: 1.890 (n=342) Between-group mean distance: 2.054 (n=933)
PERMANOVA-like test: Observed B/W ratio: 1.087 Permutation p-value: 0.0060
4. Random Forest: Predict Environment from Pathway Profile¶
In [5]:
# Only use environments with >= 5 species for classification
env_counts = free_living['majority_environment'].value_counts()
valid_envs = env_counts[env_counts >= 5].index.tolist()
clf_data = free_living[free_living['majority_environment'].isin(valid_envs)].copy().reset_index(drop=True)
if len(valid_envs) >= 2 and len(clf_data) >= 10:
X_clf = clf_data[pathways].values.copy()
y_clf = np.array(clf_data['majority_environment'].values)
# Stratified cross-validation
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
min_class_size = min(pd.Series(y_clf).value_counts())
n_splits = min(5, min_class_size)
if n_splits >= 2:
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
scores = cross_val_score(rf, X_clf, y_clf, cv=cv, scoring='balanced_accuracy')
print(f'Random Forest ({len(valid_envs)} classes, {len(clf_data)} species):')
print(f' Cross-val balanced accuracy: {scores.mean():.3f} +/- {scores.std():.3f}')
print(f' Chance level: {1/len(valid_envs):.3f}')
# Fit on all data for feature importance
rf.fit(X_clf, y_clf)
importances = pd.Series(rf.feature_importances_, index=pathways).sort_values(ascending=False)
print(f'\nTop 15 discriminating pathways:')
print(importances.head(15).to_string())
# Feature importance plot
fig, ax = plt.subplots(figsize=(10, 6))
importances.head(20).plot(kind='barh', ax=ax, color='steelblue')
ax.set_xlabel('Feature importance (Gini)')
ax.set_title('Top 20 Carbon Pathways Discriminating Environment Types')
ax.invert_yaxis()
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'rf_importance.png'), dpi=150, bbox_inches='tight')
plt.show()
else:
print(f'Insufficient samples per class for cross-validation (min class size = {min_class_size})')
print('Falling back to leave-one-out or simple train/test split not attempted.')
print('This suggests fine-grained environment types have too few species for predictive modeling.')
else:
print(f'Insufficient environment categories ({len(valid_envs)}) or species ({len(clf_data)}) for classification.')
print('This suggests carbon pathway profiles may not strongly predict fine-grained environment type among free-living species.')
Random Forest (4 classes, 51 species): Cross-val balanced accuracy: 0.408 +/- 0.169 Chance level: 0.250 Top 15 discriminating pathways: D-serine 0.131708 arabinose 0.094083 rhamnose 0.086000 fucose 0.084542 xylose 0.069821 NAG 0.069084 xylitol 0.067386 mannose 0.050920 mannitol 0.048859 myoinositol 0.045657 sorbitol 0.045537 glucosamine 0.035278 propionate 0.021892 ribose 0.019198 galacturonate 0.017819
5. Broader Comparison: All Lifestyles¶
Even if fine-grained environment prediction is weak, the free-living vs host-associated distinction may be strong.
In [6]:
# All species with lifestyle assignments
all_with_lifestyle = pathway_matrix.reset_index().merge(
species_env[['gtdb_species_clade_id', 'majority_lifestyle']],
on='gtdb_species_clade_id'
).merge(
species_df[['gtdb_species_clade_id', 'no_genomes']],
on='gtdb_species_clade_id'
)
all_with_lifestyle = all_with_lifestyle[
(all_with_lifestyle['no_genomes'] >= 5) &
(all_with_lifestyle['majority_lifestyle'] != 'unknown')
]
# PCA colored by lifestyle
X_all = all_with_lifestyle[pathways].values
pca_all = PCA(n_components=5)
X_pca_all = pca_all.fit_transform(X_all)
fig, ax = plt.subplots(figsize=(10, 8))
lifestyle_colors = {
'free_living': '#2ecc71', 'host_associated': '#e74c3c',
'plant_associated': '#f39c12', 'food_associated': '#9b59b6'
}
for lifestyle in all_with_lifestyle['majority_lifestyle'].unique():
mask = all_with_lifestyle['majority_lifestyle'] == lifestyle
color = lifestyle_colors.get(lifestyle, '#999999')
ax.scatter(X_pca_all[mask.values, 0], X_pca_all[mask.values, 1],
label=f'{lifestyle} (n={mask.sum()})', alpha=0.7, s=50, color=color)
ax.set_xlabel(f'PC1 ({pca_all.explained_variance_ratio_[0]:.1%})')
ax.set_ylabel(f'PC2 ({pca_all.explained_variance_ratio_[1]:.1%})')
ax.set_title('PCA of Carbon Pathway Profiles — All Pseudomonas by Lifestyle')
ax.legend()
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'pathway_pca_by_lifestyle.png'), dpi=150, bbox_inches='tight')
plt.show()
In [7]:
# Pathway richness by lifestyle
all_with_lifestyle['pathway_richness'] = (all_with_lifestyle[pathways] > 0.5).sum(axis=1)
fig, ax = plt.subplots(figsize=(8, 5))
order = ['free_living', 'plant_associated', 'food_associated', 'host_associated']
order = [o for o in order if o in all_with_lifestyle['majority_lifestyle'].values]
sns.boxplot(data=all_with_lifestyle, x='majority_lifestyle', y='pathway_richness',
order=order, ax=ax, palette=[lifestyle_colors.get(o, '#999') for o in order])
ax.set_xlabel('Lifestyle')
ax.set_ylabel('Carbon pathway richness\n(pathways complete in >50% of genomes)')
ax.set_title('Carbon Pathway Richness by Lifestyle')
# Add stats
for lifestyle in order:
sub = all_with_lifestyle[all_with_lifestyle['majority_lifestyle'] == lifestyle]
print(f'{lifestyle}: n={len(sub)}, median richness={sub["pathway_richness"].median():.0f}, mean={sub["pathway_richness"].mean():.1f}')
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'pathway_richness_by_lifestyle.png'), dpi=150, bbox_inches='tight')
plt.show()
free_living: n=91, median richness=57, mean=55.9 plant_associated: n=31, median richness=57, mean=56.8 food_associated: n=9, median richness=55, mean=55.7 host_associated: n=66, median richness=55, mean=54.5
/tmp/ipykernel_12822/4201764123.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=all_with_lifestyle, x='majority_lifestyle', y='pathway_richness',
Summary¶
Key outputs:
pathway_pca_by_environment.png— PCA of free-living species colored by environmentrf_importance.png— feature importance for environment predictionpathway_pca_by_lifestyle.png— PCA of all species colored by lifestylepathway_richness_by_lifestyle.png— pathway richness boxplots