03 Statistical Analysis
Jupyter notebook from the Openness vs Functional Composition project.
Notebook 03: Statistical Analysis & Visualization¶
Project: Openness vs Functional Composition
Goal: Test whether COG enrichment patterns scale with pangenome openness. Phylogenetic controls. Publication-quality figures.
Input:
../data/cog_enrichment_by_openness.csv(from NB 02)../data/cog_enrichment_quartile_summary.csv(from NB 02)../data/all_species_openness.csv(from NB 01)
Output:
../figures/enrichment_by_quartile_heatmap.png../figures/openness_vs_lv_enrichment.png../figures/core_metabolic_by_openness.png
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 150
from scipy import stats
enrichment = pd.read_csv('../data/cog_enrichment_by_openness.csv')
quartile_summary = pd.read_csv('../data/cog_enrichment_quartile_summary.csv')
all_species = pd.read_csv('../data/all_species_openness.csv')
print(f"Enrichment data: {len(enrichment)} rows, {enrichment['gtdb_species_clade_id'].nunique()} species")
print(f"COG categories: {sorted(enrichment['COG_letter'].unique())}")
1. Spearman Correlation: Openness vs Enrichment¶
For each COG category, test whether enrichment correlates with continuous openness.
In [ ]:
# COG category descriptions
cog_descriptions = {
'J': 'Translation', 'A': 'RNA processing', 'K': 'Transcription',
'L': 'Replication/repair', 'B': 'Chromatin', 'D': 'Cell cycle',
'Y': 'Nuclear structure', 'V': 'Defense', 'T': 'Signal transduction',
'M': 'Cell wall', 'N': 'Cell motility', 'Z': 'Cytoskeleton',
'W': 'Extracellular', 'U': 'Secretion', 'O': 'Chaperones',
'X': 'Mobilome', 'C': 'Energy production', 'G': 'Carbohydrate metabolism',
'E': 'Amino acid metabolism', 'F': 'Nucleotide metabolism',
'H': 'Coenzyme metabolism', 'I': 'Lipid metabolism',
'P': 'Inorganic ion transport', 'Q': 'Secondary metabolites',
'R': 'General prediction', 'S': 'Unknown function'
}
results = []
for cog in sorted(enrichment['COG_letter'].unique()):
subset = enrichment[enrichment['COG_letter'] == cog]
if len(subset) < 5:
continue
rho, pval = stats.spearmanr(subset['openness'], subset['enrichment'])
results.append({
'COG': cog,
'description': cog_descriptions.get(cog, '?'),
'spearman_rho': rho,
'p_value': pval,
'n_species': len(subset),
'mean_enrichment': subset['enrichment'].mean()
})
corr_df = pd.DataFrame(results).sort_values('p_value')
# Bonferroni correction
corr_df['p_bonferroni'] = corr_df['p_value'] * len(corr_df)
corr_df['significant'] = corr_df['p_bonferroni'] < 0.05
print("Spearman correlation: openness vs enrichment")
print("=" * 80)
print(corr_df.to_string(index=False, float_format='%.4f'))
2. Kruskal-Wallis: Enrichment Across Quartiles¶
In [ ]:
kw_results = []
for cog in sorted(enrichment['COG_letter'].unique()):
groups = []
for q in ['Q1_closed', 'Q2', 'Q3', 'Q4_open']:
subset = enrichment[
(enrichment['COG_letter'] == cog) &
(enrichment['openness_quartile'] == q)
]['enrichment'].values
if len(subset) > 0:
groups.append(subset)
if len(groups) >= 2:
stat, pval = stats.kruskal(*groups)
kw_results.append({
'COG': cog,
'description': cog_descriptions.get(cog, '?'),
'H_statistic': stat,
'p_value': pval
})
kw_df = pd.DataFrame(kw_results).sort_values('p_value')
kw_df['p_bonferroni'] = kw_df['p_value'] * len(kw_df)
kw_df['significant'] = kw_df['p_bonferroni'] < 0.05
print("Kruskal-Wallis test: enrichment differs across openness quartiles?")
print("=" * 80)
print(kw_df.to_string(index=False, float_format='%.4f'))
3. Figure: Enrichment Heatmap by Quartile¶
In [ ]:
# Pivot for heatmap: quartile x COG category
heatmap_data = quartile_summary.pivot(
index='openness_quartile', columns='COG_letter', values='mean_enrichment'
).reindex(['Q1_closed', 'Q2', 'Q3', 'Q4_open'])
# Order COG categories by mean enrichment in Q4 (most open)
col_order = heatmap_data.loc['Q4_open'].sort_values(ascending=False).index
heatmap_data = heatmap_data[col_order]
fig, ax = plt.subplots(figsize=(14, 4))
im = ax.imshow(heatmap_data.values, cmap='RdBu_r', aspect='auto', vmin=-0.15, vmax=0.15)
ax.set_xticks(range(len(heatmap_data.columns)))
ax.set_xticklabels(heatmap_data.columns, fontsize=9)
ax.set_yticks(range(len(heatmap_data.index)))
ax.set_yticklabels(['Q1 (closed)', 'Q2', 'Q3', 'Q4 (open)'])
ax.set_xlabel('COG Category')
ax.set_ylabel('Openness Quartile')
ax.set_title('COG Enrichment (Novel - Core) by Pangenome Openness Quartile')
# Add text annotations
for i in range(heatmap_data.shape[0]):
for j in range(heatmap_data.shape[1]):
val = heatmap_data.values[i, j]
if not np.isnan(val):
color = 'white' if abs(val) > 0.08 else 'black'
ax.text(j, i, f'{val:.3f}', ha='center', va='center', fontsize=7, color=color)
plt.colorbar(im, ax=ax, label='Enrichment (novel - core)', shrink=0.8)
plt.tight_layout()
plt.savefig('../figures/enrichment_by_quartile_heatmap.png', dpi=150, bbox_inches='tight')
print("Saved: ../figures/enrichment_by_quartile_heatmap.png")
plt.show()
4. Figure: Openness vs L and V Enrichment (Scatter)¶
In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, cog, color in zip(axes, ['L', 'V'], ['#d62728', '#1f77b4']):
subset = enrichment[enrichment['COG_letter'] == cog]
ax.scatter(subset['openness'], subset['enrichment'],
alpha=0.6, c=color, edgecolors='k', linewidths=0.3, s=50)
# Trend line
z = np.polyfit(subset['openness'], subset['enrichment'], 1)
p = np.poly1d(z)
x_line = np.linspace(subset['openness'].min(), subset['openness'].max(), 100)
ax.plot(x_line, p(x_line), '--', color='gray', alpha=0.8)
# Stats
rho, pval = stats.spearmanr(subset['openness'], subset['enrichment'])
ax.set_title(f'COG {cog} ({cog_descriptions[cog]})\nSpearman rho={rho:.3f}, p={pval:.4f}')
ax.set_xlabel('Pangenome Openness (1 - core fraction)')
ax.set_ylabel('Enrichment in Novel Genes')
ax.axhline(y=0, color='gray', linestyle=':', alpha=0.5)
plt.suptitle('Does Mobile Element / Defense Enrichment Scale with Pangenome Openness?', fontsize=12)
plt.tight_layout()
plt.savefig('../figures/openness_vs_lv_enrichment.png', dpi=150, bbox_inches='tight')
print("Saved: ../figures/openness_vs_lv_enrichment.png")
plt.show()
5. Figure: Core Metabolic Enrichment vs Openness¶
In [ ]:
metabolic_cogs = ['E', 'C', 'G', 'F', 'H', 'I', 'P']
colors = plt.cm.Set2(np.linspace(0, 1, len(metabolic_cogs)))
fig, ax = plt.subplots(figsize=(10, 6))
for cog, color in zip(metabolic_cogs, colors):
subset = enrichment[enrichment['COG_letter'] == cog]
if len(subset) == 0:
continue
# Mean enrichment per quartile
q_means = subset.groupby('openness_quartile')['enrichment'].mean()
q_means = q_means.reindex(['Q1_closed', 'Q2', 'Q3', 'Q4_open'])
ax.plot(range(4), q_means.values, 'o-', color=color,
label=f"{cog} ({cog_descriptions[cog]})", markersize=6)
ax.set_xticks(range(4))
ax.set_xticklabels(['Q1\n(closed)', 'Q2', 'Q3', 'Q4\n(open)'])
ax.set_xlabel('Openness Quartile')
ax.set_ylabel('Mean Enrichment (novel - core)')
ax.set_title('Metabolic COG Categories: Does Core Enrichment Change with Openness?')
ax.axhline(y=0, color='gray', linestyle=':', alpha=0.5)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
plt.tight_layout()
plt.savefig('../figures/core_metabolic_by_openness.png', dpi=150, bbox_inches='tight')
print("Saved: ../figures/core_metabolic_by_openness.png")
plt.show()
6. Phylogenetic Control¶
Check if trends hold within phyla (not driven by phylogenetic confounding).
In [ ]:
# Within-phylum Spearman correlations for key COGs
key_cogs = ['L', 'V', 'J', 'E', 'C']
phyla = enrichment['phylum'].value_counts()
phyla_with_enough = phyla[phyla >= 6].index # need at least 6 species per phylum
print("Within-phylum Spearman correlations (openness vs enrichment)")
print("=" * 80)
for phylum in phyla_with_enough:
print(f"\n--- {phylum} (n={phyla[phylum]}) ---")
for cog in key_cogs:
subset = enrichment[
(enrichment['COG_letter'] == cog) &
(enrichment['phylum'] == phylum)
]
if len(subset) >= 4:
rho, pval = stats.spearmanr(subset['openness'], subset['enrichment'])
sig = '*' if pval < 0.05 else ''
print(f" {cog} ({cog_descriptions[cog]}): rho={rho:+.3f}, p={pval:.3f} {sig}")
7. Genome Count Control¶
Openness estimates improve with more genomes. Check if results are driven by genome count.
In [ ]:
# Partial correlation: openness vs enrichment, controlling for genome count
from scipy.stats import spearmanr
print("Partial correlation check: openness vs enrichment, controlling for genome count")
print("=" * 80)
for cog in key_cogs:
subset = enrichment[enrichment['COG_letter'] == cog].dropna(subset=['openness', 'enrichment', 'no_genomes'])
# Raw correlation
rho_raw, p_raw = spearmanr(subset['openness'], subset['enrichment'])
# Correlation of openness with genome count
rho_confound, _ = spearmanr(subset['openness'], subset['no_genomes'])
print(f"{cog} ({cog_descriptions[cog]}): rho={rho_raw:+.3f} (p={p_raw:.4f}), openness-genomes rho={rho_confound:+.3f}")
Summary¶
Record after running:
- Which COG categories show significant correlation with openness? ___
- Does L enrichment increase with openness (H1)? ___
- Does V enrichment increase with openness (H2)? ___
- Do metabolic categories show stronger core enrichment in closed pangenomes (H3)? ___
- Do trends survive phylogenetic controls? ___
- Is genome count a confounder? ___
- Overall conclusion: H0 or H1? ___