import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)
spark = get_spark_session()

# Spark session should already be available
print(f"Spark version: {spark.version}")

Spark version: 4.0.1

# Get pangenome statistics for all species
all_species_query = """
SELECT 
    s.GTDB_species,
    s.gtdb_species_clade_id,
    s.GTDB_taxonomy,
    p.no_genomes,
    p.no_core,
    p.no_aux_genome,
    p.no_singleton_gene_clusters,
    p.no_gene_clusters
FROM kbase_ke_pangenome.pangenome p
JOIN kbase_ke_pangenome.gtdb_species_clade s
    ON p.gtdb_species_clade_id = s.gtdb_species_clade_id
ORDER BY p.no_genomes DESC
"""

all_species = spark.sql(all_species_query).toPandas()

# Convert numeric columns from strings to integers
numeric_cols = ['no_genomes', 'no_core', 'no_aux_genome', 'no_singleton_gene_clusters', 'no_gene_clusters']
for col in numeric_cols:
    all_species[col] = pd.to_numeric(all_species[col], errors='coerce')

print(f"Total species: {len(all_species)}")
print(f"\nGenome count statistics:")
print(all_species['no_genomes'].describe())

all_species.head(10)

Total species: 27690

Genome count statistics:
count    27690.000000
mean        10.583568
std        174.136580
min          2.000000
25%          2.000000
50%          3.000000
75%          5.000000
max      14526.000000
Name: no_genomes, dtype: float64

# Visualize distribution
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Full distribution (log scale)
ax = axes[0]
all_species['no_genomes'].hist(bins=50, ax=ax, edgecolor='black')
ax.set_xlabel('Number of Genomes')
ax.set_ylabel('Number of Species')
ax.set_title('Distribution of Genome Counts (All Species)')
ax.set_yscale('log')

# Zoomed to 1-1000 genomes
ax = axes[1]
subset = all_species[all_species['no_genomes'] <= 1000]
subset['no_genomes'].hist(bins=50, ax=ax, edgecolor='black')
ax.set_xlabel('Number of Genomes')
ax.set_ylabel('Number of Species')
ax.set_title('Distribution (1-1000 genomes)')

# Cumulative distribution
ax = axes[2]
sorted_counts = np.sort(all_species['no_genomes'].values)
cumulative = np.arange(1, len(sorted_counts) + 1) / len(sorted_counts)
ax.plot(sorted_counts, cumulative)
ax.axhline(y=0.5, color='r', linestyle='--', label='50th percentile')
ax.axvline(x=100, color='g', linestyle='--', label='100 genomes')
ax.axvline(x=500, color='orange', linestyle='--', label='500 genomes')
ax.set_xlabel('Number of Genomes')
ax.set_ylabel('Cumulative Proportion of Species')
ax.set_title('Cumulative Distribution')
ax.set_xscale('log')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../data/genome_count_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Count species in different genome count bins
bins = [0, 10, 50, 100, 200, 500, 1000, 10000]
labels = ['1-10', '11-50', '51-100', '101-200', '201-500', '501-1000', '1000+']
all_species['genome_bin'] = pd.cut(all_species['no_genomes'], bins=bins, labels=labels)

bin_counts = all_species['genome_bin'].value_counts().sort_index()
print("\nSpecies counts by genome bin:")
print(bin_counts)
print(f"\nSpecies with 100-500 genomes: {len(all_species[(all_species['no_genomes'] >= 100) & (all_species['no_genomes'] < 500)])}")

Species counts by genome bin:
genome_bin
1-10        25206
11-50        2036
51-100        223
101-200       100
201-500        80
501-1000       19
1000+          23
Name: count, dtype: int64

Species with 100-500 genomes: 181

# Calculate proportions
all_species['core_proportion'] = all_species['no_core'] / all_species['no_gene_clusters']
all_species['singleton_proportion'] = all_species['no_singleton_gene_clusters'] / all_species['no_gene_clusters']
all_species['aux_proportion'] = all_species['no_aux_genome'] / all_species['no_gene_clusters']

# Plot relationships
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Core proportion vs genome count
ax = axes[0, 0]
ax.scatter(all_species['no_genomes'], all_species['core_proportion'], alpha=0.3, s=10)
ax.set_xlabel('Number of Genomes')
ax.set_ylabel('Proportion Core Genes')
ax.set_title('Core Gene Proportion vs Genome Count')
ax.set_xscale('log')
ax.axvline(x=100, color='g', linestyle='--', alpha=0.5, label='100 genomes')
ax.axvline(x=500, color='orange', linestyle='--', alpha=0.5, label='500 genomes')
ax.legend()

# Singleton proportion vs genome count
ax = axes[0, 1]
ax.scatter(all_species['no_genomes'], all_species['singleton_proportion'], alpha=0.3, s=10, color='coral')
ax.set_xlabel('Number of Genomes')
ax.set_ylabel('Proportion Singleton Genes')
ax.set_title('Singleton Gene Proportion vs Genome Count')
ax.set_xscale('log')
ax.axvline(x=100, color='g', linestyle='--', alpha=0.5, label='100 genomes')
ax.axvline(x=500, color='orange', linestyle='--', alpha=0.5, label='500 genomes')
ax.legend()

# Total gene clusters vs genome count
ax = axes[1, 0]
ax.scatter(all_species['no_genomes'], all_species['no_gene_clusters'], alpha=0.3, s=10, color='purple')
ax.set_xlabel('Number of Genomes')
ax.set_ylabel('Total Gene Clusters')
ax.set_title('Pangenome Size vs Genome Count')
ax.set_xscale('log')
ax.set_yscale('log')
ax.axvline(x=100, color='g', linestyle='--', alpha=0.5, label='100 genomes')
ax.axvline(x=500, color='orange', linestyle='--', alpha=0.5, label='500 genomes')
ax.legend()

# Core vs singleton proportion
ax = axes[1, 1]
colors = all_species['no_genomes'].apply(lambda x: 'green' if x < 100 else ('orange' if x < 500 else 'red'))
ax.scatter(all_species['core_proportion'], all_species['singleton_proportion'], alpha=0.3, s=10, c=colors)
ax.set_xlabel('Proportion Core Genes')
ax.set_ylabel('Proportion Singleton Genes')
ax.set_title('Core vs Singleton Proportions\n(green: <100, orange: 100-500, red: >500 genomes)')

plt.tight_layout()
plt.savefig('../data/pangenome_characteristics.png', dpi=300, bbox_inches='tight')
plt.show()

# Extract phylum from GTDB taxonomy
def extract_phylum(taxonomy):
    if pd.isna(taxonomy):
        return 'Unknown'
    parts = taxonomy.split(';')
    for part in parts:
        if part.startswith('p__'):
            return part.replace('p__', '')
    return 'Unknown'

all_species['phylum'] = all_species['GTDB_taxonomy'].apply(extract_phylum)

# Compare taxonomic diversity at different thresholds
thresholds = [
    ('All species', all_species),
    ('≥50 genomes', all_species[all_species['no_genomes'] >= 50]),
    ('≥100 genomes', all_species[all_species['no_genomes'] >= 100]),
    ('100-500 genomes', all_species[(all_species['no_genomes'] >= 100) & (all_species['no_genomes'] < 500)]),
    ('≥500 genomes', all_species[all_species['no_genomes'] >= 500])
]

print("\nTaxonomic diversity at different genome count thresholds:")
print("=" * 80)
for name, subset in thresholds:
    phylum_counts = subset['phylum'].value_counts()
    print(f"\n{name}: {len(subset)} species, {len(phylum_counts)} phyla")
    print(f"  Top phyla: {', '.join([f'{p} ({c})' for p, c in phylum_counts.head(5).items()])}")

Taxonomic diversity at different genome count thresholds:
================================================================================

All species: 27690 species, 142 phyla
  Top phyla: Pseudomonadota (7456), Bacillota_A (3917), Bacteroidota (3629), Actinomycetota (3172), Bacillota (2146)

≥50 genomes: 457 species, 14 phyla
  Top phyla: Pseudomonadota (193), Bacillota (112), Bacillota_A (49), Bacteroidota (41), Actinomycetota (32)

≥100 genomes: 226 species, 9 phyla
  Top phyla: Pseudomonadota (100), Bacillota (72), Actinomycetota (16), Bacteroidota (16), Bacillota_A (11)

100-500 genomes: 181 species, 9 phyla
  Top phyla: Pseudomonadota (78), Bacillota (57), Bacteroidota (16), Actinomycetota (13), Bacillota_A (10)

≥500 genomes: 45 species, 5 phyla
  Top phyla: Pseudomonadota (22), Bacillota (15), Campylobacterota (4), Actinomycetota (3), Bacillota_A (1)

# Visualize phylum distribution for 100-500 genome species
subset_100_500 = all_species[(all_species['no_genomes'] >= 100) & (all_species['no_genomes'] < 500)]
phylum_counts = subset_100_500['phylum'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
phylum_counts.head(20).plot(kind='barh', ax=ax, color='steelblue')
ax.set_xlabel('Number of Species')
ax.set_ylabel('Phylum')
ax.set_title(f'Phylum Distribution (Species with 100-500 genomes)\nTotal: {len(subset_100_500)} species')
plt.tight_layout()
plt.savefig('../data/phylum_distribution_100_500.png', dpi=300, bbox_inches='tight')
plt.show()

# Select diverse set of species with 100-500 genomes
# Strategy: sample from each major phylum

subset_100_500 = all_species[(all_species['no_genomes'] >= 100) & (all_species['no_genomes'] < 500)]

# Get top phyla
major_phyla = subset_100_500['phylum'].value_counts().head(10).index

# Sample 3-5 species per phylum
sampled_species = []
for phylum in major_phyla:
    phylum_species = subset_100_500[subset_100_500['phylum'] == phylum]
    # Sort by genome count and take diverse range
    sorted_species = phylum_species.sort_values('no_genomes')
    n_samples = min(5, len(sorted_species))
    if len(sorted_species) > 0:
        indices = np.linspace(0, len(sorted_species)-1, n_samples, dtype=int)
        sampled_species.append(sorted_species.iloc[indices])

sampled_df = pd.concat(sampled_species, ignore_index=True)

print(f"\nSampled {len(sampled_df)} species from {len(major_phyla)} major phyla")
print(f"Genome count range: {sampled_df['no_genomes'].min()}-{sampled_df['no_genomes'].max()}")
print(f"\nPhylum distribution in sample:")
print(sampled_df['phylum'].value_counts())

# Save to CSV
sampled_df[['GTDB_species', 'gtdb_species_clade_id', 'phylum', 'no_genomes', 
            'no_core', 'no_aux_genome', 'no_singleton_gene_clusters']].to_csv(
    '../data/sampled_species_for_cog_analysis.csv', index=False
)
print(f"\nSaved sampled species to ../data/sampled_species_for_cog_analysis.csv")

sampled_df[['GTDB_species', 'phylum', 'no_genomes', 'no_core', 'no_singleton_gene_clusters']].head(20)

Sampled 32 species from 9 major phyla
Genome count range: 100-472

Phylum distribution in sample:
phylum
Pseudomonadota       5
Bacillota            5
Bacteroidota         5
Actinomycetota       5
Bacillota_A          5
Spirochaetota        3
Campylobacterota     2
Verrucomicrobiota    1
Chlamydiota          1
Name: count, dtype: int64

Saved sampled species to ../data/sampled_species_for_cog_analysis.csv

print("\n" + "="*80)
print("RECOMMENDATIONS FOR MULTI-SPECIES COG ANALYSIS")
print("="*80)

print("\n1. GENOME COUNT THRESHOLD:")
print(f"   - 100-500 genomes recommended: {len(subset_100_500)} species available")
print(f"   - Good balance of statistical power and taxonomic diversity")
print(f"   - Median genome count in this range: {subset_100_500['no_genomes'].median():.0f}")

print("\n2. ALTERNATIVE THRESHOLDS TO CONSIDER:")
print(f"   - 50-200 genomes: {len(all_species[(all_species['no_genomes'] >= 50) & (all_species['no_genomes'] < 200)])} species")
print(f"   - 100-300 genomes: {len(all_species[(all_species['no_genomes'] >= 100) & (all_species['no_genomes'] < 300)])} species")
print(f"   - 200-1000 genomes: {len(all_species[(all_species['no_genomes'] >= 200) & (all_species['no_genomes'] < 1000)])} species")

print("\n3. SAMPLING STRATEGY:")
print(f"   - Option A: Stratified sample from {len(sampled_df)} diverse species (saved to CSV)")
print(f"   - Option B: All {len(subset_100_500)} species with 100-500 genomes")
print(f"   - Option C: Pilot with ~20-30 species, then expand if patterns are clear")

print("\n4. NEXT STEPS:")
print("   - Review sampled species list")
print("   - Test query performance on 5-10 species first")
print("   - Parallelize queries across species")
print("   - Look for conserved patterns across phyla")

================================================================================
RECOMMENDATIONS FOR MULTI-SPECIES COG ANALYSIS
================================================================================

1. GENOME COUNT THRESHOLD:
   - 100-500 genomes recommended: 181 species available
   - Good balance of statistical power and taxonomic diversity
   - Median genome count in this range: 175

2. ALTERNATIVE THRESHOLDS TO CONSIDER:
   - 50-200 genomes: 332 species
   - 100-300 genomes: 152 species
   - 200-1000 genomes: 99 species

3. SAMPLING STRATEGY:
   - Option A: Stratified sample from 32 diverse species (saved to CSV)
   - Option B: All 181 species with 100-500 genomes
   - Option C: Pilot with ~20-30 species, then expand if patterns are clear

4. NEXT STEPS:
   - Review sampled species list
   - Test query performance on 5-10 species first
   - Parallelize queries across species
   - Look for conserved patterns across phyla

	GTDB_species	gtdb_species_clade_id	GTDB_taxonomy	no_genomes	no_core	no_aux_genome	no_singleton_gene_clusters	no_gene_clusters
0	s__Bacteroides_caccae	s__Bacteroides_caccae--RS_GCF_002222615.2	d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...	99	2862	23563	13014	26425
1	s__Vibrio_lentus	s__Vibrio_lentus--RS_GCF_014878155.1	d__Bacteria;p__Pseudomonadota;c__Gammaproteoba...	99	3992	19090	10393	23082
2	s__Eggerthella_lenta	s__Eggerthella_lenta--RS_GCF_000024265.1	d__Bacteria;p__Actinomycetota;c__Coriobacterii...	98	2141	12489	7791	14630
3	s__Rhizobium_leguminosarum	s__Rhizobium_leguminosarum--RS_GCF_002008365.1	d__Bacteria;p__Pseudomonadota;c__Alphaproteoba...	97	5875	26691	15344	32566
4	s__Burkholderia_orbicola	s__Burkholderia_orbicola--RS_GCF_014211915.1	d__Bacteria;p__Pseudomonadota;c__Gammaproteoba...	97	5597	25844	15021	31441
5	s__Yersinia_ruckeri	s__Yersinia_ruckeri--RS_GCF_000754815.1	d__Bacteria;p__Pseudomonadota;c__Gammaproteoba...	97	2913	5149	2323	8062
6	s__Bifidobacterium_globosum	s__Bifidobacterium_globosum--RS_GCF_000741295.1	d__Bacteria;p__Actinomycetota;c__Actinomycetia...	97	1233	11075	6582	12308
7	s__Weissella_confusa	s__Weissella_confusa--GB_GCA_001436895.1	d__Bacteria;p__Bacillota;c__Bacilli;o__Lactoba...	96	1653	9228	4810	10881
8	s__Neisseria_gonorrhoeae	s__Neisseria_gonorrhoeae--RS_GCF_003315235.1	d__Bacteria;p__Pseudomonadota;c__Gammaproteoba...	954	1776	18119	10593	19895
9	s__Bacillus_A_tropicus	s__Bacillus_A_tropicus--RS_GCF_001884035.1	d__Bacteria;p__Bacillota;c__Bacilli;o__Bacilla...	95	4110	28051	15782	32161

	GTDB_species	phylum	no_genomes	no_core	no_singleton_gene_clusters
0	s__Escherichia_marmotae	Pseudomonadota	100	3576	8672
1	s__Xanthomonas_campestris	Pseudomonadota	128	3436	7259
2	s__Rhizobium_laguerreae	Pseudomonadota	175	5045	25707
3	s__Acinetobacter_nosocomialis	Pseudomonadota	246	2938	20078
4	s__Cronobacter_sakazakii	Pseudomonadota	449	3468	22032
5	s__Listeria_innocua	Bacillota	108	2474	3877
6	s__Streptococcus_dysgalactiae	Bacillota	136	1456	6257
7	s__Staphylococcus_saprophyticus	Bacillota	177	2112	11307
8	s__Lactobacillus_delbrueckii	Bacillota	240	1167	7533
9	s__Streptococcus_equi	Bacillota	472	1830	10576
10	s__Riemerella_anatipestifer	Bacteroidota	106	1520	2211
11	s__Prevotella_copri_B	Bacteroidota	112	1852	10300
12	s__Phocaeicola_dorei	Bacteroidota	135	3139	19354
13	s__Bacteroides_thetaiotaomicron	Bacteroidota	287	3225	34472
14	s__Phocaeicola_vulgatus	Bacteroidota	449	2635	39652
15	s__Mycobacterium_intracellulare	Actinomycetota	104	4216	11599
16	s__Bifidobacterium_infantis	Actinomycetota	118	1465	5779
17	s__Bifidobacterium_pseudocatenulatum	Actinomycetota	170	1378	8597
18	s__Mycobacterium_avium	Actinomycetota	249	3983	24423
19	s__Corynebacterium_diphtheriae	Actinomycetota	389	1788	5749

Species Selection Exploration

Species Selection Exploration for Multi-Species COG Analysis¶

1. Distribution of genome counts across all species¶

2. Pangenome characteristics vs genome count¶

3. Taxonomic diversity at different thresholds¶

4. Sample species for multi-species analysis¶

5. Recommendations for multi-species analysis¶