import pandas as pd

spark = get_spark_session()

# Load target species from NB 01
target_df = pd.read_csv('../data/species_openness_quartiles.csv')
print(f"Target species: {len(target_df)}")
print(target_df.groupby('openness_quartile').size())

Target species: 40
openness_quartile
Q1_closed    10
Q2           10
Q3           10
Q4_open      10
dtype: int64

# Build species list for SQL IN clause
species_list = target_df['gtdb_species_clade_id'].tolist()
species_sql = ", ".join([f"'{s}'" for s in species_list])

print(f"Querying COG distributions for {len(species_list)} species...")

cog_raw = spark.sql(f"""
    SELECT 
        gc.gtdb_species_clade_id,
        gc.is_core,
        gc.is_auxiliary,
        gc.is_singleton,
        ann.COG_category,
        COUNT(*) as gene_count
    FROM kbase_ke_pangenome.gene_cluster gc
    JOIN kbase_ke_pangenome.gene_genecluster_junction j 
        ON gc.gene_cluster_id = j.gene_cluster_id
    JOIN kbase_ke_pangenome.eggnog_mapper_annotations ann 
        ON j.gene_id = ann.query_name
    WHERE gc.gtdb_species_clade_id IN ({species_sql})
        AND ann.COG_category IS NOT NULL
        AND ann.COG_category != '-'
    GROUP BY 
        gc.gtdb_species_clade_id,
        gc.is_core, gc.is_auxiliary, gc.is_singleton,
        ann.COG_category
    ORDER BY gc.gtdb_species_clade_id, gc.is_core DESC, gene_count DESC
""")

cog_pdf = cog_raw.toPandas()
print(f"Query returned {len(cog_pdf):,} rows")
print(f"Species returned: {cog_pdf['gtdb_species_clade_id'].nunique()}")
cog_pdf.head(10)

Querying COG distributions for 40 species...
Query returned 6,535 rows
Species returned: 40

# Assign gene class labels
def classify_gene(row):
    if row['is_core'] == 1:
        return 'Core'
    elif row['is_singleton'] == 1:
        return 'Singleton'
    else:
        return 'Auxiliary'

cog_pdf['gene_class'] = cog_pdf.apply(classify_gene, axis=1)

# Split multi-letter COG categories (e.g., 'LV' -> 'L' and 'V')
# Each letter gets the full gene count (a gene with 'LV' is both L and V)
rows = []
for _, row in cog_pdf.iterrows():
    for letter in row['COG_category']:
        if letter.isalpha():
            new_row = row.copy()
            new_row['COG_letter'] = letter
            rows.append(new_row)

cog_split = pd.DataFrame(rows)
print(f"After splitting multi-letter COGs: {len(cog_split):,} rows")
print(f"Unique COG letters: {sorted(cog_split['COG_letter'].unique())}")

After splitting multi-letter COGs: 11,327 rows
Unique COG letters: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z']

# Aggregate gene counts by species, gene class, COG letter
agg = cog_split.groupby(
    ['gtdb_species_clade_id', 'gene_class', 'COG_letter']
)['gene_count'].sum().reset_index()

# Compute proportions within each species + gene class
totals = agg.groupby(['gtdb_species_clade_id', 'gene_class'])['gene_count'].transform('sum')
agg['proportion'] = agg['gene_count'] / totals

# Pivot to get core and singleton proportions side by side
core_props = agg[agg['gene_class'] == 'Core'][['gtdb_species_clade_id', 'COG_letter', 'proportion']]
core_props = core_props.rename(columns={'proportion': 'core_proportion'})

singleton_props = agg[agg['gene_class'] == 'Singleton'][['gtdb_species_clade_id', 'COG_letter', 'proportion']]
singleton_props = singleton_props.rename(columns={'proportion': 'singleton_proportion'})

# Merge and compute enrichment
enrichment = core_props.merge(singleton_props, on=['gtdb_species_clade_id', 'COG_letter'], how='outer').fillna(0)
enrichment['enrichment'] = enrichment['singleton_proportion'] - enrichment['core_proportion']

# Add openness quartile
enrichment = enrichment.merge(
    target_df[['gtdb_species_clade_id', 'openness_quartile', 'openness', 'phylum', 'GTDB_species', 'no_genomes']],
    on='gtdb_species_clade_id',
    how='left'
)

print(f"Enrichment table: {len(enrichment)} rows")
enrichment.head(10)

Enrichment table: 857 rows

# Mean enrichment per COG letter per quartile
quartile_summary = enrichment.groupby(
    ['openness_quartile', 'COG_letter']
).agg(
    mean_enrichment=('enrichment', 'mean'),
    std_enrichment=('enrichment', 'std'),
    n_species=('enrichment', 'count'),
    median_enrichment=('enrichment', 'median')
).reset_index()

# Show key categories
for cog in ['L', 'V', 'S', 'J', 'E', 'C', 'G']:
    print(f"\n=== COG {cog} ===")
    subset = quartile_summary[quartile_summary['COG_letter'] == cog].sort_values('openness_quartile')
    print(subset[['openness_quartile', 'mean_enrichment', 'std_enrichment', 'n_species']].to_string(index=False))

=== COG L ===
openness_quartile  mean_enrichment  std_enrichment  n_species
        Q1_closed         0.079655        0.043578         10
               Q2         0.095383        0.027031         10
               Q3         0.095821        0.056469         10
          Q4_open         0.115828        0.051769         10

=== COG V ===
openness_quartile  mean_enrichment  std_enrichment  n_species
        Q1_closed         0.015102        0.024716         10
               Q2         0.026935        0.020388         10
               Q3         0.018925        0.007603         10
          Q4_open         0.018214        0.009879         10

=== COG S ===
openness_quartile  mean_enrichment  std_enrichment  n_species
        Q1_closed         0.013928        0.101044         10
               Q2        -0.023930        0.027142         10
               Q3        -0.018384        0.039583         10
          Q4_open        -0.012459        0.048610         10

=== COG J ===
openness_quartile  mean_enrichment  std_enrichment  n_species
        Q1_closed        -0.040075        0.039585         10
               Q2        -0.027835        0.011534         10
               Q3        -0.035789        0.016775         10
          Q4_open        -0.031086        0.020816         10

=== COG E ===
openness_quartile  mean_enrichment  std_enrichment  n_species
        Q1_closed        -0.008861        0.041972         10
               Q2        -0.014926        0.015035         10
               Q3        -0.011693        0.014659         10
          Q4_open        -0.019691        0.017250         10

=== COG C ===
openness_quartile  mean_enrichment  std_enrichment  n_species
        Q1_closed        -0.007880        0.018921         10
               Q2        -0.009437        0.012403         10
               Q3        -0.015442        0.012934         10
          Q4_open        -0.015711        0.016372         10

=== COG G ===
openness_quartile  mean_enrichment  std_enrichment  n_species
        Q1_closed        -0.004418        0.018134         10
               Q2        -0.006263        0.012108         10
               Q3        -0.004283        0.014056         10
          Q4_open        -0.013841        0.013352         10

# Save per-species enrichment with quartile labels
enrichment.to_csv('../data/cog_enrichment_by_openness.csv', index=False)
print(f"Saved {len(enrichment)} rows to ../data/cog_enrichment_by_openness.csv")

# Save quartile summary
quartile_summary.to_csv('../data/cog_enrichment_quartile_summary.csv', index=False)
print(f"Saved quartile summary to ../data/cog_enrichment_quartile_summary.csv")

# Save raw COG counts for notebook 03
cog_pdf.to_csv('../data/cog_raw_counts.csv', index=False)
print(f"Saved raw counts to ../data/cog_raw_counts.csv")

Saved 857 rows to ../data/cog_enrichment_by_openness.csv
Saved quartile summary to ../data/cog_enrichment_quartile_summary.csv
Saved raw counts to ../data/cog_raw_counts.csv

	gtdb_species_clade_id	is_core	is_auxiliary	is_singleton	COG_category	gene_count
0	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	S	945
1	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	K	301
2	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	E	252
3	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	G	233
4	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	P	193
5	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	C	180
6	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	J	173
7	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	M	160
8	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	L	112
9	s__Bacillus_licheniformis--RS_GCF_000011645.1	True	False	False	H	111

	gtdb_species_clade_id	COG_letter	core_proportion	singleton_proportion	enrichment	openness_quartile	openness	phylum	GTDB_species	no_genomes
0	s__Bacillus_licheniformis--RS_GCF_000011645.1	B	0.000266	0.000620	0.000354	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
1	s__Bacillus_licheniformis--RS_GCF_000011645.1	C	0.054226	0.062887	0.008661	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
2	s__Bacillus_licheniformis--RS_GCF_000011645.1	D	0.012493	0.015799	0.003306	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
3	s__Bacillus_licheniformis--RS_GCF_000011645.1	E	0.090909	0.092937	0.002028	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
4	s__Bacillus_licheniformis--RS_GCF_000011645.1	F	0.027379	0.026022	-0.001357	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
5	s__Bacillus_licheniformis--RS_GCF_000011645.1	G	0.083466	0.085192	0.001726	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
6	s__Bacillus_licheniformis--RS_GCF_000011645.1	H	0.035088	0.036245	0.001158	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
7	s__Bacillus_licheniformis--RS_GCF_000011645.1	I	0.028974	0.023544	-0.005430	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
8	s__Bacillus_licheniformis--RS_GCF_000011645.1	J	0.048379	0.032218	-0.016160	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226
9	s__Bacillus_licheniformis--RS_GCF_000011645.1	K	0.088783	0.076518	-0.012265	Q2	0.816783	p__Bacillota	s__Bacillus_licheniformis	226

02 Cog Enrichment By Openness

Notebook 02: COG Enrichment by Openness Quartile¶

1. Query COG Distributions¶

2. Classify Gene Classes & Split Multi-letter COGs¶

3. Compute Per-Species Enrichment¶

4. Summarize by Quartile¶

5. Save Results¶

Findings¶