import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Notebook-wide figure defaults: render at 110 dpi on a white canvas.
matplotlib.rcParams.update({
    'figure.dpi': 110,
    'figure.facecolor': 'white',
})

# Absolute path of the working directory, placed first on the import
# search path.
REPO = Path('.').resolve()
sys.path[:0] = [str(REPO)]

# Directory holding the cached JSON payloads for this analysis.
CACHE_SUB = 'seaad'
CACHE = REPO.joinpath('data', 'forge_cache', CACHE_SUB)

def load(name, cache_dir=None):
    """Return the parsed JSON cached as ``<cache_dir>/<name>.json``.

    Args:
        name: Cache entry name, without the ``.json`` suffix.
        cache_dir: Directory to read from. When omitted, the module-level
            ``CACHE`` directory is used.

    Returns:
        The decoded JSON value, or an empty dict when the cache file does
        not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    base = CACHE if cache_dir is None else Path(cache_dir)
    try:
        # Read directly instead of exists()-then-read so a file removed in
        # between cannot raise. JSON text is UTF-8, so the encoding is
        # pinned rather than taken from the locale default.
        text = (base / f'{name}.json').read_text(encoding='utf-8')
    except (FileNotFoundError, NotADirectoryError):
        return {}
    return json.loads(text)

# Aggregate the tool calls of the last 30 days (count and mean duration per
# skill_id/status pair) from the SQLite database at db_path.
from contextlib import closing

db_path = Path('/home/ubuntu/scidex/scidex.db')
prov_view = None  # stays None when provenance cannot be loaded
try:
    # closing() releases the connection even when the query raises; a plain
    # db.close() after the query would be skipped on error.
    with closing(sqlite3.connect(str(db_path))) as db:
        prov = pd.read_sql_query('''
        SELECT skill_id, status, COUNT(*) AS n_calls,
               ROUND(AVG(duration_ms),0) AS mean_ms
        FROM tool_calls
        WHERE created_at >= date('now','-30 days')
        GROUP BY skill_id, status
        ORDER BY n_calls DESC
    ''', db)
    prov['tool'] = prov['skill_id'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    prov_view = prov[['tool','status','n_calls','mean_ms']].head(20)
except Exception as e:
    # Best-effort section: report the problem and carry on with the analysis.
    print(f'Provenance unavailable: {e}')
# A bare expression nested inside try/if is never rendered by a notebook;
# only a trailing top-level expression is, so the table is surfaced here.
prov_view

77 tool-call aggregates (last 30 days):

_MISSING = '—'  # single placeholder for every absent annotation value


def _fmt_annotation(value, max_items=2, width=55):
    """Render one annotation field as a short display string.

    A list is reduced to its first ``max_items`` entries joined by ", ";
    any other value is stringified. Absent or empty values become the
    em-dash placeholder. The result is cut to ``width`` characters.
    """
    if isinstance(value, list):
        value = ', '.join(str(v) for v in value[:max_items])
    return (str(value) if value else _MISSING)[:width]


# One row per target gene, combining the cached 'mygene_<gene>' payload
# (gene name) with the cached 'hpa_<gene>' payload (protein class and
# disease involvement).
ann_rows = []
for g in ['CST', 'LDHA', 'SNCA', 'SYK', 'TNF']:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    # load() returns whatever JSON was cached; only dict payloads carry the
    # fields read below, so anything else is treated as "no data".
    if not isinstance(mg, dict):
        mg = {}
    if not isinstance(hpa, dict):
        hpa = {}
    ann_rows.append({
        'gene': g,
        'name': _fmt_annotation(mg.get('name')),
        'protein_class': _fmt_annotation(hpa.get('protein_class')),
        'disease_involvement': _fmt_annotation(hpa.get('disease_involvement')),
    })
pd.DataFrame(ann_rows)

# Tabulate the first 10 entries of the cached Enrichr GO Biological Process
# result.
go_bp = load('enrichr_GO_Biological_Process')
go_table = None  # stays None when there is nothing to show
if isinstance(go_bp, list) and go_bp:
    go_df = pd.DataFrame(go_bp[:10])[['term','p_value','odds_ratio','genes']]
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count the hits before the gene list is flattened into a string.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    go_table = go_df[['term','n_hits','p_value','odds_ratio','genes']]
else:
    print('No GO:BP enrichment data')
# A bare expression nested inside the `if` is never rendered by a notebook;
# only a trailing top-level expression is, so the table is surfaced here.
go_table

# Horizontal bar chart of -log10(p) for the first 8 cached GO:BP terms.
go_bp = load('enrichr_GO_Biological_Process')
if not (isinstance(go_bp, list) and go_bp):
    print('No GO:BP data to plot')
else:
    top = go_bp[:8]
    terms = []
    neglogp = []
    for entry in top:
        terms.append(entry['term'][:45])
        # Floor the p-value at 1e-300 so log10 never receives zero.
        neglogp.append(-np.log10(max(entry['p_value'], 1e-300)))
    # barh draws the first item at the bottom; reverse so the first cached
    # term ends up at the top of the chart.
    terms.reverse()
    neglogp.reverse()
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(terms, neglogp, color='#4fc3f7')
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

# Tabulate the first 10 entries of the cached Enrichr KEGG pathway result.
kegg = load('enrichr_KEGG_Pathways')
kegg_table = None  # stays None when there is nothing to show
if isinstance(kegg, list) and kegg:
    kegg_df = pd.DataFrame(kegg[:10])[['term','p_value','odds_ratio','genes']]
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    kegg_table = kegg_df
else:
    print('No KEGG enrichment data')
# A bare expression nested inside the `if` is never rendered by a notebook;
# only a trailing top-level expression is, so the table is surfaced here.
kegg_table

No KEGG enrichment data

# List the cached STRING edges, strongest score first (top 20 shown).
ppi = load('string_network')
ppi_table = None  # stays None when there is nothing to show
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi).sort_values('score', ascending=False)
    # Show only those of the expected columns that the payload provides.
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    ppi_table = ppi_df[display_cols].head(20)
else:
    print('No STRING edges returned')
# A bare expression nested inside the `if` is never rendered by a notebook;
# only a trailing top-level expression is, so the table is surfaced here.
ppi_table

11 STRING edges

# Network figure: place every protein on a unit circle and draw each edge
# with opacity and line width that grow with its score.
ppi = load('string_network')
if not (isinstance(ppi, list) and ppi):
    print('No STRING data to visualize')
else:
    import math
    nodes = sorted({p for e in ppi for p in (e['protein1'], e['protein2'])})
    n = len(nodes)
    # Evenly spaced positions around the circle, in sorted-name order.
    pos = {}
    for i, n_ in enumerate(nodes):
        pos[n_] = (math.cos(2*math.pi*i/n), math.sin(2*math.pi*i/n))
    fig, ax = plt.subplots(figsize=(7, 7))
    for e in ppi:
        x1, y1 = pos[e['protein1']]
        x2, y2 = pos[e['protein2']]
        ax.plot(
            [x1, x2], [y1, y2],
            color='#888',
            alpha=0.3+0.5*e['score'],
            linewidth=0.5+2*e['score'],
        )
    # Nodes and labels use higher zorder values so they sit above the edges.
    for name, (x, y) in pos.items():
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(name, (x, y), ha='center', va='center',
                    fontsize=8, fontweight='bold', zorder=4)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()

# Per-gene Reactome summary: number of cached pathways and the name of the
# first one, sorted so genes with the most pathways come first.
pw_rows = []
for g in ['CST', 'LDHA', 'SNCA', 'SYK', 'TNF']:
    pws = load(f'reactome_{g}')
    # Start from the "no data" row and fill it in when a list was cached.
    row = {'gene': g, 'n_pathways': 0, 'top_pathway': '—'}
    if isinstance(pws, list):
        row['n_pathways'] = len(pws)
        if pws:
            row['top_pathway'] = pws[0]['name'][:70]
    pw_rows.append(row)
pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)

# Per-gene Allen ISH summary: how many regions were cached, plus the
# structure name and expression energy of the first region.
ish_rows = []
for g in ['CST', 'LDHA', 'SNCA', 'SYK', 'TNF']:
    ish = load(f'allen_ish_{g}')
    if isinstance(ish, dict):
        regions = ish.get('regions') or []
    else:
        regions = []
    n_regions = len(regions)
    if regions:
        first = regions[0]
        top_region = first.get('structure', '')[:45]
        top_energy = round(first.get('expression_energy', 0), 2)
    else:
        # No regions cached: placeholder name and no energy value.
        top_region = '—'
        top_energy = None
    ish_rows.append({
        'gene': g,
        'n_ish_regions': n_regions,
        'top_region': top_region,
        'top_energy': top_energy,
    })
pd.DataFrame(ish_rows)

# (truncated hypothesis title, composite score) pairs, best score first.
hyp_data = [
    ('Astrocyte-Selective APOE4 Silencing via Lipid Nanoparti', 0.541),
    ('Neuronal Subtype-Specific Alpha-Synuclein Expression No', 0.516),
    ('Astrocyte-Microglia Communication Rebalancing via Cytok', 0.495),
    ('Microglial TREM2-Independent Pathway Activation', 0.495),
    ('Oligodendrocyte-Targeted Myelin Sulfatide Restoration T', 0.495),
    ('Oligodendrocyte Progenitor Cell Metabolic Reprogramming', 0.495),
    ('Inhibitory Neuron-Selective WNT Signaling Restoration', 0.495),
]


def _score_color(score):
    """Map a composite score to its bar colour (>=0.6, >=0.5, otherwise)."""
    if score >= 0.6:
        return '#ef5350'
    if score >= 0.5:
        return '#ffa726'
    return '#66bb6a'


# barh draws the first item at the bottom; reverse so the first listed
# hypothesis ends up at the top of the chart.
titles = [title for title, _ in reversed(hyp_data)]
scores = [score for _, score in reversed(hyp_data)]
fig, ax = plt.subplots(figsize=(10, max(8, len(titles)*0.4)))
colors = [_score_color(s) for s in scores]
y_pos = range(len(titles))
ax.barh(y_pos, scores, color=colors)
ax.set_yticks(y_pos)
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title('Which cell types show the most significant expression changes for neurodegeneration genes in SEA-AD cohorts?')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Heatmap of per-dimension scores: one row per hypothesis (labels), one
# column per score dimension (dims).
labels = [
    'Astrocyte-Selective APOE4 Silencing via ',
    'Neuronal Subtype-Specific Alpha-Synuclei',
    'Astrocyte-Microglia Communication Rebala',
    'Microglial TREM2-Independent Pathway Act',
    'Oligodendrocyte-Targeted Myelin Sulfatid',
    'Oligodendrocyte Progenitor Cell Metaboli',
    'Inhibitory Neuron-Selective WNT Signalin',
]
matrix = np.array([
    [0.9, 0.6, 0.9, 0.8, 0, 0.8, 0.7, 0.7, 0.5],
    [0.7, 0.3, 0.6, 0.5, 0, 0.6, 0.4, 0.3, 0.4],
    [0.6, 0.9, 0.8, 0.8, 0, 0.7, 0.8, 0.9, 0.6],
    [0.8, 0.7, 0.7, 0.6, 0, 0.7, 0.6, 0.8, 0.4],
    [0.9, 0.3, 0.8, 0.7, 0, 0.5, 0.5, 0.3, 0.4],
    [0.8, 0.4, 0.6, 0.5, 0, 0.5, 0.4, 0.6, 0.4],
    [0.8, 0.4, 0.6, 0.4, 0, 0.3, 0.3, 0.6, 0.3],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]
if not matrix.size:
    print('No score data available')
else:
    # Column captions: drop the '_score' suffix and title-case the rest.
    dim_labels = []
    for d in dims:
        dim_labels.append(d.replace('_score','').replace('_',' ').title())
    fig, ax = plt.subplots(figsize=(10, 5))
    # Colour scale is fixed to the 0..1 range.
    im = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(dim_labels, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()

def _pubmed_table(papers, title_width=80, journal_width=30):
    """Build the literature table shown for one hypothesis.

    Args:
        papers: Cached PubMed payload; expected to be a non-empty list of
            records, anything else is treated as "no results".
        title_width: Maximum number of characters kept from each title.
        journal_width: Maximum number of characters kept from each journal.

    Returns:
        A DataFrame with whichever of the year/journal/title/pmid columns
        are present, newest year first when a year column exists. When none
        of those columns is present, the first five raw records. When there
        are no papers, a one-row frame with a 'note' column.
    """
    if not (isinstance(papers, list) and papers):
        return pd.DataFrame([{'note':'no PubMed results'}])
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if not cols:
        return pd.DataFrame(papers[:5])
    lit = lit[cols].copy()
    # Every column is optional: `cols` only guarantees that at least one of
    # the four is present, so each one is checked before it is touched.
    if 'title' in lit.columns:
        lit['title'] = lit['title'].str[:title_width]
    if 'journal' in lit.columns:
        lit['journal'] = lit['journal'].str[:journal_width]
    if 'year' in lit.columns:
        lit = lit.sort_values('year', ascending=False)
    return lit


hid = 'h-541d61c3'
papers = load(f'pubmed_{hid}')
display_df = _pubmed_table(papers)
display_df

hid = 'h-b7ab85b6'
papers = load(f'pubmed_{hid}')
display_df = _pubmed_table(papers)
display_df

hid = 'h-89500d80'
papers = load(f'pubmed_{hid}')
display_df = _pubmed_table(papers)
display_df

hid = 'h-d2937ed0'
papers = load(f'pubmed_{hid}')
display_df = _pubmed_table(papers)
display_df

hid = 'h-d16c2411'
papers = load(f'pubmed_{hid}')
display_df = _pubmed_table(papers)
display_df

hid = 'h-2a1a95c1'
papers = load(f'pubmed_{hid}')
display_df = _pubmed_table(papers)
display_df

hid = 'h-eef1be45'
papers = load(f'pubmed_{hid}')
display_df = _pubmed_table(papers)
display_df

# Knowledge-graph edges as (source, relation, target, strength) records.
# The strings are stored exactly as given, including their truncation.
edge_data = [
    {'source': 'APOE4', 'relation': 'causes (astrocytic APOE4 ', 'target': 'synaptic phagocytosis', 'strength': 0.8},
    {'source': 'APOE4 removal', 'relation': 'causes (selective removal', 'target': 'tau-mediated neurodegeneration', 'strength': 0.8},
    {'source': 'h-541d61c3', 'relation': 'targets', 'target': 'APOE4', 'strength': 0.8},
    {'source': 'h-541d61c3', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.8},
    {'source': 'oligodendrocyte sulfatide defi', 'relation': 'causes (adult-onset CNS m', 'target': 'AD-like neuroinflammation', 'strength': 0.7},
    {'source': 'oligodendrocyte sulfatide defi', 'relation': 'causes (myelin sulfatide ', 'target': 'cognitive impairment', 'strength': 0.7},
    {'source': 'astrocyte-derived inflammatory', 'relation': 'causes (astrocyte-derived', 'target': 'pathological microglial activa', 'strength': 0.7},
    {'source': 'h-89500d80', 'relation': 'targets', 'target': 'IL1A', 'strength': 0.7},
    {'source': 'h-89500d80', 'relation': 'targets', 'target': 'TNF', 'strength': 0.7},
    {'source': 'h-89500d80', 'relation': 'targets', 'target': 'C1Q', 'strength': 0.7},
    {'source': 'h-89500d80', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.7},
    {'source': 'WNT signaling disruption', 'relation': 'causes (disrupted WNT sig', 'target': 'inhibitory neuron vulnerabilit', 'strength': 0.65},
    {'source': 'glia-neuron communication disr', 'relation': 'causes (altered glia-neur', 'target': 'altered WNT signaling', 'strength': 0.65},
    {'source': 'environmental stressors', 'relation': 'causes (environmental str', 'target': 'energy metabolism disorders', 'strength': 0.6},
    {'source': 'energy metabolism disorders', 'relation': 'causes (energy metabolism', 'target': "Parkinson's disease-like neuro", 'strength': 0.6},
    {'source': 'APOE4 overexpression', 'relation': 'causes (complete APOE4 re', 'target': 'disrupted normal brain lipid t', 'strength': 0.6},
    {'source': 'h-d2937ed0', 'relation': 'targets', 'target': 'DAP12', 'strength': 0.6},
    {'source': 'h-d2937ed0', 'relation': 'targets', 'target': 'SYK', 'strength': 0.6},
    {'source': 'h-d2937ed0', 'relation': 'targets', 'target': 'PLCG2', 'strength': 0.6},
    {'source': 'h-d2937ed0', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.6},
    {'source': 'h-d16c2411', 'relation': 'targets', 'target': 'CST', 'strength': 0.6},
    {'source': 'h-d16c2411', 'relation': 'targets', 'target': 'GAL3ST1', 'strength': 0.6},
    {'source': 'h-d16c2411', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.6},
    {'source': 'APOE4', 'relation': 'drives', 'target': 'synaptic_phagocytosis', 'strength': 0.5},
    {'source': 'astrocytes', 'relation': 'communicates_with', 'target': 'microglia', 'strength': 0.5},
]
edge_table = None  # stays None when there is nothing to show
if edge_data:
    edge_table = pd.DataFrame(edge_data).head(25)
else:
    print('No KG edge data available')
# A bare expression nested inside the `if` is never rendered by a notebook;
# only a trailing top-level expression is, so the table is surfaced here.
edge_table

Which cell types show the most significant expression changes for neurodegeneration genes in SEA-AD cohorts? — Analysis Notebook

Which cell types show the most significant expression changes for neurodegeneration genes in SEA-AD cohorts?¶

Research question¶

Approach¶

Debate Summary¶

1. Forge tool provenance¶

2. Target gene annotations¶

3. GO Biological Process enrichment (Enrichr)¶

4. KEGG pathway enrichment¶

5. STRING protein interaction network¶

6. Reactome pathway footprint¶

7. Allen Brain Atlas ISH regional expression¶

8. Hypothesis ranking (7 hypotheses)¶

9. Score dimension heatmap (top 7)¶

10. PubMed evidence per hypothesis¶

Hypothesis 1: Astrocyte-Selective APOE4 Silencing via Lipid Nanoparticles¶

Molecular Mechanism and Rationale¶

Hypothesis 2: Neuronal Subtype-Specific Alpha-Synuclein Expression Normalization¶

Hypothesis 3: Astrocyte-Microglia Communication Rebalancing via Cytokine Modulation¶

Astrocyte-Microglia Communication Rebalancing via Cytokine Modulation¶

Mechanistic Hypothesis Overview¶

Hypothesis 4: Microglial TREM2-Independent Pathway Activation¶

Molecular Mechanism and Rationale¶

Hypothesis 5: Oligodendrocyte-Targeted Myelin Sulfatide Restoration Therapy¶

Molecular Mechanism and Rationale¶

Hypothesis 6: Oligodendrocyte Progenitor Cell Metabolic Reprogramming¶

Molecular Mechanism and Rationale¶

Hypothesis 7: Inhibitory Neuron-Selective WNT Signaling Restoration¶

11. Knowledge graph edges (68 total)¶

12. Caveats¶

	gene	name	protein_class	disease_involvement
0	CST	—	—	—
1	LDHA	—	—	—
2	SNCA	—	—	—
3	SYK	—	—	—
4	TNF	—	—	—