import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 110
matplotlib.rcParams['figure.facecolor'] = 'white'

def _find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that contains ``forge/``.

    Falls back to *start* itself when no ancestor holds a ``forge`` directory,
    which reproduces the previous "use the current directory" behaviour.
    """
    for candidate in (start, *start.parents):
        if (candidate / 'forge').is_dir():
            return candidate
    return start


# The recorded outputs show the kernel running from .../site/notebooks, so '.'
# is not the repository root: `import forge` fails and every data path ends up
# under the notebook folder.  Walk upwards to the checkout root instead.
# NOTE(review): assumes the `forge` package directory sits at the repository
# root next to `data/` and `scidex.db` - confirm against the repo layout.
REPO = _find_repo_root(Path('.').resolve())
CACHE = REPO / 'data' / 'forge_cache' / 'seaad'
sys.path.insert(0, str(REPO))
import forge.seaad_analysis as sa

def load(name: str):
    """Return the parsed JSON content of ``CACHE/<name>.json``.

    Raises ``FileNotFoundError`` when the cache file does not exist and
    ``json.JSONDecodeError`` when it does not contain valid JSON.
    """
    cache_file = CACHE / f'{name}.json'
    # Decode explicitly as UTF-8: without it read_text() uses the locale's
    # preferred encoding, which can mangle or reject non-ASCII characters.
    return json.loads(cache_file.read_text(encoding='utf-8'))

# Forge provenance: recent tool calls, aggregated per (tool, status) pair.
db = sqlite3.connect(str(REPO / 'scidex.db'))
try:
    # NOTE(review): date('now','-1 day') evaluates to yesterday's calendar date,
    # so the window starts at the beginning of that day and can cover more than
    # 24 hours.  The query is left unchanged because the storage format of
    # created_at is not visible here - confirm before switching to datetime().
    prov = pd.read_sql_query('''
    SELECT skill_id, status, COUNT(*) AS n_calls,
           ROUND(AVG(duration_ms),0) AS mean_ms,
           MIN(created_at) AS first_call
    FROM tool_calls
    WHERE created_at >= date('now','-1 day')
    GROUP BY skill_id, status
    ORDER BY n_calls DESC
''', db)
finally:
    # Release the connection even when the query raises (for example when the
    # tool_calls table is missing) instead of leaking the handle.
    db.close()
prov.rename(columns={'skill_id':'tool'}, inplace=True)
# Drop the literal 'tool_' marker from the identifiers for display.
prov['tool'] = prov['tool'].str.replace('tool_', '', regex=False)
print(f'{len(prov)} tool-call aggregates from the last 24h of Forge provenance:')
prov.head(20)

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 12
      8 
      9 REPO = Path('.').resolve()
     10 CACHE = REPO / 'data' / 'forge_cache' / 'seaad'
     11 sys.path.insert(0, str(REPO))
---> 12 import forge.seaad_analysis as sa
     13 
     14 def load(name): return json.loads((CACHE / f'{name}.json').read_text())
     15 

ModuleNotFoundError: No module named 'forge'

def _brief(value, max_items=2, limit=55):
    """Render an annotation field as short display text.

    A list is cut to its first *max_items* entries and comma-joined; any other
    value is stringified, with ``None`` and other falsy values becoming ``''``.
    The result is truncated to *limit* characters.
    """
    if isinstance(value, list):
        text = ', '.join(value[:max_items])
    else:
        text = str(value or '')
    return text[:limit]


anno = load('mygene_TREM2')  # probe one
ann_rows = []
for g in ['TREM2','GFAP','SLC17A7','PDGFRA','PDGFRB','APOE','MAPT','APP','PSEN1','TYROBP','CLU']:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    ann_rows.append({
        'gene': g,
        'name': (mg.get('name') or '')[:55],
        # Both HPA fields go through the same list-or-scalar handling; before,
        # a plain-string protein_class was sliced to two characters and joined.
        'protein_class': _brief(hpa.get('protein_class')),
        'disease_involvement': _brief(hpa.get('disease_involvement')),
        'ensembl_id': hpa.get('ensembl_id') or '',
    })
anno_df = pd.DataFrame(ann_rows)
anno_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 anno = load('mygene_TREM2')  # probe one
      2 ann_rows = []
      3 for g in ['TREM2','GFAP','SLC17A7','PDGFRA','PDGFRB','APOE','MAPT','APP','PSEN1','TYROBP','CLU']:
      4     mg = load(f'mygene_{g}')

NameError: name 'load' is not defined

# First ten cached GO Biological Process enrichment records.
go_bp = load('enrichr_GO_Biological_Process')[:10]
go_df = pd.DataFrame(go_bp)[['term','p_value','odds_ratio','genes']]
# Every derived column is computed from the raw frame in one pass, so n_hits
# still sees the gene lists before they are flattened to text.
go_df = go_df.assign(
    p_value=go_df['p_value'].apply('{:.2e}'.format),
    odds_ratio=go_df['odds_ratio'].round(1),
    term=go_df['term'].str[:60],
    n_hits=go_df['genes'].apply(len),
    genes=go_df['genes'].apply(', '.join),
)
go_df[['term','n_hits','p_value','odds_ratio','genes']]

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 go_bp = load('enrichr_GO_Biological_Process')[:10]
      2 go_df = pd.DataFrame(go_bp)[['term','p_value','odds_ratio','genes']]
      3 go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
      4 go_df['odds_ratio'] = go_df['odds_ratio'].round(1)

NameError: name 'load' is not defined

# Visualize top GO BP enrichment (−log10 p-value bar chart)
import numpy as np
go_bp = load('enrichr_GO_Biological_Process')[:8]
# Walk the records back to front so the first record is drawn last.
# NOTE(review): presumably the cache is ordered most-significant first - confirm.
terms, neglogp = [], []
for entry in reversed(go_bp):
    terms.append(entry['term'][:45])
    neglogp.append(-np.log10(entry['p_value']))
fig, ax = plt.subplots(figsize=(9, 4.5))
ax.barh(terms, neglogp, color='#4fc3f7')
ax.set_title('Top GO:BP enrichment for SEA-AD vulnerability gene set (Enrichr)')
ax.set_xlabel('-log10(p-value)')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 3
      1 # Visualize top GO BP enrichment (−log10 p-value bar chart)
      2 import numpy as np
----> 3 go_bp = load('enrichr_GO_Biological_Process')[:8]
      4 terms = [t['term'][:45] for t in go_bp][::-1]
      5 neglogp = [-np.log10(t['p_value']) for t in go_bp][::-1]
      6 fig, ax = plt.subplots(figsize=(9, 4.5))

NameError: name 'load' is not defined

# First ten cached CellMarker enrichment records, formatted for display.
cm = load('enrichr_CellMarker_Cell_Types')[:10]
cm_df = pd.DataFrame(cm)[['term','p_value','odds_ratio','genes']]
cm_df = cm_df.assign(
    genes=cm_df['genes'].apply(', '.join),             # gene list -> comma-separated text
    p_value=cm_df['p_value'].apply('{:.2e}'.format),   # scientific notation, 2 decimals
    odds_ratio=cm_df['odds_ratio'].round(1),
)
cm_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 cm = load('enrichr_CellMarker_Cell_Types')[:10]
      2 cm_df = pd.DataFrame(cm)[['term','p_value','odds_ratio','genes']]
      3 cm_df['genes'] = cm_df['genes'].apply(lambda g: ', '.join(g))
      4 cm_df['p_value'] = cm_df['p_value'].apply(lambda p: f'{p:.2e}')

NameError: name 'load' is not defined

ppi = load('string_network')
ppi_df = pd.DataFrame(ppi)
# Jupyter only auto-renders a cell's final top-level expression.  The table
# used to be a bare expression inside the if-body and was never shown, so it is
# bound to a name here and emitted on the cell's last line.
ppi_table = None
if not ppi_df.empty:
    ppi_df = ppi_df.sort_values('score', ascending=False)
    # Keep only the score columns that are actually present in the cached edges.
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    n_proteins = len(set(ppi_df['protein1']) | set(ppi_df['protein2']))
    print(f'{len(ppi_df)} STRING edges among {n_proteins} proteins')
    ppi_table = ppi_df[display_cols].head(20)
else:
    print('No STRING edges returned (API may be rate-limited)')
# A None value renders nothing, matching the previous empty-network behaviour.
ppi_table

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 ppi = load('string_network')
      2 ppi_df = pd.DataFrame(ppi)
      3 if not ppi_df.empty:
      4     ppi_df = ppi_df.sort_values('score', ascending=False)

NameError: name 'load' is not defined

# Simple network figure using matplotlib (no networkx dep)
ppi = load('string_network')
if ppi:
    import math
    # Every protein that appears on either end of an edge, in sorted order.
    nodes = sorted({edge[end] for edge in ppi for end in ('protein1', 'protein2')})
    n = len(nodes)
    # Place the nodes at equal angular steps around the unit circle.
    pos = {}
    for i, label in enumerate(nodes):
        theta = 2*math.pi*i/n
        pos[label] = (math.cos(theta), math.sin(theta))
    fig, ax = plt.subplots(figsize=(7, 7))
    # Edges: opacity and line width both grow with the edge score.
    for edge in ppi:
        (x1, y1), (x2, y2) = pos[edge['protein1']], pos[edge['protein2']]
        weight = edge['score']
        ax.plot([x1,x2],[y1,y2], color='#888', alpha=0.3+0.5*weight, linewidth=0.5+2*weight)
    # Nodes are drawn above the edges (zorder 3), labels above the nodes (zorder 4).
    for label, (x, y) in pos.items():
        ax.scatter([x],[y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(label, (x,y), ha='center', va='center', fontsize=9, fontweight='bold', zorder=4)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING physical PPI network ({len(ppi)} edges, score ≥ 0.4)')
    plt.tight_layout()
    plt.show()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 2
      1 # Simple network figure using matplotlib (no networkx dep)
----> 2 ppi = load('string_network')
      3 if ppi:
      4     import math
      5     nodes = sorted({p for e in ppi for p in (e['protein1'], e['protein2'])})

NameError: name 'load' is not defined

def _pathway_row(gene):
    """Build one summary row from the Reactome cache entry of *gene*."""
    pathways = load(f'reactome_{gene}')
    # An em dash stands in for the name when the gene has no pathways at all.
    top_name = pathways[0]['name'] if pathways else '—'
    return {'gene': gene, 'n_pathways': len(pathways), 'top_pathway': top_name[:70]}


pw_rows = [
    _pathway_row(g)
    for g in ['TREM2','GFAP','SLC17A7','PDGFRA','PDGFRB','APOE','MAPT','APP','PSEN1','TYROBP','CLU']
]
# Genes with the most pathways come first.
pw_df = pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)
pw_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 3
      1 pw_rows = []
      2 for g in ['TREM2','GFAP','SLC17A7','PDGFRA','PDGFRB','APOE','MAPT','APP','PSEN1','TYROBP','CLU']:
----> 3     pws = load(f'reactome_{g}')
      4     pw_rows.append({'gene': g, 'n_pathways': len(pws),
      5                     'top_pathway': (pws[0]['name'] if pws else '—')[:70]})
      6 pw_df = pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)

NameError: name 'load' is not defined

from collections import Counter
ac = load('allen_celltypes_TREM2')  # same for any gene (not gene-filtered at API level)
ct = pd.DataFrame(ac.get('cell_types', []))
# Show at most the first 15 rows; an empty result is replaced by a fresh,
# column-less frame.
ct_display = pd.DataFrame() if ct.empty else ct.head(15)
ct_display

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 2
      1 from collections import Counter
----> 2 ac = load('allen_celltypes_TREM2')  # same for any gene (not gene-filtered at API level)
      3 ct = pd.DataFrame(ac.get('cell_types', []))
      4 if not ct.empty:
      5     ct_display = ct.head(15)

NameError: name 'load' is not defined

# One summary row per target gene from the cached Allen ISH results.
ish_rows = []
for g in sa.TARGET_GENES:
    ish = load(f'allen_ish_{g}')
    regions = ish.get('regions') or []
    if regions:
        top = regions[0]
        # Guard against explicit nulls the same way 'note' is guarded below: a
        # null structure becomes '' and a null energy becomes None rather than
        # raising TypeError.  A missing energy key still defaults to 0.
        top_region = top.get('structure') or ''
        energy = top.get('expression_energy', 0)
        top_energy = round(energy, 2) if energy is not None else None
    else:
        top_region, top_energy = '—', None
    ish_rows.append({
        'gene': g,
        'n_ish_regions': len(regions),
        'top_region': top_region[:45],
        'top_energy': top_energy,
        'note': (ish.get('note') or '')[:60],
    })
ish_df = pd.DataFrame(ish_rows)
ish_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 2
      1 ish_rows = []
----> 2 for g in sa.TARGET_GENES:
      3     ish = load(f'allen_ish_{g}')
      4     regions = ish.get('regions') or []
      5     ish_rows.append({

NameError: name 'sa' is not defined

hid = 'h-seaad-5b3cb8ea'
papers = load(f'pubmed_{hid}')
if not papers:
    # Nothing cached for this hypothesis: show a one-row placeholder instead.
    display_df = pd.DataFrame([{'note':'no PubMed results for this hypothesis query'}])
else:
    lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
    # Clip the long text columns so the rendered table stays compact.
    for column, width in (('title', 80), ('journal', 30)):
        lit[column] = lit[column].str[:width]
    lit.sort_values('year', ascending=False, inplace=True)  # highest year first
    display_df = lit
display_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 2
      1 hid = 'h-seaad-5b3cb8ea'
----> 2 papers = load(f'pubmed_{hid}')
      3 if papers:
      4     lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
      5     lit['title'] = lit['title'].str[:80]

NameError: name 'load' is not defined

hid = 'h-seaad-51323624'
papers = load(f'pubmed_{hid}')
if not papers:
    # Nothing cached for this hypothesis: show a one-row placeholder instead.
    display_df = pd.DataFrame([{'note':'no PubMed results for this hypothesis query'}])
else:
    lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
    # Clip the long text columns so the rendered table stays compact.
    for column, width in (('title', 80), ('journal', 30)):
        lit[column] = lit[column].str[:width]
    lit.sort_values('year', ascending=False, inplace=True)  # highest year first
    display_df = lit
display_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 2
      1 hid = 'h-seaad-51323624'
----> 2 papers = load(f'pubmed_{hid}')
      3 if papers:
      4     lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
      5     lit['title'] = lit['title'].str[:80]

NameError: name 'load' is not defined

hid = 'h-seaad-7f15df4c'
papers = load(f'pubmed_{hid}')
if not papers:
    # Nothing cached for this hypothesis: show a one-row placeholder instead.
    display_df = pd.DataFrame([{'note':'no PubMed results for this hypothesis query'}])
else:
    lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
    # Clip the long text columns so the rendered table stays compact.
    for column, width in (('title', 80), ('journal', 30)):
        lit[column] = lit[column].str[:width]
    lit.sort_values('year', ascending=False, inplace=True)  # highest year first
    display_df = lit
display_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 2
      1 hid = 'h-seaad-7f15df4c'
----> 2 papers = load(f'pubmed_{hid}')
      3 if papers:
      4     lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
      5     lit['title'] = lit['title'].str[:80]

NameError: name 'load' is not defined

hid = 'h-seaad-fa5ea82d'
papers = load(f'pubmed_{hid}')
if not papers:
    # Nothing cached for this hypothesis: show a one-row placeholder instead.
    display_df = pd.DataFrame([{'note':'no PubMed results for this hypothesis query'}])
else:
    lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
    # Clip the long text columns so the rendered table stays compact.
    for column, width in (('title', 80), ('journal', 30)):
        lit[column] = lit[column].str[:width]
    lit.sort_values('year', ascending=False, inplace=True)  # highest year first
    display_df = lit
display_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 hid = 'h-seaad-fa5ea82d'
----> 2 papers = load(f'pubmed_{hid}')
      3 if papers:
      4     lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
      5     lit['title'] = lit['title'].str[:80]

NameError: name 'load' is not defined

hid = 'h-seaad-56fa6428'
papers = load(f'pubmed_{hid}')
if not papers:
    # Nothing cached for this hypothesis: show a one-row placeholder instead.
    display_df = pd.DataFrame([{'note':'no PubMed results for this hypothesis query'}])
else:
    lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
    # Clip the long text columns so the rendered table stays compact.
    for column, width in (('title', 80), ('journal', 30)):
        lit[column] = lit[column].str[:width]
    lit.sort_values('year', ascending=False, inplace=True)  # highest year first
    display_df = lit
display_df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 2
      1 hid = 'h-seaad-56fa6428'
----> 2 papers = load(f'pubmed_{hid}')
      3 if papers:
      4     lit = pd.DataFrame(papers)[['year','journal','title','pmid']]
      5     lit['title'] = lit['title'].str[:80]

NameError: name 'load' is not defined

from pathlib import Path
bundle_path = REPO / 'data/analysis_outputs/analysis-SEAAD-20260402/mechanistic_de/bundle.json'
# Jupyter only auto-renders a cell's final top-level expression.  The summary
# table used to be a bare expression inside the if-body and was never shown, so
# it is bound here and emitted on the cell's last line.  Starting from None
# also keeps a stale table from an earlier run from being displayed.
mech_df = None
if bundle_path.exists():
    # Decode explicitly as UTF-8 rather than relying on the locale default.
    mech_bundle = json.loads(bundle_path.read_text(encoding='utf-8'))
    print("Mechanistic highlights:")
    for item in mech_bundle.get('mechanistic_highlights', []):
        print(f"- {item}")
    # One row per gene; `or [{}]` makes a missing or empty list yield '' for
    # the "top" fields instead of an IndexError.
    mech_df = pd.DataFrame([
        {
            'gene': gene,
            'dx_hits': len((payload.get('differential_expression') or {}).get('experiments', [])),
            'top_pathway': ((payload.get('reactome_pathways') or [{}])[0].get('name', '')),
            'top_paper': ((payload.get('literature') or [{}])[0].get('title', '')),
        }
        for gene, payload in mech_bundle.get('per_gene', {}).items()
    ])
else:
    print(f"Missing mechanistic evidence bundle: {bundle_path}")
mech_df

Missing mechanistic evidence bundle: /home/ubuntu/scidex/.claude/worktrees/task-9c070f5d-b36b-46a0-8518-ac7a8b7ffcd0/site/notebooks/data/analysis_outputs/analysis-SEAAD-20260402/mechanistic_de/bundle.json

Gap	Task
Bulk SEA-AD h5ad download + local cache	`19c06875`
Per-cell DE from SEA-AD in the debate loop	`70b96f50`
ABC Atlas + MERFISH spatial queries	`f9ba4c33`
Forge data-validation layer	`4bd2f9de`

SEA-AD Cell-Type Vulnerability Analysis

SEA-AD Cell-Type Vulnerability Analysis

Research question

Approach

1. Forge tool chain

2. Target gene annotations (MyGene.info + Human Protein Atlas)

3. GO Biological Process enrichment (Enrichr)

4. Cell-type enrichment (Enrichr CellMarker)

5. STRING physical protein interaction network

6. Reactome pathway footprint per gene

7. Allen Brain Cell Atlas — cell-type specimen metadata

8. Allen Brain Atlas ISH regional expression

9. Evidence bound to analysis hypotheses

Hypothesis 1: Complement C1QA Spatial Gradient in Cortical Layers

Molecular Mechanism of C1QA-Mediated Synaptic Elimination

Hypothesis 2: Cell-Type Specific TREM2 Upregulation in DAM Microglia

TREM2 Molecular Biology and Signaling

Hypothesis 3: Excitatory Neuron Vulnerability via SLC17A7 Downregulation

Molecular Function of SLC17A7/VGLUT1

Hypothesis 4: APOE Isoform Expression Across Glial Subtypes

Hypothesis 5: GFAP-Positive Reactive Astrocyte Subtype Delineation

GFAP Biology and the Astrocyte Reactivity Spectrum

10. Mechanistic differential-expression synthesis

11. Caveats & what's still aggregated