RNA-binding protein dysregulation across ALS, FTD, and AD¶
Notebook ID: nb-sda-2026-04-01-gap-v2-68d9c9c1 · Analysis: sda-2026-04-01-gap-v2-68d9c9c1 · Generated: 2026-04-10
Research question¶
RNA-binding protein dysregulation across ALS, FTD, and AD
Approach¶
This notebook is generated programmatically from real Forge tool calls and SciDEX debate data. Code cells load cached evidence bundles from data/forge_cache/seaad/*.json and query live data from scidex.db. Re-run python3 scripts/regenerate_notebooks.py --analysis sda-2026-04-01-gap-v2-68d9c9c1 --force to refresh.
7 hypotheses were generated and debated. The knowledge graph has 73 edges.
Debate Summary¶
Quality score: 0.425 · Rounds: 4 · Personas: Theorist, Skeptic, Domain_Expert, Synthesizer
1. Forge tool provenance¶
import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Global figure defaults for every plot in this notebook:
# 110 DPI rendering on a white figure background.
matplotlib.rcParams.update({
    'figure.dpi': 110,
    'figure.facecolor': 'white',
})
# The repository root is taken from the current working directory; it is put
# on sys.path so project-local modules can be imported from notebook cells.
REPO = Path('.').resolve()
sys.path.insert(0, str(REPO))
CACHE_SUB = 'seaad'
CACHE = REPO / 'data' / 'forge_cache' / CACHE_SUB


def load(name):
    """Return the parsed JSON evidence bundle ``CACHE/<name>.json``.

    Args:
        name: Bundle file stem (without the ``.json`` suffix).

    Returns:
        The decoded JSON value, or an empty dict when the file does not
        exist, so callers can treat a missing bundle as "no data".

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    p = CACHE / f'{name}.json'
    try:
        # JSON is UTF-8 by specification; decode explicitly rather than
        # relying on the platform's locale encoding.
        text = p.read_text(encoding='utf-8')
    except FileNotFoundError:
        return {}
    return json.loads(text)
# Summarise the last 30 days of rows in the `tool_calls` table: number of
# calls and mean duration per (skill_id, status) pair.
db_path = Path('/home/ubuntu/scidex/scidex.db')
prov_table = None
try:
    db = sqlite3.connect(str(db_path))
    try:
        prov = pd.read_sql_query('''
            SELECT skill_id, status, COUNT(*) AS n_calls,
                   ROUND(AVG(duration_ms),0) AS mean_ms
            FROM tool_calls
            WHERE created_at >= date('now','-30 days')
            GROUP BY skill_id, status
            ORDER BY n_calls DESC
        ''', db)
    finally:
        # Release the connection even when the query itself fails.
        db.close()
    prov['tool'] = prov['skill_id'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    prov_table = prov[['tool','status','n_calls','mean_ms']].head(20)
except Exception as e:
    # Best-effort cell: provenance is informational, so report and carry on.
    print(f'Provenance unavailable: {e}')
# An expression nested inside try/if is not auto-rendered by Jupyter; only a
# trailing top-level expression is, so the table is surfaced here.
prov_table
77 tool-call aggregates (last 30 days):
2. Target gene annotations¶
def _fmt_field(value, width=55, placeholder='—'):
    """Render one annotation value as a short display string.

    Lists are reduced to their first two items joined by ', '; any other
    truthy value is stringified. The result is cut to ``width`` characters,
    and ``placeholder`` is returned when nothing remains to show.
    """
    if isinstance(value, list):
        text = ', '.join(str(v) for v in value[:2])
    elif value:
        text = str(value)
    else:
        text = ''
    return text[:width] or placeholder


# One summary row per target gene, built from the cached MyGene and HPA
# bundles. Missing bundles or fields all render as the same '—' placeholder.
ann_rows = []
for g in ['SETX', 'SYNCRIP', 'TARDBP']:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    # Only dict-shaped bundles can be queried by key; anything else is "no data".
    mg = mg if isinstance(mg, dict) else {}
    hpa = hpa if isinstance(hpa, dict) else {}
    ann_rows.append({
        'gene': g,
        'name': _fmt_field(mg.get('name')),
        'protein_class': _fmt_field(hpa.get('protein_class')),
        'disease_involvement': _fmt_field(hpa.get('disease_involvement')),
    })
pd.DataFrame(ann_rows)
| | gene | name | protein_class | disease_involvement |
|---|---|---|---|---|
| 0 | SETX | — | — | — |
| 1 | SYNCRIP | — | — | — |
| 2 | TARDBP | — | — | — |
3. GO Biological Process enrichment (Enrichr)¶
# Tabulate the ten leading GO Biological Process terms from the cached
# Enrichr bundle.
go_bp = load('enrichr_GO_Biological_Process')
go_table = None
if isinstance(go_bp, list) and go_bp:
    go_df = pd.DataFrame(go_bp[:10])[['term','p_value','odds_ratio','genes']]
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count hits before the gene list is flattened into a display string.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    go_table = go_df[['term','n_hits','p_value','odds_ratio','genes']]
else:
    print('No GO:BP enrichment data')
# Jupyter renders only a trailing top-level expression; an expression inside
# the `if` body above would be dropped silently. None renders nothing.
go_table
# Horizontal bar chart of the eight leading GO:BP terms by -log10(p-value).
go_bp = load('enrichr_GO_Biological_Process')
if not (isinstance(go_bp, list) and go_bp):
    print('No GO:BP data to plot')
else:
    top = go_bp[:8]
    # Reversed so the first entry ends up as the top bar of the chart.
    terms = list(reversed([hit['term'][:45] for hit in top]))
    # The floor of 1e-300 keeps log10 finite when a p-value is reported as 0.
    neglogp = list(reversed([-np.log10(max(hit['p_value'], 1e-300)) for hit in top]))
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(terms, neglogp, color='#4fc3f7')
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()
4. KEGG pathway enrichment¶
# Tabulate the ten leading KEGG pathway terms from the cached Enrichr bundle.
kegg = load('enrichr_KEGG_Pathways')
kegg_table = None
if isinstance(kegg, list) and kegg:
    kegg_df = pd.DataFrame(kegg[:10])[['term','p_value','odds_ratio','genes']]
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    kegg_table = kegg_df
else:
    print('No KEGG enrichment data')
# Jupyter renders only a trailing top-level expression; an expression inside
# the `if` body above would be dropped silently. None renders nothing.
kegg_table
No KEGG enrichment data
5. STRING protein interaction network¶
# List the cached STRING interaction edges, strongest first.
ppi = load('string_network')
ppi_table = None
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi)
    # Every display column is treated as optional below, so the sort key is
    # guarded the same way instead of raising KeyError when it is absent.
    if 'score' in ppi_df.columns:
        ppi_df = ppi_df.sort_values('score', ascending=False)
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    ppi_table = ppi_df[display_cols].head(20)
else:
    print('No STRING edges returned')
# Jupyter renders only a trailing top-level expression; an expression inside
# the `if` body above would be dropped silently. None renders nothing.
ppi_table
11 STRING edges
# Network figure: proteins are placed evenly on a unit circle and each edge
# is drawn with opacity and width that grow with its score.
ppi = load('string_network')
if isinstance(ppi, list) and ppi:
    import math
    nodes = sorted({p for e in ppi for p in (e['protein1'], e['protein2'])})
    n = len(nodes)
    pos = {
        name: (math.cos(2 * math.pi * i / n), math.sin(2 * math.pi * i / n))
        for i, name in enumerate(nodes)
    }
    fig, ax = plt.subplots(figsize=(7, 7))
    for e in ppi:
        x1, y1 = pos[e['protein1']]
        x2, y2 = pos[e['protein2']]
        # Clamp to [0, 1]: matplotlib rejects alpha values outside that
        # range, so an out-of-range score would otherwise abort the figure.
        # NOTE(review): assumes scores are on a 0-1 scale; confirm against the
        # cached bundle.
        s = min(max(float(e['score']), 0.0), 1.0)
        ax.plot([x1, x2], [y1, y2], color='#888', alpha=0.3 + 0.5 * s,
                linewidth=0.5 + 2 * s)
    for name, (x, y) in pos.items():
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(name, (x, y), ha='center', va='center', fontsize=8,
                    fontweight='bold', zorder=4)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()
else:
    print('No STRING data to visualize')
6. Reactome pathway footprint¶
# Per-gene Reactome summary: pathway count and the name of the first pathway
# in the cached list, cut to 70 characters.
pw_rows = []
for gene in ['SETX', 'SYNCRIP', 'TARDBP']:
    pathways = load(f'reactome_{gene}')
    if not isinstance(pathways, list):
        # A non-list bundle (e.g. the empty dict for a missing file) counts
        # as zero pathways.
        pw_rows.append({'gene': gene, 'n_pathways': 0, 'top_pathway': '—'})
        continue
    first_name = pathways[0]['name'] if pathways else '—'
    pw_rows.append({
        'gene': gene,
        'n_pathways': len(pathways),
        'top_pathway': first_name[:70],
    })
pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)
| | gene | n_pathways | top_pathway |
|---|---|---|---|
| 0 | SETX | 0 | — |
| 1 | SYNCRIP | 0 | — |
| 2 | TARDBP | 0 | — |
7. Allen Brain Atlas ISH regional expression¶
# Per-gene Allen ISH summary: number of regions plus the structure name and
# expression energy of the first region in the cached list.
ish_rows = []
for gene in ['SETX', 'SYNCRIP', 'TARDBP']:
    ish = load(f'allen_ish_{gene}')
    # Only a dict bundle can carry a 'regions' entry; a missing or empty
    # entry is treated as no regions.
    if isinstance(ish, dict):
        regions = ish.get('regions') or []
    else:
        regions = []
    if regions:
        first = regions[0]
        top_region = first.get('structure', '')[:45]
        top_energy = round(first.get('expression_energy', 0), 2)
    else:
        top_region = '—'
        top_energy = None
    ish_rows.append({
        'gene': gene,
        'n_ish_regions': len(regions),
        'top_region': top_region,
        'top_energy': top_energy,
    })
pd.DataFrame(ish_rows)
| | gene | n_ish_regions | top_region | top_energy |
|---|---|---|---|---|
| 0 | SETX | 0 | — | — |
| 1 | SYNCRIP | 0 | — | — |
| 2 | TARDBP | 0 | — | — |
8. Hypothesis ranking (7 hypotheses)¶
# (title, composite score) pairs, listed from highest to lowest score.
hyp_data = [
    ('Stress Granule Phase Separation Modulators', 0.539),
    ('Cross-Seeding Prevention Strategy', 0.528),
    ('Cryptic Exon Silencing Restoration', 0.524),
    ('Mitochondrial RNA Granule Rescue Pathway', 0.519),
    ('Axonal RNA Transport Reconstitution', 0.517),
    ('Nucleolar Stress Response Normalization', 0.502),
    ('R-Loop Resolution Enhancement Therapy', 0.501),
]
# Reversed so the first entry is drawn as the top bar of the chart.
titles = [title for title, _ in reversed(hyp_data)]
scores = [score for _, score in reversed(hyp_data)]


def _bar_color(score):
    """Map a composite score to its bar colour band (>=0.6, >=0.5, lower)."""
    if score >= 0.6:
        return '#ef5350'
    if score >= 0.5:
        return '#ffa726'
    return '#66bb6a'


fig, ax = plt.subplots(figsize=(10, max(8, len(titles)*0.4)))
colors = [_bar_color(s) for s in scores]
bar_positions = range(len(titles))
ax.barh(bar_positions, scores, color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title('RNA binding protein dysregulation across ALS FTD and AD')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
9. Score dimension heatmap (all 7 hypotheses)¶
# Row labels for the heatmap, one per hypothesis.
labels = [
    'Stress Granule Phase Separation Modulato',
    'Cross-Seeding Prevention Strategy',
    'Cryptic Exon Silencing Restoration',
    'Mitochondrial RNA Granule Rescue Pathway',
    'Axonal RNA Transport Reconstitution',
    'Nucleolar Stress Response Normalization',
    'R-Loop Resolution Enhancement Therapy',
]
# One row per hypothesis, one column per entry of `dims` below.
matrix = np.array([
    [0.7, 0.75, 0.8, 0.85, 0.09, 0.85, 0.75, 0.65, 0.6],
    [0.55, 0.64, 0.71, 0.72, 0.573, 0.75, 0.59, 0.58, 0.62],
    [0.65, 0.6, 0.72, 0.75, 0.573, 0.75, 0.62, 0.58, 0.55],
    [0.75, 0.35, 0.55, 0.5, 0.485, 0.5, 0.45, 0.25, 0.7],
    [0.85, 0.4, 0.65, 0.65, 0.485, 0.6, 0.55, 0.25, 0.6],
    [0.65, 0.3, 0.4, 0.45, 0.6, 0.45, 0.4, 0.2, 0.35],
    [0.8, 0.45, 0.6, 0.6, 0.452, 0.55, 0.5, 0.35, 0.4],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]
if not matrix.size:
    print('No score data available')
else:
    # Column captions: drop the '_score' suffix and title-case the rest.
    dim_captions = [d.replace('_score','').replace('_',' ').title() for d in dims]
    fig, ax = plt.subplots(figsize=(10, 5))
    # The colour scale is pinned to the 0..1 range.
    im = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(dim_captions, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()
10. PubMed evidence per hypothesis¶
Hypothesis 1: Stress Granule Phase Separation Modulators¶
Target genes: G3BP1 · Composite score: 0.539
Molecular Mechanism and Rationale
The hypothesis centers on the pharmacological modulation of stress granule dynamics through targeting G3BP1 (Ras GTPase-activating protein-binding protein 1), a key nucleator of stress granule formation via liquid-liquid phase separation (LLPS). Under physiological stress conditions, G3BP1 undergoes phase separation through its intrinsically disordered regions (IDRs) and RNA-binding domains, forming membrane-less organelles that sequester mRNAs and associat
# PubMed evidence table for this hypothesis, read from the cached bundle.
hid = 'h-97aa8486'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Copy so the truncations below edit an independent frame rather
        # than a selection of the original.
        lit = lit[cols].copy()
        # Every column is optional in the bundle, so guard each one before
        # touching it (not just 'journal').
        for col, width in (('title', 80), ('journal', 30)):
            if col in lit.columns:
                lit[col] = lit[col].str[:width]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 2: Cross-Seeding Prevention Strategy¶
Target genes: TARDBP · Composite score: 0.528
Molecular Mechanism and Rationale
The cross-seeding prevention strategy targets the pathological interaction between TAR DNA-binding protein 43 (TDP-43), encoded by TARDBP, and classical neurodegenerative disease proteins such as amyloid-beta (Aβ), tau, and alpha-synuclein. TDP-43 is a 414-amino acid RNA-binding protein containing two RNA recognition motifs (RRM1 and RRM2), a nuclear localization signal, and a glycine-rich C-terminal domain that is prone to aggregation. Under physiological
# PubMed evidence table for this hypothesis, read from the cached bundle.
hid = 'h-eea667a9'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Copy so the truncations below edit an independent frame rather
        # than a selection of the original.
        lit = lit[cols].copy()
        # Every column is optional in the bundle, so guard each one before
        # touching it (not just 'journal').
        for col, width in (('title', 80), ('journal', 30)):
            if col in lit.columns:
                lit[col] = lit[col].str[:width]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 3: Cryptic Exon Silencing Restoration¶
Target genes: TARDBP · Composite score: 0.524
Molecular Mechanism and Rationale
The TAR DNA-binding protein 43 (TDP-43), encoded by the TARDBP gene, serves as a critical RNA-binding protein (RBP) that orchestrates complex post-transcriptional regulatory networks essential for neuronal homeostasis. Under physiological conditions, TDP-43 functions as a master regulator of cryptic exon silencing through its preferential binding to UG-rich and GU-rich sequences located within introns and 3' untranslated regions of target transcripts. The p
# PubMed evidence table for this hypothesis, read from the cached bundle.
hid = 'h-4fabd9ce'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Copy so the truncations below edit an independent frame rather
        # than a selection of the original.
        lit = lit[cols].copy()
        # Every column is optional in the bundle, so guard each one before
        # touching it (not just 'journal').
        for col, width in (('title', 80), ('journal', 30)):
            if col in lit.columns:
                lit[col] = lit[col].str[:width]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 4: Mitochondrial RNA Granule Rescue Pathway¶
Target genes: SYNCRIP · Composite score: 0.519
Molecular Mechanism and Rationale
The mitochondrial RNA granule rescue pathway represents a novel therapeutic approach targeting the fundamental disruption of mitochondrial RNA transport and local translation that occurs across multiple neurodegenerative diseases. The central mechanism revolves around SYNCRIP (Synaptotagmin Binding Cytoplasmic RNA Interacting Protein), a heterogeneous nuclear ribonucleoprotein (hnRNP) that serves as a critical regulator of mitochondrial RNA granule dynamics
# PubMed evidence table for this hypothesis, read from the cached bundle.
hid = 'h-1e2bd420'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Copy so the truncations below edit an independent frame rather
        # than a selection of the original.
        lit = lit[cols].copy()
        # Every column is optional in the bundle, so guard each one before
        # touching it (not just 'journal').
        for col, width in (('title', 80), ('journal', 30)):
            if col in lit.columns:
                lit[col] = lit[col].str[:width]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 5: Axonal RNA Transport Reconstitution¶
Target genes: HNRNPA2B1 · Composite score: 0.517
Molecular Mechanism and Rationale
The axonal RNA transport reconstitution hypothesis centers on the critical role of heterogeneous nuclear ribonucleoprotein A2/B1 (HNRNPA2B1) in facilitating kinesin-mediated transport of RNA granules along microtubules in neuronal axons. HNRNPA2B1 functions as a key RNA-binding protein that recognizes specific trafficking signals, particularly the A2 response element (A2RE) sequences found in mRNAs destined for axonal and synaptic localization. Under physio
# PubMed evidence table for this hypothesis, read from the cached bundle.
hid = 'h-8196b893'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Copy so the truncations below edit an independent frame rather
        # than a selection of the original.
        lit = lit[cols].copy()
        # Every column is optional in the bundle, so guard each one before
        # touching it (not just 'journal').
        for col, width in (('title', 80), ('journal', 30)):
            if col in lit.columns:
                lit[col] = lit[col].str[:width]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 6: Nucleolar Stress Response Normalization¶
Target genes: NPM1 · Composite score: 0.502
Molecular Mechanism and Rationale
The nucleolus represents a critical subnuclear compartment where ribosomal RNA (rRNA) transcription, processing, and ribosome assembly occur. In neurodegenerative diseases, RNA-binding protein (RBP) dysfunction triggers a cascade of molecular events that disrupts nucleolar homeostasis, leading to impaired protein synthesis and ultimately neuronal death. The nucleolar stress response (NSR) serves as a cellular surveillance mechanism activated when ribosome b
# PubMed evidence table for this hypothesis, read from the cached bundle.
hid = 'h-ecacd219'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Copy so the truncations below edit an independent frame rather
        # than a selection of the original.
        lit = lit[cols].copy()
        # Every column is optional in the bundle, so guard each one before
        # touching it (not just 'journal').
        for col, width in (('title', 80), ('journal', 30)):
            if col in lit.columns:
                lit[col] = lit[col].str[:width]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 7: R-Loop Resolution Enhancement Therapy¶
Target genes: SETX · Composite score: 0.501
Molecular Mechanism and Rationale
R-loops are three-stranded nucleic acid structures consisting of an RNA-DNA hybrid and a displaced single-strand DNA, which form naturally during transcription when nascent RNA hybridizes back to the template DNA strand. While R-loops serve important physiological functions in transcriptional regulation, DNA repair, and chromatin remodeling, their dysregulation contributes significantly to neurodegeneration through DNA damage accumulation and transcriptiona
# PubMed evidence table for this hypothesis, read from the cached bundle.
hid = 'h-c463d225'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Copy so the truncations below edit an independent frame rather
        # than a selection of the original.
        lit = lit[cols].copy()
        # Every column is optional in the bundle, so guard each one before
        # touching it (not just 'journal').
        for col, width in (('title', 80), ('journal', 30)):
            if col in lit.columns:
                lit[col] = lit[col].str[:width]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
11. Knowledge graph edges (73 total; first 25 shown)¶
# Knowledge-graph edges embedded in the notebook, one dict per edge with
# 'source', 'relation', 'target' and 'strength' keys.
edge_data = [
    {'source': 'h-97aa8486', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.8},
    {'source': 'TDP-43', 'relation': 'regulates', 'target': 'cryptic_exon_silencing', 'strength': 0.8},
    {'source': 'G3BP1', 'relation': 'controls', 'target': 'stress_granule_formation', 'strength': 0.8},
    {'source': 'stress_granule_formation', 'relation': 'regulates', 'target': 'RNA_homeostasis', 'strength': 0.8},
    {'source': 'RNA_homeostasis', 'relation': 'disrupted_in', 'target': 'neurodegeneration', 'strength': 0.8},
    {'source': 'HNRNPA2B1', 'relation': 'mediates', 'target': 'axonal_RNA_transport', 'strength': 0.8},
    {'source': 'axonal_RNA_transport', 'relation': 'maintains', 'target': 'synaptic_function', 'strength': 0.8},
    {'source': 'SETX', 'relation': 'catalyzes', 'target': 'R-loop_resolution', 'strength': 0.8},
    {'source': 'R-loop_resolution', 'relation': 'maintains', 'target': 'genomic_stability', 'strength': 0.8},
    {'source': 'FUS', 'relation': 'mutation_causes', 'target': 'R-loop_accumulation', 'strength': 0.8},
    {'source': 'NPM1', 'relation': 'regulates', 'target': 'nucleolar_function', 'strength': 0.8},
    {'source': 'nucleolar_function', 'relation': 'controls', 'target': 'ribosome_biogenesis', 'strength': 0.8},
    {'source': 'cryptic_exon_silencing', 'relation': 'dysregulated_in', 'target': 'ALS', 'strength': 0.8},
    {'source': 'h-4fabd9ce', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.7},
    {'source': 'h-8196b893', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.7},
    {'source': 'h-eea667a9', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.68},
    {'source': 'h-c463d225', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.65},
    {'source': 'h-1e2bd420', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.6},
    {'source': 'HNRNPA2B1', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.57},
    {'source': 'SETX', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.54},
    {'source': 'h-ecacd219', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.5},
    {'source': 'SYNCRIP', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.49},
    {'source': 'HNRNPA2B1', 'relation': 'participates_in', 'target': 'RNA transport / hnRNP processi', 'strength': 0.45},
    {'source': 'SETX', 'relation': 'participates_in', 'target': 'Senataxin / R-loop resolution ', 'strength': 0.43},
    {'source': 'NPM1', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.42},
]
edge_table = None
if edge_data:
    edge_table = pd.DataFrame(edge_data).head(25)
else:
    print('No KG edge data available')
# Jupyter renders only a trailing top-level expression; an expression inside
# the `if` body above would be dropped silently. None renders nothing.
edge_table
12. Caveats¶
This notebook uses real Forge tool calls cached from live APIs, but:
- Enrichment is against curated gene-set libraries, not genome-wide screens
- STRING/Reactome/HPA/MyGene reflect curated knowledge
- PubMed literature is search-relevance ranked, not systematic review
The cached evidence bundle is the minimum viable real-data analysis for this topic.