Skip to main content

Research Knowledge Graph: Papers, Authors, Topics, Citations

Academic research is inherently graph-shaped. A paper has authors. Authors belong to institutions. Papers cite other papers. Papers cover topics. Topics overlap. A flat document store loses all of that structure.

This tutorial builds a scholarly knowledge graph that enables citation traversal, co-author discovery, topical clustering, and semantic retrieval.


Graph shape

| Label | What it represents |
| --- | --- |
| PAPER | A research paper with title, abstract, year, DOI |
| AUTHOR | A researcher or collaborator |
| INSTITUTION | University, lab, or company |
| TOPIC | A subject area or keyword cluster |

Step 1: Ingest papers and authors

from rushdb import RushDB
import os

# Create the RushDB client. The API key is read from the RUSHDB_API_KEY
# environment variable; a missing variable raises KeyError here.
db = RushDB(os.environ["RUSHDB_API_KEY"], base_url="https://api.rushdb.com/api/v1")

# Seed data grouped by record label. Dict insertion order is the import
# order: PAPER first, then AUTHOR, then TOPIC.
seed_records = {
    "PAPER": [
        {
            "doi": "10.1000/xyz001",
            "title": "Graph Databases for Scientific Knowledge Representation",
            "abstract": "This paper surveys the use of graph databases in representing and querying scientific knowledge.",
            "year": 2023,
            "venue": "VLDB",
            "citationCount": 47,
        },
        {
            "doi": "10.1000/xyz002",
            "title": "Neural Retrieval Augmentation with Knowledge Graphs",
            "abstract": "We propose a retrieval augmentation framework combining dense vector search with graph traversal.",
            "year": 2024,
            "venue": "NeurIPS",
            "citationCount": 112,
        },
        {
            "doi": "10.1000/xyz003",
            "title": "Scalable Graph Construction from Unstructured Text",
            "abstract": "A pipeline for extracting entities and relationships from scientific text.",
            "year": 2024,
            "venue": "ACL",
            "citationCount": 29,
        },
    ],
    "AUTHOR": [
        {"name": "Dr. Yuki Tanaka", "email": "y.tanaka@uni.edu", "hIndex": 18},
        {"name": "Prof. Lena Müller", "email": "l.muller@institute.de", "hIndex": 34},
        {"name": "Dr. Carlos Reyes", "email": "c.reyes@lab.com", "hIndex": 12},
    ],
    "TOPIC": [
        {"name": "graph databases", "category": "systems"},
        {"name": "knowledge representation", "category": "ai"},
        {"name": "retrieval augmented generation", "category": "nlp"},
        {"name": "information extraction", "category": "nlp"},
    ],
}

# One import_json call per label, each carrying that label's rows.
for label, rows in seed_records.items():
    db.records.import_json({"label": label, "data": rows})

Step 2: Build the relationship graph

# Fetch the records imported in Step 1 so their ids can be used for linking.
papers = db.records.find({"labels": ["PAPER"]})
authors = db.records.find({"labels": ["AUTHOR"]})
topics = db.records.find({"labels": ["TOPIC"]})

# Index each label by a natural key: DOI for papers, email for authors,
# name for topics.
paper_map = {record.data["doi"]: record for record in papers.data}
author_map = {record.data["email"]: record for record in authors.data}
topic_map = {record.data["name"]: record for record in topics.data}

# Every edge starts at a paper. Each row is
# (source DOI, lookup map for the target, target key, relationship type).
edges = [
    ("10.1000/xyz001", author_map, "y.tanaka@uni.edu", "AUTHORED_BY"),
    ("10.1000/xyz001", author_map, "l.muller@institute.de", "AUTHORED_BY"),
    ("10.1000/xyz002", author_map, "l.muller@institute.de", "AUTHORED_BY"),
    ("10.1000/xyz002", author_map, "c.reyes@lab.com", "AUTHORED_BY"),
    ("10.1000/xyz002", paper_map, "10.1000/xyz001", "CITES"),
    ("10.1000/xyz001", topic_map, "graph databases", "COVERS"),
    ("10.1000/xyz002", topic_map, "retrieval augmented generation", "COVERS"),
]

# Attach edges in the listed order; all of them use direction "out".
for source_doi, target_map, target_key, relation_type in edges:
    db.records.attach(
        paper_map[source_doi].id,
        target_map[target_key].id,
        {"type": relation_type, "direction": "out"},
    )

Step 3: Citation traversal queries

# Papers citing xyz001: PAPER records with an outgoing CITES relation to the
# PAPER whose DOI is 10.1000/xyz001, ordered by citationCount descending.
cites_xyz001 = {
    "PAPER": {
        "$relation": {"type": "CITES", "direction": "out"},
        "doi": "10.1000/xyz001",
    }
}
citing = db.records.find({
    "labels": ["PAPER"],
    "where": cites_xyz001,
    "orderBy": {"citationCount": "desc"},
})

# All papers by Lena Müller: PAPER records with an outgoing AUTHORED_BY
# relation to the AUTHOR with her email, newest year first.
authored_by_muller = {
    "AUTHOR": {
        "$relation": {"type": "AUTHORED_BY", "direction": "out"},
        "email": "l.muller@institute.de",
    }
}
muller_papers = db.records.find({
    "labels": ["PAPER"],
    "where": authored_by_muller,
    "orderBy": {"year": "desc"},
})

# Co-authors: AUTHOR records linked to a PAPER that is itself linked to
# Müller. The top-level $ne filter keeps Müller out of her own result set.
shares_paper_with_muller = {
    "PAPER": {
        "$relation": {"type": "AUTHORED_BY", "direction": "in"},
        "AUTHOR": {
            "$relation": {"type": "AUTHORED_BY", "direction": "out"},
            "email": "l.muller@institute.de",
        },
    },
    "email": {"$ne": "l.muller@institute.de"},
}
co_authors = db.records.find({
    "labels": ["AUTHOR"],
    "where": shares_paper_with_muller,
})

co_author_names = [author.data.get("name") for author in co_authors.data]
print("Co-authors:", co_author_names)

Step 4: Semantic search over abstracts

Enable semantic search to retrieve papers by conceptual relevance rather than keyword matching.

import time

# Request a semantic index over the `abstract` property of PAPER records.
db.ai.indexes.create({"label": "PAPER", "propertyName": "abstract"})

# Poll until the index reports every record as indexed. The wait is bounded
# so that a stalled indexing job fails loudly instead of hanging the script.
INDEX_TIMEOUT_SECONDS = 120
POLL_INTERVAL_SECONDS = 2
deadline = time.monotonic() + INDEX_TIMEOUT_SECONDS

while True:
    stats = db.ai.indexes.stats("PAPER")
    # NOTE(review): if both keys are absent, .get() returns None for each and
    # this comparison passes immediately -- confirm the stats payload shape.
    if stats.data.get("indexedRecords") == stats.data.get("totalRecords"):
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"PAPER abstract index not ready after {INDEX_TIMEOUT_SECONDS}s"
        )
    time.sleep(POLL_INTERVAL_SECONDS)

# Semantic query over abstracts, restricted to papers from 2023 onward and
# capped at five results.
results = db.ai.search({
    "query": "combining structured graphs with neural retrieval",
    "propertyName": "abstract",
    "labels": ["PAPER"],
    "where": {"year": {"$gte": 2023}},
    "limit": 5,
})

# Print one line per hit: year, title, and the score formatted to 3 decimals.
for paper in results.data:
    print(f"[{paper.get('year')}] {paper.get('title')} — score: {paper.score:.3f}")

Production caveat

Citation graphs become highly connected over time. Deep traversal queries (papers that cite papers that cite papers) fan out exponentially. All queries in these examples traverse at most two hops. Design your search queries to bound depth by using intermediate label filters rather than chaining open-ended relationship traversals.


Next steps