
RAG Evaluation

Retrieval quality degrades silently: a new embedding model changes the vector space, a chunk-size change reduces recall, or an index backfill problem leaves some records un-vectorized. This tutorial builds a lightweight evaluation harness that measures Precision@k and Recall@k for your RushDB vector search pipeline and plugs into CI, so regressions are caught before they reach production.


Concepts

| Metric      | Definition                                            | Formula                                 |
| ----------- | ----------------------------------------------------- | --------------------------------------- |
| Precision@k | Of the k results returned, how many are relevant?     | relevant hits in top k / k              |
| Recall@k    | Of all relevant items, how many appear in the top k?  | relevant hits in top k / total relevant |

A well-tuned RAG retriever should maintain Precision@5 ≥ 0.6 and Recall@5 ≥ 0.5 for your domain. The exact thresholds depend on your corpus — establish a baseline first, then track drift.
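
A quick worked example: if a query has four relevant records in the corpus and three of them appear in the top 5 results, Precision@5 = 3 / 5 = 0.6 and Recall@5 = 3 / 4 = 0.75.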


Step 1: Build a ground-truth dataset

Create a small set of evaluation queries paired with expected top record IDs. Aim for 20–50 queries covering your key topic areas.

# eval_dataset.py
EVAL_DATASET = [
    {
        'query': 'how to set up a self-hosted RushDB instance',
        'expected_ids': [
            'record-id-docker-guide',
            'record-id-env-vars',
            'record-id-first-boot'
        ]
    },
    {
        'query': 'connecting a Neo4j Aura database',
        'expected_ids': [
            'record-id-aura-setup',
            'record-id-bolt-url'
        ]
    },
    {
        'query': 'external embedding indexes BYOV',
        'expected_ids': [
            'record-id-byov-intro',
            'record-id-external-index',
            'record-id-vectors-key'
        ]
    }
]

Bootstrapping expected IDs

If you don't have ground-truth IDs yet, run a baseline search, manually review the results, and promote the relevant ones to your dataset:

import os
from rushdb import RushDB

db = RushDB(os.environ['RUSHDB_API_KEY'])

hits = db.ai.search(
    query='how to set up a self-hosted instance',
    labels=['DOC_CHUNK'],
    property_name='text',
    limit=10
)

for h in hits:
    print(h.__id, f'{h.__score:.3f}', h.text[:80])

# Manually note which IDs are relevant → add to EVAL_DATASET

Step 2: Implement the metrics

# metrics.py
def precision_at_k(retrieved_ids: list[str], relevant_ids: list[str], k: int) -> float:
    top_k = retrieved_ids[:k]
    relevant_set = set(relevant_ids)
    hits = sum(1 for id_ in top_k if id_ in relevant_set)
    return hits / k

def recall_at_k(retrieved_ids: list[str], relevant_ids: list[str], k: int) -> float:
    if not relevant_ids:
        return 1.0
    top_k = retrieved_ids[:k]
    relevant_set = set(relevant_ids)
    hits = sum(1 for id_ in top_k if id_ in relevant_set)
    return hits / len(relevant_ids)

def mean_metric(values: list[float]) -> float:
    return sum(values) / len(values) if values else 0.0
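
A quick sanity check with made-up record IDs (purely illustrative) confirms the functions behave as expected:

from metrics import precision_at_k, recall_at_k

retrieved = ['a', 'b', 'c', 'd', 'e']   # hypothetical record IDs
relevant = ['b', 'e', 'x', 'y']

assert precision_at_k(retrieved, relevant, 5) == 0.4  # 2 of the 5 retrieved are relevant
assert recall_at_k(retrieved, relevant, 5) == 0.5     # 2 of the 4 relevant were retrieved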

Step 3: Run the evaluation

# evaluate.py
import os
from rushdb import RushDB
from eval_dataset import EVAL_DATASET
from metrics import precision_at_k, recall_at_k, mean_metric

db = RushDB(os.environ['RUSHDB_API_KEY'])
K = 5

def evaluate():
    precisions = []
    recalls = []

    for item in EVAL_DATASET:
        hits = db.ai.search(
            query=item['query'],
            labels=['DOC_CHUNK'],
            property_name='text',
            limit=K
        )
        retrieved_ids = [h.__id for h in hits]
        precisions.append(precision_at_k(retrieved_ids, item['expected_ids'], K))
        recalls.append(recall_at_k(retrieved_ids, item['expected_ids'], K))

    results = {
        'k': K,
        'queries': len(EVAL_DATASET),
        'mean_precision': mean_metric(precisions),
        'mean_recall': mean_metric(recalls),
        'per_query': [
            {
                'query': EVAL_DATASET[i]['query'],
                'precision': precisions[i],
                'recall': recalls[i]
            }
            for i in range(len(EVAL_DATASET))
        ]
    }

    print(f"\nEvaluation Results (k={K})")
    print(f"Mean Precision@{K}: {results['mean_precision']:.3f}")
    print(f"Mean Recall@{K}: {results['mean_recall']:.3f}")

    return results
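
evaluate.py is imported by the CI script in Step 5. If you also want to run it on its own, a standard entry-point guard (not shown in the module above) can be appended:

# Optional addition at the bottom of evaluate.py
if __name__ == '__main__':
    evaluate()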

Step 4: Score drift detection

Track __score for a fixed set of queries to detect when your embedding model or index quality changes.

# score_drift.py
import json

SNAPSHOT_FILE = './eval_score_snapshot.json'

def capture_score_snapshot(db, queries: list[str]) -> dict:
    snapshot = {}
    for query in queries:
        hits = db.ai.search(
            query=query,
            labels=['DOC_CHUNK'],
            property_name='text',
            limit=5
        )
        snapshot[query] = [{'id': h.__id, 'score': h.__score} for h in hits]

    with open(SNAPSHOT_FILE, 'w') as f:
        json.dump(snapshot, f, indent=2)
    print('Score snapshot saved.')
    return snapshot

def compare_snapshots(baseline: dict, current: dict, drift_threshold: float = 0.05) -> list:
    drifts = []
    for query, base_results in baseline.items():
        if query not in current:
            continue
        base_scores = {r['id']: r['score'] for r in base_results}
        for result in current[query]:
            id_ = result['id']
            if id_ in base_scores:
                delta = abs(result['score'] - base_scores[id_])
                if delta > drift_threshold:
                    drifts.append({
                        'query': query,
                        'id': id_,
                        'baseline': base_scores[id_],
                        'current': result['score'],
                        'delta': delta
                    })
    return drifts
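
Wiring the two helpers together, a minimal driver might look like the sketch below. The file name drift_check.py is hypothetical, and it assumes a baseline snapshot was captured earlier and committed alongside the evaluation dataset:

# drift_check.py — minimal sketch combining the snapshot helpers
import json
import os
from rushdb import RushDB
from eval_dataset import EVAL_DATASET
from score_drift import SNAPSHOT_FILE, capture_score_snapshot, compare_snapshots

db = RushDB(os.environ['RUSHDB_API_KEY'])
queries = [item['query'] for item in EVAL_DATASET]

# Read the baseline into memory first; capture_score_snapshot overwrites SNAPSHOT_FILE
with open(SNAPSHOT_FILE) as f:
    baseline = json.load(f)

current = capture_score_snapshot(db, queries)
drifts = compare_snapshots(baseline, current)

for d in drifts:
    print(f"drift on \"{d['query']}\" ({d['id']}): "
          f"{d['baseline']:.3f} → {d['current']:.3f} (Δ {d['delta']:.3f})")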

Step 5: CI regression test

Plug the evaluation harness into your CI pipeline so merges that degrade retrieval quality are blocked.

# eval_ci.py  (run with: python eval_ci.py)
import sys
from evaluate import evaluate

PRECISION_THRESHOLD = 0.60
RECALL_THRESHOLD = 0.50

results = evaluate()

passed = (
    results['mean_precision'] >= PRECISION_THRESHOLD and
    results['mean_recall'] >= RECALL_THRESHOLD
)

if not passed:
    print('\n❌ Retrieval regression detected!')
    print(f"  Precision@{results['k']}: {results['mean_precision']:.3f} (threshold: {PRECISION_THRESHOLD})")
    print(f"  Recall@{results['k']}: {results['mean_recall']:.3f} (threshold: {RECALL_THRESHOLD})")

    worst = sorted(results['per_query'], key=lambda q: q['precision'])[:3]
    print('\nWorst-performing queries:')
    for q in worst:
        print(f"  \"{q['query']}\" → P={q['precision']:.2f}, R={q['recall']:.2f}")

    sys.exit(1)

print(f"\n✓ Retrieval quality OK (P@{results['k']}={results['mean_precision']:.3f}, R@{results['k']}={results['mean_recall']:.3f})")
sys.exit(0)

GitHub Actions integration

# .github/workflows/rag-eval.yml
name: RAG Evaluation

on:
  pull_request:
    paths:
      - 'src/**' # Adjust to your source paths

jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: pip install -r requirements.txt # Assumes a requirements file listing the harness dependencies

      - name: Run RAG evaluation
        env:
          RUSHDB_API_KEY: ${{ secrets.RUSHDB_API_KEY }}
        run: python eval_ci.py

Step 6: Track results over time

Save evaluation results as JSON artifacts to build a history of retrieval quality.

import json, os
from datetime import datetime, timezone
from evaluate import evaluate

results = evaluate()

entry = {
    'timestamp': datetime.now(timezone.utc).isoformat(),
    'commit': os.environ.get('GITHUB_SHA', 'local'),
    **results
}

with open('./eval_history.jsonl', 'a') as f:
    f.write(json.dumps(entry) + '\n')
print('Results saved to eval_history.jsonl')
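
Once a few runs have accumulated, the history file can be reviewed directly. Here is a minimal sketch (the file name and column layout are just one way to do it) that prints the recorded precision and recall per commit:

# print_history.py — minimal sketch for summarizing the recorded history
import json

with open('./eval_history.jsonl') as f:
    entries = [json.loads(line) for line in f if line.strip()]

print(f"{'timestamp':<28} {'commit':<12} {'P@k':>6} {'R@k':>6}")
for e in entries:
    print(f"{e['timestamp']:<28} {e['commit'][:10]:<12} "
          f"{e['mean_precision']:>6.3f} {e['mean_recall']:>6.3f}")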

Interpreting results

| Signal                          | Likely cause                | Action                                                                 |
| ------------------------------- | --------------------------- | ---------------------------------------------------------------------- |
| Low Precision@k (< 0.4)         | Too many off-topic results  | Reduce limit, tighten where filters, or increase the score threshold   |
| Low Recall@k (< 0.4)            | Missing relevant chunks     | Check index status (ready?), increase limit, review chunking strategy  |
| Score drift across a PR         | Embedding model update      | Review BYOV or model config change; re-evaluate thresholds             |
| Specific query clusters failing | Coverage gap in corpus      | Ingest additional source material for those topics                     |
| Single label dominating results | Label imbalance             | Balance the corpus, or search labels individually and then merge       |
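
Several of the actions above amount to post-filtering the search results before they are scored. A minimal sketch of a score cutoff (the 0.75 value is purely illustrative; tune it against your baseline):

import os
from rushdb import RushDB

db = RushDB(os.environ['RUSHDB_API_KEY'])

MIN_SCORE = 0.75  # illustrative cutoff, not a recommended default

hits = db.ai.search(
    query='how to set up a self-hosted instance',
    labels=['DOC_CHUNK'],
    property_name='text',
    limit=10
)

# Drop low-confidence hits before computing Precision@k / Recall@k
retrieved_ids = [h.__id for h in hits if h.__score >= MIN_SCORE]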

Tips

  • Start with k=5 or k=10. Going higher inflates recall trivially and makes the metric less useful for catching real degradation.
  • Weight your queries. If some queries are business-critical, run a weighted mean rather than a simple average (see the sketch after this list).
  • Keep ground truth in version control. It becomes part of your specification — changes to expected IDs should be deliberate.
  • Re-validate ground truth after major corpus updates. Expected IDs may no longer exist after a data migration.
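
For the weighted-mean tip, a minimal sketch: the weight key is a hypothetical addition to each EVAL_DATASET entry (defaulting to 1.0 when absent), and precisions/recalls are the per-query lists built inside evaluate():

def weighted_mean(values: list[float], weights: list[float]) -> float:
    total = sum(weights)
    return sum(v * w for v, w in zip(values, weights)) / total if total else 0.0

# 'weight' is a hypothetical extra key on each EVAL_DATASET entry
weights = [item.get('weight', 1.0) for item in EVAL_DATASET]
weighted_precision = weighted_mean(precisions, weights)
weighted_recall = weighted_mean(recalls, weights)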

Next steps

  • RAG Reranking — improve Precision@k by adding a second retrieval stage
  • Multi-Source RAG — extend evaluation to cross-label retrieval
  • GraphRAG — measure enrichment quality alongside base retrieval metrics