Retrieval-Augmented Generation (RAG)
Track retrieval context and vector search in RAG applications.
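The examples below assume an initialized SDK session, an OpenAI client, and a search client whose hits carry an id, a relevance score, and the document text. A minimal sketch of that hit shape (illustrative only, not part of the SDK):
interface RetrievalHit {
  id: string
  score: number // relevance, higher is better
  text: string  // document content
}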
Basic RAG Pattern
const turn = session.turn()
try {
// 1. Retrieve context
const retrievalResults = await turn.recordTool(
'vector_search',
async () => {
return await vectorDB.search(userMessage, { limit: 5 })
},
{ type: 'vector_search', target: 'pinecone', version: 'v1' }
)
// 2. Add retrieval metadata
turn.addRetrieval({
source: 'documentation',
query: userMessage,
results: retrievalResults.map(r => ({
id: r.id,
score: r.score,
content: r.text.substring(0, 200) // Truncate for storage
}))
})
// 3. Generate with context
const context = retrievalResults.map(r => r.text).join('\n\n')
const response = await turn.wrapLLM(
async () => {
return await openai.chat.completions.create({
model: 'gpt-4o',
messages: [
{ role: 'system', content: `Context:\n${context}` },
{ role: 'user', content: userMessage }
],
})
},
{ model: 'gpt-4o', prompt_id: 'rag_v1' }
)
turn.setMessages([
{ role: 'user', content: userMessage },
{ role: 'assistant', content: response.choices[0].message.content || '' }
])
// 4. Annotate with RAG metadata
turn.annotate({
contextLength: context.length,
documentsRetrieved: retrievalResults.length,
avgRelevanceScore: retrievalResults.length > 0
  ? retrievalResults.reduce((sum, r) => sum + r.score, 0) / retrievalResults.length
  : 0 // guard against division by zero when nothing was retrieved
})
} finally {
await turn.finish()
}
Multi-Step RAG
Track each stage of a multi-step retrieval pipeline: initial retrieval, reranking, and threshold filtering. The example assumes a reranker client that rescores hits against the query; a minimal interface sketch (illustrative, Cohere-style):
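// Illustrative reranker interface, using the RetrievalHit shape above.
interface Reranker {
  // Returns the input hits rescored against the query, best-first.
  rerank(query: string, hits: RetrievalHit[]): Promise<RetrievalHit[]>
}
Each stage is then recorded as its own tool call: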
const turn = session.turn()
try {
// Step 1: Initial retrieval
const initialResults = await turn.recordTool(
'initial_search',
async () => {
return await vectorDB.search(userMessage, { limit: 20 })
},
{ type: 'vector_search', target: 'pinecone', version: 'v1', step: 'initial' }
)
// Step 2: Rerank results
const rerankedResults = await turn.recordTool(
'rerank',
async () => {
return await reranker.rerank(userMessage, initialResults)
},
{ type: 'rerank', target: 'cohere', version: 'v1', step: 'rerank' }
)
// Step 3: Filter by threshold
const finalResults = rerankedResults.filter(r => r.score > 0.7)
turn.addRetrieval({
source: 'documentation',
query: userMessage,
results: finalResults.map(r => ({
id: r.id,
score: r.score,
content: r.text.substring(0, 200)
}))
})
// Generate with filtered context
const context = finalResults.map(r => r.text).join('\n\n')
const response = await turn.wrapLLM(
async () => {
return await openai.chat.completions.create({
model: 'gpt-4o',
messages: [
{ role: 'system', content: `Context:\n${context}` },
{ role: 'user', content: userMessage }
],
})
},
{ model: 'gpt-4o', prompt_id: 'rag_v1' }
)
turn.setMessages([
{ role: 'user', content: userMessage },
{ role: 'assistant', content: response.choices[0].message.content || '' }
])
turn.annotate({
retrieval: {
initialCount: initialResults.length,
rerankedCount: rerankedResults.length,
finalCount: finalResults.length,
threshold: 0.7,
}
})
} finally {
await turn.finish()
}
Hybrid Search (Dense + Sparse)
Combine dense vector search with sparse keyword search (BM25). The example merges and deduplicates hits with a mergeResults helper; a minimal sketch (the helper and its source tag are illustrative):
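// Sketch: deduplicate hits from both retrievers by id, keeping the
// higher score and tagging each hit with its originating source.
// Note: this assumes vector and keyword scores are on a comparable
// scale; in practice you might normalize or use reciprocal rank fusion.
function mergeResults(
  vector: RetrievalHit[],
  keyword: RetrievalHit[]
): (RetrievalHit & { source: 'vector' | 'keyword' })[] {
  const byId = new Map<string, RetrievalHit & { source: 'vector' | 'keyword' }>()
  for (const r of vector) byId.set(r.id, { ...r, source: 'vector' })
  for (const r of keyword) {
    const existing = byId.get(r.id)
    if (!existing || r.score > existing.score) {
      byId.set(r.id, { ...r, source: 'keyword' })
    }
  }
  return [...byId.values()].sort((a, b) => b.score - a.score) // best-first
}
With that helper in place: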
const turn = session.turn()
try {
// Dense retrieval (vector search)
const vectorResults = await turn.recordTool(
'vector_search',
async () => {
return await vectorDB.search(userMessage, { limit: 10 })
},
{ type: 'vector_search', target: 'pinecone', version: 'v1' }
)
// Sparse retrieval (BM25 keyword search)
const keywordResults = await turn.recordTool(
'keyword_search',
async () => {
return await elasticSearch.search(userMessage, { limit: 10 })
},
{ type: 'keyword_search', target: 'elasticsearch', version: 'v1' }
)
// Merge and deduplicate results
const mergedResults = mergeResults(vectorResults, keywordResults)
turn.addRetrieval({
source: 'documentation',
query: userMessage,
results: mergedResults.map(r => ({
id: r.id,
score: r.score,
content: r.text.substring(0, 200),
source: r.source // 'vector' or 'keyword'
}))
})
// Generate
const context = mergedResults.map(r => r.text).join('\n\n')
const response = await turn.wrapLLM(
async () => {
return await openai.chat.completions.create({
model: 'gpt-4o',
messages: [
{ role: 'system', content: `Context:\n${context}` },
{ role: 'user', content: userMessage }
],
})
},
{ model: 'gpt-4o', prompt_id: 'hybrid_rag_v1' }
)
turn.setMessages([
{ role: 'user', content: userMessage },
{ role: 'assistant', content: response.choices[0].message.content || '' }
])
turn.annotate({
hybridSearch: {
vectorResultsCount: vectorResults.length,
keywordResultsCount: keywordResults.length,
mergedCount: mergedResults.length,
}
})
} finally {
await turn.finish()
}
Query Transformation
Track query rewriting and expansion. Expansion fans one query out into several search variants; a minimal sketch with a hypothetical expandQuery helper (not part of the SDK) is shown first:
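// Sketch only: query expansion via a small model. The prompt and helper
// are illustrative; record the call with turn.recordTool just like the
// rewrite step below.
async function expandQuery(query: string): Promise<string[]> {
  const response = await openai.chat.completions.create({
    model: 'gpt-4o-mini',
    messages: [{
      role: 'user',
      content: `Generate 3 alternative search queries for: ${query}\nReturn one per line.`,
    }],
  })
  const text = response.choices[0].message.content ?? ''
  // Keep the original query plus any non-empty variants
  return [query, ...text.split('\n').map(q => q.trim()).filter(Boolean)]
}
Rewriting follows the same pattern, recorded as a tool call on the turn: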
const turn = session.turn()
try {
// Rewrite query for better retrieval
const rewrittenQuery = await turn.recordTool(
'query_rewrite',
async () => {
const response = await openai.chat.completions.create({
model: 'gpt-4o-mini',
messages: [{
role: 'user',
content: `Rewrite this query for better document retrieval: ${userMessage}`
}],
})
// Fall back to the original query if the model returns no content
return response.choices[0].message.content ?? userMessage
},
{ type: 'query_transform', target: 'openai', version: 'v1' }
)
// Retrieve with transformed query
const results = await turn.recordTool(
'vector_search',
async () => {
return await vectorDB.search(rewrittenQuery, { limit: 5 })
},
{ type: 'vector_search', target: 'pinecone', version: 'v1' }
)
turn.addRetrieval({
source: 'documentation',
query: rewrittenQuery,
originalQuery: userMessage,
results: results.map(r => ({
id: r.id,
score: r.score,
content: r.text.substring(0, 200)
}))
})
// Generate
const context = results.map(r => r.text).join('\n\n')
const response = await turn.wrapLLM(
async () => {
return await openai.chat.completions.create({
model: 'gpt-4o',
messages: [
{ role: 'system', content: `Context:\n${context}` },
{ role: 'user', content: userMessage }
],
})
},
{ model: 'gpt-4o', prompt_id: 'rag_v1' }
)
turn.setMessages([
{ role: 'user', content: userMessage },
{ role: 'assistant', content: response.choices[0].message.content || '' }
])
turn.annotate({
queryTransform: {
original: userMessage,
rewritten: rewrittenQuery,
}
})
} finally {
await turn.finish()
}
Citations and Source Tracking
Track which sources were cited in the response:
const turn = session.turn()
try {
const results = await turn.recordTool(
'vector_search',
async () => {
return await vectorDB.search(userMessage, { limit: 5 })
},
{ type: 'vector_search', target: 'pinecone', version: 'v1' }
)
// Create numbered references
const context = results.map((r, i) =>
`[${i + 1}] ${r.text}`
).join('\n\n')
const response = await turn.wrapLLM(
async () => {
return await openai.chat.completions.create({
model: 'gpt-4o',
messages: [
{
role: 'system',
content: `Context:\n${context}\n\nCite sources using [1], [2], etc.`
},
{ role: 'user', content: userMessage }
],
})
},
{ model: 'gpt-4o', prompt_id: 'rag_with_citations' }
)
const assistantMessage = response.choices[0].message.content || ''
// Extract citations from response
const citations = extractCitations(assistantMessage)
turn.addRetrieval({
source: 'documentation',
query: userMessage,
results: results.map((r, i) => ({
id: r.id,
score: r.score,
content: r.text.substring(0, 200),
cited: citations.includes(i + 1)
}))
})
turn.setMessages([
{ role: 'user', content: userMessage },
{ role: 'assistant', content: assistantMessage }
])
turn.annotate({
citations: {
total: citations.length,
sources: citations,
}
})
} finally {
await turn.finish()
}
function extractCitations(text: string): number[] {
  // Find bracketed numbers like [1] or [12] and dedupe them,
  // e.g. extractCitations('See [1] and [3].') → [1, 3]
  const matches = text.match(/\[(\d+)\]/g) ?? []
  return [...new Set(matches.map(m => parseInt(m.slice(1, -1), 10)))]
}
Best Practices
1. Always Track Retrieval Metadata
// ✅ Good: Track what was retrieved
turn.addRetrieval({
source: 'documentation',
query: userMessage,
results: retrievalResults.map(r => ({
id: r.id,
score: r.score,
content: r.text.substring(0, 200)
}))
})
2. Record Each Retrieval Step
// ✅ Good: Record each step separately
await turn.recordTool('initial_search', () => vectorDB.search(...))
await turn.recordTool('rerank', () => reranker.rerank(...))
await turn.recordTool('filter', () => filter(...))
3. Annotate with Retrieval Metrics
turn.annotate({
retrieval: {
documentsRetrieved: results.length,
avgRelevanceScore: avgScore,
contextLength: context.length,
threshold: scoreThreshold,
}
})
4. Truncate Large Context
Don't store entire documents in retrieval results:
// ✅ Good: Truncate content
content: r.text.substring(0, 200)
// ❌ Bad: Store entire document
content: r.text // Could be 10KB+
Next Steps
- Streaming - Handle streaming responses
- Multi-Model - Multi-step AI workflows
- Best Practices - General SDK best practices