Building a RAG Agent
Building a Complete RAG Agent
RAG (Retrieval-Augmented Generation) addresses two key limitations of LLMs: outdated knowledge and hallucination. By retrieving relevant documents before generating, the agent answers from evidence rather than from guesses based on its training data. This lesson builds a production-quality RAG agent from scratch.
The RAG Architecture
User Query
↓
[Retriever] → Vector search → Top-K relevant chunks
↓
[LLM] receives: user query + retrieved chunks
↓
Grounded answer with source citations
The LLM's instructions: "Answer the question using ONLY the provided context. If the context doesn't contain the answer, say so."
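This grounding is what greatly reduces hallucination on your domain-specific content: the model is told to answer from the retrieved text, and to say so when that text does not contain the answer.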
Part 1: Document Ingestion
import os
from pathlib import Path
from langchain_community.document_loaders import (
PyPDFLoader, DirectoryLoader, TextLoader, WebBaseLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
def build_knowledge_base(docs_path: str, db_path: str) -> Chroma:
    """
    Ingest PDFs into a persisted Chroma vector store and return it.

    Every file matching ``**/*.pdf`` under ``docs_path`` is loaded, split
    into overlapping chunks, embedded with ``text-embedding-3-small`` and
    stored under ``db_path``. Progress is reported with ``print``.
    Run this once (or again when the documents change).
    """
    # Load: DirectoryLoader hands each matching PDF to PyPDFLoader
    pdf_pages = DirectoryLoader(
        docs_path,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
    ).load()
    print(f"Loaded {len(pdf_pages)} document pages")

    # Split: add_start_index records "start_index" in each chunk's metadata
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        add_start_index=True,
    )
    pieces = chunker.split_documents(pdf_pages)
    print(f"Created {len(pieces)} chunks")

    # Embed + persist: collection is created with "hnsw:space" set to "cosine"
    store = Chroma.from_documents(
        documents=pieces,
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory=db_path,
        collection_metadata={"hnsw:space": "cosine"},
    )
    print(f"Knowledge base built: {store._collection.count()} vectors")
    return store
# Build once: ingestion is skipped when the store directory is already present
db_dir = Path("./knowledge_db")
if not db_dir.exists():
    vectorstore = build_knowledge_base("./documents", "./knowledge_db")
Part 2: The RAG Agent
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
def load_knowledge_base(db_path: str) -> Chroma:
    """Open the Chroma store persisted at ``db_path``.

    Queries are embedded with ``text-embedding-3-small``.
    """
    return Chroma(
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory=db_path,
    )
def format_docs(docs) -> str:
    """Format retrieved documents into a numbered context string for the prompt.

    Each document is rendered as ``[i] Source: <file>[, page <n>]`` followed
    by its text on the next line; entries are separated by a blank line and
    numbered from 1.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a string).

    Returns:
        The joined context block, or an empty string when ``docs`` is empty.
    """
    formatted = []
    for i, doc in enumerate(docs, 1):
        # Keep only the file name: the text after the last "/" in the source
        source = doc.metadata.get("source", "Unknown").split("/")[-1]
        page = doc.metadata.get("page")
        # Test for "missing" explicitly instead of relying on truthiness:
        # page 0 is a real page number and must still be shown.
        has_page = page is not None and page != ""
        location = f"{source}, page {page}" if has_page else f"{source}"
        formatted.append(f"[{i}] Source: {location}\n{doc.page_content}")
    return "\n\n".join(formatted)
class RAGAgent:
    """Conversational question-answering agent grounded in a persisted vector store.

    ``ask`` retrieves chunks through an MMR retriever, places them in the
    system prompt as numbered context, and appends the exchange to
    ``chat_history`` so later questions are answered with earlier turns in
    the prompt.
    """

    def __init__(self, db_path: str, model: str = "gpt-4o"):
        """Open the knowledge base at ``db_path`` and wire up retriever, LLM and chain.

        Args:
            db_path: Directory of the persisted vector store.
            model: Chat model name passed to ``ChatOpenAI``.
        """
        self.vectorstore = load_knowledge_base(db_path)
        self.retriever = self.vectorstore.as_retriever(
            search_type="mmr",  # Diverse results
            search_kwargs={"k": 5, "fetch_k": 15},
        )
        self.llm = ChatOpenAI(model=model, temperature=0)
        self.chain = self._build_chain()
        self.chat_history = []

    def _build_chain(self):
        """Build the prompt -> LLM -> string pipeline.

        The chain accepts ``question`` and ``chat_history``. An optional
        ``context`` key supplies pre-formatted context; when it is absent the
        chain retrieves and formats documents itself.
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a helpful assistant that answers questions based on the provided context.
Rules:
1. Answer ONLY using information from the provided context
2. If the context doesn't contain enough information, say: "I don't have enough information in my knowledge base to answer this."
3. Always cite which source [number] you're drawing from
4. Be concise and direct
Context:
{context}"""),
            MessagesPlaceholder("chat_history"),
            ("human", "{question}"),
        ])

        def resolve_context(inputs):
            # ask() passes the context it already retrieved so the search runs
            # only once per question; callers that invoke the chain directly
            # without "context" still get retrieval performed here.
            if "context" in inputs:
                return inputs["context"]
            return format_docs(self.retriever.invoke(inputs["question"]))

        chain = (
            {
                "context": resolve_context,
                "question": lambda x: x["question"],
                "chat_history": lambda x: x["chat_history"],
            }
            | prompt
            | self.llm
            | StrOutputParser()
        )
        return chain

    def ask(self, question: str) -> dict:
        """Ask a question and get an answer with sources.

        Returns:
            A dict with ``"answer"`` (the model's reply), ``"sources"``
            (unique source file names in retrieval order) and
            ``"retrieved_docs"`` (the documents given to the model).
        """
        from langchain_core.messages import HumanMessage, AIMessage

        # Retrieve once and reuse the same documents for both the prompt and
        # the reported sources, so the "[n]" citations in the answer refer to
        # exactly the documents returned to the caller.
        retrieved_docs = self.retriever.invoke(question)

        # Generate answer
        answer = self.chain.invoke({
            "question": question,
            "chat_history": self.chat_history,
            "context": format_docs(retrieved_docs),
        })

        # Update history
        self.chat_history.append(HumanMessage(content=question))
        self.chat_history.append(AIMessage(content=answer))

        # dict.fromkeys de-duplicates while keeping retrieval order, so the
        # source list is deterministic from run to run.
        sources = list(dict.fromkeys(
            doc.metadata.get("source", "Unknown").split("/")[-1]
            for doc in retrieved_docs
        ))
        return {
            "answer": answer,
            "sources": sources,
            "retrieved_docs": retrieved_docs,
        }
# Usage: ask one question, then show the answer and the files it drew on
agent = RAGAgent("./knowledge_db")
result = agent.ask("What is the refund policy for digital products?")
print(result["answer"])
print("\nSources: " + ", ".join(result["sources"]))
Part 3: Query Transformation
Improve retrieval by rewriting the user's question for better search:
def create_query_transformer(llm):
    """Build a function that expands a conversational question into search queries.

    The returned ``transform(question, chat_history)`` asks ``llm`` for
    alternative phrasings, using the last four history messages as context,
    and returns the original question followed by at most two of them.
    """
    rewrite_prompt = ChatPromptTemplate.from_template("""Given a conversation history and the latest question,
generate 3 different search queries to find relevant documents.
Each query should approach the topic from a slightly different angle.
Chat history: {chat_history}
Current question: {question}
Generate 3 search queries (one per line, no numbering or bullets):""")
    rewrite_chain = rewrite_prompt | llm | StrOutputParser()

    def transform(question: str, chat_history: list) -> list[str]:
        # Only the four most recent messages are rendered, one "type: content" line each
        recent_turns = chat_history[-4:]
        history_text = "\n".join(f"{m.type}: {m.content}" for m in recent_turns)
        raw_output = rewrite_chain.invoke({
            "question": question,
            "chat_history": history_text,
        })

        # One query per non-blank line of the model output
        alternatives = []
        for line in raw_output.strip().split("\n"):
            cleaned = line.strip()
            if cleaned:
                alternatives.append(cleaned)
        return [question, *alternatives[:2]]  # Original + 2 alternatives

    return transform
# Multi-query retrieval
def multi_query_retrieve(queries: list[str], retriever, max_docs: int = 8) -> list:
    """Run every query through ``retriever`` and merge the results without duplicates.

    Documents are kept in first-seen order (all new hits for the first query,
    then new hits for the second, and so on). Two documents count as
    duplicates when their ``page_content`` is identical.

    Args:
        queries: Search queries, highest priority first.
        retriever: Object whose ``invoke(query)`` returns documents exposing
            ``page_content``.
        max_docs: Upper bound on the number of documents returned.

    Returns:
        At most ``max_docs`` unique documents.
    """
    limit = max(max_docs, 0)
    unique_docs = []
    # Key on the text itself rather than hash(text): two different chunks
    # whose hashes collide would otherwise be treated as the same document.
    seen_contents = set()
    for query in queries:
        # Once the cap is met further retrieval calls cannot change the result
        if len(unique_docs) >= limit:
            break
        for doc in retriever.invoke(query):
            if doc.page_content in seen_contents:
                continue
            seen_contents.add(doc.page_content)
            unique_docs.append(doc)
    return unique_docs[:limit]  # Cap total retrieved docs
Part 4: Evaluation
Evaluate your RAG pipeline systematically:
def evaluate_rag(agent: "RAGAgent", test_set: list[dict]) -> dict:
    """
    Measure how often the agent's returned sources include the expected one.

    test_set: [{"question": "...", "source": "..."}]
        Only "question" and "source" are read; other keys are ignored.

    Returns a dict with:
        "correct_source_count": number of cases whose expected source was returned
        "total": number of cases
        "details": per-case question, answer (first 200 characters),
            hit flag and returned sources
        "source_precision": correct_source_count / total, or 0.0 when
            test_set is empty
    """
    results = {
        "correct_source_count": 0,
        "total": len(test_set),
        "details": []
    }

    # agent.ask() appends every exchange to chat_history. Start each case from
    # an empty history so one test question cannot influence the next, and put
    # the caller's own history back afterwards.
    has_history = hasattr(agent, "chat_history")
    saved_history = agent.chat_history if has_history else None
    try:
        for case in test_set:
            if has_history:
                agent.chat_history = []
            result = agent.ask(case["question"])
            source_correct = case["source"] in result["sources"]
            if source_correct:
                results["correct_source_count"] += 1
            results["details"].append({
                "question": case["question"],
                "answer": result["answer"][:200],
                "source_correct": source_correct,
                "returned_sources": result["sources"]
            })
    finally:
        if has_history:
            agent.chat_history = saved_history

    # Guard the division: an empty test set scores 0.0 instead of raising
    total = results["total"]
    results["source_precision"] = results["correct_source_count"] / total if total else 0.0
    print(f"Source precision: {results['source_precision']:.1%}")
    return results
# Each case pairs a question with the file expected among the returned sources
test_set = [
    {"question": question, "source": source}
    for question, source in (
        ("How many vacation days do new employees receive?", "hr_policy.pdf"),
        ("What's the process for requesting a software license?", "it_policy.pdf"),
    )
]
eval_results = evaluate_rag(agent, test_set)
Common RAG Failure Modes and Fixes
| Problem | Symptom | Fix |
|---|---|---|
| Retrieval misses | Agent says "I don't have info" when you know it's in the docs | Smaller chunks, more overlap, check ingestion |
| Hallucination | Agent answers with wrong details | Stricter system prompt; check each answer against the cited source chunk in the retrieved context |
| Context overflow | Irrelevant retrieved chunks hurt quality | Reduce k, add metadata filtering, use MMR |
| Multi-hop questions | Agent can't answer questions needing 2+ documents | Multi-query retrieval, or break question into sub-questions |
| History confusion | Agent loses track of the conversation | Keep history shorter, summarize older turns |
Next lesson: LangGraph introduction — building stateful, cyclical agent workflows.
Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises