Building a RAG Agent | AI Agent Development Course | AiTechWorlds

Building a Complete RAG Agent

RAG (Retrieval-Augmented Generation) addresses two key limitations of LLMs: outdated knowledge and hallucination. By retrieving relevant documents before generating, the agent answers from evidence rather than guessing from its training data. This lesson builds a production-quality RAG agent from scratch.

The RAG Architecture

User Query
    ↓
[Retriever] → Vector search → Top-K relevant chunks
    ↓
[LLM] receives: user query + retrieved chunks
    ↓
Grounded answer with source citations

The LLM's instructions: "Answer the question using ONLY the provided context. If the context doesn't contain the answer, say so."

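This grounding is what keeps the model from hallucinating about your domain-specific content: answers must come from the retrieved text, not from the model's memory. It sharply reduces hallucination but cannot fully eliminate it, which is why Part 4 evaluates the pipeline.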

Part 1: Document Ingestion

import os
from pathlib import Path
from langchain_community.document_loaders import (
    PyPDFLoader, DirectoryLoader, TextLoader, WebBaseLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

def build_knowledge_base(docs_path: str, db_path: str) -> Chroma:
    """Ingest every PDF under ``docs_path`` into a Chroma store at ``db_path``.

    The pipeline is load -> split -> embed -> persist. It only needs to run
    once, and again whenever the source documents change.

    Args:
        docs_path: Directory searched recursively for ``*.pdf`` files.
        db_path: Directory handed to Chroma as its persist directory.

    Returns:
        The Chroma vector store created from the document chunks.
    """
    # Step 1: load the PDFs found anywhere below docs_path.
    pages = DirectoryLoader(
        docs_path,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
    ).load()
    print(f"Loaded {len(pages)} document pages")

    # Step 2: cut the pages into overlapping chunks; add_start_index records
    # a "start_index" entry in each chunk's metadata.
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        add_start_index=True,
    ).split_documents(pages)
    print(f"Created {len(pieces)} chunks")

    # Step 3: embed the chunks and persist them; the collection metadata
    # requests "cosine" as the HNSW space.
    store = Chroma.from_documents(
        documents=pieces,
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory=db_path,
        collection_metadata={"hnsw:space": "cosine"},
    )

    vector_total = store._collection.count()
    print(f"Knowledge base built: {vector_total} vectors")
    return store

# Build once: an existing ./knowledge_db directory is taken to mean "already ingested"
knowledge_db_missing = not Path("./knowledge_db").exists()
if knowledge_db_missing:
    vectorstore = build_knowledge_base("./documents", "./knowledge_db")

Part 2: The RAG Agent

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

def load_knowledge_base(db_path: str) -> Chroma:
    """Open the Chroma store persisted at ``db_path``.

    The store is given a "text-embedding-3-small" OpenAI embedding function,
    the same model name ``build_knowledge_base`` uses at ingestion time.
    """
    return Chroma(
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory=db_path,
    )

def format_docs(docs) -> str:
    """Format retrieved documents into a numbered context string for the prompt.

    Each document becomes a block of the form::

        [i] Source: <file name>[, page <page>]
        <page_content>

    Blocks are numbered from 1 and separated by a blank line, so the model
    can cite them as ``[i]``.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (the chunk text).

    Returns:
        The joined context string; an empty string when ``docs`` is empty.
    """
    formatted = []
    for i, doc in enumerate(docs, 1):
        # Keep only the last path component so the prompt shows a file name.
        source = doc.metadata.get("source", "Unknown").split("/")[-1]
        page = doc.metadata.get("page")
        # Test for "missing" explicitly rather than by truthiness: a page
        # value of 0 (the first page when pages are zero-indexed) is valid
        # and must still be cited.
        has_page = page is not None and page != ""
        location = f"{source}, page {page}" if has_page else f"{source}"
        formatted.append(f"[{i}] Source: {location}\n{doc.page_content}")
    return "\n\n".join(formatted)

class RAGAgent:
    """Conversational question-answering agent grounded in a Chroma knowledge base.

    Attributes:
        vectorstore: The Chroma store loaded from ``db_path``.
        retriever: MMR retriever over ``vectorstore``.
        llm: Chat model used to generate answers (temperature 0).
        chain: Runnable mapping ``{"question", "chat_history"[, "context"]}``
            to the answer string.
        chat_history: Alternating human/AI messages from previous ``ask`` calls.
    """

    def __init__(self, db_path: str, model: str = "gpt-4o"):
        """Load the knowledge base at ``db_path`` and assemble the answer chain.

        Args:
            db_path: Directory of the persisted Chroma store.
            model: Chat model name passed to ``ChatOpenAI``.
        """
        self.vectorstore = load_knowledge_base(db_path)
        self.retriever = self.vectorstore.as_retriever(
            search_type="mmr",      # Diverse results
            search_kwargs={"k": 5, "fetch_k": 15}
        )
        self.llm = ChatOpenAI(model=model, temperature=0)
        self.chain = self._build_chain()
        self.chat_history = []

    def _build_chain(self):
        """Build the prompt -> LLM -> string chain.

        The chain input must provide ``question`` and ``chat_history``. If it
        also provides ``context`` (already formatted text), that text is used
        as-is; otherwise the chain retrieves and formats documents itself.
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a helpful assistant that answers questions based on the provided context.

Rules:
1. Answer ONLY using information from the provided context
2. If the context doesn't contain enough information, say: "I don't have enough information in my knowledge base to answer this."
3. Always cite which source [number] you're drawing from
4. Be concise and direct

Context:
{context}"""),
            MessagesPlaceholder("chat_history"),
            ("human", "{question}")
        ])

        def resolve_context(inputs):
            # ask() supplies the context it built from its own retrieval, so
            # the documents are fetched once and the reported sources match
            # what the model actually saw. Callers that invoke the chain
            # directly without "context" still get retrieval here.
            if "context" in inputs:
                return inputs["context"]
            return format_docs(self.retriever.invoke(inputs["question"]))

        chain = (
            {
                "context": resolve_context,
                "question": lambda x: x["question"],
                "chat_history": lambda x: x["chat_history"]
            }
            | prompt
            | self.llm
            | StrOutputParser()
        )
        return chain

    def ask(self, question: str) -> dict:
        """Ask a question and get an answer with sources.

        Retrieval runs exactly once per call; the same documents feed both
        the prompt context and the returned source list. The question and
        the answer are appended to ``chat_history``.

        Args:
            question: The user's question.

        Returns:
            A dict with ``"answer"`` (str), ``"sources"`` (unique source file
            names in retrieval order) and ``"retrieved_docs"`` (the documents
            used as context).
        """
        from langchain_core.messages import HumanMessage, AIMessage

        # Retrieve relevant documents (single retrieval, reused below)
        retrieved_docs = self.retriever.invoke(question)

        # Generate answer from exactly those documents
        answer = self.chain.invoke({
            "question": question,
            "chat_history": self.chat_history,
            "context": format_docs(retrieved_docs),
        })

        # Update history
        self.chat_history.append(HumanMessage(content=question))
        self.chat_history.append(AIMessage(content=answer))

        # dict.fromkeys removes duplicates while keeping first-seen order,
        # so the source list is deterministic across runs.
        sources = list(dict.fromkeys(
            doc.metadata.get("source", "Unknown").split("/")[-1]
            for doc in retrieved_docs
        ))

        return {
            "answer": answer,
            "sources": sources,
            "retrieved_docs": retrieved_docs
        }

# Usage: ask a single question and show which files the answer drew on
agent = RAGAgent("./knowledge_db")
result = agent.ask("What is the refund policy for digital products?")
print(result["answer"])
print("\nSources: " + ", ".join(result["sources"]))

Part 3: Query Transformation

Improve retrieval by rewriting the user's question for better search:

def create_query_transformer(llm):
    """Transform conversational questions into better search queries.

    Args:
        llm: Chat model (a runnable) used to propose alternative queries.

    Returns:
        A function ``transform(question, chat_history) -> list[str]`` that
        returns the original question followed by up to two distinct
        alternative search queries.
    """
    import re

    prompt = ChatPromptTemplate.from_template("""Given a conversation history and the latest question, 
generate 3 different search queries to find relevant documents.
Each query should approach the topic from a slightly different angle.

Chat history: {chat_history}
Current question: {question}

Generate 3 search queries (one per line, no numbering or bullets):""")

    chain = prompt | llm | StrOutputParser()

    # Models sometimes number or bullet their lines despite the instruction;
    # strip a leading "-", "*", bullet, "1." or "1)" so the marker does not
    # end up inside the search query.
    list_marker = re.compile(r"^\s*(?:[-*\u2022]|\d+[.)])\s+")

    def transform(question: str, chat_history: list) -> list[str]:
        """Return ``[question, alt1, alt2]`` (fewer if the model gives fewer).

        Only the last four history messages are shown to the model; each is
        rendered as ``"<type>: <content>"``.
        """
        result = chain.invoke({
            "question": question,
            "chat_history": "\n".join([f"{m.type}: {m.content}" for m in chat_history[-4:]])
        })

        queries = [question]
        # Case-insensitive dedupe: a repeated query would only trigger a
        # redundant retrieval. Three are requested but two are kept, so a
        # duplicate line does not leave the result short.
        seen = {question.strip().casefold()}
        for line in result.splitlines():
            candidate = list_marker.sub("", line).strip()
            key = candidate.casefold()
            if not candidate or key in seen:
                continue
            seen.add(key)
            queries.append(candidate)
            if len(queries) == 3:  # Original + 2 alternatives
                break
        return queries

    return transform

# Multi-query retrieval
def multi_query_retrieve(queries: list[str], retriever, max_docs: int = 8) -> list:
    """Run several queries and merge their results without duplicates.

    Documents are kept in first-seen order: all new results of the first
    query, then new results of the second, and so on. Two documents count as
    duplicates when their ``page_content`` is equal.

    Args:
        queries: Search queries to run, in priority order.
        retriever: Object whose ``invoke(query)`` returns a list of documents
            exposing ``page_content``.
        max_docs: Cap on the total number of documents returned.

    Returns:
        At most ``max_docs`` unique documents; an empty list when there are
        no queries or ``max_docs`` is not positive.
    """
    if max_docs <= 0:
        return []

    all_docs = []
    # Track the text itself rather than hash(text): equality is exact, so a
    # hash collision can never discard a distinct chunk.
    seen_contents = set()

    for query in queries:
        for doc in retriever.invoke(query):
            content = doc.page_content
            if content in seen_contents:
                continue
            seen_contents.add(content)
            all_docs.append(doc)
            # Stop as soon as the cap is reached; later queries could not
            # contribute anything that would be returned.
            if len(all_docs) >= max_docs:
                return all_docs

    return all_docs

Part 4: Evaluation

Evaluate your RAG pipeline systematically:

def evaluate_rag(agent: "RAGAgent", test_set: list[dict]) -> dict:
    """Measure how often the agent retrieves the expected source document.

    Every test case is asked with an empty chat history so earlier test
    questions cannot influence later ones; the agent's own history is put
    back afterwards.

    Args:
        agent: Agent exposing ``ask(question)`` and a ``chat_history`` list.
        test_set: ``[{"question": "...", "source": "..."}]``. Only
            ``"question"`` and ``"source"`` are read; extra keys such as
            ``"expected_answer"`` are ignored.

    Returns:
        A dict with ``"correct_source_count"``, ``"total"``, ``"details"``
        (one entry per case) and ``"source_precision"``: the fraction of
        cases whose expected source appears among the returned sources
        (``0.0`` for an empty test set).
    """
    results = {
        "correct_source_count": 0,
        "total": len(test_set),
        "details": []
    }

    saved_history = agent.chat_history
    try:
        for case in test_set:
            # ask() appends to chat_history; start each case from a clean
            # slate so the cases stay independent of one another.
            agent.chat_history = []
            result = agent.ask(case["question"])

            source_correct = case["source"] in result["sources"]
            if source_correct:
                results["correct_source_count"] += 1

            results["details"].append({
                "question": case["question"],
                "answer": result["answer"][:200],  # truncated for readable reports
                "source_correct": source_correct,
                "returned_sources": result["sources"]
            })
    finally:
        # Restore the caller's conversation even if a case raised.
        agent.chat_history = saved_history

    total = results["total"]
    # Guard the division so an empty test set reports 0.0 instead of raising.
    results["source_precision"] = results["correct_source_count"] / total if total else 0.0
    print(f"Source precision: {results['source_precision']:.1%}")
    return results

# Each case pairs a question with the file that should be among the retrieved sources
test_set = [
    dict(question="How many vacation days do new employees receive?", source="hr_policy.pdf"),
    dict(question="What's the process for requesting a software license?", source="it_policy.pdf"),
]
eval_results = evaluate_rag(agent, test_set)

Common RAG Failure Modes and Fixes

Problem	Symptom	Fix
Retrieval misses	Agent says "I don't have info" when you know it's in the docs	Smaller chunks, more overlap, check ingestion
Hallucination	Agent answers with wrong details	Stricter system prompt; verify that every cited source actually appears in the retrieved context
Context overflow	Irrelevant retrieved chunks hurt quality	Reduce k, add metadata filtering, use MMR
Multi-hop questions	Agent can't answer questions needing 2+ documents	Multi-query retrieval, or break question into sub-questions
History confusion	Agent loses track of the conversation	Keep history shorter, summarize older turns

Next lesson: LangGraph introduction — building stateful, cyclical agent workflows.