Agent Evaluation & Testing
Agent Evaluation and Testing
Agents are non-deterministic — the same input can produce different outputs. This makes traditional software testing insufficient. You can't just assert result == expected. Instead, you need evaluation frameworks that measure quality, reliability, and behavior at scale.
Why Agent Testing Is Different
Traditional software test: assert process_payment(100) == {"status": "success", "id": "abc"}
Agent test: Did the agent correctly identify the user's intent? Did it use the right tools? Was the answer accurate? Did it complete in a reasonable number of steps?
These require:
- LLM-as-judge evaluation (using another LLM to assess quality)
- Trajectory evaluation (did the agent take the right path, not just get the right answer?)
- Statistical sampling (run each test case multiple times to account for variance)
Unit Testing Agent Components
Test tools, chains, and nodes in isolation before testing the full agent:
import pytest
from unittest.mock import MagicMock, patch
# Test tools directly
def test_search_tool():
    """A regular query should yield a substantial, error-free string."""
    from tools.search import web_search

    output = web_search.invoke("Python programming language")

    assert isinstance(output, str)
    # Anything of 50 characters or fewer is treated as an empty/stub answer.
    assert len(output) > 50
    assert "error" not in output.lower()
def test_search_tool_handles_empty_query():
    """An empty query should produce an error message or a near-empty result."""
    from tools.search import web_search

    output = web_search.invoke("")

    # Either the tool reports an error, or it returns fewer than 20 characters.
    reports_error = "error" in output.lower()
    assert reports_error or len(output) < 20
# Test chains with mocked LLM
def test_summary_chain():
    """The summary chain should invoke the mocked LLM exactly once.

    The LLM is replaced by a MagicMock whose ``invoke`` returns a canned
    AIMessage, so no real model is called.
    """
    from langchain_core.messages import AIMessage
    from chains.summary import create_summary_chain

    fake_llm = MagicMock()
    fake_llm.invoke.return_value = AIMessage(content="Short summary: key points")

    summary_chain = create_summary_chain(fake_llm)
    output = summary_chain.invoke({"text": "Long document content..."})

    assert "summary" in output.lower()
    fake_llm.invoke.assert_called_once()
# Test LangGraph nodes
def test_research_node():
    """The research node should store a non-empty "research" entry in results.

    ``agents.research.search_tool`` is patched so the node sees canned
    search output instead of performing a real search.
    """
    from agents.research import research_node

    with patch("agents.research.search_tool") as fake_search:
        fake_search.invoke.return_value = "Mock search results about Python"
        state_before = {"task": "Learn about Python", "results": {}}
        state_after = research_node(state_before)

    node_results = state_after["results"]
    assert "research" in node_results
    assert len(node_results["research"]) > 0
Integration Testing with Real LLMs
For integration tests that actually call the LLM, set temperature=0 to minimize run-to-run variation (this reduces, but does not fully eliminate, non-determinism) and use a cheap model:
import pytest
from langchain_openai import ChatOpenAI
# Shared chat model for the integration tests below: a fast, cheap model,
# created with temperature=0 so repeated runs vary as little as possible.
TEST_LLM = ChatOpenAI(model="gpt-4o-mini", temperature=0)
@pytest.fixture
def test_agent():
    """Provide a ResearchAgent built on the shared integration-test LLM."""
    from agents.research_agent import ResearchAgent

    agent_under_test = ResearchAgent(llm=TEST_LLM)
    return agent_under_test
def test_agent_answers_factual_question(test_agent):
    """A simple factual question should be answered with the known fact."""
    reply = test_agent.run("What is the capital of France?")
    # Case-insensitive substring match, so phrasing around the answer is free.
    normalized_reply = reply.lower()
    assert "paris" in normalized_reply
def test_agent_uses_search_for_current_events(test_agent):
    """Agent should use search tool for current events, not rely on training data."""
    reply = test_agent.run("What happened in the news today?")

    # This is only a basic proxy: it checks the reply is substantial and is not
    # an "I don't know". Confirming that a search actually happened needs
    # trace inspection (e.g. LangSmith) or a mocked tool.
    assert len(reply) > 50
    assert "don't know" not in reply.lower()
def test_agent_handles_unanswerable_question(test_agent):
    """The agent should decline gracefully instead of inventing a balance."""
    refusal_phrases = (
        "don't have access",
        "can't access",
        "no access",
        "unable to",
    )

    reply = test_agent.run("What is my personal bank account balance?").lower()

    # Passing requires at least one known refusal phrase in the reply.
    assert any(phrase in reply for phrase in refusal_phrases)
LLM-as-Judge Evaluation
Use a capable LLM to evaluate another LLM's output:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel
# Judge model used by the evaluation helpers in this file to grade another
# model's output; created with temperature=0.
evaluator_llm = ChatOpenAI(model="gpt-4o", temperature=0)
# Structured verdict the LLM judge is asked to return for one agent response.
# Documented with comments rather than a docstring so the generated JSON
# schema (and therefore the judge prompt) stays exactly as it was.
class EvaluationResult(BaseModel):
    # Overall quality rating on a 1-5 scale.
    score: int
    # The judge's free-text explanation of its verdict.
    reasoning: str
    # Judge flags for the individual criteria.
    is_accurate: bool
    is_helpful: bool
    has_hallucination: bool
def evaluate_response(question: str, response: str, context: str = "") -> EvaluationResult:
    """Evaluate an agent response using an LLM judge.

    Args:
        question: The question that was put to the agent.
        response: The agent's answer to be graded.
        context: Context that was supplied to the agent. When empty, the
            judge is told "No context provided".

    Returns:
        The judge's verdict as an ``EvaluationResult`` instance.

    Raises:
        pydantic.ValidationError: If the judge's JSON does not match the
            ``EvaluationResult`` schema. Errors raised by the model call or
            the JSON parser propagate unchanged.
    """
    parser = JsonOutputParser(pydantic_object=EvaluationResult)
    prompt = ChatPromptTemplate.from_template("""Evaluate this AI assistant response.
Question: {question}
Context provided to agent: {context}
Agent Response: {response}
Rate on these criteria (1-5 scale):
- Accuracy: Is the response factually correct?
- Helpfulness: Does it answer what was asked?
- Hallucination: Does it state things not supported by the context?
Respond with JSON: {format_instructions}""")
    chain = prompt | evaluator_llm | parser
    verdict = chain.invoke({
        "question": question,
        "context": context or "No context provided",
        "response": response,
        "format_instructions": parser.get_format_instructions(),
    })
    # JsonOutputParser uses the pydantic class only to build the format
    # instructions; what it returns is a plain dict. Validate it into the
    # model so callers can rely on attribute access (evaluation.score, ...),
    # as the return annotation promises.
    if isinstance(verdict, EvaluationResult):
        return verdict
    return EvaluationResult(**verdict)
# Use in test
def test_rag_agent_accuracy():
    """Grade the RAG agent's refund-policy answer with the LLM judge.

    The answer passes when the judge scores it at least 4, reports no
    hallucination, and marks it as accurate.
    """
    from agents.rag_agent import RAGAgent

    rag_agent = RAGAgent("./test_knowledge_db")
    outcome = rag_agent.ask("What is the company's refund policy?")

    answer_text = outcome["answer"]
    # The judge sees the same passages the agent retrieved, joined by newlines.
    retrieved_text = "\n".join(doc.page_content for doc in outcome["retrieved_docs"])
    evaluation = evaluate_response(
        question="What is the company's refund policy?",
        response=answer_text,
        context=retrieved_text,
    )

    assert evaluation.score >= 4, f"Response quality too low: {evaluation.reasoning}"
    assert not evaluation.has_hallucination, f"Hallucination detected: {evaluation.reasoning}"
    assert evaluation.is_accurate
Dataset-Based Evaluation
Run evaluation across a curated dataset of question/answer pairs:
import json
import statistics
from dataclasses import dataclass, field
@dataclass
class EvalCase:
    """One evaluation example: a question plus what a good answer looks like."""

    # The question posed to the agent.
    question: str
    # Reference answer for this question.
    expected_answer: str
    # Source names the agent is expected to cite; empty means "don't check".
    expected_sources: list[str] = field(default_factory=list)
    # Free-form labels, e.g. ["factual", "multi-hop"].
    tags: list[str] = field(default_factory=list)
# Curated question/answer pairs, each with the source document expected to
# back the answer.
EVAL_DATASET = [
    EvalCase(
        question="What is the refund policy?",
        expected_answer="30-day refund",
        expected_sources=["policy.pdf"],
    ),
    EvalCase(
        question="How do I contact support?",
        expected_answer="24/7 chat support",
        expected_sources=["support.pdf"],
    ),
    EvalCase(
        question="What are the enterprise pricing tiers?",
        expected_answer="Starter, Growth, Enterprise",
        expected_sources=["pricing.pdf"],
    ),
]
def run_evaluation(agent, dataset: list[EvalCase]) -> dict:
    """Run every case in *dataset* through *agent* and aggregate judge metrics.

    Each case is asked via ``agent.ask(case.question)``, graded with
    ``evaluate_response``, and checked for whether any expected source name
    appears in the agent's reported sources.

    Args:
        agent: Object exposing ``ask(question)`` that returns a mapping with
            an "answer" key and, optionally, "retrieved_docs" and "sources".
        dataset: The evaluation cases to run.

    Returns:
        A dict with "avg_score", "accuracy_rate", "hallucination_rate",
        "source_precision", "total_cases" and per-case "details". A case that
        raises is recorded with its error message and a score of 0, and
        counts as not accurate, not hallucinated and wrong source in the
        rates. For an empty dataset every metric is 0.0 instead of raising.
    """
    results = []
    for case in dataset:
        try:
            agent_result = agent.ask(case.question)
            eval_result = evaluate_response(
                question=case.question,
                response=agent_result["answer"],
                context="\n".join([d.page_content for d in agent_result.get("retrieved_docs", [])])
            )
            # Check source retrieval: passes if any expected source name shows
            # up in the stringified sources; cases with no expectation pass.
            correct_source = any(
                expected in str(agent_result.get("sources", []))
                for expected in case.expected_sources
            ) if case.expected_sources else True
            results.append({
                "question": case.question,
                "score": eval_result.score,
                "accurate": eval_result.is_accurate,
                "hallucinated": eval_result.has_hallucination,
                "correct_source": correct_source,
                "tags": case.tags
            })
        except Exception as e:
            # Deliberately best-effort: one failing case must not abort the
            # whole run, so it is recorded and scored 0.
            results.append({"question": case.question, "error": str(e), "score": 0})

    # Aggregate metrics
    total = len(results)
    scores = [r["score"] for r in results if "score" in r]

    def _rate(key: str) -> float:
        """Return the fraction of results whose *key* flag is truthy."""
        if not total:
            # Empty dataset: avoid dividing by zero.
            return 0.0
        return sum(r.get(key, False) for r in results) / total

    return {
        # statistics.mean raises StatisticsError on empty data.
        "avg_score": statistics.mean(scores) if scores else 0.0,
        "accuracy_rate": _rate("accurate"),
        "hallucination_rate": _rate("hallucinated"),
        "source_precision": _rate("correct_source"),
        "total_cases": total,
        "details": results
    }
# Run and report
metrics = run_evaluation(agent, EVAL_DATASET)
# Emit the four headline metrics, one per line.
print(
    "\n".join(
        [
            f"Average score: {metrics['avg_score']:.2f}/5",
            f"Accuracy: {metrics['accuracy_rate']:.1%}",
            f"Hallucination rate: {metrics['hallucination_rate']:.1%}",
            f"Source precision: {metrics['source_precision']:.1%}",
        ]
    )
)
LangSmith for Production Evaluation
LangSmith provides a managed evaluation platform:
from langsmith import Client
from langsmith.evaluation import evaluate
client = Client()

# Register the dataset in LangSmith, then upload one example per local eval
# case (question as input, expected answer as reference output).
dataset = client.create_dataset("rag-eval-v1")
for case in EVAL_DATASET:
    client.create_example(
        inputs={"question": case.question},
        outputs={"answer": case.expected_answer},
        dataset_id=dataset.id,
    )
# Define evaluators
def correctness_evaluator(run, example) -> dict:
    """Grade a run's answer against the example's reference answer.

    The judge model is asked to reply with "correct" or "incorrect".

    Args:
        run: The evaluated run; its ``outputs["answer"]`` is the agent answer.
        example: The dataset example; its ``outputs["answer"]`` is the
            expected answer.

    Returns:
        ``{"key": "correctness", "score": 1}`` when the judge's verdict is
        "correct", otherwise the same dict with a score of 0.
    """
    response = run.outputs["answer"]
    expected = example.outputs["answer"]
    grade = evaluator_llm.invoke(f"""
Is this response correct given the expected answer?
Expected: {expected}
Actual: {response}
Respond with just: correct or incorrect""").content.strip()
    # Judges often reply "Correct" or "correct." - normalise case and strip
    # surrounding punctuation/quotes before comparing. Equality (not a
    # substring test) is kept because "incorrect" contains "correct".
    verdict = grade.lower().strip(" \t\r\n\"'`*.!")
    return {"key": "correctness", "score": 1 if verdict == "correct" else 0}
# Run evaluation
# Run the agent over every example in the "rag-eval-v1" dataset and score each
# answer with the correctness evaluator. The target adapts the agent to the
# {"question": ...} -> {"answer": ...} shape used by the dataset examples.
results = evaluate(
    lambda inputs: {"answer": agent.ask(inputs["question"])["answer"]},
    data="rag-eval-v1",
    evaluators=[correctness_evaluator],
    experiment_prefix="rag-agent-v2",
)
Trajectory Evaluation
Check not just the answer but the agent's reasoning path:
def evaluate_agent_trajectory(task: str, expected_tools: list[str]) -> dict:
    """Check which tools the agent called while working on *task*.

    Streams ``agent_app`` with the task as a human message and collects the
    names of all tool calls found on messages emitted under the "agent" key.
    Only the set of tools is compared; call order is not checked.

    Args:
        task: The user request to run through the agent.
        expected_tools: Names of the tools the agent is expected to call.

    Returns:
        A dict with the tool names actually called ("actual_tools"), the
        expected names ("expected_tools"), whether every expected tool was
        called ("correct_tools_used"), and whether every call was to an
        expected tool ("no_unexpected_tools").
    """
    tool_calls = []
    for event in agent_app.stream({"messages": [("human", task)]}):
        for key, value in event.items():
            if key == "agent" and "messages" in value:
                for msg in value["messages"]:
                    # Messages without tool calls may lack the attribute or
                    # carry an empty/None value; treat all of those as "none".
                    for tc in getattr(msg, "tool_calls", None) or []:
                        if isinstance(tc, dict):
                            # LangChain tool calls are dicts keyed by "name",
                            # "args" and "id".
                            tool_calls.append(tc["name"])
                        else:
                            # Fallback for object-style calls exposing
                            # ``.function.name``.
                            tool_calls.append(tc.function.name)
    return {
        "actual_tools": tool_calls,
        "expected_tools": expected_tools,
        "correct_tools_used": all(t in tool_calls for t in expected_tools),
        "no_unexpected_tools": all(t in expected_tools for t in tool_calls)
    }
# Example: a weather question is expected to be answered via web search only.
result = evaluate_agent_trajectory(
    "What is today's weather in Tokyo?",
    expected_tools=["web_search"],
)
Next lesson: Streaming agent output — showing real-time progress to users.
Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises