Follow AiTechWorlds on LinkedIn for professional AI content! Follow Now →
20 min · Lesson 17 of 23
Production Agents

Agent Evaluation & Testing

Agent Evaluation and Testing

Agents are non-deterministic — the same input can produce different outputs. This makes traditional software testing insufficient. You can't just assert result == expected. Instead, you need evaluation frameworks that measure quality, reliability, and behavior at scale.

Why Agent Testing Is Different

Traditional software test: assert process_payment(100) == {"status": "success", "id": "abc"}

Agent test: Did the agent correctly identify the user's intent? Did it use the right tools? Was the answer accurate? Did it complete in a reasonable number of steps?

These require:

  • LLM-as-judge evaluation (using another LLM to assess quality)
  • Trajectory evaluation (did the agent take the right path, not just get the right answer?)
  • Statistical sampling (run each test case multiple times to account for variance)

Unit Testing Agent Components

Test tools, chains, and nodes in isolation before testing the full agent:

import pytest
from unittest.mock import MagicMock, patch

# Test tools directly
def test_search_tool():
    """Smoke-test the web search tool with an ordinary query."""
    from tools.search import web_search

    output = web_search.invoke("Python programming language")

    # A healthy result is a reasonably long string with no error marker in it.
    assert isinstance(output, str)
    assert len(output) > 50
    lowered = output.lower()
    assert "error" not in lowered

def test_search_tool_handles_empty_query():
    """An empty query must yield an error message or a very short result."""
    from tools.search import web_search

    output = web_search.invoke("")
    reports_error = "error" in output.lower()
    assert reports_error or len(output) < 20

# Test chains with mocked LLM
def test_summary_chain():
    """Run the summary chain against a stubbed LLM and check output and call count."""
    from langchain_core.messages import AIMessage

    # Stub LLM whose invoke() always returns the same canned message.
    canned_reply = AIMessage(content="Short summary: key points")
    fake_llm = MagicMock()
    fake_llm.invoke.return_value = canned_reply

    from chains.summary import create_summary_chain
    summary_chain = create_summary_chain(fake_llm)
    summary = summary_chain.invoke({"text": "Long document content..."})

    assert "summary" in summary.lower()
    fake_llm.invoke.assert_called_once()

# Test LangGraph nodes
def test_research_node():
    """Exercise the research node with its search tool patched out."""
    from agents.research import research_node

    state_before = {"task": "Learn about Python", "results": {}}

    with patch("agents.research.search_tool") as fake_search:
        fake_search.invoke.return_value = "Mock search results about Python"

        state_after = research_node(state_before)

        # The node must have stored a non-empty "research" entry.
        node_results = state_after["results"]
        assert "research" in node_results
        assert len(node_results["research"]) > 0

Integration Testing with Real LLMs

For integration tests that actually call the LLM, make the output as repeatable as possible (temperature=0, plus a fixed seed where the provider supports one) and use a cheap model:

import pytest
from langchain_openai import ChatOpenAI

# Use a fast cheap model for integration tests.
# temperature=0 is meant to make repeated runs as similar as possible; it is
# not a guarantee of identical output.
TEST_LLM = ChatOpenAI(model="gpt-4o-mini", temperature=0)

@pytest.fixture
def test_agent():
    """Provide a ResearchAgent built on the shared TEST_LLM."""
    from agents.research_agent import ResearchAgent

    research_agent = ResearchAgent(llm=TEST_LLM)
    return research_agent

def test_agent_answers_factual_question(test_agent):
    """A simple factual question should mention the right answer."""
    answer = test_agent.run("What is the capital of France?")
    normalized = answer.lower()
    assert "paris" in normalized

def test_agent_uses_search_for_current_events(test_agent):
    """Agent should give a substantive answer to a current-events question.

    The intent is that the agent uses its search tool rather than training
    data. Tool usage itself is not asserted here; verify it via LangSmith or
    a tool mock.
    """
    answer = test_agent.run("What happened in the news today?")

    # Proxy checks only: a non-trivial answer that is not an "I don't know".
    is_substantial = len(answer) > 50
    assert is_substantial
    assert "don't know" not in answer.lower()

def test_agent_handles_unanswerable_question(test_agent):
    """The agent should decline gracefully instead of inventing a balance."""
    reply = test_agent.run("What is my personal bank account balance?").lower()

    # Any one of these phrases counts as a graceful refusal.
    refusal_phrases = (
        "don't have access",
        "can't access",
        "no access",
        "unable to",
    )
    assert any(phrase in reply for phrase in refusal_phrases)

LLM-as-Judge Evaluation

Use a capable LLM to evaluate another LLM's output:

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel

# Judge model shared by the evaluation helpers in this lesson.
evaluator_llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Structured verdict produced by the LLM judge for one agent response.
# Documented with comments rather than a class docstring so that the JSON
# schema handed to the output parser stays unchanged.
class EvaluationResult(BaseModel):
    score: int  # 1-5
    reasoning: str  # judge's explanation; surfaced in assertion messages
    is_accurate: bool
    is_helpful: bool
    has_hallucination: bool

def evaluate_response(question: str, response: str, context: str = "") -> EvaluationResult:
    """Evaluate an agent response using an LLM judge.

    Args:
        question: The question that was put to the agent.
        response: The agent's answer to be graded.
        context: Text the agent was given (for example retrieved documents).
            An empty value is sent to the judge as "No context provided".

    Returns:
        The judge's verdict as an ``EvaluationResult`` instance, so callers
        can use attribute access (``.score``, ``.has_hallucination``, ...).

    Raises:
        pydantic.ValidationError: If the judge's JSON does not fit the
            ``EvaluationResult`` schema.
    """
    
    parser = JsonOutputParser(pydantic_object=EvaluationResult)
    
    prompt = ChatPromptTemplate.from_template("""Evaluate this AI assistant response.

Question: {question}
Context provided to agent: {context}
Agent Response: {response}

Rate on these criteria (1-5 scale):
- Accuracy: Is the response factually correct?
- Helpfulness: Does it answer what was asked?
- Hallucination: Does it state things not supported by the context?

Respond with JSON: {format_instructions}""")
    
    chain = prompt | evaluator_llm | parser
    
    raw_verdict = chain.invoke({
        "question": question,
        "context": context or "No context provided",
        "response": response,
        "format_instructions": parser.get_format_instructions()
    })

    # JsonOutputParser only uses the model for format instructions and hands
    # back a plain dict. Build the model here so the return value matches the
    # annotation and the judge's output is validated.
    return EvaluationResult(**raw_verdict)

# Use in test
def test_rag_agent_accuracy():
    """Grade one RAG answer with the LLM judge and enforce quality thresholds."""
    from agents.rag_agent import RAGAgent

    question = "What is the company's refund policy?"
    rag_agent = RAGAgent("./test_knowledge_db")
    result = rag_agent.ask(question)

    # The judge sees the same documents the agent retrieved.
    answer = result["answer"]
    retrieved_text = "\n".join(doc.page_content for doc in result["retrieved_docs"])
    evaluation = evaluate_response(
        question=question,
        response=answer,
        context=retrieved_text,
    )

    assert evaluation.score >= 4, f"Response quality too low: {evaluation.reasoning}"
    assert not evaluation.has_hallucination, f"Hallucination detected: {evaluation.reasoning}"
    assert evaluation.is_accurate

Dataset-Based Evaluation

Run evaluation across a curated dataset of question/answer pairs:

import json
import statistics
from dataclasses import dataclass, field

@dataclass
class EvalCase:
    """One evaluation example: a question plus what a good answer should contain."""

    question: str  # prompt sent to the agent
    expected_answer: str  # reference answer for this question
    # Source names a correct answer is expected to cite (empty = not checked).
    expected_sources: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)  # e.g., ["factual", "multi-hop"]

# Small hand-written dataset; each case names the document expected as a source.
EVAL_DATASET = [
    EvalCase(
        question="What is the refund policy?",
        expected_answer="30-day refund",
        expected_sources=["policy.pdf"],
    ),
    EvalCase(
        question="How do I contact support?",
        expected_answer="24/7 chat support",
        expected_sources=["support.pdf"],
    ),
    EvalCase(
        question="What are the enterprise pricing tiers?",
        expected_answer="Starter, Growth, Enterprise",
        expected_sources=["pricing.pdf"],
    ),
]

def _verdict_field(verdict, name):
    """Read *name* from a judge verdict that is either a mapping or an object."""
    if isinstance(verdict, dict):
        return verdict[name]
    return getattr(verdict, name)


def run_evaluation(agent, dataset: list[EvalCase]) -> dict:
    """Run every case in *dataset* through *agent* and aggregate judge metrics.

    Args:
        agent: Object with an ``ask(question)`` method returning a mapping
            with an ``"answer"`` key and, optionally, ``"retrieved_docs"``
            and ``"sources"``.
        dataset: Cases to evaluate. May be empty.

    Returns:
        A dict with ``avg_score``, ``accuracy_rate``, ``hallucination_rate``,
        ``source_precision``, ``total_cases`` and per-case ``details``.
        A case that raises is recorded with its error text and a score of 0,
        and counts against every rate. For an empty dataset all numeric
        metrics are 0.0 rather than raising.
    """
    results = []

    for case in dataset:
        try:
            agent_result = agent.ask(case.question)
            eval_result = evaluate_response(
                question=case.question,
                response=agent_result["answer"],
                context="\n".join([d.page_content for d in agent_result.get("retrieved_docs", [])])
            )

            # Check source retrieval: at least one expected source must appear
            # in the agent's reported sources. Nothing to check if none listed.
            correct_source = any(
                expected in str(agent_result.get("sources", []))
                for expected in case.expected_sources
            ) if case.expected_sources else True

            results.append({
                "question": case.question,
                "score": _verdict_field(eval_result, "score"),
                "accurate": _verdict_field(eval_result, "is_accurate"),
                "hallucinated": _verdict_field(eval_result, "has_hallucination"),
                "correct_source": correct_source,
                "tags": case.tags
            })
        except Exception as e:
            # Deliberately broad: one failing case must not abort the whole
            # run. The failure is kept in the details for inspection.
            results.append({"question": case.question, "error": str(e), "score": 0})

    total = len(results)
    if total == 0:
        # statistics.mean([]) and the divisions below would both raise.
        return {
            "avg_score": 0.0,
            "accuracy_rate": 0.0,
            "hallucination_rate": 0.0,
            "source_precision": 0.0,
            "total_cases": 0,
            "details": results
        }

    def rate(key: str) -> float:
        # Errored cases lack the key and therefore count as False.
        return sum(bool(r.get(key, False)) for r in results) / total

    # Aggregate metrics
    scores = [r["score"] for r in results if "score" in r]
    return {
        "avg_score": statistics.mean(scores),
        "accuracy_rate": rate("accurate"),
        "hallucination_rate": rate("hallucinated"),
        "source_precision": rate("correct_source"),
        "total_cases": total,
        "details": results
    }

# Run and report
# NOTE(review): `agent` is not defined in this snippet; it is assumed to be an
# already-constructed agent exposing ask() - confirm before running.
metrics = run_evaluation(agent, EVAL_DATASET)
print(f"Average score: {metrics['avg_score']:.2f}/5")
print(f"Accuracy: {metrics['accuracy_rate']:.1%}")
print(f"Hallucination rate: {metrics['hallucination_rate']:.1%}")
print(f"Source precision: {metrics['source_precision']:.1%}")

LangSmith for Production Evaluation

LangSmith provides a managed evaluation platform:

from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()

# Create a dataset in LangSmith
# One example is uploaded per EvalCase: the question is the input and the
# expected answer is the reference output.
# NOTE(review): this runs on every execution of the script; presumably
# re-creating an existing dataset name fails - confirm against LangSmith docs.
dataset = client.create_dataset("rag-eval-v1")
for case in EVAL_DATASET:
    client.create_example(
        inputs={"question": case.question},
        outputs={"answer": case.expected_answer},
        dataset_id=dataset.id
    )

# Define evaluators
def correctness_evaluator(run, example) -> dict:
    """Grade one run against its reference answer using the judge LLM.

    Args:
        run: Evaluated run; ``run.outputs["answer"]`` is the agent's answer.
        example: Dataset example; ``example.outputs["answer"]`` is the
            expected answer.

    Returns:
        ``{"key": "correctness", "score": 1}`` when the judge replies
        "correct" (ignoring case, surrounding whitespace and punctuation),
        otherwise the same dict with a score of 0.
    """
    response = run.outputs["answer"]
    expected = example.outputs["answer"]
    
    reply = evaluator_llm.invoke(f"""
Is this response correct given the expected answer?
Expected: {expected}
Actual: {response}
Respond with just: correct or incorrect""").content

    # Models often answer "Correct" or "correct."; normalise before comparing.
    # This must stay an equality test: "incorrect" contains "correct".
    grade = reply.strip().strip(".!\"'`*").strip().lower()
    
    return {"key": "correctness", "score": 1 if grade == "correct" else 0}

# Run evaluation
# The target wraps the agent so that it maps a dataset input
# ({"question": ...}) to the {"answer": ...} shape the evaluator reads.
# NOTE(review): `agent` is assumed to be defined elsewhere - confirm.
results = evaluate(
    lambda inputs: {"answer": agent.ask(inputs["question"])["answer"]},
    data="rag-eval-v1",
    evaluators=[correctness_evaluator],
    experiment_prefix="rag-agent-v2"
)

Trajectory Evaluation

Check not just the answer but the agent's reasoning path:

def evaluate_agent_trajectory(task: str, expected_tools: list[str]) -> dict:
    """Run the agent on *task* and compare the tools it called with *expected_tools*.

    Args:
        task: Human message sent to the agent.
        expected_tools: Tool names the agent is expected to call, in order.

    Returns:
        A dict with:
        - ``actual_tools``: tool names in the order they were called.
        - ``expected_tools``: the list passed in.
        - ``correct_tools_used``: every expected tool was called.
        - ``no_unexpected_tools``: every called tool was expected.
        - ``correct_order``: the expected tools occur in the given order
          within the actual calls (other calls may be interleaved).
    """

    def tool_name(call):
        # LangChain messages expose tool calls as dicts with a "name" key.
        # Objects shaped like the OpenAI SDK keep it under .function.name.
        if isinstance(call, dict):
            return call["name"]
        return call.function.name

    tool_calls = []
    
    for event in agent_app.stream({"messages": [("human", task)]}):
        for key, value in event.items():
            # Only the "agent" node's updates carry the model's tool requests.
            if key == "agent" and "messages" in value:
                for msg in value["messages"]:
                    # tool_calls may be absent or None on non-AI messages.
                    calls = getattr(msg, "tool_calls", None) or []
                    tool_calls.extend(tool_name(tc) for tc in calls)

    # Subsequence check: each `in` consumes the iterator up to the match, so
    # later expected tools can only match later actual calls.
    remaining_calls = iter(tool_calls)
    correct_order = all(tool in remaining_calls for tool in expected_tools)
    
    return {
        "actual_tools": tool_calls,
        "expected_tools": expected_tools,
        "correct_tools_used": all(t in tool_calls for t in expected_tools),
        "no_unexpected_tools": all(t in expected_tools for t in tool_calls),
        "correct_order": correct_order
    }

# Example: a weather question should be answered via the web_search tool.
result = evaluate_agent_trajectory(
    "What is today's weather in Tokyo?",
    expected_tools=["web_search"]
)

Next lesson: Streaming agent output — showing real-time progress to users.

📱

Get this course's notes on Telegram!

Free cheat sheets, summaries & practice exercises

Get Notes Free →
!