Agent Evaluation & Testing | AI Agent Development Course | AiTechWorlds

Agent Evaluation and Testing

Agents are non-deterministic — the same input can produce different outputs. This makes traditional software testing insufficient. You can't just assert result == expected. Instead, you need evaluation frameworks that measure quality, reliability, and behavior at scale.

Why Agent Testing Is Different

Traditional software test: assert process_payment(100) == {"status": "success", "id": "abc"}

Agent test: Did the agent correctly identify the user's intent? Did it use the right tools? Was the answer accurate? Did it complete in a reasonable number of steps?

These require:

LLM-as-judge evaluation (using another LLM to assess quality)
Trajectory evaluation (did the agent take the right path, not just get the right answer?)
Statistical sampling (run each test case multiple times to account for variance)

Unit Testing Agent Components

Test tools, chains, and nodes in isolation before testing the full agent:

import pytest
from unittest.mock import MagicMock, patch

# Test tools directly
def test_search_tool():
    """Smoke-test the search tool: a real query yields a long, error-free string."""
    from tools.search import web_search

    output = web_search.invoke("Python programming language")

    # Type first, so the length and substring checks below are meaningful.
    assert isinstance(output, str)
    assert len(output) > 50
    lowered = output.lower()
    assert "error" not in lowered

def test_search_tool_handles_empty_query():
    """An empty query must either report an error or return next to nothing."""
    from tools.search import web_search

    output = web_search.invoke("")

    reports_error = "error" in output.lower()
    is_near_empty = len(output) < 20
    assert reports_error or is_near_empty

# Test chains with mocked LLM
def test_summary_chain():
    """Exercise the summary chain against a stubbed LLM so no API call is made."""
    from langchain_core.messages import AIMessage

    # Stand-in model: every invoke() returns the same canned message.
    canned_reply = AIMessage(content="Short summary: key points")
    mock_llm = MagicMock()
    mock_llm.invoke.return_value = canned_reply

    from chains.summary import create_summary_chain
    summary_chain = create_summary_chain(mock_llm)
    output = summary_chain.invoke({"text": "Long document content..."})

    assert "summary" in output.lower()
    # The chain is expected to hit the model exactly one time.
    mock_llm.invoke.assert_called_once()

# Test LangGraph nodes
def test_research_node():
    """Run the research node with its search tool patched out and inspect the state."""
    from agents.research import research_node

    state_before = {"task": "Learn about Python", "results": {}}

    # Replace the tool the node looks up in its own module for the duration of the call.
    with patch("agents.research.search_tool") as fake_search:
        fake_search.invoke.return_value = "Mock search results about Python"
        state_after = research_node(state_before)

    node_results = state_after["results"]
    assert "research" in node_results
    assert len(node_results["research"]) > 0

Integration Testing with Real LLMs

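For integration tests that actually call the LLM, use temperature=0 (plus a fixed seed, if your provider supports one) and a cheap model. This reduces run-to-run variance but does not guarantee identical outputs: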

import pytest
from langchain_openai import ChatOpenAI

# Shared model for integration tests: a fast, inexpensive one, with
# temperature=0 to reduce run-to-run variation.
TEST_LLM = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)

@pytest.fixture
def test_agent():
    """Provide a ResearchAgent wired to the shared integration-test model."""
    from agents.research_agent import ResearchAgent

    agent_under_test = ResearchAgent(llm=TEST_LLM)
    return agent_under_test

def test_agent_answers_factual_question(test_agent):
    """A simple factual question should mention the known answer, ignoring case."""
    reply = test_agent.run("What is the capital of France?")
    normalized_reply = reply.lower()
    assert "paris" in normalized_reply

def test_agent_uses_search_for_current_events(test_agent):
    """Current-events questions should be answered via search, not training data.

    Only the shape of the answer is checked here. Confirming that the search
    tool was actually called needs LangSmith traces or a tool mock.
    """
    reply = test_agent.run("What happened in the news today?")

    # Basic check: a substantive answer rather than an "I don't know".
    assert len(reply) > 50
    normalized_reply = reply.lower()
    assert "don't know" not in normalized_reply

def test_agent_handles_unanswerable_question(test_agent):
    """The agent should decline a question it cannot answer instead of inventing a number."""
    reply = test_agent.run("What is my personal bank account balance?")
    normalized_reply = reply.lower()

    # Any one of these refusal phrasings counts as a graceful decline.
    refusal_markers = ("don't have access", "can't access", "no access", "unable to")
    declined = False
    for marker in refusal_markers:
        if marker in normalized_reply:
            declined = True
            break
    assert declined

LLM-as-Judge Evaluation

Use a capable LLM to evaluate another LLM's output:

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel

# Judge model used by the evaluators; temperature=0 to reduce variation between gradings.
evaluator_llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o",
)

# Structured verdict requested from the LLM judge. evaluate_response hands this
# model to JsonOutputParser as `pydantic_object`.
# NOTE(review): documented with `#` comments rather than a docstring on purpose;
# pydantic is understood to copy a class docstring into the JSON schema, which
# would change the format instructions sent to the judge. Confirm before
# converting to a docstring.
class EvaluationResult(BaseModel):
    score: int  # overall rating, intended range 1-5 (the type itself does not enforce the range)
    reasoning: str  # judge's explanation; surfaced in the assertion messages of test_rag_agent_accuracy
    is_accurate: bool  # presumably True when the judge considers the response factually correct
    is_helpful: bool  # presumably True when the response answers what was asked
    has_hallucination: bool  # presumably True when the response states things the context does not support

def evaluate_response(question: str, response: str, context: str = "") -> EvaluationResult:
    """Evaluate an agent response using an LLM judge.

    Args:
        question: The question that was put to the agent.
        response: The agent's answer to be graded.
        context: Text the agent had available (e.g. retrieved documents).
            An empty string is sent to the judge as "No context provided".

    Returns:
        The judge's verdict as a validated ``EvaluationResult``.

    Raises:
        pydantic.ValidationError: If the judge's JSON is missing fields or has
            values of the wrong type.
    """

    parser = JsonOutputParser(pydantic_object=EvaluationResult)

    prompt = ChatPromptTemplate.from_template("""Evaluate this AI assistant response.

Question: {question}
Context provided to agent: {context}
Agent Response: {response}

Rate on these criteria (1-5 scale):
- Accuracy: Is the response factually correct?
- Helpfulness: Does it answer what was asked?
- Hallucination: Does it state things not supported by the context?

Respond with JSON: {format_instructions}""")

    chain = prompt | evaluator_llm | parser

    raw_verdict = chain.invoke({
        "question": question,
        "context": context or "No context provided",
        "response": response,
        "format_instructions": parser.get_format_instructions()
    })

    # JsonOutputParser only uses the pydantic model to build the format
    # instructions; what it returns is a plain dict. Validate it into the model
    # so callers can rely on attribute access (``.score``, ``.reasoning``, ...)
    # as the return annotation promises.
    return EvaluationResult(**raw_verdict)

# Use in test
def test_rag_agent_accuracy():
    """Grade one RAG answer with the LLM judge and enforce quality thresholds."""
    from agents.rag_agent import RAGAgent

    rag_agent = RAGAgent("./test_knowledge_db")
    payload = rag_agent.ask("What is the company's refund policy?")

    answer_text = payload["answer"]
    # The judge sees exactly the documents the agent retrieved, one per line.
    retrieved_text = "\n".join(doc.page_content for doc in payload["retrieved_docs"])

    verdict = evaluate_response(
        question="What is the company's refund policy?",
        response=answer_text,
        context=retrieved_text,
    )

    assert verdict.score >= 4, f"Response quality too low: {verdict.reasoning}"
    assert not verdict.has_hallucination, f"Hallucination detected: {verdict.reasoning}"
    assert verdict.is_accurate

Dataset-Based Evaluation

Run evaluation across a curated dataset of question/answer pairs:

import json
import statistics
from dataclasses import dataclass, field

@dataclass
class EvalCase:
    """One evaluation example: a question plus what a good answer should contain."""

    # Required fields
    question: str
    expected_answer: str

    # Optional fields; each instance gets its own fresh list.
    expected_sources: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)  # e.g., ["factual", "multi-hop"]

# Curated question/answer pairs, each tied to the document expected to back the answer.
EVAL_DATASET = [
    EvalCase(
        question="What is the refund policy?",
        expected_answer="30-day refund",
        expected_sources=["policy.pdf"],
    ),
    EvalCase(
        question="How do I contact support?",
        expected_answer="24/7 chat support",
        expected_sources=["support.pdf"],
    ),
    EvalCase(
        question="What are the enterprise pricing tiers?",
        expected_answer="Starter, Growth, Enterprise",
        expected_sources=["pricing.pdf"],
    ),
]

def run_evaluation(agent, dataset: "list[EvalCase]") -> dict:
    """Run every case through the agent, judge each answer, and aggregate metrics.

    Args:
        agent: Object exposing ``ask(question)``. The result must support
            ``["answer"]`` and ``.get("retrieved_docs")`` / ``.get("sources")``.
        dataset: Evaluation cases to run. May be empty.

    Returns:
        A dict with ``avg_score``, ``accuracy_rate``, ``hallucination_rate``,
        ``source_precision`` (all ``0.0`` for an empty dataset),
        ``total_cases``, ``error_count`` and the per-case ``details`` list.
        A case that raised is recorded with its error message and a score of 0,
        so it lowers the averages instead of aborting the run.
    """
    results = []

    for case in dataset:
        try:
            agent_result = agent.ask(case.question)
            eval_result = evaluate_response(
                question=case.question,
                response=agent_result["answer"],
                context="\n".join(d.page_content for d in agent_result.get("retrieved_docs", []))
            )

            # Check source retrieval: a hit is any expected source name that
            # appears in the agent's reported sources. Cases with no expected
            # sources pass by definition.
            if case.expected_sources:
                reported_sources = str(agent_result.get("sources", []))
                correct_source = any(
                    expected in reported_sources for expected in case.expected_sources
                )
            else:
                correct_source = True

            results.append({
                "question": case.question,
                "score": eval_result.score,
                "accurate": eval_result.is_accurate,
                "hallucinated": eval_result.has_hallucination,
                "correct_source": correct_source,
                "tags": case.tags
            })
        except Exception as e:
            # Deliberate best-effort: one failing case must not stop the run.
            results.append({"question": case.question, "error": str(e), "score": 0})

    total = len(results)
    if total == 0:
        # statistics.mean() raises on empty input and the rates would divide by
        # zero, so report neutral metrics for an empty dataset instead.
        return {
            "avg_score": 0.0,
            "accuracy_rate": 0.0,
            "hallucination_rate": 0.0,
            "source_precision": 0.0,
            "total_cases": 0,
            "error_count": 0,
            "details": results
        }

    def rate(flag: str) -> float:
        """Fraction of all cases (errored ones included) where `flag` is truthy."""
        return sum(bool(r.get(flag, False)) for r in results) / total

    # Aggregate metrics
    return {
        "avg_score": statistics.mean(r["score"] for r in results),
        "accuracy_rate": rate("accurate"),
        "hallucination_rate": rate("hallucinated"),
        "source_precision": rate("correct_source"),
        "total_cases": total,
        "error_count": sum(1 for r in results if "error" in r),
        "details": results
    }

# Run the evaluation, then print a four-line summary of the aggregate metrics.
metrics = run_evaluation(agent, EVAL_DATASET)
report_lines = [
    f"Average score: {metrics['avg_score']:.2f}/5",
    f"Accuracy: {metrics['accuracy_rate']:.1%}",
    f"Hallucination rate: {metrics['hallucination_rate']:.1%}",
    f"Source precision: {metrics['source_precision']:.1%}",
]
print("\n".join(report_lines))

LangSmith for Production Evaluation

LangSmith provides a managed evaluation platform:

from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()

# Create the dataset in LangSmith once. On later runs reuse the existing one:
# creating a dataset whose name is already taken is expected to be rejected,
# and re-uploading would duplicate every example.
DATASET_NAME = "rag-eval-v1"
if client.has_dataset(dataset_name=DATASET_NAME):
    dataset = client.read_dataset(dataset_name=DATASET_NAME)
else:
    dataset = client.create_dataset(DATASET_NAME)
    for case in EVAL_DATASET:
        # inputs are what the target function receives; outputs are the
        # reference answer the evaluators compare against.
        client.create_example(
            inputs={"question": case.question},
            outputs={"answer": case.expected_answer},
            dataset_id=dataset.id
        )

# Define evaluators
def correctness_evaluator(run, example) -> dict:
    """Grade one run against its reference answer using the judge LLM.

    Args:
        run: The evaluated run; ``run.outputs["answer"]`` is the agent's answer.
        example: The dataset example; ``example.outputs["answer"]`` is the
            expected answer.

    Returns:
        ``{"key": "correctness", "score": 1}`` when the judge says "correct",
        otherwise the same dict with ``score`` 0.
    """
    response = run.outputs["answer"]
    expected = example.outputs["answer"]

    grade = evaluator_llm.invoke(f"""
Is this response correct given the expected answer?
Expected: {expected}
Actual: {response}
Respond with just: correct or incorrect""").content.strip()

    # Judges often answer "Correct" or "correct." despite the instruction.
    # Normalize case and surrounding punctuation before comparing. A substring
    # test would be wrong here because "incorrect" contains "correct".
    verdict = grade.strip(" \t\n.!\"'`").lower()

    return {"key": "correctness", "score": 1 if verdict == "correct" else 0}

# Run evaluation
# evaluate() receives a target callable, a dataset name, the evaluators to
# apply, and a prefix for the experiment name.
results = evaluate(
    # Target: takes one example's inputs and returns the agent's answer under the
    # same "answer" key that correctness_evaluator reads from run.outputs.
    lambda inputs: {"answer": agent.ask(inputs["question"])["answer"]},
    # Same name as the dataset created with client.create_dataset above.
    data="rag-eval-v1",
    evaluators=[correctness_evaluator],
    experiment_prefix="rag-agent-v2"
)

Trajectory Evaluation

Check not just the answer but the agent's reasoning path:

def evaluate_agent_trajectory(task: str, expected_tools: list[str]) -> dict:
    """Check which tools the agent called for a task, and in what order.

    Streams ``agent_app`` on the task and collects the name of every tool call
    found on messages emitted under the ``"agent"`` key.

    Args:
        task: The user request to send to the agent.
        expected_tools: Tool names the agent should call, in the expected order.

    Returns:
        A dict with:
        - ``actual_tools``: tool names in the order they were called.
        - ``expected_tools``: the input list, echoed back.
        - ``correct_tools_used``: every expected tool was called at least once.
        - ``no_unexpected_tools``: no tool outside ``expected_tools`` was called.
        - ``correct_order``: the expected tools occur in ``actual_tools`` in the
          given order (other calls may be interleaved).
    """

    tool_calls = []

    for event in agent_app.stream({"messages": [("human", task)]}):
        for key, value in event.items():
            if key == "agent" and "messages" in value:
                for msg in value["messages"]:
                    # LangChain messages expose tool calls as dicts with a
                    # "name" entry, not OpenAI-style objects, so
                    # `tc.function.name` would fail. Messages without tool
                    # calls carry an empty list or lack the attribute.
                    for tc in getattr(msg, "tool_calls", None) or []:
                        tool_calls.append(tc["name"])

    # Subsequence check: each `in` consumes the shared iterator up to and
    # including its match, so later expected tools must appear after it.
    remaining_calls = iter(tool_calls)
    correct_order = all(tool in remaining_calls for tool in expected_tools)

    return {
        "actual_tools": tool_calls,
        "expected_tools": expected_tools,
        "correct_tools_used": all(t in tool_calls for t in expected_tools),
        "no_unexpected_tools": all(t in expected_tools for t in tool_calls),
        "correct_order": correct_order
    }

# Example: a weather question should be answered through the web search tool.
result = evaluate_agent_trajectory(
    task="What is today's weather in Tokyo?",
    expected_tools=["web_search"],
)

Next lesson: Streaming agent output — showing real-time progress to users.