Error Handling & Self-Healing Agents | AI Agent Development Course | AiTechWorlds

Agent Error Handling: Building Resilient Agents

Agents fail in ways that simple programs don't: LLM APIs time out, tools return unexpected results, the model calls a tool with the wrong arguments, or external services are unavailable. Production agents need defensive error handling at every layer.

The Error Taxonomy

- API errors: Rate limits, timeouts, and authentication failures from LLM providers or tool APIs
- Tool errors: External service unavailable, invalid input, unexpected output format
- Logic errors: Agent loops, the model calling nonexistent tools, invalid state transitions
- Content errors: The model produces output that doesn't match the expected format (malformed JSON, truncated response)
- Safety errors: The model refuses to complete a task, or produces output that fails content filters

Retrying LLM API Calls

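LLM APIs have rate limits and occasional transient failures. Always retry these transient errors with exponential backoff, and let non-transient errors (such as authentication failures) surface immediately: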

from langchain_openai import ChatOpenAI
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
import openai

@retry(
    # Only these transient failure types trigger another attempt; any other
    # exception type is not matched by this retry condition.
    retry=retry_if_exception_type(
        (openai.RateLimitError, openai.APITimeoutError, openai.APIConnectionError)
    ),
    # Exponential backoff between attempts, clamped to the 2-30 second range.
    wait=wait_exponential(multiplier=1, min=2, max=30),
    # Stop retrying after the third attempt.
    stop=stop_after_attempt(3),
)
def call_llm_with_retry(llm, messages):
    """Invoke ``llm`` with ``messages``, retrying transient API failures.

    Args:
        llm: Any object exposing ``invoke(messages)``.
        messages: The payload forwarded unchanged to ``llm.invoke``.

    Returns:
        Whatever ``llm.invoke(messages)`` returns.
    """
    return llm.invoke(messages)

# LangChain also has built-in retry: instead of wrapping each call, the retry
# limit and the request timeout are passed to the chat model's constructor.
llm_with_retry = ChatOpenAI(
    model="gpt-4o",
    max_retries=3,  # retry limit handed to the model client
    timeout=60  # 60-second timeout per request
)

Tool Error Handling

Tools should always return structured errors, never raise unhandled exceptions:

from langchain.tools import tool
import json

@tool
def search_database(query: str, table: str) -> str:
    """Search the database for records matching the query."""
    # NOTE(review): the docstring above is kept verbatim because the `tool`
    # decorator may expose it as the tool description — confirm before editing.
    #
    # Contract: every outcome is reported as a JSON string. Successful lookups
    # carry a "status" key; every failure carries "error" and "message" keys,
    # so the calling agent can read the failure instead of crashing on it.

    # Reject tables outside the allow-list before anything touches the database.
    allowed_tables = ["customers", "orders", "products"]
    if table not in allowed_tables:
        rejection = {
            "error": "invalid_table",
            "message": f"Table '{table}' not found. Available tables: {allowed_tables}"
        }
        return json.dumps(rejection)

    try:
        rows = db.execute(f"SELECT * FROM {table} WHERE ... LIMIT 10")

        # An empty result set is a distinct, non-error outcome.
        if rows:
            outcome = {"status": "success", "count": len(rows), "data": rows}
        else:
            outcome = {"status": "no_results", "query": query, "table": table}
        return json.dumps(outcome)

    except DatabaseConnectionError as e:
        # The full error goes to the log; the agent only gets a safe summary.
        logger.error(f"DB connection failed: {e}", exc_info=True)
        unavailable = {
            "error": "database_unavailable",
            "message": "Database is temporarily unavailable. Try again in a moment."
        }
        return json.dumps(unavailable)

    except Exception as e:
        # Catch-all for anything else raised above; only the exception's type
        # name is shown to the agent.
        logger.error(f"Unexpected tool error: {e}", exc_info=True)
        unexpected = {
            "error": "unexpected_error",
            "message": f"An unexpected error occurred: {type(e).__name__}"
        }
        return json.dumps(unexpected)

When tools return errors as JSON strings rather than raising exceptions, the agent can read the error and decide to try a different approach.

LangGraph Error Handling

Add fallback paths to your graph for node failures:

from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional

class AgentState(TypedDict):
    """State dictionary handed to, and returned by, the search graph's nodes."""
    task: str  # input passed to the search tools' ``invoke``
    result: str  # search output, or the apology text set by graceful_failure_node
    error: Optional[str]  # message from the latest failed node; None after a success
    retry_count: int  # not read or written by the nodes shown alongside this class

def primary_search_node(state: AgentState) -> AgentState:
    """Run the primary search tool on ``state["task"]`` and record the outcome.

    Returns a copy of ``state``. On success ``result`` holds the tool output
    and ``error`` is ``None``; if anything raises, only ``error`` is
    overwritten (with the failure message) and the exception is swallowed so
    the graph can route to a fallback.
    """
    try:
        update = {"result": primary_search.invoke(state["task"]), "error": None}
    except Exception as e:
        update = {"error": f"Primary search failed: {str(e)}"}
    return {**state, **update}

def fallback_search_node(state: AgentState) -> AgentState:
    """Run the secondary search tool on ``state["task"]`` and record the outcome.

    Returns a copy of ``state``. On success ``result`` holds the tool output
    and ``error`` is reset to ``None``; if anything raises, only ``error`` is
    overwritten with the failure message and the exception is not propagated.
    """
    try:
        update = {"result": fallback_search.invoke(state["task"]), "error": None}
    except Exception as e:
        update = {"error": f"Fallback search also failed: {str(e)}"}
    return {**state, **update}

def graceful_failure_node(state: AgentState) -> AgentState:
    """Produce the final state once every search path has failed.

    Returns a copy of ``state`` whose ``result`` is a user-facing apology that
    embeds the current ``state["error"]`` text.
    """
    apology = f"I was unable to complete this task due to a service issue. Error: {state['error']}. Please try again later."
    return {**state, "result": apology}

def route_after_primary(state: AgentState) -> str:
    """Pick the edge to follow after the primary node.

    Returns ``"fallback"`` when the state carries a truthy ``error`` and
    ``"done"`` otherwise.
    """
    return "fallback" if state.get("error") else "done"

def route_after_fallback(state: AgentState) -> str:
    """Pick the edge to follow after the fallback node.

    Returns ``"graceful_fail"`` when the state carries a truthy ``error`` and
    ``"done"`` otherwise.
    """
    return "graceful_fail" if state.get("error") else "done"

# Build the graph over AgentState and register the three nodes.
graph = StateGraph(AgentState)
graph.add_node("primary", primary_search_node)
graph.add_node("fallback", fallback_search_node)
graph.add_node("graceful_fail", graceful_failure_node)

# Execution starts at "primary". Each router's return value is looked up in the
# mapping passed alongside it: an error moves to the next recovery node, while
# "done" ends the run. "graceful_fail" always ends the run.
graph.set_entry_point("primary")
graph.add_conditional_edges("primary", route_after_primary, {"fallback": "fallback", "done": END})
graph.add_conditional_edges("fallback", route_after_fallback, {"graceful_fail": "graceful_fail", "done": END})
graph.add_edge("graceful_fail", END)

Detecting and Breaking Loops

Agents can get stuck repeating the same tool calls:

from collections import Counter

def _tool_call_signature(tool_call) -> tuple:
    """Return a hashable ``(name, arguments)`` key identifying one tool call."""
    if isinstance(tool_call, dict):
        # Dict-shaped tool calls, e.g. {"name": ..., "args": {...}, "id": ...}.
        name, arguments = tool_call.get("name"), tool_call.get("args")
    else:
        # Attribute-shaped tool calls: .function.name / .function.arguments.
        name, arguments = tool_call.function.name, tool_call.function.arguments

    if not isinstance(arguments, str):
        # Parsed arguments (typically a dict) are unhashable; serialise them
        # canonically so equal arguments always produce equal keys.
        arguments = json.dumps(arguments, sort_keys=True, default=str)
    return name, arguments


def detect_loop(messages: list, window: int = 6) -> bool:
    """Detect if the agent is calling the same tool repeatedly.

    Args:
        messages: Conversation history. Messages without a non-empty
            ``tool_calls`` attribute are ignored.
        window: How many of the most recent messages to inspect. A value of
            zero or less inspects nothing.

    Returns:
        ``True`` when any identical (tool name, arguments) pair occurs more
        than twice among the inspected messages, otherwise ``False``.
    """
    # Imported locally: the module-level name `Counter` is rebound further down
    # this file by `from prometheus_client import Counter`.
    from collections import Counter

    # `messages[-0:]` would be the *whole* list, so guard non-positive windows.
    recent = messages[-window:] if window > 0 else []

    signatures = [
        _tool_call_signature(tool_call)
        for msg in recent
        for tool_call in (getattr(msg, "tool_calls", None) or ())
    ]

    # If any (tool, args) combo appears more than twice in the window, it's a loop
    return any(count > 2 for count in Counter(signatures).values())

def check_for_loops_node(state: AgentState) -> AgentState:
    """Interrupt the run when the message history shows a tool-call loop.

    Reads ``state["messages"]`` (an empty history when the key is absent). When
    no loop is detected the state is returned untouched; otherwise a copy is
    returned with an explanatory ``result`` and ``force_end`` set to ``True``.
    """
    history = state.get("messages", [])
    if not detect_loop(history):
        return state

    return {
        **state,
        "result": "I got stuck in a loop trying to complete this task. Could you rephrase or provide more context?",
        "force_end": True
    }

Handling Malformed LLM Output

When you expect structured output (JSON) and get something else:

import json
import re

def parse_llm_json(response: str) -> dict:
    """Parse JSON from LLM response, handling common formatting issues.

    Strategy, in order:
      1. Strip surrounding whitespace and a leading/trailing markdown code
         fence, then try to parse the whole text.
      2. Otherwise scan for the first ``{`` or ``[`` from which a complete
         JSON value can be decoded and return that value. Nested objects and
         arrays are handled, and earlier non-JSON brackets are skipped.

    Args:
        response: Raw text produced by the model.

    Returns:
        The decoded JSON value. This is usually a dict, but an array or scalar
        is returned as-is when that is what the text contains.

    Raises:
        ValueError: If no JSON value can be recovered from the text.
    """
    # Clean the response
    text = response.strip()

    # Remove markdown code blocks if present
    text = re.sub(r'^```(?:json)?\n?', '', text)
    text = re.sub(r'\n?```$', '', text)

    # Try direct parse
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Try to extract an embedded JSON object/array. raw_decode parses one
    # complete value starting at the given index and ignores trailing text, so
    # nested structures come back whole instead of as their innermost fragment.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r'[{\[]', text):
        try:
            value, _end = decoder.raw_decode(text, opener.start())
        except json.JSONDecodeError:
            continue  # not valid JSON from this bracket; try the next one
        return value

    # Failed — raise so the caller can decide how to recover
    raise ValueError(f"Could not parse JSON from response: {text[:200]}")

def safe_structured_output(llm, prompt: str, default: dict) -> dict:
    """Ask ``llm`` for structured output and fall back to ``default`` on failure.

    Args:
        llm: Object exposing ``invoke(prompt)`` whose return value has a
            ``content`` attribute.
        prompt: Prompt forwarded to ``llm.invoke``.
        default: Value returned when invocation or parsing fails.

    Returns:
        The value parsed from the response content, or ``default`` when any
        exception is raised along the way (the failure is logged as a warning).
    """
    try:
        raw = llm.invoke(prompt).content
        parsed = parse_llm_json(raw)
    except Exception as e:  # broad by design: a ValueError from parsing is covered too
        logger.warning(f"Structured output parsing failed: {e}")
        return default
    return parsed

Content Safety and Refusal Handling

When the LLM refuses to complete a task:

# Lower-case phrases whose presence marks a response as a likely refusal.
REFUSAL_INDICATORS = [
    "i cannot", "i can't", "i'm not able to", "i won't",
    "i'm unable to", "as an ai", "i don't feel comfortable"
]


def is_refusal(response: str) -> bool:
    """Return ``True`` when ``response`` contains a known refusal phrase.

    Matching is a case-insensitive substring search against
    ``REFUSAL_INDICATORS``. Typographic apostrophes (U+2019) are folded to the
    ASCII apostrophe first, so "I can’t" is recognised the same as "I can't".
    """
    normalized = response.lower().replace("\u2019", "'")
    return any(indicator in normalized for indicator in REFUSAL_INDICATORS)

def handle_potential_refusal(llm, task: str) -> str:
    """Run ``task`` through ``llm`` and retry once with a rephrased task on refusal.

    Args:
        llm: Object exposing ``invoke(text)`` whose return value has a
            ``content`` attribute.
        task: The task text to send.

    Returns:
        The first response when it is not a refusal; otherwise the response to
        a model-rephrased version of the task, or a fixed apology message if
        that second response is a refusal as well.
    """
    first_answer = llm.invoke(task).content
    if not is_refusal(first_answer):
        return first_answer

    # Ask the model itself to reword the task, then try once more.
    rephrase_request = f"Rephrase this task to be more clearly acceptable: {task}"
    rephrased = llm.invoke(rephrase_request).content
    second_answer = llm.invoke(rephrased).content

    if is_refusal(second_answer):
        return f"I'm unable to complete this task as requested. You may want to adjust your approach."
    return second_answer

Global Error Handler for Agents

Wrap the entire agent run with comprehensive error handling:

import traceback
import time

def run_agent_safely(agent, task: str, config: dict, max_retries: int = 2) -> dict:
    """Run an agent with comprehensive error handling and logging.

    Args:
        agent: Object exposing ``invoke(payload, config)``. The returned
            mapping must have a ``"messages"`` list whose last item has a
            ``content`` attribute.
        task: Human message text sent to the agent.
        config: Passed through unchanged as the second ``invoke`` argument.
        max_retries: Extra attempts after the first one. Negative values are
            treated as 0, so the agent is always tried at least once.

    Returns:
        On success: ``{"success": True, "result", "duration_seconds",
        "attempts"}``. After the final failed attempt: ``{"success": False,
        "error", "error_type", "duration_seconds", "attempts"}``. Exceptions
        raised by the agent are never propagated.
    """
    # Without this clamp a negative value would skip the loop entirely and
    # report a meaningless "NoneType" failure with no attempt ever made.
    max_retries = max(max_retries, 0)

    # Monotonic clock: elapsed time must not be skewed by wall-clock changes.
    started = time.monotonic()
    last_error = None

    for attempt in range(max_retries + 1):
        try:
            result = agent.invoke({"messages": [("human", task)]}, config)

            return {
                "success": True,
                "result": result["messages"][-1].content,
                "duration_seconds": time.monotonic() - started,
                "attempts": attempt + 1
            }

        except Exception as e:
            last_error = e
            error_type = type(e).__name__

            logger.warning(f"Agent attempt {attempt + 1} failed: {error_type}: {str(e)}")

            if attempt < max_retries:
                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s
                time.sleep(wait_time)
            else:
                logger.error(f"Agent failed after {max_retries + 1} attempts", exc_info=True)

    return {
        "success": False,
        "error": str(last_error),
        "error_type": type(last_error).__name__,
        "duration_seconds": time.monotonic() - started,
        "attempts": max_retries + 1
    }

Monitoring in Production

import sentry_sdk  # Error tracking
from prometheus_client import Counter, Histogram  # Metrics

# Track agent failures
# Counter incremented once per failed run, labelled by error type and task type.
agent_errors = Counter('agent_errors_total', 'Agent errors', ['error_type', 'task_type'])
# Histogram used by monitored_agent_run to time each agent run.
agent_duration = Histogram('agent_duration_seconds', 'Agent execution time')

def monitored_agent_run(agent, task: str):
    """Run ``task`` through ``run_agent_safely`` while recording metrics.

    The whole call is timed with ``agent_duration``. A result whose
    ``"success"`` flag is false increments ``agent_errors`` under the reported
    error type. Any exception raised inside the block is sent to Sentry,
    counted under its class name, and re-raised.

    Returns:
        The result dict produced by ``run_agent_safely``.
    """
    with agent_duration.time():
        try:
            outcome = run_agent_safely(agent, task, {})
            if outcome["success"]:
                return outcome

            failure_counter = agent_errors.labels(
                error_type=outcome["error_type"],
                task_type="general"
            )
            failure_counter.inc()
            return outcome
        except Exception as e:
            sentry_sdk.capture_exception(e)
            agent_errors.labels(error_type=type(e).__name__, task_type="general").inc()
            raise

Next lesson: Deploying agents with FastAPI — exposing your agents as production-ready APIs.