Error Handling & Self-Healing Agents
Agent Error Handling: Building Resilient Agents
Agents fail in ways that simple programs don't: LLM APIs time out, tools return unexpected results, the model calls a tool with wrong arguments, external services are unavailable. Production agents need defensive error handling at every layer.
The Error Taxonomy
- API errors: Rate limits, timeouts, and authentication failures from LLM providers or tool APIs
- Tool errors: External service unavailable, invalid input, unexpected output format
- Logic errors: Agent loops, the model calling nonexistent tools, invalid state transitions
- Content errors: The model produces output that doesn't match the expected format (malformed JSON, truncated response)
- Safety errors: The model refuses to complete a task, or produces output that fails content filters
Retrying LLM API Calls
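LLM APIs have rate limits and occasional transient failures. Retry those transient errors (rate limits, timeouts, connection errors) with exponential backoff instead of failing on the first attempt; errors that cannot succeed on a retry, such as authentication failures, should not be retried: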
from langchain_openai import ChatOpenAI
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
import openai
@retry(
    # Only transient, provider-side failures are worth another attempt;
    # any other exception propagates immediately.
    retry=retry_if_exception_type(
        (openai.RateLimitError, openai.APITimeoutError, openai.APIConnectionError)
    ),
    # Exponential backoff between attempts, clamped to the 2-30 second range.
    wait=wait_exponential(multiplier=1, min=2, max=30),
    # Three attempts in total: the initial call plus up to two retries.
    stop=stop_after_attempt(3),
)
def call_llm_with_retry(llm, messages):
    """Invoke ``llm`` with ``messages``, retrying transient OpenAI failures.

    Args:
        llm: Any object exposing ``invoke(messages)``.
        messages: The messages forwarded unchanged to ``llm.invoke``.

    Returns:
        Whatever ``llm.invoke(messages)`` returns.

    NOTE(review): ``reraise`` is not set on the decorator, so exhaustion is
    presumably surfaced as tenacity's own retry error rather than the original
    OpenAI exception -- confirm against the installed tenacity version.
    """
    response = llm.invoke(messages)
    return response
# LangChain also has built-in retry: the retry budget and the request timeout
# are configured directly on the ChatOpenAI client, so no decorator is needed.
llm_with_retry = ChatOpenAI(
    model="gpt-4o",
    max_retries=3,
    timeout=60 # 60-second timeout per request
)
Tool Error Handling
Tools should always return structured errors, never raise unhandled exceptions:
from langchain.tools import tool
import json
def _tool_payload(**fields) -> str:
    # Serialize one tool reply; keyword order becomes the JSON key order.
    return json.dumps(fields)


@tool
def search_database(query: str, table: str) -> str:
    """Search the database for records matching the query."""
    # The docstring above is kept verbatim: it is presumably what the @tool
    # decorator exposes to the model as the tool description.
    #
    # Every outcome, including failures, is returned as a JSON string so the
    # agent can read it and react; nothing is raised to the caller.
    allowed_tables = ["customers", "orders", "products"]

    # Reject unknown tables before they reach the SQL text below.
    if table not in allowed_tables:
        return _tool_payload(
            error="invalid_table",
            message=f"Table '{table}' not found. Available tables: {allowed_tables}",
        )

    try:
        # The WHERE clause is a placeholder in this example; `query` is not
        # yet part of the SQL and is only echoed back in the no-results reply.
        rows = db.execute(f"SELECT * FROM {table} WHERE ... LIMIT 10")
        if rows:
            return _tool_payload(status="success", count=len(rows), data=rows)
        return _tool_payload(status="no_results", query=query, table=table)
    except DatabaseConnectionError as e:
        # Full details go to the log; the agent only sees a safe summary.
        logger.error(f"DB connection failed: {e}", exc_info=True)
        return _tool_payload(
            error="database_unavailable",
            message="Database is temporarily unavailable. Try again in a moment.",
        )
    except Exception as e:
        # Catch-all boundary: this also covers serialization failures raised
        # while building the success payload above.
        logger.error(f"Unexpected tool error: {e}", exc_info=True)
        return _tool_payload(
            error="unexpected_error",
            message=f"An unexpected error occurred: {type(e).__name__}",
        )
When tools return errors as JSON strings rather than raising exceptions, the agent can read the error and decide to try a different approach.
LangGraph Error Handling
Add fallback paths to your graph for node failures:
from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional
class AgentState(TypedDict):
    """State dictionary passed between the nodes of the search graph."""
    # Task text handed to the search tools via ``invoke``.
    task: str
    # Output of the last successful search, or the user-facing failure message.
    result: str
    # Description of the most recent failure; reset to None after a success.
    error: Optional[str]
    # NOTE(review): no node visible in this file reads or writes this field;
    # presumably a retry counter -- confirm with its intended consumer.
    retry_count: int
def primary_search_node(state: AgentState) -> AgentState:
    """Run the primary search tool and record the outcome in a new state.

    Returns a copy of ``state``. On success ``result`` holds the tool output
    and ``error`` is cleared; on any exception only ``error`` is overwritten
    with a description of the failure, so the router can pick the fallback.
    """
    try:
        hits = primary_search.invoke(state["task"])
        updated = dict(state)
        updated.update(result=hits, error=None)
        return updated
    except Exception as e:
        # Failures are captured in the state instead of being raised.
        failed = dict(state)
        failed["error"] = f"Primary search failed: {str(e)}"
        return failed
def fallback_search_node(state: AgentState) -> AgentState:
    """Run the secondary search tool and record the outcome in a new state.

    Returns a copy of ``state``. On success ``result`` holds the tool output
    and ``error`` is cleared; on any exception only ``error`` is overwritten
    with a description of the failure.
    """
    try:
        hits = fallback_search.invoke(state["task"])
        updated = dict(state)
        updated.update(result=hits, error=None)
        return updated
    except Exception as e:
        # Failures are captured in the state instead of being raised.
        failed = dict(state)
        failed["error"] = f"Fallback search also failed: {str(e)}"
        return failed
def graceful_failure_node(state: AgentState) -> AgentState:
    """Produce the final state when every search attempt has failed.

    Returns a copy of ``state`` whose ``result`` is a user-facing apology that
    embeds the recorded ``error`` text. All other keys are left untouched.
    """
    apology = f"I was unable to complete this task due to a service issue. Error: {state['error']}. Please try again later."
    final_state = dict(state)
    final_state["result"] = apology
    return final_state
def route_after_primary(state: AgentState) -> str:
    """Choose the edge to follow after the primary search node.

    Returns ``"fallback"`` when the state carries a truthy ``error``,
    otherwise ``"done"``.
    """
    return "fallback" if state.get("error") else "done"
def route_after_fallback(state: AgentState) -> str:
    """Choose the edge to follow after the fallback search node.

    Returns ``"graceful_fail"`` when the state carries a truthy ``error``,
    otherwise ``"done"``.
    """
    return "graceful_fail" if state.get("error") else "done"
# Search workflow: primary -> END on success, otherwise fallback -> END on
# success, otherwise graceful_fail -> END.
graph = StateGraph(AgentState)

# Routing tables: router return value -> next node (or END).
primary_routes = {"fallback": "fallback", "done": END}
fallback_routes = {"graceful_fail": "graceful_fail", "done": END}

graph.add_node("primary", primary_search_node)
graph.add_node("fallback", fallback_search_node)
graph.add_node("graceful_fail", graceful_failure_node)

# Execution always starts with the primary search.
graph.set_entry_point("primary")

graph.add_conditional_edges("primary", route_after_primary, primary_routes)
graph.add_conditional_edges("fallback", route_after_fallback, fallback_routes)
graph.add_edge("graceful_fail", END)
Detecting and Breaking Loops
Agents can get stuck repeating the same tool calls:
from collections import Counter
def _tool_call_signature(tool_call) -> tuple:
    """Return a hashable ``(tool_name, arguments)`` key for one tool call."""
    if isinstance(tool_call, dict):
        function = tool_call.get("function")
        if isinstance(function, dict):
            # Raw OpenAI payload: {"function": {"name": ..., "arguments": ...}}
            name, arguments = function.get("name"), function.get("arguments")
        else:
            # LangChain-style tool call: {"name": ..., "args": {...}, "id": ...}
            name, arguments = tool_call.get("name"), tool_call.get("args")
    else:
        # OpenAI SDK object: tool_call.function.name / tool_call.function.arguments
        name, arguments = tool_call.function.name, tool_call.function.arguments
    if not isinstance(arguments, str):
        # Dict arguments are unhashable; serialize them canonically so that
        # equal arguments compare equal regardless of key order.
        arguments = json.dumps(arguments, sort_keys=True, default=str)
    return (name, arguments)


def detect_loop(messages: list, window: int = 6, max_repeats: int = 2) -> bool:
    """Detect if the agent is calling the same tool repeatedly.

    Args:
        messages: Conversation history. Only entries with a non-empty
            ``tool_calls`` attribute are inspected; each tool call may be an
            OpenAI SDK object, a raw OpenAI dict, or a LangChain-style dict.
        window: Number of most recent messages to inspect. A value of zero or
            less inspects nothing and returns ``False``.
        max_repeats: How many times an identical ``(tool, arguments)`` pair may
            occur inside the window before it counts as a loop.

    Returns:
        ``True`` when any identical tool call occurs more than ``max_repeats``
        times within the window, otherwise ``False``.
    """
    # Imported locally because a later section of this file rebinds the
    # module-level name ``Counter`` to the prometheus_client metric class.
    from collections import Counter

    if window <= 0:
        # messages[-0:] would select the whole history rather than nothing.
        return False

    occurrences = Counter(
        _tool_call_signature(tool_call)
        for msg in messages[-window:]
        for tool_call in (getattr(msg, "tool_calls", None) or [])
    )
    return any(count > max_repeats for count in occurrences.values())
def check_for_loops_node(state: AgentState) -> AgentState:
    """Interrupt the run when the message history shows a tool-call loop.

    Returns ``state`` itself, unchanged, when no loop is detected. Otherwise
    returns a copy with a user-facing ``result`` message and ``force_end`` set
    to ``True``.

    NOTE(review): ``messages`` and ``force_end`` are not declared on
    ``AgentState`` as defined in this file -- confirm the state schema.
    """
    history = state.get("messages", [])
    if not detect_loop(history):
        return state

    interrupted = dict(state)
    interrupted.update(
        {
            "result": "I got stuck in a loop trying to complete this task. Could you rephrase or provide more context?",
            "force_end": True,
        }
    )
    return interrupted
Handling Malformed LLM Output
When you expect structured output (JSON) and get something else:
import json
import re
def parse_llm_json(response: str) -> dict:
    """Parse JSON from LLM response, handling common formatting issues.

    The response is stripped, a Markdown code fence wrapping the whole text
    is removed, and the remainder is parsed directly. If that fails, the text
    is scanned for the first position at which a complete JSON object or
    array can be decoded, so JSON embedded in surrounding prose is still
    recovered -- including nested objects and arrays.

    Args:
        response: Raw text returned by the model.

    Returns:
        The decoded JSON value. This is normally a dict, but a list (or, on a
        direct parse, any other JSON value) is returned as-is.

    Raises:
        ValueError: If no JSON value can be decoded from the response. The
            message includes the first 200 characters of the cleaned text.
    """
    text = response.strip()

    # Remove a Markdown code fence that wraps the entire response.
    text = re.sub(r'^```(?:json)?\n?', '', text)
    text = re.sub(r'\n?```$', '', text)

    # Fast path: the cleaned text is itself valid JSON.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Fallback: try to decode a full JSON value starting at each '{' or '['.
    # raw_decode understands nesting and ignores trailing text, which a
    # bracket-matching regex cannot do; candidates that fail are skipped
    # rather than aborting the search.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r'[{\[]', text):
        try:
            value, _end = decoder.raw_decode(text, opener.start())
        except json.JSONDecodeError:
            continue
        return value

    raise ValueError(f"Could not parse JSON from response: {text[:200]}")
def safe_structured_output(llm, prompt: str, default: dict) -> dict:
    """Ask ``llm`` for structured output and fall back to ``default``.

    Args:
        llm: Object exposing ``invoke(prompt)`` whose result has ``.content``.
        prompt: Prompt passed to ``llm.invoke``.
        default: Value returned when the call or the JSON parsing fails.

    Returns:
        The parsed JSON from the model's reply, or ``default`` after logging a
        warning if anything in the call or the parsing raised.
    """
    try:
        raw_text = llm.invoke(prompt).content
        return parse_llm_json(raw_text)
    except Exception as e:
        # Deliberate best-effort boundary: any failure degrades to `default`.
        logger.warning(f"Structured output parsing failed: {e}")
    return default
Content Safety and Refusal Handling
When the LLM refuses to complete a task:
# Lower-case phrases that mark a reply as a refusal. Matching is done on a
# lower-cased reply with typographic apostrophes folded to "'", so entries
# here must use lower case and the plain ASCII apostrophe.
REFUSAL_INDICATORS = [
    "i cannot", "i can't", "i'm not able to", "i won't",
    "i'm unable to", "as an ai", "i don't feel comfortable"
]


def is_refusal(response: str) -> bool:
    """Return True when ``response`` contains a known refusal phrase.

    The check is a case-insensitive substring match against
    ``REFUSAL_INDICATORS``. Curly apostrophes (U+2018/U+2019) are treated as
    the ASCII apostrophe so that "I can’t" matches the same way as "I can't".

    Args:
        response: Text of the model's reply. Empty or ``None`` content is
            treated as "not a refusal".

    Returns:
        ``True`` if any indicator occurs in the normalized reply.
    """
    if not response:
        return False
    normalized = response.lower().replace("\u2019", "'").replace("\u2018", "'")
    return any(indicator in normalized for indicator in REFUSAL_INDICATORS)
def handle_potential_refusal(llm, task: str) -> str:
    """Run ``task`` and retry once with a rephrased task if the model refuses.

    Args:
        llm: Object exposing ``invoke(text)`` whose result has ``.content``.
        task: The task text to send to the model.

    Returns:
        The first reply when it is not a refusal; otherwise the reply to a
        model-rephrased version of the task, or a fixed fallback message when
        that second reply is also a refusal.
    """
    first_answer = llm.invoke(task).content
    if not is_refusal(first_answer):
        return first_answer

    # Ask the model itself to reword the task, then try once more.
    rephrased = llm.invoke(
        f"Rephrase this task to be more clearly acceptable: {task}"
    ).content
    second_answer = llm.invoke(rephrased).content
    if is_refusal(second_answer):
        return f"I'm unable to complete this task as requested. You may want to adjust your approach."
    return second_answer
Global Error Handler for Agents
Wrap the entire agent run with comprehensive error handling:
import traceback
import time
def run_agent_safely(agent, task: str, config: dict, max_retries: int = 2) -> dict:
    """Run an agent with comprehensive error handling and logging.

    The agent is invoked with ``{"messages": [("human", task)]}`` and
    ``config``. Any ``Exception`` is logged and the call is retried after an
    exponential backoff, up to ``max_retries`` additional times.

    Args:
        agent: Object exposing ``invoke(payload, config)`` that returns a
            mapping whose ``"messages"`` list ends with an object having
            ``.content``.
        task: Human message text for the agent.
        config: Passed unchanged as the second argument to ``agent.invoke``.
        max_retries: Retries allowed after the first attempt. Negative values
            are treated as 0 so the agent is always attempted at least once.

    Returns:
        On success: ``{"success": True, "result", "duration_seconds",
        "attempts"}``. After the final failure: ``{"success": False, "error",
        "error_type", "duration_seconds", "attempts"}``. Exceptions are
        reported in the returned dict rather than raised.
    """
    # Without this clamp a negative budget would skip the loop entirely and
    # report a failure with error "None" without ever calling the agent.
    max_retries = max(max_retries, 0)

    # Monotonic clock: elapsed time is unaffected by system clock adjustments.
    start_time = time.monotonic()
    last_error = None

    for attempt in range(max_retries + 1):
        try:
            result = agent.invoke({"messages": [("human", task)]}, config)
            return {
                "success": True,
                "result": result["messages"][-1].content,
                "duration_seconds": time.monotonic() - start_time,
                "attempts": attempt + 1,
            }
        except Exception as e:
            last_error = e
            error_type = type(e).__name__
            logger.warning(f"Agent attempt {attempt + 1} failed: {error_type}: {str(e)}")
            if attempt < max_retries:
                # Exponential backoff: 1s before the first retry, then 2s,
                # 4s, ... for each further retry allowed by max_retries.
                time.sleep(2 ** attempt)
            else:
                logger.error(f"Agent failed after {max_retries + 1} attempts", exc_info=True)

    return {
        "success": False,
        "error": str(last_error),
        "error_type": type(last_error).__name__,
        "duration_seconds": time.monotonic() - start_time,
        "attempts": max_retries + 1,
    }
Monitoring in Production
import sentry_sdk # Error tracking
from prometheus_client import Counter, Histogram # Metrics
# Track agent failures, labelled by the error's type name and a task category
agent_errors = Counter('agent_errors_total', 'Agent errors', ['error_type', 'task_type'])
# Records the time spent inside each monitored agent run
agent_duration = Histogram('agent_duration_seconds', 'Agent execution time')
def monitored_agent_run(agent, task: str, task_type: str = "general", config: Optional[dict] = None):
    """Run the agent through ``run_agent_safely`` while recording metrics.

    The whole call is timed with the ``agent_duration`` histogram. A result
    that reports failure increments ``agent_errors`` with the reported error
    type. An exception escaping the run is sent to Sentry, counted, and
    re-raised.

    Args:
        agent: Agent object forwarded to ``run_agent_safely``.
        task: Task text forwarded to ``run_agent_safely``.
        task_type: Value for the ``task_type`` metric label. Defaults to
            ``"general"``, the value that was previously hard-coded.
        config: Agent config forwarded to ``run_agent_safely``; an empty dict
            is used when omitted, matching the previous behaviour.

    Returns:
        The result dict produced by ``run_agent_safely``.
    """
    run_config = {} if config is None else config
    with agent_duration.time():
        try:
            result = run_agent_safely(agent, task, run_config)
            if not result["success"]:
                agent_errors.labels(
                    error_type=result["error_type"],
                    task_type=task_type,
                ).inc()
            return result
        except Exception as e:
            # Only reached for errors that escape run_agent_safely's own
            # handling; report, count, and let the caller see the exception.
            sentry_sdk.capture_exception(e)
            agent_errors.labels(error_type=type(e).__name__, task_type=task_type).inc()
            raise
Next lesson: Deploying agents with FastAPI — exposing your agents as production-ready APIs.
Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises