Research Agents¶
Build agents that learn from multiple documents and answer complex questions.
Pattern Overview¶
A research agent: - Ingests multiple documents efficiently - Synthesizes knowledge across sources - Answers complex multi-hop questions - Finds connections between entities
Basic Implementation¶
from graphmem import GraphMem, MemoryConfig
class ResearchAgent:
"""Agent that learns from multiple documents."""
def __init__(self):
# Option 1: OpenAI
self.config = MemoryConfig(
llm_provider="openai",
llm_api_key="sk-...",
llm_model="gpt-4o", # Use stronger model for research
embedding_provider="openai",
embedding_api_key="sk-...",
embedding_model="text-embedding-3-small",
# Use Neo4j for complex graph queries
neo4j_uri="neo4j+s://...",
neo4j_username="neo4j",
neo4j_password="...",
# Redis for caching
redis_url="redis://...",
# Aggressive evolution for research
evolution_enabled=True,
consolidation_threshold=0.75,
)
# Option 2: OpenRouter (with custom base URL)
# self.config = MemoryConfig(
# llm_provider="openai_compatible",
# llm_api_key="sk-or-v1-...",
# llm_api_base="https://openrouter.ai/api/v1", # Custom base URL
# llm_model="google/gemini-2.0-flash-001",
# embedding_provider="openai_compatible",
# embedding_api_key="sk-or-v1-...",
# embedding_api_base="https://openrouter.ai/api/v1", # Custom base URL
# embedding_model="openai/text-embedding-3-small",
# ...
# )
# Option 3: Azure OpenAI
# self.config = MemoryConfig(
# llm_provider="azure_openai",
# llm_api_key="your-azure-key",
# llm_api_base="https://your-resource.openai.azure.com/", # Azure endpoint
# azure_deployment="gpt-4",
# llm_model="gpt-4",
# azure_api_version="2024-02-15-preview",
# embedding_provider="azure_openai",
# embedding_api_key="your-azure-key",
# embedding_api_base="https://your-resource.openai.azure.com/", # Azure endpoint
# azure_embedding_deployment="text-embedding-ada-002",
# embedding_model="text-embedding-ada-002",
# ...
# )
self.memory = GraphMem(self.config, memory_id="research_agent", user_id="researcher")
def ingest_documents(self, documents: list[dict]):
"""Ingest multiple documents efficiently."""
result = self.memory.ingest_batch(
documents,
max_workers=20,
aggressive=True,
show_progress=True,
)
print(f"Ingested {result['documents_processed']} docs")
print(f"Extracted {result['total_entities']} entities")
print(f"Found {result['total_relationships']} relationships")
# Evolve after batch ingestion
self.memory.evolve()
def research(self, question: str) -> dict:
"""Answer complex research questions."""
response = self.memory.query(question)
return {
"answer": response.answer,
"confidence": response.confidence,
"sources": [n.name for n in response.nodes[:5]],
"related_entities": [n.name for n in response.nodes],
"context_tokens": len(response.context.split()),
}
def find_connections(self, entity_a: str, entity_b: str) -> str:
"""Find how two entities are connected."""
query = f"How are {entity_a} and {entity_b} related?"
response = self.memory.query(query)
return response.answer
def summarize_topic(self, topic: str) -> str:
"""Summarize everything known about a topic."""
query = f"Summarize everything known about {topic}"
response = self.memory.query(query)
return response.answer
Usage Example¶
agent = ResearchAgent()
# Ingest research papers
papers = [
{"id": "paper1", "content": open("transformer_paper.txt").read()},
{"id": "paper2", "content": open("attention_paper.txt").read()},
{"id": "paper3", "content": open("gpt_paper.txt").read()},
{"id": "paper4", "content": open("bert_paper.txt").read()},
]
agent.ingest_documents(papers)
# Ask complex questions
result = agent.research(
"How did attention mechanisms evolve into modern LLMs?"
)
print(result["answer"])
# Find connections
connection = agent.find_connections("Transformers", "GPT-4")
print(connection)
# Get topic summaries
summary = agent.summarize_topic("self-attention")
print(summary)
Multi-Source Research¶
Different Document Types¶
class MultiSourceResearchAgent(ResearchAgent):
def ingest_paper(self, content: str, metadata: dict):
"""Ingest academic paper with structured metadata."""
enriched = f"""
ACADEMIC PAPER
Title: {metadata.get('title', 'Unknown')}
Authors: {', '.join(metadata.get('authors', []))}
Year: {metadata.get('year', 'Unknown')}
Abstract:
{metadata.get('abstract', '')}
Content:
{content}
"""
self.memory.ingest(enriched, importance=MemoryImportance.HIGH)
def ingest_news(self, content: str, metadata: dict):
"""Ingest news article with temporal context."""
enriched = f"""
NEWS ARTICLE (Date: {metadata.get('date', 'Unknown')})
Source: {metadata.get('source', 'Unknown')}
{content}
"""
self.memory.ingest(enriched, importance=MemoryImportance.MEDIUM)
def ingest_internal_doc(self, content: str, metadata: dict):
"""Ingest internal document as critical."""
self.memory.ingest(content, importance=MemoryImportance.CRITICAL)
Citation Tracking¶
class CitationAwareAgent(ResearchAgent):
def research_with_citations(self, question: str) -> dict:
"""Answer with source citations."""
response = self.memory.query(question)
# Extract source documents
sources = []
for node in response.nodes:
if "source_document" in node.properties:
sources.append({
"document": node.properties["source_document"],
"excerpt": node.properties.get("source_chunk", "")[:200],
})
return {
"answer": response.answer,
"confidence": response.confidence,
"citations": sources[:5],
}
Comparative Analysis¶
class ComparativeAgent(ResearchAgent):
def compare(self, entity_a: str, entity_b: str, aspect: str) -> str:
"""Compare two entities on a specific aspect."""
query = f"""
Compare {entity_a} and {entity_b} in terms of {aspect}.
What are the key differences and similarities?
"""
return self.memory.query(query).answer
def timeline(self, entity: str) -> str:
"""Get chronological timeline for an entity."""
query = f"""
Provide a chronological timeline of key events for {entity}.
Include dates when available.
"""
return self.memory.query(query).answer
Best Practices¶
- Use batch ingestion - Much faster than sequential
- Enrich with metadata - Add structure to documents
- Evolve after ingestion - Consolidate knowledge
- Use stronger LLMs - Research benefits from GPT-4/Claude
- Set high importance - Research docs shouldn't decay
- Track sources - Include document IDs in metadata