Skip to content

Structured Data Ingestion

Learn how to ingest custom entities and relationships directly into GraphMem without relying on LLM extraction.

Use Cases

  • You already have structured data (databases, APIs, CRMs)
  • You want to define custom entity types (Person, Organization, Quote)
  • You have known relationships ("person works at organization")
  • You need precise control over the knowledge graph

Quick Start: Structured Ingestion

from graphmem import GraphMem, MemoryConfig
from graphmem.core.memory_types import MemoryNode, MemoryEdge, MemoryImportance
from datetime import datetime

# Initialize
# NOTE: the "sk-..." values are placeholders; supply real API keys before running.
config = MemoryConfig(
    llm_provider="openai",
    llm_api_key="sk-...",
    llm_model="gpt-4o-mini",
    embedding_provider="openai",
    embedding_api_key="sk-...",
    embedding_model="text-embedding-3-small",
    turso_db_path="my_structured_memory.db",
)

memory = GraphMem(config, memory_id="my_crm", user_id="default")
# `_ensure_initialized()` and `_memory` (used below) are underscore-prefixed,
# i.e. non-public members; the "Coming Soon" section describes the planned
# public replacement.
memory._ensure_initialized()  # Initialize internal state

# Create custom entities
# Each node is tagged with the memory_id/user_id of the GraphMem instance it
# will be added to.
person = MemoryNode(
    id="person_john_doe",
    name="John Doe",
    entity_type="Person",
    description="Senior Software Engineer at Acme Corp, 5 years experience",
    aliases={"John", "J. Doe", "Johnny"},
    properties={
        "email": "john.doe@acme.com",
        "department": "Engineering",
        "hire_date": "2019-03-15",
    },
    importance=MemoryImportance.HIGH,
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)

organization = MemoryNode(
    id="org_acme_corp",
    name="Acme Corporation",
    entity_type="Organization",
    description="Technology company specializing in cloud infrastructure",
    aliases={"Acme Corp", "Acme", "ACME"},
    properties={
        "industry": "Technology",
        "founded": "2010",
        "headquarters": "San Francisco, CA",
    },
    importance=MemoryImportance.HIGH,
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)

# Add entities to memory
memory._memory.add_node(person)
memory._memory.add_node(organization)

# Create relationships
# The edge is directed: source_id (the person) -> target_id (the organization).
works_at = MemoryEdge(
    id="rel_john_works_at_acme",
    source_id=person.id,
    target_id=organization.id,
    relation_type="works_at",
    description="John Doe is employed at Acme Corporation as Senior Software Engineer",
    properties={
        "role": "Senior Software Engineer",
        "department": "Engineering",
    },
    valid_from=datetime(2019, 3, 15),  # When relationship started
    valid_until=None,  # Still active (None = current)
    importance=MemoryImportance.HIGH,
    memory_id=memory.memory_id,
)

memory._memory.add_edge(works_at)

# Save to persistent storage
memory.save()

# Now query normally!
# NOTE(review): the nodes above are added without an `embedding`, while the
# "Generating Embeddings for Custom Entities" section says semantic search
# needs one -- confirm this query returns the answer shown.
response = memory.query("Who works at Acme Corporation?")
print(response.answer)  # "John Doe works at Acme Corporation as Senior Software Engineer"

Custom Entity Types

Person

# Person entity. `memory` is the GraphMem instance created in the Quick Start.
# This example stores a list under "expertise" alongside plain string values.
person = MemoryNode(
    id="person_jane_smith",
    name="Jane Smith",
    entity_type="Person",
    description="CEO of TechStart Inc., former VP at Google",
    aliases={"Jane", "J. Smith", "Dr. Jane Smith"},
    properties={
        "title": "CEO",
        "email": "jane@techstart.io",
        "linkedin": "linkedin.com/in/janesmith",
        "expertise": ["AI", "Product Management", "Strategy"],
    },
    importance=MemoryImportance.CRITICAL,  # Key person
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)

Organization

# Organization entity. `memory` is the GraphMem instance created in the Quick Start.
# Note the mixed property value types in this example: strings and an int ("employees").
org = MemoryNode(
    id="org_techstart",
    name="TechStart Inc.",
    entity_type="Organization",
    description="AI startup focused on enterprise automation",
    aliases={"TechStart", "Tech Start", "TSI"},
    properties={
        "industry": "Artificial Intelligence",
        "founded": "2022",
        "funding": "$50M Series B",
        "employees": 150,
        "headquarters": "Austin, TX",
    },
    importance=MemoryImportance.HIGH,
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)

Quote

# Quote entity: a shortened form of the quote is used as `name`, the complete
# wording lives in properties["full_text"]. No `aliases` are passed here.
quote = MemoryNode(
    id="quote_vision_statement",
    name="AI will transform every industry",
    entity_type="Quote",
    description="Vision statement from CEO Jane Smith at TechCrunch 2024",
    properties={
        "speaker": "Jane Smith",
        "context": "TechCrunch Disrupt 2024 Keynote",
        "date": "2024-10-15",
        "full_text": "AI will transform every industry, and we're just at the beginning.",
    },
    importance=MemoryImportance.MEDIUM,
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)

Product

# Product entity. `memory` is the GraphMem instance created in the Quick Start.
# Its id ("product_automate_pro") is the target of the `develops` edge shown
# under "Ownership/Investment Relationships".
product = MemoryNode(
    id="product_automate_pro",
    name="AutomatePro",
    entity_type="Product",
    description="Enterprise workflow automation platform powered by AI",
    aliases={"Automate Pro", "AP", "AutoPro"},
    properties={
        "version": "3.2",
        "launch_date": "2024-06-01",
        "pricing": "Enterprise",
        "features": ["Workflow Builder", "AI Assistant", "Analytics"],
    },
    importance=MemoryImportance.HIGH,
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)

Event

# Event entity. No `aliases` are passed here.
# NOTE(review): Best Practices lists "event_series_b_2024" as the ID pattern
# for events; this example omits the year -- align one with the other.
event = MemoryNode(
    id="event_series_b",
    name="TechStart Series B Funding",
    entity_type="Event",
    description="$50M Series B funding round led by Sequoia Capital",
    properties={
        "date": "2024-03-01",
        "amount": "$50M",
        "lead_investor": "Sequoia Capital",
        "participants": ["Andreessen Horowitz", "Y Combinator"],
    },
    importance=MemoryImportance.HIGH,
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)

Relationship Types

Employment Relationships

# Current employment
# NOTE(review): this reuses the Quick Start edge id "rel_john_works_at_acme"
# with a different description and role -- confirm whether adding it replaces
# the earlier edge or conflicts with it.
works_at = MemoryEdge(
    id="rel_john_works_at_acme",
    source_id="person_john_doe",
    target_id="org_acme_corp",
    relation_type="works_at",
    description="Current employment",
    properties={"role": "Senior Engineer", "department": "Engineering"},
    valid_from=datetime(2019, 3, 15),
    valid_until=None,  # Still employed
    memory_id=memory.memory_id,
)

# Past employment (with end date)
# NOTE(review): no node with id "org_google" is created anywhere in these
# examples -- add one before this edge, or confirm that edges to missing
# nodes are accepted.
worked_at = MemoryEdge(
    id="rel_jane_worked_at_google",
    source_id="person_jane_smith",
    target_id="org_google",
    relation_type="worked_at",
    description="Former VP of Product at Google",
    properties={"role": "VP of Product"},
    valid_from=datetime(2015, 1, 1),
    valid_until=datetime(2022, 6, 30),  # Left in 2022
    memory_id=memory.memory_id,
)

Leadership Relationships

# CEO relationship
leads = MemoryEdge(
    id="rel_jane_leads_techstart",
    source_id="person_jane_smith",
    target_id="org_techstart",
    relation_type="leads",
    description="Jane Smith is CEO of TechStart",
    properties={"role": "CEO", "board_member": True},
    valid_from=datetime(2022, 7, 1),
    valid_until=None,  # Current
    memory_id=memory.memory_id,
)

# Reports to relationship
# Person -> person edge; `properties` and `valid_until` are omitted here.
# NOTE(review): presumably an omitted `valid_until` means "still current",
# like the explicit None above -- confirm against MemoryEdge's defaults.
reports_to = MemoryEdge(
    id="rel_john_reports_to_jane",
    source_id="person_john_doe",
    target_id="person_jane_smith",
    relation_type="reports_to",
    description="Direct reporting relationship",
    valid_from=datetime(2023, 1, 1),
    memory_id=memory.memory_id,
)

Ownership/Investment Relationships

# Investment relationship
# NOTE(review): no node with id "org_sequoia" is created anywhere in these
# examples -- add one before this edge, or confirm that edges from missing
# nodes are accepted.
invested_in = MemoryEdge(
    id="rel_sequoia_invested_techstart",
    source_id="org_sequoia",
    target_id="org_techstart",
    relation_type="invested_in",
    description="Led Series B round",
    properties={"round": "Series B", "amount": "$50M", "lead": True},
    valid_from=datetime(2024, 3, 1),
    memory_id=memory.memory_id,
)

# Product ownership
# Organization -> product edge; no temporal fields or properties are given.
develops = MemoryEdge(
    id="rel_techstart_develops_automatepro",
    source_id="org_techstart",
    target_id="product_automate_pro",
    relation_type="develops",
    description="TechStart develops AutomatePro",
    memory_id=memory.memory_id,
)

Quote/Statement Relationships

# Person said quote
# Links the "person_jane_smith" node to the "quote_vision_statement" node;
# valid_from matches the quote's "date" property (2024-10-15).
said = MemoryEdge(
    id="rel_jane_said_quote",
    source_id="person_jane_smith",
    target_id="quote_vision_statement",
    relation_type="said",
    description="Statement made at TechCrunch 2024",
    properties={"context": "Keynote speech"},
    valid_from=datetime(2024, 10, 15),
    memory_id=memory.memory_id,
)

Batch Import from Structured Sources

From a Database/CRM

import json

# Example: Import from a JSON export
# Contacts reference their employer by company *name* (the "company" field),
# not by company "id"; import_crm_data() below matches on that name.
crm_data = {
    "contacts": [
        {"id": "1", "name": "John Doe", "company": "Acme Corp", "role": "Engineer"},
        {"id": "2", "name": "Jane Smith", "company": "TechStart", "role": "CEO"},
    ],
    "companies": [
        {"id": "c1", "name": "Acme Corp", "industry": "Technology"},
        {"id": "c2", "name": "TechStart", "industry": "AI"},
    ],
}

def import_crm_data(memory: GraphMem, crm_data: dict):
    """Import CRM data as structured entities and relationships.

    Creates one ``Organization`` node per company, one ``Person`` node per
    contact, and a ``works_at`` edge from each contact to the company whose
    ``name`` equals the contact's ``company`` field. Everything is persisted
    with a single ``memory.save()`` at the end.

    Args:
        memory: GraphMem instance to import into.
        crm_data: Mapping with ``"companies"`` and ``"contacts"`` lists
            (either may be absent and is then treated as empty). Each company
            needs ``id``, ``name`` and ``industry``; each contact needs
            ``id``, ``name``, ``company`` and ``role``.
    """
    memory._ensure_initialized()

    companies = crm_data.get("companies", [])
    contacts = crm_data.get("contacts", [])

    # Create company entities, remembering company name -> node id so that
    # contacts (which reference companies by name) can be linked afterwards.
    company_ids = {}
    for company in companies:
        node = MemoryNode(
            id=f"org_{company['id']}",
            name=company["name"],
            entity_type="Organization",
            description=f"{company['industry']} company",
            properties={"industry": company["industry"]},
            importance=MemoryImportance.HIGH,
            memory_id=memory.memory_id,
            user_id=memory.user_id,
        )
        memory._memory.add_node(node)
        company_ids[company["name"]] = node.id

    # Create person entities and relationships
    unlinked = []
    for contact in contacts:
        person = MemoryNode(
            id=f"person_{contact['id']}",
            name=contact["name"],
            entity_type="Person",
            description=f"{contact['role']} at {contact['company']}",
            properties={"role": contact["role"]},
            importance=MemoryImportance.MEDIUM,
            memory_id=memory.memory_id,
            user_id=memory.user_id,
        )
        memory._memory.add_node(person)

        company_id = company_ids.get(contact["company"])
        if company_id is None:
            # The person is still imported; only the works_at edge is skipped.
            unlinked.append(contact["name"])
            continue

        # Build the edge id from the two node ids rather than the raw company
        # name, so it contains no spaces and follows the same ID pattern as
        # the nodes it connects.
        edge = MemoryEdge(
            id=f"rel_{person.id}_works_at_{company_id}",
            source_id=person.id,
            target_id=company_id,
            relation_type="works_at",
            description=f"{contact['name']} works at {contact['company']}",
            properties={"role": contact["role"]},
            importance=MemoryImportance.MEDIUM,
            memory_id=memory.memory_id,
        )
        memory._memory.add_edge(edge)

    # Save all to storage
    memory.save()

    print(f"Imported {len(companies)} companies, {len(contacts)} contacts")
    if unlinked:
        # Surface contacts whose company was not in the export instead of
        # dropping the relationship silently.
        print(f"No matching company for {len(unlinked)} contact(s): {', '.join(unlinked)}")

# Use it
import_crm_data(memory, crm_data)

From a CSV File

import csv

def import_from_csv(memory: GraphMem, csv_path: str, entity_type: str):
    """Import entities from a CSV file.

    Each data row becomes one ``MemoryNode`` of type ``entity_type``. The
    ``id`` and ``name`` columns are required; ``description`` is optional.
    Every other column is copied into the node's ``properties``. A single
    ``memory.save()`` is issued after all rows are added.

    Args:
        memory: GraphMem instance to import into.
        csv_path: Path to a UTF-8 encoded CSV file with a header row.
        entity_type: Entity type assigned to every node; its lower-cased form
            prefixes each node id.

    Raises:
        KeyError: If a row has no ``id`` or ``name`` column.
    """
    memory._ensure_initialized()

    reserved = {"id", "name", "description"}
    id_prefix = entity_type.lower()

    # newline="" is what the csv module asks for (it does its own newline
    # handling, so quoted fields containing line breaks are read correctly);
    # the explicit encoding keeps the result independent of the OS locale.
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            # DictReader fills missing trailing fields with None and stores
            # surplus fields under the key None; keep both out of the node.
            extra = {
                key: value
                for key, value in row.items()
                if key is not None and key not in reserved and value is not None
            }
            node = MemoryNode(
                id=f"{id_prefix}_{row['id']}",
                name=row["name"],
                entity_type=entity_type,
                description=row.get("description") or "",
                properties=extra,
                importance=MemoryImportance.MEDIUM,
                memory_id=memory.memory_id,
                user_id=memory.user_id,
            )
            memory._memory.add_node(node)

    memory.save()

Hybrid Approach: Structured + LLM Extraction

Combine structured data with natural language for rich context:

# 1. First, add known structured entities
memory._ensure_initialized()

# Add known entities
# The aliases listed here are what step 2 relies on to match mentions in text.
ceo = MemoryNode(
    id="person_ceo",
    name="Sarah Chen",
    entity_type="Person",
    description="CEO of Quantum Innovations",
    aliases={"Sarah", "Dr. Chen", "S. Chen"},
    properties={"title": "CEO", "linkedin": "linkedin.com/in/sarahchen"},
    importance=MemoryImportance.CRITICAL,
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)
memory._memory.add_node(ceo)

# `properties` is omitted for this node.
company = MemoryNode(
    id="org_quantum",
    name="Quantum Innovations",
    entity_type="Organization",
    description="Quantum computing startup",
    aliases={"Quantum", "QI"},
    importance=MemoryImportance.CRITICAL,
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)
memory._memory.add_node(company)

# Persist the structured entities before ingesting free text.
memory.save()

# 2. Then ingest natural language that references these entities
# GraphMem will automatically link to existing entities via alias matching!
memory.ingest("""
    In today's earnings call, Sarah Chen announced Quantum Innovations achieved 
    $100M ARR, a 150% increase from last year. The company plans to expand 
    into Europe by Q2 2025.
""")

# The LLM extraction will recognize "Sarah Chen" and "Quantum Innovations"
# and link them to your pre-existing structured entities!

Generating Embeddings for Custom Entities

For semantic search to work, entities need embeddings:

def add_entity_with_embedding(memory: GraphMem, node: MemoryNode, *, save: bool = True):
    """Add entity and generate its embedding.

    The embedding is computed from the node's name and, when present, its
    description, then stored on ``node.embedding`` before the node is added.

    Args:
        memory: GraphMem instance to add the node to.
        node: Node to embed and add; its ``embedding`` attribute is
            overwritten.
        save: Persist immediately after adding (default, same as before).
            Pass ``save=False`` when adding many entities in a loop and call
            ``memory.save()`` once at the end, as recommended under
            "Always Call save() After Bulk Operations".
    """
    memory._ensure_initialized()

    # Generate embedding from name + description. Fall back to the name alone
    # when there is no description, so the text never ends in ": " or "None".
    if node.description:
        text_to_embed = f"{node.name}: {node.description}"
    else:
        text_to_embed = node.name

    node.embedding = memory._embeddings.embed_text(text_to_embed)

    memory._memory.add_node(node)
    if save:
        memory.save()

# Usage
# No embedding is supplied here; add_entity_with_embedding() sets
# `person.embedding` before the node is added to memory.
person = MemoryNode(
    id="person_new",
    name="Alex Johnson",
    entity_type="Person",
    description="Product Manager at CloudTech, specializes in AI products",
    memory_id=memory.memory_id,
    user_id=memory.user_id,
)

add_entity_with_embedding(memory, person)

# Now semantic search will find this entity!
response = memory.query("Who works on AI products?")

Best Practices

1. Use Consistent ID Patterns

# ✅ Good: Consistent, readable IDs
"person_john_doe"
"org_acme_corp"
"product_automate_pro"
"event_series_b_2024"

# ❌ Bad: Random or inconsistent IDs
"12345"
"John"
"ORG_ACME"

2. Add Rich Aliases

# ✅ Good: Many aliases for better matching
aliases={"John Doe", "John", "J. Doe", "Johnny", "JD"}

# ❌ Bad: Only the canonical name
aliases={"John Doe"}

3. Use Temporal Validity

# ✅ Good: Track when relationships are valid
valid_from=datetime(2020, 1, 1),
valid_until=datetime(2023, 12, 31),  # Past relationship

valid_from=datetime(2024, 1, 1),
valid_until=None,  # Current relationship (None = present)

# ❌ Bad: No temporal information
# (You lose the ability to query "Who was CEO in 2020?")

4. Set Appropriate Importance

# ✅ Good: Key entities get high importance
MemoryImportance.CRITICAL  # CEOs, key customers, critical data
MemoryImportance.HIGH      # Important entities
MemoryImportance.MEDIUM    # Regular entities (default)
MemoryImportance.LOW       # Less important, may decay
MemoryImportance.EPHEMERAL # Temporary, will decay quickly

5. Always Call save() After Bulk Operations

# ✅ Good: Single save after all additions
# (`entities` / `relations` stand for the MemoryNode / MemoryEdge collections
# you have already built; they are not defined in this snippet.)
for entity in entities:
    memory._memory.add_node(entity)
for relation in relations:
    memory._memory.add_edge(relation)
memory.save()  # One save at the end

# ❌ Bad: Save after each addition (slow)
for entity in entities:
    memory._memory.add_node(entity)
    memory.save()  # Slow!

Coming Soon: Public Structured API

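We're working on a cleaner public API for structured data. Until it is available, the examples on this page rely on the underscore-prefixed (non-public) members memory._memory and memory._ensure_initialized(). The planned API looks like this: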

# 🔜 Future API (not yet available)
memory.add_entity(
    name="John Doe",
    entity_type="Person",
    properties={"role": "Engineer"},
    relationships=[
        {"target": "Acme Corp", "type": "works_at", "since": "2020-01-01"}
    ]
)

memory.add_relationship(
    source="John Doe",
    target="Acme Corp",
    relationship="works_at",
    valid_from="2020-01-01",
)

Want this feature? Vote on GitHub Issues!


Summary

| Approach | Best For |
|---|---|
| memory.ingest(text) | Natural language, documents, unstructured content |
| memory._memory.add_node() | Pre-existing structured data, custom entity types |
| Hybrid | Structured base + natural language enrichment |

GraphMem supports both automatic LLM extraction AND manual structured data ingestion. Use whichever fits your use case, or combine them for the best of both worlds!