
How to Build and Query Vector Indexes

Create HNSW indexes for semantic search and similarity queries.


Problem

You have vector embeddings (from OpenAI, sentence-transformers, etc.) and need to perform fast similarity search.
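
For example, with sentence-transformers (a minimal sketch; the all-MiniLM-L6-v2 model happens to produce 384-dimensional float32 vectors, matching the examples below):

from sentence_transformers import SentenceTransformer
import numpy as np

# Load a small general-purpose embedding model (384-dim output)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode a batch of texts into an (n, 384) float32 array
embeddings = model.encode(["first document", "second document"])
embeddings = np.asarray(embeddings, dtype=np.float32)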


Solution

1. Create a Vector Index (Python)

from sochdb.bulk import bulk_build_index
import numpy as np

# Generate or load embeddings
embeddings = np.random.randn(10000, 384).astype(np.float32)
ids = np.arange(10000, dtype=np.uint64)

# Build HNSW index with optimal parameters
stats = bulk_build_index(
    embeddings,
    output="./my_index.hnsw",
    ids=ids,              # Optional: custom IDs (must be uint64)
    m=16,                 # Graph connectivity (higher = better recall, more memory)
    ef_construction=200,  # Build quality (higher = better index, slower build)
)

print(f"Built index: {stats.vectors} vectors in {stats.elapsed_secs:.2f}s")

2. Query the Index

from sochdb.bulk import bulk_query_index

# Query vector (from your embedding model)
query = np.random.randn(384).astype(np.float32)

# Find k nearest neighbors
results = bulk_query_index(
    index_path="./my_index.hnsw",
    query=query,
    k=10,   # Number of results
    ef=50,  # Search quality (optional, higher = better recall, slower)
)

for doc_id, distance in results:
    print(f"ID: {doc_id}, Distance: {distance:.4f}")

3. Integrate with the Database (Rust)

use sochdb::{IndexType, SchemaBuilder, SochConnection, SochType, SochValue};

// Placeholder: swap in your real embedding model
fn get_embedding(_text: &str) -> Vec<f32> {
    vec![0.0; 384]
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let conn = SochConnection::open("./vector_db")?;

    // Create table with a vector column and an HNSW index on it
    let schema = SchemaBuilder::table("documents")
        .field("id", SochType::UInt)
        .field("content", SochType::Text)
        .field("embedding", SochType::Vector(384))
        .primary_key("id")
        .index("embedding", IndexType::HNSW {
            m: 16,
            ef_construction: 200,
        })
        .build();

    conn.create_table(schema)?;

    // Insert document with embedding
    let embedding: Vec<f32> = get_embedding("Hello world");
    conn.insert("documents", vec![
        ("id", SochValue::UInt(1)),
        ("content", SochValue::Text("Hello world".into())),
        ("embedding", SochValue::Vector(embedding)),
    ])?;

    // Vector search: 10 nearest neighbors to the query embedding
    let query_embedding = get_embedding("Hi there");
    let _results = conn.query("documents")
        .vector_search("embedding", &query_embedding, 10)
        .select(&["id", "content"])
        .execute()?;

    Ok(())
}

HNSW Parameter Tuning

Build Parameters

Parameter        Default  Range   Effect
m                16       4-64    Higher = better recall, more memory
ef_construction  200      50-500  Higher = better index quality, slower build

Guidelines:

  • Small datasets (< 10K): m=8, ef_construction=100
  • Medium (10K-1M): m=16, ef_construction=200
  • Large (> 1M): m=32, ef_construction=400
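
These guidelines can be folded into a small helper. A minimal sketch (choose_build_params is a hypothetical name, not part of the SochDB API; the thresholds mirror the list above):

def choose_build_params(n_vectors: int) -> dict:
    """Pick HNSW build parameters from dataset size (heuristic)."""
    if n_vectors < 10_000:
        return {"m": 8, "ef_construction": 100}
    if n_vectors <= 1_000_000:
        return {"m": 16, "ef_construction": 200}
    return {"m": 32, "ef_construction": 400}

# Usage with the bulk builder from step 1:
# stats = bulk_build_index(embeddings, output="./my_index.hnsw",
#                          **choose_build_params(len(embeddings)))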

Search Parameters

Parameter  Default  Range   Effect
ef         50       10-500  Higher = better recall, slower query
k          10       1-1000  Number of results

Guidelines:

  • Fast search (< 1ms): ef=20
  • Balanced: ef=50
  • High recall (> 99%): ef=200
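
Because ef only affects queries, it can be tuned empirically without rebuilding the index. A minimal sweep sketch (assumes the index and query vector from steps 1 and 2):

import time

for ef in (20, 50, 200):
    t0 = time.perf_counter()
    results = bulk_query_index(
        index_path="./my_index.hnsw", query=query, k=10, ef=ef
    )
    elapsed_ms = (time.perf_counter() - t0) * 1000
    print(f"ef={ef}: {elapsed_ms:.2f} ms, top hit: {results[0]}")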

Quantization Trade-offs

Type  Memory  Precision   Use Case
f32   100%    Full        Default, best accuracy
f16   50%     ~0.1% loss  Large indexes, memory-constrained
bf16  50%     ~0.5% loss  ML models trained with bf16
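
The storage arithmetic is easy to check in NumPy. A minimal sketch (this only illustrates the halved footprint; how SochDB applies quantization at build time is version-specific, so check your release notes):

import numpy as np

vecs_f32 = np.random.randn(100_000, 384).astype(np.float32)
vecs_f16 = vecs_f32.astype(np.float16)

print(f"f32: {vecs_f32.nbytes / 1e6:.0f} MB")  # ~154 MB
print(f"f16: {vecs_f16.nbytes / 1e6:.0f} MB")  # ~77 MB, half the storage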

Example: Semantic Search System

#!/usr/bin/env python3
"""Semantic search over documents using SochDB."""

from sochdb import Database
from sochdb.bulk import bulk_build_index, bulk_query_index
import numpy as np

# Simulated embedding function (replace with real model)
def get_embedding(text: str, dim: int = 384) -> np.ndarray:
    # In production: use sentence-transformers, OpenAI, etc.
    # Note: Python's str hash is salted per process, so these fake
    # embeddings are only stable within a single run.
    np.random.seed(hash(text) % 2**32)
    return np.random.randn(dim).astype(np.float32)

class SemanticSearch:
    def __init__(self, db_path: str, index_path: str):
        self.db = Database.open(db_path)
        self.index_path = index_path
        self.documents = []
        self.embeddings = []  # Kept so the index can be rebuilt with every vector

    def add_documents(self, docs: list[dict]):
        """Add documents with their embeddings."""
        for doc in docs:
            doc_id = len(self.documents)

            # Store document
            self.db.put(
                f"docs/{doc_id}/content".encode(),
                doc["content"].encode()
            )
            if "metadata" in doc:
                self.db.put(
                    f"docs/{doc_id}/metadata".encode(),
                    str(doc["metadata"]).encode()
                )

            # Get embedding
            self.embeddings.append(get_embedding(doc["content"]))
            self.documents.append(doc)

        # Rebuild index with all embeddings so index IDs stay aligned with doc IDs
        all_embeddings = np.array(self.embeddings)
        bulk_build_index(
            all_embeddings,
            output=self.index_path,
            ids=np.arange(len(self.embeddings), dtype=np.uint64),
            m=16,
            ef_construction=200
        )

    def search(self, query: str, k: int = 5) -> list[dict]:
        """Search for similar documents."""
        query_embedding = get_embedding(query)

        results = bulk_query_index(
            index_path=self.index_path,
            query=query_embedding,
            k=k,
            ef=50
        )

        search_results = []
        for doc_id, distance in results:
            content = self.db.get(f"docs/{doc_id}/content".encode())
            if content:
                search_results.append({
                    "id": doc_id,
                    "content": content.decode(),
                    # Assumes a normalized distance; adjust for your metric
                    "score": 1.0 - distance
                })

        return search_results


# Usage
search = SemanticSearch("./search_db", "./search.hnsw")

# Add documents
search.add_documents([
    {"content": "SochDB is an LLM-native database"},
    {"content": "Vector search enables semantic queries"},
    {"content": "HNSW provides fast approximate nearest neighbor search"},
    {"content": "Python SDK makes integration easy"},
])

# Search
results = search.search("database for AI applications", k=3)
for r in results:
    print(f"[{r['score']:.3f}] {r['content']}")

Discussion

Good for:

  • Semantic similarity (find similar documents)
  • Recommendation systems
  • RAG (Retrieval Augmented Generation)
  • Image/audio similarity

Not for:

  • Exact matching (use regular indexes)
  • Structured queries (use SQL-like queries)
  • Small datasets (< 1,000 items): brute-force search is simpler and exact (see the sketch below)
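
For small collections, exact search gives perfect recall with no index to build. A minimal brute-force sketch using cosine similarity (plain NumPy, independent of SochDB):

import numpy as np

def brute_force_search(query: np.ndarray, embeddings: np.ndarray, k: int = 10):
    """Exact k-NN by cosine similarity; fine for a few thousand vectors."""
    q = query / np.linalg.norm(query)
    e = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    scores = e @ q
    top = np.argsort(-scores)[:k]
    return list(zip(top.tolist(), scores[top].tolist()))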

Memory Estimation

Memory = vectors × dimensions × bytes_per_element × overhead

Example: 1M vectors × 384 dims × 4 bytes × 1.5 overhead
= 1,000,000 × 384 × 4 × 1.5
≈ 2.3 GB

With F16 quantization: ~1.15 GB
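
The same formula as a helper (the 1.5x graph overhead factor is the rough estimate used above, not a measured constant):

def estimate_index_memory_gb(n_vectors: int, dims: int,
                             bytes_per_element: int = 4,
                             overhead: float = 1.5) -> float:
    """Estimate HNSW index memory: vectors x dims x bytes x overhead."""
    return n_vectors * dims * bytes_per_element * overhead / 1e9

print(estimate_index_memory_gb(1_000_000, 384))     # ~2.3 GB (f32)
print(estimate_index_memory_gb(1_000_000, 384, 2))  # ~1.15 GB (f16)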


See Also