
How to Build and Query Vector Indexes

Create HNSW indexes for semantic search and similarity queries.


Problem

You have vector embeddings (from OpenAI, sentence-transformers, etc.) and need to perform fast similarity search.
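
For example, with sentence-transformers (a minimal sketch; the all-MiniLM-L6-v2 model happens to produce 384-dimensional float32 vectors, matching the examples below):

from sentence_transformers import SentenceTransformer
import numpy as np

# Load a small general-purpose embedding model (384-dim output)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode a batch of texts into an (n, 384) float32 array
embeddings = model.encode(["first document", "second document"])
embeddings = np.asarray(embeddings, dtype=np.float32)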


Solution

1. Create a Vector Index (Python)

from sochdb.bulk import bulk_build_index
import numpy as np

# Generate or load embeddings
embeddings = np.random.randn(10000, 384).astype(np.float32)
ids = np.arange(10000, dtype=np.uint64)

# Build HNSW index with optimal parameters
stats = bulk_build_index(
    embeddings,
    output="./my_index.hnsw",
    ids=ids,              # Optional: custom IDs (must be uint64)
    m=16,                 # Graph connectivity (higher = better recall, more memory)
    ef_construction=200,  # Build quality (higher = better index, slower build)
)

print(f"Built index: {stats.vectors} vectors in {stats.elapsed_secs:.2f}s")

2. Query the Index

from sochdb.bulk import bulk_query_index

# Query vector (from your embedding model)
query = np.random.randn(384).astype(np.float32)

# Find k nearest neighbors
results = bulk_query_index(
    index_path="./my_index.hnsw",
    query=query,
    k=10,   # Number of results
    ef=50,  # Search quality (optional, higher = better recall, slower)
)

for doc_id, distance in results:
    print(f"ID: {doc_id}, Distance: {distance:.4f}")

3. Integrate with the Database (Rust)

use sochdb::{IndexType, SchemaBuilder, SochConnection, SochType, SochValue};

// Placeholder: swap in your real embedding model
fn get_embedding(_text: &str) -> Vec<f32> {
    vec![0.0; 384]
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let conn = SochConnection::open("./vector_db")?;

    // Create table with a vector column and an HNSW index on it
    let schema = SchemaBuilder::table("documents")
        .field("id", SochType::UInt)
        .field("content", SochType::Text)
        .field("embedding", SochType::Vector(384))
        .primary_key("id")
        .index("embedding", IndexType::HNSW {
            m: 16,
            ef_construction: 200,
        })
        .build();

    conn.create_table(schema)?;

    // Insert document with embedding
    let embedding: Vec<f32> = get_embedding("Hello world");
    conn.insert("documents", vec![
        ("id", SochValue::UInt(1)),
        ("content", SochValue::Text("Hello world".into())),
        ("embedding", SochValue::Vector(embedding)),
    ])?;

    // Vector search: 10 nearest neighbors to the query embedding
    let query_embedding = get_embedding("Hi there");
    let _results = conn.query("documents")
        .vector_search("embedding", &query_embedding, 10)
        .select(&["id", "content"])
        .execute()?;

    Ok(())
}

HNSW Parameter Tuning

Build Parameters

Parameter        Default  Range   Effect
m                16       4-64    Higher = better recall, more memory
ef_construction  200      50-500  Higher = better index quality, slower build

Guidelines:

  • Small datasets (< 10K): m=8, ef_construction=100
  • Medium (10K-1M): m=16, ef_construction=200
  • Large (> 1M): m=32, ef_construction=400
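
These guidelines can be folded into a small helper. A minimal sketch (choose_build_params is a hypothetical name, not part of the SochDB API; the thresholds mirror the list above):

def choose_build_params(n_vectors: int) -> dict:
    """Pick HNSW build parameters from dataset size (heuristic)."""
    if n_vectors < 10_000:
        return {"m": 8, "ef_construction": 100}
    if n_vectors <= 1_000_000:
        return {"m": 16, "ef_construction": 200}
    return {"m": 32, "ef_construction": 400}

# Usage with the bulk builder from step 1:
# stats = bulk_build_index(embeddings, output="./my_index.hnsw",
#                          **choose_build_params(len(embeddings)))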

Search Parameters

Parameter  Default  Range   Effect
ef         50       10-500  Higher = better recall, slower query
k          10       1-1000  Number of results

Guidelines:

  • Fast search (< 1ms): ef=20
  • Balanced: ef=50
  • High recall (> 99%): ef=200
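
Because ef only affects queries, it can be tuned empirically without rebuilding the index. A minimal sweep sketch (assumes the index and query vector from steps 1 and 2):

import time

for ef in (20, 50, 200):
    t0 = time.perf_counter()
    results = bulk_query_index(
        index_path="./my_index.hnsw", query=query, k=10, ef=ef
    )
    elapsed_ms = (time.perf_counter() - t0) * 1000
    print(f"ef={ef}: {elapsed_ms:.2f} ms, top hit: {results[0]}")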

Quantization Trade-offs

Type  Memory  Precision   Use Case
f32   100%    Full        Default, best accuracy
f16   50%     ~0.1% loss  Large indexes, memory-constrained
bf16  50%     ~0.5% loss  ML models trained with bf16
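
The storage arithmetic is easy to check in NumPy. A minimal sketch (this only illustrates the halved footprint; how SochDB applies quantization at build time is version-specific, so check your release notes):

import numpy as np

vecs_f32 = np.random.randn(100_000, 384).astype(np.float32)
vecs_f16 = vecs_f32.astype(np.float16)

print(f"f32: {vecs_f32.nbytes / 1e6:.0f} MB")  # ~154 MB
print(f"f16: {vecs_f16.nbytes / 1e6:.0f} MB")  # ~77 MB, half the storage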

Example: Semantic Search System

#!/usr/bin/env python3
"""Semantic search over documents using SochDB."""

from sochdb import Database
from sochdb.bulk import bulk_build_index, bulk_query_index
import numpy as np

# Simulated embedding function (replace with real model)
def get_embedding(text: str, dim: int = 384) -> np.ndarray:
    # In production: use sentence-transformers, OpenAI, etc.
    # Note: Python's str hash is salted per process, so these fake
    # embeddings are only stable within a single run.
    np.random.seed(hash(text) % 2**32)
    return np.random.randn(dim).astype(np.float32)

class SemanticSearch:
    def __init__(self, db_path: str, index_path: str):
        self.db = Database.open(db_path)
        self.index_path = index_path
        self.documents = []
        self.embeddings = []  # Kept so the index can be rebuilt with every vector

    def add_documents(self, docs: list[dict]):
        """Add documents with their embeddings."""
        for doc in docs:
            doc_id = len(self.documents)

            # Store document
            self.db.put(
                f"docs/{doc_id}/content".encode(),
                doc["content"].encode()
            )
            if "metadata" in doc:
                self.db.put(
                    f"docs/{doc_id}/metadata".encode(),
                    str(doc["metadata"]).encode()
                )

            # Get embedding
            self.embeddings.append(get_embedding(doc["content"]))
            self.documents.append(doc)

        # Rebuild index with all embeddings so index IDs stay aligned with doc IDs
        all_embeddings = np.array(self.embeddings)
        bulk_build_index(
            all_embeddings,
            output=self.index_path,
            ids=np.arange(len(self.embeddings), dtype=np.uint64),
            m=16,
            ef_construction=200
        )

    def search(self, query: str, k: int = 5) -> list[dict]:
        """Search for similar documents."""
        query_embedding = get_embedding(query)

        results = bulk_query_index(
            index_path=self.index_path,
            query=query_embedding,
            k=k,
            ef=50
        )

        search_results = []
        for doc_id, distance in results:
            content = self.db.get(f"docs/{doc_id}/content".encode())
            if content:
                search_results.append({
                    "id": doc_id,
                    "content": content.decode(),
                    # Assumes a normalized distance; adjust for your metric
                    "score": 1.0 - distance
                })

        return search_results


# Usage
search = SemanticSearch("./search_db", "./search.hnsw")

# Add documents
search.add_documents([
    {"content": "SochDB is an LLM-native database"},
    {"content": "Vector search enables semantic queries"},
    {"content": "HNSW provides fast approximate nearest neighbor search"},
    {"content": "Python SDK makes integration easy"},
])

# Search
results = search.search("database for AI applications", k=3)
for r in results:
    print(f"[{r['score']:.3f}] {r['content']}")

Discussion

Good for:

  • Semantic similarity (find similar documents)
  • Recommendation systems
  • RAG (Retrieval Augmented Generation)
  • Image/audio similarity

Not for:

  • Exact matching (use regular indexes)
  • Structured queries (use SQL-like queries)
  • Small datasets (< 1,000 items): brute-force search is simpler and exact (see the sketch below)
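
For small collections, exact search gives perfect recall with no index to build. A minimal brute-force sketch using cosine similarity (plain NumPy, independent of SochDB):

import numpy as np

def brute_force_search(query: np.ndarray, embeddings: np.ndarray, k: int = 10):
    """Exact k-NN by cosine similarity; fine for a few thousand vectors."""
    q = query / np.linalg.norm(query)
    e = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    scores = e @ q
    top = np.argsort(-scores)[:k]
    return list(zip(top.tolist(), scores[top].tolist()))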

Memory Estimation

Memory = vectors × dimensions × bytes_per_element × overhead

Example: 1M vectors × 384 dims × 4 bytes × 1.5 overhead
= 1,000,000 × 384 × 4 × 1.5
≈ 2.3 GB

With F16 quantization: ~1.15 GB
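
The same formula as a helper (the 1.5x graph overhead factor is the rough estimate used above, not a measured constant):

def estimate_index_memory_gb(n_vectors: int, dims: int,
                             bytes_per_element: int = 4,
                             overhead: float = 1.5) -> float:
    """Estimate HNSW index memory: vectors x dims x bytes x overhead."""
    return n_vectors * dims * bytes_per_element * overhead / 1e9

print(estimate_index_memory_gb(1_000_000, 384))     # ~2.3 GB (f32)
print(estimate_index_memory_gb(1_000_000, 384, 2))  # ~1.15 GB (f16)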


See Also