跳到主要内容

Complete 示例

本示例演示了当前 pyseekdb 支持的全量能力。

包含以下操作:

  1. 连接,包含全部连接模式
  2. Collection 管理
  3. DML 操作,包括 add、update、upsert、delete
  4. DQL 操作,包括 query、get、hybrid_search
  5. Filter 操作符
  6. Collection 信息方法

示例

import uuid
import random
import pyseekdb

# ============================================================================
# PART 1: CLIENT CONNECTION
# ============================================================================

# Option 1: Embedded mode (local SeekDB)
# Called without arguments here. To set them explicitly, pass for example:
#     pyseekdb.Client(path="./seekdb", database="test")
client = pyseekdb.Client()

# Option 2: Server mode (remote SeekDB server)
# client = pyseekdb.Client(
#     host="127.0.0.1",
#     port=2881,
#     database="test",
#     user="root",
#     password="",
# )

# Option 3: Remote server mode (OceanBase Server)
# Same arguments as option 2, plus the tenant.
# client = pyseekdb.Client(
#     host="127.0.0.1",
#     port=2881,
#     tenant="test",  # OceanBase default tenant
#     database="test",
#     user="root",
#     password="",
# )

# ============================================================================
# PART 2: COLLECTION MANAGEMENT
# ============================================================================

collection_name = "comprehensive_example"
dimension = 128  # length of every embedding vector stored in the main collection

# 2.1 Create a collection
from pyseekdb import HNSWConfiguration

config = HNSWConfiguration(dimension=dimension, distance="cosine")
collection = client.get_or_create_collection(
    name=collection_name,
    configuration=config,
    # Explicitly None: this example supplies its own 128-dim embeddings.
    embedding_function=None,
)

# 2.2 Check if collection exists
exists = client.has_collection(collection_name)

# 2.3 Get collection object
retrieved_collection = client.get_collection(
    collection_name,
    embedding_function=None,
)

# 2.4 List all collections
all_collections = client.list_collections()

# 2.5 Get or create collection (creates if doesn't exist)
config2 = HNSWConfiguration(dimension=64, distance="cosine")
collection2 = client.get_or_create_collection(
    name="another_collection",
    configuration=config2,
    # Explicitly None: this collection would take custom 64-dim embeddings.
    embedding_function=None,
)

# ============================================================================
# PART 3: DML OPERATIONS - ADD DATA
# ============================================================================

# Generate sample data
# A fixed seed makes the pseudo-random embeddings below identical on every run.
random.seed(42)
documents = [
    "Machine learning is transforming the way we solve problems",
    "Python programming language is widely used in data science",
    "Vector databases enable efficient similarity search",
    "Neural networks mimic the structure of the human brain",
    "Natural language processing helps computers understand human language",
    "Deep learning requires large amounts of training data",
    "Reinforcement learning agents learn through trial and error",
    "Computer vision enables machines to interpret visual information",
]

# Generate embeddings (in real usage, use an embedding model):
# one random vector of length `dimension` per document, in document order.
embeddings = [[random.random() for _ in range(dimension)] for _ in documents]

# One random UUID string per document, aligned by position with `documents`.
ids = [str(uuid.uuid4()) for _ in documents]

# 3.1 Add single item (scalar id/document/metadata, one flat embedding vector)
single_id = str(uuid.uuid4())
collection.add(
    ids=single_id,
    documents="This is a single document",
    embeddings=[random.random() for _ in range(dimension)],
    metadatas={"type": "single", "category": "test"},
)

# 3.2 Add multiple items (all lists are aligned by position)
collection.add(
    ids=ids,
    documents=documents,
    embeddings=embeddings,
    metadatas=[
        {"category": "AI", "score": 95, "tag": "ml", "year": 2023},
        {"category": "Programming", "score": 88, "tag": "python", "year": 2022},
        {"category": "Database", "score": 92, "tag": "vector", "year": 2023},
        {"category": "AI", "score": 90, "tag": "neural", "year": 2022},
        {"category": "NLP", "score": 87, "tag": "language", "year": 2023},
        {"category": "AI", "score": 93, "tag": "deep", "year": 2023},
        {"category": "AI", "score": 85, "tag": "reinforcement", "year": 2022},
        {"category": "CV", "score": 91, "tag": "vision", "year": 2023},
    ],
)

# 3.3 Add with only embeddings (no documents)
vector_only_ids = [str(uuid.uuid4()) for _ in range(2)]
collection.add(
    ids=vector_only_ids,
    embeddings=[
        [random.random() for _ in range(dimension)]
        for _ in vector_only_ids
    ],
    metadatas=[{"type": "vector_only"} for _ in vector_only_ids],
)

# ============================================================================
# PART 4: DML OPERATIONS - UPDATE DATA
# ============================================================================

# 4.1 Update single item (metadata only)
collection.update(
    ids=ids[0],
    metadatas={
        "category": "AI",
        "score": 98,
        "tag": "ml",
        "year": 2024,
        "updated": True,
    },
)

# 4.2 Update multiple items (documents, embeddings and metadata together)
collection.update(
    ids=ids[1:3],
    documents=["Updated document 1", "Updated document 2"],
    embeddings=[
        [random.random() for _ in range(dimension)]
        for _ in range(2)
    ],
    metadatas=[
        {"category": "Programming", "score": 95, "updated": True},
        {"category": "Database", "score": 97, "updated": True},
    ],
)

# 4.3 Update embeddings only
new_embeddings = [
    [random.random() for _ in range(dimension)]
    for _ in range(2)
]
collection.update(ids=ids[2:4], embeddings=new_embeddings)

# ============================================================================
# PART 5: DML OPERATIONS - UPSERT DATA
# ============================================================================

# 5.1 Upsert existing item (will update)
collection.upsert(
    ids=ids[0],
    documents="Upserted document (was updated)",
    embeddings=[random.random() for _ in range(dimension)],
    metadatas={"category": "AI", "upserted": True},
)

# 5.2 Upsert new item (will insert)
new_id = str(uuid.uuid4())
collection.upsert(
    ids=new_id,
    documents="This is a new document from upsert",
    embeddings=[random.random() for _ in range(dimension)],
    metadatas={"category": "New", "upserted": True},
)

# 5.3 Upsert multiple items: ids[4] already exists, the fresh UUID does not
upsert_ids = [ids[4], str(uuid.uuid4())]
collection.upsert(
    ids=upsert_ids,
    documents=["Upserted doc 1", "Upserted doc 2"],
    embeddings=[
        [random.random() for _ in range(dimension)]
        for _ in range(2)
    ],
    metadatas=[{"upserted": True}, {"upserted": True}],
)

# ============================================================================
# PART 6: DQL OPERATIONS - QUERY (VECTOR SIMILARITY SEARCH)
# ============================================================================

# 6.1 Basic vector similarity query
# The vector originally generated for the first document is the probe
# for every single-vector query in this part.
query_vector = embeddings[0]
results = collection.query(query_embeddings=query_vector, n_results=3)
print(f"Query results: {len(results['ids'][0])} items")

# 6.2 Query with metadata filter (simplified equality)
results = collection.query(
    query_embeddings=query_vector,
    where={"category": "AI"},
    n_results=5,
)

# 6.3 Query with comparison operators
results = collection.query(
    query_embeddings=query_vector,
    where={"score": {"$gte": 90}},
    n_results=5,
)

# 6.4 Query with $in operator
results = collection.query(
    query_embeddings=query_vector,
    where={"tag": {"$in": ["ml", "python", "neural"]}},
    n_results=5,
)

# 6.5 Query with logical operators ($or) - simplified equality
results = collection.query(
    query_embeddings=query_vector,
    where={"$or": [{"category": "AI"}, {"tag": "python"}]},
    n_results=5,
)

# 6.6 Query with logical operators ($and) - equality plus a comparison
results = collection.query(
    query_embeddings=query_vector,
    where={"$and": [{"category": "AI"}, {"score": {"$gte": 90}}]},
    n_results=5,
)

# 6.7 Query with document filter
results = collection.query(
    query_embeddings=query_vector,
    where_document={"$contains": "machine learning"},
    n_results=5,
)

# 6.8 Query with combined filters (metadata filter plus document filter)
results = collection.query(
    query_embeddings=query_vector,
    where={"category": "AI", "year": {"$gte": 2023}},
    where_document={"$contains": "learning"},
    n_results=5,
)

# 6.9 Query with multiple embeddings (batch query)
batch_embeddings = [embeddings[0], embeddings[1]]
batch_results = collection.query(
    query_embeddings=batch_embeddings,
    n_results=2,
)
# batch_results["ids"][0] contains results for first query
# batch_results["ids"][1] contains results for second query

# 6.10 Query with specific fields
results = collection.query(
    query_embeddings=query_vector,
    include=["documents", "metadatas", "embeddings"],
    n_results=2,
)

# ============================================================================
# PART 7: DQL OPERATIONS - GET (RETRIEVE BY IDS OR FILTERS)
# ============================================================================

# 7.1 Get by single ID
result = collection.get(ids=ids[0])
# result["ids"] contains [ids[0]]
# result["documents"] contains document for ids[0]

# 7.2 Get by multiple IDs
results = collection.get(ids=ids[:3])
# results["ids"] contains ids[:3]
# results["documents"] contains documents for all IDs

# 7.3 Get by metadata filter (simplified equality)
results = collection.get(where={"category": "AI"}, limit=5)

# 7.4 Get with comparison operators
results = collection.get(where={"score": {"$gte": 90}}, limit=5)

# 7.5 Get with $in operator
results = collection.get(where={"tag": {"$in": ["ml", "python"]}}, limit=5)

# 7.6 Get with logical operators (simplified equality)
results = collection.get(
    where={"$or": [{"category": "AI"}, {"category": "Programming"}]},
    limit=5,
)

# 7.7 Get by document filter
results = collection.get(where_document={"$contains": "Python"}, limit=5)

# 7.8 Get with pagination (two consecutive pages of two items each)
results_page1 = collection.get(limit=2, offset=0)
results_page2 = collection.get(limit=2, offset=2)

# 7.9 Get with specific fields
results = collection.get(
    ids=ids[:2],
    include=["documents", "metadatas", "embeddings"],
)

# 7.10 Get all data (capped at 100 items by the limit)
all_results = collection.get(limit=100)

# ============================================================================
# PART 8: DQL OPERATIONS - HYBRID SEARCH
# ============================================================================

# 8.1 Hybrid search with full-text and vector search
# Note: This requires query_embeddings to be provided directly
# In real usage, you might have an embedding function
hybrid_results = collection.hybrid_search(
    # Full-text side: document filter plus metadata filter.
    query={
        "where_document": {"$contains": "machine learning"},
        "where": {"category": "AI"},  # Simplified equality
        "n_results": 10,
    },
    # Vector side: probe vector plus its own metadata filter.
    knn={
        "query_embeddings": [embeddings[0]],
        "where": {"year": {"$gte": 2022}},
        "n_results": 10,
    },
    rank={"rrf": {}},  # Reciprocal Rank Fusion
    n_results=5,
    include=["documents", "metadatas"],
)
# hybrid_results["ids"][0] contains IDs for the hybrid search
# hybrid_results["documents"][0] contains documents for the hybrid search
print(f"Hybrid search: {len(hybrid_results.get('ids', [[]])[0])} results")

# ============================================================================
# PART 9: DML OPERATIONS - DELETE DATA
# ============================================================================

# 9.1 Delete by IDs (one vector-only item and the item inserted by upsert)
delete_ids = [vector_only_ids[0], new_id]
collection.delete(ids=delete_ids)

# 9.2 Delete by metadata filter
collection.delete(where={"type": {"$eq": "vector_only"}})

# 9.3 Delete by document filter
collection.delete(where_document={"$contains": "Updated document"})

# 9.4 Delete with combined filters (metadata filter plus document filter)
collection.delete(
    where={"category": {"$eq": "CV"}},
    where_document={"$contains": "vision"},
)

# ============================================================================
# PART 10: COLLECTION INFORMATION
# ============================================================================

# 10.1 Get collection count
count = collection.count()
print(f"Collection count: {count} items")

# 10.2 Preview first few items in collection (returns all columns by default)
preview = collection.peek(limit=5)
print(f"Preview: {len(preview['ids'])} items")
# The preview fields are parallel lists; index i addresses one item in each.
for i, item_id in enumerate(preview['ids']):
    print(f" ID: {item_id}, Document: {preview['documents'][i]}")
    embedding = preview['embeddings'][i]
    # A falsy embedding (e.g. None or an empty list) is reported as dimension 0.
    embedding_dim = len(embedding) if embedding else 0
    print(f" Metadata: {preview['metadatas'][i]}, Embedding dim: {embedding_dim}")

# 10.3 Count collections in database
collection_count = client.count_collection()
print(f"Database has {collection_count} collections")

# ============================================================================
# PART 11: CLEANUP
# ============================================================================

# Delete test collections
# Best-effort: a failure to drop the secondary collection is reported,
# not raised, so the script still reaches the final cleanup step.
try:
    client.delete_collection("another_collection")
except Exception as e:
    print(f"Could not delete 'another_collection': {e}")

# Delete the main collection too; comment this line out to keep its data
# for inspection after the script finishes.
client.delete_collection(collection_name)

相关文档