引言
Qdrant 是一个向量相似性搜索引擎。它提供了一项可投入生产使用的服务,该服务具有便捷的API,可用于存储、搜索和管理带有额外载荷和扩展过滤支持的向量。这使得它适用于各种神经网络或基于语义的匹配、分面搜索以及其他应用场景。Qdrant 官网文档:https://qdrant.tech/documentation/
在 LangChain 中使用 Qdrant 作为向量搜索引擎,需要先安装 Qdrant 客户端库:https://pypi.org/project/qdrant-client/ , 然后在 LangChain 中配置 Qdrant 客户端,即可使用 Qdrant 作为向量数据库。开发文档:https://docs.langchain.com/oss/python/integrations/vectorstores/qdrant。
LangChain为向量存储提供了一个统一的接口,Vector store 向量存储集成开发文档:https://docs.langchain.com/oss/python/integrations/vectorstores
正文
下面看一下使用实例。
示例一
以下示例所用版本:langchain 1.2.15,qdrant-client 1.1.0,Python 3.13.1。
一、使用 OpenAI 的模型作为向量嵌入模型。
数据写入向量数据库:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from uuid import uuid4
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import os

# NOTE(review): placeholders — replace with real credentials (or read them
# from environment variables) before running.
OPENAI_API_KEY = "OPENAI_API_KEY"
OPENAI_API_API_BASE = "OPENAI_API_API_BASE"

# Sample corpus: ten small documents, each tagged with a `source` so the
# metadata can later be used for filtering.
document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)
document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees Fahrenheit.",
    metadata={"source": "news"},
)
document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)
document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)
document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)
document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)
document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)
document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)
document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)
document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)
documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
# One UUID per document, used as the point id in Qdrant.
uuids = [str(uuid4()) for _ in documents]

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_API_API_BASE,
)

# Local (embedded) Qdrant — data is persisted under this directory.
client = QdrantClient(path=r"/data/project/local_qdrand")
collection_name = "local_documents_name"
try:
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            # BUGFIX: text-embedding-3-large produces 3072-dimensional
            # vectors (was 768) — the collection size must match the
            # embedding dimension or inserts will be rejected.
            vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
        )
        print("集合创建成功")
except Exception as e:
    print("创建集合出错:", e)
    exit(1)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)
vector_store.add_documents(documents=documents, ids=uuids)
print("数据写入向量数据库完成")
# Close the client so the embedded storage lock is released for other
# processes (the read script opens the same path).
client.close()
获取数据:
# Read side: query the local Qdrant collection using OpenAI embeddings.
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_openai import OpenAIEmbeddings
import os

# BUGFIX: these names were referenced below but never defined in this
# script, raising NameError at runtime. Read them from the environment,
# falling back to the same placeholders used by the ingestion script.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY")
OPENAI_API_API_BASE = os.environ.get("OPENAI_API_API_BASE", "OPENAI_API_API_BASE")

# Must be the same embedding model that was used at write time, otherwise
# query vectors live in a different space than the stored vectors.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_API_API_BASE,
)
client = QdrantClient(path=r"/data/project/local_qdrand")
collection_name = "local_documents_name"
vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

# Querying guide:
# https://docs.langchain.com/oss/python/integrations/vectorstores/qdrant#query-directly
# Option 1: similarity search directly on the vector store.
docs = vector_store.similarity_search("bank robbery", k=3)
for doc in docs:
    print("内容:", doc.page_content)
    print("元数据:", doc.metadata)
    print("-" * 50)
print("=" * 50)
# Option 2: wrap the store as a retriever (here using MMR re-ranking).
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 3})
docs = retriever.invoke("bank robbery")
print(docs)
print("=" * 50)
for doc in docs:
    print("内容:", doc.page_content)
    print("元数据:", doc.metadata)
    print("-" * 50)
# Close the client before the script exits to release the storage lock.
client.close()
二、通过 HuggingFaceEmbeddings 加载免费的开源模型作为向量嵌入模型
HuggingFaceEmbeddings 是从 HuggingFace 模型库加载预训练的模型,用于将文本转换为向量表示。
数据写入向量数据库:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from uuid import uuid4
from langchain_core.documents import Document
import os

# Force offline mode so HuggingFace never touches the network
# (works around SSL / connectivity errors); the model must already be
# present in the local cache.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["USER_AGENT"] = "my-app"

# Sample corpus: ten small documents, each tagged with a `source` so the
# metadata can later be used for filtering.
document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)
document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees Fahrenheit.",
    metadata={"source": "news"},
)
document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)
document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)
document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)
document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)
document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)
document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)
document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)
document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)
documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
# One UUID per document, used as the point id in Qdrant.
uuids = [str(uuid4()) for _ in documents]

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"local_files_only": True},
)

# Local (embedded) Qdrant — data is persisted under this directory.
client = QdrantClient(path=r"/data/project/local_qdrand")
collection_name = "local_documents_name"
try:
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            # all-mpnet-base-v2 outputs 768-dimensional vectors, so the
            # collection size matches the embedding dimension.
            vectors_config=VectorParams(size=768, distance=Distance.COSINE),
        )
        print("集合创建成功")
except Exception as e:
    print("创建集合出错:", e)
    exit(1)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)
vector_store.add_documents(documents=documents, ids=uuids)
print("数据写入向量数据库完成")
# Close the client so the embedded storage lock is released for other
# processes (the read script opens the same path).
client.close()
获取数据:
# Read side: query the local Qdrant collection with HuggingFace embeddings.
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_huggingface import HuggingFaceEmbeddings
import os

# Force offline mode so HuggingFace never touches the network
# (works around SSL / connectivity errors).
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["USER_AGENT"] = "my-app"


def _show(results):
    """Print page content and metadata for each retrieved document."""
    for item in results:
        print("内容:", item.page_content)
        print("元数据:", item.metadata)
        print("-" * 50)


# Must match the embedding model used at write time.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"local_files_only": True},
)
client = QdrantClient(path=r"/data/project/local_qdrand")
vector_store = QdrantVectorStore(
    client=client,
    collection_name="local_documents_name",
    embedding=embeddings,
)

# Querying guide:
# https://docs.langchain.com/oss/python/integrations/vectorstores/qdrant#query-directly
# Option 1: similarity search directly on the vector store.
_show(vector_store.similarity_search("bank robbery", k=3))
print("=" * 50)
# Option 2: wrap the store as a retriever (here using MMR re-ranking).
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 3})
docs = retriever.invoke("bank robbery")
print(docs)
print("=" * 50)
_show(docs)
# Manually close the client before the script exits.
client.close()
三、获取网页数据进行向量化存储
数据写入向量数据库:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from uuid import uuid4
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

# Force offline mode so HuggingFace never touches the network
# (works around SSL / connectivity errors); the page itself is still
# fetched over HTTP by WebBaseLoader.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["USER_AGENT"] = "my-app"

# Fetch and parse the web page into LangChain documents.
URL = "https://www.runoob.com/python3/python3-tutorial.html"
loader = WebBaseLoader(URL)
docs = loader.load()

# Split the page into overlapping chunks so each embedding covers a
# reasonably small, self-contained span of text.
documents = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50,
).split_documents(docs)
# One UUID per chunk, used as the point id in Qdrant.
uuids = [str(uuid4()) for _ in documents]

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"local_files_only": True},
)

# Local (embedded) Qdrant — data is persisted under this directory.
client = QdrantClient(path=r"/data/project/local_qdrand")
collection_name = "local_documents_name"
try:
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            # all-mpnet-base-v2 outputs 768-dimensional vectors.
            vectors_config=VectorParams(size=768, distance=Distance.COSINE),
        )
        print("集合创建成功")
except Exception as e:
    print("创建集合出错:", e)
    exit(1)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)
vector_store.add_documents(documents=documents, ids=uuids)
print("数据写入向量数据库完成")
# Close the client so the embedded storage lock is released.
client.close()
其他各类数据获取,包括本地文件、网络数据库、哔哩哔哩等各类网页应用,文档:https://docs.langchain.com/oss/python/integrations/providers/all_providers