Qdrant学习


Qdrant学习


引言

Qdrant 是一个向量相似性搜索引擎。它提供了一项可投入生产使用的服务,该服务具有便捷的API,可用于存储、搜索和管理带有额外载荷和扩展过滤支持的向量。这使得它适用于各种神经网络或基于语义的匹配、分面搜索以及其他应用场景。Qdrant 官网文档:https://qdrant.tech/documentation/

在 LangChain 中使用 Qdrant 作为向量搜索引擎,需要先安装 Qdrant 客户端库:https://pypi.org/project/qdrant-client/ , 然后在 LangChain 中配置 Qdrant 客户端,即可使用 Qdrant 作为向量数据库。开发文档:https://docs.langchain.com/oss/python/integrations/vectorstores/qdrant

LangChain为向量存储提供了一个统一的接口,Vector store 向量存储集成开发文档:https://docs.langchain.com/oss/python/integrations/vectorstores

正文

下面看一下使用实例。

示例一

下面 langchain 版本为 1.2.15,qdrant-client 版本为 1.1.0,Python版本为 3.13.1。

一、使用 OpenAI 的 text-embedding-3-large 模型作为向量嵌入模型。

数据写入向量数据库:

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from uuid import uuid4
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import os

# Placeholder credentials: replace with real values (or export them as
# environment variables) before running.
OPENAI_API_KEY = "OPENAI_API_KEY"
OPENAI_API_API_BASE = "OPENAI_API_API_BASE"

# Sample corpus: ten documents tagged with a "source" metadata field
# (tweet / news / website) so metadata-based filtering can be shown later.
document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees Fahrenheit.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
# One random UUID per document, used as the Qdrant point id.
uuids = [str(uuid4()) for _ in range(len(documents))]

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_API_API_BASE
)

# Embedded (local, file-based) Qdrant instance -- no server required.
client = QdrantClient(path=r"/data/project/local_qdrand")

collection_name = "local_documents_name"

try:
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            # FIX: text-embedding-3-large produces 3072-dimensional vectors,
            # not 768. With size=768 the later add_documents call fails with
            # a vector-dimension mismatch.
            vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
        )
        print("集合创建成功")
except Exception as e:
    print("创建集合出错:", e)
    # Abort: nothing below can succeed without the collection.
    exit(1)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

vector_store.add_documents(documents=documents, ids=uuids)

print("数据写入向量数据库完成")

获取数据:

# Vector database read / query script.
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_openai import OpenAIEmbeddings
import os

# FIX: OPENAI_API_KEY / OPENAI_API_API_BASE were used below but never
# defined in this script, which raised NameError. Read them from the
# environment, falling back to the same placeholders as the write script.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY")
OPENAI_API_API_BASE = os.environ.get("OPENAI_API_API_BASE", "OPENAI_API_API_BASE")

# Must be the same embedding model the data was written with, otherwise
# query vectors will not match the stored vectors.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_API_API_BASE
)

client = QdrantClient(path=r"/data/project/local_qdrand")

collection_name = "local_documents_name"

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

# Search / read
# Reference: https://docs.langchain.com/oss/python/integrations/vectorstores/qdrant#query-directly
# Option 1: similarity search directly on the vector store.
docs = vector_store.similarity_search("bank robbery", k=3)
for doc in docs:
    print("内容:", doc.page_content)
    print("元数据:", doc.metadata)
    print("-" * 50)

print("=" * 50)

# Option 2: wrap the vector store in a retriever (here with MMR search).
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 3})
docs = retriever.invoke("bank robbery")
print(docs)
print("=" * 50)
for doc in docs:
    print("内容:", doc.page_content)
    print("元数据:", doc.metadata)
    print("-" * 50)


# Close the embedded client before the script exits so the on-disk
# storage lock is released.
client.close()

二、使用 HuggingFaceEmbeddings 加载免费的开源模型作为向量嵌入模型

HuggingFaceEmbeddings 是从 HuggingFace 模型库加载预训练的模型,用于将文本转换为向量表示。

数据写入向量数据库:

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from uuid import uuid4
from langchain_core.documents import Document
import os

# Force offline mode so huggingface.co is never contacted (avoids SSL /
# network errors once the model is already cached locally).
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["USER_AGENT"] = "my-app"

# Sample corpus as (page_content, source) pairs.
corpus = [
    ("I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees Fahrenheit.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
]

documents = [
    Document(page_content=text, metadata={"source": origin})
    for text, origin in corpus
]
# One random UUID per document, used as the Qdrant point id.
uuids = [str(uuid4()) for _ in documents]

# all-mpnet-base-v2 produces 768-dimensional sentence embeddings, which
# matches the VectorParams size below.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"local_files_only": True}
)

# Embedded (local, file-based) Qdrant instance.
client = QdrantClient(path=r"/data/project/local_qdrand")

collection_name = "local_documents_name"

try:
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=768, distance=Distance.COSINE),
        )
        print("集合创建成功")
except Exception as e:
    print("创建集合出错:", e)
    exit(1)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

vector_store.add_documents(documents=documents, ids=uuids)

print("数据写入向量数据库完成")

获取数据:

# Vector database read / query script.
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_huggingface import HuggingFaceEmbeddings
import os

# Force offline mode so huggingface.co is never contacted (avoids SSL /
# network errors once the model is already cached locally).
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["USER_AGENT"] = "my-app"

# Must match the embedding model used when the data was written.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"local_files_only": True}
)

client = QdrantClient(path=r"/data/project/local_qdrand")

collection_name = "local_documents_name"

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)


def show_hits(hits):
    # Print each hit's content and metadata, separated by a dashed rule.
    for hit in hits:
        print("内容:", hit.page_content)
        print("元数据:", hit.metadata)
        print("-" * 50)


# Search / read
# Reference: https://docs.langchain.com/oss/python/integrations/vectorstores/qdrant#query-directly
# Option 1: similarity search directly on the vector store.
show_hits(vector_store.similarity_search("bank robbery", k=3))

print("=" * 50)

# Option 2: wrap the vector store in a retriever (here with MMR search).
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 3})
docs = retriever.invoke("bank robbery")
print(docs)
print("=" * 50)
show_hits(docs)


# Close the embedded client manually before the script exits.
client.close()

三、获取网页数据进行向量化存储

数据写入向量数据库:

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from uuid import uuid4
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

# Force offline mode so huggingface.co is never contacted (avoids SSL /
# network errors once the model is already cached locally).
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["USER_AGENT"] = "my-app"

# Fetch the web page as LangChain documents.
URL = "https://www.runoob.com/python3/python3-tutorial.html"
raw_docs = WebBaseLoader(URL).load()

# Split the page into overlapping chunks before embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50,
)
documents = splitter.split_documents(raw_docs)

# One random UUID per chunk, used as the Qdrant point id.
uuids = [str(uuid4()) for _ in documents]

# all-mpnet-base-v2 produces 768-dimensional sentence embeddings, which
# matches the VectorParams size below.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"local_files_only": True}
)

# Embedded (local, file-based) Qdrant instance.
client = QdrantClient(path=r"/data/project/local_qdrand")

collection_name = "local_documents_name"

try:
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=768, distance=Distance.COSINE),
        )
        print("集合创建成功")
except Exception as e:
    print("创建集合出错:", e)
    exit(1)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

vector_store.add_documents(documents=documents, ids=uuids)

print("数据写入向量数据库完成")

其他各类数据获取,包括本地文件、网络数据库、哔哩哔哩等各类网页应用,文档:https://docs.langchain.com/oss/python/integrations/providers/all_providers






参考资料


返回