简介:本文详细介绍如何基于LangChain、DeepSeek大模型与RAG架构在本地环境部署私有化AI问答系统,涵盖环境配置、模型集成、检索增强优化及性能调优全流程,助力开发者构建安全可控的智能应用。
推荐采用“LangChain+FastAPI+Milvus”架构:
用户请求 → FastAPI网关 → LangChain工作流 → Milvus向量检索 → DeepSeek推理 → 响应生成
硬件配置建议:
# Create a Python virtual environment (Python 3.10 recommended)
python -m venv langchain_env
source langchain_env/bin/activate       # Linux/Mac
# or: langchain_env\Scripts\activate    # Windows

# Install the CUDA driver (it must match your GPU model):
# download the matching CUDA Toolkit release from the NVIDIA website
# LangChain ecosystem components
pip install langchain chromadb faiss-cpu sentence-transformers

# DeepSeek model loading
pip install transformers optimum

# RAG-related components
# (the extra is quoted so shells such as zsh do not glob-expand the brackets)
pip install pymilvus "unstructured[local-inference]" tiktoken

# Web service framework
pip install fastapi uvicorn
从HuggingFace下载量化版DeepSeek模型:
git lfs install
git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite

# Or use an optimized, quantized build
wget https://example.com/path/to/deepseek-q4k.bin
from unstructured.documents.elements import Text
from unstructured.partition.auto import partition


def process_document(file_path, chunk_size=500):
    """Split a document into fixed-size text chunks.

    The file is parsed with ``unstructured``'s ``partition`` and the text of
    every text-bearing element is cut into consecutive slices of at most
    ``chunk_size`` characters.

    Args:
        file_path: Path of the document to parse.
        chunk_size: Maximum number of characters per chunk (not words).

    Returns:
        A list of strings, in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text_chunks = []
    # `filename=` takes a path; `file=` is reserved for an open file object.
    for element in partition(filename=file_path):
        # Select by type instead of comparing `category` with "Text":
        # NOTE(review): unstructured's category labels look like
        # "NarrativeText" / "UncategorizedText", so the old comparison
        # presumably never matched -- confirm against the installed version.
        if not isinstance(element, Text):
            continue
        content = element.text or ""
        text_chunks.extend(
            content[start:start + chunk_size]
            for start in range(0, len(content), chunk_size)
        )
    return text_chunks
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
)


def init_milvus():
    """Connect to the local Milvus server and open the knowledge-base collection.

    Returns:
        The ``knowledge_base`` collection, declared with three fields:
        ``id`` (INT64 primary key), ``embedding`` (768-dimensional float
        vector) and ``text`` (VARCHAR, at most 2048 characters).
    """
    connections.connect(
        # The URI needs an explicit scheme.
        uri="http://localhost:19530",
        user="",
        password="",
        db_name="default",
    )

    # pymilvus expects a CollectionSchema built from FieldSchema objects,
    # not a plain dict description of the fields.
    schema = CollectionSchema(
        fields=[
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),
            FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
        ],
    )
    # NOTE(review): `id` is not auto-generated here, so writers must supply
    # their own ids -- confirm this matches how documents are inserted.
    collection = Collection(name="knowledge_base", schema=schema)
    return collection
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Milvus

# Embedding model. bge-base-zh-v1.5 produces 768-dimensional vectors, which
# matches a collection declared with dim=768, and it targets Chinese text.
# (The previous bge-small-en-v1.5 is a 384-dimensional English model.)
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-zh-v1.5",
    model_kwargs={"device": "cuda"},
)

# Vector store bound to the "knowledge_base" collection. The field names are
# spelled out because LangChain's defaults ("pk" / "vector") differ from a
# collection whose fields are named "id" / "embedding" / "text".
vectorstore = Milvus(
    connection_args={"uri": "http://localhost:19530"},  # scheme is required
    collection_name="knowledge_base",
    embedding_function=embeddings,
    primary_field="id",
    text_field="text",
    vector_field="embedding",
)


def hybrid_search(query, k=5):
    """Return the ``k`` documents most similar to ``query``.

    Despite the name, only semantic (vector) search is performed today;
    keyword retrieval such as BM25 can be merged in here later.

    Args:
        query: The user's query text.
        k: Number of documents to return.

    Returns:
        The documents returned by the vector store's similarity search.
    """
    semantic_results = vectorstore.similarity_search(query, k=k)
    # Extension point: combine with BM25 or other keyword-based retrieval.
    return semantic_results
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def load_deepseek(model_path="./deepseek-q4k", trust_remote_code=False):
    """Load a local DeepSeek checkpoint and wrap it as a LangChain LLM.

    Args:
        model_path: Directory (or hub id) of the checkpoint to load. The
            default keeps the previously hard-coded location.
        trust_remote_code: Forwarded to ``from_pretrained`` for both model and
            tokenizer. NOTE(review): some DeepSeek checkpoints ship custom
            modeling code and presumably need ``True``; enabling it executes
            code from the checkpoint, so only do so for sources you trust.

    Returns:
        A ``HuggingFacePipeline`` around a sampling text-generation pipeline
        (up to 512 new tokens, temperature 0.7).
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=trust_remote_code,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=trust_remote_code,
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
    )
    return HuggingFacePipeline(pipeline=pipe)
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts import PromptTemplate

# Answer prompt: retrieved passages go into {context}, the user question into
# {question}. Line breaks keep the passages, the question and the answering
# rules visually separated for the model.
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""基于以下背景信息回答问题:
{context}

问题:{question}

回答要求:
1. 严格基于给定信息
2. 使用中文回答
3. 总字数控制在200字以内""",
)

# Renders each retrieved document as its raw text only.
# NOTE(review): the library's default document prompt also interpolates a
# `source` metadata key; documents without that key would presumably be
# rejected -- switch back if your documents carry `source` metadata.
_document_prompt = PromptTemplate(
    input_variables=["page_content"],
    template="{page_content}",
)


def build_rag_chain(llm, vectorstore):
    """Build a retrieval-augmented QA chain over ``vectorstore``.

    Args:
        llm: The LangChain LLM used to generate answers.
        vectorstore: Vector store whose retriever supplies the top-3 passages.

    Returns:
        A ``RetrievalQAWithSourcesChain`` of type "stuff" that uses
        ``custom_prompt`` and also returns the retrieved source documents.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={
            "prompt": custom_prompt,
            # The sources chain injects documents under "summaries" by
            # default; point it at the variable our prompt actually declares.
            "document_variable_name": "context",
            "document_prompt": _document_prompt,
        },
        return_source_documents=True,
    )
    return chain
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class QueryRequest(BaseModel):
    """Request body of the ``/ask`` endpoint."""

    # The question to answer.
    question: str
    # Earlier conversation turns; accepted but not used by the handler.
    history: list = []


@app.post("/ask")
async def ask_question(request: QueryRequest):
    """Answer a question with the RAG chain.

    Returns:
        A dict with the generated ``answer`` and the ``sources`` string
        reported by the chain.
    """
    # NOTE(review): `rag_chain` is not defined in this snippet; it has to be
    # created once at startup, e.g. with build_rag_chain(...).
    # The undefined `StreamingCallbackHandler` callback was removed: it raised
    # NameError on every request, and the endpoint returns the complete answer
    # in one JSON body, so nothing consumed a token stream anyway.
    # NOTE(review): this call is synchronous and blocks the event loop while
    # the model generates -- consider offloading it to a worker thread.
    result = rag_chain({"question": request.question})
    return {
        "answer": result["answer"],
        "sources": result["sources"],
    }
使用anyio实现并发检索
#!/bin/bash

# Start the Milvus service
# NOTE(review): depending on the image version, a standalone Milvus container
# may need extra configuration or an explicit start command -- verify against
# the Milvus installation guide.
docker run -d --name milvus \
  -p 19530:19530 \
  -p 9091:9091 \
  milvusdb/milvus:latest

# Start the FastAPI service
# NOTE(review): each of the 4 workers is a separate process, so anything that
# main.py loads at import time (e.g. the model) is presumably loaded 4 times.
uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4
使用torch.compile进行模型优化;使用bitsandbytes进行8位量化;可按需替换模型(如gpt2-large、e5-large)。

本方案通过模块化设计实现了LangChain、DeepSeek与RAG的高效集成,在保证数据安全的前提下,提供了接近云端服务的交互体验。实际部署时建议先在开发环境验证完整流程,再逐步迁移到生产环境。根据硬件条件,可通过调整模型量化级别(Q4_K_M/Q8_0)和检索策略(语义/混合)来平衡性能与效果。