简介:本文详解从零搭建MCP(Model Control Protocol)客户端与服务端的全流程,结合DeepSeek、ollama、vLLM三大模型接入实战,提供可复用的代码框架与优化方案。
MCP(Model Control Protocol)作为新一代模型服务通信标准,通过标准化接口实现客户端与服务端的解耦。其核心设计包含三部分:
ModelRequest 与 ModelResponse 的典型通信流程:
sequenceDiagram
    Client->>Server: StreamInit(metadata)
    Server-->>Client: StreamAck(config)
    loop Bi-directional Stream
        Client->>Server: ModelRequest(prompt)
        Server-->>Client: ModelResponse(chunk)
    end
    Client->>Server: StreamComplete
package mainimport ("context""net""log""google.golang.org/grpc"pb "path/to/mcp/proto")type server struct {pb.UnimplementedModelServiceServermodels map[string]ModelAdapter}func (s *server) StreamModel(stream pb.ModelService_StreamModelServer) error {// 实现双向流处理逻辑req, err := stream.Recv()if err != nil {return err}// 根据req.ModelId选择模型适配器adapter, ok := s.models[req.ModelId]if !ok {return status.Errorf(codes.NotFound, "model not found")}// 调用具体模型推理chunks := adapter.Generate(req.Prompt)for _, chunk := range chunks {if err := stream.Send(&pb.ModelResponse{Content: chunk}); err != nil {return err}}return nil}func main() {lis, _ := net.Listen("tcp", ":50051")s := grpc.NewServer()pb.RegisterModelServiceServer(s, &server{models: make(map[string]ModelAdapter),})// 注册模型适配器(下文详解)registerDeepSeekAdapter(s)registerOllamaAdapter(s)registerVLLMAdapter(s)log.Println("Server started on :50051")s.Serve(lis)}
采用适配器模式解耦不同模型的接口差异:
type ModelAdapter interface {Generate(prompt string) []stringGetMetadata() ModelMetadata}// DeepSeek适配器实现type DeepSeekAdapter struct {client *deepseek.Client}func (d *DeepSeekAdapter) Generate(prompt string) []string {resp, _ := d.client.Complete(prompt, deepseek.Options{MaxTokens: 2000,Temperature: 0.7,})return strings.Split(resp.Text, "\n")}
# Python client example
from deepseek_api import Client


class DeepSeekMCPAdapter:
    """Adapter that streams chat completions from the "deepseek-chat" model."""

    def __init__(self, api_key):
        """Create the API client from api_key and fix the model ID."""
        self.client = Client(api_key)
        self.model_id = "deepseek-chat"

    def generate(self, prompt):
        """Yield the text of each streamed chunk for a single user prompt.

        A chunk whose delta content is empty or None is yielded as "".
        """
        messages = [{"role": "user", "content": prompt}]
        stream = self.client.chat.completions.create(
            model=self.model_id,
            messages=messages,
            stream=True,
        )
        for part in stream:
            text = part.choices[0].delta.content
            yield text if text else ""
max_tokens(建议1000-4000)
# Start the ollama service. `ollama serve` has no --model-dir flag; the model
# directory is selected with the OLLAMA_MODELS environment variable.
OLLAMA_MODELS=/path/to/models ollama serve
// Requires imports: bytes, encoding/json, errors, io, log, net/http.

// OllamaAdapter generates text through an Ollama server's HTTP API.
type OllamaAdapter struct {
	endpoint string // base URL of the Ollama server, without a trailing slash
}

// ollamaGenerateRequest is the JSON body posted to /api/generate.
type ollamaGenerateRequest struct {
	Model  string `json:"model"`
	Prompt string `json:"prompt"`
}

// ollamaGenerateChunk is one JSON object of the streamed /api/generate reply.
type ollamaGenerateChunk struct {
	Response string `json:"response"`
	Done     bool   `json:"done"`
	Error    string `json:"error"`
}

// Generate posts prompt to <endpoint>/api/generate for the "llama2" model and
// returns the non-empty "response" fragments of the streamed reply in order.
//
// The request body is built with encoding/json so quotes, backslashes and
// newlines in prompt are escaped correctly. The adapter interface has no error
// result, so failures are logged: a request that cannot be sent or that gets a
// non-200 status yields nil, and a stream that breaks midway yields the
// fragments received so far.
func (o *OllamaAdapter) Generate(prompt string) []string {
	payload, err := json.Marshal(ollamaGenerateRequest{Model: "llama2", Prompt: prompt})
	if err != nil {
		log.Printf("ollama: encoding request: %v", err)
		return nil
	}

	resp, err := http.Post(o.endpoint+"/api/generate", "application/json", bytes.NewReader(payload))
	if err != nil {
		log.Printf("ollama: posting generate request: %v", err)
		return nil
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("ollama: unexpected status %s", resp.Status)
		return nil
	}

	// The reply is a sequence of JSON objects; decode them one at a time
	// instead of buffering the whole body.
	var chunks []string
	dec := json.NewDecoder(resp.Body)
	for {
		var c ollamaGenerateChunk
		if err := dec.Decode(&c); err != nil {
			if !errors.Is(err, io.EOF) {
				log.Printf("ollama: decoding response: %v", err)
			}
			break
		}
		if c.Error != "" {
			log.Printf("ollama: server error: %s", c.Error)
			break
		}
		if c.Response != "" {
			chunks = append(chunks, c.Response)
		}
		if c.Done {
			break
		}
	}
	return chunks
}
# vLLM startup parameters example
from vllm import LLM, SamplingParams

# Engine configuration: model weights, tokenizer, tensor-parallel degree and
# compute dtype.
# NOTE(review): the tokenizer repo differs from the model repo — confirm the
# pairing is intentional.
llm = LLM(
    model="facebook/opt-125m",
    tokenizer="hf-internal-testing/llama-tokenizer",
    tensor_parallel_size=4,
    dtype="bfloat16",
)

# Sampling configuration used for generation requests.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=100,
)
type VLLMAdapter struct {engine *vllm.Engine}func (v *VLLMAdapter) Generate(prompt string) []string {outputs := v.engine.Generate(prompt, samplingParams)var chunks []stringfor _, output := range outputs {chunks = append(chunks, output.Outputs[0].Text)}return chunks}
batch_size=auto、kv_cache、continuous_batching=True
// TypeScript client example
import { createChannel, createClient } from "nice-grpc-web";
import { ModelServiceClient } from "./proto/mcp_pb_service";

/** Thin client for the model service reachable at a single endpoint. */
class MCPClient {
  private client: ModelServiceClient;

  /** Opens a channel to `endpoint` and builds the service client on it. */
  constructor(endpoint: string) {
    const channel = createChannel(endpoint);
    this.client = createClient(ModelServiceClient, channel);
  }

  /**
   * Sends one request for `modelId` with `prompt` and collects the `content`
   * of every streamed response, in arrival order.
   *
   * The stream is consumed as an async iterable, so a failed call rejects the
   * returned promise instead of leaving it pending forever.
   */
  async streamModel(prompt: string, modelId: string): Promise<string[]> {
    const chunks: string[] = [];
    for await (const response of this.client.streamModel({ modelId, prompt })) {
      chunks.push(response.content);
    }
    return chunks;
  }
}
# Prometheus监控指标示例
mcp_requests_total{model="deepseek"} 1024
mcp_request_duration_seconds{model="ollama"} 0.45
mcp_errors_total{type="timeout"} 12
# Multi-stage build example

# Build stage: compile a CGO-free Linux binary named mcp-server in /app.
FROM golang:1.21 AS builder
WORKDIR /app
COPY . .
RUN CGO_ENABLED=0 GOOS=linux go build -o mcp-server

# Runtime stage: ship only the compiled binary on a small base image.
FROM alpine:latest
COPY --from=builder /app/mcp-server .
EXPOSE 50051
CMD ["./mcp-server"]
# StatefulSet configuration example
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: mcp-server
spec:
  serviceName: mcp
  replicas: 3
  # apps/v1 requires a selector, and it must match the pod template labels.
  selector:
    matchLabels:
      app: mcp-server
  template:
    metadata:
      labels:
        app: mcp-server
    spec:
      containers:
        - name: mcp
          image: mcp-server:latest
          resources:
            limits:
              # One GPU per replica.
              nvidia.com/gpu: 1
          env:
            - name: MODELS_DIR
              value: "/models"
deadline = time.Now().Add(30 * time.Second)
grpc.WithKeepaliveParams(keepalive.ClientParameters{...})
docker run --shm-size=4g
// 重连机制实现func (c *Client) reconnect() error {for i := 0; i < 3; i++ {if conn, err := grpc.Dial(...); err == nil {c.conn = connreturn nil}time.Sleep(time.Duration(i*i) * time.Second)}return errors.New("max retries exceeded")}
本文提供的完整代码库与Docker镜像已在GitHub开源(示例链接),包含从基础协议实现到生产级部署的全套方案。开发者可根据实际需求选择模型组合,通过适配器模式快速扩展新模型支持。建议先在本地环境验证功能,再逐步迁移到测试/生产环境,配合监控系统实现全链路可观测性。