# Laravel AI SDK 向量嵌入与语义搜索

## 摘要

向量嵌入是现代 AI 应用的核心技术，使计算机能够理解文本的语义含义。Laravel AI SDK 提供了简洁的 API，用于生成向量嵌入和构建语义搜索系统。本文将深入讲解：
- 向量嵌入的核心概念与原理
- 使用 Laravel AI SDK 生成嵌入
- PostgreSQL pgvector 集成
- 构建语义搜索引擎
- RAG（检索增强生成）实现
- 实战案例：智能文档搜索系统

本文适合希望构建 AI 驱动搜索系统的开发者。
## 1. 向量嵌入概述

### 1.1 什么是向量嵌入

向量嵌入是将文本、图像或其他数据转换为高维向量的过程。这些向量捕捉了数据的语义含义，使得语义相似的内容在向量空间中距离更近。例如：
```
"猫"   -> [0.1, 0.8, 0.3, ...]
"狗"   -> [0.2, 0.7, 0.4, ...]  // 与"猫"相似
"汽车" -> [0.9, 0.1, 0.2, ...]  // 与"猫"不相似
```
### 1.2 应用场景

| 场景 | 描述 |
| --- | --- |
| 语义搜索 | 基于含义而非关键词搜索 |
| 推荐系统 | 基于相似性推荐内容 |
| 聚类分析 | 自动分组相似内容 |
| 异常检测 | 识别不相似的内容 |
| RAG | 检索增强生成 |
// 1.3 Laravel AI SDK support: wrap a string with Str::of() and call
// toEmbeddings() on the resulting fluent string object.
use Illuminate\Support\Str;

$sentence = Str::of('Napa Valley has great wine.');
$embeddings = $sentence->toEmbeddings();
// 2. Generating embeddings.
use Laravel\Ai\Embeddings;

// 2.1 Basic usage.
// generate() is called before reading ->vector, matching every other
// snippet in this article (the pending builder is not read directly).
$embeddings = Embeddings::from('Hello, world!')->generate();

print_r($embeddings->vector);

// 2.2 Model selection: name the embedding model explicitly.
$embeddings = Embeddings::from('Hello, world!')
    ->model('text-embedding-3-small')
    ->generate();

// 2.3 Batch generation: one call for several input texts.
$texts = [
    'The quick brown fox',
    'jumps over the lazy dog',
    'Hello, world!',
];

$embeddings = Embeddings::batch($texts)->generate();

foreach ($embeddings as $index => $embedding) {
    // Only the first five components are printed to keep the output short.
    $preview = array_slice($embedding->vector, 0, 5);

    echo "Text: {$texts[$index]} \n";
    echo "Vector: " . implode(', ', $preview) . "...\n";
}

// 2.4 Dimension control: request a 256-dimensional vector.
$embeddings = Embeddings::from('Hello, world!')
    ->dimensions(256)
    ->generate();
<?php
// 3.1 Migration: creates the `documents` table with a 1536-dimension
// vector column and an IVFFlat cosine index on it.

use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
use Illuminate\Support\Facades\DB;

return new class extends Migration
{
    /**
     * Enable the pgvector extension, create the table, then index the
     * embedding column.
     */
    public function up(): void
    {
        DB::statement('CREATE EXTENSION IF NOT EXISTS vector');

        Schema::create('documents', function (Blueprint $table) {
            $table->id();
            $table->string('title');
            $table->text('content');
            $table->vector('embedding', 1536);
            $table->timestamps();
        });

        DB::statement('CREATE INDEX documents_embedding_idx ON documents USING ivfflat (embedding vector_cosine_ops)');
    }

    /**
     * Drop the table (its index is dropped with it).
     *
     * The `vector` extension is deliberately left installed: it is
     * database-wide, so dropping it here would fail, or break other
     * tables, whenever anything else in the database uses vector columns.
     */
    public function down(): void
    {
        Schema::dropIfExists('documents');
    }
};
<?php
// 3.2 Model definition.

namespace App\Models;

use Illuminate\Database\Eloquent\Model;
use Laravel\Ai\HasEmbeddings;

/**
 * Eloquent model for a stored document and its embedding vector.
 */
class Document extends Model
{
    use HasEmbeddings;

    /** Attributes that may be mass-assigned. */
    protected $fillable = ['title', 'content', 'embedding'];

    /** The embedding column is cast with the 'vector' cast type. */
    protected $casts = ['embedding' => 'vector'];

    /**
     * Text representation of the document: title, a newline, then content.
     * NOTE(review): presumably consumed by HasEmbeddings when generating
     * the embedding — confirm against the trait.
     */
    public function toEmbeddableString(): string
    {
        return implode("\n", [$this->title, $this->content]);
    }
}
<?php
// 3.3 Automatic embedding generation through model events.

namespace App\Models;

use Illuminate\Database\Eloquent\Model;
use Laravel\Ai\HasEmbeddings;

class Document extends Model
{
    use HasEmbeddings;

    /**
     * Register model event hooks that keep the embedding up to date.
     */
    protected static function booted(): void
    {
        // On create: only generate when no embedding was supplied.
        static::creating(function (Document $doc) {
            if (!empty($doc->embedding)) {
                return;
            }

            $doc->generateEmbedding();
        });

        // On update: regenerate only when the title or content changed.
        static::updating(function (Document $doc) {
            if (!$doc->isDirty(['title', 'content'])) {
                return;
            }

            $doc->generateEmbedding();
        });
    }
}
// 4. Semantic search.
use App\Models\Document;
use Laravel\Ai\Embeddings;

// 4.1 Basic query: embed the search text, then fetch the 10 most similar rows.
$query = 'best restaurants in Napa Valley';
$queryEmbedding = Embeddings::from($query)->generate();

$basicSearch = Document::query();
$basicSearch = $basicSearch->whereVectorSimilarTo('embedding', $queryEmbedding->vector);
$documents = $basicSearch->limit(10)->get();

// 4.2 Similarity threshold: same query with a 0.8 threshold applied.
$thresholdSearch = Document::query()
    ->whereVectorSimilarTo('embedding', $queryEmbedding->vector);
$documents = $thresholdSearch
    ->withSimilarityThreshold(0.8)
    ->limit(10)
    ->get();

// 4.3 Similarity scores: each result is read back with a `similarity` attribute.
$scoredSearch = Document::query()
    ->whereVectorSimilarTo('embedding', $queryEmbedding->vector);
$documents = $scoredSearch
    ->withSimilarityScore()
    ->limit(10)
    ->get();

foreach ($documents as $document) {
    echo "Title: {$document->title} \n";
    echo "Similarity: {$document->similarity} \n";
}

// 4.4 Hybrid search: an ordinary column filter combined with vector similarity.
$hybridSearch = Document::query()->where('category', 'restaurants');
$documents = $hybridSearch
    ->whereVectorSimilarTo('embedding', $queryEmbedding->vector)
    ->orderBy('similarity', 'desc')
    ->limit(10)
    ->get();
<?php
// 5. RAG implementation — 5.1 Document processing.

namespace App\Services;

use App\Models\Document;
use Illuminate\Support\Str;
use Laravel\Ai\Embeddings;

/**
 * Splits raw text into chunks and stores each chunk with its embedding.
 */
class DocumentProcessor
{
    /**
     * Chunk the content, embed every chunk and persist one Document per chunk.
     *
     * The return type is `array`: the method builds and returns a list of
     * Document models, so the previous `Document` return type raised a
     * TypeError on every call.
     *
     * @param  string  $content   Raw text to index.
     * @param  array   $metadata  Optional metadata; `title` is used as the
     *                            chunk title when present.
     * @return Document[]         Created documents, in chunk order.
     */
    public function process(string $content, array $metadata = []): array
    {
        // splitIntoChunks(1000, 200): NOTE(review) presumably chunk size and
        // overlap — confirm against the SDK.
        $chunks = Str::of($content)
            ->splitIntoChunks(1000, 200)
            ->toArray();

        $documents = [];

        foreach ($chunks as $index => $chunk) {
            $embedding = Embeddings::from($chunk)->generate();

            $documents[] = Document::create([
                'title' => $metadata['title'] ?? "Chunk {$index} ",
                'content' => $chunk,
                'embedding' => $embedding->vector,
                // Record the chunk position alongside the caller's metadata.
                'metadata' => array_merge($metadata, ['chunk_index' => $index]),
            ]);
        }

        return $documents;
    }
}
<?php
// 5.2 RAG service.

namespace App\Services;

use App\Models\Document;
use Laravel\Ai\Embeddings;
use Laravel\Ai\Prompt;

/**
 * Answers a question using the most similar stored documents as context.
 */
class RAGService
{
    /**
     * Embed the question, retrieve the top-K similar documents and ask the
     * model to answer with their content as context.
     *
     * @param  string  $question  The user's question.
     * @param  int     $topK      Number of documents to retrieve.
     */
    public function query(string $question, int $topK = 5): string
    {
        $questionVector = Embeddings::from($question)->generate()->vector;

        // Contents of the retrieved documents, separated by blank lines.
        $context = Document::query()
            ->whereVectorSimilarTo('embedding', $questionVector)
            ->limit($topK)
            ->get()
            ->pluck('content')
            ->join("\n\n");

        $systemPrompt = $this->buildSystemPrompt($context);

        return Prompt::make($question)
            ->withSystemMessage($systemPrompt)
            ->generate();
    }

    /**
     * Build the system message that embeds the retrieved context.
     */
    private function buildSystemPrompt(string $context): string
    {
        return <<<PROMPT
You are a helpful assistant. Use the following context to answer questions.
If the answer is not in the context, say you don't know.

Context:
{$context}
PROMPT;
    }
}
<?php
// 5.3 Controller.

namespace App\Http\Controllers;

use App\Services\RAGService;
use Illuminate\Http\Request;

/**
 * HTTP entry point that forwards a validated question to the RAG service.
 */
class RAGController extends Controller
{
    public function __construct(
        protected RAGService $rag
    ) {}

    /**
     * Validate the request, run the RAG query and return the answer as JSON.
     *
     * Input: `question` (required string) and `top_k` (optional integer
     * between 1 and 10, defaulting to 5 when absent or null).
     */
    public function query(Request $request)
    {
        $rules = [
            'question' => 'required|string',
            'top_k' => 'nullable|integer|min:1|max:10',
        ];

        $input = $request->validate($rules);

        $question = $input['question'];
        $topK = $input['top_k'] ?? 5;

        $payload = ['answer' => $this->rag->query($question, $topK)];

        return response()->json($payload);
    }
}
<?php
// 6. Case study: intelligent document search — 6.1 Full implementation.

namespace App\Services;

use App\Models\Document;
use Laravel\Ai\Embeddings;
use Laravel\Ai\Prompt;
use Illuminate\Support\Facades\DB;

/**
 * Filtered semantic search that also returns a generated summary.
 */
class IntelligentDocumentSearch
{
    /**
     * Search documents by meaning, with optional filters.
     *
     * @param  string  $query    Search text.
     * @param  array   $filters  Optional keys: `category`, `date_from`,
     *                           `threshold` (defaults to 0.7).
     * @param  int     $limit    Maximum number of results.
     * @return array             Keys: `results`, `summary`, `total`.
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        $queryEmbedding = Embeddings::from($query)->generate();

        $builder = Document::query();

        // Column filters are applied only when the caller supplied them.
        if (!empty($filters['category'])) {
            $builder->where('category', $filters['category']);
        }

        if (!empty($filters['date_from'])) {
            $builder->where('created_at', '>=', $filters['date_from']);
        }

        $documents = $builder
            ->whereVectorSimilarTo('embedding', $queryEmbedding->vector)
            ->withSimilarityScore()
            ->withSimilarityThreshold($filters['threshold'] ?? 0.7)
            ->orderBy('similarity', 'desc')
            ->limit($limit)
            ->get();

        return [
            'results' => $documents,
            'summary' => $this->summarizeResults($query, $documents),
            'total' => $documents->count(),
        ];
    }

    /**
     * Ask the model for a short summary of the matched documents.
     * Returns a fixed message when there are no results.
     */
    private function summarizeResults(string $query, $documents): string
    {
        if ($documents->isEmpty()) {
            return 'No relevant documents found.';
        }

        // One "- title : content" line per document.
        $lines = $documents->map(function ($doc) {
            return "- {$doc->title} : {$doc->content} ";
        });
        $context = $lines->join("\n");

        $prompt = Prompt::make("Summarize these search results for the query: '{$query} '");

        return $prompt
            ->withSystemMessage("Context:\n{$context} ")
            ->withMaxTokens(200)
            ->generate();
    }
}
// 6.2 Advanced features: vector recall followed by LLM re-ranking.

use App\Models\Document;
use Laravel\Ai\Embeddings;
use Laravel\Ai\Prompt;

/**
 * Two-stage search: over-fetch by vector similarity, then re-rank the
 * candidates with a model-assigned relevance score.
 */
class AdvancedDocumentSearch
{
    /**
     * @param  string  $query  Search text.
     * @param  int     $limit  Number of results to return after re-ranking.
     * @return array           Re-ranked documents as arrays, best first.
     */
    public function searchWithReranking(string $query, int $limit = 10): array
    {
        $queryEmbedding = Embeddings::from($query)->generate();

        // Fetch three times the requested amount so re-ranking has
        // candidates to choose from.
        $documents = Document::query()
            ->whereVectorSimilarTo('embedding', $queryEmbedding->vector)
            ->limit($limit * 3)
            ->get();

        $reranked = $this->rerank($query, $documents);

        return $reranked->take($limit)->values()->toArray();
    }

    /**
     * Score every document against the query (one prompt per document)
     * and sort by that score, highest first.
     */
    private function rerank(string $query, $documents)
    {
        return $documents->map(function ($doc) use ($query) {
            // The schema key is 'score' — exactly the key read below. The
            // earlier 'score ' (trailing space) never matched the lookup.
            $score = Prompt::make("Rate the relevance of this document to the query on a scale of 0-10.")
                ->withSystemMessage("Query : {$query}\nDocument : {$doc->content}")
                ->expectJson(['score' => 'float'])
                ->generate();

            // Default to 0 when the reply carries no score, and force a
            // float so the sort compares numbers rather than strings.
            $doc->rerank_score = (float) ($score['score'] ?? 0);

            return $doc;
        })->sortByDesc('rerank_score');
    }
}
-- 7. Vector index optimisation — 7.1 Index types.

-- IVFFlat index on the embedding column, cosine operator class, 100 lists.
CREATE INDEX documents_embedding_idx
    ON documents
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

-- HNSW index on the same column, cosine operator class,
-- with m = 16 and ef_construction = 64.
CREATE INDEX documents_embedding_hnsw_idx
    ON documents
    USING hnsw (embedding vector_cosine_ops)
    WITH (m = 16, ef_construction = 64);
// 7.2 Index configuration: the `pgsql` connection entry.
// NOTE(review): presumably a fragment of the `connections` array in
// config/database.php — confirm where this entry lives.
'pgsql' => [
    'driver'         => 'pgsql',
    'url'            => env('DATABASE_URL'),
    'host'           => env('DB_HOST', '127.0.0.1'),
    'port'           => env('DB_PORT', '5432'),
    'database'       => env('DB_DATABASE', 'forge'),
    'username'       => env('DB_USERNAME', 'forge'),
    'password'       => env('DB_PASSWORD', ''),
    'charset'        => 'utf8',
    'prefix'         => '',
    'prefix_indexes' => true,
    'search_path'    => 'public',
    'sslmode'        => 'prefer',

    // Lists the `vector` extension for this connection.
    'extensions'     => ['vector'],
],
// 8. Performance — 8.1 Batch processing.
use Laravel\Ai\Jobs\GenerateEmbeddings;

/**
 * Dispatches embedding generation in groups of at most 100 documents.
 */
class DocumentBatchProcessor
{
    /**
     * @param  array  $documents  Documents to embed.
     */
    public function processBatch(array $documents): void
    {
        // One GenerateEmbeddings dispatch per group of up to 100 documents.
        foreach (array_chunk($documents, 100) as $batch) {
            GenerateEmbeddings::dispatch($batch);
        }
    }
}
// 8.2 Caching strategy.
use Illuminate\Support\Facades\Cache;
use Laravel\Ai\Embeddings; // Previously missing: the class is used below.

/**
 * Returns embeddings through the cache so identical text is embedded once.
 */
class CachedEmbeddingService
{
    /**
     * Get the embedding vector for a text, generating it on a cache miss.
     *
     * @param  string  $text  Text to embed.
     * @return array          The embedding vector.
     */
    public function getEmbedding(string $text): array
    {
        // The key is derived from the exact text, so any change to the
        // text produces a different cache entry.
        $key = 'embedding:' . md5($text);

        // Entries are kept for 30 days.
        return Cache::remember($key, now()->addDays(30), function () use ($text) {
            return Embeddings::from($text)->generate()->vector;
        });
    }
}
// 8.3 Precomputation: regenerate the embedding in the background after a save.
class Document extends Model
{
    /**
     * Register a `saved` hook that queues embedding generation.
     */
    public static function boot(): void
    {
        parent::boot();

        static::saved(function ($document) {
            // Only queue work for new rows or when the embedded text
            // changed. Without this guard every save re-dispatches the
            // job, including saves that touch only other columns.
            // NOTE(review): if generateEmbedding() itself saves the model,
            // the unguarded hook would re-trigger indefinitely — confirm.
            $textChanged = $document->wasRecentlyCreated
                || $document->wasChanged(['title', 'content']);

            if (!$textChanged) {
                return;
            }

            dispatch(function () use ($document) {
                $document->generateEmbedding();
            });
        });
    }
}
// 9. Best practices — 9.1 Chunking strategies.

/**
 * Splits text into chunks suitable for embedding.
 *
 * All sizes are measured in UTF-8 characters, not bytes, so multibyte
 * text (for example Chinese) is never cut in the middle of a character.
 */
class ChunkingStrategy
{
    /** Trailing chunks shorter than this may be dropped when redundant. */
    private const MIN_CHUNK_LENGTH = 100;

    /**
     * Fixed-size sliding-window chunking.
     *
     * A trailing chunk shorter than MIN_CHUNK_LENGTH is dropped only when
     * it is already fully contained in the previous chunk's overlap, so no
     * content is ever lost. Content shorter than MIN_CHUNK_LENGTH is
     * returned as a single chunk.
     *
     * @param  string  $content  Text to split.
     * @param  int     $size     Maximum characters per chunk (>= 1).
     * @param  int     $overlap  Characters shared by consecutive chunks
     *                           (>= 0 and < $size).
     * @return string[]
     *
     * @throws InvalidArgumentException When the window cannot advance.
     */
    public function chunk(string $content, int $size = 1000, int $overlap = 200): array
    {
        // A step of zero or less would never reach the end of the content.
        if ($size < 1 || $overlap < 0 || $overlap >= $size) {
            throw new InvalidArgumentException(
                'Chunk size must be positive and overlap must satisfy 0 <= overlap < size.'
            );
        }

        $chunks = [];
        $length = mb_strlen($content, 'UTF-8');
        $step = $size - $overlap;

        for ($offset = 0; $offset < $length; $offset += $step) {
            $piece = mb_substr($content, $offset, $size, 'UTF-8');
            $pieceLength = mb_strlen($piece, 'UTF-8');

            // The previous chunk already covers $overlap characters from
            // this offset, so a short piece no longer than that is redundant.
            $isRedundantTail = $offset > 0
                && $pieceLength < self::MIN_CHUNK_LENGTH
                && $pieceLength <= $overlap;

            if ($isRedundantTail) {
                continue;
            }

            $chunks[] = $piece;
        }

        return $chunks;
    }

    /**
     * Sentence-aware chunking: pack whole sentences into chunks of at most
     * $maxSize characters, joining sentences with a single space.
     *
     * Sentences are split on whitespace following '.', '!' or '?'. A single
     * sentence longer than $maxSize is kept intact as its own chunk.
     *
     * @param  string  $content  Text to split.
     * @param  int     $maxSize  Target maximum characters per chunk.
     * @return string[]
     */
    public function chunkBySentence(string $content, int $maxSize = 1000): array
    {
        $sentences = preg_split('/(?<=[.!?])\s+/', $content);

        // preg_split() returns false on a PCRE error; fall back to
        // treating the whole content as one sentence.
        if ($sentences === false) {
            $sentences = [$content];
        }

        $chunks = [];
        $current = '';

        foreach ($sentences as $sentence) {
            $sentence = trim($sentence);

            if ($sentence === '') {
                continue;
            }

            if ($current === '') {
                $current = $sentence;
                continue;
            }

            // Measure the chunk as it would actually be stored, including
            // the joining space.
            $candidate = $current . ' ' . $sentence;

            if (mb_strlen($candidate, 'UTF-8') > $maxSize) {
                $chunks[] = $current;
                $current = $sentence;
            } else {
                $current = $candidate;
            }
        }

        // Compare against '' rather than using empty(), which would
        // discard a chunk consisting of the string "0".
        if ($current !== '') {
            $chunks[] = $current;
        }

        return $chunks;
    }
}
// 9.2 Metadata management.

/**
 * Derives descriptive metadata from document text.
 */
class DocumentMetadata
{
    /**
     * Build the metadata array for a piece of content.
     *
     * `reading_time` is the word count divided by 200, rounded up.
     * detectLanguage(), extractEntities() and extractKeywords() are called
     * on this object but are not defined in this snippet.
     *
     * @param  string  $content  Text to analyse.
     * @return array             Keys: word_count, reading_time, language,
     *                           entities, keywords.
     */
    public function extract(string $content): array
    {
        $wordCount = str_word_count($content);
        $readingTime = ceil($wordCount / 200);

        return [
            'word_count' => $wordCount,
            'reading_time' => $readingTime,
            'language' => $this->detectLanguage($content),
            'entities' => $this->extractEntities($content),
            'keywords' => $this->extractKeywords($content),
        ];
    }
}
10. 总结 Laravel AI SDK 的向量嵌入功能为构建语义搜索系统提供了强大支持:
- **简洁 API**：一行代码生成嵌入
- **数据库集成**：原生 pgvector 支持
- **语义搜索**：基于含义而非关键词
- **RAG 支持**：开箱即用的检索增强生成
- **性能优化**：批量处理、缓存、索引

通过本指南，您已经掌握了向量嵌入和语义搜索的核心技术，可以开始构建智能搜索系统了。
参考资料