简介:本文详细探讨了MySQL数据库中搜索关键词表的设计原则、优化策略及高效查询指令,涵盖表结构设计、索引优化、分词技术实现以及实战SQL示例,为开发者提供完整的搜索功能实现方案。
在MySQL中设计搜索关键词表时,核心字段应包括:
-- Keyword dictionary: one row per distinct search term.
CREATE TABLE search_keywords (
    id BIGINT UNSIGNED PRIMARY KEY AUTO_INCREMENT,
    keyword VARCHAR(255) NOT NULL COMMENT '原始关键词',
    normalized_keyword VARCHAR(255) NOT NULL COMMENT '标准化后的关键词',
    search_count INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '搜索频次',
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- The original column had no DEFAULT, which is rejected when
    -- explicit_defaults_for_timestamp is ON (the MySQL 8.0 default).
    updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_keyword (keyword),
    -- UNIQUE, not a plain INDEX: the batch-import statement later in this
    -- file uses ON DUPLICATE KEY UPDATE, which only fires against a
    -- unique key on normalized_keyword.
    UNIQUE INDEX idx_normalized (normalized_keyword),
    INDEX idx_search_count (search_count)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- NOTE: a prefix index such as idx_keyword_prefix(keyword(20)) would be
-- fully redundant here — idx_keyword on the complete column already serves
-- every prefix search — so only the composite index is added.
-- (normalized_keyword, search_count) lets "match term, rank by frequency"
-- queries be answered entirely from the index.
CREATE INDEX idx_freq_search ON search_keywords (normalized_keyword, search_count);
对于中文搜索场景,需要集成分词组件:
# Python example: segment a keyword with jieba before storing it.
import jieba


def process_keyword(raw):
    """Return the search-mode jieba segmentation of ``raw``.

    Tokens are joined with single spaces and surrounding whitespace
    is stripped from the final string.
    """
    segments = jieba.cut_for_search(raw)
    return " ".join(segments).strip()
对应的分词表设计:
-- One row per (keyword, token) pair produced by the segmenter.
CREATE TABLE keyword_segments (
    keyword_id BIGINT UNSIGNED NOT NULL,
    segment VARCHAR(50) NOT NULL,
    PRIMARY KEY (keyword_id, segment),
    INDEX idx_segment (segment),
    -- Prevent segment rows from outliving their keyword.
    CONSTRAINT keyword_segments_keyword_id_fk
        FOREIGN KEY (keyword_id) REFERENCES search_keywords (id)
        ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Exact lookup: normalize the input the same way it was stored.
-- Explicit column list instead of SELECT * so later schema changes
-- don't silently widen the result set.
SELECT id, keyword, normalized_keyword, search_count
FROM search_keywords
WHERE normalized_keyword = LOWER(TRIM(' 搜索词 '));
-- Fuzzy match. NOTE: the leading '%' defeats every B-tree index and forces
-- a full scan; for large tables prefer a FULLTEXT index (WITH PARSER ngram
-- for Chinese) or the keyword_segments table instead.
SELECT id, keyword, normalized_keyword, search_count
FROM search_keywords
WHERE normalized_keyword LIKE CONCAT('%', REPLACE('输入词', ' ', '%'), '%')
ORDER BY search_count DESC
LIMIT 10;
-- Rank keywords by how many of the query's segments they contain, then by
-- popularity. Selecting k.* alongside GROUP BY k.id is legal under
-- ONLY_FULL_GROUP_BY because k.id is the primary key (functional dependency).
SELECT k.*, COUNT(*) AS matched_segments
FROM search_keywords AS k
INNER JOIN keyword_segments AS s ON k.id = s.keyword_id
WHERE s.segment IN ('分词1', '分词2')
GROUP BY k.id
ORDER BY matched_segments DESC, k.search_count DESC;
-- Daily search volume and distinct-term count from the raw log table.
SELECT
    DATE(created_at) AS day,
    COUNT(*) AS total_searches,
    COUNT(DISTINCT normalized_keyword) AS unique_terms
FROM search_logs
GROUP BY DATE(created_at)
ORDER BY day DESC;
-- Use the MySQL query cache (read-heavy, write-light workloads only).
-- NOTE(review): the query cache was deprecated in MySQL 5.7.20 and removed
-- entirely in MySQL 8.0 — this statement errors there. Prefer the
-- application-layer cache table shown below.
SET GLOBAL query_cache_size = 64*1024*1024;
-- Or use an application-layer cache, keyed by a 32-char hash of the query.
CREATE TABLE keyword_cache (
    hash CHAR(32) PRIMARY KEY,
    result JSON NOT NULL,
    expires_at DATETIME NOT NULL,
    -- Lets the purge job (DELETE ... WHERE expires_at < NOW()) use an
    -- index range scan instead of a full table scan.
    INDEX idx_expires_at (expires_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Shard by keyword initial.
-- NOTE: the original used `INHERITS (...)`, which is PostgreSQL syntax —
-- MySQL has no table inheritance and rejects it. Clone the schema with
-- CREATE TABLE ... LIKE and add a per-shard CHECK constraint instead
-- (CHECK constraints are enforced since MySQL 8.0.16).
CREATE TABLE search_keywords_a_f LIKE search_keywords;
ALTER TABLE search_keywords_a_f
    ADD CONSTRAINT search_keywords_a_f_keyword_check
    CHECK (keyword REGEXP '^[a-fA-F]');

CREATE TABLE search_keywords_g_m LIKE search_keywords;
ALTER TABLE search_keywords_g_m
    ADD CONSTRAINT search_keywords_g_m_keyword_check
    CHECK (keyword REGEXP '^[g-mG-M]');
-- Query cache hit rate.
-- NOTE(review): Qcache_* status counters exist only up to MySQL 5.7; the
-- query cache (and these variables) were removed in MySQL 8.0.
SHOW STATUS LIKE 'Qcache%';
-- Index usage. EXPLAIN ANALYZE requires MySQL 8.0.18+; also note the
-- leading '%' wildcard forces a full scan regardless of indexes.
EXPLAIN ANALYZE SELECT * FROM search_keywords WHERE keyword LIKE '%科技%';
-- Batch merge from the staging table.
-- Requires a UNIQUE key on normalized_keyword; without one
-- ON DUPLICATE KEY UPDATE never fires and every row is inserted.
INSERT INTO search_keywords (keyword, normalized_keyword, search_count)
SELECT
    -- The original selected bare `keyword`, which is neither grouped nor
    -- aggregated — rejected under ONLY_FULL_GROUP_BY. Pick a deterministic
    -- representative per group instead.
    MIN(keyword),
    normalized_keyword,
    SUM(search_count)
FROM temp_import
GROUP BY normalized_keyword
-- VALUES() is deprecated since MySQL 8.0.20; on newer servers prefer
-- `... SELECT ... AS new ... UPDATE search_count = search_count + new.search_count`.
ON DUPLICATE KEY UPDATE search_count = search_count + VALUES(search_count);
-- Purge keywords that appear in the stop-word list.
DELETE FROM search_keywords
WHERE EXISTS (
    SELECT 1
    FROM stop_words
    WHERE stop_words.word = search_keywords.normalized_keyword
);
设计关联词关系表:
-- Related-term graph: one row per ordered keyword pair.
CREATE TABLE keyword_relations (
    keyword1 VARCHAR(255) NOT NULL,
    keyword2 VARCHAR(255) NOT NULL,
    relation_score FLOAT NOT NULL DEFAULT 0,
    PRIMARY KEY (keyword1, keyword2),
    -- The PK only serves lookups by keyword1; this index covers the
    -- reverse direction (find relations BY keyword2).
    INDEX idx_keyword2 (keyword2)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
使用NGINX+Lua实现毫秒级响应:
location /suggest {
    content_by_lua_block {
        -- ngx.var.arg_q is nil when the `q` parameter is absent; calling
        -- :lower() on nil raises and turns the request into a 500.
        local q = ngx.var.arg_q
        if not q or q == "" then
            ngx.say("[]")
            return
        end
        -- Escape LIKE metacharacters so user-supplied '%'/'_' cannot
        -- widen the prefix match into a full wildcard scan.
        local prefix = q:lower():gsub("[%%_]", "\\%0")
        local res = db.query("SELECT keyword FROM search_keywords " ..
            "WHERE normalized_keyword LIKE ? ORDER BY " ..
            "search_count DESC LIMIT 5", { prefix .. "%" })
        ngx.say(json.encode(res))
    }
}
通过以上设计方案,开发者可以构建出支持千万级关键词的高性能搜索系统。实际实施时需根据具体业务需求调整字段设计和索引策略,建议通过A/B测试验证不同方案的性能差异。