From e3f680ad0f7e5c94cb11048ede0405f69c35915f Mon Sep 17 00:00:00 2001 From: Tim Date: Sun, 28 Sep 2025 17:58:10 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E7=B4=A2=E5=BC=95/=E6=9F=A5=E8=AF=A2?= =?UTF-8?q?=E8=A7=84=E5=88=99=E5=BE=AE=E8=B0=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../search/SearchIndexInitializer.java | 105 ++++++++--- .../com/openisle/service/SearchService.java | 178 ++++++++++-------- 2 files changed, 184 insertions(+), 99 deletions(-) diff --git a/backend/src/main/java/com/openisle/search/SearchIndexInitializer.java b/backend/src/main/java/com/openisle/search/SearchIndexInitializer.java index b61dad4df..1deb90e2f 100644 --- a/backend/src/main/java/com/openisle/search/SearchIndexInitializer.java +++ b/backend/src/main/java/com/openisle/search/SearchIndexInitializer.java @@ -52,15 +52,17 @@ public class SearchIndexInitializer { } } + // SearchIndexInitializer.java -- only the methods that need replacing/adding are pasted here (NOTE(review): looks like a leftover paste marker) + private TypeMapping postMapping() { return TypeMapping.of(builder -> builder .properties("type", Property.of(p -> p.keyword(k -> k))) - .properties("title", textWithPinyin()) - .properties("content", textWithPinyin()) - .properties("author", keywordWithPinyin()) - .properties("category", keywordWithPinyin()) - .properties("tags", keywordWithPinyin()) + .properties("title", textWithRawAndPinyin()) + .properties("content", textWithPinyinOnly()) // content gets no .raw sub-field, to avoid over-long keyword values + .properties("author", keywordWithRawAndPinyin()) + .properties("category", keywordWithRawAndPinyin()) + .properties("tags", keywordWithRawAndPinyin()) .properties("postId", Property.of(p -> p.long_(l -> l))) .properties( "createdAt", @@ -73,11 +75,11 @@ public class SearchIndexInitializer { return TypeMapping.of(builder -> builder .properties("type", Property.of(p -> p.keyword(k -> k))) - .properties("title", textWithPinyin()) - .properties("content", textWithPinyin()) - .properties("author", keywordWithPinyin()) - .properties("category", 
keywordWithPinyin()) - .properties("tags", keywordWithPinyin()) + .properties("title", textWithRawAndPinyin()) + .properties("content", textWithPinyinOnly()) + .properties("author", keywordWithRawAndPinyin()) + .properties("category", keywordWithRawAndPinyin()) + .properties("tags", keywordWithRawAndPinyin()) .properties("postId", Property.of(p -> p.long_(l -> l))) .properties( "createdAt", @@ -90,8 +92,8 @@ public class SearchIndexInitializer { return TypeMapping.of(builder -> builder .properties("type", Property.of(p -> p.keyword(k -> k))) - .properties("title", textWithPinyin()) - .properties("content", textWithPinyin()) + .properties("title", textWithRawAndPinyin()) + .properties("content", textWithPinyinOnly()) .properties( "createdAt", Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis"))) @@ -103,8 +105,8 @@ public class SearchIndexInitializer { return TypeMapping.of(builder -> builder .properties("type", Property.of(p -> p.keyword(k -> k))) - .properties("title", textWithPinyin()) - .properties("content", textWithPinyin()) + .properties("title", textWithRawAndPinyin()) + .properties("content", textWithPinyinOnly()) ); } @@ -112,8 +114,8 @@ public class SearchIndexInitializer { return TypeMapping.of(builder -> builder .properties("type", Property.of(p -> p.keyword(k -> k))) - .properties("title", textWithPinyin()) - .properties("content", textWithPinyin()) + .properties("title", textWithRawAndPinyin()) + .properties("content", textWithPinyinOnly()) .properties( "createdAt", Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis"))) @@ -121,45 +123,100 @@ public class SearchIndexInitializer { ); } - private Property textWithPinyin() { + // SearchIndexInitializer.java -- only the methods that need replacing/adding are pasted here (NOTE(review): looks like a leftover paste marker) + + /** Text field: .raw (exact keyword match) + .py (exact pinyin phrase match) + .zh (ICU + 2~3 gram recall) */ + private Property textWithRawAndPinyin() { return Property.of(p -> p.text(t -> - t.fields("py", field -> - field.text(sub -> 
sub.analyzer("py_index").searchAnalyzer("py_search")) - ) + t + .fields("raw", f -> f.keyword(k -> k.normalizer("lowercase_normalizer"))) + .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search"))) + .fields("zh", f -> + f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search")) + ) ) ); } - private Property keywordWithPinyin() { + /** Long-text content: keeps pinyin + adds the zh sub-field (no .raw, to avoid over-long keyword values) */ + private Property textWithPinyinOnly() { + return Property.of(p -> + p.text(t -> + t + .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search"))) + .fields("zh", f -> + f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search")) + ) + ) + ); + } + + /** Keyword fields (author/category/tags): keyword equality + .py + .zh (kept aligned with the title strategy where possible) */ + private Property keywordWithRawAndPinyin() { return Property.of(p -> p.keyword(k -> - k.fields("py", field -> - field.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search")) - ) + k + .normalizer("lowercase_normalizer") + .fields("raw", f -> f.keyword(kk -> kk.normalizer("lowercase_normalizer"))) + .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search"))) + .fields("zh", f -> + f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search")) + ) ) ); } + /** Adds the zh analyzers (ICU + 2~3 gram) and keeps the existing pinyin/normalizer settings */ private IndexSettings.Builder applyPinyinAnalysis(IndexSettings.Builder builder) { Map settings = new LinkedHashMap<>(); + + // --- keyword normalizer (used by .raw) + settings.put("analysis.normalizer.lowercase_normalizer.type", JsonData.of("custom")); + settings.put( + "analysis.normalizer.lowercase_normalizer.filter", + JsonData.of(List.of("lowercase")) + ); + + // --- existing: pinyin filter + analyzers settings.put("analysis.filter.py_filter.type", JsonData.of("pinyin")); settings.put("analysis.filter.py_filter.keep_full_pinyin", JsonData.of(true)); settings.put("analysis.filter.py_filter.keep_joined_full_pinyin", 
JsonData.of(true)); settings.put("analysis.filter.py_filter.keep_first_letter", JsonData.of(false)); settings.put("analysis.filter.py_filter.remove_duplicated_term", JsonData.of(true)); + settings.put("analysis.analyzer.py_index.type", JsonData.of("custom")); settings.put("analysis.analyzer.py_index.tokenizer", JsonData.of("standard")); settings.put( "analysis.analyzer.py_index.filter", JsonData.of(List.of("lowercase", "py_filter")) ); + settings.put("analysis.analyzer.py_search.type", JsonData.of("custom")); settings.put("analysis.analyzer.py_search.tokenizer", JsonData.of("standard")); settings.put( "analysis.analyzer.py_search.filter", JsonData.of(List.of("lowercase", "py_filter")) ); + + settings.put("analysis.filter.zh_ngram_2_3.type", JsonData.of("ngram")); + settings.put("analysis.filter.zh_ngram_2_3.min_gram", JsonData.of(2)); + settings.put("analysis.filter.zh_ngram_2_3.max_gram", JsonData.of(3)); + + settings.put("analysis.analyzer.zh_ngram_index.type", JsonData.of("custom")); + settings.put("analysis.analyzer.zh_ngram_index.tokenizer", JsonData.of("icu_tokenizer")); + settings.put( + "analysis.analyzer.zh_ngram_index.filter", + JsonData.of(List.of("lowercase", "zh_ngram_2_3")) + ); + + settings.put("analysis.analyzer.zh_search.type", JsonData.of("custom")); + settings.put("analysis.analyzer.zh_search.tokenizer", JsonData.of("icu_tokenizer")); + settings.put( + "analysis.analyzer.zh_search.filter", + JsonData.of(List.of("lowercase", "zh_ngram_2_3")) + ); + settings.forEach(builder::customSettings); return builder; } diff --git a/backend/src/main/java/com/openisle/service/SearchService.java b/backend/src/main/java/com/openisle/service/SearchService.java index 61e845a32..ec9ad2770 100644 --- a/backend/src/main/java/com/openisle/service/SearchService.java +++ b/backend/src/main/java/com/openisle/service/SearchService.java @@ -26,6 +26,7 @@ import java.util.stream.Stream; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import 
org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.opensearch._types.FieldValue; import org.opensearch.client.opensearch._types.query_dsl.TextQueryType; import org.opensearch.client.opensearch.core.SearchResponse; import org.opensearch.client.opensearch.core.search.Hit; @@ -229,6 +230,15 @@ public class SearchService { return openSearchProperties.isEnabled() && openSearchClient.isPresent(); } + // Matches any Han-script character (leftover paste note: add to the class; a field or a static constant both work) + private static final java.util.regex.Pattern HANS_PATTERN = java.util.regex.Pattern.compile( + "\\p{IsHan}" + ); + + private static boolean containsHan(String s) { + return s != null && HANS_PATTERN.matcher(s).find(); + } + private List searchWithOpenSearch(String keyword) throws IOException { var client = openSearchClient.orElse(null); if (client == null) return List.of(); @@ -236,8 +246,7 @@ public class SearchService { final String qRaw = keyword == null ? "" : keyword.trim(); if (qRaw.isEmpty()) return List.of(); - final boolean enableWildcard = qRaw.length() >= 2; - final String qsEscaped = escapeForQueryString(qRaw); + final boolean hasHan = containsHan(qRaw); SearchResponse resp = client.search( b -> @@ -246,105 +255,110 @@ public class SearchService { .trackTotalHits(t -> t.enabled(true)) .query(qb -> qb.bool(bool -> { - // 1) 主召回:title/content + // ---------- strict tier ---------- + // Chinese / arbitrary phrase (tolerates minor punctuation/whitespace variation) bool.should(s -> - s.multiMatch(mm -> - mm - .query(qRaw) - .fields("title^3", "title.py^3", "content^2", "content.py^2") - .type(TextQueryType.BestFields) - .fuzziness("AUTO") - .minimumShouldMatch("70%") - .lenient(true) - ) + s.matchPhrase(mp -> mp.field("title").query(qRaw).slop(2).boost(6.0f)) + ); + bool.should(s -> + s.matchPhrase(mp -> mp.field("content").query(qRaw).slop(2).boost(2.5f)) ); - // 2) 兜底:open* 前缀命中 - if (enableWildcard) { - bool.should(s -> - s.queryString(qs -> - qs - .query( - "(title:" + - qsEscaped + - "* OR title.py:" + - qsEscaped + - "* OR content:" + - qsEscaped + - "* OR content.py:" + - 
qsEscaped + - "*)" - ) - .analyzeWildcard(true) - ) - ); - } - - // 3) 结构化字段(keyword) - // term 需要 FieldValue(用 lambda 设置 stringValue) + // structured-field equality (.raw) bool.should(s -> s.term(t -> t - .field("author") + .field("author.raw") .value(v -> v.stringValue(qRaw)) - .boost(2.0f) + .boost(4.0f) ) ); bool.should(s -> - s.match(m -> - m - .field("author.py") - .query(v -> v.stringValue(qRaw)) - .boost(2.0f) + s.term(t -> + t + .field("category.raw") + .value(v -> v.stringValue(qRaw)) + .boost(3.0f) ) ); bool.should(s -> - s.match(m -> - m - .field("category.py") - .query(v -> v.stringValue(qRaw)) - .boost(1.2f) - ) - ); - bool.should(s -> - s.match(m -> - m - .field("tags.py") - .query(v -> v.stringValue(qRaw)) - .boost(1.2f) + s.term(t -> + t + .field("tags.raw") + .value(v -> v.stringValue(qRaw)) + .boost(3.0f) ) ); - if (enableWildcard) { - // prefix/wildcard 这里的 value 是 String,直接传即可 - bool.should(s -> s.prefix(p -> p.field("category").value(qRaw).boost(1.2f))); - bool.should(s -> - s.wildcard(w -> w.field("category").value("*" + qRaw + "*").boost(1.0f)) - ); + // pinyin phrase (strict) + bool.should(s -> + s.matchPhrase(mp -> mp.field("title.py").query(qRaw).slop(1).boost(4.0f)) + ); + bool.should(s -> + s.matchPhrase(mp -> mp.field("content.py").query(qRaw).slop(1).boost(1.8f)) + ); + bool.should(s -> + s.matchPhrase(mp -> mp.field("author.py").query(qRaw).slop(1).boost(2.2f)) + ); + bool.should(s -> + s.matchPhrase(mp -> mp.field("category.py").query(qRaw).slop(1).boost(2.0f)) + ); + bool.should(s -> + s.matchPhrase(mp -> mp.field("tags.py").query(qRaw).slop(1).boost(2.0f)) + ); + // ---------- relaxed tier (enabled only when the query contains Chinese characters) ---------- + if (hasHan) { + // title.zh bool.should(s -> - s.term(t -> - t - .field("tags") - .value(v -> v.stringValue(qRaw)) - .boost(1.2f) + s.match(m -> + m + .field("title.zh") + .query(org.opensearch.client.opensearch._types.FieldValue.of(qRaw)) + .operator(org.opensearch.client.opensearch._types.query_dsl.Operator.Or) + .minimumShouldMatch("2<-1 3<-1 4<-1 5<-2 6<-2 
7<-3") + .boost(3.0f) ) ); + // content.zh bool.should(s -> - s.wildcard(w -> w.field("tags").value("*" + qRaw + "*").boost(1.0f)) + s.match(m -> + m + .field("content.zh") + .query(org.opensearch.client.opensearch._types.FieldValue.of(qRaw)) + .operator(org.opensearch.client.opensearch._types.query_dsl.Operator.Or) + .minimumShouldMatch("2<-1 3<-1 4<-1 5<-2 6<-2 7<-3") + .boost(1.6f) + ) ); } return bool.minimumShouldMatch("1"); }) ) - .highlight(h -> - h + // ---------- highlighting: allow back-fill across sub-fields + matched-field groups (NOTE(review): confirm the configured highlighter honours matchedFields) ---------- + .highlight(h -> { + var hb = h .preTags("") .postTags("") - .fields("title", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1)) + .requireFieldMatch(false) + .fields("title", f -> + f + .fragmentSize(highlightFragmentSize()) + .numberOfFragments(1) + .matchedFields(List.of("title", "title.zh", "title.py")) + ) + .fields("content", f -> + f + .fragmentSize(highlightFragmentSize()) + .numberOfFragments(1) + .matchedFields(List.of("content", "content.zh", "content.py")) + ) + .fields("title.zh", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1)) + .fields("content.zh", f -> + f.fragmentSize(highlightFragmentSize()).numberOfFragments(1) + ) .fields("title.py", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1)) - .fields("content", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1)) .fields("content.py", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1) ) @@ -353,8 +367,9 @@ public class SearchService { .fields("category", f -> f.numberOfFragments(0)) .fields("category.py", f -> f.numberOfFragments(0)) .fields("tags", f -> f.numberOfFragments(0)) - .fields("tags.py", f -> f.numberOfFragments(0)) - ) + .fields("tags.py", f -> f.numberOfFragments(0)); + return hb; + }) .size(DEFAULT_OPEN_SEARCH_LIMIT > 0 ? 
DEFAULT_OPEN_SEARCH_LIMIT : 10), SearchDocument.class ); @@ -435,8 +450,20 @@ public class SearchService { return null; } Map> highlight = hit.highlight(); - String highlightedContent = firstHighlight(highlight, "content", "content.py"); - String highlightedTitle = firstHighlight(highlight, "title", "title.py"); + String highlightedContent = firstHighlight( + highlight, + "content", + "content.py", + "content.zh", + "content.raw" + ); + String highlightedTitle = firstHighlight( + highlight, + "title", + "title.py", + "title.zh", + "title.raw" + ); String highlightedAuthor = firstHighlight(highlight, "author", "author.py"); String highlightedCategory = firstHighlight(highlight, "category", "category.py"); boolean highlightTitle = highlightedTitle != null && !highlightedTitle.isBlank(); @@ -451,6 +478,7 @@ public class SearchService { if (snippetHtml == null && highlightTitle) { snippetHtml = highlightedTitle; } + String snippet = snippetHtml != null && !snippetHtml.isBlank() ? cleanHighlight(snippetHtml) : null;