Mirror of https://github.com/nagisa77/OpenIsle.git (synced 2026-02-20 14:00:56 +08:00)
fix: fine-tune index/query rules
SearchIndexInitializer.java
@@ -52,15 +52,17 @@ public class SearchIndexInitializer {
     }
   }
 
+  // SearchIndexInitializer.java: only the replaced or newly added methods
   private TypeMapping postMapping() {
     return TypeMapping.of(builder ->
       builder
         .properties("type", Property.of(p -> p.keyword(k -> k)))
-        .properties("title", textWithPinyin())
-        .properties("content", textWithPinyin())
-        .properties("author", keywordWithPinyin())
-        .properties("category", keywordWithPinyin())
-        .properties("tags", keywordWithPinyin())
+        .properties("title", textWithRawAndPinyin())
+        .properties("content", textWithPinyinOnly()) // no .raw on content, to avoid oversized keyword values
+        .properties("author", keywordWithRawAndPinyin())
+        .properties("category", keywordWithRawAndPinyin())
+        .properties("tags", keywordWithRawAndPinyin())
         .properties("postId", Property.of(p -> p.long_(l -> l)))
         .properties(
           "createdAt",
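With this change every searchable text field carries up to three sub-fields. As a sketch of what `textWithRawAndPinyin()` should serialize to for `title` (assembled from the helper bodies further down, not mapping output captured from a cluster):

    "title": {
      "type": "text",
      "fields": {
        "raw": { "type": "keyword", "normalizer": "lowercase_normalizer" },
        "py":  { "type": "text", "analyzer": "py_index", "search_analyzer": "py_search" },
        "zh":  { "type": "text", "analyzer": "zh_ngram_index", "search_analyzer": "zh_search" }
      }
    }

`.raw` serves exact (lowercased) keyword matches, `.py` serves pinyin phrase matches, and `.zh` provides ICU-tokenized 2~3-gram recall; the same trio recurs across the mapping methods below.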
@@ -73,11 +75,11 @@ public class SearchIndexInitializer {
     return TypeMapping.of(builder ->
       builder
         .properties("type", Property.of(p -> p.keyword(k -> k)))
-        .properties("title", textWithPinyin())
-        .properties("content", textWithPinyin())
-        .properties("author", keywordWithPinyin())
-        .properties("category", keywordWithPinyin())
-        .properties("tags", keywordWithPinyin())
+        .properties("title", textWithRawAndPinyin())
+        .properties("content", textWithPinyinOnly())
+        .properties("author", keywordWithRawAndPinyin())
+        .properties("category", keywordWithRawAndPinyin())
+        .properties("tags", keywordWithRawAndPinyin())
         .properties("postId", Property.of(p -> p.long_(l -> l)))
         .properties(
           "createdAt",
@@ -90,8 +92,8 @@ public class SearchIndexInitializer {
     return TypeMapping.of(builder ->
       builder
         .properties("type", Property.of(p -> p.keyword(k -> k)))
-        .properties("title", textWithPinyin())
-        .properties("content", textWithPinyin())
+        .properties("title", textWithRawAndPinyin())
+        .properties("content", textWithPinyinOnly())
         .properties(
           "createdAt",
           Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis")))
@@ -103,8 +105,8 @@ public class SearchIndexInitializer {
     return TypeMapping.of(builder ->
       builder
         .properties("type", Property.of(p -> p.keyword(k -> k)))
-        .properties("title", textWithPinyin())
-        .properties("content", textWithPinyin())
+        .properties("title", textWithRawAndPinyin())
+        .properties("content", textWithPinyinOnly())
     );
   }
 
@@ -112,8 +114,8 @@ public class SearchIndexInitializer {
     return TypeMapping.of(builder ->
       builder
         .properties("type", Property.of(p -> p.keyword(k -> k)))
-        .properties("title", textWithPinyin())
-        .properties("content", textWithPinyin())
+        .properties("title", textWithRawAndPinyin())
+        .properties("content", textWithPinyinOnly())
         .properties(
           "createdAt",
           Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis")))
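The `strict_date_optional_time||epoch_millis` chain means `createdAt` accepts either an ISO-8601 timestamp or epoch milliseconds. A quick illustration with hypothetical document values:

    // Both documents index the same instant into "createdAt":
    Map<String, Object> byIso    = Map.of("createdAt", "2026-02-20T14:00:56+08:00"); // strict_date_optional_time
    Map<String, Object> byMillis = Map.of("createdAt", 1771567256000L);              // epoch_millis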
@@ -121,45 +123,100 @@ public class SearchIndexInitializer {
     );
   }
 
-  private Property textWithPinyin() {
+  // SearchIndexInitializer.java: only the replaced or newly added methods
+
+  /** Text fields: .raw (exact keyword) + .py (exact pinyin phrase) + .zh (ICU + 2~3-gram recall) */
+  private Property textWithRawAndPinyin() {
     return Property.of(p ->
       p.text(t ->
-        t.fields("py", field ->
-          field.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search"))
-        )
+        t
+          .fields("raw", f -> f.keyword(k -> k.normalizer("lowercase_normalizer")))
+          .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search")))
+          .fields("zh", f ->
+            f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search"))
+          )
       )
     );
   }
 
-  private Property keywordWithPinyin() {
+  /** Long-text content: keep pinyin and add a zh sub-field (no .raw, to avoid oversized keyword values) */
+  private Property textWithPinyinOnly() {
+    return Property.of(p ->
+      p.text(t ->
+        t
+          .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search")))
+          .fields("zh", f ->
+            f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search"))
+          )
+      )
+    );
+  }
+
+  /** Keyword fields (author/category/tags): keyword equality + .py + .zh (kept aligned with the title strategy) */
+  private Property keywordWithRawAndPinyin() {
     return Property.of(p ->
       p.keyword(k ->
-        k.fields("py", field ->
-          field.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search"))
-        )
+        k
+          .normalizer("lowercase_normalizer")
+          .fields("raw", f -> f.keyword(kk -> kk.normalizer("lowercase_normalizer")))
+          .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search")))
+          .fields("zh", f ->
+            f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search"))
+          )
       )
     );
   }
 
+  /** New zh analyzers (ICU + 2~3-gram); the existing pinyin/normalizer settings are kept unchanged */
   private IndexSettings.Builder applyPinyinAnalysis(IndexSettings.Builder builder) {
     Map<String, JsonData> settings = new LinkedHashMap<>();
 
+    // --- existing: keyword normalizer (used by .raw)
     settings.put("analysis.normalizer.lowercase_normalizer.type", JsonData.of("custom"));
     settings.put(
       "analysis.normalizer.lowercase_normalizer.filter",
       JsonData.of(List.of("lowercase"))
     );
 
+    // --- existing: pinyin filter + analyzers
     settings.put("analysis.filter.py_filter.type", JsonData.of("pinyin"));
     settings.put("analysis.filter.py_filter.keep_full_pinyin", JsonData.of(true));
     settings.put("analysis.filter.py_filter.keep_joined_full_pinyin", JsonData.of(true));
     settings.put("analysis.filter.py_filter.keep_first_letter", JsonData.of(false));
     settings.put("analysis.filter.py_filter.remove_duplicated_term", JsonData.of(true));
 
     settings.put("analysis.analyzer.py_index.type", JsonData.of("custom"));
     settings.put("analysis.analyzer.py_index.tokenizer", JsonData.of("standard"));
     settings.put(
       "analysis.analyzer.py_index.filter",
       JsonData.of(List.of("lowercase", "py_filter"))
     );
 
     settings.put("analysis.analyzer.py_search.type", JsonData.of("custom"));
     settings.put("analysis.analyzer.py_search.tokenizer", JsonData.of("standard"));
     settings.put(
       "analysis.analyzer.py_search.filter",
       JsonData.of(List.of("lowercase", "py_filter"))
     );
 
+    // --- new: 2~3-gram filter + zh analyzers on the ICU tokenizer
+    settings.put("analysis.filter.zh_ngram_2_3.type", JsonData.of("ngram"));
+    settings.put("analysis.filter.zh_ngram_2_3.min_gram", JsonData.of(2));
+    settings.put("analysis.filter.zh_ngram_2_3.max_gram", JsonData.of(3));
+
+    settings.put("analysis.analyzer.zh_ngram_index.type", JsonData.of("custom"));
+    settings.put("analysis.analyzer.zh_ngram_index.tokenizer", JsonData.of("icu_tokenizer"));
+    settings.put(
+      "analysis.analyzer.zh_ngram_index.filter",
+      JsonData.of(List.of("lowercase", "zh_ngram_2_3"))
+    );
+
+    settings.put("analysis.analyzer.zh_search.type", JsonData.of("custom"));
+    settings.put("analysis.analyzer.zh_search.tokenizer", JsonData.of("icu_tokenizer"));
+    settings.put(
+      "analysis.analyzer.zh_search.filter",
+      JsonData.of(List.of("lowercase", "zh_ngram_2_3"))
+    );
 
     settings.forEach(builder::customSettings);
     return builder;
   }
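OpenSearch accepts these flat dotted keys as path notation, so the new zh entries should expand to nested analysis settings roughly like this (a sketch of the equivalent JSON, not captured cluster output):

    "analysis": {
      "filter": {
        "zh_ngram_2_3": { "type": "ngram", "min_gram": 2, "max_gram": 3 }
      },
      "analyzer": {
        "zh_ngram_index": { "type": "custom", "tokenizer": "icu_tokenizer", "filter": ["lowercase", "zh_ngram_2_3"] },
        "zh_search":      { "type": "custom", "tokenizer": "icu_tokenizer", "filter": ["lowercase", "zh_ngram_2_3"] }
      }
    }

Note that `icu_tokenizer` comes from the `analysis-icu` plugin, and the `pinyin` filter is typically provided by the `analysis-pinyin` plugin; both need to be installed on the cluster or index creation will fail.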
SearchService.java
@@ -26,6 +26,7 @@ import java.util.stream.Stream;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.opensearch.client.opensearch.OpenSearchClient;
+import org.opensearch.client.opensearch._types.FieldValue;
 import org.opensearch.client.opensearch._types.query_dsl.TextQueryType;
 import org.opensearch.client.opensearch.core.SearchResponse;
 import org.opensearch.client.opensearch.core.search.Hit;
@@ -229,6 +230,15 @@ public class SearchService {
     return openSearchProperties.isEnabled() && openSearchClient.isPresent();
   }
 
+  // Added at class level (an instance field or a static constant both work)
+  private static final java.util.regex.Pattern HANS_PATTERN = java.util.regex.Pattern.compile(
+    "\\p{IsHan}"
+  );
+
+  private static boolean containsHan(String s) {
+    return s != null && HANS_PATTERN.matcher(s).find();
+  }
+
   private List<SearchResult> searchWithOpenSearch(String keyword) throws IOException {
     var client = openSearchClient.orElse(null);
     if (client == null) return List.of();
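`\p{IsHan}` is the Java regex script property for Han ideographs, so the helper flags any query containing at least one Chinese character. A standalone sanity check (illustrative only):

    import java.util.regex.Pattern;

    class HanCheckDemo {
      private static final Pattern HANS_PATTERN = Pattern.compile("\\p{IsHan}");

      static boolean containsHan(String s) {
        return s != null && HANS_PATTERN.matcher(s).find();
      }

      public static void main(String[] args) {
        System.out.println(containsHan("openisle"));      // false: Latin only
        System.out.println(containsHan("OpenIsle 开岛"));  // true: mixed text enables the relaxed zh layer
        System.out.println(containsHan(null));             // false: null-safe
      }
    }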
@@ -236,8 +246,7 @@ public class SearchService {
     final String qRaw = keyword == null ? "" : keyword.trim();
     if (qRaw.isEmpty()) return List.of();
 
-    final boolean enableWildcard = qRaw.length() >= 2;
-    final String qsEscaped = escapeForQueryString(qRaw);
+    final boolean hasHan = containsHan(qRaw);
 
     SearchResponse<SearchDocument> resp = client.search(
       b ->
@@ -246,105 +255,110 @@ public class SearchService {
         .trackTotalHits(t -> t.enabled(true))
         .query(qb ->
           qb.bool(bool -> {
-            // 1) primary recall: title/content
+            // ---------- strict layer ----------
+            // Chinese or any other phrase (tolerates light punctuation/whitespace noise)
             bool.should(s ->
-              s.multiMatch(mm ->
-                mm
-                  .query(qRaw)
-                  .fields("title^3", "title.py^3", "content^2", "content.py^2")
-                  .type(TextQueryType.BestFields)
-                  .fuzziness("AUTO")
-                  .minimumShouldMatch("70%")
-                  .lenient(true)
-              )
+              s.matchPhrase(mp -> mp.field("title").query(qRaw).slop(2).boost(6.0f))
             );
+            bool.should(s ->
+              s.matchPhrase(mp -> mp.field("content").query(qRaw).slop(2).boost(2.5f))
+            );
 
-            // 2) fallback: open* prefix hits
-            if (enableWildcard) {
-              bool.should(s ->
-                s.queryString(qs ->
-                  qs
-                    .query(
-                      "(title:" +
-                      qsEscaped +
-                      "* OR title.py:" +
-                      qsEscaped +
-                      "* OR content:" +
-                      qsEscaped +
-                      "* OR content.py:" +
-                      qsEscaped +
-                      "*)"
-                    )
-                    .analyzeWildcard(true)
-                )
-              );
-            }
 
-            // 3) structured fields (keyword)
-            // term needs a FieldValue (set via the stringValue lambda)
+            // structured equality (.raw)
             bool.should(s ->
              s.term(t ->
                t
-                  .field("author")
+                  .field("author.raw")
                  .value(v -> v.stringValue(qRaw))
-                  .boost(2.0f)
+                  .boost(4.0f)
              )
            );
             bool.should(s ->
-              s.match(m ->
-                m
-                  .field("author.py")
-                  .query(v -> v.stringValue(qRaw))
-                  .boost(2.0f)
+              s.term(t ->
+                t
+                  .field("category.raw")
+                  .value(v -> v.stringValue(qRaw))
+                  .boost(3.0f)
               )
             );
             bool.should(s ->
-              s.match(m ->
-                m
-                  .field("category.py")
-                  .query(v -> v.stringValue(qRaw))
-                  .boost(1.2f)
-              )
-            );
-            bool.should(s ->
-              s.match(m ->
-                m
-                  .field("tags.py")
-                  .query(v -> v.stringValue(qRaw))
-                  .boost(1.2f)
+              s.term(t ->
+                t
+                  .field("tags.raw")
+                  .value(v -> v.stringValue(qRaw))
+                  .boost(3.0f)
               )
             );
 
-            if (enableWildcard) {
-              // prefix/wildcard take a plain String value here, no FieldValue needed
-              bool.should(s -> s.prefix(p -> p.field("category").value(qRaw).boost(1.2f)));
-              bool.should(s ->
-                s.wildcard(w -> w.field("category").value("*" + qRaw + "*").boost(1.0f))
-              );
+            // pinyin phrases (strict)
+            bool.should(s ->
+              s.matchPhrase(mp -> mp.field("title.py").query(qRaw).slop(1).boost(4.0f))
+            );
+            bool.should(s ->
+              s.matchPhrase(mp -> mp.field("content.py").query(qRaw).slop(1).boost(1.8f))
+            );
+            bool.should(s ->
+              s.matchPhrase(mp -> mp.field("author.py").query(qRaw).slop(1).boost(2.2f))
+            );
+            bool.should(s ->
+              s.matchPhrase(mp -> mp.field("category.py").query(qRaw).slop(1).boost(2.0f))
+            );
+            bool.should(s ->
+              s.matchPhrase(mp -> mp.field("tags.py").query(qRaw).slop(1).boost(2.0f))
+            );
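How do these pinyin phrase clauses fire? With `keep_full_pinyin` and `keep_joined_full_pinyin` enabled, the pinyin filter indexes each Han character's full pinyin plus a joined form, so a Latin query such as `kaiyuan` can phrase-match a title like 开源 through `title.py`. A quick way to see the produced tokens (a sketch; assumes the index is named "posts", adjust to the real index name):

    // Inspect what the py_index analyzer emits for a sample title.
    var tokens = client.indices().analyze(a -> a
        .index("posts")
        .analyzer("py_index")
        .text("开源")
    ).tokens();
    tokens.forEach(t -> System.out.println(t.token()));
    // Expected to include per-character pinyin ("kai", "yuan") and the joined "kaiyuan".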
+            // ---------- relaxed layer (enabled only when the query contains Han characters) ----------
+            if (hasHan) {
+              // title.zh
               bool.should(s ->
-                s.term(t ->
-                  t
-                    .field("tags")
-                    .value(v -> v.stringValue(qRaw))
-                    .boost(1.2f)
+                s.match(m ->
+                  m
+                    .field("title.zh")
+                    .query(org.opensearch.client.opensearch._types.FieldValue.of(qRaw))
+                    .operator(org.opensearch.client.opensearch._types.query_dsl.Operator.Or)
+                    .minimumShouldMatch("2<-1 3<-1 4<-1 5<-2 6<-2 7<-3")
+                    .boost(3.0f)
                 )
               );
+              // content.zh
               bool.should(s ->
-                s.wildcard(w -> w.field("tags").value("*" + qRaw + "*").boost(1.0f))
+                s.match(m ->
+                  m
+                    .field("content.zh")
+                    .query(org.opensearch.client.opensearch._types.FieldValue.of(qRaw))
+                    .operator(org.opensearch.client.opensearch._types.query_dsl.Operator.Or)
+                    .minimumShouldMatch("2<-1 3<-1 4<-1 5<-2 6<-2 7<-3")
+                    .boost(1.6f)
+                )
               );
+            }
 
             return bool.minimumShouldMatch("1");
           })
         )
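The conditional `minimumShouldMatch` spec on the zh matches is worth decoding: each `n<-m` clause applies when the query produces more than `n` optional terms, the highest applicable threshold wins, and the negative value means "all but m". Since `zh_search` emits 2~3-grams, even a short Chinese query yields several terms. My reading of the spec, as a small illustration (sketch only, not project code):

    // "2<-1 3<-1 4<-1 5<-2 6<-2 7<-3": required matches among n gram terms
    static int requiredMatches(int terms) {
      int drop = 0;
      if (terms > 2) drop = 1; // 3-5 terms: all but one
      if (terms > 5) drop = 2; // 6-7 terms: all but two
      if (terms > 7) drop = 3; // 8+ terms:  all but three
      return terms - drop;
    }
    // requiredMatches(2) == 2, requiredMatches(4) == 3,
    // requiredMatches(6) == 4, requiredMatches(9) == 6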
-        .highlight(h ->
-          h
+        // ---------- highlighting: allow cross-sub-field backfill + matched-field groups ----------
+        .highlight(h -> {
+          var hb = h
             .preTags("<mark>")
             .postTags("</mark>")
-            .fields("title", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1))
+            .requireFieldMatch(false)
+            .fields("title", f ->
+              f
+                .fragmentSize(highlightFragmentSize())
+                .numberOfFragments(1)
+                .matchedFields(List.of("title", "title.zh", "title.py"))
+            )
+            .fields("content", f ->
+              f
+                .fragmentSize(highlightFragmentSize())
+                .numberOfFragments(1)
+                .matchedFields(List.of("content", "content.zh", "content.py"))
+            )
+            .fields("title.zh", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1))
+            .fields("content.zh", f ->
+              f.fragmentSize(highlightFragmentSize()).numberOfFragments(1)
+            )
             .fields("title.py", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1))
-            .fields("content", f -> f.fragmentSize(highlightFragmentSize()).numberOfFragments(1))
             .fields("content.py", f ->
               f.fragmentSize(highlightFragmentSize()).numberOfFragments(1)
             )
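`requireFieldMatch(false)` lets a fragment come back even when the match happened on a sub-field rather than the stored parent field, and `matchedFields` folds hits from `title.zh`/`title.py` into the `title` highlight. One hedged caveat: in Lucene-based engines `matched_fields` has traditionally been a fast-vector-highlighter feature that wants `term_vector: with_positions_offsets` on the participating fields, so it is worth verifying the fragments really combine on the deployed OpenSearch version. The emitted request body should look roughly like this (sketch; the `fragment_size` value stands in for whatever `highlightFragmentSize()` returns):

    "highlight": {
      "pre_tags": ["<mark>"],
      "post_tags": ["</mark>"],
      "require_field_match": false,
      "fields": {
        "title": {
          "fragment_size": 160,
          "number_of_fragments": 1,
          "matched_fields": ["title", "title.zh", "title.py"]
        },
        "content": {
          "fragment_size": 160,
          "number_of_fragments": 1,
          "matched_fields": ["content", "content.zh", "content.py"]
        }
      }
    }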
@@ -353,8 +367,9 @@ public class SearchService {
             .fields("category", f -> f.numberOfFragments(0))
             .fields("category.py", f -> f.numberOfFragments(0))
             .fields("tags", f -> f.numberOfFragments(0))
-            .fields("tags.py", f -> f.numberOfFragments(0))
-        )
+            .fields("tags.py", f -> f.numberOfFragments(0));
+          return hb;
+        })
         .size(DEFAULT_OPEN_SEARCH_LIMIT > 0 ? DEFAULT_OPEN_SEARCH_LIMIT : 10),
       SearchDocument.class
     );
@@ -435,8 +450,20 @@ public class SearchService {
       return null;
     }
     Map<String, List<String>> highlight = hit.highlight();
-    String highlightedContent = firstHighlight(highlight, "content", "content.py");
-    String highlightedTitle = firstHighlight(highlight, "title", "title.py");
+    String highlightedContent = firstHighlight(
+      highlight,
+      "content",
+      "content.py",
+      "content.zh",
+      "content.raw"
+    );
+    String highlightedTitle = firstHighlight(
+      highlight,
+      "title",
+      "title.py",
+      "title.zh",
+      "title.raw"
+    );
     String highlightedAuthor = firstHighlight(highlight, "author", "author.py");
     String highlightedCategory = firstHighlight(highlight, "category", "category.py");
     boolean highlightTitle = highlightedTitle != null && !highlightedTitle.isBlank();
@@ -451,6 +478,7 @@ public class SearchService {
     if (snippetHtml == null && highlightTitle) {
       snippetHtml = highlightedTitle;
     }
 
     String snippet = snippetHtml != null && !snippetHtml.isBlank()
       ? cleanHighlight(snippetHtml)
       : null;
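`firstHighlight` itself is not shown in this diff; from the call sites it reads as "first non-empty fragment among the listed fields, in order". A hypothetical sketch of such a helper (behavior inferred, not the project's actual implementation):

    private static String firstHighlight(Map<String, List<String>> highlight, String... fields) {
      if (highlight == null) return null;
      for (String field : fields) {
        List<String> fragments = highlight.get(field);
        if (fragments != null && !fragments.isEmpty()) {
          return fragments.get(0); // first fragment wins; callers fall back across fields in order
        }
      }
      return null;
    }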