// File: OpenIsle/backend/src/main/java/com/openisle/search/SearchIndexInitializer.java
package com.openisle.search;

import jakarta.annotation.PostConstruct;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.opensearch.client.json.JsonData;
import org.opensearch.client.opensearch.OpenSearchClient;
import org.opensearch.client.opensearch._types.mapping.Property;
import org.opensearch.client.opensearch._types.mapping.TypeMapping;
import org.opensearch.client.opensearch.indices.IndexSettings;
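
/**
 * Creates the OpenSearch indices used by site search at startup, applying the
 * pinyin + ICU n-gram analysis settings and the per-type mappings defined below.
 * All five indices share one unified document shape (type/title/content/...),
 * which suggests a single query can fan out across them. There is no stereotype
 * annotation here, so the class is presumably wired up as a bean elsewhere
 * (e.g. in a {@code @Configuration} class).
 */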
@Slf4j
@RequiredArgsConstructor
public class SearchIndexInitializer {

  private final OpenSearchClient client;
  private final OpenSearchProperties properties;

  @PostConstruct
  public void initialize() {
    if (!properties.isEnabled() || !properties.isInitialize()) {
      return;
    }
    ensureIndex(properties.postsIndex(), this::postMapping);
    ensureIndex(properties.commentsIndex(), this::commentMapping);
    ensureIndex(properties.usersIndex(), this::userMapping);
    ensureIndex(properties.categoriesIndex(), this::categoryMapping);
    ensureIndex(properties.tagsIndex(), this::tagMapping);
  }

  private void ensureIndex(String index, java.util.function.Supplier<TypeMapping> mappingSupplier) {
    try {
      boolean exists = client
        .indices()
        .exists(builder -> builder.index(index))
        .value();
      if (exists) {
        return;
      }
      client
        .indices()
        .create(builder ->
          builder.index(index).settings(this::applyPinyinAnalysis).mappings(mappingSupplier.get())
        );
      log.info("Created OpenSearch index {}", index);
    } catch (IOException e) {
      log.warn("Failed to initialize OpenSearch index {}", index, e);
    }
  }
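
  // Note: indices that already exist are skipped, so changes to the mappings or
  // analyzers below only take effect for newly created indices; existing ones
  // would need a manual settings update or a reindex.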

  private TypeMapping postMapping() {
    return TypeMapping.of(builder ->
      builder
        .properties("type", Property.of(p -> p.keyword(k -> k)))
        .properties("title", textWithRawAndPinyin())
        .properties("content", textWithPinyinOnly()) // no .raw on content, to avoid oversized keyword values
        .properties("author", keywordWithRawAndPinyin())
        .properties("category", keywordWithRawAndPinyin())
        .properties("tags", keywordWithRawAndPinyin())
        .properties("postId", Property.of(p -> p.long_(l -> l)))
        .properties(
          "createdAt",
          Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis")))
        )
    );
  }
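
  // Illustrative document for this mapping (all values hypothetical):
  //   { "type": "post", "title": "...", "content": "...", "author": "alice",
  //     "category": "tech", "tags": ["java"], "postId": 42,
  //     "createdAt": "2025-09-28T09:58:58.000Z" }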

  /** Same unified shape as {@link #postMapping()}; the indexer presumably sets "type" to distinguish comments from posts. */
  private TypeMapping commentMapping() {
    return TypeMapping.of(builder ->
      builder
        .properties("type", Property.of(p -> p.keyword(k -> k)))
        .properties("title", textWithRawAndPinyin())
        .properties("content", textWithPinyinOnly())
        .properties("author", keywordWithRawAndPinyin())
        .properties("category", keywordWithRawAndPinyin())
        .properties("tags", keywordWithRawAndPinyin())
        .properties("postId", Property.of(p -> p.long_(l -> l)))
        .properties(
          "createdAt",
          Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis")))
        )
    );
  }

  private TypeMapping userMapping() {
    return TypeMapping.of(builder ->
      builder
        .properties("type", Property.of(p -> p.keyword(k -> k)))
        .properties("title", textWithRawAndPinyin())
        .properties("content", textWithPinyinOnly())
        .properties(
          "createdAt",
          Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis")))
        )
    );
  }

  private TypeMapping categoryMapping() {
    return TypeMapping.of(builder ->
      builder
        .properties("type", Property.of(p -> p.keyword(k -> k)))
        .properties("title", textWithRawAndPinyin())
        .properties("content", textWithPinyinOnly())
    );
  }

  private TypeMapping tagMapping() {
    return TypeMapping.of(builder ->
      builder
        .properties("type", Property.of(p -> p.keyword(k -> k)))
        .properties("title", textWithRawAndPinyin())
        .properties("content", textWithPinyinOnly())
        .properties(
          "createdAt",
          Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis")))
        )
    );
  }

  /** Text fields: .raw (keyword, exact match) + .py (exact pinyin phrases) + .zh (ICU + 2-3 gram, for recall). */
  private Property textWithRawAndPinyin() {
    return Property.of(p ->
      p.text(t ->
        t
          .fields("raw", f -> f.keyword(k -> k.normalizer("lowercase_normalizer")))
          .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search")))
          .fields("zh", f ->
            f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search"))
          )
      )
    );
  }
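
  // Illustrative (not from this repo): a query-side sketch of how the three
  // subfields are meant to combine, e.g. for "title" (boosts hypothetical):
  //   { "multi_match": { "query": "zhongguo",
  //       "fields": ["title.raw^3", "title.py^2", "title.zh"] } }
  // .raw matches exact lowercased values, .py matches pinyin spellings of
  // Chinese text, and .zh matches 2-3 character ICU n-grams for recall.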

  /** Long-text content: keep the pinyin subfield + add a zh subfield (no .raw, to avoid overly long keyword values). */
  private Property textWithPinyinOnly() {
    return Property.of(p ->
      p.text(t ->
        t
          .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search")))
          .fields("zh", f ->
            f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search"))
          )
      )
    );
  }
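
  // Design note: without a .raw subfield there is no exact-value lookup on
  // content, which is the point here; long bodies would blow past keyword
  // length limits (see the Javadoc above).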

  /** Keyword fields (author/category/tags): exact keyword matching + .py + .zh, aligned with the title strategy as far as possible. */
  private Property keywordWithRawAndPinyin() {
    return Property.of(p ->
      p.keyword(k ->
        k
          .normalizer("lowercase_normalizer")
          .fields("raw", f -> f.keyword(kk -> kk.normalizer("lowercase_normalizer")))
          .fields("py", f -> f.text(sub -> sub.analyzer("py_index").searchAnalyzer("py_search")))
          .fields("zh", f ->
            f.text(sub -> sub.analyzer("zh_ngram_index").searchAnalyzer("zh_search"))
          )
      )
    );
  }
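
  // Note: the parent field is already a lowercased keyword, so the .raw subfield
  // duplicates it; presumably kept so queries can use the same field.raw naming
  // convention across both text and keyword fields.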

  /** New zh analyzers (ICU + 2-3 gram), while keeping the existing pinyin/normalizer settings. */
  private IndexSettings.Builder applyPinyinAnalysis(IndexSettings.Builder builder) {
    Map<String, JsonData> settings = new LinkedHashMap<>();
    // --- Existing: keyword normalizer (used by the .raw subfields)
    settings.put("analysis.normalizer.lowercase_normalizer.type", JsonData.of("custom"));
    settings.put(
      "analysis.normalizer.lowercase_normalizer.filter",
      JsonData.of(List.of("lowercase"))
    );
    // --- Existing: pinyin filter + analyzers
    settings.put("analysis.filter.py_filter.type", JsonData.of("pinyin"));
    settings.put("analysis.filter.py_filter.keep_full_pinyin", JsonData.of(true));
    settings.put("analysis.filter.py_filter.keep_joined_full_pinyin", JsonData.of(true));
    settings.put("analysis.filter.py_filter.keep_first_letter", JsonData.of(false));
    settings.put("analysis.filter.py_filter.remove_duplicated_term", JsonData.of(true));
    settings.put("analysis.analyzer.py_index.type", JsonData.of("custom"));
    settings.put("analysis.analyzer.py_index.tokenizer", JsonData.of("standard"));
    settings.put(
      "analysis.analyzer.py_index.filter",
      JsonData.of(List.of("lowercase", "py_filter"))
    );
    settings.put("analysis.analyzer.py_search.type", JsonData.of("custom"));
    settings.put("analysis.analyzer.py_search.tokenizer", JsonData.of("standard"));
    settings.put(
      "analysis.analyzer.py_search.filter",
      JsonData.of(List.of("lowercase", "py_filter"))
    );
    // --- New: 2-3 gram filter + ICU analyzers for Chinese recall
    settings.put("analysis.filter.zh_ngram_2_3.type", JsonData.of("ngram"));
    settings.put("analysis.filter.zh_ngram_2_3.min_gram", JsonData.of(2));
    settings.put("analysis.filter.zh_ngram_2_3.max_gram", JsonData.of(3));
    settings.put("analysis.analyzer.zh_ngram_index.type", JsonData.of("custom"));
    settings.put("analysis.analyzer.zh_ngram_index.tokenizer", JsonData.of("icu_tokenizer"));
    settings.put(
      "analysis.analyzer.zh_ngram_index.filter",
      JsonData.of(List.of("lowercase", "zh_ngram_2_3"))
    );
    settings.put("analysis.analyzer.zh_search.type", JsonData.of("custom"));
    settings.put("analysis.analyzer.zh_search.tokenizer", JsonData.of("icu_tokenizer"));
    settings.put(
      "analysis.analyzer.zh_search.filter",
      JsonData.of(List.of("lowercase", "zh_ngram_2_3"))
    );
    settings.forEach(builder::customSettings);
    return builder;
  }
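
  // For reference, the flattened keys above amount to this "analysis" config:
  //   normalizer: lowercase_normalizer        = custom + [lowercase]
  //   filter:     py_filter                   = pinyin (full + joined full pinyin, no first letters, dedup)
  //               zh_ngram_2_3                = ngram, min_gram 2, max_gram 3
  //   analyzer:   py_index / py_search        = standard tokenizer + [lowercase, py_filter]
  //               zh_ngram_index / zh_search  = icu_tokenizer + [lowercase, zh_ngram_2_3]
  // The pinyin filter and icu_tokenizer come from analysis plugins
  // (analysis-pinyin and analysis-icu), which must be installed on the cluster.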
}