fix: add pinyin

This commit is contained in:
Tim
2025-09-28 14:28:45 +08:00
parent 23cc2d1606
commit 8869121bcb
2 changed files with 105 additions and 19 deletions

View File

@@ -2,11 +2,16 @@ package com.openisle.search;
import jakarta.annotation.PostConstruct;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.opensearch.client.json.JsonData;
import org.opensearch.client.opensearch.OpenSearchClient;
import org.opensearch.client.opensearch._types.mapping.Property;
import org.opensearch.client.opensearch._types.mapping.TypeMapping;
import org.opensearch.client.opensearch.indices.IndexSettings;
@Slf4j
@RequiredArgsConstructor
@@ -36,7 +41,11 @@ public class SearchIndexInitializer {
if (exists) {
return;
}
client.indices().create(builder -> builder.index(index).mappings(mappingSupplier.get()));
client
.indices()
.create(builder ->
builder.index(index).settings(this::applyPinyinAnalysis).mappings(mappingSupplier.get())
);
log.info("Created OpenSearch index {}", index);
} catch (IOException e) {
log.warn("Failed to initialize OpenSearch index {}", index, e);
@@ -47,11 +56,11 @@ public class SearchIndexInitializer {
return TypeMapping.of(builder ->
builder
.properties("type", Property.of(p -> p.keyword(k -> k)))
.properties("title", Property.of(p -> p.text(t -> t)))
.properties("content", Property.of(p -> p.text(t -> t)))
.properties("author", Property.of(p -> p.keyword(k -> k)))
.properties("category", Property.of(p -> p.keyword(k -> k)))
.properties("tags", Property.of(p -> p.keyword(k -> k)))
.properties("title", textWithPinyin())
.properties("content", textWithPinyin())
.properties("author", keywordWithPinyin())
.properties("category", keywordWithPinyin())
.properties("tags", keywordWithPinyin())
.properties("postId", Property.of(p -> p.long_(l -> l)))
.properties(
"createdAt",
@@ -64,11 +73,11 @@ public class SearchIndexInitializer {
return TypeMapping.of(builder ->
builder
.properties("type", Property.of(p -> p.keyword(k -> k)))
.properties("title", Property.of(p -> p.text(t -> t)))
.properties("content", Property.of(p -> p.text(t -> t)))
.properties("author", Property.of(p -> p.keyword(k -> k)))
.properties("category", Property.of(p -> p.keyword(k -> k)))
.properties("tags", Property.of(p -> p.keyword(k -> k)))
.properties("title", textWithPinyin())
.properties("content", textWithPinyin())
.properties("author", keywordWithPinyin())
.properties("category", keywordWithPinyin())
.properties("tags", keywordWithPinyin())
.properties("postId", Property.of(p -> p.long_(l -> l)))
.properties(
"createdAt",
@@ -81,8 +90,8 @@ public class SearchIndexInitializer {
return TypeMapping.of(builder ->
builder
.properties("type", Property.of(p -> p.keyword(k -> k)))
.properties("title", Property.of(p -> p.text(t -> t)))
.properties("content", Property.of(p -> p.text(t -> t)))
.properties("title", textWithPinyin())
.properties("content", textWithPinyin())
.properties(
"createdAt",
Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis")))
@@ -94,8 +103,8 @@ public class SearchIndexInitializer {
return TypeMapping.of(builder ->
builder
.properties("type", Property.of(p -> p.keyword(k -> k)))
.properties("title", Property.of(p -> p.text(t -> t)))
.properties("content", Property.of(p -> p.text(t -> t)))
.properties("title", textWithPinyin())
.properties("content", textWithPinyin())
);
}
@@ -103,12 +112,55 @@ public class SearchIndexInitializer {
return TypeMapping.of(builder ->
builder
.properties("type", Property.of(p -> p.keyword(k -> k)))
.properties("title", Property.of(p -> p.text(t -> t)))
.properties("content", Property.of(p -> p.text(t -> t)))
.properties("title", textWithPinyin())
.properties("content", textWithPinyin())
.properties(
"createdAt",
Property.of(p -> p.date(d -> d.format("strict_date_optional_time||epoch_millis")))
)
);
}
/**
 * Builds the mapping for a full-text field that also exposes a {@code py} sub-field.
 *
 * <p>The main field is a plain {@code text} property with no analyzer configured here. The
 * {@code py} sub-field is a {@code text} property indexed with the {@code py_index} analyzer
 * and queried with the {@code py_search} analyzer, so it is addressed as
 * {@code <field>.py} in queries.
 *
 * @return a {@code text} property carrying the {@code py} multi-field
 */
private Property textWithPinyin() {
  // Sub-field definition: text analysed by the named pinyin analyzers.
  Property pinyinSubField = Property.of(sub ->
    sub.text(txt -> txt.analyzer("py_index").searchAnalyzer("py_search"))
  );
  return Property.of(root -> root.text(txt -> txt.fields("py", pinyinSubField)));
}
/**
 * Builds the mapping for an exact-match field that also exposes a {@code py} sub-field.
 *
 * <p>The main field stays a {@code keyword} property. The {@code py} sub-field is a
 * {@code text} property that uses the {@code py_index} analyzer when indexing and the
 * {@code py_search} analyzer when searching; queries reach it as {@code <field>.py}.
 *
 * @return a {@code keyword} property carrying the {@code py} multi-field
 */
private Property keywordWithPinyin() {
  // Analysed text view of the keyword value, built first so the outer mapping reads flat.
  Property analysedView = Property.of(inner ->
    inner.text(txt -> txt.analyzer("py_index").searchAnalyzer("py_search"))
  );
  Property keywordMapping = Property.of(outer ->
    outer.keyword(kw -> kw.fields("py", analysedView))
  );
  return keywordMapping;
}
/**
 * Adds the analysis configuration used by the {@code py} sub-fields to the given index
 * settings builder.
 *
 * <p>The settings are registered as flat, dotted custom-setting keys and define:
 * <ul>
 *   <li>a token filter {@code py_filter} of type {@code pinyin} with full pinyin, joined full
 *       pinyin and first-letter output enabled, and duplicate terms removed;</li>
 *   <li>two custom analyzers, {@code py_index} and {@code py_search}, each using the
 *       {@code standard} tokenizer followed by the {@code lowercase} and {@code py_filter}
 *       filters.</li>
 * </ul>
 *
 * <p>NOTE(review): the {@code pinyin} filter type is presumably provided by an analysis plugin
 * installed on the cluster — confirm it is available wherever this index is created.
 *
 * @param builder the index settings builder to populate
 * @return the same {@code builder} instance, for use as a settings function
 */
private IndexSettings.Builder applyPinyinAnalysis(IndexSettings.Builder builder) {
  // Token filter definition.
  builder.customSettings("analysis.filter.py_filter.type", JsonData.of("pinyin"));
  builder.customSettings("analysis.filter.py_filter.keep_full_pinyin", JsonData.of(true));
  builder.customSettings("analysis.filter.py_filter.keep_joined_full_pinyin", JsonData.of(true));
  builder.customSettings("analysis.filter.py_filter.keep_first_letter", JsonData.of(true));
  builder.customSettings("analysis.filter.py_filter.remove_duplicated_term", JsonData.of(true));

  // Index-time analyzer.
  builder.customSettings("analysis.analyzer.py_index.type", JsonData.of("custom"));
  builder.customSettings("analysis.analyzer.py_index.tokenizer", JsonData.of("standard"));
  builder.customSettings(
    "analysis.analyzer.py_index.filter",
    JsonData.of(List.of("lowercase", "py_filter"))
  );

  // Search-time analyzer; configured with the same tokenizer and filter chain.
  builder.customSettings("analysis.analyzer.py_search.type", JsonData.of("custom"));
  builder.customSettings("analysis.analyzer.py_search.tokenizer", JsonData.of("standard"));
  builder.customSettings(
    "analysis.analyzer.py_search.filter",
    JsonData.of(List.of("lowercase", "py_filter"))
  );

  return builder;
}
}

View File

@@ -197,7 +197,7 @@ public class SearchService {
s.multiMatch(mm ->
mm
.query(qRaw)
.fields("title^3", "content^2")
.fields("title^3", "title.py^3", "content^2", "content.py^2")
.type(TextQueryType.BestFields)
.fuzziness("AUTO")
.minimumShouldMatch("70%")
@@ -210,7 +210,17 @@ public class SearchService {
bool.should(s ->
s.queryString(qs ->
qs
.query("(title:" + qsEscaped + "* OR content:" + qsEscaped + "*)")
.query(
"(title:" +
qsEscaped +
"* OR title.py:" +
qsEscaped +
"* OR content:" +
qsEscaped +
"* OR content.py:" +
qsEscaped +
"*)"
)
.analyzeWildcard(true)
)
);
@@ -226,6 +236,30 @@ public class SearchService {
.boost(2.0f)
)
);
bool.should(s ->
s.match(m ->
m
.field("author.py")
.query(v -> v.stringValue(qRaw))
.boost(2.0f)
)
);
bool.should(s ->
s.match(m ->
m
.field("category.py")
.query(v -> v.stringValue(qRaw))
.boost(1.2f)
)
);
bool.should(s ->
s.match(m ->
m
.field("tags.py")
.query(v -> v.stringValue(qRaw))
.boost(1.2f)
)
);
if (enableWildcard) {
// prefix/wildcard 这里的 value 是 String直接传即可