feat: optimize elasticsearch ai-search plugin and update related docs (#2100)

This commit is contained in:
Se7en
2025-04-22 13:33:38 +08:00
committed by GitHub
parent 36d5d391b8
commit b8133a95b2
4 changed files with 44 additions and 33 deletions

View File

@@ -75,18 +75,22 @@ description: higress 支持通过集成搜索引擎Google/Bing/Arxiv/Elastics
## Elasticsearch 特定配置
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|------|----------|----------|--------|-----------------------|
| index | string | 必填 | - | 要搜索的Elasticsearch索引名称 |
| contentField | string | 必填 | - | 要查询的内容字段名称 |
| semanticTextField | string | 必填 | - | 要查询的 embedding 字段名称 |
| linkField | string | 必填 | - | 结果链接字段名称 |
| titleField | string | 必填 | - | 结果标题字段名称 |
| username | string | 选填 | - | Elasticsearch 用户名 |
| password | string | 选填 | - | Elasticsearch 密码 |
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|------|----------|------|--------|------------------------------------|
| index | string | 必填 | - | 要搜索的 Elasticsearch 索引名称 |
| contentField | string | 必填 | - | 要查询的内容字段名称 |
| semanticTextField | string | 必填 | - | 要查询的 embedding 字段名称 |
| linkField | string | 选填 | - | 结果链接字段名称,当配置 `needReference` 时需要填写 |
| titleField | string | 选填 | - | 结果标题字段名称,当配置 `needReference` 时需要填写 |
| username | string | 选填 | - | Elasticsearch 用户名 |
| password | string | 选填 | - | Elasticsearch 密码 |
混合搜索中使用的 [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) 查询要求 Elasticsearch 的版本在 8.8 及以上。
目前文档向量化依赖于 Elasticsearch 的 Embedding 模型,该功能需要 Elasticsearch 企业版 License,或可使用 30 天的 Trial License。安装 Elasticsearch 内置 Embedding 模型的步骤可参考[该文档](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-elser#alternative-download-deploy);若需安装第三方 Embedding 模型,可参考[该文档](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-text-emb-vector-search-example)。
有关 ai-search 插件集成 Elasticsearch 的完整教程,请参考:[使用 LangChain + Higress + Elasticsearch 构建 RAG 应用](https://cr7258.github.io/blogs/original/2025/15-rag-higress-es-langchain)。
## Quark 特定配置
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
@@ -204,13 +208,9 @@ searchFrom:
searchFrom:
- type: elasticsearch
serviceName: "es-svc.static"
# 固定地址服务的端口默认是80
servicePort: 80
index: "knowledge_base"
contentField: "content"
semanticTextField: "semantic_text"
linkField: "url"
titleField: "title"
# username: "elastic"
# password: "password"
```

View File

@@ -80,13 +80,17 @@ It is strongly recommended to enable this feature when using Arxiv or Elasticsea
| index | string | Required | - | Elasticsearch index name to search |
| contentField | string | Required | - | Content field name to query |
| semanticTextField | string | Required | - | Embedding field name to query |
| linkField | string | Required | - | Result link field name |
| titleField | string | Required | - | Result title field name |
| linkField | string | Optional | - | Result link field name, needed when `needReference` is configured |
| titleField | string | Optional | - | Result title field name, needed when `needReference` is configured |
| username | string | Optional | - | Elasticsearch username |
| password | string | Optional | - | Elasticsearch password |
The [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) query used in hybrid search requires Elasticsearch version 8.8 or higher.
Currently, document vectorization relies on Elasticsearch's embedding model, which requires an Elasticsearch Enterprise license or a 30-day Trial license. To install the built-in embedding model in Elasticsearch, please refer to [this documentation](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-elser#alternative-download-deploy). If you want to install a third-party embedding model, please refer to [this guide](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-text-emb-vector-search-example).
For a complete tutorial on integrating the ai-search plugin with Elasticsearch, please refer to: [Building a RAG Application with LangChain + Higress + Elasticsearch](https://cr7258.github.io/blogs/original/2025/15-rag-higress-es-langchain).
## Quark Specific Configuration
| Name | Data Type | Requirement | Default Value | Description |
@@ -203,13 +207,9 @@ Note that excessive concurrency may lead to rate limiting, adjust according to a
searchFrom:
- type: elasticsearch
serviceName: "es-svc.static"
# static IP services use port 80 by default
servicePort: 80
index: "knowledge_base"
contentField: "content"
semanticTextField: "semantic_text"
linkField: "url"
titleField: "title"
# username: "elastic"
# password: "password"
```

View File

@@ -27,7 +27,7 @@ type ElasticsearchSearch struct {
password string
}
func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error) {
func NewElasticsearchSearch(config *gjson.Result, needReference bool) (*ElasticsearchSearch, error) {
engine := &ElasticsearchSearch{}
serviceName := config.Get("serviceName").String()
if serviceName == "" {
@@ -35,7 +35,13 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
}
servicePort := config.Get("servicePort").Int()
if servicePort == 0 {
return nil, errors.New("servicePort not found")
if strings.HasSuffix(serviceName, ".static") {
servicePort = 80
} else if strings.HasSuffix(serviceName, ".dns") {
servicePort = 443
} else {
return nil, errors.New("servicePort not found")
}
}
engine.client = wrapper.NewClusterClient(wrapper.FQDNCluster{
FQDN: serviceName,
@@ -54,14 +60,18 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
if engine.semanticTextField == "" {
return nil, errors.New("semanticTextField not found")
}
engine.linkField = config.Get("linkField").String()
if engine.linkField == "" {
return nil, errors.New("linkField not found")
}
engine.titleField = config.Get("titleField").String()
if engine.titleField == "" {
return nil, errors.New("titleField not found")
if needReference {
engine.linkField = config.Get("linkField").String()
if engine.linkField == "" {
return nil, errors.New("linkField not found")
}
engine.titleField = config.Get("titleField").String()
if engine.titleField == "" {
return nil, errors.New("titleField not found")
}
}
engine.timeoutMillisecond = uint32(config.Get("timeoutMillisecond").Uint())
if engine.timeoutMillisecond == 0 {
engine.timeoutMillisecond = 5000
@@ -93,6 +103,9 @@ func (e ElasticsearchSearch) generateAuthorizationHeader() string {
func (e ElasticsearchSearch) generateQueryBody(ctx engine.SearchContext) string {
queryText := strings.Join(ctx.Querys, " ")
return fmt.Sprintf(`{
"_source":{
"excludes": "%s"
},
"retriever": {
"rrf": {
"retrievers": [
@@ -118,7 +131,7 @@ func (e ElasticsearchSearch) generateQueryBody(ctx engine.SearchContext) string
]
}
}
}`, e.contentField, queryText, e.semanticTextField, queryText)
}`, e.semanticTextField, e.contentField, queryText, e.semanticTextField, queryText)
}
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
@@ -145,9 +158,7 @@ func (e ElasticsearchSearch) ParseResult(ctx engine.SearchContext, response []by
Link: source.Get(e.linkField).String(),
Content: source.Get(e.contentField).String(),
}
if result.Valid() {
results = append(results, result)
}
results = append(results, result)
}
return results
}

View File

@@ -185,7 +185,7 @@ func parseConfig(json gjson.Result, config *Config, log wrapper.Log) error {
arxivExists = true
onlyQuark = false
case "elasticsearch":
searchEngine, err := elasticsearch.NewElasticsearchSearch(&e)
searchEngine, err := elasticsearch.NewElasticsearchSearch(&e, config.needReference)
if err != nil {
return fmt.Errorf("elasticsearch search engine init failed:%s", err)
}