mirror of
https://github.com/alibaba/higress.git
synced 2026-06-07 03:37:28 +08:00
feat: optimize elasticsearch ai-search plugin and update related docs (#2100)
This commit is contained in:
@@ -75,18 +75,22 @@ description: higress 支持通过集成搜索引擎(Google/Bing/Arxiv/Elastics
|
|||||||
|
|
||||||
## Elasticsearch 特定配置
|
## Elasticsearch 特定配置
|
||||||
|
|
||||||
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|
||||||
|------|----------|----------|--------|-----------------------|
|
|------|----------|------|--------|------------------------------------|
|
||||||
| index | string | 必填 | - | 要搜索的Elasticsearch索引名称 |
|
| index | string | 必填 | - | 要搜索的 Elasticsearch 索引名称 |
|
||||||
| contentField | string | 必填 | - | 要查询的内容字段名称 |
|
| contentField | string | 必填 | - | 要查询的内容字段名称 |
|
||||||
| semanticTextField | string | 必填 | - | 要查询的 embedding 字段名称 |
|
| semanticTextField | string | 必填 | - | 要查询的 embedding 字段名称 |
|
||||||
| linkField | string | 必填 | - | 结果链接字段名称 |
|
| linkField | string | 选填 | - | 结果链接字段名称,当配置 `needReference` 时需要填写 |
|
||||||
| titleField | string | 必填 | - | 结果标题字段名称 |
|
| titleField | string | 选填 | - | 结果标题字段名称,当配置 `needReference` 时需要填写 |
|
||||||
| username | string | 选填 | - | Elasticsearch 用户名 |
|
| username | string | 选填 | - | Elasticsearch 用户名 |
|
||||||
| password | string | 选填 | - | Elasticsearch 密码 |
|
| password | string | 选填 | - | Elasticsearch 密码 |
|
||||||
|
|
||||||
混合搜索中使用的 [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) 查询要求 Elasticsearch 的版本在 8.8 及以上。
|
混合搜索中使用的 [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) 查询要求 Elasticsearch 的版本在 8.8 及以上。
|
||||||
|
|
||||||
|
目前文档向量化依赖于 Elasticsearch 的 Embedding 模型,该功能需要 Elasticsearch 企业版 License,或可使用 30 天的 Trial License。安装 Elasticsearch 内置 Embedding 模型的步骤可参考[该文档](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-elser#alternative-download-deploy);若需安装第三方 Embedding 模型,可参考[该文档](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-text-emb-vector-search-example)。
|
||||||
|
|
||||||
|
有关 ai-search 插件集成 Elasticsearch 的完整教程,请参考:[使用 LangChain + Higress + Elasticsearch 构建 RAG 应用](https://cr7258.github.io/blogs/original/2025/15-rag-higress-es-langchain)。
|
||||||
|
|
||||||
## Quark 特定配置
|
## Quark 特定配置
|
||||||
|
|
||||||
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|
||||||
@@ -204,13 +208,9 @@ searchFrom:
|
|||||||
searchFrom:
|
searchFrom:
|
||||||
- type: elasticsearch
|
- type: elasticsearch
|
||||||
serviceName: "es-svc.static"
|
serviceName: "es-svc.static"
|
||||||
# 固定地址服务的端口默认是80
|
|
||||||
servicePort: 80
|
|
||||||
index: "knowledge_base"
|
index: "knowledge_base"
|
||||||
contentField: "content"
|
contentField: "content"
|
||||||
semanticTextField: "semantic_text"
|
semanticTextField: "semantic_text"
|
||||||
linkField: "url"
|
|
||||||
titleField: "title"
|
|
||||||
# username: "elastic"
|
# username: "elastic"
|
||||||
# password: "password"
|
# password: "password"
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -80,13 +80,17 @@ It is strongly recommended to enable this feature when using Arxiv or Elasticsea
|
|||||||
| index | string | Required | - | Elasticsearch index name to search |
|
| index | string | Required | - | Elasticsearch index name to search |
|
||||||
| contentField | string | Required | - | Content field name to query |
|
| contentField | string | Required | - | Content field name to query |
|
||||||
| semanticTextField | string | Required | - | Embedding field name to query |
|
| semanticTextField | string | Required | - | Embedding field name to query |
|
||||||
| linkField | string | Required | - | Result link field name |
|
| linkField | string | Optional | - | Result link field name; required when `needReference` is enabled |
|
||||||
| titleField | string | Required | - | Result title field name |
|
| titleField | string | Optional | - | Result title field name; required when `needReference` is enabled |
|
||||||
| username | string | Optional | - | Elasticsearch username |
|
| username | string | Optional | - | Elasticsearch username |
|
||||||
| password | string | Optional | - | Elasticsearch password |
|
| password | string | Optional | - | Elasticsearch password |
|
||||||
|
|
||||||
The [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) query used in hybrid search requires Elasticsearch version 8.8 or higher.
|
The [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) query used in hybrid search requires Elasticsearch version 8.8 or higher.
|
||||||
|
|
||||||
|
Currently, document vectorization relies on Elasticsearch's embedding model, which requires an Elasticsearch Enterprise license or a 30-day Trial license. To install the built-in embedding model in Elasticsearch, please refer to [this documentation](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-elser#alternative-download-deploy). If you want to install a third-party embedding model, please refer to [this guide](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-text-emb-vector-search-example).
|
||||||
|
|
||||||
|
For a complete tutorial on integrating the ai-search plugin with Elasticsearch, please refer to: [Building a RAG Application with LangChain + Higress + Elasticsearch](https://cr7258.github.io/blogs/original/2025/15-rag-higress-es-langchain).
|
||||||
|
|
||||||
## Quark Specific Configuration
|
## Quark Specific Configuration
|
||||||
|
|
||||||
| Name | Data Type | Requirement | Default Value | Description |
|
| Name | Data Type | Requirement | Default Value | Description |
|
||||||
@@ -203,13 +207,9 @@ Note that excessive concurrency may lead to rate limiting, adjust according to a
|
|||||||
searchFrom:
|
searchFrom:
|
||||||
- type: elasticsearch
|
- type: elasticsearch
|
||||||
serviceName: "es-svc.static"
|
serviceName: "es-svc.static"
|
||||||
# static IP services use port 80 by default
|
|
||||||
servicePort: 80
|
|
||||||
index: "knowledge_base"
|
index: "knowledge_base"
|
||||||
contentField: "content"
|
contentField: "content"
|
||||||
semanticTextField: "semantic_text"
|
semanticTextField: "semantic_text"
|
||||||
linkField: "url"
|
|
||||||
titleField: "title"
|
|
||||||
# username: "elastic"
|
# username: "elastic"
|
||||||
# password: "password"
|
# password: "password"
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ type ElasticsearchSearch struct {
|
|||||||
password string
|
password string
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error) {
|
func NewElasticsearchSearch(config *gjson.Result, needReference bool) (*ElasticsearchSearch, error) {
|
||||||
engine := &ElasticsearchSearch{}
|
engine := &ElasticsearchSearch{}
|
||||||
serviceName := config.Get("serviceName").String()
|
serviceName := config.Get("serviceName").String()
|
||||||
if serviceName == "" {
|
if serviceName == "" {
|
||||||
@@ -35,7 +35,13 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
|
|||||||
}
|
}
|
||||||
servicePort := config.Get("servicePort").Int()
|
servicePort := config.Get("servicePort").Int()
|
||||||
if servicePort == 0 {
|
if servicePort == 0 {
|
||||||
return nil, errors.New("servicePort not found")
|
if strings.HasSuffix(serviceName, ".static") {
|
||||||
|
servicePort = 80
|
||||||
|
} else if strings.HasSuffix(serviceName, ".dns") {
|
||||||
|
servicePort = 443
|
||||||
|
} else {
|
||||||
|
return nil, errors.New("servicePort not found")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
engine.client = wrapper.NewClusterClient(wrapper.FQDNCluster{
|
engine.client = wrapper.NewClusterClient(wrapper.FQDNCluster{
|
||||||
FQDN: serviceName,
|
FQDN: serviceName,
|
||||||
@@ -54,14 +60,18 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
|
|||||||
if engine.semanticTextField == "" {
|
if engine.semanticTextField == "" {
|
||||||
return nil, errors.New("semanticTextField not found")
|
return nil, errors.New("semanticTextField not found")
|
||||||
}
|
}
|
||||||
engine.linkField = config.Get("linkField").String()
|
|
||||||
if engine.linkField == "" {
|
if needReference {
|
||||||
return nil, errors.New("linkField not found")
|
engine.linkField = config.Get("linkField").String()
|
||||||
}
|
if engine.linkField == "" {
|
||||||
engine.titleField = config.Get("titleField").String()
|
return nil, errors.New("linkField not found")
|
||||||
if engine.titleField == "" {
|
}
|
||||||
return nil, errors.New("titleField not found")
|
engine.titleField = config.Get("titleField").String()
|
||||||
|
if engine.titleField == "" {
|
||||||
|
return nil, errors.New("titleField not found")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
engine.timeoutMillisecond = uint32(config.Get("timeoutMillisecond").Uint())
|
engine.timeoutMillisecond = uint32(config.Get("timeoutMillisecond").Uint())
|
||||||
if engine.timeoutMillisecond == 0 {
|
if engine.timeoutMillisecond == 0 {
|
||||||
engine.timeoutMillisecond = 5000
|
engine.timeoutMillisecond = 5000
|
||||||
@@ -93,6 +103,9 @@ func (e ElasticsearchSearch) generateAuthorizationHeader() string {
|
|||||||
func (e ElasticsearchSearch) generateQueryBody(ctx engine.SearchContext) string {
|
func (e ElasticsearchSearch) generateQueryBody(ctx engine.SearchContext) string {
|
||||||
queryText := strings.Join(ctx.Querys, " ")
|
queryText := strings.Join(ctx.Querys, " ")
|
||||||
return fmt.Sprintf(`{
|
return fmt.Sprintf(`{
|
||||||
|
"_source":{
|
||||||
|
"excludes": "%s"
|
||||||
|
},
|
||||||
"retriever": {
|
"retriever": {
|
||||||
"rrf": {
|
"rrf": {
|
||||||
"retrievers": [
|
"retrievers": [
|
||||||
@@ -118,7 +131,7 @@ func (e ElasticsearchSearch) generateQueryBody(ctx engine.SearchContext) string
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}`, e.contentField, queryText, e.semanticTextField, queryText)
|
}`, e.semanticTextField, e.contentField, queryText, e.semanticTextField, queryText)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
|
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
|
||||||
@@ -145,9 +158,7 @@ func (e ElasticsearchSearch) ParseResult(ctx engine.SearchContext, response []by
|
|||||||
Link: source.Get(e.linkField).String(),
|
Link: source.Get(e.linkField).String(),
|
||||||
Content: source.Get(e.contentField).String(),
|
Content: source.Get(e.contentField).String(),
|
||||||
}
|
}
|
||||||
if result.Valid() {
|
results = append(results, result)
|
||||||
results = append(results, result)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return results
|
return results
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -185,7 +185,7 @@ func parseConfig(json gjson.Result, config *Config, log wrapper.Log) error {
|
|||||||
arxivExists = true
|
arxivExists = true
|
||||||
onlyQuark = false
|
onlyQuark = false
|
||||||
case "elasticsearch":
|
case "elasticsearch":
|
||||||
searchEngine, err := elasticsearch.NewElasticsearchSearch(&e)
|
searchEngine, err := elasticsearch.NewElasticsearchSearch(&e, config.needReference)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("elasticsearch search engine init failed:%s", err)
|
return fmt.Errorf("elasticsearch search engine init failed:%s", err)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user