mirror of
https://github.com/alibaba/higress.git
synced 2026-06-09 04:37:31 +08:00
feat: support elasticsearch hybrid search (#1844)
This commit is contained in:
@@ -73,11 +73,16 @@ description: higress 支持通过集成搜索引擎(Google/Bing/Arxiv/Elastics
|
|||||||
## Elasticsearch 特定配置
|
## Elasticsearch 特定配置
|
||||||
|
|
||||||
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|
||||||
|------|----------|----------|--------|------|
|
|------|----------|----------|--------|-----------------------|
|
||||||
| index | string | 必填 | - | 要搜索的Elasticsearch索引名称 |
|
| index | string | 必填 | - | 要搜索的Elasticsearch索引名称 |
|
||||||
| contentField | string | 必填 | - | 要查询的内容字段名称 |
|
| contentField | string | 必填 | - | 要查询的内容字段名称 |
|
||||||
|
| semanticTextField | string | 必填 | - | 要查询的 embedding 字段(`semantic_text` 类型)名称 |
|
||||||
| linkField | string | 必填 | - | 结果链接字段名称 |
|
| linkField | string | 必填 | - | 结果链接字段名称 |
|
||||||
| titleField | string | 必填 | - | 结果标题字段名称 |
|
| titleField | string | 必填 | - | 结果标题字段名称 |
|
||||||
|
| username | string | 选填 | - | Elasticsearch 用户名 |
|
||||||
|
| password | string | 选填 | - | Elasticsearch 密码 |
|
||||||
|
|
||||||
|
混合搜索中使用的 [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) 查询要求 Elasticsearch 的版本在 8.8 及以上。注意:本插件发送的 `retriever` 请求语法和 `semantic` 查询是在之后的 8.x 版本中引入的(分别为 8.14 和 8.15),请以所用集群版本的官方文档为准。
|
||||||
|
|
||||||
## Quark 特定配置
|
## Quark 特定配置
|
||||||
|
|
||||||
@@ -200,8 +205,11 @@ searchFrom:
|
|||||||
servicePort: 80
|
servicePort: 80
|
||||||
index: "knowledge_base"
|
index: "knowledge_base"
|
||||||
contentField: "content"
|
contentField: "content"
|
||||||
|
semanticTextField: "semantic_text"
|
||||||
linkField: "url"
|
linkField: "url"
|
||||||
titleField: "title"
|
titleField: "title"
|
||||||
|
# username: "elastic"
|
||||||
|
# password: "password"
|
||||||
```
|
```
|
||||||
|
|
||||||
### 自定义引用格式
|
### 自定义引用格式
|
||||||
|
|||||||
@@ -73,11 +73,16 @@ It is strongly recommended to enable this feature when using Arxiv or Elasticsea
|
|||||||
## Elasticsearch Specific Configuration
|
## Elasticsearch Specific Configuration
|
||||||
|
|
||||||
| Name | Data Type | Requirement | Default Value | Description |
|
| Name | Data Type | Requirement | Default Value | Description |
|
||||||
|------|-----------|-------------|---------------|-------------|
|
|------|-----------|-------------|---------------|------------------------------------|
|
||||||
| index | string | Required | - | Elasticsearch index name to search |
|
| index | string | Required | - | Elasticsearch index name to search |
|
||||||
| contentField | string | Required | - | Content field name to query |
|
| contentField | string | Required | - | Content field name to query |
|
||||||
|
| semanticTextField | string | Required | - | Name of the `semantic_text` (embedding) field to query |
|
||||||
| linkField | string | Required | - | Result link field name |
|
| linkField | string | Required | - | Result link field name |
|
||||||
| titleField | string | Required | - | Result title field name |
|
| titleField | string | Required | - | Result title field name |
|
||||||
|
| username | string | Optional | - | Elasticsearch username |
|
||||||
|
| password | string | Optional | - | Elasticsearch password |
|
||||||
|
|
||||||
|
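The [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) query used in hybrid search requires Elasticsearch version 8.8 or higher. Note that the `retriever` request syntax and the `semantic` query that this plugin sends were introduced in later 8.x releases (8.14 and 8.15 respectively), so check the documentation for your cluster's version.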
|
||||||
|
|
||||||
## Quark Specific Configuration
|
## Quark Specific Configuration
|
||||||
|
|
||||||
@@ -199,8 +204,11 @@ searchFrom:
|
|||||||
servicePort: 80
|
servicePort: 80
|
||||||
index: "knowledge_base"
|
index: "knowledge_base"
|
||||||
contentField: "content"
|
contentField: "content"
|
||||||
|
semanticTextField: "semantic_text"
|
||||||
linkField: "url"
|
linkField: "url"
|
||||||
titleField: "title"
|
titleField: "title"
|
||||||
|
# username: "elastic"
|
||||||
|
# password: "password"
|
||||||
```
|
```
|
||||||
|
|
||||||
### Custom Reference Format
|
### Custom Reference Format
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package elasticsearch
|
package elasticsearch
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/base64"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
@@ -16,11 +17,14 @@ type ElasticsearchSearch struct {
|
|||||||
client wrapper.HttpClient
|
client wrapper.HttpClient
|
||||||
index string
|
index string
|
||||||
contentField string
|
contentField string
|
||||||
|
semanticTextField string
|
||||||
linkField string
|
linkField string
|
||||||
titleField string
|
titleField string
|
||||||
start int
|
start int
|
||||||
count int
|
count int
|
||||||
timeoutMillisecond uint32
|
timeoutMillisecond uint32
|
||||||
|
username string
|
||||||
|
password string
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error) {
|
func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error) {
|
||||||
@@ -41,10 +45,15 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
|
|||||||
if engine.index == "" {
|
if engine.index == "" {
|
||||||
return nil, errors.New("index not found")
|
return nil, errors.New("index not found")
|
||||||
}
|
}
|
||||||
|
|
||||||
engine.contentField = config.Get("contentField").String()
|
engine.contentField = config.Get("contentField").String()
|
||||||
if engine.contentField == "" {
|
if engine.contentField == "" {
|
||||||
return nil, errors.New("contentField not found")
|
return nil, errors.New("contentField not found")
|
||||||
}
|
}
|
||||||
|
engine.semanticTextField = config.Get("semanticTextField").String()
|
||||||
|
if engine.semanticTextField == "" {
|
||||||
|
return nil, errors.New("semanticTextField not found")
|
||||||
|
}
|
||||||
engine.linkField = config.Get("linkField").String()
|
engine.linkField = config.Get("linkField").String()
|
||||||
if engine.linkField == "" {
|
if engine.linkField == "" {
|
||||||
return nil, errors.New("linkField not found")
|
return nil, errors.New("linkField not found")
|
||||||
@@ -62,36 +71,66 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
|
|||||||
if engine.count == 0 {
|
if engine.count == 0 {
|
||||||
engine.count = 10
|
engine.count = 10
|
||||||
}
|
}
|
||||||
|
|
||||||
|
engine.username = config.Get("username").String()
|
||||||
|
engine.password = config.Get("password").String()
|
||||||
|
|
||||||
return engine, nil
|
return engine, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e ElasticsearchSearch) NeedExectue(ctx engine.SearchContext) bool {
|
func (e ElasticsearchSearch) NeedExectue(ctx engine.SearchContext) bool {
|
||||||
return ctx.EngineType == "private"
|
return ctx.EngineType == "private" || ctx.EngineType == ""
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e ElasticsearchSearch) Client() wrapper.HttpClient {
|
func (e ElasticsearchSearch) Client() wrapper.HttpClient {
|
||||||
return e.client
|
return e.client
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
|
func (e ElasticsearchSearch) generateAuthorizationHeader() string {
|
||||||
searchBody := fmt.Sprintf(`{
|
return fmt.Sprintf(`Basic %s`, base64.StdEncoding.EncodeToString([]byte(e.username+":"+e.password)))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e ElasticsearchSearch) generateQueryBody(ctx engine.SearchContext) string {
|
||||||
|
queryText := strings.Join(ctx.Querys, " ")
|
||||||
|
return fmt.Sprintf(`{
|
||||||
|
"retriever": {
|
||||||
|
"rrf": {
|
||||||
|
"retrievers": [
|
||||||
|
{
|
||||||
|
"standard": {
|
||||||
"query": {
|
"query": {
|
||||||
"match": {
|
"match": {
|
||||||
"%s": {
|
"%s": "%s"
|
||||||
"query": "%s",
|
|
||||||
"operator": "AND"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}`, e.contentField, strings.Join(ctx.Querys, " "))
|
},
|
||||||
|
{
|
||||||
|
"standard": {
|
||||||
|
"query": {
|
||||||
|
"semantic": {
|
||||||
|
"field": "%s",
|
||||||
|
"query": "%s"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}`, e.contentField, queryText, e.semanticTextField, queryText)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
|
||||||
|
queryBody := e.generateQueryBody(ctx)
|
||||||
return engine.CallArgs{
|
return engine.CallArgs{
|
||||||
Method: http.MethodPost,
|
Method: http.MethodPost,
|
||||||
Url: fmt.Sprintf("/%s/_search?from=%d&size=%d", e.index, e.start, e.count),
|
Url: fmt.Sprintf("/%s/_search?from=%d&size=%d", e.index, e.start, e.count),
|
||||||
Headers: [][2]string{
|
Headers: [][2]string{
|
||||||
{"Content-Type", "application/json"},
|
{"Content-Type", "application/json"},
|
||||||
|
{"Authorization", e.generateAuthorizationHeader()},
|
||||||
},
|
},
|
||||||
Body: []byte(searchBody),
|
Body: []byte(queryBody),
|
||||||
TimeoutMillisecond: e.timeoutMillisecond,
|
TimeoutMillisecond: e.timeoutMillisecond,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -172,7 +172,7 @@ func parseConfig(json gjson.Result, config *Config, log wrapper.Log) error {
|
|||||||
case "quark":
|
case "quark":
|
||||||
searchEngine, err := quark.NewQuarkSearch(&e)
|
searchEngine, err := quark.NewQuarkSearch(&e)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("elasticsearch search engine init failed:%s", err)
|
return fmt.Errorf("quark search engine init failed:%s", err)
|
||||||
}
|
}
|
||||||
config.engine = append(config.engine, searchEngine)
|
config.engine = append(config.engine, searchEngine)
|
||||||
internetExists = true
|
internetExists = true
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ none
|
|||||||
3. How: 分析对于要查询的知识和资料,应该提出什么样的问题
|
3. How: 分析对于要查询的知识和资料,应该提出什么样的问题
|
||||||
4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整
|
4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整
|
||||||
4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化
|
4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化
|
||||||
4.2. 向私有知识库提问:将问题拆分成多组关键词的组合,同时组合中的关键词个数尽量不要超过3个
|
4.2. 向私有知识库提问:用一句话概括问题,私有知识库不需要对关键词进行拆分
|
||||||
4.3. 向Arxiv论文资料库提问:
|
4.3. 向Arxiv论文资料库提问:
|
||||||
4.3.1. 明确问题所属领域,然后确定Arxiv的Category值,Category可选的枚举如下:
|
4.3.1. 明确问题所属领域,然后确定Arxiv的Category值,Category可选的枚举如下:
|
||||||
- cs.AI: Artificial Intelligence
|
- cs.AI: Artificial Intelligence
|
||||||
@@ -207,10 +207,6 @@ cs.AI: attention mechanism
|
|||||||
cs.AI: neuron
|
cs.AI: neuron
|
||||||
q-bio.NC: brain,attention mechanism
|
q-bio.NC: brain,attention mechanism
|
||||||
|
|
||||||
#### 向私有知识库查询多次
|
|
||||||
private: 电子钱包,密码
|
|
||||||
private: 张三,身份证号
|
|
||||||
|
|
||||||
#### 向多个查询目标查询多次
|
#### 向多个查询目标查询多次
|
||||||
internet: 中国未来房价趋势
|
internet: 中国未来房价趋势
|
||||||
internet: 最新中国经济政策
|
internet: 最新中国经济政策
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ none
|
|||||||
3. How: 分析对于要查询的知识和资料,应该提出什么样的问题
|
3. How: 分析对于要查询的知识和资料,应该提出什么样的问题
|
||||||
4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整
|
4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整
|
||||||
4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化
|
4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化
|
||||||
4.2. 向私有知识库提问:将问题拆分成多组关键词的组合,同时组合中的关键词个数尽量不要超过3个
|
4.2. 向私有知识库提问:用一句话概括问题,私有知识库不需要对关键词进行拆分
|
||||||
5. Final: 按照下面**回复内容示例**进行回复,注意:
|
5. Final: 按照下面**回复内容示例**进行回复,注意:
|
||||||
- 不要输出思考过程
|
- 不要输出思考过程
|
||||||
- 可以向多个查询目标分别查询多次,多个查询用换行分隔,总查询次数控制在5次以内
|
- 可以向多个查询目标分别查询多次,多个查询用换行分隔,总查询次数控制在5次以内
|
||||||
@@ -42,10 +42,6 @@ none
|
|||||||
internet: 黄金价格走势
|
internet: 黄金价格走势
|
||||||
internet: The trend of gold prices
|
internet: The trend of gold prices
|
||||||
|
|
||||||
#### 向私有知识库查询多次
|
|
||||||
private: 电子钱包,密码
|
|
||||||
private: 张三,身份证号
|
|
||||||
|
|
||||||
#### 向多个查询目标查询多次
|
#### 向多个查询目标查询多次
|
||||||
internet: 中国未来房价趋势
|
internet: 中国未来房价趋势
|
||||||
internet: 最新中国经济政策
|
internet: 最新中国经济政策
|
||||||
|
|||||||
Reference in New Issue
Block a user