feat: support elasticsearch hybrid search (#1844)

This commit is contained in:
Se7en
2025-03-11 11:25:58 +08:00
committed by GitHub
parent 5a5af4ecbf
commit 01cc7939ae
6 changed files with 80 additions and 33 deletions

View File

@@ -73,11 +73,16 @@ description: higress 支持通过集成搜索引擎Google/Bing/Arxiv/Elastics
## Elasticsearch 特定配置 ## Elasticsearch 特定配置
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 | | 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|------|----------|----------|--------|------| |------|----------|----------|--------|-----------------------|
| index | string | 必填 | - | 要搜索的Elasticsearch索引名称 | | index | string | 必填 | - | 要搜索的Elasticsearch索引名称 |
| contentField | string | 必填 | - | 要查询的内容字段名称 | | contentField | string | 必填 | - | 要查询的内容字段名称 |
| semanticTextField | string | 必填 | - | 用于语义检索(semantic 查询)的 embedding 字段名称,例如 semantic_text 类型的字段 |
| linkField | string | 必填 | - | 结果链接字段名称 | | linkField | string | 必填 | - | 结果链接字段名称 |
| titleField | string | 必填 | - | 结果标题字段名称 | | titleField | string | 必填 | - | 结果标题字段名称 |
| username | string | 选填 | - | Elasticsearch 用户名,用于 HTTP Basic 认证 |
| password | string | 选填 | - | Elasticsearch 密码,用于 HTTP Basic 认证 |
混合搜索中使用的 [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) 查询要求 Elasticsearch 的版本在 8.8 及以上。
## Quark 特定配置 ## Quark 特定配置
@@ -200,8 +205,11 @@ searchFrom:
servicePort: 80 servicePort: 80
index: "knowledge_base" index: "knowledge_base"
contentField: "content" contentField: "content"
semanticTextField: "semantic_text"
linkField: "url" linkField: "url"
titleField: "title" titleField: "title"
# username: "elastic"
# password: "password"
``` ```
### 自定义引用格式 ### 自定义引用格式

View File

@@ -73,11 +73,16 @@ It is strongly recommended to enable this feature when using Arxiv or Elasticsea
## Elasticsearch Specific Configuration ## Elasticsearch Specific Configuration
| Name | Data Type | Requirement | Default Value | Description | | Name | Data Type | Requirement | Default Value | Description |
|------|-----------|-------------|---------------|-------------| |------|-----------|-------------|---------------|------------------------------------|
| index | string | Required | - | Elasticsearch index name to search | | index | string | Required | - | Elasticsearch index name to search |
| contentField | string | Required | - | Content field name to query | | contentField | string | Required | - | Content field name to query |
| semanticTextField | string | Required | - | Name of the embedding field targeted by the semantic query (e.g. a `semantic_text` field) |
| linkField | string | Required | - | Result link field name | | linkField | string | Required | - | Result link field name |
| titleField | string | Required | - | Result title field name | | titleField | string | Required | - | Result title field name |
| username | string | Optional | - | Elasticsearch username, used for HTTP Basic authentication |
| password | string | Optional | - | Elasticsearch password, used for HTTP Basic authentication |
The [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) query used in hybrid search requires Elasticsearch version 8.8 or higher.
## Quark Specific Configuration ## Quark Specific Configuration
@@ -199,8 +204,11 @@ searchFrom:
servicePort: 80 servicePort: 80
index: "knowledge_base" index: "knowledge_base"
contentField: "content" contentField: "content"
semanticTextField: "semantic_text"
linkField: "url" linkField: "url"
titleField: "title" titleField: "title"
# username: "elastic"
# password: "password"
``` ```
### Custom Reference Format ### Custom Reference Format

View File

@@ -1,6 +1,7 @@
package elasticsearch package elasticsearch
import ( import (
"encoding/base64"
"errors" "errors"
"fmt" "fmt"
"net/http" "net/http"
@@ -16,11 +17,14 @@ type ElasticsearchSearch struct {
client wrapper.HttpClient client wrapper.HttpClient
index string index string
contentField string contentField string
semanticTextField string
linkField string linkField string
titleField string titleField string
start int start int
count int count int
timeoutMillisecond uint32 timeoutMillisecond uint32
username string
password string
} }
func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error) { func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error) {
@@ -41,10 +45,15 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
if engine.index == "" { if engine.index == "" {
return nil, errors.New("index not found") return nil, errors.New("index not found")
} }
engine.contentField = config.Get("contentField").String() engine.contentField = config.Get("contentField").String()
if engine.contentField == "" { if engine.contentField == "" {
return nil, errors.New("contentField not found") return nil, errors.New("contentField not found")
} }
engine.semanticTextField = config.Get("semanticTextField").String()
if engine.semanticTextField == "" {
return nil, errors.New("semanticTextField not found")
}
engine.linkField = config.Get("linkField").String() engine.linkField = config.Get("linkField").String()
if engine.linkField == "" { if engine.linkField == "" {
return nil, errors.New("linkField not found") return nil, errors.New("linkField not found")
@@ -62,36 +71,66 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
if engine.count == 0 { if engine.count == 0 {
engine.count = 10 engine.count = 10
} }
engine.username = config.Get("username").String()
engine.password = config.Get("password").String()
return engine, nil return engine, nil
} }
func (e ElasticsearchSearch) NeedExectue(ctx engine.SearchContext) bool { func (e ElasticsearchSearch) NeedExectue(ctx engine.SearchContext) bool {
return ctx.EngineType == "private" return ctx.EngineType == "private" || ctx.EngineType == ""
} }
func (e ElasticsearchSearch) Client() wrapper.HttpClient { func (e ElasticsearchSearch) Client() wrapper.HttpClient {
return e.client return e.client
} }
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs { func (e ElasticsearchSearch) generateAuthorizationHeader() string {
searchBody := fmt.Sprintf(`{ return fmt.Sprintf(`Basic %s`, base64.StdEncoding.EncodeToString([]byte(e.username+":"+e.password)))
}
func (e ElasticsearchSearch) generateQueryBody(ctx engine.SearchContext) string {
queryText := strings.Join(ctx.Querys, " ")
return fmt.Sprintf(`{
"retriever": {
"rrf": {
"retrievers": [
{
"standard": {
"query": { "query": {
"match": { "match": {
"%s": { "%s": "%s"
"query": "%s",
"operator": "AND"
} }
} }
} }
}`, e.contentField, strings.Join(ctx.Querys, " ")) },
{
"standard": {
"query": {
"semantic": {
"field": "%s",
"query": "%s"
}
}
}
}
]
}
}
}`, e.contentField, queryText, e.semanticTextField, queryText)
}
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
queryBody := e.generateQueryBody(ctx)
return engine.CallArgs{ return engine.CallArgs{
Method: http.MethodPost, Method: http.MethodPost,
Url: fmt.Sprintf("/%s/_search?from=%d&size=%d", e.index, e.start, e.count), Url: fmt.Sprintf("/%s/_search?from=%d&size=%d", e.index, e.start, e.count),
Headers: [][2]string{ Headers: [][2]string{
{"Content-Type", "application/json"}, {"Content-Type", "application/json"},
{"Authorization", e.generateAuthorizationHeader()},
}, },
Body: []byte(searchBody), Body: []byte(queryBody),
TimeoutMillisecond: e.timeoutMillisecond, TimeoutMillisecond: e.timeoutMillisecond,
} }
} }

View File

@@ -172,7 +172,7 @@ func parseConfig(json gjson.Result, config *Config, log wrapper.Log) error {
case "quark": case "quark":
searchEngine, err := quark.NewQuarkSearch(&e) searchEngine, err := quark.NewQuarkSearch(&e)
if err != nil { if err != nil {
return fmt.Errorf("elasticsearch search engine init failed:%s", err) return fmt.Errorf("quark search engine init failed:%s", err)
} }
config.engine = append(config.engine, searchEngine) config.engine = append(config.engine, searchEngine)
internetExists = true internetExists = true

View File

@@ -25,7 +25,7 @@ none
3. How: 分析对于要查询的知识和资料,应该提出什么样的问题 3. How: 分析对于要查询的知识和资料,应该提出什么样的问题
4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整 4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整
4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化 4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化
4.2. 向私有知识库提问:将问题拆分成多组关键词的组合同时组合中的关键词个数尽量不要超过3个 4.2. 向私有知识库提问:用一句话概括问题,私有知识库不需要对关键词进行拆分
4.3. 向Arxiv论文资料库提问 4.3. 向Arxiv论文资料库提问
4.3.1. 明确问题所属领域然后确定Arxiv的Category值Category可选的枚举如下: 4.3.1. 明确问题所属领域然后确定Arxiv的Category值Category可选的枚举如下:
- cs.AI: Artificial Intelligence - cs.AI: Artificial Intelligence
@@ -207,10 +207,6 @@ cs.AI: attention mechanism
cs.AI: neuron cs.AI: neuron
q-bio.NC: brain,attention mechanism q-bio.NC: brain,attention mechanism
#### 向私有知识库查询多次
private: 电子钱包,密码
private: 张三,身份证号
#### 向多个查询目标查询多次 #### 向多个查询目标查询多次
internet: 中国未来房价趋势 internet: 中国未来房价趋势
internet: 最新中国经济政策 internet: 最新中国经济政策

View File

@@ -25,7 +25,7 @@ none
3. How: 分析对于要查询的知识和资料,应该提出什么样的问题 3. How: 分析对于要查询的知识和资料,应该提出什么样的问题
4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整 4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整
4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化 4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化
4.2. 向私有知识库提问:将问题拆分成多组关键词的组合同时组合中的关键词个数尽量不要超过3个 4.2. 向私有知识库提问:用一句话概括问题,私有知识库不需要对关键词进行拆分
5. Final: 按照下面**回复内容示例**进行回复,注意: 5. Final: 按照下面**回复内容示例**进行回复,注意:
- 不要输出思考过程 - 不要输出思考过程
- 可以向多个查询目标分别查询多次多个查询用换行分隔总查询次数控制在5次以内 - 可以向多个查询目标分别查询多次多个查询用换行分隔总查询次数控制在5次以内
@@ -42,10 +42,6 @@ none
internet: 黄金价格走势 internet: 黄金价格走势
internet: The trend of gold prices internet: The trend of gold prices
#### 向私有知识库查询多次
private: 电子钱包,密码
private: 张三,身份证号
#### 向多个查询目标查询多次 #### 向多个查询目标查询多次
internet: 中国未来房价趋势 internet: 中国未来房价趋势
internet: 最新中国经济政策 internet: 最新中国经济政策