mirror of
https://github.com/alibaba/higress.git
synced 2026-02-27 06:00:51 +08:00
feat: support elasticsearch hybrid search (#1844)
This commit is contained in:
@@ -72,12 +72,17 @@ description: higress 支持通过集成搜索引擎(Google/Bing/Arxiv/Elastics
|
||||
|
||||
## Elasticsearch 特定配置
|
||||
|
||||
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|
||||
|------|----------|----------|--------|------|
|
||||
| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 |
|
||||
|------|----------|----------|--------|-----------------------|
|
||||
| index | string | 必填 | - | 要搜索的Elasticsearch索引名称 |
|
||||
| contentField | string | 必填 | - | 要查询的内容字段名称 |
|
||||
| linkField | string | 必填 | - | 结果链接字段名称 |
|
||||
| titleField | string | 必填 | - | 结果标题字段名称 |
|
||||
| contentField | string | 必填 | - | 要查询的内容字段名称 |
|
||||
| semanticTextField | string | 必填 | - | 用于语义检索的 embedding 字段名称(混合搜索中的 semantic 查询会使用该字段) |
|
||||
| linkField | string | 必填 | - | 结果链接字段名称 |
|
||||
| titleField | string | 必填 | - | 结果标题字段名称 |
|
||||
| username | string | 选填 | - | Elasticsearch 用户名 |
|
||||
| password | string | 选填 | - | Elasticsearch 密码 |
|
||||
|
||||
混合搜索中使用的 [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) 查询要求 Elasticsearch 的版本在 8.8 及以上。
|
||||
|
||||
## Quark 特定配置
|
||||
|
||||
@@ -200,8 +205,11 @@ searchFrom:
|
||||
servicePort: 80
|
||||
index: "knowledge_base"
|
||||
contentField: "content"
|
||||
semanticTextField: "semantic_text"
|
||||
linkField: "url"
|
||||
titleField: "title"
|
||||
# username: "elastic"
|
||||
# password: "password"
|
||||
```
|
||||
|
||||
### 自定义引用格式
|
||||
|
||||
@@ -72,12 +72,17 @@ It is strongly recommended to enable this feature when using Arxiv or Elasticsea
|
||||
|
||||
## Elasticsearch Specific Configuration
|
||||
|
||||
| Name | Data Type | Requirement | Default Value | Description |
|
||||
|------|-----------|-------------|---------------|-------------|
|
||||
| index | string | Required | - | Elasticsearch index name to search |
|
||||
| contentField | string | Required | - | Content field name to query |
|
||||
| linkField | string | Required | - | Result link field name |
|
||||
| titleField | string | Required | - | Result title field name |
|
||||
| Name | Data Type | Requirement | Default Value | Description |
|
||||
|------|-----------|-------------|---------------|------------------------------------|
|
||||
| index | string | Required | - | Elasticsearch index name to search |
|
||||
| contentField | string | Required | - | Content field name to query |
|
||||
| semanticTextField | string | Required | - | Name of the embedding field used by the semantic query in hybrid search |
|
||||
| linkField | string | Required | - | Result link field name |
|
||||
| titleField | string | Required | - | Result title field name |
|
||||
| username | string | Optional | - | Elasticsearch username |
|
||||
| password | string | Optional | - | Elasticsearch password |
|
||||
|
||||
The [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/8.17/rrf.html) query used in hybrid search requires Elasticsearch version 8.8 or higher.
|
||||
|
||||
## Quark Specific Configuration
|
||||
|
||||
@@ -199,8 +204,11 @@ searchFrom:
|
||||
servicePort: 80
|
||||
index: "knowledge_base"
|
||||
contentField: "content"
|
||||
semanticTextField: "semantic_text"
|
||||
linkField: "url"
|
||||
titleField: "title"
|
||||
# username: "elastic"
|
||||
# password: "password"
|
||||
```
|
||||
|
||||
### Custom Reference Format
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package elasticsearch
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
@@ -16,11 +17,14 @@ type ElasticsearchSearch struct {
|
||||
client wrapper.HttpClient
|
||||
index string
|
||||
contentField string
|
||||
semanticTextField string
|
||||
linkField string
|
||||
titleField string
|
||||
start int
|
||||
count int
|
||||
timeoutMillisecond uint32
|
||||
username string
|
||||
password string
|
||||
}
|
||||
|
||||
func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error) {
|
||||
@@ -41,10 +45,15 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
|
||||
if engine.index == "" {
|
||||
return nil, errors.New("index not found")
|
||||
}
|
||||
|
||||
engine.contentField = config.Get("contentField").String()
|
||||
if engine.contentField == "" {
|
||||
return nil, errors.New("contentField not found")
|
||||
}
|
||||
engine.semanticTextField = config.Get("semanticTextField").String()
|
||||
if engine.semanticTextField == "" {
|
||||
return nil, errors.New("semanticTextField not found")
|
||||
}
|
||||
engine.linkField = config.Get("linkField").String()
|
||||
if engine.linkField == "" {
|
||||
return nil, errors.New("linkField not found")
|
||||
@@ -62,36 +71,66 @@ func NewElasticsearchSearch(config *gjson.Result) (*ElasticsearchSearch, error)
|
||||
if engine.count == 0 {
|
||||
engine.count = 10
|
||||
}
|
||||
|
||||
engine.username = config.Get("username").String()
|
||||
engine.password = config.Get("password").String()
|
||||
|
||||
return engine, nil
|
||||
}
|
||||
|
||||
func (e ElasticsearchSearch) NeedExectue(ctx engine.SearchContext) bool {
|
||||
return ctx.EngineType == "private"
|
||||
return ctx.EngineType == "private" || ctx.EngineType == ""
|
||||
}
|
||||
|
||||
func (e ElasticsearchSearch) Client() wrapper.HttpClient {
|
||||
return e.client
|
||||
}
|
||||
|
||||
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
|
||||
searchBody := fmt.Sprintf(`{
|
||||
"query": {
|
||||
"match": {
|
||||
"%s": {
|
||||
"query": "%s",
|
||||
"operator": "AND"
|
||||
}
|
||||
func (e ElasticsearchSearch) generateAuthorizationHeader() string {
|
||||
return fmt.Sprintf(`Basic %s`, base64.StdEncoding.EncodeToString([]byte(e.username+":"+e.password)))
|
||||
}
|
||||
|
||||
func (e ElasticsearchSearch) generateQueryBody(ctx engine.SearchContext) string {
|
||||
queryText := strings.Join(ctx.Querys, " ")
|
||||
return fmt.Sprintf(`{
|
||||
"retriever": {
|
||||
"rrf": {
|
||||
"retrievers": [
|
||||
{
|
||||
"standard": {
|
||||
"query": {
|
||||
"match": {
|
||||
"%s": "%s"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"standard": {
|
||||
"query": {
|
||||
"semantic": {
|
||||
"field": "%s",
|
||||
"query": "%s"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}`, e.contentField, strings.Join(ctx.Querys, " "))
|
||||
}`, e.contentField, queryText, e.semanticTextField, queryText)
|
||||
}
|
||||
|
||||
func (e ElasticsearchSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
|
||||
queryBody := e.generateQueryBody(ctx)
|
||||
return engine.CallArgs{
|
||||
Method: http.MethodPost,
|
||||
Url: fmt.Sprintf("/%s/_search?from=%d&size=%d", e.index, e.start, e.count),
|
||||
Headers: [][2]string{
|
||||
{"Content-Type", "application/json"},
|
||||
{"Authorization", e.generateAuthorizationHeader()},
|
||||
},
|
||||
Body: []byte(searchBody),
|
||||
Body: []byte(queryBody),
|
||||
TimeoutMillisecond: e.timeoutMillisecond,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,7 +172,7 @@ func parseConfig(json gjson.Result, config *Config, log wrapper.Log) error {
|
||||
case "quark":
|
||||
searchEngine, err := quark.NewQuarkSearch(&e)
|
||||
if err != nil {
|
||||
return fmt.Errorf("elasticsearch search engine init failed:%s", err)
|
||||
return fmt.Errorf("quark search engine init failed:%s", err)
|
||||
}
|
||||
config.engine = append(config.engine, searchEngine)
|
||||
internetExists = true
|
||||
|
||||
@@ -25,7 +25,7 @@ none
|
||||
3. How: 分析对于要查询的知识和资料,应该提出什么样的问题
|
||||
4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整
|
||||
4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化
|
||||
4.2. 向私有知识库提问:将问题拆分成多组关键词的组合,同时组合中的关键词个数尽量不要超过3个
|
||||
4.2. 向私有知识库提问:用一句话概括问题,私有知识库不需要对关键词进行拆分
|
||||
4.3. 向Arxiv论文资料库提问:
|
||||
4.3.1. 明确问题所属领域,然后确定Arxiv的Category值,Category可选的枚举如下:
|
||||
- cs.AI: Artificial Intelligence
|
||||
@@ -207,10 +207,6 @@ cs.AI: attention mechanism
|
||||
cs.AI: neuron
|
||||
q-bio.NC: brain,attention mechanism
|
||||
|
||||
#### 向私有知识库查询多次
|
||||
private: 电子钱包,密码
|
||||
private: 张三,身份证号
|
||||
|
||||
#### 向多个查询目标查询多次
|
||||
internet: 中国未来房价趋势
|
||||
internet: 最新中国经济政策
|
||||
|
||||
@@ -25,7 +25,7 @@ none
|
||||
3. How: 分析对于要查询的知识和资料,应该提出什么样的问题
|
||||
4. Adjust: 明确要向什么地方查询什么问题后,按下面方式对问题进行调整
|
||||
4.1. 向搜索引擎提问:用一句话概括问题,并且针对搜索引擎做问题优化
|
||||
4.2. 向私有知识库提问:将问题拆分成多组关键词的组合,同时组合中的关键词个数尽量不要超过3个
|
||||
4.2. 向私有知识库提问:用一句话概括问题,私有知识库不需要对关键词进行拆分
|
||||
5. Final: 按照下面**回复内容示例**进行回复,注意:
|
||||
- 不要输出思考过程
|
||||
- 可以向多个查询目标分别查询多次,多个查询用换行分隔,总查询次数控制在5次以内
|
||||
@@ -42,10 +42,6 @@ none
|
||||
internet: 黄金价格走势
|
||||
internet: The trend of gold prices
|
||||
|
||||
#### 向私有知识库查询多次
|
||||
private: 电子钱包,密码
|
||||
private: 张三,身份证号
|
||||
|
||||
#### 向多个查询目标查询多次
|
||||
internet: 中国未来房价趋势
|
||||
internet: 最新中国经济政策
|
||||
|
||||
Reference in New Issue
Block a user