Files
higress/plugins/wasm-go/extensions/ai-search/engine/arxiv/arxiv.go
2025-02-24 11:14:47 +08:00

135 lines
3.8 KiB
Go

package arxiv
import (
"bytes"
"errors"
"fmt"
"net/http"
"net/url"
"strings"
"github.com/alibaba/higress/plugins/wasm-go/pkg/wrapper"
"github.com/antchfx/xmlquery"
"github.com/tidwall/gjson"
"github.com/alibaba/higress/plugins/wasm-go/extensions/ai-search/engine"
)
type ArxivSearch struct {
optionArgs map[string]string
start int
count int
timeoutMillisecond uint32
client wrapper.HttpClient
arxivCategory string
}
func NewArxivSearch(config *gjson.Result) (*ArxivSearch, error) {
engine := &ArxivSearch{}
serviceName := config.Get("serviceName").String()
if serviceName == "" {
return nil, errors.New("serviceName not found")
}
servicePort := config.Get("servicePort").Int()
if servicePort == 0 {
return nil, errors.New("servicePort not found")
}
engine.client = wrapper.NewClusterClient(wrapper.FQDNCluster{
FQDN: serviceName,
Port: servicePort,
})
engine.start = int(config.Get("start").Uint())
engine.count = int(config.Get("count").Uint())
if engine.count == 0 {
engine.count = 10
}
engine.timeoutMillisecond = uint32(config.Get("timeoutMillisecond").Uint())
if engine.timeoutMillisecond == 0 {
engine.timeoutMillisecond = 5000
}
engine.optionArgs = map[string]string{}
for key, value := range config.Get("optionArgs").Map() {
valStr := value.String()
if valStr != "" {
engine.optionArgs[key] = value.String()
}
}
engine.arxivCategory = config.Get("arxivCategory").String()
return engine, nil
}
func (a ArxivSearch) NeedExectue(ctx engine.SearchContext) bool {
return ctx.EngineType == "arxiv"
}
func (a ArxivSearch) Client() wrapper.HttpClient {
return a.client
}
func (a ArxivSearch) CallArgs(ctx engine.SearchContext) engine.CallArgs {
var searchQueryItems []string
for _, q := range ctx.Querys {
searchQueryItems = append(searchQueryItems, fmt.Sprintf("all:%s", url.QueryEscape(q)))
}
searchQuery := strings.Join(searchQueryItems, "+AND+")
category := ctx.ArxivCategory
if category == "" {
category = a.arxivCategory
}
if category != "" {
searchQuery = fmt.Sprintf("%s+AND+cat:%s", searchQuery, category)
}
queryUrl := fmt.Sprintf("https://export.arxiv.org/api/query?search_query=%s&max_results=%d&start=%d",
searchQuery, a.count, a.start)
var extraArgs []string
for key, value := range a.optionArgs {
extraArgs = append(extraArgs, fmt.Sprintf("%s=%s", key, url.QueryEscape(value)))
}
if len(extraArgs) > 0 {
queryUrl = fmt.Sprintf("%s&%s", queryUrl, strings.Join(extraArgs, "&"))
}
return engine.CallArgs{
Method: http.MethodGet,
Url: queryUrl,
Headers: [][2]string{{"Accept", "application/atom+xml"}},
TimeoutMillisecond: a.timeoutMillisecond,
}
}
func (a ArxivSearch) ParseResult(ctx engine.SearchContext, response []byte) []engine.SearchResult {
var results []engine.SearchResult
doc, err := xmlquery.Parse(bytes.NewReader(response))
if err != nil {
return results
}
entries := xmlquery.Find(doc, "//entry")
for _, entry := range entries {
title := entry.SelectElement("title").InnerText()
link := ""
for _, l := range entry.SelectElements("link") {
if l.SelectAttr("rel") == "alternate" && l.SelectAttr("type") == "text/html" {
link = l.SelectAttr("href")
break
}
}
summary := entry.SelectElement("summary").InnerText()
publishTime := entry.SelectElement("published").InnerText()
authors := entry.SelectElements("author")
var authorNames []string
for _, author := range authors {
authorNames = append(authorNames, author.SelectElement("name").InnerText())
}
content := fmt.Sprintf("%s\nAuthors: %s\nPublication time: %s", summary, strings.Join(authorNames, ", "), publishTime)
result := engine.SearchResult{
Title: title,
Link: link,
Content: content,
}
if result.Valid() {
results = append(results, result)
}
}
return results
}