From fae222806b2316ca0393d51033343775f39bfae2 Mon Sep 17 00:00:00 2001 From: onlypiglet <46953658+OnlyPiglet@users.noreply.github.com> Date: Fri, 2 Feb 2024 16:57:41 +0800 Subject: [PATCH] Implement the Go Wasm plugin: bot-detect (#747) --- .../wasm-go/extensions/bot-detect/README.md | 58 ++++++++ .../extensions/bot-detect/README_EN.md | 58 ++++++++ plugins/wasm-go/extensions/bot-detect/VERSION | 1 + .../extensions/bot-detect/botdetect.yaml | 30 ++++ .../bot-detect/config/bot_detect_config.go | 68 +++++++++ .../config/bot_detect_config_test.go | 138 ++++++++++++++++++ plugins/wasm-go/extensions/bot-detect/go.mod | 23 +++ plugins/wasm-go/extensions/bot-detect/go.sum | 36 +++++ plugins/wasm-go/extensions/bot-detect/main.go | 104 +++++++++++++ .../conformance/tests/go-wasm-bot-detect.go | 118 +++++++++++++++ .../conformance/tests/go-wasm-bot-detect.yaml | 53 +++++++ 11 files changed, 687 insertions(+) create mode 100644 plugins/wasm-go/extensions/bot-detect/README.md create mode 100644 plugins/wasm-go/extensions/bot-detect/README_EN.md create mode 100644 plugins/wasm-go/extensions/bot-detect/VERSION create mode 100644 plugins/wasm-go/extensions/bot-detect/botdetect.yaml create mode 100644 plugins/wasm-go/extensions/bot-detect/config/bot_detect_config.go create mode 100644 plugins/wasm-go/extensions/bot-detect/config/bot_detect_config_test.go create mode 100644 plugins/wasm-go/extensions/bot-detect/go.mod create mode 100644 plugins/wasm-go/extensions/bot-detect/go.sum create mode 100644 plugins/wasm-go/extensions/bot-detect/main.go create mode 100644 test/e2e/conformance/tests/go-wasm-bot-detect.go create mode 100644 test/e2e/conformance/tests/go-wasm-bot-detect.yaml diff --git a/plugins/wasm-go/extensions/bot-detect/README.md b/plugins/wasm-go/extensions/bot-detect/README.md new file mode 100644 index 000000000..b56fba8e8 --- /dev/null +++ b/plugins/wasm-go/extensions/bot-detect/README.md @@ -0,0 +1,58 @@ +

+ English | 中文 +

+ +# 功能说明 +`bot-detect`插件可以用于识别并阻止互联网爬虫对站点资源的爬取 + +# 配置字段 + +| 名称 | 数据类型 | 填写要求 | 默认值 | 描述 | +| -------- | -------- | -------- | -------- | -------- | +| allow | array of string | 选填 | - | 配置匹配 User-Agent 请求头的正则表达式,匹配命中时将允许其访问 | +| deny | array of string | 选填 | - | 配置匹配 User-Agent 请求头的正则表达式,匹配命中时将屏蔽请求 | +| blocked_code | number | 选填 | 403 | 配置请求被屏蔽时返回的 HTTP 状态码 | +| blocked_message | string | 选填 | Invalid User-Agent | 配置请求被屏蔽时返回的 HTTP 应答 Body | + +若 `allow` 和 `deny` 字段均未配置,则执行默认的爬虫判断逻辑;通过配置 `allow` 字段可以将原本命中默认爬虫判断逻辑的请求放行,通过配置 `deny` 字段可以增加额外的爬虫判断逻辑。 + +默认的爬虫判断正则表达式集合如下: + +```bash +# Bots General matcher 'name/0.0' + (?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(?:\.(\d+)(?:\.(\d+)|)|) +# Bots General matcher 'name 0.0' + (?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50})) (\d+)(?:\.(\d+)(?:\.(\d+)|)|) +# Bots containing spider|scrape|bot(but not CUBOT)|Crawl + ((?:[A-z0-9]{1,50}|[A-z\-]{1,50} ?|)(?: the |)(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]{0,50})(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|) +# Bots Pattern '/name-0.0' + /((?:Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|Tailsweep)[ \-](\d+)(?:\.(\d+)(?:\.(\d+))?)? 
+# Bots Pattern 'name/0.0' + \b(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goodzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|OgScrper|PathDefender|Peew|PostPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sproose|wminer)/(\d+)(?:\.(\d+)|)(?:\.(\d+)|) +# More bots + (CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Job Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends Crawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardReader Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGator[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot 
Bot|PlantyNet_WebRobot|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy|Speedy Spider|Squrl Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tiny Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+)|)|)|) +``` + +# 配置示例 + +## 放行原本命中爬虫规则的请求 +```yaml +allow: +- ".*Go-http-client.*" +``` + +若不作该配置,默认的 Golang 网络库请求会被视作爬虫,被禁止访问。 + + +## 增加爬虫判断 +```yaml +deny: +- "spd-tools.*" +``` + +根据该配置,下列请求将被禁止访问: + +```bash +curl http://example.com -H 'User-Agent: spd-tools/1.1' +curl http://example.com -H 'User-Agent: spd-tools' +``` \ No newline at end of file diff --git a/plugins/wasm-go/extensions/bot-detect/README_EN.md b/plugins/wasm-go/extensions/bot-detect/README_EN.md new file mode 100644 index 000000000..44eab957a --- /dev/null +++ b/plugins/wasm-go/extensions/bot-detect/README_EN.md @@ -0,0 +1,58 @@ +

+ English | 中文 +

+ +# Description +The `bot-detect` plugin can be used to identify web crawlers and prevent them from crawling a site's resources. + +# Configuration Fields + +| Name | Type | Requirement | Default Value | Description | +| -------- | -------- | -------- | -------- | -------- | +| allow | array of string | Optional | - | Regular expressions matched against the User-Agent request header; a request that matches is allowed | +| deny | array of string | Optional | - | Regular expressions matched against the User-Agent request header; a request that matches is blocked | +| blocked_code | number | Optional | 403 | The HTTP status code returned when a request is blocked | +| blocked_message | string | Optional | Invalid User-Agent | The HTTP response body returned when a request is blocked | + +If neither the `allow` field nor the `deny` field is configured, only the default crawler detection logic is applied. Configuring the `allow` field lets through requests that would otherwise be caught by the default logic, and configuring the `deny` field adds extra crawler detection rules. + +The default set of crawler detection regular expressions is as follows: + +```bash +# Bots General matcher 'name/0.0' + (?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(?:\.(\d+)(?:\.(\d+)|)|) +# Bots General matcher 'name 0.0' + (?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50})) (\d+)(?:\.(\d+)(?:\.(\d+)|)|) +# Bots containing spider|scrape|bot(but not CUBOT)|Crawl + ((?:[A-z0-9]{1,50}|[A-z\-]{1,50} ?|)(?: the |)(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]{0,50})(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|) +# Bots Pattern '/name-0.0' + /((?:Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|Tailsweep)[ \-](\d+)(?:\.(\d+)(?:\.(\d+))?)? 
+# Bots Pattern 'name/0.0' + \b(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goodzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|OgScrper|PathDefender|Peew|PostPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sproose|wminer)/(\d+)(?:\.(\d+)|)(?:\.(\d+)|) +# More bots + (CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Job Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends Crawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardReader Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGator[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot 
Bot|PlantyNet_WebRobot|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy|Speedy Spider|Squrl Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tiny Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+)|)|)|) +``` + +# Configuration Samples + +## Release Requests that would otherwise Hit the Crawler Rules +```yaml +allow: +- ".*Go-http-client.*" +``` + +Without this configuration, the default Golang web library request will be treated as a crawler and access will be denied. 
+ + +## Add Crawler Judgement +```yaml +deny: +- "spd-tools.*" +``` + +According to this configuration, the following requests will be denied: + +```bash +curl http://example.com -H 'User-Agent: spd-tools/1.1' +curl http://example.com -H 'User-Agent: spd-tools' +``` \ No newline at end of file diff --git a/plugins/wasm-go/extensions/bot-detect/VERSION b/plugins/wasm-go/extensions/bot-detect/VERSION new file mode 100644 index 000000000..afaf360d3 --- /dev/null +++ b/plugins/wasm-go/extensions/bot-detect/VERSION @@ -0,0 +1 @@ +1.0.0 \ No newline at end of file diff --git a/plugins/wasm-go/extensions/bot-detect/botdetect.yaml b/plugins/wasm-go/extensions/bot-detect/botdetect.yaml new file mode 100644 index 000000000..c5c032f0a --- /dev/null +++ b/plugins/wasm-go/extensions/bot-detect/botdetect.yaml @@ -0,0 +1,30 @@ +apiVersion: extensions.higress.io/v1alpha1 +kind: WasmPlugin +metadata: + annotations: + higress.io/wasm-plugin-description: 用于识别并阻止互联网爬虫对站点资源的爬取 + higress.io/wasm-plugin-title: Bot Detect + creationTimestamp: '2024-01-03T10:34:36Z' + generation: 2 + labels: + higress.io/resource-definer: higress + higress.io/wasm-plugin-built-in: 'true' + higress.io/wasm-plugin-category: custom + higress.io/wasm-plugin-name: bot-detect + higress.io/wasm-plugin-version: 1.0.0 + name: bot-detect + namespace: higress-system +spec: + defaultConfigDisable: true + matchRules: + - config: + blocked_code: 401 + blocked_message: a bot + deny: + - Chrome + configDisable: false + ingress: + - test + phase: AUTHN + priority: 310 + url: oci://higress-registry.cn-hangzhou.cr.aliyuncs.com/20240103/bot-detect:1.0.0 \ No newline at end of file diff --git a/plugins/wasm-go/extensions/bot-detect/config/bot_detect_config.go b/plugins/wasm-go/extensions/bot-detect/config/bot_detect_config.go new file mode 100644 index 000000000..275512b1f --- /dev/null +++ b/plugins/wasm-go/extensions/bot-detect/config/bot_detect_config.go @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022 Alibaba Group Holding Ltd. 
// DefaultBotRegex is the built-in set of User-Agent patterns that identify a
// crawler. Process consults it, in order, after the configured allow and deny
// rules.
var DefaultBotRegex = []*regexp.Regexp{
	regexp.MustCompile(`(\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}([Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(\.(\d+)(\.(\d+)|)|)`),
	regexp.MustCompile(`((\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}([Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50})) (\d+)(\.(\d+)(\.(\d+)|)|))`),
	regexp.MustCompile(`((([A-z0-9]{1,50}|[A-z\-]{1,} ?|)( the |)(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]{0,50})(([ /]| v)(\d+)(\.(\d+)|)(\.(\d+)|)|))`),
	regexp.MustCompile(`((Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|Tailsweep)[ \-](\d+)(\.(\d+)(\.(\d+))?)?`),
	regexp.MustCompile(`\b(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goodzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|OgScrper|PathDefender|Peew|PostPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sproose|wminer)/(\d+)(?:\.(\d+)|)(?:\.(\d+)|)`),
	regexp.MustCompile(`((CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Job Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends Crawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardReader Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGator[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobot|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy|Speedy Spider|Squrl Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tiny Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+)|)|)|))`),
}

// BotDetectConfig holds the plugin settings: the status code and body used
// for a blocked request, plus the user-supplied allow and deny patterns.
type BotDetectConfig struct {
	BlockedCode    uint32           `json:"blocked_code"`
	BlockedMessage string           `json:"blocked_message"`
	Allow          []*regexp.Regexp `json:"allow"`
	Deny           []*regexp.Regexp `json:"deny"`
}

// FillDefaultValue substitutes 403 for a zero BlockedCode and
// "Invalid User-Agent" for an empty BlockedMessage.
func (c *BotDetectConfig) FillDefaultValue() {
	if c.BlockedCode == 0 {
		c.BlockedCode = 403
	}
	if c.BlockedMessage == "" {
		c.BlockedMessage = "Invalid User-Agent"
	}
}

// Process decides whether a request carrying the User-Agent ua may pass.
//
// It returns (true, "") when the request is allowed. When the request is
// rejected it returns false together with the reason: a fixed message for an
// empty ua, otherwise the source text of the pattern that matched.
// Precedence is: Allow rules, then Deny rules, then DefaultBotRegex.
func (c *BotDetectConfig) Process(ua string) (bool, string) {
	if len(ua) == 0 {
		return false, "can not be empty"
	}
	if firstMatchingRule(c.Allow, ua) != nil {
		return true, ""
	}
	// User-defined deny rules are consulted before the built-in ones.
	for _, blockers := range [][]*regexp.Regexp{c.Deny, DefaultBotRegex} {
		if hit := firstMatchingRule(blockers, ua); hit != nil {
			return false, hit.String()
		}
	}
	return true, ""
}

// firstMatchingRule returns the first pattern in rules that matches ua, or
// nil when none does.
func firstMatchingRule(rules []*regexp.Regexp, ua string) *regexp.Regexp {
	for _, rule := range rules {
		if rule.MatchString(ua) {
			return rule
		}
	}
	return nil
}
+ */ + +package config + +import ( + "github.com/stretchr/testify/assert" + regexp "github.com/wasilibs/go-re2" + "log" + "testing" +) + +func toRegexMatch(regexs []string) []*regexp.Regexp { + re := make([]*regexp.Regexp, 0) + for _, regex := range regexs { + c, err := regexp.Compile(regex) + if err != nil { + log.Default().Fatal(err.Error()) + } + re = append(re, c) + } + return re +} + +func TestBotDetectConfig_ProcessTest(t *testing.T) { + + tests := []struct { + name string + ua string + allow []string + deny []string + blockCode uint32 + blockMessage string + want bool + }{ + { + "test empty bot detect", + "", + []string{}, + []string{}, + 401, + "bot has been blocked", + false, + }, + { + "test default bot detect", + "Ant-Tailsweep-1", + []string{}, + []string{}, + 401, + "bot has been blocked", + false, + }, + { + "test default bot detect", + "indexer/1.2", + []string{}, + []string{}, + 401, + "bot has been blocked", + false, + }, + { + "test default bot detect", + "indexer/1.1.0", + []string{}, + []string{}, + 401, + "bot has been blocked", + false, + }, + { + "test default bot detect", + "YottaaMonitor", + []string{}, + []string{}, + 401, + "bot has been blocked", + false, + }, + { + "test allow bot detect", + "BaiduMobaider", + []string{"BaiduMobaider"}, + []string{}, + 401, + "bot has been blocked", + true, + }, + { + "test deny bot detect", + "Chrome", + []string{}, + []string{"Chrome"}, + 401, + "bot has been blocked", + false, + }, + { + "test allow and deny bot detect", + "SameBotDetect", + []string{"SameBotDetect"}, + []string{"SameBotDetect"}, + 401, + "bot has been blocked", + true, + }, + } + + for _, test := range tests { + + t.Run(test.name, func(t *testing.T) { + bdc := BotDetectConfig{ + BlockedCode: test.blockCode, + BlockedMessage: test.blockMessage, + Allow: toRegexMatch(test.allow), + Deny: toRegexMatch(test.deny), + } + actual, _ := bdc.Process(test.ua) + assert.Equal(t, test.want, actual, "") + }) + + } + +} diff --git 
a/plugins/wasm-go/extensions/bot-detect/go.mod b/plugins/wasm-go/extensions/bot-detect/go.mod new file mode 100644 index 000000000..b83dc5be5 --- /dev/null +++ b/plugins/wasm-go/extensions/bot-detect/go.mod @@ -0,0 +1,23 @@ +module bot-detect + +go 1.19 + +require ( + github.com/alibaba/higress/plugins/wasm-go v1.3.2 + github.com/stretchr/testify v1.8.0 + github.com/tetratelabs/proxy-wasm-go-sdk v0.19.1-0.20220822060051-f9d179a57f8c + github.com/tidwall/gjson v1.14.3 + github.com/wasilibs/go-re2 v1.4.1 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/google/uuid v1.3.0 // indirect + github.com/higress-group/nottinygc v0.0.0-20231101025119-e93c4c2f8520 // indirect + github.com/magefile/mage v1.15.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/tetratelabs/wazero v1.6.0 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/plugins/wasm-go/extensions/bot-detect/go.sum b/plugins/wasm-go/extensions/bot-detect/go.sum new file mode 100644 index 000000000..d6701c2ba --- /dev/null +++ b/plugins/wasm-go/extensions/bot-detect/go.sum @@ -0,0 +1,36 @@ +github.com/alibaba/higress/plugins/wasm-go v1.3.2 h1:OKFo9zK7PFxvtSq9TmT8TwI6xqmNq5LZXfDBqPLOgkw= +github.com/alibaba/higress/plugins/wasm-go v1.3.2/go.mod h1:WZ/68vwe8qWhusa6C4/gMwUqas0jvHWSOa1bp8iK8F4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/higress-group/nottinygc v0.0.0-20231101025119-e93c4c2f8520 h1:IHDghbGQ2DTIXHBHxWfqCYQW1fKjyJ/I7W1pMyUDeEA= 
+github.com/higress-group/nottinygc v0.0.0-20231101025119-e93c4c2f8520/go.mod h1:Nz8ORLaFiLWotg6GeKlJMhv8cci8mM43uEnLA5t8iew= +github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg= +github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/tetratelabs/proxy-wasm-go-sdk v0.19.1-0.20220822060051-f9d179a57f8c h1:OCUFXVGixHLfNjg6/QYEhv+jHJ5mRGhpEUVFv9eWPJE= +github.com/tetratelabs/proxy-wasm-go-sdk v0.19.1-0.20220822060051-f9d179a57f8c/go.mod h1:5t/pWFNJ9eMyu/K/Z+OeGhDJ9sN9eCo8fc2pyM/Qjg4= +github.com/tetratelabs/wazero v1.6.0 h1:z0H1iikCdP8t+q341xqepY4EWvHEw8Es7tlqiVzlP3g= +github.com/tetratelabs/wazero v1.6.0/go.mod h1:0U0G41+ochRKoPKCJlh0jMg1CHkyfK8kDqiirMmKY8A= +github.com/tidwall/gjson v1.14.3 h1:9jvXn7olKEHU1S9vwoMGliaT8jq1vJ7IH/n9zD9Dnlw= +github.com/tidwall/gjson v1.14.3/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/wasilibs/go-re2 v1.4.1 h1:E5+9O1M8UoGeqLB2A9omeoaWImqpuYDs9cKwvTJq/Oo= +github.com/wasilibs/go-re2 v1.4.1/go.mod 
h1:ynB8eCwd9JsqUnsk8WlPDk6cEeme8BguZmnqOSURE4Y= +github.com/wasilibs/nottinygc v0.4.0 h1:h1TJMihMC4neN6Zq+WKpLxgd9xCFMw7O9ETLwY2exJQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/plugins/wasm-go/extensions/bot-detect/main.go b/plugins/wasm-go/extensions/bot-detect/main.go new file mode 100644 index 000000000..59dbcbe05 --- /dev/null +++ b/plugins/wasm-go/extensions/bot-detect/main.go @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2022 Alibaba Group Holding Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package main + +import ( + "bot-detect/config" + "github.com/alibaba/higress/plugins/wasm-go/pkg/wrapper" + "github.com/tetratelabs/proxy-wasm-go-sdk/proxywasm" + "github.com/tetratelabs/proxy-wasm-go-sdk/proxywasm/types" + "github.com/tidwall/gjson" + regexp "github.com/wasilibs/go-re2" +) + +func main() { + wrapper.SetCtx( + "bot-detect", + wrapper.ParseConfigBy(parseConfig), + wrapper.ProcessRequestHeadersBy(onHttpRequestHeaders), + ) +} + +func parseConfig(json gjson.Result, botDetectConfig *config.BotDetectConfig, log wrapper.Log) error { + log.Debug("parseConfig()") + + if json.Get("blocked_code").Exists() { + botDetectConfig.BlockedCode = uint32(int(json.Get("blocked_code").Int())) + } + + if json.Get("blocked_message").Exists() { + botDetectConfig.BlockedMessage = json.Get("blocked_message").String() + } + + allowRules := make([]gjson.Result, 0) + denyRules := make([]gjson.Result, 0) + + allowRulesValue := json.Get("allow") + if allowRulesValue.Exists() && allowRulesValue.IsArray() { + allowRules = json.Get("allow").Array() + } + + denyRulesValue := json.Get("deny") + if denyRulesValue.Exists() && denyRulesValue.IsArray() { + denyRules = json.Get("deny").Array() + } + + for _, allowRule := range allowRules { + c, err := regexp.Compile(allowRule.String()) + if err != nil { + return err + } + botDetectConfig.Allow = append(botDetectConfig.Allow, c) + } + + for _, denyRule := range denyRules { + c, err := regexp.Compile(denyRule.String()) + if err != nil { + return err + } + botDetectConfig.Deny = append(botDetectConfig.Deny, c) + } + + // Fill default values + botDetectConfig.FillDefaultValue() + log.Debugf("botDetectConfig:%+v", botDetectConfig) + return nil + +} + +func onHttpRequestHeaders(ctx wrapper.HttpContext, botDetectConfig config.BotDetectConfig, log wrapper.Log) types.Action { + log.Debug("onHttpRequestHeaders()") + //// Get user-agent header + ua, err := proxywasm.GetHttpRequestHeader("user-agent") + if err != nil { + log.Warnf("failed to 
get user-agent: %v", err) + return types.ActionPause + } + host := ctx.Host() + scheme := ctx.Scheme() + path := ctx.Path() + method := ctx.Method() + + if ok, rule := botDetectConfig.Process(ua); !ok { + proxywasm.SendHttpResponse(botDetectConfig.BlockedCode, nil, []byte(botDetectConfig.BlockedMessage), -1) + log.Debugf("scheme:%s, host:%s, method:%s, path:%s user-agent:%s has been blocked by rule:%s", scheme, host, method, path, ua, rule) + return types.ActionPause + } + + log.Debugf("scheme:%s, host:%s, method:%s, path:%s user-agent:%s has been passed", scheme, host, method, path, ua) + return types.ActionContinue +} diff --git a/test/e2e/conformance/tests/go-wasm-bot-detect.go b/test/e2e/conformance/tests/go-wasm-bot-detect.go new file mode 100644 index 000000000..4475ecbd9 --- /dev/null +++ b/test/e2e/conformance/tests/go-wasm-bot-detect.go @@ -0,0 +1,118 @@ +// Copyright (c) 2022 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tests + +import ( + "testing" + + "github.com/alibaba/higress/test/e2e/conformance/utils/http" + "github.com/alibaba/higress/test/e2e/conformance/utils/suite" +) + +func init() { + Register(WasmPluginsBotDetect) +} + +var WasmPluginsBotDetect = suite.ConformanceTest{ + ShortName: "WasmPluginsBotDetect", + Description: "The Ingress in the higress-conformance-infra namespace test the bot detect WASM plugin.", + Manifests: []string{"tests/go-wasm-bot-detect.yaml"}, + Features: []suite.SupportedFeature{suite.WASMGoConformanceFeature}, + Test: func(t *testing.T, suite *suite.ConformanceTestSuite) { + testcases := []http.Assertion{ + { + Meta: http.AssertionMeta{ + TestCaseName: "case 1: Test Default Deny", + TargetBackend: "infra-backend-v1", + TargetNamespace: "higress-conformance-infra", + }, + Request: http.AssertionRequest{ + ActualRequest: http.Request{ + Host: "foo.com", + Path: "/foo", + Headers: map[string]string{"User-Agent": "BaiduMobaider/1.1.0"}, + }, + }, + Response: http.AssertionResponse{ + ExpectedResponse: http.Response{ + StatusCode: 401, + }, + }, + }, + { + Meta: http.AssertionMeta{ + TestCaseName: "case 2: Test Default Allow", + TargetBackend: "infra-backend-v1", + TargetNamespace: "higress-conformance-infra", + }, + Request: http.AssertionRequest{ + ActualRequest: http.Request{ + Host: "foo.com", + Path: "/foo", + Headers: map[string]string{"User-Agent": "Mozilla/5.0"}, + }, + }, + Response: http.AssertionResponse{ + ExpectedResponse: http.Response{ + StatusCode: 200, + }, + }, + }, + { + Meta: http.AssertionMeta{ + TestCaseName: "case 3: Test Rule Allow", + TargetBackend: "infra-backend-v1", + TargetNamespace: "higress-conformance-infra", + }, + Request: http.AssertionRequest{ + ActualRequest: http.Request{ + Host: "foo.com", + Path: "/foo", + Headers: map[string]string{"User-Agent": "Tailsweep"}, + }, + }, + Response: http.AssertionResponse{ + ExpectedResponse: http.Response{ + StatusCode: 200, + }, + }, + }, + { + Meta: 
http.AssertionMeta{ + TestCaseName: "case 4: Test Rule Deny", + TargetBackend: "infra-backend-v1", + TargetNamespace: "higress-conformance-infra", + }, + Request: http.AssertionRequest{ + ActualRequest: http.Request{ + Host: "foo.com", + Path: "/foo", + Headers: map[string]string{"User-Agent": "Go-Client"}, + }, + }, + Response: http.AssertionResponse{ + ExpectedResponse: http.Response{ + StatusCode: 401, + }, + }, + }, + } + t.Run("WasmPlugins bot detect", func(t *testing.T) { + for _, testcase := range testcases { + http.MakeRequestAndExpectEventuallyConsistentResponse(t, suite.RoundTripper, suite.TimeoutConfig, suite.GatewayAddress, testcase) + } + }) + }, +} diff --git a/test/e2e/conformance/tests/go-wasm-bot-detect.yaml b/test/e2e/conformance/tests/go-wasm-bot-detect.yaml new file mode 100644 index 000000000..0f0cd90f8 --- /dev/null +++ b/test/e2e/conformance/tests/go-wasm-bot-detect.yaml @@ -0,0 +1,53 @@ +# Copyright (c) 2022 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + name: wasmplugin-bot-detect + namespace: higress-conformance-infra +spec: + ingressClassName: higress + rules: + - host: "foo.com" + http: + paths: + - pathType: Prefix + path: "/foo" + backend: + service: + name: infra-backend-v1 + port: + number: 8080 +--- +apiVersion: extensions.higress.io/v1alpha1 +kind: WasmPlugin +metadata: + name: bot-detect + namespace: higress-system +spec: + defaultConfigDisable: false + matchRules: + - config: + blocked_code: 401 + blocked_message: deny by bot detect + allow: + - Tailsweep + deny: + - Go-Client + configDisable: false + ingress: + - higress-conformance-infra/wasmplugin-bot-detect + url: file:///opt/plugins/wasm-go/extensions/bot-detect/plugin.wasm \ No newline at end of file