Add plugins (#27)

This commit is contained in:
澄潭
2022-11-04 17:46:43 +08:00
committed by GitHub
parent 5ac966495c
commit 1a0ed73cd5
92 changed files with 35435 additions and 1 deletions

View File

@@ -0,0 +1,57 @@
load("@proxy_wasm_cpp_sdk//bazel/wasm:wasm.bzl", "wasm_cc_binary")
load("//bazel:wasm.bzl", "declare_wasm_image_targets")
wasm_cc_binary(
name = "bot_detect.wasm",
srcs = [
"plugin.cc",
"plugin.h",
],
deps = [
"@com_google_absl//absl/strings",
"@com_google_absl//absl/time",
"//common:json_util",
"@proxy_wasm_cpp_sdk//:proxy_wasm_intrinsics",
"//common:http_util",
"//common:regex_util",
"//common:rule_util",
],
)
cc_library(
name = "bot_detect_lib",
srcs = [
"plugin.cc",
],
hdrs = [
"plugin.h",
],
copts = ["-DNULL_PLUGIN"],
deps = [
"@com_google_absl//absl/strings",
"//common:json_util",
"@proxy_wasm_cpp_host//:lib",
"//common:http_util",
"//common:regex_util",
"//common:rule_util",
],
)
cc_test(
name = "bot_detect_test",
srcs = [
"plugin_test.cc",
],
copts = ["-DNULL_PLUGIN"],
deps = [
":bot_detect_lib",
"@com_google_googletest//:gtest",
"@com_google_googletest//:gtest_main",
"@proxy_wasm_cpp_host//:lib",
],
)
declare_wasm_image_targets(
name = "bot_detect",
wasm_file = ":bot_detect.wasm",
)

View File

@@ -0,0 +1,188 @@
// Copyright (c) 2022 Alibaba Group Holding Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "extensions/bot_detect/plugin.h"
#include <array>
#include <memory>
#include <stdexcept>
#include <string_view>
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "common/json_util.h"
using ::nlohmann::json;
using ::Wasm::Common::JsonArrayIterate;
using ::Wasm::Common::JsonGetField;
using ::Wasm::Common::JsonObjectIterate;
using ::Wasm::Common::JsonValueAs;
#ifdef NULL_PLUGIN
namespace proxy_wasm {
namespace null_plugin {
namespace bot_detect {
PROXY_WASM_NULL_PLUGIN_REGISTRY
#endif
static RegisterContextFactory register_BotDetect(
CONTEXT_FACTORY(PluginContext), ROOT_FACTORY(PluginRootContext));
static std::array<std::string, 6> default_bot_regex = {
R"(/((?:Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|Tailsweep)[ \-](\d+)(?:\.(\d+)(?:\.(\d+))?)?)",
R"((?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(?:\.(\d+)(?:\.(\d+)|)|))",
R"((?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50})) (\d+)(?:\.(\d+)(?:\.(\d+)|)|))",
R"(((?:[A-z0-9]{1,50}|[A-z\-]{1,50} ?|)(?: the |)(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]{0,50})(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|))",
R"(\b(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goodzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|OgScrper|PathDefender|Peew|PostPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sproose|wminer)/(\d+)(?:\.(\d+)|)(?:\.(\d+)|))",
R"((CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Job Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends Crawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardReader Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGator[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobot|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy|Speedy Spider|Squrl Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tiny Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+)|)|)|))",
};
bool PluginRootContext::parsePluginConfig(const json& configuration,
BotDetectConfigRule& rule) {
auto it = configuration.find("blocked_code");
if (it != configuration.end()) {
auto blocked_code = JsonValueAs<int64_t>(it.value());
if (blocked_code.second != Wasm::Common::JsonParserResultDetail::OK) {
LOG_WARN("cannot parse status code");
return false;
}
rule.blocked_code = blocked_code.first.value();
}
it = configuration.find("blocked_message");
if (it != configuration.end()) {
auto blocked_message = JsonValueAs<std::string>(it.value());
if (blocked_message.second != Wasm::Common::JsonParserResultDetail::OK) {
LOG_WARN("cannot parse blocked_message");
return false;
}
rule.blocked_message = blocked_message.first.value();
}
if (!JsonArrayIterate(configuration, "allow", [&](const json& item) -> bool {
auto regex = JsonValueAs<std::string>(item);
if (regex.second != Wasm::Common::JsonParserResultDetail::OK) {
LOG_WARN("cannot parse allow");
return false;
}
try {
rule.allow.push_back(
std::make_unique<ReMatcher>(regex.first.value()));
} catch (const std::runtime_error& e) {
LOG_WARN(e.what());
return false;
}
return true;
})) {
LOG_WARN("failed to parse configuration for allow.");
return false;
}
if (!JsonArrayIterate(configuration, "deny", [&](const json& item) -> bool {
auto regex = JsonValueAs<std::string>(item);
if (regex.second != Wasm::Common::JsonParserResultDetail::OK) {
LOG_WARN("cannot parse deny");
return false;
}
try {
rule.deny.push_back(std::make_unique<ReMatcher>(regex.first.value()));
} catch (const std::runtime_error& e) {
LOG_WARN(e.what());
return false;
}
return true;
})) {
LOG_WARN("failed to parse configuration for deny.");
return false;
}
return true;
}
bool PluginRootContext::onConfigure(size_t size) {
// Parse configuration JSON string.
if (size > 0 && !configure(size)) {
LOG_WARN("configuration has errors initialization will not continue.");
setInvalidConfig();
return true;
}
if (size == 0) {
// support empty config
setEmptyGlobalConfig();
}
for (auto& regex : default_bot_regex) {
default_matchers_.push_back(std::make_unique<ReMatcher>(regex, false));
}
return true;
}
bool PluginRootContext::configure(size_t configuration_size) {
auto configuration_data = getBufferBytes(WasmBufferType::PluginConfiguration,
0, configuration_size);
// Parse configuration JSON string.
auto result = ::Wasm::Common::JsonParse(configuration_data->view());
if (!result.has_value()) {
LOG_WARN(absl::StrCat("cannot parse plugin configuration JSON string: ",
configuration_data->view()));
return false;
}
if (!parseRuleConfig(result.value())) {
LOG_WARN(absl::StrCat("cannot parse plugin configuration JSON string: ",
configuration_data->view()));
return false;
}
return true;
}
bool PluginRootContext::checkHeader(const BotDetectConfigRule& rule) {
GET_HEADER_VIEW(Wasm::Common::Http::Header::UserAgent, user_agent);
for (const auto& matcher : rule.allow) {
if (matcher->match(user_agent)) {
LOG_DEBUG("bot detected by allow rule");
return true;
}
}
for (const auto& matcher : rule.deny) {
if (matcher->match(user_agent)) {
LOG_DEBUG("bot detected by deny rule");
sendLocalResponse(rule.blocked_code, "", rule.blocked_message, {});
return false;
}
}
for (const auto& matcher : default_matchers_) {
if (matcher->match(user_agent)) {
LOG_DEBUG("bot detected by default rule");
sendLocalResponse(rule.blocked_code, "", rule.blocked_message, {});
return false;
}
}
return true;
}
FilterHeadersStatus PluginContext::onRequestHeaders(uint32_t, bool) {
auto* rootCtx = rootContext();
return rootCtx->checkRule([rootCtx](const auto& config) {
return rootCtx->checkHeader(config);
})
? FilterHeadersStatus::Continue
: FilterHeadersStatus::StopIteration;
}
#ifdef NULL_PLUGIN
} // namespace bot_detect
} // namespace null_plugin
} // namespace proxy_wasm
#endif

View File

@@ -0,0 +1,88 @@
/*
* Copyright (c) 2022 Alibaba Group Holding Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include <string>
#include <unordered_map>
#include "common/http_util.h"
#include "common/regex.h"
#include "common/route_rule_matcher.h"
#define ASSERT(_X) assert(_X)
#ifndef NULL_PLUGIN
#include "proxy_wasm_intrinsics.h"
#else
#include "include/proxy-wasm/null_plugin.h"
namespace proxy_wasm {
namespace null_plugin {
namespace bot_detect {
#endif
using ReMatcher = Wasm::Common::Regex::CompiledGoogleReMatcher;
using ReMatcherPtr = std::unique_ptr<ReMatcher>;
struct BotDetectConfigRule {
int blocked_code = 403;
std::string blocked_message;
std::vector<ReMatcherPtr> allow;
std::vector<ReMatcherPtr> deny;
};
// PluginRootContext is the root context for all streams processed by the
// thread. It has the same lifetime as the worker thread and acts as target for
// interactions that outlives individual stream, e.g. timer, async calls.
class PluginRootContext : public RootContext,
public RouteRuleMatcher<BotDetectConfigRule> {
public:
PluginRootContext(uint32_t id, std::string_view root_id)
: RootContext(id, root_id) {}
~PluginRootContext() {}
bool onConfigure(size_t) override;
bool checkHeader(const BotDetectConfigRule&);
bool configure(size_t);
private:
bool parsePluginConfig(const json&, BotDetectConfigRule&) override;
std::vector<ReMatcherPtr> default_matchers_;
};
// Per-stream context.
class PluginContext : public Context {
public:
explicit PluginContext(uint32_t id, RootContext* root) : Context(id, root) {}
FilterHeadersStatus onRequestHeaders(uint32_t, bool) override;
private:
inline PluginRootContext* rootContext() {
return dynamic_cast<PluginRootContext*>(this->root());
}
};
#ifdef NULL_PLUGIN
} // namespace bot_detect
} // namespace null_plugin
} // namespace proxy_wasm
#endif

View File

@@ -0,0 +1,178 @@
// Copyright (c) 2022 Alibaba Group Holding Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "extensions/bot_detect/plugin.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "include/proxy-wasm/context.h"
#include "include/proxy-wasm/null.h"
namespace proxy_wasm {
namespace null_plugin {
namespace bot_detect {
NullPluginRegistry* context_registry_;
RegisterNullVmPluginFactory register_bot_detect_plugin("bot_detect", []() {
return std::make_unique<NullPlugin>(bot_detect::context_registry_);
});
class MockContext : public proxy_wasm::ContextBase {
public:
MockContext(WasmBase* wasm) : ContextBase(wasm) {}
MOCK_METHOD(BufferInterface*, getBuffer, (WasmBufferType));
MOCK_METHOD(WasmResult, log, (uint32_t, std::string_view));
MOCK_METHOD(WasmResult, getHeaderMapValue,
(WasmHeaderMapType /* type */, std::string_view /* key */,
std::string_view* /*result */));
MOCK_METHOD(WasmResult, getHeaderMapPairs, (WasmHeaderMapType, Pairs*));
MOCK_METHOD(WasmResult, sendLocalResponse,
(uint32_t /* response_code */, std::string_view /* body */,
Pairs /* additional_headers */, uint32_t /* grpc_status */,
std::string_view /* details */));
MOCK_METHOD(WasmResult, getProperty, (std::string_view, std::string*));
};
class BotDetectTest : public ::testing::Test {
protected:
BotDetectTest() {
// Initialize test VM
test_vm_ = createNullVm();
wasm_base_ = std::make_unique<WasmBase>(
std::move(test_vm_), "test-vm", "", "",
std::unordered_map<std::string, std::string>{},
AllowedCapabilitiesMap{});
wasm_base_->load("bot_detect");
wasm_base_->initialize();
// Initialize host side context
mock_context_ = std::make_unique<MockContext>(wasm_base_.get());
current_context_ = mock_context_.get();
ON_CALL(*mock_context_, log(testing::_, testing::_))
.WillByDefault([](uint32_t, std::string_view m) {
std::cerr << m << "\n";
return WasmResult::Ok;
});
ON_CALL(*mock_context_, getHeaderMapValue(WasmHeaderMapType::RequestHeaders,
testing::_, testing::_))
.WillByDefault([&](WasmHeaderMapType, std::string_view header,
std::string_view* result) {
if (header == ":authority") {
*result = authority_;
}
if (header == "user-agent") {
*result = user_agent_;
}
return WasmResult::Ok;
});
ON_CALL(*mock_context_, getProperty(testing::_, testing::_))
.WillByDefault([&](std::string_view path, std::string* result) {
*result = route_name_;
return WasmResult::Ok;
});
// Initialize Wasm sandbox context
root_context_ = std::make_unique<PluginRootContext>(0, "");
context_ = std::make_unique<PluginContext>(1, root_context_.get());
}
~BotDetectTest() override {}
std::unique_ptr<WasmBase> wasm_base_;
std::unique_ptr<WasmVm> test_vm_;
std::unique_ptr<MockContext> mock_context_;
std::unique_ptr<PluginRootContext> root_context_;
std::unique_ptr<PluginContext> context_;
std::string authority_;
std::string route_name_;
std::string user_agent_;
};
TEST_F(BotDetectTest, UseDefault) {
std::string configuration = R"(
{
"blocked_code": 404
})";
BufferBase buffer;
buffer.set({configuration.data(), configuration.size()});
EXPECT_CALL(*mock_context_, getBuffer(WasmBufferType::PluginConfiguration))
.WillOnce([&buffer](WasmBufferType) { return &buffer; });
EXPECT_TRUE(root_context_->onConfigure(configuration.size()));
user_agent_ = "BaiduMobaider/1.1.0";
EXPECT_CALL(*mock_context_, sendLocalResponse(404, testing::_, testing::_,
testing::_, testing::_));
EXPECT_EQ(context_->onRequestHeaders(0, false),
FilterHeadersStatus::StopIteration);
user_agent_ = "Go-client";
EXPECT_EQ(context_->onRequestHeaders(0, false),
FilterHeadersStatus::Continue);
}
TEST_F(BotDetectTest, UseAllowAndDeny) {
std::string configuration = R"(
{
"blocked_code": 404,
"allow": ["BaiduMobaider.*"],
"deny": ["Go-client"]
})";
BufferBase buffer;
buffer.set({configuration.data(), configuration.size()});
EXPECT_CALL(*mock_context_, getBuffer(WasmBufferType::PluginConfiguration))
.WillOnce([&buffer](WasmBufferType) { return &buffer; });
EXPECT_TRUE(root_context_->onConfigure(configuration.size()));
user_agent_ = "BaiduMobaider/1.1.0";
EXPECT_EQ(context_->onRequestHeaders(0, false),
FilterHeadersStatus::Continue);
user_agent_ = "Go-client";
EXPECT_CALL(*mock_context_, sendLocalResponse(404, testing::_, testing::_,
testing::_, testing::_));
EXPECT_EQ(context_->onRequestHeaders(0, false),
FilterHeadersStatus::StopIteration);
}
TEST_F(BotDetectTest, UseEmptyConfig) {
std::string configuration = R"()";
BufferBase buffer;
buffer.set({configuration.data(), configuration.size()});
EXPECT_TRUE(root_context_->onConfigure(configuration.size()));
user_agent_ = "BaiduMobaider/1.1.0";
EXPECT_CALL(*mock_context_, sendLocalResponse(403, testing::_, testing::_,
testing::_, testing::_));
EXPECT_EQ(context_->onRequestHeaders(0, false),
FilterHeadersStatus::StopIteration);
user_agent_ = "Go-client";
EXPECT_EQ(context_->onRequestHeaders(0, false),
FilterHeadersStatus::Continue);
}
} // namespace bot_detect
} // namespace null_plugin
} // namespace proxy_wasm