mirror of
https://github.com/alibaba/higress.git
synced 2026-06-09 12:47:28 +08:00
Ai data mask deny word match optimize (#1453)
This commit is contained in:
@@ -18,5 +18,5 @@ md5 = "0"
|
|||||||
grok = "2"
|
grok = "2"
|
||||||
lazy_static = "1"
|
lazy_static = "1"
|
||||||
jieba-rs = "0"
|
jieba-rs = "0"
|
||||||
rust-embed="8.5.0"
|
rust-embed = "8.5.0"
|
||||||
jsonpath-rust = "0"
|
jsonpath-rust = "0"
|
||||||
|
|||||||
@@ -148,4 +148,4 @@ curl -X POST \
|
|||||||
- 流模式中,如果敏感词语被多个chunk拆分,可能会有敏感词的一部分返回给用户的情况
|
- 流模式中,如果敏感词语被多个chunk拆分,可能会有敏感词的一部分返回给用户的情况
|
||||||
- grok 内置规则列表 https://help.aliyun.com/zh/sls/user-guide/grok-patterns
|
- grok 内置规则列表 https://help.aliyun.com/zh/sls/user-guide/grok-patterns
|
||||||
- 内置敏感词库数据来源 https://github.com/houbb/sensitive-word/tree/master/src/main/resources
|
- 内置敏感词库数据来源 https://github.com/houbb/sensitive-word/tree/master/src/main/resources
|
||||||
|
- 由于敏感词列表是在文本分词后进行匹配的,所以请将 `deny_words` 设置为单个单词,英文多单词情况如 `hello world` 可能无法匹配
|
||||||
|
|||||||
@@ -129,3 +129,4 @@ Please note that you need to replace `"key":"value"` with the actual data conten
|
|||||||
- In streaming mode, if sensitive words are split across multiple chunks, there may be cases where part of the sensitive word is returned to the user
|
- In streaming mode, if sensitive words are split across multiple chunks, there may be cases where part of the sensitive word is returned to the user
|
||||||
- Grok built-in rule list: https://help.aliyun.com/zh/sls/user-guide/grok-patterns
|
- Grok built-in rule list: https://help.aliyun.com/zh/sls/user-guide/grok-patterns
|
||||||
- Built-in sensitive word library data source: https://github.com/houbb/sensitive-word/tree/master/src/main/resources
|
- Built-in sensitive word library data source: https://github.com/houbb/sensitive-word/tree/master/src/main/resources
|
||||||
|
- Since the sensitive word list is matched after tokenizing the text, please set `deny_words` to single words. In the case of multiple words in English, such as `hello world`, the match may not be successful.
|
||||||
|
|||||||
@@ -0,0 +1,54 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
use jieba_rs::Jieba;
|
||||||
|
|
||||||
|
use crate::Asset;
|
||||||
|
|
||||||
|
#[derive(Default, Debug, Clone)]
|
||||||
|
pub(crate) struct DenyWord {
|
||||||
|
jieba: Jieba,
|
||||||
|
words: HashSet<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DenyWord {
|
||||||
|
pub(crate) fn from_iter<T: IntoIterator<Item = impl Into<String>>>(words: T) -> Self {
|
||||||
|
let mut deny_word = DenyWord::default();
|
||||||
|
|
||||||
|
for word in words {
|
||||||
|
let word_s = word.into();
|
||||||
|
let w = word_s.trim();
|
||||||
|
if w.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
deny_word.jieba.add_word(w, None, None);
|
||||||
|
deny_word.words.insert(w.to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
deny_word
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn empty() -> Self {
|
||||||
|
DenyWord {
|
||||||
|
jieba: Jieba::empty(),
|
||||||
|
words: HashSet::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn system() -> Self {
|
||||||
|
if let Some(file) = Asset::get("sensitive_word_dict.txt") {
|
||||||
|
if let Ok(data) = std::str::from_utf8(file.data.as_ref()) {
|
||||||
|
return DenyWord::from_iter(data.split('\n'));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Self::empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn check(&self, message: &str) -> Option<String> {
|
||||||
|
for word in self.jieba.cut(message, true) {
|
||||||
|
if self.words.contains(word) {
|
||||||
|
return Some(word.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -12,13 +12,15 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
|
mod deny_word;
|
||||||
|
|
||||||
|
use crate::deny_word::DenyWord;
|
||||||
use fancy_regex::Regex;
|
use fancy_regex::Regex;
|
||||||
use grok::patterns;
|
use grok::patterns;
|
||||||
use higress_wasm_rust::log::Log;
|
use higress_wasm_rust::log::Log;
|
||||||
use higress_wasm_rust::plugin_wrapper::{HttpContextWrapper, RootContextWrapper};
|
use higress_wasm_rust::plugin_wrapper::{HttpContextWrapper, RootContextWrapper};
|
||||||
use higress_wasm_rust::request_wrapper::has_request_body;
|
use higress_wasm_rust::request_wrapper::has_request_body;
|
||||||
use higress_wasm_rust::rule_matcher::{on_configure, RuleMatcher, SharedRuleMatcher};
|
use higress_wasm_rust::rule_matcher::{on_configure, RuleMatcher, SharedRuleMatcher};
|
||||||
use jieba_rs::Jieba;
|
|
||||||
use jsonpath_rust::{JsonPath, JsonPathValue};
|
use jsonpath_rust::{JsonPath, JsonPathValue};
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use proxy_wasm::traits::{Context, HttpContext, RootContext};
|
use proxy_wasm::traits::{Context, HttpContext, RootContext};
|
||||||
@@ -29,7 +31,7 @@ use serde::Deserialize;
|
|||||||
use serde::Deserializer;
|
use serde::Deserializer;
|
||||||
use serde_json::{json, Value};
|
use serde_json::{json, Value};
|
||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
|
use std::collections::{BTreeMap, HashMap, VecDeque};
|
||||||
use std::ops::DerefMut;
|
use std::ops::DerefMut;
|
||||||
use std::rc::Rc;
|
use std::rc::Rc;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
@@ -47,11 +49,6 @@ const GROK_PATTERN: &str = r"%\{(?<name>(?<pattern>[A-z0-9]+)(?::(?<alias>[A-z0-
|
|||||||
#[folder = "res/"]
|
#[folder = "res/"]
|
||||||
struct Asset;
|
struct Asset;
|
||||||
|
|
||||||
#[derive(Default, Debug, Clone)]
|
|
||||||
struct DenyWord {
|
|
||||||
jieba: Jieba,
|
|
||||||
words: HashSet<String>,
|
|
||||||
}
|
|
||||||
struct System {
|
struct System {
|
||||||
deny_word: DenyWord,
|
deny_word: DenyWord,
|
||||||
grok_regex: Regex,
|
grok_regex: Regex,
|
||||||
@@ -227,52 +224,12 @@ static SYSTEM_PATTERNS: &[(&str, &str)] = &[
|
|||||||
("IDCARD", r#"\d{17}[0-9xX]|\d{15}"#),
|
("IDCARD", r#"\d{17}[0-9xX]|\d{15}"#),
|
||||||
];
|
];
|
||||||
|
|
||||||
impl DenyWord {
|
|
||||||
fn empty() -> Self {
|
|
||||||
DenyWord {
|
|
||||||
jieba: Jieba::empty(),
|
|
||||||
words: HashSet::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn from_iter<T: IntoIterator<Item = impl Into<String>>>(words: T) -> Self {
|
|
||||||
let mut deny_word = DenyWord::empty();
|
|
||||||
|
|
||||||
for word in words {
|
|
||||||
let word_s = word.into();
|
|
||||||
let w = word_s.trim();
|
|
||||||
if w.is_empty() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
deny_word.jieba.add_word(w, None, None);
|
|
||||||
deny_word.words.insert(w.to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
deny_word
|
|
||||||
}
|
|
||||||
fn default() -> Self {
|
|
||||||
if let Some(file) = Asset::get("sensitive_word_dict.txt") {
|
|
||||||
if let Ok(data) = std::str::from_utf8(file.data.as_ref()) {
|
|
||||||
return DenyWord::from_iter(data.split('\n'));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
DenyWord::empty()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn check(&self, message: &str) -> Option<String> {
|
|
||||||
for word in self.jieba.cut(message, true) {
|
|
||||||
if self.words.contains(word) {
|
|
||||||
return Some(word.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl System {
|
impl System {
|
||||||
fn new() -> Self {
|
fn new() -> Self {
|
||||||
let grok_regex = Regex::new(GROK_PATTERN).unwrap();
|
let grok_regex = Regex::new(GROK_PATTERN).unwrap();
|
||||||
let grok_patterns = BTreeMap::new();
|
let grok_patterns = BTreeMap::new();
|
||||||
let mut system = System {
|
let mut system = System {
|
||||||
deny_word: DenyWord::default(),
|
deny_word: DenyWord::system(),
|
||||||
grok_regex,
|
grok_regex,
|
||||||
grok_patterns,
|
grok_patterns,
|
||||||
};
|
};
|
||||||
@@ -335,6 +292,7 @@ impl System {
|
|||||||
(ret, ok)
|
(ret, ok)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AiDataMaskingRoot {
|
impl AiDataMaskingRoot {
|
||||||
fn new() -> Self {
|
fn new() -> Self {
|
||||||
AiDataMaskingRoot {
|
AiDataMaskingRoot {
|
||||||
@@ -382,6 +340,7 @@ impl RootContextWrapper<AiDataMaskingConfig> for AiDataMaskingRoot {
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AiDataMasking {
|
impl AiDataMasking {
|
||||||
fn check_message(&self, message: &str) -> bool {
|
fn check_message(&self, message: &str) -> bool {
|
||||||
if let Some(config) = &self.config {
|
if let Some(config) = &self.config {
|
||||||
@@ -532,6 +491,7 @@ impl AiDataMasking {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Context for AiDataMasking {}
|
impl Context for AiDataMasking {}
|
||||||
|
|
||||||
impl HttpContext for AiDataMasking {
|
impl HttpContext for AiDataMasking {
|
||||||
fn on_http_request_headers(
|
fn on_http_request_headers(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -607,6 +567,7 @@ impl HttpContext for AiDataMasking {
|
|||||||
DataAction::Continue
|
DataAction::Continue
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl HttpContextWrapper<AiDataMaskingConfig> for AiDataMasking {
|
impl HttpContextWrapper<AiDataMaskingConfig> for AiDataMasking {
|
||||||
fn log(&self) -> &Log {
|
fn log(&self) -> &Log {
|
||||||
&self.log
|
&self.log
|
||||||
|
|||||||
@@ -153,6 +153,12 @@ var RustWasmPluginsAiDataMasking = suite.ConformanceTest{
|
|||||||
[]byte("test"),
|
[]byte("test"),
|
||||||
[]byte("{\"errmsg\":\"提问或回答中包含敏感词,已被屏蔽\"}"),
|
[]byte("{\"errmsg\":\"提问或回答中包含敏感词,已被屏蔽\"}"),
|
||||||
))
|
))
|
||||||
|
testcases = append(testcases, gen_assertion(
|
||||||
|
"system_no_deny.raw.com",
|
||||||
|
false,
|
||||||
|
[]byte("test"),
|
||||||
|
[]byte("{\"res\":\"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"}"),
|
||||||
|
))
|
||||||
testcases = append(testcases, gen_assertion(
|
testcases = append(testcases, gen_assertion(
|
||||||
"costom_word1.raw.com",
|
"costom_word1.raw.com",
|
||||||
false,
|
false,
|
||||||
|
|||||||
@@ -100,6 +100,12 @@ spec:
|
|||||||
headers:
|
headers:
|
||||||
- Content-Type=application/json
|
- Content-Type=application/json
|
||||||
"body": "{\"res\":\"fuck\"}"
|
"body": "{\"res\":\"fuck\"}"
|
||||||
|
- domain:
|
||||||
|
- system_no_deny.raw.com
|
||||||
|
config:
|
||||||
|
headers:
|
||||||
|
- Content-Type=application/json
|
||||||
|
"body": "{\"res\":\"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"}"
|
||||||
- domain:
|
- domain:
|
||||||
- costom_word1.raw.com
|
- costom_word1.raw.com
|
||||||
config:
|
config:
|
||||||
|
|||||||
Reference in New Issue
Block a user