From c1f2504e8781a2191d454b5486841900439d8922 Mon Sep 17 00:00:00 2001 From: 007gzs <007gzs@gmail.com> Date: Tue, 5 Nov 2024 15:26:55 +0800 Subject: [PATCH] Ai data mask deny word match optimize (#1453) --- .../extensions/ai-data-masking/Cargo.toml | 2 +- .../extensions/ai-data-masking/README.md | 2 +- .../extensions/ai-data-masking/README_EN.md | 1 + .../ai-data-masking/src/deny_word.rs | 54 ++++++++++++++++++ .../extensions/ai-data-masking/src/lib.rs | 57 +++---------------- .../tests/rust-wasm-ai-data-masking.go | 6 ++ .../tests/rust-wasm-ai-data-masking.yaml | 6 ++ 7 files changed, 78 insertions(+), 50 deletions(-) create mode 100644 plugins/wasm-rust/extensions/ai-data-masking/src/deny_word.rs diff --git a/plugins/wasm-rust/extensions/ai-data-masking/Cargo.toml b/plugins/wasm-rust/extensions/ai-data-masking/Cargo.toml index b8bd6df5b..aa7372fb2 100644 --- a/plugins/wasm-rust/extensions/ai-data-masking/Cargo.toml +++ b/plugins/wasm-rust/extensions/ai-data-masking/Cargo.toml @@ -18,5 +18,5 @@ md5 = "0" grok = "2" lazy_static = "1" jieba-rs = "0" -rust-embed="8.5.0" +rust-embed = "8.5.0" jsonpath-rust = "0" diff --git a/plugins/wasm-rust/extensions/ai-data-masking/README.md b/plugins/wasm-rust/extensions/ai-data-masking/README.md index b892b3dab..c1e7f7336 100644 --- a/plugins/wasm-rust/extensions/ai-data-masking/README.md +++ b/plugins/wasm-rust/extensions/ai-data-masking/README.md @@ -148,4 +148,4 @@ curl -X POST \ - 流模式中,如果敏感词语被多个chunk拆分,可能会有敏感词的一部分返回给用户的情况 - grok 内置规则列表 https://help.aliyun.com/zh/sls/user-guide/grok-patterns - 内置敏感词库数据来源 https://github.com/houbb/sensitive-word/tree/master/src/main/resources - + - 由于敏感词列表是在文本分词后进行匹配的,所以请将 `deny_words` 设置为单个单词,英文多单词情况如 `hello word` 可能无法匹配 diff --git a/plugins/wasm-rust/extensions/ai-data-masking/README_EN.md b/plugins/wasm-rust/extensions/ai-data-masking/README_EN.md index 45e162287..d7cb72378 100644 --- a/plugins/wasm-rust/extensions/ai-data-masking/README_EN.md +++ b/plugins/wasm-rust/extensions/ai-data-masking/README_EN.md @@ -129,3 +129,4 @@ Please note that you need to replace `"key":"value"` with the actual data conten - In streaming mode, if sensitive words are split across multiple chunks, there may be cases where part of the sensitive word is returned to the user - Grok built-in rule list: https://help.aliyun.com/zh/sls/user-guide/grok-patterns - Built-in sensitive word library data source: https://github.com/houbb/sensitive-word/tree/master/src/main/resources + - Since the sensitive word list is matched after tokenizing the text, please set `deny_words` to single words. In the case of multiple words in English, such as `hello world`, the match may not be successful. diff --git a/plugins/wasm-rust/extensions/ai-data-masking/src/deny_word.rs b/plugins/wasm-rust/extensions/ai-data-masking/src/deny_word.rs new file mode 100644 index 000000000..99ecab4d5 --- /dev/null +++ b/plugins/wasm-rust/extensions/ai-data-masking/src/deny_word.rs @@ -0,0 +1,54 @@ +use std::collections::HashSet; + +use jieba_rs::Jieba; + +use crate::Asset; + +#[derive(Default, Debug, Clone)] +pub(crate) struct DenyWord { + jieba: Jieba, + words: HashSet, +} + +impl DenyWord { + pub(crate) fn from_iter>>(words: T) -> Self { + let mut deny_word = DenyWord::default(); + + for word in words { + let word_s = word.into(); + let w = word_s.trim(); + if w.is_empty() { + continue; + } + deny_word.jieba.add_word(w, None, None); + deny_word.words.insert(w.to_string()); + } + + deny_word + } + + pub(crate) fn empty() -> Self { + DenyWord { + jieba: Jieba::empty(), + words: HashSet::new(), + } + } + + pub(crate) fn system() -> Self { + if let Some(file) = Asset::get("sensitive_word_dict.txt") { + if let Ok(data) = std::str::from_utf8(file.data.as_ref()) { + return DenyWord::from_iter(data.split('\n')); + } + } + Self::empty() + } + + pub(crate) fn check(&self, message: &str) -> Option { + for word in self.jieba.cut(message, true) { + if self.words.contains(word) { + return Some(word.to_string()); + } + } + None + } +} diff --git a/plugins/wasm-rust/extensions/ai-data-masking/src/lib.rs b/plugins/wasm-rust/extensions/ai-data-masking/src/lib.rs index 2f2999d67..ca2db3da4 100644 --- a/plugins/wasm-rust/extensions/ai-data-masking/src/lib.rs +++ b/plugins/wasm-rust/extensions/ai-data-masking/src/lib.rs @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod deny_word; + +use crate::deny_word::DenyWord; use fancy_regex::Regex; use grok::patterns; use higress_wasm_rust::log::Log; use higress_wasm_rust::plugin_wrapper::{HttpContextWrapper, RootContextWrapper}; use higress_wasm_rust::request_wrapper::has_request_body; use higress_wasm_rust::rule_matcher::{on_configure, RuleMatcher, SharedRuleMatcher}; -use jieba_rs::Jieba; use jsonpath_rust::{JsonPath, JsonPathValue}; use lazy_static::lazy_static; use proxy_wasm::traits::{Context, HttpContext, RootContext}; @@ -29,7 +31,7 @@ use serde::Deserialize; use serde::Deserializer; use serde_json::{json, Value}; use std::cell::RefCell; -use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::collections::{BTreeMap, HashMap, VecDeque}; use std::ops::DerefMut; use std::rc::Rc; use std::str::FromStr; @@ -47,11 +49,6 @@ const GROK_PATTERN: &str = r"%\{(?(?[A-z0-9]+)(?::(?[A-z0- #[folder = "res/"] struct Asset; -#[derive(Default, Debug, Clone)] -struct DenyWord { - jieba: Jieba, - words: HashSet, -} struct System { deny_word: DenyWord, grok_regex: Regex, @@ -227,52 +224,12 @@ static SYSTEM_PATTERNS: &[(&str, &str)] = &[ ("IDCARD", r#"\d{17}[0-9xX]|\d{15}"#), ]; -impl DenyWord { - fn empty() -> Self { - DenyWord { - jieba: Jieba::empty(), - words: HashSet::new(), - } - } - fn from_iter>>(words: T) -> Self { - let mut deny_word = DenyWord::empty(); - - for word in words { - let word_s = word.into(); - let w = word_s.trim(); - if w.is_empty() { - continue; - } - deny_word.jieba.add_word(w, None, None); - deny_word.words.insert(w.to_string()); - } - - deny_word - } - fn default() -> Self { - if let Some(file) = Asset::get("sensitive_word_dict.txt") { - if let Ok(data) = std::str::from_utf8(file.data.as_ref()) { - return DenyWord::from_iter(data.split('\n')); - } - } - DenyWord::empty() - } - - fn check(&self, message: &str) -> Option { - for word in self.jieba.cut(message, true) { - if self.words.contains(word) { - return Some(word.to_string()); - } - } - None - } -} impl System { fn new() -> Self { let grok_regex = Regex::new(GROK_PATTERN).unwrap(); let grok_patterns = BTreeMap::new(); let mut system = System { - deny_word: DenyWord::default(), + deny_word: DenyWord::system(), grok_regex, grok_patterns, }; @@ -335,6 +292,7 @@ impl System { (ret, ok) } } + impl AiDataMaskingRoot { fn new() -> Self { AiDataMaskingRoot { @@ -382,6 +340,7 @@ impl RootContextWrapper for AiDataMaskingRoot { })) } } + impl AiDataMasking { fn check_message(&self, message: &str) -> bool { if let Some(config) = &self.config { @@ -532,6 +491,7 @@ impl AiDataMasking { } impl Context for AiDataMasking {} + impl HttpContext for AiDataMasking { fn on_http_request_headers( &mut self, @@ -607,6 +567,7 @@ impl HttpContext for AiDataMasking { DataAction::Continue } } + impl HttpContextWrapper for AiDataMasking { fn log(&self) -> &Log { &self.log diff --git a/test/e2e/conformance/tests/rust-wasm-ai-data-masking.go b/test/e2e/conformance/tests/rust-wasm-ai-data-masking.go index 9c1469a82..3b49c2c6a 100644 --- a/test/e2e/conformance/tests/rust-wasm-ai-data-masking.go +++ b/test/e2e/conformance/tests/rust-wasm-ai-data-masking.go @@ -153,6 +153,12 @@ var RustWasmPluginsAiDataMasking = suite.ConformanceTest{ []byte("test"), []byte("{\"errmsg\":\"提问或回答中包含敏感词,已被屏蔽\"}"), )) + testcases = append(testcases, gen_assertion( + "system_no_deny.raw.com", + false, + []byte("test"), + []byte("{\"res\":\"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"}"), + )) testcases = append(testcases, gen_assertion( "costom_word1.raw.com", false, diff --git a/test/e2e/conformance/tests/rust-wasm-ai-data-masking.yaml b/test/e2e/conformance/tests/rust-wasm-ai-data-masking.yaml index 9678ae3b7..71d7d620f 100644 --- a/test/e2e/conformance/tests/rust-wasm-ai-data-masking.yaml +++ b/test/e2e/conformance/tests/rust-wasm-ai-data-masking.yaml @@ -100,6 +100,12 @@ spec: headers: - Content-Type=application/json "body": "{\"res\":\"fuck\"}" + - domain: + - system_no_deny.raw.com + config: + headers: + - Content-Type=application/json + "body": "{\"res\":\"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"}" - domain: - costom_word1.raw.com config: