Ai data mask deny word match optimize (#1453)

This commit is contained in:
007gzs
2024-11-05 15:26:55 +08:00
committed by GitHub
parent 7e8b0445ad
commit c1f2504e87
7 changed files with 78 additions and 50 deletions

View File

@@ -18,5 +18,5 @@ md5 = "0"
grok = "2"
lazy_static = "1"
jieba-rs = "0"
rust-embed="8.5.0"
rust-embed = "8.5.0"
jsonpath-rust = "0"

View File

@@ -148,4 +148,4 @@ curl -X POST \
- 流模式中如果敏感词语被多个chunk拆分可能会有敏感词的一部分返回给用户的情况
- grok 内置规则列表 https://help.aliyun.com/zh/sls/user-guide/grok-patterns
- 内置敏感词库数据来源 https://github.com/houbb/sensitive-word/tree/master/src/main/resources
- 由于敏感词列表是在文本分词后进行匹配的,所以请将 `deny_words` 设置为单个单词,英文多单词情况如 `hello world` 可能无法匹配

View File

@@ -129,3 +129,4 @@ Please note that you need to replace `"key":"value"` with the actual data conten
- In streaming mode, if sensitive words are split across multiple chunks, there may be cases where part of the sensitive word is returned to the user
- Grok built-in rule list: https://help.aliyun.com/zh/sls/user-guide/grok-patterns
- Built-in sensitive word library data source: https://github.com/houbb/sensitive-word/tree/master/src/main/resources
- Since the sensitive word list is matched after tokenizing the text, please set `deny_words` to single words. In the case of multiple words in English, such as `hello world`, the match may not be successful.

View File

@@ -0,0 +1,54 @@
use std::collections::HashSet;
use jieba_rs::Jieba;
use crate::Asset;
/// A deny-word matcher: a jieba tokenizer plus the set of forbidden words.
///
/// `words` holds the trimmed deny words; each is also registered in the
/// `jieba` dictionary (see `from_iter`) so segmentation yields deny words
/// as whole tokens, letting `check` test tokens by exact set lookup.
#[derive(Default, Debug, Clone)]
pub(crate) struct DenyWord {
    jieba: Jieba,
    words: HashSet<String>,
}
impl DenyWord {
    /// Builds a matcher from any iterable of word-like values.
    ///
    /// Entries are trimmed and blanks are dropped. Every kept word is both
    /// stored in `words` and added to the jieba dictionary, so the tokenizer
    /// can emit it as a single token for `check` to look up.
    pub(crate) fn from_iter<T: IntoIterator<Item = impl Into<String>>>(words: T) -> Self {
        // NOTE(review): the derived `Default` presumably gives a `Jieba`
        // carrying its built-in dictionary (unlike `empty()`), which looks
        // intentional here — confirm against jieba-rs docs.
        let mut matcher = DenyWord::default();
        for raw in words {
            let owned = raw.into();
            let trimmed = owned.trim();
            if !trimmed.is_empty() {
                matcher.jieba.add_word(trimmed, None, None);
                matcher.words.insert(trimmed.to_string());
            }
        }
        matcher
    }

    /// A matcher that denies nothing: empty tokenizer dictionary, empty set.
    pub(crate) fn empty() -> Self {
        Self {
            jieba: Jieba::empty(),
            words: HashSet::new(),
        }
    }

    /// Loads the bundled sensitive-word dictionary from the embedded assets
    /// (one word per line). Falls back to an empty matcher when the asset is
    /// missing or is not valid UTF-8.
    pub(crate) fn system() -> Self {
        let asset = Asset::get("sensitive_word_dict.txt");
        match asset
            .as_ref()
            .and_then(|file| std::str::from_utf8(file.data.as_ref()).ok())
        {
            Some(text) => DenyWord::from_iter(text.split('\n')),
            None => Self::empty(),
        }
    }

    /// Segments `message` and returns the first token that is a deny word,
    /// or `None` when the text contains none.
    pub(crate) fn check(&self, message: &str) -> Option<String> {
        self.jieba
            .cut(message, true)
            .into_iter()
            .find(|token| self.words.contains(*token))
            .map(str::to_string)
    }
}

View File

@@ -12,13 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod deny_word;
use crate::deny_word::DenyWord;
use fancy_regex::Regex;
use grok::patterns;
use higress_wasm_rust::log::Log;
use higress_wasm_rust::plugin_wrapper::{HttpContextWrapper, RootContextWrapper};
use higress_wasm_rust::request_wrapper::has_request_body;
use higress_wasm_rust::rule_matcher::{on_configure, RuleMatcher, SharedRuleMatcher};
use jieba_rs::Jieba;
use jsonpath_rust::{JsonPath, JsonPathValue};
use lazy_static::lazy_static;
use proxy_wasm::traits::{Context, HttpContext, RootContext};
@@ -29,7 +31,7 @@ use serde::Deserialize;
use serde::Deserializer;
use serde_json::{json, Value};
use std::cell::RefCell;
use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::ops::DerefMut;
use std::rc::Rc;
use std::str::FromStr;
@@ -47,11 +49,6 @@ const GROK_PATTERN: &str = r"%\{(?<name>(?<pattern>[A-z0-9]+)(?::(?<alias>[A-z0-
#[folder = "res/"]
struct Asset;
#[derive(Default, Debug, Clone)]
struct DenyWord {
jieba: Jieba,
words: HashSet<String>,
}
struct System {
deny_word: DenyWord,
grok_regex: Regex,
@@ -227,52 +224,12 @@ static SYSTEM_PATTERNS: &[(&str, &str)] = &[
("IDCARD", r#"\d{17}[0-9xX]|\d{15}"#),
];
impl DenyWord {
fn empty() -> Self {
DenyWord {
jieba: Jieba::empty(),
words: HashSet::new(),
}
}
fn from_iter<T: IntoIterator<Item = impl Into<String>>>(words: T) -> Self {
let mut deny_word = DenyWord::empty();
for word in words {
let word_s = word.into();
let w = word_s.trim();
if w.is_empty() {
continue;
}
deny_word.jieba.add_word(w, None, None);
deny_word.words.insert(w.to_string());
}
deny_word
}
fn default() -> Self {
if let Some(file) = Asset::get("sensitive_word_dict.txt") {
if let Ok(data) = std::str::from_utf8(file.data.as_ref()) {
return DenyWord::from_iter(data.split('\n'));
}
}
DenyWord::empty()
}
fn check(&self, message: &str) -> Option<String> {
for word in self.jieba.cut(message, true) {
if self.words.contains(word) {
return Some(word.to_string());
}
}
None
}
}
impl System {
fn new() -> Self {
let grok_regex = Regex::new(GROK_PATTERN).unwrap();
let grok_patterns = BTreeMap::new();
let mut system = System {
deny_word: DenyWord::default(),
deny_word: DenyWord::system(),
grok_regex,
grok_patterns,
};
@@ -335,6 +292,7 @@ impl System {
(ret, ok)
}
}
impl AiDataMaskingRoot {
fn new() -> Self {
AiDataMaskingRoot {
@@ -382,6 +340,7 @@ impl RootContextWrapper<AiDataMaskingConfig> for AiDataMaskingRoot {
}))
}
}
impl AiDataMasking {
fn check_message(&self, message: &str) -> bool {
if let Some(config) = &self.config {
@@ -532,6 +491,7 @@ impl AiDataMasking {
}
impl Context for AiDataMasking {}
impl HttpContext for AiDataMasking {
fn on_http_request_headers(
&mut self,
@@ -607,6 +567,7 @@ impl HttpContext for AiDataMasking {
DataAction::Continue
}
}
impl HttpContextWrapper<AiDataMaskingConfig> for AiDataMasking {
fn log(&self) -> &Log {
&self.log

View File

@@ -153,6 +153,12 @@ var RustWasmPluginsAiDataMasking = suite.ConformanceTest{
[]byte("test"),
[]byte("{\"errmsg\":\"提问或回答中包含敏感词,已被屏蔽\"}"),
))
testcases = append(testcases, gen_assertion(
"system_no_deny.raw.com",
false,
[]byte("test"),
[]byte("{\"res\":\"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"}"),
))
testcases = append(testcases, gen_assertion(
"costom_word1.raw.com",
false,

View File

@@ -100,6 +100,12 @@ spec:
headers:
- Content-Type=application/json
"body": "{\"res\":\"fuck\"}"
- domain:
- system_no_deny.raw.com
config:
headers:
- Content-Type=application/json
"body": "{\"res\":\"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"}"
- domain:
- costom_word1.raw.com
config: