Ai data mask deny word match optimize (#1453)

This commit is contained in:
007gzs
2024-11-05 15:26:55 +08:00
committed by GitHub
parent 7e8b0445ad
commit c1f2504e87
7 changed files with 78 additions and 50 deletions

View File

@@ -0,0 +1,54 @@
use std::collections::HashSet;
use jieba_rs::Jieba;
use crate::Asset;
/// Deny-word matcher: a `Jieba` segmenter paired with the set of denied words.
///
/// `check` segments a message with `jieba` and reports the first token that is
/// contained in `words`.
#[derive(Default, Debug, Clone)]
pub(crate) struct DenyWord {
/// Segmenter used to cut messages into tokens; every denied word is also
/// registered with it via `add_word` when the matcher is built.
jieba: Jieba,
/// Trimmed, non-empty denied words used for exact token lookup.
words: HashSet<String>,
}
impl DenyWord {
    /// Builds a matcher from an iterator of words.
    ///
    /// Construction starts from `DenyWord::default()`. Each word is converted
    /// to a `String` and trimmed; words that are empty after trimming are
    /// skipped. Every remaining word is registered with the segmenter and
    /// stored in the lookup set.
    pub(crate) fn from_iter<T: IntoIterator<Item = impl Into<String>>>(words: T) -> Self {
        let mut built = DenyWord::default();
        for raw in words {
            let owned: String = raw.into();
            let term = owned.trim();
            if !term.is_empty() {
                // Register the term with the segmenter so it can be emitted as
                // a token, and remember it for the exact lookup in `check`.
                built.jieba.add_word(term, None, None);
                built.words.insert(term.to_owned());
            }
        }
        built
    }

    /// Returns a matcher built from `Jieba::empty()` and an empty word set.
    pub(crate) fn empty() -> Self {
        let jieba = Jieba::empty();
        let words = HashSet::new();
        DenyWord { jieba, words }
    }

    /// Builds the matcher from the embedded `sensitive_word_dict.txt` asset,
    /// one word per `'\n'`-separated line.
    ///
    /// Falls back to [`DenyWord::empty`] when the asset is missing or its
    /// bytes are not valid UTF-8.
    pub(crate) fn system() -> Self {
        match Asset::get("sensitive_word_dict.txt") {
            Some(file) => match std::str::from_utf8(file.data.as_ref()) {
                Ok(text) => DenyWord::from_iter(text.split('\n')),
                Err(_) => Self::empty(),
            },
            None => Self::empty(),
        }
    }

    /// Segments `message` and returns the first token that is a denied word,
    /// or `None` when no token matches.
    pub(crate) fn check(&self, message: &str) -> Option<String> {
        self.jieba
            .cut(message, true)
            .into_iter()
            .find(|token| self.words.contains(*token))
            .map(|hit| hit.to_string())
    }
}

View File

@@ -12,13 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod deny_word;
use crate::deny_word::DenyWord;
use fancy_regex::Regex;
use grok::patterns;
use higress_wasm_rust::log::Log;
use higress_wasm_rust::plugin_wrapper::{HttpContextWrapper, RootContextWrapper};
use higress_wasm_rust::request_wrapper::has_request_body;
use higress_wasm_rust::rule_matcher::{on_configure, RuleMatcher, SharedRuleMatcher};
use jieba_rs::Jieba;
use jsonpath_rust::{JsonPath, JsonPathValue};
use lazy_static::lazy_static;
use proxy_wasm::traits::{Context, HttpContext, RootContext};
@@ -29,7 +31,7 @@ use serde::Deserialize;
use serde::Deserializer;
use serde_json::{json, Value};
use std::cell::RefCell;
use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::ops::DerefMut;
use std::rc::Rc;
use std::str::FromStr;
@@ -47,11 +49,6 @@ const GROK_PATTERN: &str = r"%\{(?<name>(?<pattern>[A-z0-9]+)(?::(?<alias>[A-z0-
#[folder = "res/"]
struct Asset;
/// Deny-word matcher: a `Jieba` segmenter paired with the set of denied words.
#[derive(Default, Debug, Clone)]
struct DenyWord {
/// Segmenter used to cut messages into tokens.
jieba: Jieba,
/// Denied words used for exact token lookup.
words: HashSet<String>,
}
struct System {
deny_word: DenyWord,
grok_regex: Regex,
@@ -227,52 +224,12 @@ static SYSTEM_PATTERNS: &[(&str, &str)] = &[
("IDCARD", r#"\d{17}[0-9xX]|\d{15}"#),
];
impl DenyWord {
    /// Returns a matcher built from `Jieba::empty()` and an empty word set.
    fn empty() -> Self {
        let jieba = Jieba::empty();
        let words = HashSet::new();
        DenyWord { jieba, words }
    }

    /// Builds a matcher from an iterator of words, starting from
    /// [`DenyWord::empty`].
    ///
    /// Each word is converted to a `String` and trimmed; words that are empty
    /// after trimming are skipped. Every remaining word is registered with
    /// the segmenter and stored in the lookup set.
    fn from_iter<T: IntoIterator<Item = impl Into<String>>>(words: T) -> Self {
        let mut built = DenyWord::empty();
        for raw in words {
            let owned: String = raw.into();
            let term = owned.trim();
            if !term.is_empty() {
                built.jieba.add_word(term, None, None);
                built.words.insert(term.to_owned());
            }
        }
        built
    }

    /// Builds the matcher from the embedded `sensitive_word_dict.txt` asset,
    /// one word per `'\n'`-separated line, falling back to
    /// [`DenyWord::empty`] when the asset is missing or not valid UTF-8.
    ///
    /// This is an inherent associated function; in a path call such as
    /// `DenyWord::default()` inherent functions are resolved before trait
    /// methods, so it takes precedence over a derived `Default::default`.
    fn default() -> Self {
        match Asset::get("sensitive_word_dict.txt") {
            Some(file) => match std::str::from_utf8(file.data.as_ref()) {
                Ok(text) => DenyWord::from_iter(text.split('\n')),
                Err(_) => DenyWord::empty(),
            },
            None => DenyWord::empty(),
        }
    }

    /// Segments `message` and returns the first token that is a denied word,
    /// or `None` when no token matches.
    fn check(&self, message: &str) -> Option<String> {
        self.jieba
            .cut(message, true)
            .into_iter()
            .find(|token| self.words.contains(*token))
            .map(|hit| hit.to_string())
    }
}
impl System {
fn new() -> Self {
let grok_regex = Regex::new(GROK_PATTERN).unwrap();
let grok_patterns = BTreeMap::new();
let mut system = System {
deny_word: DenyWord::default(),
deny_word: DenyWord::system(),
grok_regex,
grok_patterns,
};
@@ -335,6 +292,7 @@ impl System {
(ret, ok)
}
}
impl AiDataMaskingRoot {
fn new() -> Self {
AiDataMaskingRoot {
@@ -382,6 +340,7 @@ impl RootContextWrapper<AiDataMaskingConfig> for AiDataMaskingRoot {
}))
}
}
impl AiDataMasking {
fn check_message(&self, message: &str) -> bool {
if let Some(config) = &self.config {
@@ -532,6 +491,7 @@ impl AiDataMasking {
}
impl Context for AiDataMasking {}
impl HttpContext for AiDataMasking {
fn on_http_request_headers(
&mut self,
@@ -607,6 +567,7 @@ impl HttpContext for AiDataMasking {
DataAction::Continue
}
}
impl HttpContextWrapper<AiDataMaskingConfig> for AiDataMasking {
fn log(&self) -> &Log {
&self.log