finnbear
diff --git a/‎Cargo.toml‎
Lines changed: 7 additions & 2 deletions b/‎Cargo.toml‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎Makefile‎
Lines changed: 6 additions & 0 deletions b/‎Makefile‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/add_profanity.rs‎
Lines changed: 145 additions & 0 deletions b/‎src/add_profanity.rs‎
Lines changed: 145 additions & 0 deletions
diff --git a/‎src/censor.rs‎
Lines changed: 3 additions & 2 deletions b/‎src/censor.rs‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎src/dictionary_blocklist.txt‎
Lines changed: 2 additions & 0 deletions b/‎src/dictionary_blocklist.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/dictionary_extra.txt‎
Lines changed: 24 additions & 0 deletions b/‎src/dictionary_extra.txt‎
Lines changed: 24 additions & 0 deletions
@@ -1,7 +1,7 @@
 [package]
 name = "rustrict"
 authors = ["Finn Bear"]
-version = "0.7.36"
+version = "0.7.37"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/finnbear/rustrict/"
@@ -32,6 +32,11 @@ name = "trace"
 path = "src/trace.rs"
 required-features = ["trace"]
 
+[[bin]]
+name = "add_profanity"
+path = "src/add_profanity.rs"
+required-features = ["regex"]
+
 [features]
 default = ["censor", "context"]
 censor = ["arrayvec", "bitflags", "lazy_static", "itertools", "unicode-normalization", "rustc-hash"]
@@ -76,7 +81,7 @@ serde = {version = "1", features=["derive"], optional = true}
 rand = "0.8"
 csv = "1.1"
 censor_crate = { package = "censor", version = "0.3.0" }
-rustrict_old = { package = "rustrict", version = "0.7.24" }
+rustrict_old = { package = "rustrict", version = "0.7.36" }
 serial_test = "0.5"
 stfu_crate = { package = "stfu", version = "0.1.0" }
 bincode = "1.3.3"
 
@@ -15,12 +15,18 @@ false_positives:
 replacements:
 	cargo run --bin replacement_finder --features find_replacements
 
+add_profanity:
+	cargo run --bin add_profanity --features regex
+
 widths:
 	cargo run --bin character_analyzer --release --features imageproc,image,rusttype,walkdir,rayon,unicode-width
 
 test:
 	cargo test --release --features width,pii,serde -- --nocapture
 
+test_curated:
+	cargo test --release --features width,pii,serde -- --nocapture curated
+
 compare:
 	COMPARE=1 make test
 
 
@@ -177,7 +177,7 @@ is used as a dataset. Positive accuracy is the percentage of profanity detected
 
 | Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |
 |-------|----------|-------------------|-------------------|------|
-| [rustrict](https://crates.io/crates/rustrict) | 79.99% | 94.02% | 76.49% | 10s |
+| [rustrict](https://crates.io/crates/rustrict) | 79.53% | 94.08% | 75.90% | 10s |
 | [censor](https://crates.io/crates/censor) | 76.16%   | 72.76%            | 77.01%            | 23s  |
 | [stfu](https://crates.io/crates/stfu) | 91.74% | 77.69% | 95.25% | 45s |
 
 
@@ -0,0 +1,145 @@
+#![feature(lazy_cell)]
+
+use std::{cmp::Ordering, collections::BTreeMap, io::stdin, str::FromStr, sync::LazyLock};
+
+const PATH: &str = "./src/profanity.csv";
+
+/// Loads the profanity CSV at `PATH`, rewrites it in sorted order, then reads
+/// `phrase,n,n,n,n,n` lines from stdin, inserting each one and rewriting the file
+/// after every addition.
+///
+/// Returns once stdin reaches end-of-file or a read fails.
+///
+/// # Panics
+///
+/// Panics if the file cannot be read or written, if it has no header line, if any
+/// line is malformed, or if a phrase compares equal to one that is already present.
+pub fn main() {
+    let file = std::fs::read_to_string(PATH).unwrap();
+
+    let mut entries = BTreeMap::<Phrase, Severity>::new();
+
+    let header = file.lines().next().unwrap().to_owned();
+
+    for line in file.lines().skip(1) {
+        let (phrase, severity) = parse_line(line);
+
+        let old = entries.insert(phrase, severity);
+        assert!(old.is_none());
+    }
+
+    export(&header, &entries);
+
+    let mut line = String::new();
+    // `read_line` signals end-of-file with `Ok(0)`, so a zero-byte read must end the loop;
+    // matching on `Ok(_)` alone would spin forever and underflow on the empty buffer.
+    while matches!(stdin().read_line(&mut line), Ok(n) if n > 0) {
+        // Strip only the line terminator (`\n` or `\r\n`). The final line of the input may
+        // lack one, and leading whitespace is significant for phrases, so do not trim more.
+        let entry = line.strip_suffix('\n').unwrap_or(&line);
+        let entry = entry.strip_suffix('\r').unwrap_or(entry);
+        let (phrase, severity) = parse_line(entry);
+
+        println!("adding {} with {severity:?}", phrase.0);
+
+        let old = entries.insert(phrase, severity);
+        assert!(old.is_none());
+
+        export(&header, &entries);
+
+        line.clear();
+    }
+}
+
+type Severity = [u8; 5];
+
+/// Splits one CSV row into its phrase (everything before the first comma) and its
+/// severity columns (the remaining comma-separated `u8` values).
+///
+/// # Panics
+///
+/// Panics, using the offending line as the message, if there is no comma, if a column
+/// is not a valid `u8`, or if the number of columns does not match `Severity`.
+fn parse_line(line: &str) -> (Phrase, Severity) {
+    let (name, columns) = line.split_once(',').expect(line);
+
+    let levels: Vec<u8> = columns
+        .split(',')
+        .map(|field| u8::from_str(field).expect(line))
+        .collect();
+    let severity: Severity = levels.try_into().expect(line);
+
+    (Phrase(name.to_owned()), severity)
+}
+
+/// Serializes `header` followed by one `phrase,n,n,...` row per entry, in map order,
+/// and overwrites the file at `PATH` with the result.
+///
+/// # Panics
+///
+/// Panics if the file cannot be written.
+fn export(header: &str, entries: &BTreeMap<Phrase, Severity>) {
+    use std::fmt::Write;
+
+    let mut output = format!("{header}\n");
+    for (phrase, severity) in entries {
+        output.push_str(&phrase.0);
+        // Each severity value is preceded by a comma, which also separates it from the phrase.
+        for level in severity {
+            write!(output, ",{level}").unwrap();
+        }
+        output.push('\n');
+    }
+
+    std::fs::write(PATH, output).unwrap();
+}
+
+/// Returns whether `input`, taken as a whole, matches the anchored emoji-sequence pattern
+/// below. The regex is compiled once, on first use.
+fn is_emoji(input: &str) -> bool {
+    const PATTERN: &str = r"^\p{Extended_Pictographic}(\p{EMod}|\x{FE0F}\x{20E3}?|[\x{E0020}-\x{E007E}]+\x{E007F})?(\x{200D}(\p{RI}\p{RI}|\p{Extended_Pictographic}(\p{EMod}|\x{FE0F}\x{20E3}?|[\x{E0020}-\x{E007E}]+\x{E007F})?))*$";
+    static COMPILED: LazyLock<regex::Regex> =
+        LazyLock::new(|| regex::Regex::new(PATTERN).unwrap());
+
+    COMPILED.is_match(input)
+}
+
+/// Returns whether `c` falls inside one of the Unicode blocks listed in `BLOCKS`.
+fn is_cyrillic(c: char) -> bool {
+    const BLOCKS: [std::ops::RangeInclusive<char>; 5] = [
+        '\u{0400}'..='\u{04FF}', // Cyrillic
+        '\u{0500}'..='\u{052F}', // Cyrillic Supplementary
+        '\u{2DE0}'..='\u{2DFF}', // Cyrillic Extended-A
+        '\u{A640}'..='\u{A69F}', // Cyrillic Extended-B
+        '\u{FE2E}'..='\u{FE2F}', // Combining Half Marks (some used with Cyrillic)
+    ];
+
+    BLOCKS.iter().any(|block| block.contains(&c))
+}
+
+/// Returns whether the code point of `c` lies in any of the inclusive ranges listed below.
+pub fn is_cjk(c: char) -> bool {
+    matches!(
+        u32::from(c),
+        0x4E00..=0x9FFF
+            | 0x3400..=0x4DBF
+            | 0x20000..=0x2A6DF
+            | 0x2A700..=0x2B73F
+            | 0x2B740..=0x2B81F
+            | 0x2B820..=0x2CEAF
+            | 0xF900..=0xFAFF
+            | 0x2F800..=0x2FA1F
+    )
+}
+
+/// A dictionary phrase, stored exactly as it appears in the CSV.
+///
+/// Equality is defined through `Ord::cmp` instead of being derived from the raw string, so
+/// that `a == b` holds exactly when `a.cmp(&b)` is `Ordering::Equal`, as the `Ord` and
+/// `PartialOrd` contracts require. A derived `PartialEq` would report phrases that differ
+/// only in leading whitespace or ASCII case as unequal even though they compare as equal
+/// and therefore occupy the same `BTreeMap` slot.
+#[derive(Debug, Eq, Clone)]
+struct Phrase(String);
+
+impl PartialEq for Phrase {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+/// Coarse category of a phrase, used as the first component of the `Phrase` sort key.
+///
+/// The derived `Ord` follows declaration order, so the order of the variants below is the
+/// order in which the categories are sorted; do not reorder them casually.
+#[derive(Eq, PartialEq, PartialOrd, Ord)]
+enum Class {
+    /// At least one character matches the emoji pattern.
+    AnyEmoji,
+    /// None of the other categories applied.
+    Other,
+    /// The first three bytes (or fewer, for shorter phrases) are ASCII.
+    AllAscii,
+    /// At least one character is in a Cyrillic block.
+    AnyCyrillic,
+    /// At least one character is in a CJK range.
+    AnyCjk,
+}
+
+impl Phrase {
+    /// Returns the comparison key: the phrase without leading whitespace, with ASCII
+    /// letters lowercased.
+    fn trim(&self) -> String {
+        let without_leading = self.0.trim_start();
+        without_leading.to_ascii_lowercase()
+    }
+
+    /// Classifies the comparison key. The checks run in priority order and the first
+    /// one that applies wins.
+    fn class(&self) -> Class {
+        let key = self.trim();
+
+        // Each character is tested on its own as a one-character string.
+        if key.chars().any(|c| is_emoji(c.encode_utf8(&mut [0; 4]))) {
+            return Class::AnyEmoji;
+        }
+        // Only the first three bytes are inspected.
+        if key.bytes().take(3).all(|byte| byte.is_ascii()) {
+            return Class::AllAscii;
+        }
+        if key.chars().any(is_cyrillic) {
+            return Class::AnyCyrillic;
+        }
+        if key.chars().any(is_cjk) {
+            return Class::AnyCjk;
+        }
+        Class::Other
+    }
+}
+
+impl Ord for Phrase {
+    /// Orders phrases by class first, then by the trimmed, lowercased key.
+    fn cmp(&self, other: &Self) -> Ordering {
+        let by_class = self.class().cmp(&other.class());
+        by_class.then_with(|| self.trim().cmp(&other.trim()))
+    }
+}
+
+impl PartialOrd for Phrase {
+    /// Always `Some`: delegates to the total order defined by `Ord`.
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        let ordering = Ord::cmp(self, other);
+        Some(ordering)
+    }
+}
@@ -566,8 +566,9 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
                         // space.
                         // ( and ) are for ignoring appositive phrases.
                         // Checking node.last is to collapse multiple spaces into one
-                        let new_space = matches!(c, ' ' | '.' | ',' | ':' | ';' | '…' | '(' | ')' | '_' | '-')
-                            && m.node.last != Some(' ');
+                        let new_space =
+                            matches!(c, ' ' | '.' | ',' | ':' | ';' | '…' | '(' | ')' | '_' | '-')
+                                && m.node.last != Some(' ');
                         let new_repetition: bool = !new_space && c == m.last;
                         let new_skip = !new_space && skippable && !ignore_sep && !new_repetition;
                         // dil -> dii
 
@@ -276,6 +276,7 @@ effs
 ejaculat(.*)
 ejaculation
 el
+enslavers
 eunuchs
 ens
 ep
@@ -496,6 +497,7 @@ lowlifes
 m
 making love
 male squirting
+marriage
 masochists
 massive wood
 master race
 
@@ -1,6 +1,7 @@
 #8
 # of
 (until
+(litera
 2 secs
 3 secs
 4 secs
@@ -16,6 +17,7 @@
 0 secs
 300 bot
 600 bot
+t watch
 twinkie
 two secs
 three secs
@@ -53,6 +55,7 @@ bishi
 blogger
 bob's
 bonjor
+but troll
 of elchasai
 braig
 brain cell
@@ -148,6 +151,7 @@ in 199
 in june
 irl
 isn't it
+it it
 it, its
 it's a hole
 it's ex
@@ -163,6 +167,7 @@ my jewish
 is jewish
 or jewish
 to jewish
+hi just
 jeff
 jewish ancestry
 jewish background
@@ -177,18 +182,35 @@ katyusha the
 kian
 kill ike
 killian
+kiss of death
 kshatr
 last? it
 left it
 little hovercraft
 lmao
 lol
+lol!
+lol.
+lol. i
+lol, i
+lol i
+lol. j
+lol, j
+lol j
+lol. l
+lol, l
+lol l
+lol. y
+lol, y
+lol y
+lolipop
 magnacumlaude
 maine coon
 make a hole
 s expired
 minigame
 mini game
+n 1997
 n't eat
 negativly
 ngad
@@ -259,6 +281,7 @@ tit for tat
 titch
 title section
 tito
+t to heli
 to heli
 too heli
 to helicopter
@@ -275,6 +298,7 @@ virgin group
 virgin islands
 wassup
 wasn't it
+we did our
 wish i t
 wouldn't it
 xD i do