WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / IT CANNOT LOG IN OR REGISTER ACCOUNTS / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FIND SOMETHING THAT MAY NOT BE SUITABLE FOR EVERYONE, CONTACT THE ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit e4a12fd

Browse files
committed
0.7.36 - wordlist improvements.
1 parent f36e697 commit e4a12fd

14 files changed

+1073
-517
lines changed

Cargo.toml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "rustrict"
33
authors = ["Finn Bear"]
4-
version = "0.7.36"
4+
version = "0.7.37"
55
edition = "2021"
66
license = "MIT OR Apache-2.0"
77
repository = "https://github.com/finnbear/rustrict/"
@@ -32,6 +32,11 @@ name = "trace"
3232
path = "src/trace.rs"
3333
required-features = ["trace"]
3434

35+
[[bin]]
36+
name = "add_profanity"
37+
path = "src/add_profanity.rs"
38+
required-features = ["regex"]
39+
3540
[features]
3641
default = ["censor", "context"]
3742
censor = ["arrayvec", "bitflags", "lazy_static", "itertools", "unicode-normalization", "rustc-hash"]
@@ -76,7 +81,7 @@ serde = {version = "1", features=["derive"], optional = true}
7681
rand = "0.8"
7782
csv = "1.1"
7883
censor_crate = { package = "censor", version = "0.3.0" }
79-
rustrict_old = { package = "rustrict", version = "0.7.24" }
84+
rustrict_old = { package = "rustrict", version = "0.7.36" }
8085
serial_test = "0.5"
8186
stfu_crate = { package = "stfu", version = "0.1.0" }
8287
bincode = "1.3.3"

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,18 @@ false_positives:
1515
replacements:
1616
cargo run --bin replacement_finder --features find_replacements
1717

18+
add_profanity:
19+
cargo run --bin add_profanity --features regex
20+
1821
widths:
1922
cargo run --bin character_analyzer --release --features imageproc,image,rusttype,walkdir,rayon,unicode-width
2023

2124
test:
2225
cargo test --release --features width,pii,serde -- --nocapture
2326

27+
test_curated:
28+
cargo test --release --features width,pii,serde -- --nocapture curated
29+
2430
compare:
2531
COMPARE=1 make test
2632

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ is used as a dataset. Positive accuracy is the percentage of profanity detected
177177

178178
| Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |
179179
|-------|----------|-------------------|-------------------|------|
180-
| [rustrict](https://crates.io/crates/rustrict) | 79.99% | 94.02% | 76.49% | 10s |
180+
| [rustrict](https://crates.io/crates/rustrict) | 79.53% | 94.08% | 75.90% | 10s |
181181
| [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s |
182182
| [stfu](https://crates.io/crates/stfu) | 91.74% | 77.69% | 95.25% | 45s |
183183

src/add_profanity.rs

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#![feature(lazy_cell)]
2+
3+
use std::{cmp::Ordering, collections::BTreeMap, io::stdin, str::FromStr, sync::LazyLock};
4+
5+
/// Location of the CSV word list that this tool reads and rewrites.
/// The path is relative, so it is resolved against the current working directory.
const PATH: &str = "./src/profanity.csv";
6+
7+
pub fn main() {
8+
let file = std::fs::read_to_string(PATH).unwrap();
9+
10+
let mut entries = BTreeMap::<Phrase, Severity>::new();
11+
12+
let header = file.lines().next().unwrap().to_owned();
13+
14+
for line in file.lines().skip(1) {
15+
let (phrase, severity) = parse_line(line);
16+
17+
let old = entries.insert(phrase, severity);
18+
assert!(old.is_none());
19+
}
20+
21+
export(&header, &entries);
22+
23+
let mut line = String::new();
24+
while let Ok(_) = stdin().read_line(&mut line) {
25+
let (phrase, severity) = parse_line(&line[0..line.len() - 1]);
26+
27+
println!("adding {} with {severity:?}", phrase.0);
28+
29+
let old = entries.insert(phrase, severity);
30+
assert!(old.is_none());
31+
32+
export(&header, &entries);
33+
34+
line.clear();
35+
}
36+
}
37+
38+
/// The five numeric columns that follow the phrase on every CSV row, each parsed as a `u8`.
type Severity = [u8; 5];
39+
40+
fn parse_line(line: &str) -> (Phrase, Severity) {
41+
let (phrase, severity) = line.split_once(',').expect(line);
42+
(
43+
Phrase(phrase.to_owned()),
44+
severity
45+
.split(',')
46+
.map(|n| u8::from_str(n).expect(line))
47+
.collect::<Vec<_>>()
48+
.try_into()
49+
.expect(line),
50+
)
51+
}
52+
53+
fn export(header: &str, entries: &BTreeMap<Phrase, Severity>) {
54+
let mut output = format!("{header}\n");
55+
for (phrase, severity) in entries {
56+
use std::fmt::Write;
57+
writeln!(
58+
output,
59+
"{},{}",
60+
phrase.0,
61+
severity
62+
.iter()
63+
.map(|n| n.to_string())
64+
.collect::<Vec<_>>()
65+
.join(",")
66+
)
67+
.unwrap();
68+
}
69+
70+
std::fs::write(PATH, output).unwrap();
71+
}
72+
73+
fn is_emoji(input: &str) -> bool {
74+
static EMOJI_REGEX: LazyLock<regex::Regex> = LazyLock::new(|| {
75+
const EMOJI_REGEX: &str = r"^\p{Extended_Pictographic}(\p{EMod}|\x{FE0F}\x{20E3}?|[\x{E0020}-\x{E007E}]+\x{E007F})?(\x{200D}(\p{RI}\p{RI}|\p{Extended_Pictographic}(\p{EMod}|\x{FE0F}\x{20E3}?|[\x{E0020}-\x{E007E}]+\x{E007F})?))*$";
76+
regex::Regex::new(EMOJI_REGEX).unwrap()
77+
});
78+
79+
EMOJI_REGEX.is_match(&input)
80+
}
81+
82+
/// Returns `true` when `c` lies in one of the Cyrillic-related code point ranges
/// listed in `BLOCKS`.
fn is_cyrillic(c: char) -> bool {
    /// Inclusive `(first, last)` code point ranges.
    const BLOCKS: [(char, char); 5] = [
        ('\u{0400}', '\u{04FF}'), // Cyrillic
        ('\u{0500}', '\u{052F}'), // Cyrillic Supplementary
        ('\u{2DE0}', '\u{2DFF}'), // Cyrillic Extended-A
        ('\u{A640}', '\u{A69F}'), // Cyrillic Extended-B
        ('\u{FE2E}', '\u{FE2F}'), // Combining Half Marks (some used with Cyrillic)
    ];

    BLOCKS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
89+
90+
/// Returns `true` when `c` falls inside one of the CJK ideograph code point ranges
/// matched below.
pub fn is_cjk(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'         // CJK Unified Ideographs
            | '\u{3400}'..='\u{4DBF}'   // CJK Unified Ideographs Extension A
            | '\u{20000}'..='\u{2A6DF}' // CJK Unified Ideographs Extension B
            | '\u{2A700}'..='\u{2B73F}' // CJK Unified Ideographs Extension C
            | '\u{2B740}'..='\u{2B81F}' // CJK Unified Ideographs Extension D
            | '\u{2B820}'..='\u{2CEAF}' // CJK Unified Ideographs Extension E
            | '\u{F900}'..='\u{FAFF}'   // CJK Compatibility Ideographs
            | '\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Ideographs Supplement
    )
}
101+
102+
/// A word-list phrase, stored exactly as it appears in the first CSV column.
///
/// Equality is derived from the raw string, whereas the hand-written `Ord`
/// implementation compares a normalized form (leading whitespace removed, ASCII
/// lower-cased) together with the phrase's `Class`. Two phrases can therefore be
/// unequal under `==` yet compare as `Ordering::Equal`.
#[derive(Debug, Eq, PartialEq, Clone)]
103+
struct Phrase(String);
104+
105+
/// Coarse script category of a phrase, used as the primary sort key for the word list.
///
/// The derived `Ord` follows declaration order, so the order of the variants below
/// determines the order of the groups in the exported file. Do not reorder them
/// unless that change in the output is intended.
#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
enum Class {
    /// At least one character is matched by `is_emoji`.
    AnyEmoji,
    /// None of the other categories applied.
    Other,
    /// Selected by the ASCII check in `Phrase::class`.
    AllAscii,
    /// At least one character is matched by `is_cyrillic`.
    AnyCyrillic,
    /// At least one character is matched by `is_cjk`.
    AnyCjk,
}
113+
114+
impl Phrase {
115+
fn trim(&self) -> String {
116+
self.0.trim_start().to_ascii_lowercase()
117+
}
118+
119+
fn class(&self) -> Class {
120+
let s = self.trim();
121+
if s.chars().any(|c| is_emoji(&format!("{c}"))) {
122+
Class::AnyEmoji
123+
} else if s.bytes().take(3).collect::<Vec<_>>().is_ascii() {
124+
Class::AllAscii
125+
} else if s.chars().any(is_cyrillic) {
126+
Class::AnyCyrillic
127+
} else if s.chars().any(is_cjk) {
128+
Class::AnyCjk
129+
} else {
130+
Class::Other
131+
}
132+
}
133+
}
134+
135+
impl Ord for Phrase {
136+
fn cmp(&self, other: &Self) -> Ordering {
137+
(self.class(), self.trim()).cmp(&(other.class(), other.trim()))
138+
}
139+
}
140+
141+
impl PartialOrd for Phrase {
142+
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
143+
Some(self.cmp(other))
144+
}
145+
}

src/censor.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -566,8 +566,9 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
566566
// space.
567567
// ( and ) are for ignoring appositive phrases.
568568
// Checking node.last is to collapse multiple spaces into one
569-
let new_space = matches!(c, ' ' | '.' | ',' | ':' | ';' | '…' | '(' | ')' | '_' | '-')
570-
&& m.node.last != Some(' ');
569+
let new_space =
570+
matches!(c, ' ' | '.' | ',' | ':' | ';' | '…' | '(' | ')' | '_' | '-')
571+
&& m.node.last != Some(' ');
571572
let new_repetition: bool = !new_space && c == m.last;
572573
let new_skip = !new_space && skippable && !ignore_sep && !new_repetition;
573574
// dil -> dii

src/dictionary_blocklist.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ effs
276276
ejaculat(.*)
277277
ejaculation
278278
el
279+
enslavers
279280
eunuchs
280281
ens
281282
ep
@@ -496,6 +497,7 @@ lowlifes
496497
m
497498
making love
498499
male squirting
500+
marriage
499501
masochists
500502
massive wood
501503
master race

src/dictionary_extra.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#8
22
# of
33
(until
4+
(litera
45
2 secs
56
3 secs
67
4 secs
@@ -16,6 +17,7 @@
1617
0 secs
1718
300 bot
1819
600 bot
20+
t watch
1921
twinkie
2022
two secs
2123
three secs
@@ -53,6 +55,7 @@ bishi
5355
blogger
5456
bob's
5557
bonjor
58+
but troll
5659
of elchasai
5760
braig
5861
brain cell
@@ -148,6 +151,7 @@ in 199
148151
in june
149152
irl
150153
isn't it
154+
it it
151155
it, its
152156
it's a hole
153157
it's ex
@@ -163,6 +167,7 @@ my jewish
163167
is jewish
164168
or jewish
165169
to jewish
170+
hi just
166171
jeff
167172
jewish ancestry
168173
jewish background
@@ -177,18 +182,35 @@ katyusha the
177182
kian
178183
kill ike
179184
killian
185+
kiss of death
180186
kshatr
181187
last? it
182188
left it
183189
little hovercraft
184190
lmao
185191
lol
192+
lol!
193+
lol.
194+
lol. i
195+
lol, i
196+
lol i
197+
lol. j
198+
lol, j
199+
lol j
200+
lol. l
201+
lol, l
202+
lol l
203+
lol. y
204+
lol, y
205+
lol y
206+
lolipop
186207
magnacumlaude
187208
maine coon
188209
make a hole
189210
s expired
190211
minigame
191212
mini game
213+
n 1997
192214
n't eat
193215
negativly
194216
ngad
@@ -259,6 +281,7 @@ tit for tat
259281
titch
260282
title section
261283
tito
284+
t to heli
262285
to heli
263286
too heli
264287
to helicopter
@@ -275,6 +298,7 @@ virgin group
275298
virgin islands
276299
wassup
277300
wasn't it
301+
we did our
278302
wish i t
279303
wouldn't it
280304
xD i do

0 commit comments

Comments
 (0)