WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / YOU CANNOT LOG IN OR REGISTER ACCOUNTS HERE / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FIND SOMETHING THAT MAY NOT BE SUITABLE FOR EVERYONE, CONTACT THE ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit c98b60e

Browse files
committed
Incremental improvement to word/character data.
1 parent 1e2219a commit c98b60e

16 files changed

+690
-575
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "rustrict"
33
authors = ["Finn Bear"]
4-
version = "0.7.35"
4+
version = "0.7.36"
55
edition = "2021"
66
license = "MIT OR Apache-2.0"
77
repository = "https://github.com/finnbear/rustrict/"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ is used as a dataset. Positive accuracy is the percentage of profanity detected
177177

178178
| Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |
179179
|-------|----------|-------------------|-------------------|------|
180-
| [rustrict](https://crates.io/crates/rustrict) | 80.00% | 94.01% | 76.50% | 9s |
180+
| [rustrict](https://crates.io/crates/rustrict) | 79.99% | 94.02% | 76.49% | 10s |
181181
| [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s |
182182
| [stfu](https://crates.io/crates/stfu) | 91.74% | 77.69% | 95.25% | 45s |
183183
| [profane-rs](https://crates.io/crates/profane-rs) | 80.47% | 73.79% | 82.14% | 52s |

src/character_analyzer.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ fn main() {
5050
'𒐫' => 80,
5151
'𒈙' => 35,
5252
'༺' | '༻' => 25,
53+
'𒀱' => 45,
54+
'۞' => 19,
5355
_ => {
5456
let max_width = (max_width(c, &fonts) as f32 / 100f32).round() as u16;
5557
if max_width > u8::MAX as u16 {

src/character_widths.bin

3 Bytes
Binary file not shown.

src/dictionary_blacklist.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ cancer
177177
castrates
178178
castrations
179179
ch
180+
child touche
180181
china flu
181182
china plague
182183
china virus

src/dictionary_extra.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ betcha
5151
bigger s
5252
bishi
5353
blogger
54+
bob's
5455
bonjor
5556
of elchasai
5657
braig
@@ -125,6 +126,8 @@ hi tirp
125126
hiu
126127
hi u
127128
hi v
129+
hochst
130+
hoechst
128131
ho ho ho
129132
honkeytonk
130133
honkey tonk
@@ -228,6 +231,7 @@ shoehorn your
228231
shouldn't it
229232
since 1
230233
since l
234+
sombra
231235
speed*
232236
sperm whale
233237
spick and span
@@ -240,6 +244,7 @@ tally ho
240244
tally-ho
241245
tea the
242246
test test test
247+
than a lot
243248
then i guess
244249
then talk
245250
then, talk

src/false_positives.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,7 @@ amphitrichate
537537
ample assured
538538
ample peer
539539
ampullar
540+
an a lot
540541
an ai
541542
an al
542543
an album
@@ -2735,6 +2736,7 @@ boards lut
27352736
boards perm
27362737
boards seeks
27372738
boastful
2739+
bob's
27382740
bobbish
27392741
bobsled
27402742
bobsleigh
@@ -3494,6 +3496,17 @@ buzz hit
34943496
buzz lut
34953497
buzz off
34963498
buzz perm
3499+
buzz syd
3500+
buzz sym
3501+
buzz syn
3502+
buzz syr
3503+
buzz sys
3504+
buzz ya
3505+
buzz ye
3506+
buzz yi
3507+
buzz yo
3508+
buzz yu
3509+
buzzy
34973510
bytes cumulative
34983511
bytes ext
34993512
bytes hilt
@@ -3816,6 +3829,7 @@ child dicke
38163829
child licking
38173830
child ongoing
38183831
child stew
3832+
child touched
38193833
childrens cumulative
38203834
childrens ext
38213835
childrens hilt
@@ -7945,6 +7959,7 @@ hobbies seeks
79457959
hoc
79467960
hod
79477961
hoecake
7962+
hoechst
79487963
hoed
79497964
hoeful
79507965
hoeing
@@ -11726,6 +11741,7 @@ muscles perm
1172611741
muscles seeks
1172711742
musklike
1172811743
muskroot
11744+
muslim kill
1172911745
musselcracker
1173011746
mussuck
1173111747
mustard
@@ -16807,6 +16823,7 @@ solo ser
1680716823
solo vary
1680816824
solo zer
1680916825
somalia bomb
16826+
sombra
1681016827
something hote
1681116828
something hotmail
1681216829
something hottest
@@ -19547,6 +19564,7 @@ weenier
1954719564
weeniest
1954819565
weet back
1954919566
weet dream
19567+
weet girl
1955019568
weeweed
1955119569
weeweeing
1955219570
weightier

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ pub fn is_whitespace(c: char) -> bool {
9191
|| c.is_format()
9292
|| matches!(
9393
c,
94-
'\u{115F}' | '\u{1160}' | '\u{2800}' | '\u{3164}' | '\u{FFA0}' | '\u{FFFC}'
94+
'\u{115F}' | '\u{1160}' | '\u{20DD}' | '\u{2800}' | '\u{3164}' | '\u{FFA0}' | '\u{FFFC}'
9595
)
9696
}
9797

src/pii.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ lazy_static! {
88
static ref EMAIL_ADDRESS : Regex = Regex::new(r#"(?i)[a-z0-9_\-]{3,}\s*(@|[\[\(\s]at[\s\)\]])\s*[a-z0-9_\-]{5,}\s*(\.|dot)\s*(com|net|org|gov|biz|co|us|ru|uk|de|se|to|tv|io|info|online|site)"#).unwrap();
99
//static ref ADDRESS : Regex = Regex::new(r#"(?i)\d+[ ](?:[A-Za-z0-9\.-]+ )+(?:Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St)\.?(\s+#[0-9]{1,5})?"#).unwrap();
1010
static ref NAME : Regex = Regex::new(r#"(?i)(real\s)?name\s+is:?\s[a-zA-Z]+(\s[a-zA-z]+)?"#).unwrap();
11-
static ref URL : Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]{3,}\.(com|net|org|gov|biz|co|us|ru|uk|de|se|to|tv|io|info|online|site|link)"#).unwrap();
11+
static ref URL : Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]{3,}\.(com|net|org|gov|biz|co|us|ru|cc|uk|de|se|to|tv|io|gg|info|online|site|link)"#).unwrap();
1212
}
1313

1414
/// Returns [`s`] with personally-identifiable information censored out, and a `true` if

0 commit comments

Comments
 (0)