|
| 1 | +#![feature(lazy_cell)] |
| 2 | + |
| 3 | +use std::{cmp::Ordering, collections::BTreeMap, io::stdin, str::FromStr, sync::LazyLock}; |
| 4 | + |
| 5 | +const PATH: &str = "./src/profanity.csv"; |
| 6 | + |
| 7 | +pub fn main() { |
| 8 | + let file = std::fs::read_to_string(PATH).unwrap(); |
| 9 | + |
| 10 | + let mut entries = BTreeMap::<Phrase, Severity>::new(); |
| 11 | + |
| 12 | + let header = file.lines().next().unwrap().to_owned(); |
| 13 | + |
| 14 | + for line in file.lines().skip(1) { |
| 15 | + let (phrase, severity) = parse_line(line); |
| 16 | + |
| 17 | + let old = entries.insert(phrase, severity); |
| 18 | + assert!(old.is_none()); |
| 19 | + } |
| 20 | + |
| 21 | + export(&header, &entries); |
| 22 | + |
| 23 | + let mut line = String::new(); |
| 24 | + while let Ok(_) = stdin().read_line(&mut line) { |
| 25 | + let (phrase, severity) = parse_line(&line[0..line.len() - 1]); |
| 26 | + |
| 27 | + println!("adding {} with {severity:?}", phrase.0); |
| 28 | + |
| 29 | + let old = entries.insert(phrase, severity); |
| 30 | + assert!(old.is_none()); |
| 31 | + |
| 32 | + export(&header, &entries); |
| 33 | + |
| 34 | + line.clear(); |
| 35 | + } |
| 36 | +} |
| 37 | + |
| 38 | +type Severity = [u8; 5]; |
| 39 | + |
| 40 | +fn parse_line(line: &str) -> (Phrase, Severity) { |
| 41 | + let (phrase, severity) = line.split_once(',').expect(line); |
| 42 | + ( |
| 43 | + Phrase(phrase.to_owned()), |
| 44 | + severity |
| 45 | + .split(',') |
| 46 | + .map(|n| u8::from_str(n).expect(line)) |
| 47 | + .collect::<Vec<_>>() |
| 48 | + .try_into() |
| 49 | + .expect(line), |
| 50 | + ) |
| 51 | +} |
| 52 | + |
| 53 | +fn export(header: &str, entries: &BTreeMap<Phrase, Severity>) { |
| 54 | + let mut output = format!("{header}\n"); |
| 55 | + for (phrase, severity) in entries { |
| 56 | + use std::fmt::Write; |
| 57 | + writeln!( |
| 58 | + output, |
| 59 | + "{},{}", |
| 60 | + phrase.0, |
| 61 | + severity |
| 62 | + .iter() |
| 63 | + .map(|n| n.to_string()) |
| 64 | + .collect::<Vec<_>>() |
| 65 | + .join(",") |
| 66 | + ) |
| 67 | + .unwrap(); |
| 68 | + } |
| 69 | + |
| 70 | + std::fs::write(PATH, output).unwrap(); |
| 71 | +} |
| 72 | + |
| 73 | +fn is_emoji(input: &str) -> bool { |
| 74 | + static EMOJI_REGEX: LazyLock<regex::Regex> = LazyLock::new(|| { |
| 75 | + const EMOJI_REGEX: &str = r"^\p{Extended_Pictographic}(\p{EMod}|\x{FE0F}\x{20E3}?|[\x{E0020}-\x{E007E}]+\x{E007F})?(\x{200D}(\p{RI}\p{RI}|\p{Extended_Pictographic}(\p{EMod}|\x{FE0F}\x{20E3}?|[\x{E0020}-\x{E007E}]+\x{E007F})?))*$"; |
| 76 | + regex::Regex::new(EMOJI_REGEX).unwrap() |
| 77 | + }); |
| 78 | + |
| 79 | + EMOJI_REGEX.is_match(&input) |
| 80 | +} |
| 81 | + |
/// Returns `true` if `c` lies in one of the Cyrillic code-point ranges below.
fn is_cyrillic(c: char) -> bool {
    match u32::from(c) {
        0x0400..=0x04FF => true, // Cyrillic
        0x0500..=0x052F => true, // Cyrillic Supplementary
        0x2DE0..=0x2DFF => true, // Cyrillic Extended-A
        0xA640..=0xA69F => true, // Cyrillic Extended-B
        0xFE2E..=0xFE2F => true, // Combining Half Marks (some used with Cyrillic)
        _ => false,
    }
}
| 89 | + |
/// Returns `true` if `c` is a CJK ideograph in one of the ranges listed below.
pub fn is_cjk(c: char) -> bool {
    matches!(
        u32::from(c),
        0x4E00..=0x9FFF // CJK Unified Ideographs
            | 0x3400..=0x4DBF // Extension A
            | 0x20000..=0x2A6DF // Extension B
            | 0x2A700..=0x2B73F // Extension C
            | 0x2B740..=0x2B81F // Extension D
            | 0x2B820..=0x2CEAF // Extension E
            | 0xF900..=0xFAFF // CJK Compatibility Ideographs
            | 0x2F800..=0x2FA1F // CJK Compatibility Ideographs Supplement
    )
}
| 101 | + |
| 102 | +#[derive(Debug, Eq, PartialEq, Clone)] |
| 103 | +struct Phrase(String); |
| 104 | + |
/// Coarse category assigned to a phrase.
///
/// The derived ordering follows the declaration order of the variants, so
/// reordering them changes how values of this type compare.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
enum Class {
    /// Compares lowest.
    AnyEmoji,
    /// Fallback category.
    Other,
    /// ASCII text.
    AllAscii,
    /// Text containing Cyrillic characters.
    AnyCyrillic,
    /// Text containing CJK ideographs; compares highest.
    AnyCjk,
}
| 113 | + |
| 114 | +impl Phrase { |
| 115 | + fn trim(&self) -> String { |
| 116 | + self.0.trim_start().to_ascii_lowercase() |
| 117 | + } |
| 118 | + |
| 119 | + fn class(&self) -> Class { |
| 120 | + let s = self.trim(); |
| 121 | + if s.chars().any(|c| is_emoji(&format!("{c}"))) { |
| 122 | + Class::AnyEmoji |
| 123 | + } else if s.bytes().take(3).collect::<Vec<_>>().is_ascii() { |
| 124 | + Class::AllAscii |
| 125 | + } else if s.chars().any(is_cyrillic) { |
| 126 | + Class::AnyCyrillic |
| 127 | + } else if s.chars().any(is_cjk) { |
| 128 | + Class::AnyCjk |
| 129 | + } else { |
| 130 | + Class::Other |
| 131 | + } |
| 132 | + } |
| 133 | +} |
| 134 | + |
| 135 | +impl Ord for Phrase { |
| 136 | + fn cmp(&self, other: &Self) -> Ordering { |
| 137 | + (self.class(), self.trim()).cmp(&(other.class(), other.trim())) |
| 138 | + } |
| 139 | +} |
| 140 | + |
| 141 | +impl PartialOrd for Phrase { |
| 142 | + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
| 143 | + Some(self.cmp(other)) |
| 144 | + } |
| 145 | +} |
0 commit comments