fulcrumgenomics · nh13 · Mar 14, 2025 · theJasonFan · May 20, 2025 · theJasonFan
@@ -30,7 +30,6 @@ path = "src/bin/main.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-ahash = "0.8.11"
 anyhow = "1.0.38"
 bstr = "1.0.1"
 clap = { version = "4.0.25", features = ["derive"] }
@@ -49,6 +48,7 @@ serde-aux = "4.1.2"
 seq_io = "0.3.1"
 thiserror = "1.0.37"
 proglog = {version = "0.3.0", features = ["pretty_counts"] }
+rustc-hash = "2.1.1"
 
 [dev-dependencies]
 csv = "1.1.6"

@@ -161,7 +161,7 @@ Options:
           [default: 2]
 
   -t, --threads <THREADS>
-          The number of threads to use. Cannot be less than 3
+          The number of threads to use. Cannot be less than 5
 
           [default: 8]
 
@@ -176,10 +176,10 @@ Options:
           1. `too-few-bases`: there are too few bases or qualities to extract given the read structures.  For example, if a read is 8bp long but the read structure is `10B`, or if a read is empty and the read structure is `+T`.
 
   -h, --help
-          Print help information (use `-h` for a summary)
+          Print help (see a summary with '-h')
 
   -V, --version
-          Print version information
+          Print version
 ```
 <!-- end usage -->
 

@@ -1,3 +1,3 @@
 [toolchain]
-channel = "1.85"
+channel = "1.85.0"
 components = ["rustfmt", "clippy"]
@@ -93,6 +93,7 @@ impl ReadSet {
     const SPACE: u8 = b' ';
     const COLON: u8 = b':';
     const PLUS: u8 = b'+';
+    const READ_NUMBERS: &[u8] = b"12345678";
-    const READ_NUMBERS: &[u8] = b"12345678";
+    const READ_NUMBERS: &[u8] = b"0123456789";
-    const READ_NUMBERS: &[u8] = b"12345678";
+    const READ_NUMBERS: &[u8] = b"0123456789";
 
     /// Produces an iterator over references to the template segments stored in this ``ReadSet``.
     fn template_segments(&self) -> SegmentIter {
@@ -213,7 +214,12 @@ impl ReadSet {
             None => {
                 // If no pre-existing comment, assume the read is a passing filter, non-control
                 // read and generate a comment for it (sample barcode is added below).
-                write!(writer, "{}:N:0:", read_num)?;
+                if read_num < Self::READ_NUMBERS.len() {
+                    writer.write_all(&[Self::READ_NUMBERS[read_num - 1]])?;
+                    write!(writer, ":N:0:")?;
+                } else {
+                    write!(writer, "{}:N:0:", read_num)?;
+                }
             }
             Some(chars) => {
                 // Else check it's a 4-part name... fix the read number at the front and
@@ -239,7 +245,11 @@ impl ReadSet {
                         &chars[first_colon_idx + 1..chars.len()]
                     };
 
-                    write!(writer, "{}:", read_num)?;
+                    if read_num < Self::READ_NUMBERS.len() {
-                    if read_num < Self::READ_NUMBERS.len() {
+                    if read_num <= Self::READ_NUMBERS.len() {
-                    if read_num < Self::READ_NUMBERS.len() {
+                    if read_num <= Self::READ_NUMBERS.len() {
+                        writer.write_all(&[Self::READ_NUMBERS[read_num - 1], b':'])?;
+                    } else {
+                        write!(writer, "{}:", read_num)?;
+                    }
                     writer.write_all(remainder)?;
 
                     if *remainder.last().unwrap() != Self::COLON {
@@ -619,7 +629,7 @@ pub(crate) struct Demux {
     #[clap(long, short = 'd', default_value = "2")]
     min_mismatch_delta: usize,
 
-    /// The number of threads to use. Cannot be less than 3.
+    /// The number of threads to use. Cannot be less than 5.
     #[clap(long, short = 't', default_value = "8")]
     threads: usize,
 
@@ -666,9 +676,12 @@ impl Demux {
                 read_structures.iter().map(|s| s.segments_by_type(*output_type).count()).sum();
 
             for idx in 1..=segment_count {
-                output_type_writers.push(BufWriter::new(File::create(
-                    output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
-                )?));
+                output_type_writers.push(BufWriter::with_capacity(
+                    65_536usize,
+                    File::create(
+                        output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
+                    )?,
+                ));
             }
 
             match output_type {
@@ -1189,6 +1202,7 @@ mod tests {
             skip_reasons: vec![],
         };
         let demux_result = demux_inputs.execute();
+        #[allow(clippy::permissions_set_readonly_false)]
         permissions.set_readonly(false);
         fs::set_permissions(tmp.path(), permissions).unwrap();
         demux_result.unwrap();
@@ -1963,7 +1977,7 @@ mod tests {
             vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], // barcode too short
             vec!["CCCCCCC", SAMPLE1_BARCODE],        // barcode the correct length
             vec!["", SAMPLE1_BARCODE],               // template bases too short
-            vec!["G", SAMPLE1_BARCODE],              // barcode the correct length
+            vec!["G", SAMPLE1_BARCODE],
         ];
 
         let input_files = vec![
@@ -1999,7 +2013,7 @@ mod tests {
             vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], // barcode too short
             vec!["CCCCCCC", SAMPLE1_BARCODE],        // barcode the correct length
             vec!["", SAMPLE1_BARCODE],               // template bases too short
-            vec!["G", SAMPLE1_BARCODE],              // barcode the correct length
+            vec!["G", SAMPLE1_BARCODE],
         ];
 
         let input_files = vec![

@@ -6,8 +6,7 @@ use crate::encode;
 use super::byte_is_nocall;
 use super::samples::Sample;
 use crate::bitenc::BitEnc;
-use ahash::HashMap as AHashMap;
-use ahash::HashMapExt;
+use rustc_hash::FxHashMap;
 
 const STARTING_CACHE_SIZE: usize = 1_000_000;
 
@@ -41,7 +40,7 @@ pub struct BarcodeMatcher {
     /// If true will attempt to use the cache when matching.
     use_cache: bool,
     /// Caching struct for storing results of previous matches
-    cache: AHashMap<Vec<u8>, BarcodeMatch>,
+    cache: FxHashMap<Vec<u8>, BarcodeMatch>,
 }
 
 impl BarcodeMatcher {
@@ -81,7 +80,7 @@ impl BarcodeMatcher {
             max_mismatches,
             min_mismatch_delta,
             use_cache,
-            cache: AHashMap::with_capacity(STARTING_CACHE_SIZE),
+            cache: FxHashMap::with_capacity_and_hasher(STARTING_CACHE_SIZE, Default::default()),
         }
     }
 
@@ -94,9 +93,7 @@ impl BarcodeMatcher {
     ) -> u8 {
         if observed_bases.nr_symbols() != expected_bases.nr_symbols() {
             let observed_string = decode(observed_bases);
-            assert_eq!(
-                observed_bases.nr_symbols(),
-                expected_bases.nr_symbols(),
+            panic!(
                 "Read barcode ({}) length ({}) differs from expected barcode ({}) length ({}) for sample {}",
                 observed_string,
                 observed_bases.nr_symbols(),
@@ -196,6 +193,7 @@ mod tests {
     fn barcode_to_sample(barcode: &str, idx: usize) -> Sample {
         Sample {
             barcode: barcode.to_string(),
+            barcode_bytes: barcode.as_bytes().to_vec(),
             sample_id: format!("sample_{idx}").to_string(),
             ordinal: idx,
         }

@@ -53,11 +53,7 @@ pub fn encode(bases: &[u8]) -> BitEnc {
             IUPAC_MASKS[b'N' as usize]
         } else {
             let value = base.to_ascii_uppercase() as usize;
-            if value < 256 {
-                IUPAC_MASKS[value]
-            } else {
-                0
-            }
+            if value < 256 { IUPAC_MASKS[value] } else { 0 }
         };
         vec.push(bit);
     }

@@ -19,6 +19,9 @@ pub struct Sample {
     pub sample_id: String,
     /// DNA barcode associated with the sample
     pub barcode: String,
+    /// DNA barcode as raw bytes (a byte-vector copy of `barcode`)
+    #[serde(skip_deserializing)]
+    pub barcode_bytes: Vec<u8>,
     /// index of the sample in the [`SampleGroup`] object, used for syncing indices across
     /// different structs
     #[serde(skip_deserializing)]
@@ -53,7 +56,8 @@ impl Sample {
             barcode.as_bytes().iter().all(|&b| is_valid_iupac(b)),
             "All sample barcode bases must be one of A, C, G, T, U, R, Y, S, W, K, M, D, V, H, B, N"
         );
-        Self { sample_id: name, barcode, ordinal }
+        let barcode_bytes = barcode.as_bytes().to_vec();
+        Self { sample_id: name, barcode, barcode_bytes, ordinal }
     }
 
     /// Returns the header line expected by serde when deserializing
@@ -294,11 +298,7 @@ mod tests {
         let barcode = "GATTACA".to_owned();
         let ordinal = 0;
         let sample = Sample::new(ordinal, name.clone(), barcode.clone());
-        assert_eq!(
-            Sample { sample_id: name, barcode, ordinal },
-            sample,
-            "Sample differed from expectation"
-        );
+        assert_eq!(Sample::new(ordinal, name, barcode), sample, "Sample differed from expectation");
     }
 
     // ############################################################################################