WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / YOU CANNOT LOG IN OR REGISTER ACCOUNTS HERE / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FIND SOMETHING THAT MAY NOT BE SUITABLE FOR EVERYONE, CONTACT THE ADMIN AT ilovescratch@foxmail.com
Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
912 changes: 398 additions & 514 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ path = "src/bin/main.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
ahash = "0.8.11"
anyhow = "1.0.38"
bstr = "1.0.1"
clap = { version = "4.0.25", features = ["derive"] }
Expand All @@ -49,6 +48,7 @@ serde-aux = "4.1.2"
seq_io = "0.3.1"
thiserror = "1.0.37"
proglog = {version = "0.3.0", features = ["pretty_counts"] }
rustc-hash = "2.1.1"

[dev-dependencies]
csv = "1.1.6"
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ Options:
[default: 2]

-t, --threads <THREADS>
The number of threads to use. Cannot be less than 3
The number of threads to use. Cannot be less than 5

[default: 8]

Expand All @@ -176,10 +176,10 @@ Options:
1. `too-few-bases`: there are too few bases or qualities to extract given the read structures. For example, if a read is 8bp long but the read structure is `10B`, or if a read is empty and the read structure is `+T`.

-h, --help
Print help information (use `-h` for a summary)
Print help (see a summary with '-h')

-V, --version
Print version information
Print version
```
<!-- end usage -->

Expand Down
2 changes: 1 addition & 1 deletion rust-toolchain.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[toolchain]
channel = "1.85"
channel = "1.85.0"
components = ["rustfmt", "clippy"]
30 changes: 22 additions & 8 deletions src/bin/commands/demux.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ impl ReadSet {
const SPACE: u8 = b' ';
const COLON: u8 = b':';
const PLUS: u8 = b'+';
const READ_NUMBERS: &[u8] = b"12345678";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor (non-blocking) suggestion:

Start at 0 and add 9.

Then, below, you can just write:

writer.write_all(&[Self::READ_NUMBERS[read_num]])?;
Suggested change
const READ_NUMBERS: &[u8] = b"12345678";
const READ_NUMBERS: &[u8] = b"0123456789";


/// Produces an iterator over references to the template segments stored in this ``ReadSet``.
fn template_segments(&self) -> SegmentIter {
Expand Down Expand Up @@ -213,7 +214,12 @@ impl ReadSet {
None => {
// If no pre-existing comment, assume the read is a passing filter, non-control
// read and generate a comment for it (sample barcode is added below).
write!(writer, "{}:N:0:", read_num)?;
if read_num < Self::READ_NUMBERS.len() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Same comment as below about < -> <= if you don't start at 0.
  2. Probably worth leaving a comment about performance and why we want an optimization here.

writer.write_all(&[Self::READ_NUMBERS[read_num - 1]])?;
write!(writer, ":N:0:")?;
} else {
write!(writer, "{}:N:0:", read_num)?;
}
}
Some(chars) => {
// Else check it's a 4-part name... fix the read number at the front and
Expand All @@ -239,7 +245,11 @@ impl ReadSet {
&chars[first_colon_idx + 1..chars.len()]
};

write!(writer, "{}:", read_num)?;
if read_num < Self::READ_NUMBERS.len() {
Copy link
Member

@theJasonFan theJasonFan May 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't take the suggestion above... what if read_num == 8?

Suggested change
if read_num < Self::READ_NUMBERS.len() {
if read_num <= Self::READ_NUMBERS.len() {

writer.write_all(&[Self::READ_NUMBERS[read_num - 1], b':'])?;
} else {
write!(writer, "{}:", read_num)?;
}
writer.write_all(remainder)?;

if *remainder.last().unwrap() != Self::COLON {
Expand Down Expand Up @@ -619,7 +629,7 @@ pub(crate) struct Demux {
#[clap(long, short = 'd', default_value = "2")]
min_mismatch_delta: usize,

/// The number of threads to use. Cannot be less than 3.
/// The number of threads to use. Cannot be less than 5.
#[clap(long, short = 't', default_value = "8")]
threads: usize,

Expand Down Expand Up @@ -666,9 +676,12 @@ impl Demux {
read_structures.iter().map(|s| s.segments_by_type(*output_type).count()).sum();

for idx in 1..=segment_count {
output_type_writers.push(BufWriter::new(File::create(
output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
)?));
output_type_writers.push(BufWriter::with_capacity(
65_536usize,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was seeing write_all_cold in the flame graph, which means that our buffer was improperly sized (the default is 8_192usize). Increasing it by 8x, to 65_536usize, sped things up and the write_all_cold call no longer shows up.

File::create(
output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
)?,
));
}

match output_type {
Expand Down Expand Up @@ -1189,6 +1202,7 @@ mod tests {
skip_reasons: vec![],
};
let demux_result = demux_inputs.execute();
#[allow(clippy::permissions_set_readonly_false)]
permissions.set_readonly(false);
fs::set_permissions(tmp.path(), permissions).unwrap();
demux_result.unwrap();
Expand Down Expand Up @@ -1963,7 +1977,7 @@ mod tests {
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], // barcode too short
vec!["CCCCCCC", SAMPLE1_BARCODE], // barcode the correct length
vec!["", SAMPLE1_BARCODE], // template bases too short
vec!["G", SAMPLE1_BARCODE], // barcode the correct length
vec!["G", SAMPLE1_BARCODE],
];

let input_files = vec![
Expand Down Expand Up @@ -1999,7 +2013,7 @@ mod tests {
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], // barcode too short
vec!["CCCCCCC", SAMPLE1_BARCODE], // barcode the correct length
vec!["", SAMPLE1_BARCODE], // template bases too short
vec!["G", SAMPLE1_BARCODE], // barcode the correct length
vec!["G", SAMPLE1_BARCODE],
];

let input_files = vec![
Expand Down
12 changes: 5 additions & 7 deletions src/lib/barcode_matching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ use crate::encode;
use super::byte_is_nocall;
use super::samples::Sample;
use crate::bitenc::BitEnc;
use ahash::HashMap as AHashMap;
use ahash::HashMapExt;
use rustc_hash::FxHashMap;

const STARTING_CACHE_SIZE: usize = 1_000_000;

Expand Down Expand Up @@ -41,7 +40,7 @@ pub struct BarcodeMatcher {
/// If true will attempt to use the cache when matching.
use_cache: bool,
/// Caching struct for storing results of previous matches
cache: AHashMap<Vec<u8>, BarcodeMatch>,
cache: FxHashMap<Vec<u8>, BarcodeMatch>,
}

impl BarcodeMatcher {
Expand Down Expand Up @@ -81,7 +80,7 @@ impl BarcodeMatcher {
max_mismatches,
min_mismatch_delta,
use_cache,
cache: AHashMap::with_capacity(STARTING_CACHE_SIZE),
cache: FxHashMap::with_capacity_and_hasher(STARTING_CACHE_SIZE, Default::default()),
}
}

Expand All @@ -94,9 +93,7 @@ impl BarcodeMatcher {
) -> u8 {
if observed_bases.nr_symbols() != expected_bases.nr_symbols() {
let observed_string = decode(observed_bases);
assert_eq!(
observed_bases.nr_symbols(),
expected_bases.nr_symbols(),
panic!(
"Read barcode ({}) length ({}) differs from expected barcode ({}) length ({}) for sample {}",
observed_string,
observed_bases.nr_symbols(),
Expand Down Expand Up @@ -196,6 +193,7 @@ mod tests {
fn barcode_to_sample(barcode: &str, idx: usize) -> Sample {
Sample {
barcode: barcode.to_string(),
barcode_bytes: barcode.as_bytes().to_vec(),
sample_id: format!("sample_{idx}").to_string(),
ordinal: idx,
}
Expand Down
6 changes: 1 addition & 5 deletions src/lib/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,7 @@ pub fn encode(bases: &[u8]) -> BitEnc {
IUPAC_MASKS[b'N' as usize]
} else {
let value = base.to_ascii_uppercase() as usize;
if value < 256 {
IUPAC_MASKS[value]
} else {
0
}
if value < 256 { IUPAC_MASKS[value] } else { 0 }
};
vec.push(bit);
}
Expand Down
12 changes: 6 additions & 6 deletions src/lib/samples.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ pub struct Sample {
pub sample_id: String,
/// DNA barcode associated with the sample
pub barcode: String,
/// DNA barcode as bytes
#[serde(skip_deserializing)]
pub barcode_bytes: Vec<u8>,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Why add a barcode_bytes field?

String implements Clone and provides into_bytes, so a user who needs a Vec<u8> can still call sample.barcode.clone().into_bytes(). And if they only need a slice, they can use sample.barcode.as_bytes().

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm getting confused about parts of a previous implementation that has since been removed.

/// index of the sample in the [`SampleGroup`] object, used for syncing indices across
/// different structs
#[serde(skip_deserializing)]
Expand Down Expand Up @@ -53,7 +56,8 @@ impl Sample {
barcode.as_bytes().iter().all(|&b| is_valid_iupac(b)),
"All sample barcode bases must be one of A, C, G, T, U, R, Y, S, W, K, M, D, V, H, B, N"
);
Self { sample_id: name, barcode, ordinal }
let barcode_bytes = barcode.as_bytes().to_vec();
Self { sample_id: name, barcode, barcode_bytes, ordinal }
}

/// Returns the header line expected by serde when deserializing
Expand Down Expand Up @@ -294,11 +298,7 @@ mod tests {
let barcode = "GATTACA".to_owned();
let ordinal = 0;
let sample = Sample::new(ordinal, name.clone(), barcode.clone());
assert_eq!(
Sample { sample_id: name, barcode, ordinal },
sample,
"Sample differed from expectation"
);
assert_eq!(Sample::new(ordinal, name, barcode), sample, "Sample differed from expectation");
}

// ############################################################################################
Expand Down
Loading