WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / IT CANNOT LOGIN OR REGISTER ACCOUNTS / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FOUND SOMETHING MAY NOT GOOD FOR EVERYONE, CONTACT ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit 9bb8c02

Browse files
authored
Merge pull request #15 from ShawHahnLab/release-0.2.1
Release 0.2.1
2 parents b0217d4 + 2c00319 commit 9bb8c02

23 files changed

+656
-70
lines changed

.Rbuildignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
^.*\.Rproj$
22
^\.Rproj\.user$
33
^\.travis.yml$
4+
.utils
45
environment.yml
56
install_linux_conda.sh
67
install_linux.sh
78
install_windows.cmd
9+
install_windows.R
810
install_mac.command
911
README.md
1012
GUIDE.Rmd
1113
GUIDE.pdf
12-
prep_release.sh

.utils/lint.R

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env Rscript

# Lint the package that contains this file's directory, minus some lint
# categories that just annoy me.

# Find this script's own path from the --file= argument in commandArgs(), then
# go up two directories: the directory holding this script, then the package.
args <- commandArgs()
script <- sub("^--file=", "", grep("^--file=", args, value = TRUE))
if (length(script) != 1) {
  stop("expected exactly one --file= argument; run this script with Rscript",
       call. = FALSE)
}
path <- dirname(dirname(normalizePath(script)))

linters_no <- c("multiple_dots", # "Don't use dots in names"
                "camel_case",    # "Don't capitalize stuff"
                "object_usage")  # "I don't see that variable"
linters_no <- paste0(linters_no, "_linter")

# Select by name rather than by negative index: match() returns NA for any
# linter name that is missing from default_linters, and an NA does not work as
# a negative subscript. setdiff() simply ignores names that are not there.
linters_keep <- setdiff(names(lintr::default_linters), linters_no)
linters <- lintr::default_linters[linters_keep]
lintr::lint_package(path = path, linters = linters)

.utils/prep_release.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env bash

# Prepare a release: write the given version number into README.md,
# DESCRIPTION and NEWS.md, run the package checks, render the user guide, and
# bundle the package directory into ZIP and TGZ archives one level up.
#
# Usage: prep_release.sh VERSION
#
# All file paths below are relative to the current directory, and the archive
# step expects that directory to be named "chiimp".

set -e

# Refuse to run without a version; otherwise the sed commands below would
# write an empty version string into the package files.
VERSION=${1:?Usage: prep_release.sh VERSION}

# Exit status is the number of errors plus warnings from the check, so with
# set -e any error or warning stops the release.
chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'

# Update version in download link in README
VER_MSG="The most recent released version is"
TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
sed -i -r "$SED_README" README.md

# Update version in DESCRIPTION and NEWS.md
sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md

R --slave --vanilla -e "$chiimp_check"
R --slave --vanilla -e "rmarkdown::render('GUIDE.Rmd', output_file = 'GUIDE.pdf', quiet = TRUE)"

# Create bundled ZIP and TGZ versions without hidden top level files (such as
# the git and travis stuff) and with the GUIDE.pdf.
pushd ..
zip -r "chiimp-v${VERSION}.zip" chiimp/*
tar czvf "chiimp-v${VERSION}.tgz" chiimp/*
popd

# TODO show reminder of checks before tagging a release:
# * full test on all three platforms
# * make sure NEWS.md contains all updates under a heading matching this version
# * make sure GUIDE.Rmd is up-to-date and the rendered GUIDE.pdf is correct

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: chiimp
22
Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
3-
Version: 0.2.0
3+
Version: 0.2.1
44
Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
55
Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
66
Depends: R (>= 3.2.3)

GUIDE.Rmd

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
title: "CHIIMP User Guide"
66
author: "Jesse Connell"
7-
date: "2018/03/26"
7+
date: "2018/07/23"
88
output:
99
pdf_document:
1010
toc: true
@@ -290,9 +290,13 @@ For inter-sample comparisons, the alleles identified across samples for each
290290
locus are aligned to one another. The genotypes for each sample are clustered
291291
by number of matching alleles, showing similarity between samples. If a
292292
spreadsheet of known genotypes was given, the sample genotypes are also compared
293-
to the known genotypes, with any close matches reported. A single report
294-
document summarizes the genotyping and these other details. See the Output Data
295-
Organization section below for more information on the output.
293+
to the known genotypes, with any close matches reported. If a Name column was
294+
provided with the sample definition table as well as a known genotypes
295+
spreadsheet, the known-correct genotypes will be paired with applicable samples
296+
and a column tracking the result of the genotyping (Correct, Incorrect, Blank,
297+
or Dropped Allele) will be added. A single report document summarizes the
298+
genotyping and these other details. See the Output Data Organization section
299+
below for more information on the output.
296300

297301
These steps are handled by the `full_analysis` function in the R package.
298302

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ export(analyze_sample_guided)
77
export(analyze_sample_naive)
88
export(analyze_seqs)
99
export(calc_genotype_distance)
10+
export(categorize_genotype_results)
1011
export(config.defaults)
1112
export(find_closest_matches)
1213
export(full_analysis)
@@ -20,6 +21,7 @@ export(load_seqs)
2021
export(main)
2122
export(make_dist_mat)
2223
export(make_dist_mat_known)
24+
export(match_known_genotypes)
2325
export(plot_alignment)
2426
export(plot_cts_per_locus)
2527
export(plot_dist_mat)

NEWS.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,24 @@
1+
# chiimp 0.2.1
2+
3+
* Minor improvements to release process ([#14]).
4+
* Fixed install script for Mac OS ([#13]).
5+
* Fixed file-saving on Windows ([#12]).
6+
* Fixed installation on Windows for usernames with spaces ([#11]).
7+
* Added automatic categorization of genotyping results for samples from known
8+
individuals ([#8]).
9+
* Added function to pair samples with known correct genotypes,
10+
`match_known_genotypes`.
11+
* Added function to categorize results of genotyping for known individuals,
12+
`categorize_genotype_results`.
13+
* Enabled categorization features in `summarize_dataset` when Name column is
14+
supplied in results summary data frame.
15+
16+
[#14]: https://github.com/ShawHahnLab/chiimp/issues/14
17+
[#13]: https://github.com/ShawHahnLab/chiimp/issues/13
18+
[#12]: https://github.com/ShawHahnLab/chiimp/issues/12
19+
[#11]: https://github.com/ShawHahnLab/chiimp/issues/11
20+
[#8]: https://github.com/ShawHahnLab/chiimp/issues/8
21+
122
# chiimp 0.2.0
223

324
* Restructured code to avoid analyzing multiplexed samples more than once ([#3]).

R/categorize.R

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# Interpret genotyping results for samples with known identity.
2+
3+
#' Associate known genotypes with samples
4+
#'
5+
#' Using the Name column of the given results summary data frame, pair each
6+
#' called genotype with the known alleles. A data frame with two columns,
7+
#' CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are
8+
#' found in Allele1Seq and/or Allele2Seq the order will be preserved, and at
9+
#' this point the two allele entries should match up directly for genotypes that
10+
#' were called correctly.
11+
#'
12+
#' @param results_summary cross-sample summary data frame as produced by
13+
#' \code{\link{analyze_dataset}}.
14+
#' @param genotypes.known data frame of known genotypes that should be compared
15+
#' to the observed genotypes in the results, as loaded by
16+
#' \code{\link{load_genotypes}}.
17+
#'
18+
#' @return data frame with two columns for the two correct alleles, and rows
19+
#' matching the input summary table.
20+
#'
21+
#' @export
22+
match_known_genotypes <- function(results_summary, genotypes.known) {
23+
# match name/locus combos with genotypes
24+
id_tbl <- paste(results_summary$Name, results_summary$Locus)
25+
id_kg <- paste(genotypes.known$Name, genotypes.known$Locus)
26+
idx <- match(id_tbl, id_kg)
27+
# Build data frame of correct allele sequences
28+
result <- data.frame(CorrectAllele1Seq = genotypes.known[idx, "Allele1Seq"],
29+
CorrectAllele2Seq = genotypes.known[idx, "Allele2Seq"],
30+
stringsAsFactors = FALSE)
31+
# Ensure ordering within pairs matches samples, if possible.
32+
for (i in 1:nrow(result)) {
33+
a <- results_summary[i, c("Allele1Seq", "Allele2Seq")]
34+
kg <- result[i, ]
35+
idx <- match(a, kg)
36+
if (idx[1] %in% 2 || idx[2] %in% 1)
37+
result[i, ] <- rev(kg)
38+
}
39+
result
40+
}
41+
42+
#' Categorize genotyping results
43+
#'
44+
#' For a given results summary data frame that has CorrectAllele1Seq and Correct
45+
#' Allele2Seq columns (such as produced by \code{\link{match_known_genotypes}})
46+
#' added, create a factor labeling every row of the input data frame by its
47+
#' genotyping outcome.
48+
#'
49+
#' @details
50+
#' Levels in the returned factor, in order:
51+
#'
52+
#' * Correct: one/two alleles match.
53+
#' * Incorrect at least one allele does not match.
54+
#' * Blank: No alleles were called in the analysis even though known genotypes
55+
#' were supplied.
56+
#' * Dropped Allele: One called allele is correct for a heterozygous individual,
57+
#' but no second allele was called.
58+
#'
59+
#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq
60+
#' both set to NA, map to NA in the returned factor.
61+
#' @md
62+
#'
63+
#' @param results_summary cross-sample summary data frame as produced by
64+
#' \code{\link{analyze_dataset}} with extra columns as produced by
65+
#' \code{\link{match_known_genotypes}}.
66+
#'
67+
#' @return factor defining genotyping result category for every row of the input
68+
#' data frame.
69+
#'
70+
#' @export
71+
categorize_genotype_results <- function(results_summary) {
72+
# Five possibilities for either NA/not NA plus outcome of non-NA pair
73+
# All five possibilities for a single allele check:
74+
# 0: Both non-NA, simple mismatch
75+
# 1: A not NA, C NA (no correct allele matched this one)
76+
# 2: A NA, C not NA (we missed a correct allele and left this blank)
77+
# 3: A NA, C NA (correctly did not report an allele)
78+
# 4: Both non-NA, match
79+
check_allele <- function(allele, ref) {
80+
a <- is.na(allele) * 2 + is.na(ref) # NA: 1, not NA: 0
81+
a[a == 0 & allele == ref] <- 4 # special distinction for one case
82+
a
83+
}
84+
85+
# Now, combine for both alleles to have all possible outcomes, and offset by
86+
# one to account for R's indexing.
87+
a1 <- check_allele(results_summary$Allele1Seq,
88+
results_summary$CorrectAllele1Seq)
89+
a2 <- check_allele(results_summary$Allele2Seq,
90+
results_summary$CorrectAllele2Seq)
91+
a <- a1 * 5 + a2 + 1
92+
93+
# Here's all the possible outcomes, categorized. Cases that should never come
94+
# up for correctly-labeled genotypes will evaluate to NA.
95+
lvls <- c(
96+
# A1 0: first allele simple mismatch. Whatever A2 is, this is Incorrect.
97+
"Incorrect", # both mismatch
98+
"Incorrect", # extra allele, mismatch
99+
"Incorrect", # drop
100+
"Incorrect", # correctly missing
101+
"Incorrect", # second correct
102+
# A1 1: first allele called, but no correct allele listed. Still Incorrect.
103+
"Incorrect", # simple mismatch
104+
NA, # second allele also not present?? weird case
105+
"Incorrect", # both mismatch
106+
NA, # no correct allele listed for second either?? weird case
107+
"Incorrect", # second is correct but first was wrong
108+
# A1 2: first allele incorrectly blank.
109+
"Incorrect", # simple mismatch
110+
"Incorrect", # wrong
111+
"Blank", # second allele also incorrectly blank
112+
"Incorrect", # though this *was* homozygous; we at least got that right.
113+
"Dropped Allele", # Got one right, but missed A1.
114+
# A1 3: first allele correctly blank (expecting true homozygote).
115+
"Incorrect", # simple mismatch
116+
NA, # but C2 also NA? weird case
117+
"Blank", # A2 also blank
118+
NA, # A2 NA but C2 also NA? weird case
119+
"Correct", # correct homozygote
120+
# A1 4: first allele correct.
121+
"Incorrect", # but second wrong.
122+
"Incorrect", # second wrongly given when should be blank.
123+
"Dropped Allele", # Got one right, but missed A2.
124+
"Correct", # correctly did not report a second allele (homozygote)
125+
"Correct" # correctly did report a second allele (heterozygote)
126+
)
127+
128+
# Map the integers for each case to text categories and create factor.
129+
factor(lvls[a], levels = c("Correct", "Dropped Allele", "Blank", "Incorrect"))
130+
}

R/summarize_dataset.R

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,19 @@
1515
#' * dist_mat_known: if genotypes.known is given, this distance matrix of
1616
#' sample-to-individual values will be present, from
1717
#' \code{\link{make_dist_mat_known}}.
18+
#'
19+
#' If genotypes.known is given *and* a Name column is present in
20+
#' \code{results$summary}, samples will be matched with the genotypes in
21+
#' genotypes.known and additional columns will be present in the summary data
22+
#' frame:
23+
#' * CorrectAllele1Seq: One correct allele sequence for the individual. The
24+
#' order of this and \code{CorrectAllele2Seq} will be matched to
25+
#' \code{Allele1Seq} and \code{Allele2Seq} if possible. See
26+
#' \code{\link{match_known_genotypes}}.
27+
#' * CorrectAllele2Seq: A second correct allele sequence, as above.
28+
#' * GenotypeResult: Categorization for each entry as Correct, Incorrect,
29+
#' Blank, or Dropped Allele. See \code{\link{categorize_genotype_results}}.
30+
#'
1831
#' @md
1932
#'
2033
#' @param results list containing summary data frame and sample-specific data
@@ -35,6 +48,12 @@ summarize_dataset <- function(results, genotypes.known=NULL) {
3548
results$dist_mat_known <- make_dist_mat_known(results$summary,
3649
genotypes.known)
3750
results$genotypes.known <- genotypes.known
51+
if ("Name" %in% colnames(results$summary)) {
52+
results$summary <- cbind(results$summary,
53+
match_known_genotypes(results$summary, results$genotypes.known))
54+
results$summary$GenotypeResult <- categorize_genotype_results(
55+
results$summary)
56+
}
3857
}
3958
return(results)
4059
}

R/util.R

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,14 +156,16 @@ name_alleles_in_table <- function(data, known_alleles=NULL, name_args=list()) {
156156
#' Remove shared path from file paths
157157
#'
158158
#' For the given character vector of file paths, create a modified version with
159-
#' any common prefix path removed.
159+
#' any common prefix path removed. Forward slashes are used as the path
160+
#' separator on all platforms.
160161
#'
161162
#' @param fps_full character vector of file paths.
162163
#'
163164
#' @return character vector of same length as input, with any common directory
164165
#' structure trimmed off.
165166
remove_shared_root_dir <- function(fps_full) {
166-
fps <- normalizePath(fps_full, mustWork = FALSE)
167+
fps <- gsub("\\\\", "/", fps_full)
168+
fps <- normalizePath(fps, mustWork = FALSE, winslash = "/")
167169
chunks <- lapply(strsplit(fps, "/"), function(segs) segs[segs != ""])
168170
minlen <- min(sapply(chunks, length))
169171
dirs <- do.call(rbind, lapply(chunks, "[", 1:minlen))

0 commit comments

Comments
 (0)