ShawHahnLab
diff --git a/‎.Rbuildignore‎
Lines changed: 2 additions & 1 deletion b/‎.Rbuildignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.utils/lint.R‎
Lines changed: 17 additions & 0 deletions b/‎.utils/lint.R‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎.utils/prep_release.sh‎
Lines changed: 32 additions & 0 deletions b/‎.utils/prep_release.sh‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 1 addition & 1 deletion b/‎DESCRIPTION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎GUIDE.Rmd‎
Lines changed: 8 additions & 4 deletions b/‎GUIDE.Rmd‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎NEWS.md‎
Lines changed: 21 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎R/categorize.R‎
Lines changed: 130 additions & 0 deletions b/‎R/categorize.R‎
Lines changed: 130 additions & 0 deletions
diff --git a/‎R/summarize_dataset.R‎
Lines changed: 19 additions & 0 deletions b/‎R/summarize_dataset.R‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎R/util.R‎
Lines changed: 4 additions & 2 deletions b/‎R/util.R‎
Lines changed: 4 additions & 2 deletions
@@ -1,12 +1,13 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^\.travis.yml$
+^\.utils$
 environment.yml
 install_linux_conda.sh
 install_linux.sh
 install_windows.cmd
+install_windows.R
 install_mac.command
 README.md
 GUIDE.Rmd
 GUIDE.pdf
-prep_release.sh
 
@@ -0,0 +1,17 @@
+#!/usr/bin/env Rscript
+
+# Lint the package that contains this file's directory, minus some lint
+# categories that just annoy me.
+#
+# Usage: Rscript .utils/lint.R   (or execute this file directly)
+
+# Locate this script from the --file= argument that Rscript supplies, then step
+# up two levels (script -> its directory -> package root).
+args <- commandArgs()
+f <- sub("^--file=", "", grep("^--file=", args, value = TRUE))
+if (length(f) != 1) {
+  stop("Could not determine script path; run this file with Rscript.",
+       call. = FALSE)
+}
+f <- normalizePath(f)
+path <- dirname(dirname(f))
+
+linters_no <- c("multiple_dots", # "Don't use dots in names"
+                "camel_case",    # "Don't capitalize stuff"
+                "object_usage")  # "I don't see that variable"
+linters_no <- paste0(linters_no, "_linter")
+# Drop the unwanted linters by name.  Filtering with %in% (rather than a
+# negative match() index) keeps working if the installed lintr lacks one of
+# these names; match() would return NA there and break the subscript.
+linters <- lintr::default_linters[!names(lintr::default_linters) %in%
+                                    linters_no]
+lintr::lint_package(path = path, linters = linters)
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+set -e
+
+VERSION=$1
+
+chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'
+
+# Update version in download link in README
+VER_MSG="The most recent released version is"
+TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
+SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
+sed -i -r "$SED_README" README.md
+
+# Update version in DESCRIPTION and NEWS.md
+sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
+sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
+
+R --slave --vanilla -e "$chiimp_check"
+R --slave --vanilla -e "rmarkdown::render('GUIDE.Rmd', output_file = 'GUIDE.pdf', quiet = TRUE)"
+
+# Create bundled ZIP and TGZ versions without hidden top level files (such as
+# the git and travis stuff) and with the GUIDE.pdf.
+pushd ..
+zip -r chiimp-v${VERISON}.zip chiimp/*
+tar czvf chiimp-v${VERSION}.tgz chiimp/*
+popd
+
+# TODO show reminder of checks before tagging a release:
+# * full test on all three platforms
+# * make sure NEWS.md contains all updates under a heading matching this version
+# * make sure GUIDE.Rmd is up-to-date and the rendered GUIDE.pdf is correct
@@ -1,6 +1,6 @@
 Package: chiimp
 Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
-Version: 0.2.0
+Version: 0.2.1
 Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
 Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
 Depends: R (>= 3.2.3)
 
@@ -4,7 +4,7 @@
 
 title: "CHIIMP User Guide"
 author: "Jesse Connell"
-date: "2018/03/26"
+date: "2018/07/23"
 output:
   pdf_document:
     toc: true
@@ -290,9 +290,13 @@ For inter-sample comparisons, the alleles identified across samples for each
 locus are aligned to one another.  The genotypes for each sample are clustered 
 by number of matching alleles, showing similarity between samples.  If a 
 spreadsheet of known genotypes was given, the sample genotypes are also compared
-to the known genotypes, with any close matches reported.  A single report 
-document summarizes the genotyping and these other details.  See the Output Data
-Organization section below for more information on the output.
+to the known genotypes, with any close matches reported.  If a Name column was
+provided with the sample definition table as well as a known genotypes
+spreadsheet, the known-correct genotypes will be paired with applicable samples
+and a column tracking the result of the genotyping (Correct, Incorrect, Blank,
+or Dropped Allele) will be added.  A single report document summarizes the
+genotyping and these other details.  See the Output Data Organization section
+below for more information on the output.
 
 These steps are handled by the `full_analysis` function in the R package.
 
 
@@ -7,6 +7,7 @@ export(analyze_sample_guided)
 export(analyze_sample_naive)
 export(analyze_seqs)
 export(calc_genotype_distance)
+export(categorize_genotype_results)
 export(config.defaults)
 export(find_closest_matches)
 export(full_analysis)
@@ -20,6 +21,7 @@ export(load_seqs)
 export(main)
 export(make_dist_mat)
 export(make_dist_mat_known)
+export(match_known_genotypes)
 export(plot_alignment)
 export(plot_cts_per_locus)
 export(plot_dist_mat)
 
@@ -1,3 +1,24 @@
+# chiimp 0.2.1
+
+ * Minor improvements to release process ([#14]).
+ * Fixed install script for Mac OS ([#13]).
+ * Fixed file-saving on Windows ([#12]).
+ * Fixed installation on Windows for usernames with spaces ([#11]).
+ * Added automatic categorization of genotyping results for samples from known
+ individuals ([#8]).
+   * Added function to pair samples with known correct genotypes,
+   `match_known_genotypes`.
+   * Added function to categorize results of genotyping for known individuals,
+   `categorize_genotype_results`.
+   * Enabled categorization features in `summarize_dataset` when Name column is
+   supplied in results summary data frame.
+
+[#14]: https://github.com/ShawHahnLab/chiimp/issues/14
+[#13]: https://github.com/ShawHahnLab/chiimp/issues/13
+[#12]: https://github.com/ShawHahnLab/chiimp/issues/12
+[#11]: https://github.com/ShawHahnLab/chiimp/issues/11
+[#8]: https://github.com/ShawHahnLab/chiimp/issues/8
+
 # chiimp 0.2.0
 
  * Restructured code to avoid analyzing multiplexed samples more than once ([#3]).
 
@@ -0,0 +1,130 @@
+# Interpret genotyping results for samples with known identity.
+
+#' Associate known genotypes with samples
+#'
+#' Using the Name column of the given results summary data frame, pair each
+#' called genotype with the known alleles.  A data frame with two columns,
+#' CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are
+#' found in Allele1Seq and/or Allele2Seq the order will be preserved, and at
+#' this point the two allele entries should match up directly for genotypes that
+#' were called correctly.  Rows whose Name/Locus combination has no entry in
+#' \code{genotypes.known} get NA in both columns, and a summary table with zero
+#' rows gives a zero-row result.
+#'
+#' @param results_summary cross-sample summary data frame as produced by
+#'   \code{\link{analyze_dataset}}.
+#' @param genotypes.known data frame of known genotypes that should be compared
+#'   to the observed genotypes in the results, as loaded by
+#'   \code{\link{load_genotypes}}.
+#'
+#' @return data frame with two columns for the two correct alleles, and rows
+#'   matching the input summary table.
+#'
+#' @export
+match_known_genotypes <- function(results_summary, genotypes.known) {
+  # match name/locus combos with genotypes
+  id_tbl <- paste(results_summary$Name, results_summary$Locus)
+  id_kg <- paste(genotypes.known$Name, genotypes.known$Locus)
+  idx <- match(id_tbl, id_kg)
+  # Build data frame of correct allele sequences
+  result <- data.frame(CorrectAllele1Seq = genotypes.known[idx, "Allele1Seq"],
+                       CorrectAllele2Seq = genotypes.known[idx, "Allele2Seq"],
+                       stringsAsFactors = FALSE)
+  # Ensure ordering within pairs matches samples, if possible.  seq_len() keeps
+  # the loop from running at all for a zero-row table, where 1:0 would visit
+  # rows 1 and 0 and append a spurious NA row.
+  for (i in seq_len(nrow(result))) {
+    a <- results_summary[i, c("Allele1Seq", "Allele2Seq")]
+    kg <- result[i, ]
+    # Position of each called allele within the known pair (NA if absent).
+    pos <- match(a, kg)
+    # Swap the known pair if a called allele lines up with the opposite slot.
+    if (pos[1] %in% 2 || pos[2] %in% 1) {
+      result[i, ] <- rev(kg)
+    }
+  }
+  result
+}
+
+#' Categorize genotyping results
+#'
+#' For a given results summary data frame that has CorrectAllele1Seq and
+#' CorrectAllele2Seq columns added (such as produced by
+#' \code{\link{match_known_genotypes}}), create a factor labeling every row of
+#' the input data frame by its genotyping outcome.
+#'
+#' @details
+#' Levels in the returned factor, in order:
+#'
+#' * Correct: one/two alleles match.
+#' * Incorrect: at least one allele does not match.
+#' * Blank: No alleles were called in the analysis even though known genotypes
+#'    were supplied.
+#' * Dropped Allele: One called allele is correct for a heterozygous individual,
+#'   but no second allele was called.
+#'
+#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq
+#' both set to NA, map to NA in the returned factor.
+#' @md
+#'
+#' @param results_summary cross-sample summary data frame as produced by
+#'   \code{\link{analyze_dataset}} with extra columns as produced by
+#'   \code{\link{match_known_genotypes}}.
+#'
+#' @return factor defining genotyping result category for every row of the input
+#'   data frame.
+#'
+#' @export
+categorize_genotype_results <- function(results_summary) {
+  # Compare one called allele (A) with its correct allele (C).  Each may be NA
+  # or not, and a non-NA pair may match or not, giving five possible codes:
+  #   0: Both non-NA, simple mismatch
+  #   1: A not NA, C NA (no correct allele matched this one)
+  #   2: A NA, C not NA (we missed a correct allele and left this blank)
+  #   3: A NA, C NA (correctly did not report an allele)
+  #   4: Both non-NA, match
+  check_allele <- function(allele, ref) {
+    a <- is.na(allele) * 2 + is.na(ref) # NA: 1, not NA: 0
+    a[a == 0 & allele == ref] <- 4 # special distinction for one case
+    a
+  }
+
+  # Now, combine for both alleles to have all possible outcomes, and offset by
+  # one to account for R's indexing.
+  a1 <- check_allele(results_summary$Allele1Seq,
+                     results_summary$CorrectAllele1Seq)
+  a2 <- check_allele(results_summary$Allele2Seq,
+                     results_summary$CorrectAllele2Seq)
+  a <- a1 * 5 + a2 + 1
+
+  # Here's all the possible outcomes, categorized.  Cases that should never come
+  # up for correctly-labeled genotypes will evaluate to NA.
+  lvls <- c(
+    # A1 0: first allele simple mismatch.  Whatever A2 is, this is Incorrect.
+    "Incorrect", # both mismatch
+    "Incorrect", # extra allele, mismatch
+    "Incorrect", # drop
+    "Incorrect", # correctly missing
+    "Incorrect", # second correct
+    # A1 1: first allele called, but no correct allele listed.  Still Incorrect.
+    "Incorrect", # simple mismatch
+    NA,          # second allele also not present?? weird case
+    "Incorrect", # second allele wrongly left blank
+    NA,          # no correct allele listed for second either?? weird case
+    "Incorrect", # second is correct but first was wrong
+    # A1 2: first allele incorrectly blank.
+    "Incorrect", # simple mismatch
+    "Incorrect", # wrong
+    "Blank",     # second allele also incorrectly blank
+    "Incorrect", # homozygote but nothing called; NOTE(review): "Blank"? confirm
+    "Dropped Allele", # Got one right, but missed A1.
+    # A1 3: first allele correctly blank (expecting true homozygote).
+    "Incorrect", # simple mismatch
+    NA,          # but C2 also NA? weird case
+    "Blank",     # A2 also blank
+    NA,          # A2 NA but C2 also NA? weird case
+    "Correct",   # correct homozygote
+    # A1 4: first allele correct.
+    "Incorrect", # but second wrong.
+    "Incorrect", # second wrongly given when should be blank.
+    "Dropped Allele", # Got one right, but missed A2.
+    "Correct",   # correctly did not report a second allele (homozygote)
+    "Correct"    # correctly did report a second allele (heterozygote)
+  )
+
+  # Map the integers for each case to text categories and create factor.
+  factor(lvls[a], levels = c("Correct", "Dropped Allele", "Blank", "Incorrect"))
+}
@@ -15,6 +15,19 @@
 #'   * dist_mat_known: if genotypes.known is given, this distance matrix of
 #'     sample-to-individual values will be present, from
 #'     \code{\link{make_dist_mat_known}}.
+#'
+#' If genotypes.known is given *and* a Name column is present in
+#' \code{results$summary}, samples will be matched with the genotypes in
+#' genotypes.known and additional columns will be present in the summary data
+#' frame:
+#'   * CorrectAllele1Seq: One correct allele sequence for the individual.  The
+#'   order of this and \code{CorrectAllele2Seq} will be matched to
+#'   \code{Allele1Seq} and \code{Allele2Seq} if possible.  See
+#'   \code{\link{match_known_genotypes}}.
+#'   * CorrectAllele2Seq: A second correct allele sequence, as above.
+#'   * GenotypeResult: Categorization for each entry as Correct, Incorrect,
+#'   Blank, or Dropped Allele.  See \code{\link{categorize_genotype_results}}.
+#'
 #' @md
 #'
 #' @param results list containing summary data frame and sample-specific data
@@ -35,6 +48,12 @@ summarize_dataset <- function(results, genotypes.known=NULL) {
     results$dist_mat_known <- make_dist_mat_known(results$summary,
                                                   genotypes.known)
     results$genotypes.known <- genotypes.known
+    if ("Name" %in% colnames(results$summary)) {
+      results$summary <- cbind(results$summary,
+                match_known_genotypes(results$summary, results$genotypes.known))
+      results$summary$GenotypeResult <- categorize_genotype_results(
+        results$summary)
+    }
   }
   return(results)
 }
 
@@ -156,14 +156,16 @@ name_alleles_in_table <- function(data, known_alleles=NULL, name_args=list()) {
 #' Remove shared path from file paths
 #'
 #' For the given character vector of file paths, create a modified version with
-#' any common prefix path removed.
+#' any common prefix path removed.  Forward slashes are used as the path
+#' separator on all platforms.
 #'
 #' @param fps_full character vector of file paths.
 #'
 #' @return character vector of same length as input, with any common directory
 #'   structure trimmed off.
 remove_shared_root_dir <- function(fps_full) {
-  fps <- normalizePath(fps_full, mustWork = FALSE)
+  fps <- gsub("\\\\", "/", fps_full)
+  fps <- normalizePath(fps, mustWork = FALSE, winslash = "/")
   chunks <- lapply(strsplit(fps, "/"), function(segs) segs[segs != ""])
   minlen <- min(sapply(chunks, length))
   dirs <- do.call(rbind, lapply(chunks, "[", 1:minlen))