|
| 1 | +# Interpret genotyping results for samples with known identity. |
| 2 | + |
| 3 | +#' Associate known genotypes with samples |
| 4 | +#' |
| 5 | +#' Using the Name column of the given results summary data frame, pair each |
| 6 | +#' called genotype with the known alleles. A data frame with two columns, |
| 7 | +#' CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are |
| 8 | +#' found in Allele1Seq and/or Allele2Seq the order will be preserved, and at |
| 9 | +#' this point the two allele entries should match up directly for genotypes that |
| 10 | +#' were called correctly. |
| 11 | +#' |
| 12 | +#' @param results_summary cross-sample summary data frame as produced by |
| 13 | +#' \code{\link{analyze_dataset}}. |
| 14 | +#' @param genotypes.known data frame of known genotypes that should be compared |
| 15 | +#' to the observed genotypes in the results, as loaded by |
| 16 | +#' \code{\link{load_genotypes}}. |
| 17 | +#' |
| 18 | +#' @return data frame with two columns for the two correct alleles, and rows |
| 19 | +#' matching the input summary table. |
| 20 | +#' |
| 21 | +#' @export |
| 22 | +match_known_genotypes <- function(results_summary, genotypes.known) { |
| 23 | + # match name/locus combos with genotypes |
| 24 | + id_tbl <- paste(results_summary$Name, results_summary$Locus) |
| 25 | + id_kg <- paste(genotypes.known$Name, genotypes.known$Locus) |
| 26 | + idx <- match(id_tbl, id_kg) |
| 27 | + # Build data frame of correct allele sequences |
| 28 | + result <- data.frame(CorrectAllele1Seq = genotypes.known[idx, "Allele1Seq"], |
| 29 | + CorrectAllele2Seq = genotypes.known[idx, "Allele2Seq"], |
| 30 | + stringsAsFactors = FALSE) |
| 31 | + # Ensure ordering within pairs matches samples, if possible. |
| 32 | + for (i in 1:nrow(result)) { |
| 33 | + a <- results_summary[i, c("Allele1Seq", "Allele2Seq")] |
| 34 | + kg <- result[i, ] |
| 35 | + idx <- match(a, kg) |
| 36 | + if (idx[1] %in% 2 || idx[2] %in% 1) |
| 37 | + result[i, ] <- rev(kg) |
| 38 | + } |
| 39 | + result |
| 40 | +} |
| 41 | + |
| 42 | +#' Categorize genotyping results |
| 43 | +#' |
| 44 | +#' For a given results summary data frame that has CorrectAllele1Seq and Correct |
| 45 | +#' Allele2Seq columns (such as produced by \code{\link{match_known_genotypes}}) |
| 46 | +#' added, create a factor labeling every row of the input data frame by its |
| 47 | +#' genotyping outcome. |
| 48 | +#' |
| 49 | +#' @details |
| 50 | +#' Levels in the returned factor, in order: |
| 51 | +#' |
| 52 | +#' * Correct: one/two alleles match. |
| 53 | +#' * Incorrect at least one allele does not match. |
| 54 | +#' * Blank: No alleles were called in the analysis even though known genotypes |
| 55 | +#' were supplied. |
| 56 | +#' * Dropped Allele: One called allele is correct for a heterozygous individual, |
| 57 | +#' but no second allele was called. |
| 58 | +#' |
| 59 | +#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq |
| 60 | +#' both set to NA, map to NA in the returned factor. |
| 61 | +#' @md |
| 62 | +#' |
| 63 | +#' @param results_summary cross-sample summary data frame as produced by |
| 64 | +#' \code{\link{analyze_dataset}} with extra columns as produced by |
| 65 | +#' \code{\link{match_known_genotypes}}. |
| 66 | +#' |
| 67 | +#' @return factor defining genotyping result category for every row of the input |
| 68 | +#' data frame. |
| 69 | +#' |
| 70 | +#' @export |
| 71 | +categorize_genotype_results <- function(results_summary) { |
| 72 | + # Five possibilities for either NA/not NA plus outcome of non-NA pair |
| 73 | + # All five possibilities for a single allele check: |
| 74 | + # 0: Both non-NA, simple mismatch |
| 75 | + # 1: A not NA, C NA (no correct allele matched this one) |
| 76 | + # 2: A NA, C not NA (we missed a correct allele and left this blank) |
| 77 | + # 3: A NA, C NA (correctly did not report an allele) |
| 78 | + # 4: Both non-NA, match |
| 79 | + check_allele <- function(allele, ref) { |
| 80 | + a <- is.na(allele) * 2 + is.na(ref) # NA: 1, not NA: 0 |
| 81 | + a[a == 0 & allele == ref] <- 4 # special distinction for one case |
| 82 | + a |
| 83 | + } |
| 84 | + |
| 85 | + # Now, combine for both alleles to have all possible outcomes, and offset by |
| 86 | + # one to account for R's indexing. |
| 87 | + a1 <- check_allele(results_summary$Allele1Seq, |
| 88 | + results_summary$CorrectAllele1Seq) |
| 89 | + a2 <- check_allele(results_summary$Allele2Seq, |
| 90 | + results_summary$CorrectAllele2Seq) |
| 91 | + a <- a1 * 5 + a2 + 1 |
| 92 | + |
| 93 | + # Here's all the possible outcomes, categorized. Cases that should never come |
| 94 | + # up for correctly-labeled genotypes will evaluate to NA. |
| 95 | + lvls <- c( |
| 96 | + # A1 0: first allele simple mismatch. Whatever A2 is, this is Incorrect. |
| 97 | + "Incorrect", # both mismatch |
| 98 | + "Incorrect", # extra allele, mismatch |
| 99 | + "Incorrect", # drop |
| 100 | + "Incorrect", # correctly missing |
| 101 | + "Incorrect", # second correct |
| 102 | + # A1 1: first allele called, but no correct allele listed. Still Incorrect. |
| 103 | + "Incorrect", # simple mismatch |
| 104 | + NA, # second allele also not present?? weird case |
| 105 | + "Incorrect", # both mismatch |
| 106 | + NA, # no correct allele listed for second either?? weird case |
| 107 | + "Incorrect", # second is correct but first was wrong |
| 108 | + # A1 2: first allele incorrectly blank. |
| 109 | + "Incorrect", # simple mismatch |
| 110 | + "Incorrect", # wrong |
| 111 | + "Blank", # second allele also incorrectly blank |
| 112 | + "Incorrect", # though this *was* homozygous; we at least got that right. |
| 113 | + "Dropped Allele", # Got one right, but missed A1. |
| 114 | + # A1 3: first allele correctly blank (expecting true homozygote). |
| 115 | + "Incorrect", # simple mismatch |
| 116 | + NA, # but C2 also NA? weird case |
| 117 | + "Blank", # A2 also blank |
| 118 | + NA, # A2 NA but C2 also NA? weird case |
| 119 | + "Correct", # correct homozygote |
| 120 | + # A1 4: first allele correct. |
| 121 | + "Incorrect", # but second wrong. |
| 122 | + "Incorrect", # second wrongly given when should be blank. |
| 123 | + "Dropped Allele", # Got one right, but missed A2. |
| 124 | + "Correct", # correctly did not report a second allele (homozygote) |
| 125 | + "Correct" # correctly did report a second allele (heterozygote) |
| 126 | + ) |
| 127 | + |
| 128 | + # Map the integers for each case to text categories and create factor. |
| 129 | + factor(lvls[a], levels = c("Correct", "Dropped Allele", "Blank", "Incorrect")) |
| 130 | +} |
0 commit comments