Toolkit to analyze genomic variation data, built on the GATK with Clojure



(this space intentionally left almost blank)



Convert BED interval contig names between compatible assemblies Handles Human hg19 to GRCh37 naming conversions.

(ns bcbio.align.interval
  (:use [clojure.java.io]
        [bcbio.align.ref :only [get-seq-name-map]]
        [bcbio.variation.normalize :only [prep-rename-map]])
  (:require [clojure.string :as string]
            [bcbio.run.itx :as itx]))

Remap the input contig into GRCh37 contig name.

(defn- update-contig-name
  [name-map line]
  (let [parts (string/split line #"\t")
        remap-contig (get name-map (first parts))]
    (when remap-contig
      (cons remap-contig (rest parts)))))

Rename BED coordinates to match supplied reference file

(defn rename-bed
  [bed-file ref-file & {:keys [out-dir]}]
  (let [out-file (itx/add-file-part bed-file "remap" out-dir)]
    (when (itx/needs-run? out-file)
      (let [name-map (prep-rename-map :GRCh37 ref-file)]
        (with-open [rdr (reader bed-file)
                    wtr (writer out-file)]
           (map #(.write wtr (str (string/join "\t" %) "\n"))
                (->> (line-seq rdr)
                     (map (partial update-contig-name name-map))
                     (remove nil?)))))))

Deal with reference sequences for alignment and variant calling.

(ns bcbio.align.ref
  (:import [org.broadinstitute.sting.gatk.datasources.reference ReferenceDataSource]
           [net.sf.picard.reference ReferenceSequenceFileFactory]
           [net.sf.picard.sam CreateSequenceDictionary])
  (:use [clojure.java.io]
        [ordered.map :only [ordered-map]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]))
(defn create-ref-dict
  (let [dict-file (str (itx/file-root ref-file) ".dict")]
    (when (itx/needs-run? dict-file)
      (.instanceMain (CreateSequenceDictionary.)
                     (into-array [(str "r=" ref-file) (str "o=" dict-file)])))

Retrieve Picard sequence dictionary from FASTA reference file.

(defn get-seq-dict*
  (create-ref-dict ref-file)
  (ReferenceDataSource. (file ref-file))
  (-> (file ref-file)
(def get-seq-dict (memoize get-seq-dict*))

Retrieve Picard sequence dictionary and reference from FASTA file.

(defn get-seq-dict-and-ref*
  (create-ref-dict ref-file)
  (ReferenceDataSource. (file ref-file))
  (let [seq-ref (ReferenceSequenceFileFactory/getReferenceSequenceFile (file ref-file))
        seq-dict (-> seq-ref .getSequenceDictionary)]
    [seq-dict seq-ref]))
(def get-seq-dict-and-ref (memoize get-seq-dict-and-ref*))

Retrieve map of sequence names to index positions in the input reference. This is useful for sorting by position.

(defn get-seq-name-map
  (reduce (fn [coll [i x]] (assoc coll x i))
          (map-indexed vector
                       (map #(.getSequenceName %) (.getSequences (get-seq-dict ref-file))))))

Retrieve sequence in the provided region from input reference file. start and end are 1-based inclusive coordinates (VCF style)

(defn extract-sequence
  [ref-file contig start end]
  (let [[seq-dict seq-ref] (get-seq-dict-and-ref ref-file)]
    (when (and (contains? (set (map #(.getSequenceName %) (.getSequences seq-dict))) contig)
               (<= end (.getSequenceLength (.getSequence seq-dict contig))))
      (-> seq-ref
          (.getSubsequenceAt contig start end)
          (#(map char %))
          (#(apply str %))))))

Sort a BED file relative to the input reference. Takes a IO intensive approach over memory intensive by sorting in blocks of chromosomes. same-time-chrs handles the tradeoff between speed and memory by determining how many chromosomes to process simultaneously.

(defn sort-bed-file
  [bed-file ref-file]
  (letfn [(process-line [cur-chrs line]
            (let [tab-parts (string/split line #"\t")
                  parts (if (> (count tab-parts) 1)
                          (string/split line #" "))]
              (let [[chr start end] (take 3 parts)]
                (when (or (nil? cur-chrs) (contains? cur-chrs chr))
                  [[chr (Integer/parseInt start) (Integer/parseInt end)] line]))))
          (get-input-chrs [bed-file]
            (with-open [rdr (reader bed-file)]
              (->> (line-seq rdr)
                   (map (partial process-line nil))
                   (map ffirst)
    (let [out-file (itx/add-file-part bed-file "sorted")
          input-chrs (get-input-chrs bed-file)
          same-time-chrs 5]
      (when (or (itx/needs-run? out-file)
                (> (fs/mod-time bed-file) (fs/mod-time out-file)))
        (itx/with-tx-file [tx-out out-file]
          (with-open [wtr (writer tx-out)]
            (doseq [cur-chrs (->> (get-seq-dict ref-file)
                                  (map #(.getSequenceName %))
                                  (filter input-chrs)
                                  (partition-all same-time-chrs)
                                  (map set))]
              (with-open [rdr (reader bed-file)]
                (doseq [[_ line] (->> (line-seq rdr)
                                      (map (partial process-line cur-chrs))
                                      (remove nil?)
                                      (sort-by first))]
                  (.write wtr (str line "\n"))))))))

Reorder BAM alignment files to a reference dictionary, potentially swapping naming. Handles Human hg19 to GRCh37 naming conversions.

(ns bcbio.align.reorder
  (:import [net.sf.samtools SAMFileReader SAMFileWriterFactory SAMReadGroupRecord
            SAMTag SAMFileReader$ValidationStringency])
  (:use [clojure.java.io]
        [bcbio.align.ref :only [get-seq-dict]]
        [bcbio.run.broad :only [index-bam]]
        [bcbio.variation.normalize :only [prep-rename-map]])
  (:require [bcbio.run.itx :as itx]))

Add updated sequence dictionary and run group information to header.

(defn- updated-bam-header
  [in-bam ref-file call exp]
  (letfn [(update-rgs [rgs]
            (if-not (empty? rgs) rgs
                    [(doto (SAMReadGroupRecord. "1")
                       (.setLibrary (:sample exp))
                       (.setPlatform (get call :platform "illumina"))
                       (.setSample (:sample exp))
                       (.setPlatformUnit (:sample exp)))]))]
    (let [read-groups (update-rgs (-> in-bam .getFileHeader .getReadGroups))]
      (doto (-> in-bam .getFileHeader .clone)
        (.setSequenceDictionary (-> ref-file get-seq-dict))
        (.setReadGroups read-groups)))))

Retrieve order of chromosomes to fetch and mapping to new index.

(defn get-new-chr-order
  [bam-names ref-names ref-file]
  (letfn [(get-bam-name-map [bam-names orig-ref-names]
            (let [ref-names (set orig-ref-names)
                  name-remap (prep-rename-map :GRCh37 ref-file)]
              (reduce (fn [coll x]
                        (assoc coll (cond
                                     (contains? ref-names x) x
                                     (contains? name-remap x) (get name-remap x)
                                     :else (throw (Exception. (str "Could not map " x))))
                      {} bam-names)))
          (get-index-map [name-map]
            (let [bam-name-map (reduce (fn [coll [x y]] (assoc coll y x))
                                       {} name-map)]
              (reduce (fn [coll [i x]]
                        (assoc coll i (.indexOf ref-names (get bam-name-map x))))
                      {} (map-indexed vector bam-names))))]
    (when-not (every? #(apply = %) (partition 2 (interleave ref-names bam-names)))
      (let [name-map (get-bam-name-map bam-names ref-names)]
        {:names (remove nil? (map #(get name-map %) ref-names))
         :indexes (get-index-map name-map)}))))

Lazy sequence for BAM reads from a Picard iterator.

(defn bam-read-seq
   (when (.hasNext iter)
     (cons (.next iter) (bam-read-seq iter)))))

Write reordered BAM file in specified chromosome order.

(defn- write-reorder-bam
  [in-bam out-bam chr-order header]
  (let [default-rg-id (-> header .getReadGroups first .getId)]
    (letfn [(update-read [read]
              (let [new-rg-id (if-let [x (.getAttribute read (.name SAMTag/RG))] x
                (doto read
                  (.setHeader header)
                  (.setReferenceIndex (get (:indexes chr-order)
                                           (.getReferenceIndex read) -1))
                  (.setMateReferenceIndex (get (:indexes chr-order)
                                               (.getMateReferenceIndex read) -1))
                  (.setAttribute (.name SAMTag/RG) new-rg-id))))]
      (doseq [cur-chr (:names chr-order)]
        (with-open [iter (.query in-bam cur-chr 0 0 false)]
          (doseq [read (bam-read-seq iter)]
            (.addAlignment out-bam (update-read read)))))
      (with-open [iter (.queryUnmapped in-bam)]
        (doseq [read (bam-read-seq iter)]
          (.addAlignment out-bam (update-read read)))))))

Reorder and remap BAM file to match supplied reference file.

(defn reorder-bam
  [bam-file ref-file call exp & {:keys [out-dir]}]
  (let [out-file (itx/add-file-part bam-file "reorder" out-dir)]
    (when (itx/needs-run? out-file)
      (index-bam bam-file)
      (SAMFileReader/setDefaultValidationStringency SAMFileReader$ValidationStringency/LENIENT)
      (with-open [in-bam (SAMFileReader. (file bam-file))]
        (let [ref-names (map #(.getSequenceName %) (-> ref-file get-seq-dict .getSequences))
              bam-names (map #(.getSequenceName %) (-> in-bam .getFileHeader .getSequenceDictionary
              header (updated-bam-header in-bam ref-file call exp)]
          (if-let [chr-order (get-new-chr-order bam-names ref-names ref-file)]
              (with-open [out-bam (.makeSAMOrBAMWriter (SAMFileWriterFactory.)
                                                       header true (file out-file))]
                (write-reorder-bam in-bam out-bam chr-order header))
(defn -main [bam-file ref-file sample-name]
  (reorder-bam bam-file ref-file {} {:sample sample-name}))

High level functions to run software from Broad: GATK, Picard

(ns bcbio.run.broad
  (:import [org.broadinstitute.sting.gatk CommandLineGATK]
           [net.sf.samtools SAMFileReader SAMFileReader$ValidationStringency]
           [net.sf.picard.sam BuildBamIndex])
  (:use [clojure.java.io]
        [bcbio.align.ref :only [sort-bed-file create-ref-dict]])
  (:require [fs.core :as fs]
            [bcbio.run.itx :as itx]))

Ensure reference dictionary

(defn- create-ref-dict-gatk
  (when-let [ref-file (second (drop-while
                               #(not (contains? #{"-R" "--reference_sequence"} %))
    (create-ref-dict ref-file)))

Run a GATK commandline in an idempotent file-safe transaction. Contains a workaround to not die on errors while generating GATKReports, which occur when calling this externally as a library function.

(defn run-gatk
  [program args file-info map-info]
  (when (itx/needs-run? (map #(% file-info) (get map-info :out [])))
    (create-ref-dict-gatk args)
    (let [std-args (concat ["-T" program]
                           (when-not (contains? (set args) "--unsafe")
                             ["--unsafe" "LENIENT_VCF_PROCESSING"])
                           ["--read_filter" "BadCigar" "--read_filter" "NotPrimaryAlignment"])]
      (itx/with-tx-files [tx-file-info file-info (get map-info :out []) [".idx"]]
          (CommandLineGATK/start (CommandLineGATK.)
                                 (into-array (map str (itx/subs-kw-files
                                                       (concat std-args args)
          (catch java.lang.VerifyError e
            (when-not (.contains (.getMessage e) "GATKRunReport")
              (throw e))))))))

Generate BAM index, skipping if already present.

(defn index-bam
  (let [index-file (str in-bam ".bai")]
    (when (itx/needs-run? index-file)
      (SAMFileReader/setDefaultValidationStringency SAMFileReader$ValidationStringency/LENIENT)
      (BuildBamIndex/createIndex (SAMFileReader. (file in-bam)) (file index-file)))

Supply GATK commandline arguments for interval files, merging via intersection.

(defn gatk-cl-intersect-intervals
  [intervals ref-file & {:keys [vcf]}]
   (or (nil? intervals)
       (empty? intervals)) (if vcf ["-L" vcf] [])
   (coll? intervals) (concat (flatten (map #(list "-L" %)
                                           (map #(sort-bed-file % ref-file) intervals)))
                             ["--interval_set_rule" "INTERSECTION"])
   :else ["-L" (sort-bed-file intervals ref-file)]))

Functionality for running idempotent, transactional processes. Provides an API for long running processes in computational pipelines. Avoids re-running a process if it has produced the output file on a previous run, and leaving partially finished files in the case of premature termination.

(ns bcbio.run.itx
  (:import (java.io File))
  (:use [clojure.java.io])
  (:require [clojure.string :as string]
            [fs.core :as fs]))

Idempotent processing

avoid re-running when output files exist

Check if an output files need a run: any do not exist or empty file

(defn needs-run?
  [& fnames]
  (letfn [(file-non-empty? [f]
            (and (fs/exists? f)
                 (> (fs/size f) 0)))]
    (not-every? true?
                (map file-non-empty? (flatten fnames)))))

Substitute any keywords in the arguments from file information map.

(defn subs-kw-files
  [args file-info]
  (letfn [(maybe-sub-kw [x]
            (if (and (keyword? x)
                     (contains? file-info x))
              (get file-info x)
    (map maybe-sub-kw args)))


Handle output files in a separate transaction directory to avoid partially finished output files if long-running processes fail.

(defn temp-dir-w-prefix [root prefix]
  (let [dir (File/createTempFile prefix  (file root))]
    (fs/delete dir)
    (fs/mkdir dir)

Provide a temporary directory, removed when exiting the body.

(defmacro with-temp-dir
  [[tmp-dir base-dir] & body]
  `(let [~tmp-dir (temp-dir-w-prefix ~base-dir "tmp")]
        (fs/delete-dir ~tmp-dir)))))

Update file-info with need-tx files in a safe transaction directory.

(defn safe-tx-files
  [file-info need-tx]
  (let [tx-files (map #(get file-info %) need-tx)
        tx-dir (temp-dir-w-prefix (fs/parent (first tx-files)) "txtmp")]
    (reduce (fn [m [k v]]
              (assoc m k v))
            (zipmap need-tx
                    (map #(str (fs/file tx-dir (fs/base-name %))) tx-files)))))

Rename generated transaction files into expected file location.

(defn rename-tx-files
  [tx-file-info file-info need-tx exts]
  (doseq [tx-key need-tx]
    (let [tx-safe (get tx-file-info tx-key) 
          tx-final (get file-info tx-key)]
      (fs/rename tx-safe tx-final)
      (doseq [ext exts]
        (when (fs/exists? (str tx-safe ext))
          (fs/rename (str tx-safe ext) (str tx-final ext)))))))

Perform action with files, keeping need-tx files in a transaction.

(defmacro with-tx-files
  [[tx-file-info file-info need-tx exts] & body]
  (if (= (count need-tx) 0)
    `(do ~@body)
    `(let [~tx-file-info (safe-tx-files ~file-info ~need-tx)]
         (rename-tx-files ~tx-file-info ~file-info ~need-tx ~exts)
          (fs/delete-dir (fs/parent (get ~tx-file-info (first ~need-tx)))))))))

Handle a single file in a transaction directory.

(defmacro with-tx-file
  [[tx-file orig-file] & body]
  `(let [~tx-file (:out (safe-tx-files {:out ~orig-file} [:out]))]
       (rename-tx-files {:out ~tx-file} {:out ~orig-file} [:out] [])
        (fs/delete-dir (fs/parent ~tx-file))))))


Generate new file names from existing ones

Retrieve file name without extension: /path/to/fname.txt -> /path/to/fname

(defn file-root
  (let [i (.lastIndexOf fname ".")]
    (if (pos? i)
      (subs fname 0 i)

Add file extender: base.txt -> base-part.txt

(defn add-file-part
  ([fname part]
     (add-file-part fname part nil))
  ([fname part out-dir]
     (let [out-fname (format "%s-%s%s" (file-root fname) part (fs/extension fname))]
       (if-not (nil? out-dir)
         (str (fs/file out-dir (fs/base-name out-fname)))

Remove file specialization extender: base-part.txt -> base.txt

(defn remove-file-part
  [fname part]
  (string/replace (str fname) (str "-" part) ""))

Remove any zip extensions from the input filename

(defn remove-zip-ext
  (letfn [(maybe-remove-ext [fname ext]
            (if (.endsWith fname ext)
              (subs fname 0 (- (.length fname) (.length ext)))
    (let [exts [".tar.gz" "tar.bz2" ".gz" ".bz2" ".zip"]]
      (reduce maybe-remove-ext fname exts))))

File and directory manipulation

Remove file or directory only if it exists.

(defn remove-path
  (if (fs/exists? x)
    (if (fs/directory? x)
      (fs/delete-dir x)
      (fs/delete x))))

Predict functional consequences of variant changes leveraging snpEff.

(ns bcbio.variation.annotate.effects
  (:import [ca.mcgill.mcb.pcingola.snpEffect.commandLine
            SnpEffCmdEff SnpEffCmdDownload]
           [bcbio.variation.util ThreadLocalPrintStream])
  (:require [clojure.java.io :as io]
            [clojure.java.shell :as shell]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]))


(defn- get-snpeff-config
  (let [data-dir (str (fs/file base-dir "snpeff" "data"))
        orig-config (-> (ClassLoader/getSystemClassLoader)
                        (.getResourceAsStream "snpEff.config"))
        config-file (str (fs/file base-dir "snpeff" "snpEff.config"))]
    (when-not (fs/exists? data-dir)
      (fs/mkdirs data-dir))
    (when (itx/needs-run? config-file)
      (with-open [rdr (io/reader orig-config)
                  wtr (io/writer config-file)]
        (doseq [line (line-seq rdr)]
          (.write wtr (str
                       (if (.startsWith line "data_dir")
                         (str "data_dir = " data-dir)
    {:data-dir data-dir
     :config-file config-file}))

Check for a snpEff genome index and download if not present.

(defn download-genome
  [genome base-dir]
  (let [{:keys [data-dir config-file]} (get-snpeff-config base-dir)
        genome-dir (str (fs/file data-dir genome))]
    (when-not (fs/exists? genome-dir)
      (fs/mkdirs genome-dir)
      (doto (SnpEffCmdDownload.)
        (.parseArgs (into-array ["-c" config-file genome]))
      (doseq [x (fs/glob (str "*" genome ".zip"))]
        (fs/delete x)))

Annotate the input file with snpEff, providing predictions of variant effects.

(defn snpeff-annotate
  [in-file genome base-dir & {:keys [out-dir]}]
  (let [config-file (download-genome genome base-dir)
        out-file (itx/add-file-part in-file "effects" out-dir)]
    (when (itx/needs-run? out-file)
      ;; snpEff prints to standard out so we need to safely redirect that to a file.
      (itx/with-tx-file [tx-out out-file]
        (let [orig-out System/out]
            (with-open [wtr (java.io.PrintStream. tx-out)]
              (System/setOut (ThreadLocalPrintStream. wtr))
              (doto (SnpEffCmdEff.)
                (.parseArgs (into-array ["-noStats" "-c" config-file genome in-file]))
              (System/setOut orig-out))))))


(defn- get-vep-cmd [vep-dir]
  (let [vep-file (when vep-dir (str (fs/file (fs/expand-home vep-dir)
    (when (and vep-file (fs/exists? vep-file))

Run Ensembl Variant Effects Predictor on input variant file. Re-annotates the input file with CSQ field compatible with Gemini.

(defn run-vep
  [in-file vep-dir & {:keys [re-run?]}]
  (when-let [vep-cmd (get-vep-cmd vep-dir)]
    (let [out-file (itx/add-file-part in-file "vep")]
      (when (or (itx/needs-run? out-file) re-run?)
        (itx/with-tx-file [tx-out out-file]
          (shell/sh "perl" vep-cmd "-i" in-file "-o" tx-out "--vcf" "--cache"
                    "--terms" "so" "--sift" "b" "--polyphen" "b" "--hgnc" "--numbers"
                    "--fields" "Consequence,Codons,Amino_acids,Gene,HGNC,Feature,EXON,PolyPhen,SIFT")))

Calculate Shannon entropy for flanking sequence surrounding variants. Used to identify low-complexity repeat regions in variants. Based on 'vcfentropy' from Erik Garrison's vcflib: https://github.com/ekg/vcflib

(ns bcbio.variation.annotate.entropy
  (:import [org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation]
           [org.broadinstitute.sting.utils.codecs.vcf VCFInfoHeaderLine VCFHeaderLineType])
   :name bcbio.variation.annotate.entropy.ShannonEntropy
   :extends org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation))

Shannon entropy

From John Lawrence Aspden's information theory posts https://github.com/johnlawrenceaspden/hobby-code/blob/master/averygentleintroduction-part5.clj

(defn bits [n]
  "How many bits to represent n alternatives? Fractions allowed! Also know as log2."
  (/ (Math/log n) (Math/log 2)))
(defn shannon-entropy [P]
  (let [odds (map second P)
        total (reduce + odds)
        bits-aliased (/ (reduce + (map * odds (map bits odds))) total)]
    (- (bits total) bits-aliased)))

Calculate entropy of a sequence based on distribution of dimers. Splits sequence into all dimer 2bp windows, calculates frequency of each dimer and then feeds distribution to shannon calculation.

(defn seq-entropy
  (->> seq
       (partition 2 1)

Helper function

Retrieve sequence surrounding the current variant, with nbp flanking sequence.

(defn get-flank-seq
  [ref-context nbp]
  (letfn [(subset-region [x]
            (let [want-size (inc (* 2 nbp))
                  end-subtract (/ (- (count x) want-size) 2)]
              (subs x end-subtract (- (count x) end-subtract))))]
    (->> ref-context
         (map char)
         (apply str)

GATK walker

(def flank-bp 12)
(defn -getKeyNames [_]
(defn -getDescriptions [_]
  [(VCFInfoHeaderLine. "Entropy" 1 VCFHeaderLineType/Float
                       (format "Shannon entropy of variant flanking regions, %sbp on both sides"

Retrieve flanking region surrounding variant and calculate entropy.

(defn -annotate
  [_ _ _ ref _ _ _]
  {"Entropy" (->> (get-flank-seq ref flank-bp)
                  (format "%.2f"))})

Calculate delta G Minimum Free Energy for sequence secondary structures. Extracts regions surrounding variants and identifies the free energy of the most problematic secondary structures. Larger negative free energy values are especially stable and problematic.

(ns bcbio.variation.annotate.mfe
  (:import [org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation]
           [org.broadinstitute.sting.utils.codecs.vcf VCFInfoHeaderLine VCFHeaderLineType])
  (:use [circdesigna.core :only [min-free-energy]]
        [bcbio.variation.annotate.entropy :only [get-flank-seq]])
   :name bcbio.variation.annotate.entropy.MinFreeEnergy
   :extends org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation))
(def flank-bp 15)
(defn -getKeyNames [_]
(defn -getDescriptions [_]
  [(VCFInfoHeaderLine. "MFE" 1 VCFHeaderLineType/Float
                       (format (str "delta G minimum free energy of the most problematic "
                                    "secondary structure +/- %sbp around variant")
(defn- safe-min-free-energy
    (min-free-energy seq)
    (catch java.lang.ArrayIndexOutOfBoundsException e

Retrieve flanking region surrounding variant and calculate MFE.

(defn -annotate
  [_ _ _ ref _ _ _]
  {"MFE" (->> (get-flank-seq ref flank-bp)
              (format "%.2f"))})

GATK annotator that calculates Mean Neighboring Base Quality (NBQ) for variants.

The motivation for this annotation is that regional base quality influences whether a call is correct. The Atlas2 paper describes the metric in more detail:


(ns bcbio.variation.annotate.nbq
  (:import [org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation]
           [org.broadinstitute.sting.utils.codecs.vcf VCFInfoHeaderLine VCFHeaderLineType]
           [org.broadinstitute.sting.utils BaseUtils])
  (:require [incanter.stats :as istats])
   :name bcbio.variation.annotate.nbq.MeanNeighboringBaseQuality
   :extends org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation))
(def flank-bp 5)
(defn -getKeyNames
(defn -getDescriptions
  [(VCFInfoHeaderLine. "NBQ" 1 VCFHeaderLineType/Float
                       (format "Mean Neighboring Base Quality, includes %sbp on both sides"

Provide Mean Neighboring Base Quality calculations at a position.

- Get a pileup for each sample context.
- Use pileup to retrieve reads and current offsets.
- Filter reads to those that match an alternative base
- Get quality from reads and pull out qualities in surrounding region
- Calculate mean and return.
(defn -annotate
  [_ _ _ _ contexts vc _]
  (letfn [(orient-reads [[offset read]]
            (if (.getReadNegativeStrandFlag read)
              {:offset offset
               :bases (BaseUtils/simpleReverseComplement (.getReadBases read))
               :quals (-> read .getBaseQualities vec reverse)}
              {:offset offset
               :bases (.getReadString read)
               :quals (-> read .getBaseQualities vec)}))
          (neighbor-qualities [{:keys [offset quals]}]
            (map #(nth quals % nil) (range (- offset flank-bp) (+ offset flank-bp))))
          (supports-alt? [alt-bases {:keys [offset bases]}]
            (let [base (char (nth bases offset))]
              (contains? alt-bases base)))
          (pileup-qualities [alt-bases pileup]
            (->> (map vector (.getOffsets pileup) (.getReads pileup))
                 (map orient-reads)
                 (filter (partial supports-alt? alt-bases))
                 (map neighbor-qualities)))]
    (let [alt-bases (->> (.getAlternateAlleles vc)
                              (map #(.getBaseString %))
                              (map first)
      (when (seq alt-bases)
        {"NBQ" (->> contexts
                    (map #(.getBasePileup %))
                    (map (partial pileup-qualities alt-bases))
                    (remove nil?)
                    (format "%.2f"))}))))

Annotate variant calls with metrics for assessing false positives http://www.broadinstitute.org/gsa/wiki/index.php/VariantAnnotator

(ns bcbio.variation.annotation
  (:use [bcbio.variation.utils.cgmetrics :only [add-cgmetrics]])
  (:require [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.run.broad :as broad]))

Standard annotations applied to variants

(def  std-annotations
  ["AlleleBalance" "BaseQualityRankSumTest" "DepthOfCoverage"
   "FisherStrand" "GCContent" "HaplotypeScore" "HomopolymerRun"
   "MappingQualityRankSumTest" "MappingQualityZero"
   "MeanNeighboringBaseQuality" "QualByDepth"
   "ReadPosRankSumTest" "RMSMappingQuality"
   "DepthPerAlleleBySample" "AlleleBalanceConfidenceInterval"
   "MostProbableGenotype" "ReadMeanLen" "ReadMeanPos"
   "ReadPosEndDist" "MinFreeEnergy" "ShannonEntropy"])

Add GATK annotation metrics to variant calls.

(defn add-gatk-annotations
  [in-vcf align-bam ref & {:keys [out-dir intervals annos]}]
  {:pre [(not (nil? align-bam))]}
  (let [file-info {:out-vcf (itx/add-file-part in-vcf "annotated" out-dir)}
        ready-annos (if annos annos std-annotations)
        args (concat ["-R" ref
                      "-I" align-bam
                      "--variant" in-vcf
                      "-o" :out-vcf]
                     (reduce #(concat %1 ["-A" %2]) [] ready-annos)
                     (broad/gatk-cl-intersect-intervals intervals ref :vcf in-vcf))]
    (broad/index-bam align-bam)
    (broad/run-gatk "VariantAnnotator" args file-info {:out [:out-vcf]})
    (:out-vcf file-info)))

Flexible addition of additions to a variant file. Handles GATK annotations and Complete Genomics metrics.

(defn add-variant-annotations
  [vcf-file bam-file ref-file call & {:keys [out-dir intervals]}]
  (let [x (get call :annotate "")
        ann (cond
             (true? x) "gatk"
             (false? x) ""
             :else x)]
     (and (= ann "gatk") (not (nil? bam-file)))
     (add-gatk-annotations vcf-file bam-file ref-file :out-dir out-dir :intervals intervals)
     (.contains ann "masterVar")
     (add-cgmetrics vcf-file ann ref-file :out-dir out-dir)
     :else vcf-file)))

Provide top level API for retrieving available files for a user. Encapsulates distributed storage in GenomeSpace as well as locally produced files.

(ns bcbio.variation.api.file
  (:use [clojure.java.io]
        [bcbio.variation.api.shared :only [web-config remote-file-cache]])
  (:require [clojure.java.shell :as shell]
            [clojure.string :as string]
            [clj-stacktrace.repl :as stacktrace]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.variation.annotate.effects :as effects]
            [bcbio.variation.index.metrics :as metrics]
            [bcbio.variation.index.gemini :as gemini]
            [bcbio.variation.normalize :as normalize]
            [bcbio.variation.remote.core :as remote]))

File retrieval, with caching

Retrieve default + public directories for specific remote instances.

(defn- get-default-public
  (case (:type rclient)
    :gs (map (fn [x] {:id x})
             (cons "." (remove #(.contains % (:username rclient))
                               (get-in @web-config [:remote :public]))))
    :galaxy (remote/list-dirs rclient nil)

Update file cache for our current user and filetype

(defn- update-user-files
  [rclient ftype]
   (let [dirnames (get-default-public rclient)
         file-info (mapcat #(remote/list-files rclient % ftype) dirnames)]
     (swap! remote-file-cache assoc [(:username rclient) ftype] file-info)

Retrieve file information for files of the specified type, with caching.

(defn list-files-w-cache
  [rclient ftype]
  (let [cache-info (get @remote-file-cache [(:username rclient) ftype])]
    (if (seq cache-info)
        (future (update-user-files rclient ftype))
      (update-user-files rclient ftype))))

Pre-fetching of data for front end interactivity

Prepare biodata directory, synchronizing with GenomeSpace client.

(defn- prep-biodata-dir
  [rclient biodata-dir]
  (letfn [(download-and-unpack [gs-id]
            (let [zip-file (remote/get-file gs-id rclient)]
              (when (and (fs/exists? zip-file)
                         (itx/needs-run? (itx/remove-zip-ext zip-file)))
                (shell/sh "gunzip" zip-file)
                (spit zip-file (str "unzipped to " (itx/remove-zip-ext zip-file))))))]
    (doall (map #(download-and-unpack (:id %))
                (remote/list-files rclient biodata-dir :gz)))))

Set of files currently under preparation, preventing double work.

(def ^{:doc 
       :private true}
  prep-queue (atom #{}))

Do actual work of preparing input file: resort and annotate for effects.

(defn- prep-file*
  [in-file ref-info out-dir cache-dir]
  (let [current-ref (normalize/pick-best-ref in-file (cons (:genome ref-info) (:genome-alts ref-info)))]
    (-> in-file
        (normalize/prep-vcf (:genome ref-info) nil :out-dir out-dir :orig-ref-file current-ref
                            :config {:remove-refcalls false})
        (effects/snpeff-annotate (:effects ref-info) cache-dir :out-dir out-dir))))

Provide knob to avoid doing labor intensive prep for testing environments.

(def ^{:doc 
       :dynamic true}
  *skip-prep* false)

Setup preparation for preparing downloading input files.

(defn- prep-file
  [in-file ref-info is-new?]
  (let [out-file (if *skip-prep* in-file (itx/add-file-part in-file "prep"))
        cache-dir (get-in @web-config [:dir :cache])]
    (when (or (itx/needs-run? out-file) is-new?)
      (when-not (contains? @prep-queue in-file)
          (swap! prep-queue conj in-file)
          (itx/with-temp-dir [out-dir (fs/parent in-file)]
            (fs/rename (prep-file* in-file ref-info out-dir cache-dir)
          (catch Exception ex
            (stacktrace/pst ex)
            (throw ex))
           (swap! prep-queue disj in-file)))))

Provide list of files currently indexing.

  index-queue (atom #{}))

Check if a file is newly created, or out of date with server.

(defn- is-file-new?
  [local-file finfo]
  (if (or (nil? (:created-on finfo)) (nil? (:size finfo)))
    (or (not= (fs/size local-file) (:size finfo))
        (< (fs/mod-time local-file) (.getTime (:created-on finfo))))))

Retrieve a file from remote server and prepare for analysis: - Checks for remote updates - Downloads the file if necessary - Preps the file for analysis, including resorting to reference genome and annotating. - Indexes for metric retrieval.

(defn get-prep-and-index
  [finfo rclient]
  (let [ref-info (first (:ref @web-config))
        test-local-file (remote/get-file (:id finfo) rclient)
        is-new? (is-file-new? test-local-file finfo)
        local-file (if is-new?
                       (fs/delete test-local-file)
                       (remote/get-file (:id finfo) rclient))
        ready-file (prep-file local-file ref-info is-new?)]
    (when ready-file
      (when-not (contains? @index-queue ready-file)
          (swap! index-queue conj ready-file)
          (metrics/index-variant-file ready-file (:genome ref-info) :re-index? is-new?
                                      :subsample-params (:params @web-config))
          (gemini/index-variant-file ready-file (:genome ref-info) :re-index? is-new?
                                     :subsample-params (:params @web-config))
          (catch Exception ex
            (stacktrace/pst ex))
           (swap! index-queue disj ready-file)))))

Retrieve and pre-index files for analysis from the remote client.

(defn pre-fetch-remotes
  (doall (map (partial update-user-files rclient) [:vcf]))
  (when-let [cache-dir (get-in @web-config [:dir :cache])]
    (when-let [biodata-dir (get-in @web-config [:remote :biodata])]
      (prep-biodata-dir rclient biodata-dir))
    (doseq [x (list-files-w-cache rclient :vcf)]
      (get-prep-and-index x rclient))))

Client API, with pre-fetching

Top level retrieval of a client from username/password to pre-connected client. As a side effect, pre-retrieve and caches files and associated information.

(defn get-client
  [creds & {:keys [pre-fetch? allow-offline?]
            :or {pre-fetch? true}}]
  (let [rclient (remote/get-client (-> creds
                                       (assoc :allow-offline? allow-offline?)
                                       (assoc :type (get creds :type :gs))))]
    (when (and pre-fetch? (:conn rclient))
      (future (pre-fetch-remotes rclient)))

Retrieve local cached files supporting offline processing XXX Needs updating to hook back in for full offline analysis reboot.

(defn- get-local-dl-files
  [ftype & {:keys [dirnames]}]
  {:pre [(nil? dirnames)]}
  (letfn [(check-dir-for-type [root _ files]
            (->> files
                 (filter #(.endsWith % (str "." (name ftype))))
                 (map #(str (file root %)))))
          (convert-to-api [cache-dir fname]
            {:id fname
             :tags []
             :filename (str (fs/base-name fname))
             :folder (string/replace (str (fs/parent fname)) (str (fs/file cache-dir)) "")
             :size (fs/size fname)
             :created-on (java.util.Date. (fs/mod-time fname))})]
    (let [cache-dir (get-in @web-config [:dir :cache])]
      (->> (fs/walk check-dir-for-type cache-dir)
           (map (partial convert-to-api cache-dir))))))

Provide high level API for accessing variant associated metrics.

(ns bcbio.variation.api.metrics
  (:import [org.jfree.data.statistics HistogramDataset HistogramType])
  (:use [bcbio.variation.api.shared :only [web-config]]
        [bcbio.variation.variantcontext :only [get-vcf-iterator parse-vcf]])
  (:require [clojure.set :as set]
            [bcbio.variation.api.file :as fileapi]
            [bcbio.variation.index.metrics :as im]
            [bcbio.variation.index.gemini :as gemini]))

Helper functions

(declare available-metrics)
(defn- get-histogram-bins
  [items n bin-min bin-max]
  "Retrieve values binned into a histogram using JFree Chart."
  (let [ds (doto (HistogramDataset.)
             (.setType HistogramType/RELATIVE_FREQUENCY)
             (.addSeries 0 (double-array items) n bin-min bin-max))]
    {:x (map #(.getXValue ds 0 %) (range (.getItemCount ds 0)))
     :y (map #(.getYValue ds 0 %) (range (.getItemCount ds 0)))}))

Remove nil values and empty input metrics.

(defn- clean-raw-metrics
  [raw metrics]
  (reduce (fn [coll [k vs]]
            (let [clean-vs (remove nil? vs)]
              (if (empty? clean-vs)
                (assoc coll k clean-vs))))
          (zipmap metrics (map (fn [x] (map #(get % x) raw)) metrics))))

Retrieve the configured range of a metric

(defn- get-metric-range
  (-> (filter #(= (:id %) metric) (available-metrics nil))

Bin metrics in preparation for histogram display using predefined min-max boundaries.

(defn- prepare-plot-metrics
  [metric raw]
  (let [bins 100
        [bin-min bin-max] (get-metric-range metric)
        data (get-histogram-bins raw bins bin-min bin-max)]
    {:vals (:y data)
     :bin-width (- (second (:x data)) (first (:x data)))
     :x-scale {:type :linear
               :domain [bin-min bin-max]}
     :y-scale {:type :linear}}))

Retrieve raw metrics from multiple sources, combining on IDs.

(defn- combined-raw-metrics
  [vcf-file ref-file metrics use-subsample?]
  (letfn [(metrics-by-id [metrics-fn]
            (reduce (fn [coll x]
                      (assoc coll (:id x) x))
                    (metrics-fn vcf-file ref-file :metrics (when metrics (map :id metrics))
                                :use-subsample? use-subsample?)))
          (present-metrics-by-id [base metrics-fn]
            (-> (metrics-by-id metrics-fn)
                (select-keys (keys base))))]
    (let [base-metrics (metrics-by-id im/get-raw-metrics)]
      (->> (merge-with merge
                       (present-metrics-by-id base-metrics gemini/get-raw-metrics))
           (sort-by :id)))))

API functions

(defn available-metrics
  [file-id & {:keys [rclient]}]
  (let [vcf-file (when file-id (fileapi/get-prep-and-index {:id file-id} rclient))]
    (concat (im/available-metrics vcf-file)
            (gemini/available-metrics vcf-file))))

Provide metrics for a VCF file ready for plotting and visualization.

(defn plot-ready-metrics
  [in-vcf-file & {:keys [metrics rclient]}]
  (let [vcf-file (fileapi/get-prep-and-index {:id in-vcf-file} rclient)
        ref-file (-> @web-config :ref first :genome)
        plot-metrics (or metrics (available-metrics in-vcf-file :rclient rclient))
        raw-metrics (clean-raw-metrics
                     (combined-raw-metrics vcf-file ref-file plot-metrics false)
                     (map :id plot-metrics))]
    {:filename in-vcf-file
     :created-on (java.util.Date.)
     :metrics (map #(merge % (prepare-plot-metrics (:id %) (get raw-metrics (:id %))))
                   (remove #(nil? (get raw-metrics (:id %))) plot-metrics))}))

Retrieve available choices for categorical variables from raw data.

(defn- collect-category-choices
  [raw to-collect]
  (reduce (fn [coll [data cur-id]]
            (if-let [v (get data cur-id)]
              (let [vs (if (set? v) v #{v})]
                (assoc coll cur-id (set/union vs (get coll cur-id))))
          (into {} (for [y to-collect] [y #{}]))
          (for [x raw, y to-collect]
            [x y])))

Finalize metrics information providing high level choices for categorical variables.

(defn- finalize-metrics
  [metrics raw]
  (let [choices (->> metrics
                     (filter #(= :category (get-in % [:x-scale :type])))
                     (map :id)
                     (collect-category-choices raw))]
    (letfn [(add-choices [m]
              (if-let [c (get choices (:id m))]
                (assoc m :choices c)
            (finalize-metric [m]
              (-> m
                  (dissoc :rows)))]
      (map finalize-metric metrics))))

Retrieve raw metrics values from input VCF.

(defn get-raw-metrics
  [variant-id & {:keys [metrics rclient use-subsample?]}]
  (let [vcf-file (fileapi/get-prep-and-index {:id variant-id} rclient)
        ref-file (-> @web-config :ref first :genome)
        metrics (or metrics (available-metrics variant-id :rclient rclient))
        raw (combined-raw-metrics vcf-file ref-file metrics use-subsample?)]
    {:raw raw
     :metrics (finalize-metrics metrics raw)}))

High level API to run analyses.

(ns bcbio.variation.api.run
  (:use [bcbio.variation.filter :only [category-variant-filter]]
        [bcbio.variation.api.shared :only [web-config]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.variation.api.file :as file-api]
            [bcbio.variation.remote.core :as remote]
            [bcbio.variation.workflow.xprize :as xprize]))

Run analysis on provided inputs, dispatching on analysis type

(defmulti do-analysis
  (fn [atype params rclient] (keyword atype)))
(defn- to-prep-file [x]
  (itx/add-file-part x "prep"))
(defn- from-prep-file [x]
  (itx/remove-file-part x "prep"))

Run filtering, pushing results to remote file store. Returns list of output files following pushing the filter.

(defn- run-filter
  [atype params rclient]
  (let [ref-file (-> @web-config :ref first :genome)
        in-file (to-prep-file (remote/get-file (:filename params) rclient))
        filter-file (category-variant-filter in-file (:metrics params) ref-file :remove? true)
        local-out-dir (fs/file (fs/parent in-file) (name atype))
        final-filter-file (str (fs/file local-out-dir (from-prep-file (fs/base-name filter-file))))]
    (when-not (fs/exists? local-out-dir)
      (fs/mkdirs local-out-dir))
    (doseq [ext ["" ".idx"]]
      (fs/rename (str filter-file ext) (str final-filter-file ext)))
    (doseq [ext ["-prep-gemini.db" "-prep-metrics.db" "-prep.vcf"]]
      (let [idx-name (str (fs/file local-out-dir (fs/name final-filter-file)) ext)]
        (when (fs/exists? idx-name)
          (fs/delete idx-name))))
    (remote/put-file rclient final-filter-file
                     {:input-file (:filename params)
                      :expose-fn (:expose-fn params)
                      :tag (name atype)
                      :file-type :vcf})
    ;; Do not need to re-prep this file since it's a subset of a prep
    (fs/touch final-filter-file)
    (fs/copy final-filter-file (to-prep-file final-filter-file))
    (future (file-api/pre-fetch-remotes rclient))
(defmethod do-analysis :filter
  ^{:doc "Filter an input file according to specified metrics.
            - filename: The file to process
            - metrics: A map of filters, with metrics names as keys
              and either ranges ([min max]) or categories as values."}
  [atype params rclient]
  {:runner (future (to-prep-file
                    (if (empty? (:metrics params))
                      (remote/get-file (:filename params) rclient)
                      (run-filter atype params rclient))))})

Prepare X prize input files, potentially pulling from remote directories.

(defn- prep-xprize-files
  [work-info rclient]
  (letfn [(get-remote-files [work-info]
            (reduce (fn [coll kw]
                      (assoc coll kw (when-let [f (get coll kw)]
                                       (remote/get-file f rclient :out-dir (:dir work-info)))))
                    work-info [:variant-file :region-file]))]
    (-> work-info
        (assoc :orig-variant-file (:variant-file work-info))

Upload X Prize results files back to remote directories.

(defn- upload-xprize-files
  [{:keys [work-info comparison]} rclient params]
  (when-not (nil? (:conn rclient))
    (when-let [remote-input (:orig-variant-file work-info)]
      (doseq [x (map #(get-in comparison [:c-files %])
                     [:concordant :discordant :discordant-missing :phasing-error :summary])]
        (let [ftype (cond
                     (.endsWith x ".vcf") :vcf
                     :else :tabular)]
          (remote/put-file rclient x {:dbkey :hg19
                                      :file-type ftype
                                      :input-file remote-input
                                      :expose-fn (:expose-fn params)
                                      :tag "xprize"})))))
(defmethod do-analysis :xprize
  ^{:doc "Run X Prize comparison and scoring analysis on provided input files.
            - variant-file: Input variant file, in VCF format, to compare.
            - region-file: Optional BED file of regions to score on.
            - comparison-genome: Name of genome to compare against. Used
              to look up comparison details in configuration file.
            - host-info: Host information for providing callbacks to a local server."}
  [atype params rclient]
  (let [work-info (xprize/prep-scoring params @web-config)]
    {:runner (future (-> work-info
                         (prep-xprize-files rclient)
                         (xprize/run-scoring-analysis rclient @web-config)
                         (upload-xprize-files rclient params)))
     :work-info work-info}))

Shared functionality useful across multiple API calls.

(ns bcbio.variation.api.shared
  (:use [clojure.java.io]
        [bcbio.variation.remote.client :only [gs-default-server]]
        [bcbio.variation.web.db :only [prepare-web-db]])
  (:require [clojure.string :as string]
            [clj-yaml.core :as yaml]))

Web configuration, loaded from input YAML file

  web-config (atom nil))

Hold directory of remote files by user and filetype.

  remote-file-cache (atom {}))
(defn url->dir
  (string/replace (.getHost (as-url url)) "." "_"))
(defn load-web-config
  (let [config (-> config-file slurp yaml/parse-string)]
    (letfn [(maybe-fix-biodata [x]
              (if (.startsWith x "biodata:")
                (str (get-in config [:dir :cache])
                     "/" (url->dir gs-default-server)
                     (get-in config [:remote :biodata])
                     (string/replace-first x "biodata:" ))
            (fix-gs-ref [ref]
              (reduce (fn [coll k]
                        (let [val (get coll k)]
                          (assoc coll k
                                  (string? val) (maybe-fix-biodata val)
                                  (seq? val) (map maybe-fix-biodata val)
                                  :else val))))
                      ref (keys ref)))]
      (assoc config :ref (map fix-gs-ref (:ref config))))))

Set configuration and database information from input YAML file.

(defn set-config-from-file!
  (let [config (load-web-config config-file)]
    (reset! web-config (assoc config :db
                              (prepare-web-db (str (file (get-in config [:dir :work])

Identify callable bases from a BAM alignment file. Help differentiate positions where we can not assess variation

(ns bcbio.variation.callable
  (:import [org.broad.tribble.bed BEDCodec]
           [org.broad.tribble.index IndexFactory]
           [org.broad.tribble AbstractFeatureReader])
  (:use [clojure.java.io]
        [bcbio.align.ref :only [sort-bed-file]]
        [bcbio.variation.variantcontext :only [get-vcf-source]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.run.broad :as broad]))

Identify callable bases from the provided alignment file.

(defn identify-callable
  [align-bam ref & {:keys [out-dir intervals]}]
  (let [base-dir (if (or (nil? out-dir)
                         (fs/writeable? (fs/parent align-bam)))
                   (fs/parent align-bam)
        base-fname (str (file base-dir (-> align-bam fs/base-name itx/file-root)))
        file-info {:out-bed (format "%s-callable.bed" base-fname)
                   :out-summary (format "%s-callable-summary.txt" base-fname)}
        args (concat ["-R" ref
                      "-I" align-bam
                      "--out" :out-bed
                      "--summary" :out-summary]
                     (broad/gatk-cl-intersect-intervals intervals ref))]
    (if-not (fs/exists? base-dir)
      (fs/mkdirs base-dir))
    (broad/index-bam align-bam)
    (broad/run-gatk "CallableLoci" args file-info {:out [:out-bed :out-summary]})
    (:out-bed file-info)))
(defn features-in-region [source space start end]
  (with-open [bed-iter (.query source space start end)]
    (vec (for [f bed-iter]
           {:chr (.getChr f)
            :start (.getStart f)
            :end (.getEnd f)
            :name (.getName f)
            :score (.getScore f)
            :strand (.getStrand f)}))))

Provide tribble feature source for a BED formatted file.

(defn get-bed-source
  [bed-file ref-file]
  (let [batch-size 500
        work-bed (sort-bed-file bed-file ref-file)
        idx (IndexFactory/createIntervalIndex (file work-bed) (BEDCodec.) batch-size)]
    (AbstractFeatureReader/getFeatureReader work-bed (BEDCodec.) idx)))
(defn get-bed-iterator
  [bed-file ref-file]
  (.iterator (get-bed-source bed-file ref-file)))

Create BED file of callable regions from the BAM alignment file. Pass the callable BED to GATK for subsetting based on callable intervals.

(defn get-callable-bed
  [align-bam ref & {:keys [out-dir intervals]}]
  (let [orig-bed-file (identify-callable align-bam ref :out-dir out-dir
                                         :intervals intervals)
        out-file (itx/add-file-part orig-bed-file "intervals")]
    (with-open [bed-iter (get-bed-iterator orig-bed-file ref)
                wtr (writer out-file)]
      (doseq [f bed-iter]
        (when (= (.getName f) "CALLABLE")
          (.write wtr (format "%s\t%s\t%s\n" (.getChr f)
                              (dec (.getStart f)) (.getEnd f))))))

Limit input BED intervals to only chromosomes found in a VCF file.

(defn limit-bed-intervals
  [intervals call exp config]
  (let [out-file (itx/add-file-part intervals (:name call) (get-in config [:dir :prep]))]
    (when (or (itx/needs-run? out-file)
              (> (fs/mod-time intervals) (fs/mod-time out-file)))
      (with-open [rdr (reader intervals)
                  wtr (writer out-file)
                  call-vcf-s (get-vcf-source (:file call) (:ref exp))]
        (let [seq-names (set (.getSequenceNames call-vcf-s))]
          (doseq [x (filter #(contains? seq-names (first (string/split % #"\t")))
                            (line-seq rdr))]
            (.write wtr (str x "\n"))))))

Multiple callables

Provide callable checker for potentially multiple inputs

(defprotocol CallableChecker
  (has-callers? [this])
  (is-callable? [this space start end]))
(defrecord BamCallable [sources check-fn]
  (has-callers? [_]
    (not (empty? sources)))
  (is-callable? [_ space start end]
    (letfn [(source-is-callable? [source space start end]
              (if (<= start end)
                (> (count (features-in-region source space start end)) 0)
      (if (empty? sources)
        (check-fn #(source-is-callable? % space start end) sources))))
  (close [_]
    (doseq [x sources]
      (.close x))))

Retrieve generalized callabilitu checkers that handles multiple file inputs. Checks if a chromosome start end region is callable based on reads in input BAM files.

(defn get-callable-checker
  [bam-files ref & {:keys [out-dir intervals check-fn]
                 :or {check-fn some}}]
  (let [work-bam-files (remove nil? (if (coll? bam-files) bam-files [bam-files]))
        sources (map #(-> (get-callable-bed % ref :out-dir out-dir :intervals intervals)
                          (get-bed-source ref))
    (BamCallable. sources check-fn)))

Combine variant files, handling no-calls versus reference calls

  1. Combine the variants to create a merged set of positions to call at
  2. For each variant file: a. Generate callability at each position b. Combine original calls with merged positions c. Walk through each no-call and set as reference if callable
(ns bcbio.variation.combine
  (:import [org.broadinstitute.sting.utils.variantcontext 
  (:use [clojure.tools.cli :only [cli]]
        [bcbio.variation.complex :only [normalize-variants]]
        [bcbio.variation.filter.intervals :only [vcf-sample-name select-by-sample]]
        [bcbio.variation.haploid :only [diploid-calls-to-haploid]]
        [bcbio.variation.multisample :only [get-out-basename multiple-samples?]]
        [bcbio.variation.normalize :only [prep-vcf clean-problem-vcf]]
        [bcbio.variation.phasing :only [is-haploid?]]
        [bcbio.variation.structural :only [write-non-svs]]
        [bcbio.variation.variantcontext :only [get-vcf-header write-vcf-w-template
                                               get-vcf-iterator parse-vcf
                                               get-vcf-retriever variants-in-region]])
  (:require [fs.core :as fs]
            [clojure.string :as string]
            [bcbio.run.itx :as itx]
            [bcbio.run.broad :as broad]))

Combine multiple variant files with GATK CombineVariants. Only correctly handles all-by-all comparisons with the same ploidy level.

(defn combine-variants
  [vcfs ref & {:keys [merge-type out-dir intervals unsafe name-map base-ext check-ploidy? quiet-out?]
               :or {merge-type :unique
                    unsafe false
                    name-map {}
                    check-ploidy? true}}]
  (when (and check-ploidy?
             (> (count (set (remove nil? (map #(is-haploid? % ref) vcfs)))) 1))
    (throw (Exception. (format "Haploid and non-haploid combinations not supported: %s %s"
                               (vec vcfs) (vec (map #(is-haploid? % ref) vcfs))))))
  (letfn [(unique-name [i f]
            (if quiet-out?
              (str "v" i)
              (string/replace (get name-map f
                                   (-> f fs/base-name itx/file-root))
                              "-" "_")))]
    (let [base-dir (if (nil? out-dir) (fs/parent (first vcfs)) out-dir)
          full-base-name (-> vcfs first fs/base-name itx/remove-zip-ext)
          base-name (if (nil? base-ext) full-base-name
                        (format "%s-%s.vcf" (first (string/split full-base-name #"-"))
          file-info {:out-vcf (str (fs/file base-dir
                                            (itx/add-file-part base-name
                                                               (case merge-type
                                                                 :minimal "mincombine"
                                                                 :full "fullcombine"
          args (concat ["-R" ref
                        "-o" :out-vcf
                        "--rod_priority_list" (string/join "," (map-indexed unique-name vcfs))]
                       ;(if unsafe ["--unsafe" "ALLOW_SEQ_DICT_INCOMPATIBILITY"] [])
                       (if unsafe ["--unsafe" "ALL"] [])
                       (if quiet-out? ["--suppressCommandLineHeader" "--setKey" "null"] [])
                       (flatten (map-indexed #(list (str "--variant:" (unique-name %1 %2)) %2) vcfs))
                       (broad/gatk-cl-intersect-intervals intervals ref)
                       (case merge-type
                         :full ["--genotypemergeoption" "PRIORITIZE"]
                         :unique ["--genotypemergeoption" "UNIQUIFY"]
                         :minimal ["--sites_only" "--minimalVCF"]))]
      (if-not (fs/exists? base-dir)
        (fs/mkdirs base-dir))
      (broad/run-gatk "CombineVariants" args file-info {:out [:out-vcf]})
      (:out-vcf file-info))))

Clean multi-alleles

Clean up variant contexts with multi-allele, consolidating calls and removing unused alleles.

(defn- clean-multialleles
  [retriever vcs]
  (letfn [(others-at-pos [vcs retriever]
            (filter #(apply = (map (juxt :chr :start) [% (first vcs)]))
                    (apply variants-in-region
                           (cons retriever ((juxt :chr :start :end) (first vcs))))))
          (get-ref-alt-alleles [vc]
            (let [ref (.getDisplayString (:ref-allele vc))]
              (map (fn [x] [ref (.getDisplayString x)]) (:alt-alleles vc))))
          (sort-by-allele-count [xs]
            (let [count-groups (group-by val xs)
                  topcount-alleles (keys (get count-groups (apply max (keys count-groups))))]
              (first (sort-by #(count (first %)) topcount-alleles))))]
    (let [alleles (reduce (fn [coll x]
                            (assoc coll x (inc (get coll x 0))))
                          {} (mapcat get-ref-alt-alleles (others-at-pos vcs retriever)))
          final-alleles (if (empty? alleles)
                          (-> vcs first get-ref-alt-alleles first)
                          (sort-by-allele-count alleles))]
      (-> (VariantContextBuilder. (:vc (first vcs)))
          (.alleles final-alleles)
          (.stop (+ (:start (first vcs)) (if (= 0 (count (second final-alleles)))
                                           (count (first final-alleles))
                                           (max 0 (dec (count (first final-alleles)))))))

Fix multiple alleles in a VCF produced by combining multiple inputs. This combines calls present at multiple positions and removes multi-alleles not present in input calls.

(defn fix-minimal-combined
  [combined-vcf vcfs ref]
  (let [out-file (itx/add-file-part combined-vcf "fix")]
    (when (itx/needs-run? out-file)
      (with-open [vcf-iter (get-vcf-iterator combined-vcf ref)]
        (write-vcf-w-template combined-vcf {:out out-file}
                              (map (partial clean-multialleles (apply get-vcf-retriever (cons ref vcfs)))
                                   (partition-by (juxt :chr :start)
                                                 (parse-vcf vcf-iter)))

Check if interval BED files overlap with current analysis genome build. This is useful when an input VCF is from an alternate genome and needs conversion. In this case we shouldn't yet be using interval selection.

(defn- genome-safe-intervals
  [intervals ref-file exp]
  (if (or (nil? ref-file) (= ref-file (:ref exp)))

Prepare input file for comparisons based on configuration: - Selecting a single sample from multi-sample files - Resorting and fixing chromosome naming - Removing reference call genotypes This organizes the logic which get convoluted for different cases. The approach is to select a single sample and remove refcalls if we have a multiple sample file, so the sample name will be correct.

(defn- dirty-prep-work
  [in-file call exp intervals out-dir out-fname]
  (letfn [(run-sample-select [in-file ref-file ext]
            (select-by-sample (:sample exp) in-file (str (:name call) ext)
                              ref-file :out-dir out-dir
                              :intervals (genome-safe-intervals intervals ref-file exp)
                              :remove-refcalls (get call :remove-refcalls false)))]
    (let [sample-file (if (and (multiple-samples? in-file) (:sample exp))
                        (run-sample-select in-file (get call :ref (:ref exp)) "")
          prep-file (if (and (true? (:prep call))
                             (not= (:ref exp) (:ref call)))
                      (prep-vcf sample-file (:ref exp) (:sample exp) :out-dir out-dir
                                :out-fname out-fname :orig-ref-file (:ref call)
                                :config call)
          hap-file (if (true? (:make-haploid call))
                     (diploid-calls-to-haploid prep-file (:ref exp) :out-dir out-dir)
          noref-file (if (or (and (not (multiple-samples? in-file)) (:remove-refcalls call))
                             (and (not (nil? (:ref call))) (not (empty? intervals))))
                       (run-sample-select hap-file (:ref exp) "-noref")

Prepare call information for VCF comparisons by normalizing through GATK. Handles:

  1. Combining multiple input files
  2. Fixing reference and sample information.
  3. Splitting combined MNPs into phased SNPs
(defn gatk-normalize
  [call exp intervals out-dir transition]
  (if-not (fs/exists? out-dir)
    (fs/mkdirs out-dir))
  (letfn [(merge-call-files [call in-files]
            (let [ref (get call :ref (:ref exp))]
              (combine-variants in-files ref
                                :merge-type :full :out-dir out-dir
                                :intervals (genome-safe-intervals intervals ref exp)
                                :check-ploidy? false
                                :unsafe true)))]
    (let [in-files (if (coll? (:file call)) (:file call) [(:file call)])
          out-fname (str (get-out-basename exp call in-files) ".vcf")
          _ (transition :clean (str "Cleaning input VCF: " (:name call)))
          clean-files (vec (map #(if-not (:preclean call) %
                                         (clean-problem-vcf % (:ref exp) (:sample exp) :out-dir out-dir))
          _ (transition :merge (str "Merging multiple input files: " (:name call)))
          merge-file (if (> (count clean-files) 1)
                       (merge-call-files call clean-files)
                       (first clean-files))
          _ (transition :prep (str "Prepare VCF, resorting to genome build: " (:name call)))
          prep-file (dirty-prep-work merge-file call exp intervals out-dir out-fname)]
      (transition :normalize (str "Normalize MNP and indel variants: " (:name call)))
      (assoc call :file (if (true? (get call :normalize true))
                          (normalize-variants prep-file (:ref exp) out-dir
                                              :out-fname out-fname)

Top-level entry points

Provide convenient entry to fully normalize a variant file for comparisons.

(defn full-prep-vcf
  [vcf-file ref-file & {:keys [max-indel resort keep-ref]}]
  (let [out-file (itx/add-file-part vcf-file "fullprep")]
    (when (itx/needs-run? out-file)
      (itx/with-temp-dir [out-dir (fs/parent vcf-file)]
        (let [exp {:sample (-> vcf-file get-vcf-header .getGenotypeSamples first)
                   :ref ref-file :params {:max-indel max-indel}}
              call {:name "fullprep" :file vcf-file :preclean true
                    :prep true :normalize true :prep-sv-genotype false
                    :fix-sample-header true
                    :prep-sort-pos resort
                    :remove-refcalls (not keep-ref)}
              out-info (gatk-normalize call exp [] out-dir
                                       (fn [_ x] (println x)))
              nosv-file (if max-indel
                          (write-non-svs (:file out-info) (:ref exp) (:params exp))
                          (:file out-info))]
          (fs/rename nosv-file out-file))))
(defn -main [& args]
  (let [[options [vcf-file ref-file] banner]
        (cli args
             ["-i" "--max-indel" "Maximum indel size to include" :default nil
              :parse-fn #(Integer. %)]
             ["-s" "--resort" "Resort input file by coordinate position" :default false :flag true]
             ["-r" "--keep-ref" "Keep reference (0/0) calls" :default false :flag true])]
    (when (or (:help options) (nil? vcf-file) (nil? ref-file))
      (println "Required arguments:")
      (println "    <vcf-file> VCF input file to prepare.")
      (println "    <ref-file> Genome reference file (GRCh37/b37 coordinates)")
      (println banner)
      (System/exit 0))
    (let [out-file (full-prep-vcf vcf-file ref-file :max-indel (:max-indel options)
                                  :resort (:resort options) :keep-ref (:keep-ref options))]
      (println out-file)
      (System/exit 0))))

Generate comparisons between two sets of variant calls. Utilizes GATK walkers to generate detailed and summary statistics about two sets of calls:

  • Identify non-callable regions with CallableLociWalker
  • Combine variants from two samples
  • Use VariantEval to calculate overall concordance statistics
  • Provide output for concordant and discordant regions for detailed investigation
(ns bcbio.variation.compare
  (:use [clojure.java.io]
        [clojure.math.combinatorics :only [combinations]]
        [ordered.map :only [ordered-map]]
        [bcbio.align.reorder :only [reorder-bam]]
        [bcbio.variation.annotation :only [add-variant-annotations]]
        [bcbio.variation.callable :only [get-callable-bed]]
        [bcbio.variation.combine :only [combine-variants gatk-normalize]]
        [bcbio.variation.config :only [load-config do-transition]]
        [bcbio.variation.evaluate :only [calc-variant-eval-metrics]]
        [bcbio.variation.filter :only [variant-filter variant-format-filter
        [bcbio.variation.filter.intervals :only [combine-multiple-intervals]]
        [bcbio.variation.multiple :only [prep-cmp-name-lookup pipeline-compare-multiple]]
        [bcbio.variation.multisample :only [compare-two-vcf-flexible
        [bcbio.variation.recall :only [create-merged]]
        [bcbio.variation.structural :only [compare-sv-pipeline]]
        [bcbio.variation.validate :only [pipeline-validate]]
        [bcbio.variation.variantcontext :only [parse-vcf write-vcf-w-template
  (:require [clojure.string :as string]
            [clj-yaml.core :as yaml]
            [fs.core :as fs]
            [lonocloud.synthread :as ->]
            [bcbio.run.itx :as itx]
            [bcbio.run.broad :as broad]
            [bcbio.variation.grade :as grade]
            [bcbio.variation.phasing :as phasing]
            [bcbio.variation.report :as report]))

Variance assessment

Variant comparison producing 3 files: concordant and both directions discordant

(defn select-by-concordance
  [sample call1 call2 ref & {:keys [out-dir intervals]}]
  (let [base-dir (if (nil? out-dir) (fs/parent (:file call1)) out-dir)
        ready-intervals (remove nil? (flatten [intervals (:intervals call1)
                                               (:intervals call2)]))]
    (if-not (fs/exists? base-dir)
      (fs/mkdirs base-dir))
     (for [[c1 c2 cmp-type] [[call1 call2 "concordance"]
                             [call1 call2 "discordance"]
                             [call2 call1 "discordance"]]]
       (let [file-info {:out-vcf (str (fs/file base-dir
                                               (format "%s-%s-%s-%s.vcf"
                                                       sample (:name c1) (:name c2) cmp-type)))}
             args (concat
                   ["-R" ref
                    "--sample_name" sample
                    "--variant" (:file c1)
                    (str "--" cmp-type) (:file c2)
                    "--out" :out-vcf]
                   (broad/gatk-cl-intersect-intervals ready-intervals ref))]
         (broad/run-gatk "SelectVariants" args file-info {:out [:out-vcf]})
         (:out-vcf file-info))))))

Custom parsing and combinations

Utilizes GATK VariantContexts

Lazy stream of VariantContexts categorized by concordant/discordant matching.

(defn- vc-by-match-category
  (letfn [(genotype-alleles [g]
            (vec (map #(.toString %) (:alleles g))))
          (is-concordant? [vc]
            (= (-> (map genotype-alleles (:genotypes vc))
    (for [vc (parse-vcf vcf-iter)]
      [(if (is-concordant? vc) :concordant :discordant)
       (:vc vc)])))

Provide concordant and discordant variants for two variant files.

(defn split-variants-by-match
  [vcf1 vcf2 ref]
  (let [combo-file (combine-variants [vcf1 vcf2] ref)
        out-map {:concordant (itx/add-file-part combo-file "concordant")
                 :discordant (itx/add-file-part combo-file "discordant")}]
    (if-not (fs/exists? (:concordant out-map))
      (with-open [combo-vcf-iter (get-vcf-iterator combo-file ref)]
        (write-vcf-w-template combo-file out-map (vc-by-match-category combo-vcf-iter)


Process a directory of variant calls from multiple sources, generating a summary of concordance plus detailed metrics differences for tweaking filters.

Retrieve BAM files associated with alignments, normalizing if needed.

(defn- prepare-input-bams
  [exp out-dir]
  (let [call-bams (map (fn [c] [(get c :align (:align exp)) c]) (:calls exp))]
    (map (fn [[b c]] (when-not (nil? b)
                     (reorder-bam b (:ref exp) c exp :out-dir out-dir)))

Prepare merged and annotated VCF files for an experiment.

(defn- prepare-vcf-calls
  [exp config]
  (let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))
        transition (partial do-transition config)
        align-bams (prepare-input-bams exp out-dir)
        all-intervals (remove nil? (map :intervals (cons exp (:calls exp))))
        start-vcfs (vec (map #(gatk-normalize % exp all-intervals out-dir transition)
                             (:calls exp)))
        _ (transition :combine "Creating merged VCF files for all comparisons")
        merged-vcfs (create-merged (map :file start-vcfs) align-bams exp
                                   :out-dir out-dir
                                   :intervals all-intervals)
        _ (transition :annotate "Annotate VCFs with metrics")
        ann-vcfs (map (fn [[v b c]]
                        (add-variant-annotations v b (:ref exp) c :out-dir out-dir
                                                 :intervals all-intervals))
                      (map vector merged-vcfs align-bams (:calls exp)))
        _ (transition :filter "Post annotation filtering")
        filter-vcfs (map (fn [[v c]]
                            (:filters c) (variant-filter v (:filters c) (:ref exp))
                            (:format-filters c) (variant-format-filter v (:format-filters c)
                                                                       (:ref exp))
                            :else v))
                         (map vector ann-vcfs (:calls exp)))]
    (map (fn [[c v b]] (-> c
                           (assoc :file v)
                           (assoc :align b)))
         (map vector (:calls exp) filter-vcfs align-bams))))

Compare two standard VCF files based on the supplied configuration.

(defn- compare-two-vcf-standard
  [c1 c2 exp config]
  (letfn [(callable-intervals [exp c1 c2]
            (let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))
                  align-bams (remove nil? (map :align [c1 c2]))]
              (when (and (:intervals exp) (seq align-bams))
                (combine-multiple-intervals (:intervals exp) align-bams (:ref exp)
                                            :out-dir out-dir :name (:sample exp)))))
          (discordant-name [x]
            (format "%s-discordant" (:name x)))
          (zipmap-ordered [xs1 xs2]
            (apply ordered-map (interleave xs1 xs2)))]
    (let [c-files (select-by-concordance (:sample exp) c1 c2 (:ref exp)
                                         :out-dir (get-in config [:dir :out])
                                         :intervals (:intervals exp))
          eval (calc-variant-eval-metrics (:sample exp) (:file c1) (:file c2) (:ref exp)
                                          :out-base (first c-files)
                                          :intervals (:intervals exp))
          c-eval (calc-variant-eval-metrics (:sample exp) (:file c1) (:file c2) (:ref exp)
                                            :out-base (itx/add-file-part (first c-files) "callable")
                                            :intervals (callable-intervals exp c1 c2))]
      {:c-files (zipmap-ordered (map keyword
                                     ["concordant" (discordant-name c1) (discordant-name c2)])
       :c1 c1 :c2 c2 :exp exp :dir (config :dir)
       :metrics (report/concordance-report-metrics (:sample exp) eval)
       :callable-metrics (report/concordance-report-metrics (:sample exp) c-eval)})))

Compare two VCF files, handling standard and haploid specific comparisons.

(defn compare-two-vcf
  [c1 c2 exp config]
  (do-transition config :compare (format "Comparing VCFs: %s vs %s" (:name c1) (:name c2)))
  (let [[c1 c2 sv-cmp] (if-not (:mod c1)
                         (compare-sv-pipeline c1 c2 exp config)
                         [c1 c2 {}])
        phased-vcfs (group-by #(-> % :file (phasing/is-haploid? (:ref exp))) [c1 c2])
        out-cmp (cond
                 (get phased-vcfs true) (phasing/compare-two-vcf-phased phased-vcfs exp config)
                 (multiple-samples? (:file c1)) (compare-two-vcf-flexible c1 c2 exp config)
                 :else (compare-two-vcf-standard c1 c2 exp config))
        grade-cmp (if (grade/is-grade-cmp? exp)
                    (grade/annotate-discordant out-cmp)
    (assoc grade-cmp :c-files (reduce (fn [coll [k v]] (assoc coll k v))
                                      (:c-files grade-cmp) sv-cmp))))

Customizable finalizer comparisons

Run a post-pairwise comparison function, returning updated comparison details,

(defmulti run-finalizer
  (fn [cmps finalizer exp config] (-> finalizer :method keyword)))
(defmethod run-finalizer :recal-filter
  [& args]
  (apply pipeline-recalibration args))
(defmethod run-finalizer :multiple
  [& args]
  (apply pipeline-compare-multiple args))
(defmethod run-finalizer :validate
  [& args]
  (apply pipeline-validate args))

Finalize all comparisons with finished initial pass data.

(defn finalize-comparisons
  [cmps exp config]
  (letfn [(add-summary [x]
            (-> x
                (assoc :exp exp)
                (->/as cur-cmp
                  (assoc :summary (report/top-level-metrics cur-cmp)))
                (->/when (grade/is-grade-cmp? exp)
                  (->/as cmp-w-summary
                    (assoc :grade-breakdown (grade/prep-grade-breakdown cmp-w-summary))))))
          (update-w-finalizer [cur-cmps finalizer]
            "Update the current comparisons with a defined finalizer."
            (do-transition config :finalize
                           (format "Finalize %s: %s" (:method finalizer)
                                   (let [t (:target finalizer)]
                                     (if (coll? t) (string/join ", " t) t))))
            (let [updated-cmp (run-finalizer cur-cmps finalizer exp config)]
              (assoc cur-cmps (map #(get-in updated-cmp [% :name]) [:c1 :c2])
                     (if-not (:re-compare updated-cmp) updated-cmp
                             (compare-two-vcf (:c1 updated-cmp) (:c2 updated-cmp) exp config)))))]
    (->> (reduce update-w-finalizer
                 (prep-cmp-name-lookup cmps) (:finalize exp))
         (map add-summary))))


(defn- get-summary-writer [config config-file ext]
  (if-not (nil? (get-in config [:dir :out]))
      (if-not (fs/exists? (get-in config [:dir :out]))
        (fs/mkdirs (get-in config :dir :out)))
      (writer (str (fs/file (get-in config [:dir :out])
                            (format "%s-%s"
                                    (itx/file-root (fs/base-name config-file)) ext)))))
    (writer System/out)))

Perform comparison between variant calls using inputs from YAML config.

(defn variant-comparison-from-config
  (let [config (load-config config-file)
        comparisons (flatten
                     (for [exp (:experiments config)]
                       (let [cmps (for [[c1 c2] (combinations (prepare-vcf-calls exp config) 2)]
                                    (compare-two-vcf c1 c2 exp config))]
                         (finalize-comparisons cmps exp config))))
        grading-file (str (fs/file (get-in config [:dir :out])
                                   (format "%s-grading.yaml" (itx/file-root (fs/base-name config-file)))))]
    (do-transition config :summary "Summarize comparisons")
    (with-open [w (get-summary-writer config config-file "summary.txt")]
      (report/write-summary-txt w comparisons))
    (with-open [w (get-summary-writer config config-file "files.csv")]
      (report/write-files-csv w comparisons config))
    (with-open [w (get-summary-writer config config-file "summary.csv")]
      (report/write-summary-csv w comparisons))
    (when-let [bdowns (seq (remove nil? (map :grade-breakdown comparisons)))]
      (spit grading-file (yaml/generate-string bdowns)))
    (do-transition config :finished "Finished")
(defn -main [config-file]
    (variant-comparison-from-config config-file)
    (catch Throwable t
      (.printStackTrace t)
      (System/exit -1))
      (System/exit 0))))

Handle complex variations representations: multi-nucleotide polymorphisms and indels.

(ns bcbio.variation.complex
  (:import [org.broadinstitute.sting.utils.variantcontext Allele
            VariantContextBuilder GenotypesContext GenotypeBuilder
           [org.biojava3.core.sequence DNASequence]
           [org.biojava3.alignment Alignments SimpleGapPenalty
  (:use [clojure.java.io]
        [clojure.set :only [union]]
        [ordered.set :only [ordered-set]]
        [bcbio.align.ref :only [extract-sequence]]
        [bcbio.variation.variantcontext :only [parse-vcf write-vcf-w-template
  (:require [clojure.string :as string]
            [bcbio.run.broad :as broad]
            [bcbio.run.itx :as itx]
            [fs.core :as fs]))

Multi-nucleotide polymorphisms (MNPs)

Split into single variant primitives.

Do a set of alleles have any variants at a position.

(defn- has-variant-base?
  [alleles i]
  (> (count (set (map #(nth % i nil) alleles)))
(defn- get-vc-alleles [vc]
  (vec (map #(.getDisplayString %) (cons (:ref-allele vc) (:alt-alleles vc)))))

Identify complex indels that can be split into multiple calls.

(defn is-multi-indel?
  (letfn [(monomorphic-alleles? [vc]
            (= 1 (->> (get-vc-alleles vc)
                      (map set)
                      (apply union)
          (has-multiple-nonref-alleles? [vc]
            (and (> (.length (:ref-allele vc)) 1)
                 (> (apply min (map #(.length %) (:alt-alleles vc))) 1)
                 (not (monomorphic-alleles? vc))))
          (has-ref-padding-mismatch? [vc]
            (let [alleles (get-vc-alleles vc)]
              (not= (nth (first alleles) 0) (nth (second alleles) 0))))]
    (and (= "INDEL" (:type vc))
         (or (has-multiple-nonref-alleles? vc)
             (has-ref-padding-mismatch? vc)))))
(defn- contains-indel? [alleles i]
  (when (< i (count (first alleles)))
    (contains? (set (map #(str (nth % i)) alleles)) "-")))
(defn- starts-an-indel? [alleles i]
  (contains-indel? alleles (inc i)))
(defn- gap-end? [alleles i]
  (and (pos? i)
       (not (contains-indel? alleles i))
       (contains-indel? alleles (dec i))))
(defn- is-match? [alleles i]
  (= 1 (count (set (map #(str (nth % i)) alleles)))))

Detect single call SNP variants within a MNP genotype. Handles reference no-variant padding bases on the 5' end of the sequence, writing only variants at the adjusted positions.

(defn- split-alleles
  [vc alleles & {:keys [prev-pad]}]
  (letfn [(is-internal-indel? [alleles i]
            (and (pos? i)
                 (contains-indel? alleles i)
                 (is-match? alleles (dec i))))
          (is-anchor-mismatch? [alleles i]
            (and (= 1 i)
                 (not (is-match? alleles i))
                 (is-match? alleles 0)))
          (is-fiveprime-indel? [alleles i]
            (and (zero? i)
                  (starts-an-indel? alleles i)
                  (contains-indel? alleles i))))
          (needs-padding? [alleles i]
            (or (pos? i)
                (and (is-fiveprime-indel? alleles i)
                     (is-match? alleles i))
                (and (not (is-fiveprime-indel? alleles i))
                     (not (is-match? alleles i)))))
          (has-nopad-five-indel? [alleles i]
            (and (is-fiveprime-indel? alleles i)
                 (contains-indel? alleles i)))
          (extend-indels [alleles i]
            {:start (if (or (is-internal-indel? alleles i)
                            (is-fiveprime-indel? alleles i))
                      (max (dec i) 0)
             :end (inc (or (last (take-while #(or (contains-indel? alleles %)
                                                  (starts-an-indel? alleles %))
                                             (range i (count (first alleles)))))
          (ref-and-alt-alleles [cur-alleles]
            (let [refa (first cur-alleles)
                  alts (map (fn [x]
                              (if (= (.getDisplayString x) (.getDisplayString refa))
                                refa x))
                            (rest cur-alleles))]
              {:ref refa :alts alts}))
          (extract-variants [alleles pos]
            (let [{:keys [start end]} (extend-indels alleles pos)
                  str-alleles (map #(-> (str (if (has-nopad-five-indel? alleles start) prev-pad "")
                                             (subs % start end))
                                        (string/replace "-" ""))
                  cur-alleles (map-indexed (fn [i x] (Allele/create x (= 0 i)))
                  size (let [base (.length (first cur-alleles))]
                         (if (some empty? str-alleles) base (dec base)))
                  w-gap-start (-> (first alleles) (subs 0 start) (string/replace "-" "") count)
                  ready-alleles (ref-and-alt-alleles cur-alleles)]
              {:offset (+ w-gap-start (if (has-nopad-five-indel? alleles start) -1 0))
               :end (+ w-gap-start size)
               :next-start end
               :size size
               :orig-alleles alleles
               :ref-allele (:ref ready-alleles)
               :alleles (:alts ready-alleles)}))]
    (remove nil?
            (loop [i 0 final []]
               (>= i (-> alleles first count)) final
               (has-variant-base? alleles i)
               (let [next-var (extract-variants alleles i)]
                 (recur (:next-start next-var) (conj final next-var)))
               :else (recur (inc i) final))))))

Retrieve a new set of genotypes with the given alleles. Update genotypes from the VariantContext, copying the existing genotype and substituting in the provided alleles and phasing information.

(defn- genotype-w-alleles
  [vc alleles orig-alleles is-phased]
  (letfn [(get-new-allele [new-alleles orig-alleles]
            (let [old-map (into {} (map-indexed
                                    (fn [i x]
                                      [(string/replace x "-" "") i])
              (fn [old-allele]
                (nth new-alleles (get old-map (.getDisplayString old-allele))))))
          (add-new-genotype [allele-mapper context genotype]
            (doto context
              (.replace (-> (GenotypeBuilder. genotype)
                            (.alleles (map allele-mapper (.getAlleles genotype)))
                            (.phased (or (.isPhased genotype) is-phased))
    (reduce (partial add-new-genotype (get-new-allele alleles orig-alleles))
            (-> vc .getGenotypes GenotypesContext/copy)
            (.getGenotypes vc))))

Create a new VariantContext as a subset of an existing variant. allele-info specifies the location size and alleles for the new variant: {:offset :size :ref-allele :alleles}

(defn- new-split-vc
  [vc i allele-info]
  (let [pos (+ (:offset allele-info) (.getStart vc))
        all-alleles (cons (:ref-allele allele-info) (:alleles allele-info))]
    (-> (VariantContextBuilder. vc)
        (.start pos)
        (.stop (+ pos (get allele-info :size 0)))
        (.genotypes (genotype-w-alleles vc all-alleles (:orig-alleles allele-info)
                                        (> i 0)))
        (.alleles (set all-alleles))

Split a MNP into individual alleles

(defn- split-mnp
  (let [alleles (split-alleles vc (get-vc-alleles vc))]
    (map (fn [[i x]] (new-split-vc (:vc vc) i x)) (map-indexed vector alleles))))


Create a normalized representation for comparison.

Perform alignment of input sequences using BioJava.

(defn- multiple-alignment
  (letfn [(original-seq-position [seqs]
            (let [orig-order (into {} (map-indexed (fn [i x] [x i])
                                                   (into (ordered-set) seqs)))]
              (fn [x]
                (get orig-order (string/replace x "-" "")))))
          (unique-aligns [xs]
            (vals (reduce (fn [coll x]
                            (assoc coll (string/replace x "-" "") x))
                          {} xs)))
          (all-gap? [xs]
            (= (set (map str xs)) #{"-"}))
          (finalize-alignment [seqs]
            (let [n (count seqs)
                  gap-free (remove all-gap? (partition n (apply interleave (take n seqs))))]
              (map (fn [i]
                     (string/join "" (map #(nth % i) gap-free)))
                   (range n))))]
    (let [align-args (to-array [(SimpleGapPenalty. 20 1)])
          base-align (map #(.getSequenceAsString %)
                          (-> (map #(DNASequence. %) seqs)
                              (Alignments/getMultipleSequenceAlignment align-args)
          orig-align (sort-by (original-seq-position seqs) (unique-aligns base-align))]
      (finalize-alignment orig-align))))

Left align variants that start with a gap mismatch.

(defn- fix-gap-start-mismatch
  (letfn [(make-5-gap-wref [x]
            (let [anchor (subs x 0 1)
                  nogap-x (string/replace (subs x 1) "-" "")]
              (string/join "" (conj (vec (cons anchor
                                               (repeat (dec (- (count x) (count nogap-x))) "-")))
    (if (.contains (second alleles) "-")
      [(first alleles) (make-5-gap-wref (second alleles))]
      [(make-5-gap-wref (first alleles)) (second alleles)])))

Ensure reference alignment gaps next to variants are consistently left aligned. Adjacent SNP and indels can have the SNP placed anywhere within the indel. This left aligns them to maintain anchoring via the 5' reference. ATCT => ATCT AC-- A--C

(defn- left-align-complex
  {:pre [(= 2 (count alleles))]
   :post [(= (count (first alleles))
             (count (first %)))]}
  (letfn [(gap-start-mismatch? [alleles i]
            (or (and (starts-an-indel? alleles i)
                     (not (is-match? alleles i))
                     (not (contains-indel? alleles i)))
          (gap-allele-type [alleles i]
             (gap-start-mismatch? alleles i) :gs-mismatch
             (contains-indel? alleles i) :gap
             (gap-end? alleles i) :gap-end
             (is-match? alleles i) :match
             :else :mismatch))
          (split-at-match-gaps [[ann _]]
            (if (= :gap-end ann) ann :match))
          (get-region-allele [xs allele]
            (apply str (map #(nth allele (second %)) xs)))
          (get-region-alleles [alleles xs]
            (let [orig-alleles (map (partial get-region-allele xs) alleles)]
              (if (contains? (set (map first xs)) :gs-mismatch) 
                (fix-gap-start-mismatch orig-alleles)
          (concat-results [allele-parts]
            (vec (map #(apply str (map % allele-parts)) [first second])))]
     (->> (map (fn [x] [(gap-allele-type alleles x) x]) (range (count (first alleles))))
          (partition-by split-at-match-gaps)
          (map (partial get-region-alleles alleles))))))

Confirm that new variants match correctly back to original. Catch any potential errors in splitting by ensuring reference coordinates and sequences match original.

(defn- sanity-check-split-vcs
  [vc new-vcs]
  (letfn [(get-vc-info [vc]
            (let [alleles (map #(.getDisplayString %) (.getAlleles vc))]
              {:start (.getStart vc)
               :alleles alleles}))
          (get-check-ref [orig new]
            (let [int-pos (- (:start new) (:start orig))
                  check-ref (first (:alleles new))]
              (if (neg? int-pos)
                [0 (subs check-ref (Math/abs int-pos))]
                [int-pos check-ref])))
          (check-split-vc [orig new]
            (let [[int-pos check-ref] (get-check-ref orig new)]
              (when (or (>= int-pos (count (first (:alleles orig))))
                        (neg? int-pos)
                        (not= (subs (first (:alleles orig)) int-pos (+ int-pos (count check-ref)))
                (throw (Exception. (format "Matching problem with split alleles: %s %s %s %s"
                                           (:chr vc) (:start vc) orig new))))))]
    (doall (map (partial check-split-vc (get-vc-info (:vc vc)))
                (map get-vc-info new-vcs)))))

Split complex indels into individual variant components.

(defn- split-complex-indel
  [vc ref]
  (let [prev-pad (or (extract-sequence ref (:chr vc) (dec (:start vc)) (dec (:start vc))) "N")
        ; Do not use reference sequence. Causes more trouble than aligning ref/alt directly.
        ref-seq nil ;(extract-sequence ref (:chr vc) (:start vc) (:end vc))
        alleles (split-alleles vc (->> (conj (get-vc-alleles vc) ref-seq)
                                       (remove empty?)
                                       (remove nil?)
                                       (#(if (> (count %) 2) % (left-align-complex %))))
                               :prev-pad prev-pad)]
    (when-not (= (count alleles) (count (set (map :offset alleles))))
      (throw (Exception. (format "Mutiple alleles at same position: %s %s %s"
                                 (:chr vc) (:start vc) (vec alleles)))))
    (let [split-vcs (map (fn [[i x]] (new-split-vc (:vc vc) i x))
                         (map-indexed vector alleles))]
      (sanity-check-split-vcs vc split-vcs)

Remove extra variant bases, if necessary, from 5' end of indels. Checks both called alleles and potential alleles for extra 5' padding removing this if not needed to distinguish any potential alleles.

(defn- maybe-strip-indel
  (letfn [(strip-indel [vc i alleles]
            (let [start-pos (dec i)
                  ref-allele (subs (first alleles) start-pos)
                  cur-alleles (map #(Allele/create (subs % start-pos)
                                                   (= ref-allele (subs % start-pos)))
              (new-split-vc vc 0 {:offset start-pos
                                  :size (dec (count ref-allele))
                                  :orig-alleles alleles
                                  :ref-allele (first cur-alleles)
                                  :alleles (rest cur-alleles)})))
          (variant-allele-pos [input-alleles]
            (let [str-alleles (map #(.getDisplayString %) input-alleles)
                  first-var-i (first (filter #(has-variant-base? str-alleles %)
                                             (range (apply max (map count str-alleles)))))]
              [str-alleles first-var-i]))
          (used-alt-alleles [vc]
            (let [genotype-alleles (set (mapcat :alleles (:genotypes vc)))]
              (filter #(contains? genotype-alleles %) (:alt-alleles vc))))]
    (let [alt-alleles (used-alt-alleles vc)
          [orig-alleles first-var-i] (variant-allele-pos (cons (:ref-allele vc)
          [_ nocall-i] (variant-allele-pos (cons (:ref-allele vc) alt-alleles))]
      (if (or (nil? first-var-i) (<= first-var-i 1)
              (nil? nocall-i) (<= nocall-i 1))
        (:vc vc)
        (strip-indel (:vc vc) first-var-i orig-alleles)))))

VCF file conversion

Process entire files, normalizing complex variations

Round the start value of a VC to the nearest ten million. This heuristic will cause problems with out of order variant contexts that span this split junction (9999999 and 10000000) but saves having to work with overlapping groups allowing streaming.

(defn- round-vc-start
  (let [rounder 10000000.0]
    {:chr (.getChr vc)
     :pos (-> (.getStart vc)
              (/ rounder)
              (* rounder)

Sort a group of variant contexts by start position. Ensures that post-normalized variant contexts sort correctly within the blocks defined by round-vc-start.

(defn- sort-vc-group
  (sort-by #(.getStart %) vcs))

Lazy list of variant context with MNPs split into single genotypes and indels stripped.

(defn- get-normalized-vcs
  [vc-iter ref]
  (letfn [(process-vc [vc]
            (condp = (:type vc)
              "MNP" (split-mnp vc)
              "INDEL" (if (is-multi-indel? vc)
                        (split-complex-indel vc ref)
                        [(maybe-strip-indel vc)])
              [(:vc vc)]))]
    (->> (mapcat process-vc vc-iter)
         (partition-by round-vc-start)
         (mapcat sort-vc-group))))

Left align variants in an input VCF file for a standard representation. Checks final line count of prepared file, returning left-aligned files only when converting every variant in the input.

(defn left-align-variants
  [in-file ref & {:keys [out-dir rerun?]}]
  (letfn [(line-count [f]
            (with-open [rdr (reader f)]
              (count (remove #(.startsWith % "#") (line-seq rdr)))))]
    (let [file-info {:out-vcf (itx/add-file-part in-file "leftalign" out-dir)}
          args ["-R" ref "-o" :out-vcf "--variant" in-file]]
      (when (and rerun? (fs/exists? (:out-vcf file-info)))
        (fs/delete (:out-vcf file-info)))
      (broad/run-gatk "LeftAlignVariants" args file-info {:out [:out-vcf]})
      (if (= (line-count in-file) (line-count (:out-vcf file-info)))
        (:out-vcf file-info)

Convert MNPs and indels into normalized representation.

(defn normalize-variants
  ([in-file ref]
     (normalize-variants in-file ref nil))
  ([in-file ref out-dir & {:keys [out-fname]}]
     (let [base-name (if (nil? out-fname) (itx/remove-zip-ext in-file) out-fname)
           out-file (itx/add-file-part base-name "nomnp" out-dir)
           out-pre-file (itx/add-file-part base-name "worknomnp" out-dir)]
       (when (itx/needs-run? out-file)
         (when (fs/exists? out-pre-file)
           (fs/delete out-pre-file))
         (let [la-file (left-align-variants in-file ref :out-dir out-dir :rerun? true)]
           (with-open [vcf-iter (get-vcf-iterator la-file ref)]
             (write-vcf-w-template in-file {:out out-pre-file}
                                   (get-normalized-vcs (parse-vcf vcf-iter) ref)
           (fs/rename (left-align-variants out-pre-file ref :out-dir out-dir :rerun? true)

Load and prepare inputs from YAML configuration files.

(ns bcbio.variation.config
  (:use [clojure.java.io]
        [clj-time.local :only [format-local-time local-now]])
  (:require [clojure.string :as string]
            [clojure.tools.logging :as log]
            [clj-stacktrace.repl :as stacktrace]
            [clj-yaml.core :as yaml]
            [fs.core :as fs]
            [pallet.algo.fsm.fsm :as fsm-base]
            [pallet.algo.fsm.fsm-dsl :as fsm]
            [pallet.algo.fsm.event-machine :as event-machine]))


(defn- get-log-file [config]
  (let [out-dir (get-in config [:dir :out])]
    (when-not (nil? out-dir)
      (when-not (fs/exists? out-dir)
        (fs/mkdirs out-dir))
      (file out-dir "processing-status.log"))))

Retrieve current processing status information from state machine log file.

(defn get-log-status
  (when-let [log-file (get-log-file config)]
    (when (fs/exists? log-file)
      (with-open [rdr (reader log-file)]
        (let [[_ state-str info-str] (string/split (last (line-seq rdr)) #" :: ")]
          (-> (read-string info-str)
              (assoc :state (read-string (last (string/split state-str #" "))))))))))

Write an error exception to the processing log file

(defn traceback-to-log
  [e config]
  (with-open [wtr (writer (get-log-file config) :append true)]
    (binding [*out* wtr]
      (stacktrace/pst e)
      (println (str (format-local-time (local-now) :date-hour-minute-second)
                    " :: State :error :: {:desc \"Exception during processing: "
                    e "\"}")))))

Define a finite state machine of transitions during comparison processes.

(defn prep-comparison-fsm
  (let [out-file (get-log-file config)]
    (letfn [(log-transition [_ new-state]
              (let [out (format "State %s :: %s" (:state-kw new-state)
                                (:state-data new-state))]
                (log/log :info out)
                (when out-file
                  (spit out-file (str (format-local-time (local-now) :date-hour-minute-second)
                                      " :: " out "\n") :append true))))]
      (log-transition nil {:state-kw :begin :state-data {:desc "Starting variation analysis"}})
        (fsm/using-fsm-features (fsm-base/with-transition-observer log-transition))
        (fsm/initial-state :begin)
        (fsm/initial-state-data {})
        (fsm/state :begin
                   (fsm/valid-transitions :clean))
        (fsm/state :clean
                   (fsm/valid-transitions :merge))
        (fsm/state :merge
                   (fsm/valid-transitions :prep))
        (fsm/state :prep
                   (fsm/valid-transitions :normalize))
        (fsm/state :normalize
                   (fsm/valid-transitions :combine :clean))
        (fsm/state :combine
                   (fsm/valid-transitions :annotate))
        (fsm/state :annotate
                   (fsm/valid-transitions :filter))
        (fsm/state :filter
                   (fsm/valid-transitions :compare))
        (fsm/state :compare
                   (fsm/valid-transitions :compare :finalize :summary :clean :finished))
        (fsm/state :finalize
                   (fsm/valid-transitions :finalize :compare :summary :clean))
        (fsm/state :summary
                   (fsm/valid-transitions :finished :clean))
        (fsm/state :finished))))))

Perform a transition on configured finite state machine moving to the provided state

(defn do-transition
  [config state desc]
  (if-let [do-trans (get-in config [:fsm :transition])]
    (do-trans #(assoc % :state-kw state :state-data {:desc desc}))
    (println state desc)))


Add files of interest in a directory with the given extension. This allows batch processing of directories.

(defn- add-dir-files
  [config exts]
  (letfn [(files-from-dir [dir]
            (->> (fs/list-dir dir)
                 (filter #(contains? exts (fs/extension %)))
                 (map #(str (fs/file dir %)))))
          (process-call [call]
            (if-let [dir (:dir call)]
              (assoc call :file (files-from-dir dir))
          (process-exp [exp]
            (assoc exp :calls (map process-call (:calls exp))))]
    (assoc config :experiments
           (map process-exp (:experiments config)))))

Do not allow duplicate names in experiments.

(defn- no-duplicate-names?
  (letfn [(exp-no-duplicate? [exp]
            (every? (fn [[_ x]] (= 1 x)) (frequencies (map :name (:calls exp)))))]
    (every? exp-no-duplicate? (:experiments config))))

Load configuration file, handling conversion of relative to absolute paths.

(defn load-config
  {:post [(no-duplicate-names? %)]}
  (let [config (-> config-file slurp yaml/parse-string)
        base-dir (fs/file (get-in config [:dir :base] "."))
        to-process #{[:dir :out] [:dir :prep]
                     [:experiments :ref] [:experiments :intervals]
                     [:experiments :align] [:experiments :calls :file]
                     [:experiments :calls :align] [:experiments :calls :annotate]
                     [:experiments :calls :dir]}]
    (letfn [(make-absolute [x]
              (if (.isAbsolute (file x))
                (str (fs/file base-dir x))))
            (maybe-process [val path]
              (if (contains? to-process path)
                 (seq? val) (map make-absolute val)
                 (string? val) (make-absolute val)
                 :else val)
            (update-tree [config path]
              (cond (map? config)
                    (reduce (fn [item [k v]]
                              (assoc item k (cond
                                             (map? v) (update-tree v (conj path k))
                                             (seq? v) (map #(update-tree % (conj path k)) v)
                                             :else (maybe-process v (conj path k)))))
                            (vec config))
                    (contains? to-process path) (maybe-process config path)
                    :else config))]
      (-> config
          (update-tree [])
          (add-dir-files #{".vcf"})
          (#(assoc % :fsm (prep-comparison-fsm %)))))))
(ns bcbio.variation.core
  (:import [org.broadinstitute.sting.gatk CommandLineGATK])
  (:require [clojure.string :as string]

Mapping of special command line arguments to main functions

(def ^{:doc 
       :private true}
  {:compare bcbio.variation.compare/-main
   :prep bcbio.variation.combine/-main
   :haploid bcbio.variation.haploid/-main
   :reorder bcbio.align.reorder/-main
   :utils bcbio.variation.utils.core/-main})

Retrieve alternative main functions based on first argument.

(defn- get-altmain-fn
  (when (and (not (nil? arg))
             (.startsWith arg "variant-"))
    (get altmain-map
         (keyword (string/replace-first arg "variant-" "")))))
(defn -main [& args]
  (if-let [alt-fn (get-altmain-fn (first args))]
      (apply alt-fn (rest args))
      (System/exit 0))
    (CommandLineGATK/main (into-array (if-not (nil? args) args ["-h"])))))

Commandline dispatch for custom one-off exploratory code

(ns bcbio.variation.custom.core
  (:require [bcbio.variation.custom.nist :as nist]))
(defn -main [prog & args]
  (apply (case (keyword prog)
           :nist nist/summarize-discordants)

Explore variant calling from fosmid data against NIST whole genome datasets

(ns bcbio.variation.custom.nist
  (:use [bcbio.variation.filter.attr :only [prep-vc-attr-retriever]])
  (:require [clojure.string :as string]
            [incanter.stats :as istats]
            [bcbio.variation.variantcontext :as gvc]))

Retrieve filter information associated with NIST variant calls

(defn- get-nist-filter
  [retriever vc]
  (if-let [nist-vc (first (gvc/variants-in-region retriever vc))]
    (if (empty? (:filters nist-vc))
      (first (:filters nist-vc)))
(defn- get-gms-score
  [attr-get vc]
  (let [attr "gms_illumina"]
    (-> (attr-get [attr] vc)
        (get attr))))

Provide high level statistics on discordant calls.

(defn- collect-stats-for-discordant
  [fosmid-file nist-file ref-file]
  (let [attr-get (prep-vc-attr-retriever fosmid-file ref-file)]
    (with-open [vrn-iter (gvc/get-vcf-iterator fosmid-file ref-file)
                nist-retriever (gvc/get-vcf-retriever ref-file nist-file)]
      (reduce (fn [coll vc]
                {:filters (let [filt (get-nist-filter nist-retriever vc)]
                            ;; (when (= :ref-call filt)
                            ;;   (println ((juxt :chr :start) vc) (get-gms-score attr-get vc)))
                            (assoc (:filters coll) filt (inc (get-in coll [:filters filt] 0))))
                 :gms (cons (get-gms-score attr-get vc) (:gms coll))})
              {:filters {} :gms []}
              (filter #(= "SNP" (:type %)) (gvc/parse-vcf vrn-iter))))))

Split NIST filter names into individual components to summarize.

(defn- split-filter-name
  (-> x
      (string/replace "filtered" "")
      (string/split #"Tranche")
      (#(remove empty? %))))
(defn- counts-to-individual-filters
  (letfn [(split-by-filter [[k v]]
            (when-not (keyword? k)
              (partition 2 (interleave (split-filter-name k) (repeat v)))))]
    (reduce (fn [coll [k v]]
              (assoc coll k (+ v (get coll k 0))))
            (mapcat split-by-filter xs))))

Summarize discordant calls between fosmid and NIST calls.

(defn summarize-discordants
  [fosmid-file nist-file ref-file]
  (let [stats (collect-stats-for-discordant fosmid-file nist-file ref-file)
        ready-gms (filter #(< % 100.0) (:gms stats))
        special-filters [:ref-call :no-call]]
    (doseq [k special-filters]
      (println k (get-in stats [:filters k])))
    (doseq [[k v] (counts-to-individual-filters (:filters stats))]
      (println k v))
    (println "GMS" (count (:gms stats)) (count ready-gms) (istats/quantile ready-gms))))

Provide high level summary evaluation of variant results, building off GATK VariantEval.

(ns bcbio.variation.evaluate
  (:import [org.broadinstitute.sting.gatk.report GATKReport])
  (:use [clojure.java.io]
        [ordered.map :only [ordered-map]])
  (:require [clojure.string :as string]
            [doric.core :as doric]
            [bcbio.run.itx :as itx]
            [bcbio.run.broad :as broad]))

Compare two variant files with GenotypeConcordance in VariantEval

(defn calc-variant-eval-metrics
  [sample vcf1 vcf2 ref & {:keys [out-base intervals]}]
  (let [file-info {:out-eval (str (itx/file-root (if (nil? out-base) vcf1 out-base)) ".eval")}
        args (concat
              ["-R" ref
               "--out" :out-eval
               "--eval" vcf1
               "--comp" vcf2
               "--sample" sample
               "--evalModule" "CompOverlap"
               "--evalModule" "CountVariants"
               "--evalModule" "GenotypeConcordance"
               "--evalModule" "TiTvVariantEvaluator"
               "--evalModule" "ValidationReport"
               "--stratificationModule" "Sample"
               "--stratificationModule" "Filter"]
              (broad/gatk-cl-intersect-intervals intervals ref))]
    (broad/run-gatk "VariantEval" args file-info {:out [:out-eval]})
    (:out-eval file-info)))

Run VariantEval providing summary information for a VCF file

(defn- calc-summary-eval-metrics
  [vcf ref dbsnp intervals cmp-interval-file]
  (let [file-info {:out-eval (str (itx/file-root vcf) "-summary.eval")}
        args (concat
              ["-R" ref
               "--out" :out-eval
               "--eval" vcf
               "--evalModule" "CompOverlap"
               "--evalModule" "CountVariants"
               "--evalModule" "ThetaVariantEvaluator"
               "--evalModule" "TiTvVariantEvaluator"
               "--evalModule" "ValidationReport"
               "--evalModule" "VariantSummary"
               "--stratificationModule" "Filter"]
              (broad/gatk-cl-intersect-intervals intervals ref)
              (if (nil? dbsnp) [] ["--dbsnp" dbsnp])
              (if (nil? cmp-interval-file)
                ["--stratificationModule" "IntervalStratification"
                 "--stratIntervals" cmp-interval-file]))]
    (broad/run-gatk "VariantEval" args file-info {:out [:out-eval]})
    (:out-eval file-info)))

Parses a GATK output table and filters based on supplied input function.

(defn organize-gatk-report-table
  [eval-file table-name filter-fn]
  (let [table (-> (GATKReport. (file eval-file))
                  (.getTable table-name))
        cols (rest (.getColumnInfo table))
        headers (map #(keyword (.getColumnName %)) cols)]
    (->> (for [i (range (.getNumRows table))]
           (zipmap headers
                   (map #(.get table i (inc %)) (range (count headers)))))
         (filter filter-fn))))

Provide high level summary metrics of a single variant file.

(defn summary-eval-metrics
  [vcf ref & {:keys [intervals cmp-intervals dbsnp]}]
  (let [group-metrics (concat [:Novelty] (if intervals [:IntervalStratification] []))
        val-metrics [:nSamples :nProcessedLoci :nSNPs :TiTvRatio :TiTvRatioPerSample
                     :nSNPsPerSample :SNPNoveltyRate]
        count-metrics [:nSNPs :nInsertions :nDeletions :nHets :nHomVar :hetHomRatio]]
    (letfn [(all-called? [x]
              (and (= (:Filter x) "called")
                   (contains? #{nil "all"} (:Sample x))))
            (select-keys-ordered [metrics coll]
              (ordered-map (map (fn [x] [x (get coll x)]) metrics)))
            (get-table-info [eval-file table metrics]
              (->> (organize-gatk-report-table eval-file table all-called?)
                   (map (partial select-keys-ordered metrics))))
            (merge-line [vals]
              (reduce (fn [outer tbl-vals]
                        (reduce (fn [inner [k v]]
                                  (assoc inner k v))
                                outer (remove #(contains? (set group-metrics) %1) tbl-vals)))
                      (first vals) (rest vals)))
            (merge-tables [& tbls]
              (map merge-line
                   (partition (count tbls) (apply interleave tbls))))]
      (let [eval-file (calc-summary-eval-metrics vcf ref dbsnp
                                                 intervals cmp-intervals)]
         (get-table-info eval-file "CountVariants" (concat group-metrics count-metrics))
         (get-table-info eval-file "VariantSummary" (concat group-metrics val-metrics)))))))

Write high level summary metrics to CSV file.

(defn write-summary-eval-metrics
  [vcf ref & {:keys [intervals cmp-intervals dbsnp]}]
  (let [out-file (str (itx/file-root vcf) "-summary.csv")]
    (let [metrics (summary-eval-metrics vcf ref :intervals intervals :cmp-intervals cmp-intervals
                                        :dbsnp dbsnp)]
      (with-open [wtr (writer out-file)]
        (.write wtr (str (string/join "," (map name (-> metrics first keys))) "\n"))
        (doseq [xs metrics]
          (.write wtr (str (string/join "," (vals xs)) "\n")))))))
(defn -main
  ([vcf ref dbsnp intervals cmp-intervals]
     (write-summary-eval-metrics vcf ref :intervals intervals :cmp-intervals cmp-intervals
                                 :dbsnp dbsnp))
  ([vcf ref dbsnp cmp-intervals]
     (write-summary-eval-metrics vcf ref :cmp-intervals cmp-intervals
                                 :dbsnp dbsnp))
  ([vcf ref dbsnp]
     (write-summary-eval-metrics vcf ref :dbsnp dbsnp)))

Filter variant calls according to supplied criteria.

(ns bcbio.variation.filter
  (:use [clojure.string :only [split]]
        [bcbio.variation.filter.attr :only [get-vc-attr prep-vc-attr-retriever]]
        [bcbio.variation.filter.classify :only [pipeline-classify-filter]]
        [bcbio.variation.filter.specific :only [get-x-specific-variants]]
        [bcbio.variation.filter.trusted :only [get-support-vcfs get-trusted-variants]]
        [bcbio.variation.filter.util :only [remove-cur-filters]]
        [bcbio.variation.metrics :only [to-float passes-filter?]]
        [bcbio.variation.variantcontext :only [parse-vcf write-vcf-w-template
                                               get-vcf-iterator write-vcf-from-filter
  (:require [clojure.set :as set]
            [clojure.string :as string]
            [bcbio.run.broad :as broad]
            [bcbio.run.itx :as itx]))
(defn jexl-from-config [jexl-filters]
  "Retrieve GATK JEXL commandline expressions from filters."
  (letfn [(jexl-args [x]
            ["--filterName" (str (first (split x #"\s+")) "Filter")
             "--filterExpression" x])]
    (flatten (map jexl-args jexl-filters))))

Perform hard variant filtering with supplied JEXL expression criteria.

(defn variant-filter
  [in-vcf jexl-filters ref]
  (let [file-info {:out-vcf (itx/add-file-part in-vcf "filter")}
        args (concat ["-R" ref
                      "--variant" in-vcf
                      "-o" :out-vcf
                      "-l" "ERROR"
                      "--unsafe" "ALL" ;"ALLOW_SEQ_DICT_INCOMPATIBILITY"
                      (jexl-from-config jexl-filters))]
    (broad/run-gatk "VariantFiltration" args file-info {:out [:out-vcf]})
    (:out-vcf file-info)))

Perform hard variant filtration handling both range and category metrics

(defn category-variant-filter
  [in-vcf metrics ref & {:keys [remove?]}]
  (let [attr-getter (prep-vc-attr-retriever in-vcf ref)]
    (letfn [(infinity-flag? [x]
              (.contains (str x) "Infinity"))
            (in-range? [[orig-min orig-max] x]
              (let [min (if (infinity-flag? orig-min) (- Integer/MAX_VALUE) orig-min)
                    max (if (infinity-flag? orig-max) Integer/MAX_VALUE orig-max)]
                (and (>= x min) (<= x max))))
            (attr-passes? [got want]
               (set? want) (or (empty? want)
                               (not (empty? (set/intersection got want)))) 
               (or (vector? want) (list? want)) (in-range? want got)))
            (passes-metrics? [vc]
              (let [attrs (attr-getter (keys metrics) vc)]
                (and (passes-filter? vc) 
                     (every? (fn [[k v]]
                               (attr-passes? (get attrs k) v)) metrics))))
            (range-to-str [k [min max]]
               (infinity-flag? min) (format "%s > %.1f" k max)
               (infinity-flag? max) (format "%s < %.1f" k min)
               :else (format "%s not [%.1f %.1f]" k min max)))
            (metric-to-str [[k v]]
               (set? v) (format "%s not [%s]" k (string/join "," v))
               (or (vector? v) (list? v)) (range-to-str k v)))]
      (if remove?
        (select-variants in-vcf passes-metrics? "filter" ref)
        (write-vcf-from-filter in-vcf ref "filter"
                               "ManualRanges" (string/join "; " (map metric-to-str metrics))

Perform hard filtering base on JEXL expressions on metrics in the Genotype FORMAT field.

(defn variant-format-filter
  [in-vcf exps ref]
  (letfn [(format-filter [exp]
            (let [[attr op-str str-val] (string/split exp #" ")
                  val (to-float str-val)
                  op (eval (read-string op-str))]
              (fn [vc]
                (when-let [vc-val (get-vc-attr vc [:format attr] {})]
                  (not (op vc-val val))))))
          (format-filter-multi [exps]
            (let [int-filters (map format-filter exps)]
              (fn [vc]
                (every? true? (map #(% vc) int-filters)))))]
    (write-vcf-from-filter in-vcf ref "ffilter"
                           "FormatRanges" (string/join "; " exps)
                           (format-filter-multi exps))))

Perform the variant recalibration step with input training VCF files. training-vcfs is a list of {:file vcf-file :name name-to-use :prior probability}

(defn- variant-recalibration
  [in-vcf training-vcfs annotations ref & {:keys [lenient]}]
  (let [base-out (itx/file-root in-vcf)
        file-info {:out-recal (str base-out ".recal")
                   :out-tranch (str base-out ".tranches")
                   :out-r (str base-out "-recalplots.R")}
        args (concat ["-R" ref
                      "-input" in-vcf
                      "-recalFile" :out-recal
                      "-tranchesFile" :out-tranch
                      "-rscriptFile" :out-r
                      "--mode" "BOTH"]
                     (if lenient
                       ["--percentBadVariants" "0.05"
                        "--maxGaussians" "4"]
                       ["--percentBadVariants" "0.03"
                        "--maxGaussians" "10"])
                     (flatten (map (fn [x] ["-an" x]) annotations))
                     (flatten (map (fn [x] [(str "-resource:" (:name x)
                                                 ",truth=" (:truth x)
                                                 ",prior=" (:prior x)
                                                 ",bad=" (:bad x))
                                            (:file x)])
    (broad/run-gatk "VariantRecalibrator" args file-info {:out [:out-recal :out-tranch]})

Apply variant recalibration to input VCF.

(defn- apply-recalibration
  [in-vcf recal-files ref]
  (let [file-info {:out-vcf (itx/add-file-part in-vcf "recalfilter")}
        args ["-R" ref
              "-input" in-vcf
              "--ts_filter_level" "99.0"
              "--mode" "BOTH"
              "-tranchesFile" (:out-tranch recal-files)
              "-recalFile" (:out-recal recal-files)
              "-o" :out-vcf]]
    (broad/run-gatk "ApplyRecalibration" args file-info {:out [:out-vcf]})
    (:out-vcf file-info)))

Perform filtration using variant recalibration based on known variations. Training-vcfs is a list of true training sites along with associated probability and name.

(defn variant-recal-filter
  [in-vcf training-vcfs annotations ref & {:keys [lenient]}]
  (let [recal-files (variant-recalibration in-vcf training-vcfs annotations ref :lenient lenient)]
    (apply-recalibration in-vcf recal-files ref)))

Retrieve training information for GATK recalibration: - No support specified: use the target comparison - Support specified and a specific comparison pair - Support specified as a single target: use target versus all comparison

(defn- get-train-info
  [cmps-by-name support config]
  (let [support-vcfs (get-support-vcfs cmps-by-name support config)]
      [{:file (:true-positives support-vcfs)
        :name "concordant"
        :truth "true"
        :bad "false"
        :prior 10.0}
       {:file (:false-positives support-vcfs)
        :name "discordant"
        :truth "false"
        :bad "true"
        :prior 10.0}]))

Perform variant recalibration and filtration as part of processing pipeline.

(defn pipeline-recalibration
  [cmps-by-name finalizer exp config]
  (let [init-target (get cmps-by-name (:target finalizer)
                         (get cmps-by-name (reverse (:target finalizer))))
        all-params (let [x (:params finalizer)] (if (map? x) [x] x))]
    (reduce (fn [target [params fkey]]
              (let [in-vcf (remove-cur-filters (-> target fkey :file) (:ref exp))
                    support (get params :support (:target finalizer))
                    train-info (get-train-info cmps-by-name support config)
                    trusted-info [{:name "trusted"
                                   :file (when-let [trusted (:trusted params)]
                                           (get-trusted-variants cmps-by-name support trusted
                                                                 exp config))}
                                  {:name "xspecific"
                                   :file (when (:xspecific params)
                                           (get-x-specific-variants cmps-by-name support exp config))}]]
                (-> target
                    (assoc-in [fkey :file]
                              (-> in-vcf
                                  (#(if-let [anns (:annotations params)]
                                      (variant-recal-filter % train-info
                                                            anns (:ref exp)
                                                            :lenient (:lenient params))
                                  (#(if-let [hard-filters (:filters params)]
                                      (variant-filter % hard-filters (:ref exp))
                                  (#(if-not (:classifiers params)
                                      (pipeline-classify-filter % (concat trusted-info train-info)
                                                                (get target fkey)
                                                                exp params config)))))
                    (#(assoc-in % [fkey :name] (format "%s-%s" (get-in % [fkey :name]) "recal")))
                    (assoc-in [fkey :mod] "recal")
                    (assoc :re-compare true))))
            init-target (map vector all-params [:c1 :c2]))))

Provide generalized access to variant attributes, handling retrieval from multiple sources (VCF INFO file, VCF FORMAT field, Gemini).

(ns bcbio.variation.filter.attr
  (:use [bcbio.variation.haploid :only [get-likelihoods]]
        [bcbio.variation.metrics :only [to-float]])
  (:require [clojure.string :as string]
            [incanter.stats :as stats]
            [lonocloud.synthread :as ->]
            [bcbio.variation.variantcontext :as gvc]
            [bcbio.variation.index.gemini :as gemini]))

Generalized retrieval of attributes from variant with a single genotype.

(defmulti get-vc-attr
  (let [gemini-ids (set (map :id (gemini/available-metrics nil :include-noviz? true)))]
    (fn [vc attr retrievers]
      (if (contains? gemini-ids attr)
(defmethod get-vc-attr "AD"
  ^{:doc "AD: Allelic depth for ref and alt alleles. Converted to percent
          deviation from expected for haploid/diploid calls.
          Also calculates allele depth from AO and DP used by FreeBayes.
          AO is the count of the alternative allele."}
  [vc attr _]
  {:pre [(= 1 (:num-samples vc))
         (contains? #{1 2} (-> vc :genotypes first :alleles count))]}
  (letfn [(calc-expected [g ref-count allele-count]
            {:pre [(not (neg? ref-count))]}
            (when (or (pos? ref-count) (pos? allele-count))
              (when-let [e-pct (get {"HOM_VAR" 1.0 "HET" 0.5 "HOM_REF" 0.0} (:type g))]
                (Math/abs (- e-pct (/ allele-count (+ allele-count ref-count)))))))
          (from-ad [g]
            (let [ads (map float (get-in g [:attributes attr]))
                  ref-count (first ads)
                  allele-count (apply + (rest ads))]
              (calc-expected g ref-count allele-count)))
          (from-ao [g]
            (let [alt-count (apply + (map #(Float/parseFloat %)
                                          (string/split (get-in g [:attributes "AO"]) #",")))
                  total-count (float (get-in g [:attributes "DP"]))]
              (calc-expected g (- total-count alt-count) alt-count)))]
    (let [g (-> vc :genotypes first)]
       (get-in g [:attributes "AO"]) (from-ao g)
       (seq (get-in g [:attributes attr])) (from-ad g)
       :else nil
       ;; (println (format "AD not found in attributes %s %s %s"
       ;;                  (:attributes g) (:chr vc) (:start vc)))
(defmethod get-vc-attr [:format "AD"]
  [vc attr retrievers]
  (get-vc-attr vc "AD" retrievers))

Convert p-value into Phred scores compatible with bayesian likelihoods

(defn convert-pval-to-phred
  (max (* 10.0 (Math/log10 (to-float pval)))

Retrieve PLs, handling non-Bayesian callers by conversion of p-values to phred scores.

(defn get-pls
  {:pre [(= 1 (:num-samples vc))
         (contains? #{1 2} (-> vc :genotypes first :alleles count))]}
  (let [g (-> vc :genotypes first)
        pls (dissoc (get-likelihoods (:genotype g) :no-convert true)
                    (:type g))
        pval (when-let [pval (get-in g [:attributes "PVAL"])]
               (convert-pval-to-phred pval))]
    (-> (:genotype g)
        (get-likelihoods :no-convert true)
        (dissoc (:type g))
        (->/when pval
          (assoc "HOM_REF" pval)))))
(defmethod get-vc-attr "PL"
  ^{:doc "Provide likelihood confidence for the called genotype.
          For reference calls, retrieve the likelihood of the most likely
          variant (least negative). For variant calls, retrieve
          the reference likelihood.
          Handles non-Bayesian callers by conversion of p-values for phred scores."}
  [vc attr _]
  (let [g (-> vc :genotypes first)
        pls (get-pls vc)]
    (when-not (empty? pls)
      (if (= (:type g) "HOM_REF")
        (apply max (vals pls))
        (get pls "HOM_REF")))))
(defmethod get-vc-attr "PLratio"
  ^{:doc "Calculate ratio of reference likelihood call to alternative variant calls.
          This helps measure whether a call is increasingly likely to be reference
          compared with variant choices."}
  [vc attr _]
  {:pre [(= 1 (:num-samples vc))]}
  (let [g (-> vc :genotypes first)
        pls (dissoc (get-likelihoods (:genotype g) :no-convert true)
                    (:type g))]
    (when-not (zero? (count pls))
      (/ (get pls "HOM_REF")
         (apply min (cons -1.0 (-> pls (dissoc "HOM_REF") vals)))))))
(defmethod get-vc-attr "QUAL"
  [vc attr _]
  (:qual vc))
(defmethod get-vc-attr [:format "DP"]
  ^{:doc "Retrieve depth from Genotype FORMAT metrics.
          Handles custom cases like cortex_var with alternative
          depth attributes, and Illumina with (DPU and DPI)."}
  [vc attr _]
  {:pre [(= 1 (:num-samples vc))]}
  (letfn [(contains-good? [xs x]
            (and (contains? xs x)
                 (not= (get xs x) -1)
                 (not= (get xs x) [])))]
    (let [g-attrs (-> vc :genotypes first :attributes)]
       (contains-good? g-attrs "DP") (to-float (get g-attrs "DP"))
       (contains-good? g-attrs "AD") (to-float (apply + (get g-attrs "AD")))
       (contains-good? g-attrs "COV") (int (apply + (map to-float (string/split (get g-attrs "COV") #","))))
       (contains-good? g-attrs "DPU") (to-float (get g-attrs "DPU"))
       (contains-good? g-attrs "DPI") (to-float (get g-attrs "DPI"))
       :else nil))))
(defmethod get-vc-attr "DP"
  ^{:doc "Retrieve depth for an allele, first trying genotype information
          then falling back on information in INFO column."}
  [vc attr rets]
  (if-let [gt-dp (when (= 1 (:num-samples vc))
                   (get-vc-attr vc [:format attr] rets))]
    (to-float (get-in vc [:attributes attr]))))
(defmethod get-vc-attr "Context"
  ^{:doc "Retrieve cytosine context, relative to standard CG sites"}
  [vc attr _]
  (let [ctxt (get-in vc [:attributes attr])]
    (if (string? ctxt) ctxt (first ctxt))))
(defmethod get-vc-attr "CM"
  ^{:doc "Retrieve number of methylated cytosines, requires a single sample"}
  [vc _ _]
  (let [g-attrs (when (= 1 (:num-samples vc))
                  (select-keys (-> vc :genotypes first :attributes)
    (when (seq g-attrs)
      (get g-attrs "CM"))))
(defmethod get-vc-attr "CU"
  ^{:doc "Retrieve percentage of methylated cytosines, requires a single sample"}
  [vc _ _]
  (let [g-attrs (when (= 1 (:num-samples vc))
                  (reduce (fn [coll [k v]]
                            (assoc coll k (to-float v)))
                          (select-keys (-> vc :genotypes first :attributes)
                                       ["CM" "CU"])))]
    (when (= 2 (count g-attrs))
      (let [total (apply + (vals g-attrs))]
        (if (zero? total) 0.0 (/ (get g-attrs "CM") total))))))
(defmethod get-vc-attr :gemini
  ^{:doc "Retrieve attribute information from associated Gemini index."}
  [vc attr retrievers]
  (when-let [getter (:gemini retrievers)]
    (getter vc attr)))
(defmethod get-vc-attr :default
  [vc attr _]
  (let [x (get-in vc [:attributes attr])]
    (when-not (nil? x)
      (try (Float/parseFloat x)
           (catch java.lang.NumberFormatException _ x)))))

Retrieve attributes from variants independent of location.

(defn get-vc-attrs
  [vc attrs retrievers]
  (zipmap attrs (map #(get-vc-attr vc % retrievers) attrs)))

Retrieve quantile ranges of attributes for min/max normalization.

(defn get-vc-attr-ranges
  [attrs in-vcf ref retrievers]
  (letfn [(get-quartiles [[k v]]
            [k (stats/quantile (remove nil? v) :probs [0.05 0.95])])]
    (with-open [vcf-iter (gvc/get-vcf-iterator in-vcf ref)]
      (->> (reduce (fn [coll vc]
                    (reduce (fn [icoll [k v]]
                              (assoc icoll k (cons v (get icoll k))))
                            coll (get-vc-attrs vc attrs retrievers)))
                  (zipmap attrs (repeat [])) (gvc/parse-vcf vcf-iter))
           (map get-quartiles)
           (into {})))))
(defn- get-external-retrievers
  [in-file ref-file]
  {:gemini (gemini/vc-attr-retriever in-file ref-file)})

Normalized attributes for each variant context in an input file. Passed two input VCFs: - in-vcf -- provides full range of inputs for classification and used for building normalization ranges. - work-vcf -- file for attribute retrieval, used to setup variable retrieval from external sources like Gemini

(defmulti get-vc-attrs-normalized
  (fn [_ _ _ config] (keyword (get config :normalize "default"))))
(defmethod get-vc-attrs-normalized :minmax
  ^{:doc "Minimum/maximum normalization to a 0-1 scale using quartiles."}
  [attrs in-vcf ref config]
  (letfn [(min-max-norm [x [minv maxv]]
            (let [safe-maxv (if (= minv maxv) (inc maxv) maxv)
                  trunc-score-max (if (< x safe-maxv) x safe-maxv)
                  trunc-score (if (> trunc-score-max minv) trunc-score-max minv)]
              (/ (- trunc-score minv) (- safe-maxv minv))))
          (min-max-norm-ranges [mm-ranges [k v]]
            [k (when-not (nil? v)
                 (min-max-norm v (get mm-ranges k)))])]
    (let [retrievers (get-external-retrievers in-vcf ref)
          mm-ranges (get-vc-attr-ranges attrs in-vcf ref retrievers)]
      (fn [work-vcf]
        (let [work-retrievers (get-external-retrievers work-vcf ref)]
          (fn [vc]
            (->> (get-vc-attrs vc attrs work-retrievers)
                 (map (partial min-max-norm-ranges mm-ranges))
                 (into {}))))))))
(defmethod get-vc-attrs-normalized :log
  ^{:doc "Log normalization of specified input variables."}
  [attrs in-vcf ref config]
  (let [base-fn (get-vc-attrs-normalized attrs in-vcf ref (assoc config :normalize :default))
        need-log-vars (set (:log-attrs config))]
    (fn [work-vcf]
      (let [inner-fn (base-fn work-vcf)]
        (fn [vc]
          (reduce (fn [coll [k v]]
                    (assoc coll k
                           (if (contains? need-log-vars k) (Math/log v) v)))
                  {} (inner-fn vc)))))))
(defmethod get-vc-attrs-normalized :default
  ^{:doc "Attribute access without normalization."}
  [attrs _ ref config]
  (fn [work-vcf]
    (let [retrievers (get-external-retrievers work-vcf ref)]
      (fn [vc]
        (into {} (get-vc-attrs vc attrs retrievers))))))

Provide easy lookup of attributes from multiple input sources

(defn prep-vc-attr-retriever
  [in-file ref-file]
  (let [retrievers (get-external-retrievers in-file ref-file)]
    (fn [attrs vc]
      (into {} (get-vc-attrs vc attrs retrievers)))))

Provide classification based filtering for variants.

(ns bcbio.variation.filter.classify
  (:import [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder]
           [org.broadinstitute.sting.utils.codecs.vcf VCFHeader VCFInfoHeaderLine
            VCFHeaderLineType VCFFilterHeaderLine VCFHeaderLineCount])
  (:use [ordered.set :only [ordered-set]]
        [clojure.math.combinatorics :only [cartesian-product]]
        [clj-ml.utils :only [serialize-to-file deserialize-from-file]]
        [clj-ml.data :only [make-dataset dataset-set-class make-instance]]
        [clj-ml.classifiers :only [make-classifier classifier-train
                                   classifier-evaluate classifier-classify]]
        [bcbio.variation.filter.util :only [remove-cur-filters]]
        [bcbio.variation.filter.attr :only [get-vc-attrs-normalized prep-vc-attr-retriever]]
        [bcbio.variation.filter.intervals :only [pipeline-combine-intervals]]
        [bcbio.variation.variantcontext :only [parse-vcf write-vcf-w-template
                                               get-vcf-iterator has-variants?
                                               get-vcf-retriever variants-in-region]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.variation.filter.trusted :as trusted]
            [bcbio.variation.filter.rules :as rules]
            [bcbio.variation.variantcontext :as gvc]))

Split variants for classification

Define splitting of classifiers based on variant characteristics.

(defn- classifier-types
  (let [variant-types [:snp :complex]
        zygosity [:hom :het]]
    (map (fn [[vtype z]] {:variant-type vtype
                          :attr-key attr-key
                          :zygosity z})
         (cartesian-product variant-types zygosity))))

Convert a classifier types into a string name for output files.

(defn- ctype-to-str
  (str (name (:attr-key x)) "-"
       (name (:variant-type x)) "_"
       (name (:zygosity x))))

Map variant types to specialized classifiers.

(defn- get-classifier-type
  [vc attr-key attr-get]
  {:variant-type (case (:type vc)
                   "SNP" :snp
   :attr-key attr-key
   :zygosity (rules/vc-zygosity vc)})

Linear classifier

(defn- get-vc-inputs
  [attrs normalizer group vc]
  (let [n-vals (normalizer vc)]
    (conj (vec (map #(get n-vals %) attrs)) group)))

Retrieve normalized training inputs from VCF file.

(defn- get-train-inputs
  [group in-vcf ctype attrs normalizer ref]
  (let [attr-get (prep-vc-attr-retriever in-vcf ref)]
    (with-open [vcf-iter (get-vcf-iterator in-vcf ref)]
      (->> (parse-vcf vcf-iter)
           (filter #(= ctype (get-classifier-type % (:attr-key ctype) attr-get)))
           (map (partial get-vc-inputs attrs normalizer group))
(defn- get-dataset
  [attrs inputs]
  (make-dataset "ds" (conj (vec attrs) {:c [:pass :fail]}) inputs {:class :c}))

Do the work of training a variant classifier.

(defn- train-vcf-classifier
  [ctype attrs pre-normalizer true-vcf false-vcf ref config]
  (let [config (merge {:normalize :default} config)
        inputs (concat (get-train-inputs :pass true-vcf ctype attrs
                                         (pre-normalizer true-vcf)
                       (get-train-inputs :fail false-vcf ctype attrs
                                         (pre-normalizer false-vcf)
        classifier (case (keyword (get config :classifier-type :svm))
                     :svm (make-classifier :support-vector-machine :smo
                                           {:complexity-constant 100.0})
                     :svm-rbf (make-classifier :support-vector-machine :smo
                                               {:kernel-function {:radial-basis {:gamma 0.01}}
                                                :complexity-constant 100000.0})
                     :random-forest (make-classifier :decision-tree :random-forest
                                                     {:num-trees-in-forest 50
                                                      (-> attrs count Math/sqrt Math/ceil int)}))]
    (when (seq inputs)
      (->> (get-dataset attrs inputs)
           (classifier-train classifier)))))

Provide a variant classifier based on provided attributes and true/false examples.

(defn- build-vcf-classifiers
  [attr-map pre-normalizer base-vcf true-vcf false-vcf ref config out-dir]
  (letfn [(build-vcf-classifier [ctype attrs]
            (let [out-dir (if (nil? out-dir) (str (fs/parent base-vcf)) out-dir)
                  out-file (format "%s/%s-%s-classifier.bin" out-dir
                                   (fs/name base-vcf) (ctype-to-str ctype))]
              (if-not (itx/needs-run? out-file)
                (deserialize-from-file out-file)
                (when-let [classifier (train-vcf-classifier ctype attrs pre-normalizer true-vcf false-vcf
                                                            ref config)]
                  (serialize-to-file classifier out-file)
    (let [ctypes (mapcat classifier-types (keys attr-map))]
      (zipmap ctypes (map #(build-vcf-classifier % (get attr-map (:attr-key %))) ctypes)))))

Add details on the filtering to the VCF file header.

(defn- add-cfilter-header
  (fn [_ header]
    (let [str-attrs (map (fn [[k v]] (str (name k) ": " (string/join "," v))) attrs)
          desc (str "Classification filters based on true/false positives for: "
                    (string/join "; " str-attrs))
          new #{(VCFInfoHeaderLine. "CFILTERS" VCFHeaderLineCount/UNBOUNDED
                                    VCFHeaderLineType/String desc)
                (VCFFilterHeaderLine. "CScoreFilter" "Based on classifcation CFILTERS")}]
      (VCFHeader. (apply ordered-set (concat (.getMetaDataInInputOrder header) new))
                  (.getGenotypeSamples header)))))

Check if a variant passes, including external metadata annotations. - trusted: pass variants that overlap in the trusted set - xspecific: exclude variants specific to a technology or caller - otherwise check the variant filters that failed, passing those that are clean

(defn- vc-passes-w-meta?
  [vc c-filters meta-getters config]
  (letfn [(meta-has-variants? [kw]
            (has-variants? (get meta-getters kw)
                           (:chr vc) (:start vc) (:end vc)
                           (:ref-allele vc) (:alt-alleles vc)))]
     (meta-has-variants? :trusted) true
     (meta-has-variants? :xspecific) false
     (empty? c-filters) true
     :else false)))

Update a variant context with filter information from classifier.

(defn- filter-vc
  [cs normalizer attr-get meta-getters config vc]
  (letfn [(check-attrgroup-classifier [[attr-key attrs]]
            (let [c (get cs (get-classifier-type vc attr-key attr-get))
                  val (get-vc-inputs attrs normalizer :fail vc)
                  score (classifier-classify c (-> (get-dataset attrs 1)
                                                   (make-instance val)))]
              (when (pos? score) attr-key)))]
    (let [c-filters (->> (:classifiers config)
                         (map check-attrgroup-classifier)
                         (remove nil?))]
      (-> (VariantContextBuilder. (:vc vc))
          (.attributes (assoc (:attributes vc) "CFILTERS" (if (empty? c-filters)
                                                            (string/join "," (map name c-filters)))))
          (.filters (when-not (vc-passes-w-meta? vc c-filters meta-getters config)

Retrieve variants to use for true/false positive training. Dispatches based on approach used. For recalling, we can pull directly from input files

(defmulti get-train-variants
  (fn [orig-file train-files call exp config call-type out-dir]
    (let [is-recall (get call :recall false)
          recall-approach (keyword (get-in exp [:params :compare-approach] :consensus))]
      (if is-recall
        (if (= :consensus recall-approach)
          [:recall (keyword call-type) (if (:round train-files) :iterate :final)]
          [:recall :rewrite])
(defmethod get-train-variants [:recall :rewrite]
  ^{:doc "Retrieve variants from original file based on variants in target file."}
  [orig-file target-files _ exp _ ext out-dir]
  (letfn [(get-orig-variants [retriever vc]
            (->> (variants-in-region retriever (:chr vc) (:start vc) (:end vc))
                 (filter #(= (:start %) (:start vc)))
                 (map :vc)))]
    (let [out-file (itx/add-file-part orig-file ext out-dir)
          target-file (get target-files (keyword ext))]
      (when (itx/needs-run? out-file)
        (with-open [vcf-iter (get-vcf-iterator target-file (:ref exp))
                    retriever (get-vcf-retriever (:ref exp) orig-file)]
          (write-vcf-w-template orig-file {:out out-file}
                                (mapcat (partial get-orig-variants retriever)
                                        (parse-vcf vcf-iter))
                                (:ref exp))))
(defmethod get-train-variants [:recall :fps :iterate]
  ^{:doc "Identify false positive variants directly from recalled consensus calls.
          These contain the `set` key value pair with information about supporting
          calls. We filter variants that have low support from multiple callers, then
          compare based on novelty in dbSNP. Novel and know have different inclusion
          parameters derived from examining real true/false calls in replicate
          experiments. The logic for inclusion is:
          - Variants with support from less than `fp-freq` percentage of callers.
            This defaults to less than 25% of callers used.
          - We exclude low mapping quality reads, which end up being non-representative
            of more general cases since they are poorly represented in true positives.
            This is worth looking at more for generalizing the approach to these regions.
          - Include indels in low entropy regions which are often problematic.
          - Include novel variants not found in dbSNP that have low read support.
          - Include known variants, in dbSNP, depending on type:
             - SNP: include SNPs with high likelihood of being ref"}
  [orig-file _ call exp _ ext out-dir]
  (let [passes-rules? (rules/vc-checker orig-file call exp)]
    (letfn [(is-potential-fp? [vc]
              (or (passes-rules? vc
                                 :yes [:below-call-support :high-map-quality :het-snp :low-confidence])
                  (passes-rules? vc
                                 :yes [:below-call-support :high-map-quality :novel])
                  (passes-rules? vc
                                 :yes [:below-call-support :high-map-quality :low-confidence]
                                 :no [:novel])))]
    (gvc/select-variants orig-file is-potential-fp? ext (:ref exp)
                         :out-dir out-dir))))
(defmethod get-train-variants [:recall :tps :iterate]
  ^{:doc "Identify true positive training variants directly from recalled consensus.
          Use variants found in all input callers, then restrict similarly to false
          positives to maintain representative sets. We restrict by lower depth and
          problematic reference likelihoods. We also include high confidence calls
          with lower supporting calls to keep a wider range: these include SNPs with
          a low likelihood of being reference and known indels."}
  [orig-file _ call exp _ ext out-dir]
  (let [passes-rules? (rules/vc-checker orig-file call exp)]
    (letfn [(is-tp? [vc]
              (passes-rules? vc :yes [:all-callers :flex-low-confidence]))]
      (gvc/select-variants orig-file is-tp? ext (:ref exp)
                           :out-dir out-dir))))
(defmethod get-train-variants [:recall :trusted :iterate]
  ^{:doc "Retrieve set of trusted variants based on input parameters and recalled consensus."}
  [orig-file _ call exp params ext out-dir]
  (let [calls (remove #(= (:name %) (:name call)) (:calls exp))]
    (letfn [(is-trusted? [vc]
              (when-let [trusted (:trusted params)]
                (trusted/is-trusted-variant? vc trusted calls)))]
      (gvc/select-variants orig-file is-trusted? ext (:ref exp)
                           :out-dir out-dir))))
(defmethod get-train-variants [:recall :trusted :final]
  [orig-file train-files call exp params ext out-dir]
  (get-train-variants orig-file (assoc train-files :round 1) call exp params ext out-dir))
(defmethod get-train-variants [:recall :xspecific :final]
  ^{:doc "Retrieve specific variants to exclude, handling variants falling below untrusted thresh.
          The `untrusted` keyword in the configuration parameters specifies the threshold to use."}
  [orig-file train-files call exp params ext out-dir]
  (with-open [xspecific-get (gvc/get-vcf-retriever (:ref exp) (:xspecific train-files))]
    (let [calls (remove #(= (:name %) (:name call)) (:calls exp))]
      (letfn [(xspecific? [vc]
                (has-variants? xspecific-get
                               (:chr vc) (:start vc) (:end vc)
                               (:ref-allele vc) (:alt-alleles vc)))
              (untrusted? [vc]
                (when-let [untrusted (:untrusted params)]
                      (not (trusted/is-trusted-variant? vc untrusted calls))))
              (is-untrusted? [vc]
                (or (untrusted? vc) (xspecific? vc)))]
        (gvc/select-variants orig-file is-untrusted? ext (:ref exp)
                             :out-dir out-dir)))))
(defmethod get-train-variants [:recall :tps :final]
  ^{:doc "Iteratively identify true positive variants: low support variants
          that pass the previous round of filtering."}
  [orig-file train-files call exp params ext out-dir]
  (let [passes-rules? (rules/vc-checker orig-file call exp)
        out-file (itx/add-file-part orig-file "tps" out-dir)]
    (letfn [(low-support-novel? [vc]
              (passes-rules? vc
                             :yes [:below-call-support :het-snp :novel :low-depth]))
            (is-previous-tp? [vc]
              (when-not (low-support-novel? vc)
                 (passes-rules? vc
                                :yes [:below-call-support :passes-filter]
                                :no [:problem-allele-balance
                 (passes-rules? vc
                                :yes [:below-call-support :het-snp :good-pl]
                                :no [:problem-allele-balance :novel]))))]
      (when (itx/needs-run? out-file)
        (-> (:prev train-files)
            (gvc/select-variants is-previous-tp? ext (:ref exp)
                                 :out-dir out-dir)
            (remove-cur-filters (:ref exp))
            (fs/rename out-file))))
(defmethod get-train-variants [:recall :fps :final]
  ^{:doc "Iteratively identify false positive variants: low support variants
          that fail the previous round of filtering."}
  [orig-file train-files call exp params ext out-dir]
  (let [passes-rules? (rules/vc-checker orig-file call exp)
        out-file (itx/add-file-part orig-file "fps" out-dir)]
    (letfn [(well-supported-known? [vc]
              (passes-rules? vc
                             :yes [:below-call-support :het-snp]
                             :no [:novel :low-depth]))
            (is-previous-fp? [vc]
              (when-not (well-supported-known? vc)
                (or (passes-rules? vc
                                   :yes [:below-call-support :high-map-quality]
                                   :no [:passes-filter])
                    (passes-rules? vc
                                   :yes [:below-call-support :high-map-quality
                                         :het-snp :low-confidence :novel]))))]
      (when (itx/needs-run? out-file)
        (-> (:prev train-files)
            (gvc/select-variants is-previous-fp? ext (:ref exp)
                                 :out-dir out-dir)
            (remove-cur-filters (:ref exp))
            (fs/rename out-file))))
(defmethod get-train-variants :default
  ^{:doc "By default, return the prepped training file with no changes."}
  [_ train-files _ _ _ ext _]
  (get train-files (keyword ext)))

Filter an input VCF file using a trained classifier on true/false variants.

(defn filter-vcf-w-classifier
  [base-vcf train-files call exp config]
  (let [out-dir (when-let [tround (:round train-files)]
                  (str (fs/file (fs/parent base-vcf) "trainround") tround))
        out-file (itx/add-file-part base-vcf "cfilter" out-dir)]
    (when (and out-dir (not (fs/exists? out-dir)))
      (fs/mkdirs out-dir))
    (when (itx/needs-run? out-file)
      (let [true-vcf (get-train-variants base-vcf train-files call exp config
                                         "tps" out-dir)
            false-vcf (get-train-variants base-vcf train-files call exp config
                                          "fps" out-dir)
            trusted-vcf (get-train-variants base-vcf train-files call exp
                                            config "trusted" out-dir)
            xspecific-vcf (get-train-variants base-vcf train-files call exp config
                                              "xspecific" out-dir)
            ref (:ref exp)
            pre-normalizer (get-vc-attrs-normalized (apply concat (vals (:classifiers config)))
                                                    base-vcf ref config)
            cs (build-vcf-classifiers (:classifiers config) pre-normalizer base-vcf
                                      true-vcf false-vcf ref config out-dir)
            config (merge {:normalize :default} config)
            attr-get (prep-vc-attr-retriever base-vcf ref)]
        (println "Filter VCF with" (str cs))
        (with-open [vcf-iter (get-vcf-iterator base-vcf ref)
                    trusted-get (get-vcf-retriever ref trusted-vcf)
                    xspecific-get (get-vcf-retriever ref xspecific-vcf)]
          (write-vcf-w-template base-vcf {:out out-file}
                                (map (partial filter-vc cs (pre-normalizer base-vcf) attr-get
                                              {:trusted trusted-get :xspecific xspecific-get}
                                     (parse-vcf vcf-iter))
                                ref :header-update-fn (add-cfilter-header (:classifiers config))))))

Fit VCF classification-based filtering into analysis pipeline.

(defn pipeline-classify-filter
  [in-vcf train-info call exp params config]
  (letfn [(get-train-vcf [type]
            (-> (filter #(= type (:name %)) train-info)
          (fix-param-classifiers [params]
            (if (map? (:classifiers params))
              (assoc params :classifiers {:all (:classifiers params)})))
          (flatten-param-classifiers [params]
            (assoc params :classifiers
                   {:all (->> (:classifiers params) vals (apply concat) set vec)}))]
    (pipeline-combine-intervals exp config)
    (let [orig-trains {:tps (get-train-vcf "concordant")
                       :fps (get-train-vcf "discordant")
                       :trusted (get-train-vcf "trusted")
                       :xspecific (get-train-vcf "xspecific")}
          params (fix-param-classifiers params)
          x1 (filter-vcf-w-classifier in-vcf (assoc orig-trains :round 1)
                                      call exp (flatten-param-classifiers params))]
      (filter-vcf-w-classifier in-vcf (assoc orig-trains :prev x1) call exp params))))

Combined interval lists from filtered variants prepared via multiple calls. Multiple call approaches and technologies result in reduced call regions due to coverage. These functions manage creation of reduced BED files.

(ns bcbio.variation.filter.intervals
  (:import [org.broadinstitute.sting.utils.interval IntervalUtils
            IntervalMergingRule IntervalSetRule]
           [org.broadinstitute.sting.utils GenomeLocParser
           [org.broadinstitute.sting.utils.exceptions UserException$BadInput])
  (:use [clojure.java.io]
        [clojure.set :only [intersection]]
        [bcbio.align.ref :only [get-seq-dict]]
        [bcbio.variation.callable :only [get-callable-bed get-bed-iterator]]
        [bcbio.variation.variantcontext :only [get-vcf-header]])
  (:require [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.run.broad :as broad]))

interval VCF subsetting by BED

Retrieve samples identified in the input VCF file.

(defn get-sample-names
  (-> in-vcf get-vcf-header .getGenotypeSamples vec))

Retrieve the sample name in a provided VCF file, allowing for partial matches.

(defn vcf-sample-name
  [sample in-vcf ref-file]
  (letfn [(sample-match [x choices]
            (let [do-match (filter #(when (.contains % x) %) choices)]
              (when (= 1 (count do-match))
                (first do-match))))]
    (let [vcf-samples (-> in-vcf get-vcf-header .getGenotypeSamples set)]
       (contains? vcf-samples sample) sample
       (= 1 (count vcf-samples)) (first vcf-samples)
       :else (sample-match sample vcf-samples)))))

Select only the sample of interest from input VCF files.

(defn select-by-sample
  [sample in-file name ref & {:keys [out-dir intervals remove-refcalls ext
                              :or {remove-refcalls false}}]
  (let [base-dir (if (nil? out-dir) (fs/parent in-file) out-dir)
        file-info {:out-vcf (if ext (itx/add-file-part in-file ext out-dir)
                                (str (fs/file base-dir
                                              (format "%s-%s.vcf" sample name))))}
        args (concat ["-R" ref
                      "--sample_name" (vcf-sample-name sample in-file ref)
                      "--variant" in-file
                      "--unsafe" "ALL" ; "ALLOW_SEQ_DICT_INCOMPATIBILITY"
                      "--out" :out-vcf]
                     (when remove-refcalls ["--excludeNonVariants" "--excludeFiltered"])
                     (when exclude-intervals ["--excludeIntervals" exclude-intervals])
                     (broad/gatk-cl-intersect-intervals intervals ref))]
    (if-not (fs/exists? base-dir)
      (fs/mkdirs base-dir))
    (broad/run-gatk "SelectVariants" args file-info {:out [:out-vcf]})
    (:out-vcf file-info)))

BED manipulation

(defn- bed-to-intervals
  [bed-file ref-file loc-parser]
  (with-open [bed-iter (get-bed-iterator bed-file ref-file)]
    (doall (map #(.createGenomeLoc loc-parser %) bed-iter))))

Intersect a group of intervals present on a contig.

(defn- intersect-by-contig
  [start-intervals combine-rule]
  (loop [final []
         intervals start-intervals]
    (if (empty? intervals)
      (recur (try (IntervalUtils/mergeListsBySetOperator final (first intervals)
                                                         (if (= :union combine-rule)
                  (catch UserException$BadInput e []))
             (rest intervals)))))

Intersect and exclude intervals on a contig.

(defn- prep-intervals-by-contig
  [start-intervals exclude-intervals loc-parser combine-rule]
  (let [overlaps (intersect-by-contig start-intervals combine-rule)]
    (if (empty? exclude-intervals)
      (let [clean-intervals (->> (group-by #(.getStart %) exclude-intervals)
                                 (map (fn [xs] (sort-by #(.size %) > xs)))
                                 (map first))]
        (-> (GenomeLocSortedSet/createSetFromList loc-parser overlaps)
            (.subtractRegions (GenomeLocSortedSet/createSetFromList loc-parser clean-intervals))

Generate list of intervals that intersect in all provided BED files.

(defn intersection-of-bed-files
  [all-beds ref loc-parser & {:keys [exclude-bed combine-rule]}]
  (letfn [(intervals-by-chrom [bed-file]
            (group-by #(.getContig %) (bed-to-intervals bed-file ref loc-parser)))
          (get-by-contig [interval-groups contig]
            (map #(get % contig []) interval-groups))]
    (let [interval-groups (map intervals-by-chrom all-beds)
          exclude-by-contig (if exclude-bed (intervals-by-chrom exclude-bed) {})
          contigs (vec (apply intersection (map #(set (keys %)) interval-groups)))]
      (mapcat #(prep-intervals-by-contig (get-by-contig interval-groups %)
                                         (get exclude-by-contig % []) loc-parser

Combine intervals from an initial BED and coverage BAM files.

(defn combine-multiple-intervals
  [initial-bed align-bams ref & {:keys [out-dir name exclude-intervals combine-rule
  (let [all-beds (concat [initial-bed] more-beds
                         (map #(get-callable-bed % ref :out-dir out-dir
                                                 :intervals initial-bed)
        loc-parser (GenomeLocParser. (get-seq-dict ref))
        out-file (itx/add-file-part initial-bed
                                    (str (if name (str name "-") "") "multicombine")
    (when (itx/needs-run? out-file)
      (with-open [wtr (writer out-file)]
        (doseq [x (IntervalUtils/sortAndMergeIntervals
                   loc-parser (intersection-of-bed-files all-beds ref loc-parser
                                                         :exclude-bed exclude-intervals
                                                         :combine-rule combine-rule)
          (.write wtr (format "%s\t%s\t%s\n" (.getContig x) (dec (.getStart x)) (.getStop x))))))

Combine multiple intervals as part of processing and filtering pipeline.

(defn pipeline-combine-intervals
  [exp config]
  (let [base-intervals (:intervals exp)
        all-aligns (set (remove nil? (map :align (cons exp (:calls exp)))))]
    (when (and base-intervals (seq all-aligns))
      (combine-multiple-intervals base-intervals all-aligns
                                  (:ref exp)
                                  :exclude-intervals (:exclude-intervals exp)
                                  :name (:sample exp)
                                  :out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))))))

Define filtration rules used to help identify true/false positives for variant classification. Helps organize the logic of selecting variants.

(ns bcbio.variation.filter.rules
  (:require [bcbio.variation.filter.attr :as attr]
            [bcbio.variation.metrics :as metrics]
            [bcbio.variation.multiple :as multiple]))
(defn vc-zygosity [vc]
  (if (some #(.startsWith (:type %) "HET") (:genotypes vc)) :het :hom))

Check if a variant context has a low amount of supporting variant calls.

(defn- below-support-thresh?
  [vc _ call exp]
  (let [freq (get call :fp-freq 0.25)
        thresh (Math/ceil (* freq (dec (count (:calls exp)))))]
    (-> (multiple/get-vc-set-calls vc (:calls exp))
        (disj (:name call))
        (<= thresh))))
(defn- is-intersection? [vc _ _ _]
  (when-let [set-val (get-in vc [:attributes "set"])]
    (= set-val "Intersection")))

Is a variant novel, or is it represented in dbSNP?

(defn- novel-variant?
  [vc _ _ _]
  (contains? #{nil "."} (:id vc)))
(defn- het-snp? [vc _ _ _]
  (and (= "SNP" (:type vc))
       (= :het (vc-zygosity vc))))
(defn- het-indel? [vc _ _ _]
  (and (not= "SNP" (:type vc))
       (= :het (vc-zygosity vc))))
(defn novel-het-indel? [vc g c e]
  (and (het-indel? vc g c e) (novel-variant? vc g c e)))

Define low confidence calls.

(defn- low-call-confidence?
  [vc attr-get _ _]
  (let [attrs (attr-get ["PL" "PLratio"] vc)]
    (when (not (nil? (get attrs "PL")))
      (or (> (get attrs "PL") -7.5)
          (< (or (get attrs "PLratio") Float/MAX_VALUE) 0.25)))))

Identify PL ratios with reasonable support for being a variant.

(defn- good-pl-support?
  [vc attr-get _ _]
  (let [attrs (attr-get ["PLratio"] vc)]
    (when (not-any? nil? (vals attrs))
      (> (get attrs "PLratio") 0.4))))
(defn- low-confidence-novel-het-snp?
  [vc attr-get c e]
  (and (low-call-confidence? vc attr-get c e)
       (novel-variant? vc attr-get c e)
       (het-snp? vc attr-get c e)))

Define calls with a more flexible low confidence call

(defn- flex-low-call-confidence?
  [vc attr-get _ _]
   (let [attrs (attr-get ["PL"] vc)]
    (when (not-any? nil? (vals attrs))
      (> (get attrs "PL") -20.0))))

Calls with low supporting depth

(defn- low-depth?
  [vc attr-get _ _]
  (let [attrs (attr-get ["DP"] vc)]
    (when (not-any? nil? (vals attrs))
      (< (get attrs "DP") 25.0))))

Avoid feeding low quality mapping into true/false positives.

(defn- passes-mapping-quality?
  [vc attr-get _ _]
  (let [attrs (attr-get ["MQ"] vc)]
    (when (not-any? nil? (vals attrs))
      (> (get attrs "MQ") 50.0))))

Identify skewed allele balances indicative of artifacts. This is a signature of problem heterozygote calls from GATK Haplotype caller.

(defn- artifact-allele-balance?
  [vc attr-get _ _]
  (let [attrs (attr-get ["AD"] vc)]
    (when (not-any? nil? (vals attrs))
      (> (get attrs "AD") 0.35))))
(defn- passes-filter?
  [vc _ _ _]
  (metrics/passes-filter? vc))

Define keyword mappings to function definitions

(def ^{:private true
       :doc }
  rules {:below-call-support below-support-thresh?
         :all-callers is-intersection? 
         :novel novel-variant?
         :het-snp het-snp?
         :het-indel het-indel?
         :novel-het-indel novel-het-indel?
         :low-confidence-novel-het-snp low-confidence-novel-het-snp?
         :low-confidence low-call-confidence?
         :good-pl good-pl-support?
         :flex-low-confidence flex-low-call-confidence?
         :low-depth low-depth?
         :passes-filter passes-filter?
         :high-map-quality passes-mapping-quality?
         :problem-allele-balance artifact-allele-balance?})

Identify variants conforming to supplied rules.

(defn vc-checker
  [orig-file call exp]
  (let [attr-get (attr/prep-vc-attr-retriever orig-file (:ref exp))]
    (letfn [(call-rule [vc rulekw]
              ((get rules rulekw) vc attr-get call exp))]
      (fn [vc & {:keys [yes no]}]
        (and (every? (partial call-rule vc) yes)
             (not-any? (partial call-rule vc) no))))))

Identify technology or caller specific variants from multiple combined callsets.

(ns bcbio.variation.filter.specific
  (:import [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder]
           [org.broadinstitute.sting.utils.codecs.vcf VCFHeader VCFInfoHeaderLine
            VCFHeaderLineCount VCFHeaderLineType])
  (:use [ordered.set :only [ordered-set]]
        [bcbio.variation.filter.attr :only [get-vc-attrs]]
        [bcbio.variation.filter.trusted :only [variant-set-metadata
  (:require [bcbio.variation.variantcontext :as gvc])
  (:require [clojure.string :as string]
            [bcbio.run.itx :as itx]))
(defn- get-specific
  [data kw-want kw-cmp]
  (when (and (= 1 (count (kw-want data)))
             (> (count (kw-cmp data)) 1))
    (first (kw-want data))))

Check if a variant is specific to a caller or method.

(defn get-x-specific-designation
  [vc calls]
  (let [data (variant-set-metadata vc calls)]
    (reduce (fn [coll [kw-want kw-cmp]]
              (if-let [x (get-specific data kw-want kw-cmp)]
                (assoc coll kw-want x)
            {} [[:caller :technology] [:technology :caller]])))

Add specificity information to a VariantContext if present.

(defn- add-x-specific
  [vc calls]
  (letfn [(xspec-to-string [[k v]]
            (str (name k) ":" v))]
    (let [xspec (get-x-specific-designation vc calls)]
      (when (seq xspec)
        (-> (VariantContextBuilder. (:vc vc))
            (.attributes (assoc (:attributes vc)
                           "xspecific" (->> xspec
                                            (map xspec-to-string)
                                            (string/join ","))))
(defn- add-x-specific-header
  [_ header]
  (let [new #{(VCFInfoHeaderLine.
               "xspecific" VCFHeaderLineCount/UNBOUNDED VCFHeaderLineType/String
               "Identify variant call as specific to a technology or calling method.")}]
    (VCFHeader. (apply ordered-set (concat (.getMetaDataInInputOrder header) new))
                (.getGenotypeSamples header))))

Simple measure to evaluate call support based on depth and allele balance. This identifies poorly supported items, which primarily make up false positive examples.

(defn poor-call-support?
  [vc & {:keys [thresh]
         :or {thresh {:dp 100 :ad 0.05}}}]
  (let [attrs (get-vc-attrs vc [[:format "AD"] [:format "DP"]] {})]
    (and (when-let [dp (get attrs [:format "DP"])]
           (< dp (:dp thresh)))
         (when-let [ad (get attrs [:format "AD"])]
           (> ad (:ad thresh))))))

Filter VCF file generating output only variants specific to a technology or caller.

(defn get-x-specific-variants
  [cmps support exp config]
  (when-let [base-vcf (get-comparison-fullcombine cmps support config)]
    (let [out-file (itx/add-file-part base-vcf "xspecific")]
      (when (itx/needs-run? out-file)
        (with-open [base-vcf-iter (gvc/get-vcf-iterator base-vcf (:ref exp))]
          (gvc/write-vcf-w-template base-vcf {:out out-file}
                                    (->> (gvc/parse-vcf base-vcf-iter)
                                         (filter poor-call-support?)
                                         (map #(add-x-specific % (:calls exp)))
                                         (remove nil?))
                                    (:ref exp) :header-update-fn add-x-specific-header)))

Extract training cases from comparisons for machine learning approaches. Based on a comparison, identified potential true positives, false positives and false negatives to further tweak classifiers.

(ns bcbio.variation.filter.train
  (:use [clojure.java.io]
        [bcbio.variation.multiple :only [prep-cmp-name-lookup]])
  (:require [fs.core :as fs]
            [bcbio.run.broad :as broad]
            [bcbio.run.itx :as itx]))

Retrieve output file of concordant calls between two sets of variant calls

(defn- select-concordant
  [fname1 fname2 ref out-file]
  (let [args ["-R" ref
              "--variant" fname1
              "--concordance" fname2
              "--out" :out-vcf]]
    (broad/run-gatk "SelectVariants" args {:out-vcf out-file} {:out [:out-vcf]}))

Common infrastructure for generating training values

(defn- prep-common
  [case-kw file-ext cases out-base ref-file]
  (letfn [(get-discordant-by-kw [x]
            (get-in x [:c-files (keyword (str (get x case-kw) "-discordant"))]))]
    (let [out-file (str out-base file-ext)]
      (when (itx/needs-run? out-file)
        (apply select-concordant
               (concat (map get-discordant-by-kw cases) [ref-file out-file])))

Retrieve potential false negatives, discordant calls found in all of the comparison cases but not in the target.

(defn- prep-false-negatives
  [cases out-base ref-file]
  (prep-common :cmp "-potential-fns.vcf"
               cases out-base ref-file))

Retrieve potential false positives, discordant calls from the target not found in any of the comparison cases.

(defn- prep-false-positives
  [cases out-base ref-file]
  (prep-common :target "-potential-fps.vcf"
               cases out-base ref-file))

Retrieve cases to use for preparing training sets from supplied inputs. Prepares list of maps with :target :cmp :c-files, where the latter contains all original comparison files.

(defn- get-train-cases
  [cmps-orig train-info]
  (letfn [(get-train-case [cmps target cmp]
            {:target target :cmp cmp
             :c-files (:c-files (get cmps [target cmp] (get cmps [cmp target])))})]
    (let [cmps (prep-cmp-name-lookup cmps-orig)]
      (map (partial get-train-case cmps (:target train-info)) (:cmps train-info)))))

Prepare exploratory training cases based on specified inputs

(defn extract-train-cases
  [cmps train-info exp config]
  (let [out-dir (str (file (get-in config [:dir :prep] (get-in config [:dir :out])) "train"))
        cases (get-train-cases cmps train-info)
        out-base (str (file out-dir (format "%s-%s" (:sample exp) (:target train-info))))]
    (when (not (fs/exists? out-dir))
      (fs/mkdirs out-dir))
    {:fns (prep-false-negatives cases out-base (:ref exp))
     :fps (prep-false-positives cases out-base (:ref exp))}))

Retrieve trusted variants from comparisons based on configured thresholds. Allows specification of cases where we should trust variants to pass, such as: found in more than two sequencing technologies, or called in 3 aligners, or called in 7 out of 8 inputs.

(ns bcbio.variation.filter.trusted
  (:use [bcbio.variation.multiple :only [multiple-overlap-analysis remove-mod-name
                                         prep-cmp-name-lookup get-vc-set-calls]]
        [bcbio.variation.variantcontext :only [parse-vcf write-vcf-w-template
  (:require [clojure.string :as string]
            [bcbio.run.itx :as itx]))

Check if a comparison set is only pairwise and not multiple.

(defn- pairwise-only?
  (= 1 (count (set (map (fn [xs] (vec (map remove-mod-name xs))) cmp-names)))))

Retrieve supporting VCFs for a set of comparisons and specified support.

(defn get-support-vcfs
  [cmps support config & {:keys [remove-mods?]}]
  (let [cmps-by-name (if (map? cmps) cmps (prep-cmp-name-lookup cmps :remove-mods? remove-mods?))
        support (if (and (not (coll? support)) (pairwise-only? (keys cmps-by-name)))
                  (first (keys cmps-by-name))
    (if (coll? support)
      (zipmap [:true-positives :false-positives]
              (take 2 (-> cmps-by-name (get support) :c-files vals)))
      (let [x (multiple-overlap-analysis cmps-by-name config support)]
        (into {} (map (juxt identity x)
                      [:true-positives :false-positives :target-overlaps]))))))

Retrieve metadata associated with overlapping variants from combined set attribute.

(defn variant-set-metadata
  [vc calls]
  (when-let [set-calls (get-vc-set-calls vc calls)]
    (reduce (fn [coll x]
              (let [cur-name (string/replace (:name x) "-" "_")]
                (if-not (contains? set-calls cur-name)
                  (reduce (fn [inner [k v]]
                            (assoc inner k (conj (get inner k #{}) v)))
                          coll (assoc (get x :metadata {}) :total cur-name)))))
            {} calls)))

Determine if we trust a variant based on specified trust parameters. The params specify required counts for inclusion. For instance: {:total 4 :technology 3 :caller 2} includes variants located in 4 total calls or in three different technologies or in 2 different callers. It can also handle percentages for required inputs: {:total 1.0 :technology 0.75}

(defn is-trusted-variant?
  [vc params calls]
  (letfn [(collapse-md-by-type [calls]
            (reduce (fn [coll [k v]]
                      (assoc coll k (conj (get coll k #{}) v)))
                    {:total (set (map :name calls))}
                    (mapcat :metadata calls)))
          (calc-md-counts [calls]
            (reduce (fn [coll [k v]]
                      (assoc coll k (count v)))
                    (collapse-md-by-type calls)))
          (param-passes? [metadata md-counts [k v]]
            (let [n (count (get metadata k []))]
              (if (> v 1)
                (>= n v)
                (>= (/ n (get md-counts k)) v))))]
    (let [use-calls (remove :recall calls)]
      (some (partial param-passes? (variant-set-metadata vc use-calls)
                     (calc-md-counts use-calls))

Retrieve the all variant fullcombine VCF for a set of comparisons.

(defn get-comparison-fullcombine
  [cmps support config]
   (get-support-vcfs cmps (if (coll? support) (first support) support)
                     config :remove-mods? true)))

Retrieve VCF file of trusted variants based on specific parameters.

(defn get-trusted-variants
  [cmps support params exp config]
  (when-let [base-vcf (get-comparison-fullcombine cmps support config)]
    (let [out-file (itx/add-file-part base-vcf "trusted")]
      (when (itx/needs-run? out-file)
        (with-open [base-vcf-iter (get-vcf-iterator base-vcf (:ref exp))]
          (write-vcf-w-template base-vcf {:out out-file}
                                (->> (parse-vcf base-vcf-iter)
                                     (filter #(is-trusted-variant? % params (:calls exp)))
                                     (map :vc))
                                (:ref exp))))

Provide useful utilities dealing with filtering of variants

(ns bcbio.variation.filter.util
  (:import [org.broadinstitute.sting.utils.variantcontext
(:require [bcbio.run.itx :as itx]
          [bcbio.variation.variantcontext :as gvc]))

Remove any filter information in the supplied file.

(defn remove-cur-filters
  [in-vcf ref]
  (letfn [(remove-vc-filter [vc]
            [:out (-> (VariantContextBuilder. (:vc vc))
    (let [out-file (itx/add-file-part in-vcf "nofilter")]
      (when (itx/needs-run? out-file)
        (with-open [vcf-iter (gvc/get-vcf-iterator in-vcf ref)]
          (gvc/write-vcf-w-template in-vcf {:out out-file}
                                    (map remove-vc-filter (gvc/parse-vcf vcf-iter))

Support comparisons of variant calls to reference call sets, providing detailed metrics about problematic discordant calls.

(ns bcbio.variation.grade
  (:import [org.broadinstitute.sting.utils.codecs.vcf
            VCFInfoHeaderLine VCFHeaderLineType]
           [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder])
  (:require [clojure.set :refer [intersection]]
            [clojure.string :as string]
            [clojure.math.combinatorics :as combo]
            [lonocloud.synthread :as ->]
            [bcbio.run.itx :as itx]
            [bcbio.variation.annotation :as annotation]
            [bcbio.variation.combine :as combine]
            [bcbio.variation.filter.attr :as attr]
            [bcbio.variation.phasing :as phasing]
            [bcbio.variation.variantcontext :as gvc]))

Utility functions

(defn is-grade-cmp?
  (= :grade (keyword (get exp :approach "compare"))))

Grading references are either haploid or identified with grading-ref type.

(defn is-grading-ref?
  [exp c]
  (or (= :grading-ref (keyword (:type c)))
      (-> c :file (phasing/is-haploid? (:ref exp)))))

Separate grading reference and evaluation genome based on type parameter. Defaults to the first being reference and second evaluation if not defined.

(defn- find-grading-and-eval-kws
  [exp c1 c2 c-files]
  (let [grade-groups (group-by (partial is-grading-ref? exp) [c1 c2])
        marked-ref (first (get grade-groups true))
        marked-eval (first (get grade-groups false))
        [truth-c eval-c] (if (and marked-ref marked-eval)
                           [marked-ref marked-eval]
                           [c1 c2])
        eval-kw (keyword (str (:name eval-c) "-discordant"))
        truth-kw (keyword (str (:name truth-c) "-discordant"))]
    {:eval (if (contains? c-files eval-kw) eval-kw :discordant)
     :truth (if (contains? c-files truth-kw) truth-kw :discordant-missing)}))

Summarize grading results

Decide on a likely reason for a discordant variant call

(defn- pick-discordant-reason
  [vc attr-getter]
  (letfn [(is-repeat-region? [attrs]
            (or (< (or (get attrs "gms_illumina") 100.0) 50.0)
                (contains? (or (get attrs "rmsk") #{}) "repeat")))
          (is-error-prone? [attrs]
            (contains? (or (get attrs "in_cse") #{}) "error-prone"))]
    (let [attrs (attr-getter ["DP" "rmsk" "gms_illumina" "in_cse"] vc)]
       (< (or (get attrs "DP") 500) 10) :low-coverage
       (is-error-prone? attrs) :error-prone
       (is-repeat-region? attrs) :repeat
       :else :other))))

Identify the variant type and discordant category. - variant types -- :snp :indel - discordant types -- :shared :missing :extra - reason types -- :hethom :vardiff :low-coverage :repeat :error-prone :other

(defn- identify-discordant-cat
  [vc attr-getter]
  (let [vtype (keyword (string/lower-case (:type vc)))
        cat (-> (get-in vc [:attributes "GradeCat"])
                (string/replace "discordant-" "")
        [dtype rtype] (case cat
                        (:missing :extra) [cat (pick-discordant-reason vc attr-getter)]
                        [:shared cat])]
    [vtype dtype rtype]))
(defn- count-discordant-categories
  [vcf-file ref-file]
  (let [attr-getter (attr/prep-vc-attr-retriever vcf-file ref-file)]
    (with-open [in-vcf-iter (gvc/get-vcf-iterator vcf-file ref-file)]
      (reduce (fn [coll vc]
                (let [cat (identify-discordant-cat vc attr-getter)]
                  (assoc-in coll cat (inc (get-in coll cat 0)))))
              {} (gvc/parse-vcf in-vcf-iter)))))

Prepare detailed grading breakdown of concordant and discordant variants. The goal is to help identify common causes of discordance.

(defn prep-grade-breakdown
  (let [kws (find-grading-and-eval-kws (:exp cmp) (:c1 cmp) (:c2 cmp)
                                       (:c-files cmp))
        vcf-file (get-in cmp [:c-files (:eval kws)])
        ref-file (get-in cmp [:exp :ref])
        summary (:summary cmp)]
    {:sample (:sample summary)
     :concordant (select-keys summary [:genotype_concordance :callable_concordance
     :discordant (count-discordant-categories vcf-file ref-file)}))

Identify grading references

Convert truth discordants into reference calls

(defn- to-refcalls
  [f ref-file]
  (let [out-file (itx/add-file-part f "asref")]
    (when (itx/needs-run? out-file)
      (with-open [in-vcf-iter (gvc/get-vcf-iterator f ref-file)]
        (gvc/write-vcf-w-template f {:out out-file}
                                  (map #(gvc/genotypes->refcall %)
                                       (gvc/parse-vcf in-vcf-iter))

Merge extra and missing discordant calls into single VCF.

(defn- merge-discordants
  [eval-vcf truth-vcf align-bam ref-file]
  (let [truth-dis-vcf (-> truth-vcf
                          (to-refcalls ref-file)
                          (->/when align-bam
                            (annotation/add-gatk-annotations align-bam ref-file :annos ["DepthOfCoverage"])))]
    (combine/combine-variants [eval-vcf truth-dis-vcf]
                              ref-file :merge-type :full :quiet-out? true :check-ploidy? false)))

Add grading info to VCF

Identify variants that are variant calls but discordant based on het/hom calls.

(defn hethom-discordant?
  [vc other-vcs]
  (letfn [(get-alleles [x]
            (->> (:genotypes x)
                 (map :alleles)
                 (remove #(.isReference %))
    (let [vc2 (first (filter #(and (= (:start vc) (:start %))
                                   (= (:ref-allele vc) (:ref-allele %)))
      (seq (intersection (get-alleles vc) (get-alleles vc2))))))

Is a variant context a no-variant (reference) call.

(defn is-novar-call?
  (->> (:genotypes vc)
       (map :alleles)
       (every? #(.isReference %))))

Assign a discordance category to variants that do not match grading reference.

(defn- assign-grade-cat
  [vc ref-vcs]
   (empty? ref-vcs) :discordant-extra
   (is-novar-call? vc) :discordant-missing
   (hethom-discordant? vc ref-vcs) :discordant-hethom
   :else :discordant-vardiff))

Determine likelihood of grading reference based on grading category. :discordant-missing -- probability that grading standard is actually reference. :discordant-hethom -- probability that grading standard is alternative variant.

(defn- get-grade-score
  [ref-vc cat]
  (let [pls (when ref-vc
              (attr/get-pls ref-vc))]
    (case cat
      :discordant-missing (get pls "HOM_REF")
      :discordant-hethom (first (vals (dissoc pls "HOM_REF")))

Add grading category and score, providing additional details on discordant variants.

(defn add-grade-cat
  [ref-get vc]
  (let [ref-vcs (gvc/variants-in-region ref-get vc)
        grade-cat (assign-grade-cat vc ref-vcs)]
    (-> (VariantContextBuilder. (:vc vc))
        (.attributes (-> (:attributes vc)
                         (assoc "GradeCat" (name grade-cat))
                         (->/when-let [score (get-grade-score (first ref-vcs) grade-cat)]
                           (assoc "GradeScore" (float score)))))

Add grading INFO fields to VCF output header

(defn add-grade-header
  [_ header]
   #{(VCFInfoHeaderLine. "GradeCat" 1 VCFHeaderLineType/String
                         "Grading category based on comparison with reference call set.")
     (VCFInfoHeaderLine. "GradeScore" 1 VCFHeaderLineType/Float
                         "Grading score: phred score of correct probability reference call.")}))

Update a comparison with annotated information on discordant grading

(defn annotate-discordant
  (let [kws (find-grading-and-eval-kws (:exp cmp) (:c1 cmp) (:c2 cmp)
                                       (:c-files cmp))
        eval-vcf (get-in cmp [:c-files (:eval kws)])
        truth-vcf (get-in cmp [:c-files (:truth kws)])
        ref-file (get-in cmp [:exp :ref])
        align-bam (get-in cmp [:exp :align])
        base-eval-vcf (merge-discordants eval-vcf truth-vcf align-bam ref-file)
        out-vcf (itx/add-file-part eval-vcf "annotate")]
    (when (itx/needs-run? out-vcf)
      (with-open [ref-get (gvc/get-vcf-retriever ref-file truth-vcf)
                  eval-iter (gvc/get-vcf-iterator base-eval-vcf ref-file)]
        (gvc/write-vcf-w-template base-eval-vcf {:out out-vcf}
                                  (map (partial add-grade-cat ref-get)
                                       (gvc/parse-vcf eval-iter))
                                  ref-file :header-update-fn add-grade-header)))
    (assoc-in cmp [:c-files (:eval kws)] out-vcf)))

Convert diploid variants from GATK into haploid calls based on genotype likelihoods. We assess diploid GATK calls based on the phred-normalized likelihood (PL). Lower variant PLs are likely to be true and included. The GATK documentation contains a detailed example of the format and interpretation: http://gatkforums.broadinstitute.org/discussion/1268/how-should-i-interpret-vcf-files-produced-by-the-gatk

(ns bcbio.variation.haploid
  (:import [org.broadinstitute.sting.utils.variantcontext 
            VariantContextBuilder GenotypesContext GenotypeBuilder Allele])
  (:use [clojure.java.io]
        [bcbio.variation.variantcontext :only [parse-vcf get-vcf-iterator write-vcf-w-template]])
  (:require [clojure.string :as string]
            [bcbio.run.itx :as itx]))

Convert diploid -> haploid

Threshold to include a heterozygous allele as a haploid homozygote variant. Based on type of variant: SNPs have lower threshold of inclusion. Includes two thresholds: for possibly being homozygous variant and for not being homozygous reference.

(defn- get-haploid-thresh
  (case (:type vc)
    "SNP" {"HOM_VAR" 1e-5
           "HOM_REF" 1e-20}
    {"HOM_VAR" 1e-50
     "HOM_REF" 1e-200}))

Retrieve all likelihoods (PL) for genotype.

(defn get-likelihoods
  [g & {:keys [no-convert]}]
  (when (and (.hasLikelihoods g)
             (> (count (vec (.getAsVector (.getLikelihoods g)))) 1))
    (let [pl-vec (vec (.getAsVector (.getLikelihoods g)))]
      (if (= (count pl-vec) 2)
        (zipmap ["HOM_REF" "HOM_VAR"] pl-vec)
        (let [in-map (-> (.getLikelihoods g) (.getAsMap (nil? no-convert)))]
          (zipmap (map #(.name %) (keys in-map)) (vals in-map)))))))

Can we reasonably convert a heterozygote call to haploid based on likelihoods? We allow a het to pass when the homozygous variant prob is less than our defined thresholds, or our homozygous reference prob is greater.

(defn het-can-be-haploid?
  [g vc]
  (let [probs (get-likelihoods g)
        thresh (get-haploid-thresh vc)]
    (letfn [(passes-var? []
              (when-let [p (get probs "HOM_VAR")]
                (> p (get thresh "HOM_VAR"))))
            (passes-ref? []
              (when-let [p (get probs "HOM_REF")]
                (< p (get thresh "HOM_REF"))))]
      (or (passes-var?) (passes-ref?)))))

Retrieve updated genotype with haploid allele.

(defn- get-haploid-genotype
  (letfn [(maybe-variant-haploid [g vc]
            (when (het-can-be-haploid? g vc)
              (first (filter #(and (.isNonReference %) (.isCalled %))
                               (.getAlleles g)))))
          (extract-mixed-allele [alleles]
            (let [ready (remove #(.isNoCall %) alleles)]
              (when (= 1 (count ready))
                (first ready))))
          (get-haploid-allele [g vc]
            (case (:type g)
              "HOM_VAR" (first (:alleles g))
              "MIXED" (extract-mixed-allele (:alleles g))
              "HET" (maybe-variant-haploid (:genotype g) vc)
          (add-haploid-genotype [context g]
            (let [allele (or (get-haploid-allele g vc) Allele/NO_CALL)]
              (doto context
                (.replace (-> (GenotypeBuilder. (:genotype g))
                              (.alleles [allele])
    (reduce add-haploid-genotype
            (-> vc :vc .getGenotypes GenotypesContext/copy)
            (:genotypes vc))))

Check for at least one called Allele in a list of Genotypes.

(defn- has-called-allele?
  (not-every? #(.isNoCall %) (mapcat #(.getAlleles %) genotypes)))

Convert diploid allele to haploid variant.

(defn- convert-to-haploid
  (let [new-genotype (get-haploid-genotype vc)]
    (if (has-called-allele? new-genotype)
      [:haploid (-> (VariantContextBuilder. (:vc vc))
                    (.genotypes new-genotype)
      [:unchanged (:vc vc)])))

Convert set of diploid GATK calls on a haploid genome based on likelihoods.

(defn diploid-calls-to-haploid
  [vcf ref & {:keys [out-dir]}]
  (let [out-files {:haploid (itx/add-file-part vcf "haploid" out-dir)
                   :unchanged (itx/add-file-part vcf "nonhaploid" out-dir)}]
    (when (itx/needs-run? (vals out-files))
      (with-open [vcf-iter (get-vcf-iterator vcf ref)]
        (write-vcf-w-template vcf out-files
                              (map convert-to-haploid (parse-vcf vcf-iter))
    (:haploid out-files)))

Examine diploid metrics

Write phred likelihoods for het calls to be haploid variants.

(defn write-het-variant-pls
  [vcf-file ref-file & attrs]
  (letfn [(get-pl [vc]
            (let [g (-> vc :genotypes first :genotype)]
              (when (.hasLikelihoods g)
                (let [in-map (-> (.getLikelihoods g) (.getAsMap true))]
                  (get (zipmap (map #(.name %) (keys in-map)) (vals in-map))
  (let [out-file (str (itx/file-root vcf-file) "-het-pls.csv")]
    (with-open [vcf-iter (get-vcf-iterator vcf-file ref-file)
                wtr (writer out-file)]
      (doseq [val (->> (parse-vcf vcf-iter)
                       (filter #(= "HET" (-> % :genotypes first :type)))
                       (map get-pl)
                       (remove nil?))]
        (.write wtr (str (string/join "," (cons val attrs)) "\n"))))
(defn -main
  [vcf ref]
  (diploid-calls-to-haploid vcf ref))

Index and retrieve variant associated population genetic and disease data. Built on the Gemini framework: https://github.com/arq5x/gemini

(ns bcbio.variation.index.gemini
  (:use [clojure.java.io]
        [ordered.map :only [ordered-map]]
        [bcbio.variation.api.shared :only [web-config]]
        [bcbio.variation.web.db :only [get-sqlite-db get-sqlite-db-pool]])
  (:require [clojure.java.jdbc :as sql]
            [clojure.java.shell :as shell]
            [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]))


(defn get-gemini-cmd []
  (let [cmd (get-in @web-config [:program :gemini] "gemini")
        info (try (shell/sh cmd "-h")
                  (catch java.io.IOException _
                    {:exit -1}))]
    (when (zero? (:exit info))

Check if the input file contains snpEff annotations.

(defn- has-snpeff-anns?
  (with-open [rdr (reader in-file)]
    (->> (line-seq rdr)
         (take-while #(.startsWith % "##"))
         (filter #(.startsWith % "##SnpEff"))

Pre-index a variant file with gemini, handling snpEff annotations.

(defn index-variant-file
  [in-file _ & {:keys [re-index?]}]
  (when-let [gemini-cmd (get-gemini-cmd)]
    (when in-file
      (let [index-file (str (itx/file-root in-file) "-gemini.db")]
        (when (or (itx/needs-run? index-file) re-index?)
          (itx/with-tx-file [tx-index index-file]
            (apply shell/sh
                   (concat [gemini-cmd "load" "-v" in-file]
                           (when (has-snpeff-anns? in-file)
                             ["-t" "snpEff"])

Gemini metrics to expose for query and visualization.

(def ^{:doc 
       :private true}
   "aaf_1kg_all" {:range [0.0 1.0]
                  :desc "1000 genomes allele frequency, all populations"}
   "gms_illumina" {:range [0.0 100.0]
                   :y-scale {:type :log}
                   :desc "Genome Mappability Score with an Illumina error model"}
   "in_cse" {:x-scale {:type :category}
             :desc "Presence of variant in an error prone genomic position"}
   "rmsk" {:x-scale {:type :category}
           :desc "Repeat status: is the variant in a known repeat region"}
   "type" {:x-scale {:type :category}
           :rows {:type "" :sub_type ""}
           :desc "Type of variant change"}
   "zygosity" {:x-scale {:type :category}
               :rows {:num_hom_ref "homozygous ref"
                      :num_het "heterozygous"
                      :num_hom_alt "homozygous"}
               :desc "Allele types present in individuals"}
   "encode_consensus_gm12878" {:x-scale {:type :category}
                               :desc "Chromatin status: consensus from ENCODE for NA12878"}
   "in_public" {:x-scale {:type :category}
                :rows {:in_dbsnp "dbSNP"
                       :in_hm3 "HapMap3"
                       :in_esp "ESP"
                       :in_1kg "1000genomes"}
                :desc "Presence in large variant projects like dbSNP and 1000 genomes"}
   "is_coding" {:x-scale {:type :category}
                :desc "Type of coding transcript influenced by variant"}
   "impact_severity" {:x-scale {:type :category}
                      :desc "Severity of variant impact on coding region"}))

Convert a gemini attribute into potentially multiple gemini column names.

(defn- attr->colnames
  (->> (if-let [rows (get-in gemini-metrics [(name attr) :rows])]
         (keys rows)
       (map name)))

Retrieve metrics available from Gemini.

(defn available-metrics
  [in-file & {:keys [noviz?]}]
  (let [all-metrics (->> gemini-metrics
                         (map (fn [[k v]] (assoc v :id k)))
                         (filter #(or noviz? (get % :viz true))))]
    (if-let [index-db (index-variant-file in-file nil)]
      (sql/with-connection (get-sqlite-db index-db)
        (letfn [(db-has-metric? [x]
                  (sql/with-query-results rows
                    [(str "SELECT chrom, start FROM variants WHERE "
                          (->> (attr->colnames (:id x))
                               (map #(str % " IS NOT NULL"))
                               (string/join " OR "))
                           " LIMIT 1")]
                    (seq rows)))]
          (doall (filter db-has-metric? all-metrics))))

Provide additional post-processing of gemini supplied attributes.

(defmulti finalize-gemini-attr
  (fn [attr row] (keyword (string/lower-case attr))))
(defmethod finalize-gemini-attr :sift_score
  [_ row]
  (let [val (first (vals row))]
    (if (nil? val) 1.0 val)))
(defmethod finalize-gemini-attr :polyphen_score
  [_ row]
  (let [val (first (vals row))]
    (if (nil? val) 0.0 val)))
(defmethod finalize-gemini-attr :gms_illumina
  [_ row]
  (let [val (first (vals row))]
    (if (nil? val) 100.0 val)))
(defmethod finalize-gemini-attr :gms_solid
  [_ row]
  (let [val (first (vals row))]
    (if (nil? val) 100.0 val)))
(defmethod finalize-gemini-attr :gms_iontorrent
  [_ row]
  (let [val (first (vals row))]
    (if (nil? val) 100.0 val)))
(defmethod finalize-gemini-attr :in_cse
  [_ row]
  (let [val (first (vals row))]
    (if (and (not (nil? val)) (pos? val)) #{"error-prone"} #{"standard"})))
(defmethod finalize-gemini-attr :rmsk
  [_ row]
  (let [val (first (vals row))]
    #{(if (nil? val) "non-repeat" "repeat")}))
(defmethod finalize-gemini-attr :type
  [_ row]
  (set (map #(case %
               "ts" "transition"
               "tv" "transversion"
               "ins" "insertion"
               "del" "deletion"
            (vals row))))

Convert a row into pre-configured names based on gemini-metrics :rows

(defn- row->names
  [attr row]
  (let [row-names (get-in gemini-metrics [attr :rows])]
    (reduce (fn [coll [k v]]
              (if (and (not (nil? v))
                       (pos? v))
                (conj coll (get row-names (keyword k)))
            #{} row)))
(defmethod finalize-gemini-attr :zygosity
  [attr row]
  (row->names attr row))
(defmethod finalize-gemini-attr :encode_consensus_gm12878
  ^{:doc "ENCODE chromatin segment predictions, from Table 3 of doi:10.1038/nature11247"}
  [_ row]
  (let [val (first (vals row))]
    #{(case val
        "CTCF" "CTCF-enriched"
        "E" "Enhancer"
        "PF" "Promoter flanking"
        "R" "Repressed"
        "TSS" "Promoter with TSS"
        "T" "Transcribed"
        "WE" "Weak enchancer"
(defmethod finalize-gemini-attr :in_public
  [attr row]
  (let [publics (row->names attr row)]
    (if (seq publics)
(defmethod finalize-gemini-attr :is_coding
  [_ row]
  (let [val (first (vals row))]
    (if (and (not (nil? val)) (pos? val)) #{"coding"} #{"noncoding"})))
(defmethod finalize-gemini-attr :impact_severity
  [_ row]
  (let [val (first (vals row))]
(defmethod finalize-gemini-attr :default
  [_ row]
  (let [val (first (vals row))]
(defn- gemini-metric-from-row
  [row attr]
  (let [colnames (attr->colnames attr)]
    (finalize-gemini-attr attr
                          (zipmap colnames
                                  (map #(get row (keyword (string/lower-case %))) colnames)))))

Retrieve metrics by name from a gemini index for provided VariantContexts.

(defn vc-attr-retriever
  [in-file ref-file]
  (if-let [index-db (index-variant-file in-file ref-file)]
    (let [pool (get-sqlite-db-pool index-db)]
      (fn [vc attr]
        (sql/with-connection pool
          (sql/with-query-results rows
            [(str "SELECT " (string/join "," (attr->colnames attr))
                  " FROM variants WHERE chrom = ? AND start = ? and ref = ?")
             (str "chr" (:chr vc)) (dec (:start vc)) (.getBaseString (:ref-allele vc))]
            (gemini-metric-from-row (first rows) attr)))))
    (fn [vc attr] nil)))

Retrieve table of Gemini metrics keyed on variant names.

(defn get-raw-metrics
  [in-file ref-file & {:keys [metrics noviz?]}]
  (when-let [index-db (index-variant-file in-file ref-file)]
    (let [plot-metrics (filter (partial contains? gemini-metrics)
                               (or metrics (map :id (available-metrics in-file
                                                                       :noviz? noviz?))))]
      (when (seq plot-metrics)
        (sql/with-connection (get-sqlite-db index-db)
          (sql/with-query-results rows
            [(str "SELECT chrom, start, ref, "
                  (string/join ", " (mapcat attr->colnames plot-metrics))
                  " FROM variants WHERE (filter is NULL or filter = 'PASS')"
                  " ORDER BY chrom, start")]
            (doall (map (fn [orig]
                          (reduce (fn [coll x]
                                    (assoc coll x (gemini-metric-from-row orig x)))
                                  {:id [(string/replace (:chrom orig) "chr" "") (inc (:start orig)) (:ref orig)]}

Pre-index a variant file for quick retrieval of associated metrics.

(ns bcbio.variation.index.metrics
  (:use [ordered.map :only [ordered-map]]
        [bcbio.variation.metrics :only [passes-filter?]]
        [bcbio.variation.filter.attr :only [get-vc-attrs]]
        [bcbio.variation.index.subsample :only [subsample-by-cluster]]
        [bcbio.variation.variantcontext :only [get-vcf-header get-vcf-iterator parse-vcf]]
        [bcbio.variation.web.db :only [get-sqlite-db]])
  (:require [clojure.string :as string]
            [clojure.java.jdbc :as sql]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]))

Metrics to expose, ranked in order of priority with default min/max values.

  (ordered-map "QUAL" {:range [0.0 2000.0]
                       :desc "Variant quality score, phred-scaled"}
               "DP" {:range [0.0 250.0]
                     :desc "Read depth after filtering of low quality reads"}
               "MQ" {:range [25.0 75.0]
                     :desc "Mapping quality"}
               "QD" {:range [0.0 50.0]
                     :desc "Variant confidence by depth"}
               "FS" {:range [0.0 100.0]
                     :y-scale {:type :log}
                     :desc "Phred-scaled p-value using Fisher's exact test to detect strand bias"}
               "ReadPosEndDist" {:range [0.0 50.0]
                                 :desc "Mean distance from either end of read"}
               "MFE" {:range [-10.0 0.0]
                      :y-scale {:type :log}
                      :desc (str "Minimum Free Energy of secondary structure near variant. "
                                 "Larger negative values are more problematic.")}
               "Entropy" {:range [1.0 4.0]
                          :desc (str "Shannon entropy of variant flanking regions. "
                                     "Low values indicate repetitive sequence.")}
               "AD" {:range [0.0 1.0]
                     :y-scale {:type :log}
                     :desc "Deviation from expected allele balance for ref/alt alleles"}
               "PL" {:range [-100.0 0]
                     :desc "Normalized, phred-scaled likelihoods for alternative genotype"}
               "HaplotypeScore" {:range [0.0 50.0]
                                 :desc "Consistency of the site with at most two segregating haplotypes"}
               ;; Validation
               "GradeCat" {:x-scale {:type :category}
                           :desc "Validation category, differentiating discordant types"}
               ;; Methylation metrics from BisSNP
               "CS" {:x-scale {:type :category}
                     :desc "Strand of cytosine relative to reference genome"}
               "Context" {:x-scale {:type :category}
                          :desc "Cytosine context: homozygous or heterozygous CG sites"}
               "CM" {:range [0.0 50.0]
                     :y-scale {:type :log}
                     :desc "Number of unconverted, methylated, cytosines"}
               "CU" {:range [0.0 1.0]
                     :y-scale {:type :log}
                     :desc "Percentage of methylated bases at a position."}))

Default metrics that are always available.

(def ^{:doc  :private true}
  [{:id "QUAL" :type :float}])
(defmulti available-metrics
  (fn [in-file] in-file))
(defmethod available-metrics nil
  ^{:doc "Retrieve all available default metrics without file information"}
  (map (fn [[k v]] (assoc v :id k)) expose-metrics))
(defmethod available-metrics :default
  ^{:doc "Retrieve metrics available for variant input file."}
  (letfn [(convert-header [line]
            {:id (.getID line)
             :type (case (.name (.getType line))
                     "Integer" :float
                     "Float" :float
                     "String" :text
                     "Character" :text
                     :else nil)})
          (add-base-info [x]
            (merge x (get expose-metrics (:id x))))]
    (let [metrics-order (reduce (fn [coll [i x]] (assoc coll x i))
                                {} (map-indexed vector (keys expose-metrics)))]
      (->> (get-vcf-header vcf-file)
           (filter #(contains? #{"INFO" "FORMAT"} (.getKey %)))
           (filter #(contains? expose-metrics (.getID %)))
           (group-by #(.getID %))
           (map first)
           (map convert-header)
           (concat default-metrics)
           (map add-base-info)
           (sort-by #(get metrics-order (:id %)))))))

Common columns for variant metrics table.

(def ^{:doc 
       :private true}
  shared-metrics-cols [[:contig :text]
                       [:start :integer]
                       [:refallele :text]
                       [:issubsample :integer]])

Create table to represent variant metrics

(defn- create-metrics-tables
  (apply sql/create-table (concat [:metrics] shared-metrics-cols
                                  (map (fn [x] [(:id x) (:type x)]) metrics))))

Check if an index file has up to date columns

(defn- index-needs-update?
  [index-file metrics]
  (let [want-cols (set (concat (map first shared-metrics-cols)
                               (map #(keyword (string/lower-case (:id %))) metrics)))]
    (sql/with-connection (get-sqlite-db index-file)
      (sql/with-query-results rows
        ["SELECT * from metrics LIMIT 1"]
        (not= want-cols (-> rows first keys set))))))
(defn- num-variants
  (sql/with-connection (get-sqlite-db index-file)
    (sql/with-query-results rows
      ["SELECT count(*) from metrics"]
      (-> rows first vals first))))
(defn- all-metrics-as-subsample
  (sql/with-connection (get-sqlite-db index-file)
    (sql/update-values :metrics ["start > -1"] {:issubsample 1})))

Retrieve numerical raw metrics for filtering

(declare get-raw-metrics)
(defn get-raw-metrics-linear
  [in-file ref-file]
  (let [metrics (->> (available-metrics in-file)
                     (filter #(= :linear (get-in % [:x-scale :type] :linear)))
                     (map :id))]
    (get-raw-metrics in-file ref-file :metrics metrics)))

Identify a subsample of records to use in visualization

(defn- subsample-metrics
  [index-file in-file ref-file params]
  (let [sub-ids (subsample-by-cluster (get-raw-metrics-linear in-file ref-file) params)]
    (sql/with-connection (get-sqlite-db index-file)
       (doseq [xid sub-ids]
         (sql/update-values :metrics
                            (cons "contig=? AND start=? and refallele=?" (vec xid))
                            {:issubsample 1}))))))

Pre-index a variant file with associated metrics.

(defn index-variant-file
  [in-file ref-file & {:keys [re-index? subsample-params]}]
  (let [batch-size 10000
        metrics (available-metrics in-file)
        index-file (str (itx/file-root in-file) "-metrics.db")]
    (when (or re-index?
              (itx/needs-run? index-file)
              (index-needs-update? index-file metrics))
      (itx/with-tx-file [tx-index index-file]
        (sql/with-connection (get-sqlite-db tx-index :create true)
           (create-metrics-tables metrics))
          (with-open [vcf-iter (get-vcf-iterator in-file ref-file)]
            (doseq [vcs (partition-all batch-size (filter passes-filter? (parse-vcf vcf-iter)))]
               (doseq [vc vcs]
                 (sql/insert-record :metrics
                                    (-> (get-vc-attrs vc (map :id metrics) {})
                                        (assoc :contig (:chr vc))
                                        (assoc :start (:start vc))
                                        (assoc :refallele (.getBaseString (:ref-allele vc)))
                                        (assoc :issubsample 0)))))))))
      (if (and subsample-params
               (> (num-variants index-file) (get-in subsample-params [:subsample :count])))
        (subsample-metrics index-file in-file ref-file subsample-params)
        (all-metrics-as-subsample index-file)))

Provide category metrics as expected sets to match gemini usage.

(defn- maybe-fix-category
  [val attr]
  (if (= :category (get-in expose-metrics [attr :x-scale :type]))

Retrieve table of raw metrics using indexed variant file

(defn get-raw-metrics
  [in-file ref-file & {:keys [metrics use-subsample?]}]
  (let [index-db (index-variant-file in-file ref-file)
        plot-metrics (filter (partial contains? expose-metrics)
                             (or metrics (map :id (available-metrics in-file))))]
    (sql/with-connection (get-sqlite-db index-db)
      (sql/with-query-results rows
        [(str "SELECT contig, start, refallele, " (string/join ", " plot-metrics)
              " FROM metrics"
              (if use-subsample? " WHERE issubsample=1 " " "))]
              "ORDER BY contig, start"
        (doall (map (fn [orig]
                      (reduce (fn [coll x]
                                (assoc coll x (-> orig
                                                  (get (keyword (string/lower-case x)))
                                                  (maybe-fix-category x))))
                              {:id [(:contig orig) (:start orig) (:refallele orig)]}

Provide rapid subsampling capabilities for indexed retrieval of metrics. Used on index preparation to provide a representative subset of large datasets based on assessment metrics. Sub-sampled metrics allow interactive visualizations of large data.

(ns bcbio.variation.index.subsample
  (:require [clj-ml.data :as mldata]
            [clj-ml.clusterers :as mlclust]))

Return ids of subsampled metrics with a single representative from each cluster.

(defn- final-subsample-ids
  [xs clusters]
  (let [final-is (-> (reduce (fn [coll [i cluster]]
                               {:seen (conj (:seen coll) cluster)
                                :want (if (contains? (:seen coll) cluster)
                                        (:want coll)
                                        (conj (:want coll) i))})
                             {:seen #{} :want []} (map-indexed vector clusters))
    (remove nil?
            (map-indexed (fn [i x] (when (contains? final-is i) x)) xs))))

Subsample a set of metrics using clustering. Returns ids of representative items from each cluster.

(defn subsample-by-cluster
  [metrics params]
  (letfn [(get-attrs [attrs x] (map #(get x %) attrs))]
    (let [clusterer (mlclust/make-clusterer (keyword (get-in params [:subsample :method]))
                                            {:number-clusters (get-in params [:subsample :count])})
          attrs (-> metrics first (dissoc :id) keys)
          ds (mldata/make-dataset "ds" attrs
                                  (map (partial get-attrs attrs) metrics))]
      (mlclust/clusterer-build clusterer ds)
      (->> (mlclust/clusterer-cluster clusterer ds)
           (map mldata/instance-get-class)
           (final-subsample-ids (map :id metrics))))))

Accumulate and analyze metrics associated with each variant. This provides summaries intended to identify characteristic metrics to use for filtering.

(ns bcbio.variation.metrics
  (:use [clojure.java.io]
        [bcbio.variation.variantcontext :only [parse-vcf get-vcf-iterator]]
        [clojure.string :only [split-lines]]
        [clj-ml.data :only [make-dataset]]
        [clj-ml.classifiers :only [make-classifier classifier-train]]
        [ordered.set :only [ordered-set]])
  (:require [incanter.stats :as istats]
            [doric.core :as doric]))

Convenience functions

(defn to-float [x]
  (if (number? x)
    (float x)
      (Float/parseFloat x)
      (catch Exception e nil))))

Check if a VariantContext is not filtered.

(defn passes-filter?
  (= (count (:filters vc)) 0))

Check if a variant context is not filter and is not a reference call.

(defn nonref-passes-filter?
  (and (passes-filter? vc)
       (every? #(contains? #{"HET" "HOM_VAR"} (:type %)) (:genotypes vc))))

Retrieve numeric metrics associated with VariantContext.

(defn get-vc-metrics
  (reduce (fn [coll [k v]]
            (if-let [num-v (to-float v)]
              (assoc coll k num-v)
   (assoc (:attributes vc) "QUAL" (:qual vc))))

Summary metrics

Provide a summary-style presentation of distribution of metrics values.

(def header [{:name :metric}
             {:name :count}
             {:name :min :format #(format "%.2f" %)}
             {:name :pct25 :format #(format "%.2f" %)}
             {:name :median :format #(format "%.2f" %)}
             {:name :pct75 :format #(format "%.2f" %)}
             {:name :max :format #(format "%.2f" %)}])
(defn summary-stats [key vals]
  "Provide summary statistics on a list of values."
  (zipmap (map :name header)
          (concat [key (count vals)]
                  (istats/quantile vals))))

Accumulate raw statistics associated with variant calls from input VCF.

(defn- raw-vcf-stats
  [vcf-file ref-file]
  (letfn [(collect-attributes [collect [k v]]
            (if-not (nil? (to-float v))
              (assoc collect k (cons (to-float v) (get collect k [])))
          (collect-vc [collect vc]
            (assoc (reduce collect-attributes collect (:attributes vc))
              "QUAL" (cons (:qual vc)
                           (get collect "QUAL" []))))]
    (with-open [vcf-iter (get-vcf-iterator vcf-file ref-file)]
      (reduce collect-vc {} (filter passes-filter? (parse-vcf vcf-iter))))))


Provide metrics for files in preparation for automated classification.

Collect classification metrics from a single VCF file.

(defn- get-file-metrics
  [ref-file vcf-file]
  (letfn [(has-nil-names [metrics all-metrics all-names]
            (let [test-names (union (-> metrics keys set) all-names)]
              (apply union
                     (map (fn [xs] (set (keep #(when (nil? (get xs %)) %) test-names)))
                          (cons metrics (take-last 10 all-metrics))))))
          (classifier-metrics [coll vc]
            (let [cur-metrics (get-vc-metrics vc)]
              (-> coll
                  (assoc :rows (cons cur-metrics (:rows coll)))
                  (assoc :names (union (-> cur-metrics keys set) (:names coll)))
                  (assoc :nil-names (union (has-nil-names cur-metrics (:rows coll) (:names coll))
                                           (:nil-names coll))))))
          (prep-table [{rows :rows names :names nil-names :nil-names}]
            (let [sort-names (sort (vec names))]
              {:cols sort-names
               :with-nil-cols nil-names
               :rows (map (fn [x]
                            (map #(get x %) sort-names))
    (with-open [vcf-iter (get-vcf-iterator vcf-file ref-file)]
       (reduce classifier-metrics {:rows [] :names #{} :nil-names #{}}
               (filter passes-filter? (parse-vcf vcf-iter)))))))

Collect metrics from multiple vcf files into tables suitable for classification algorithms.

(defn get-vcf-classifier-metrics
  [ref-file vcf-files & {:keys [remove-nil-cols]
                         :or {remove-nil-cols true}}]
  (letfn [(get-shared-cols [xs]
            (-> (apply intersection (map #(set (:cols %)) xs))
          (filter-by-cols [orig-cols want-cols]
            (let [check-cols (set want-cols)
                  want (set (keep-indexed #(if (contains? check-cols %2) %1) orig-cols))]
              (fn [xs]
                (keep-indexed #(when (contains? want %1) %2) xs))))
          (subset-file-metrics [shared-cols nil-cols {cols :cols rows :rows}]
            (let [ready-cols (if-not remove-nil-cols shared-cols
                                     (remove #(contains? nil-cols %) shared-cols))
                  row-filter (filter-by-cols cols ready-cols)]
              {:cols ready-cols
               :rows (remove #(not= (count %) (count ready-cols)) (map row-filter rows))}))]
    (let [file-metrics (map (partial get-file-metrics ref-file) vcf-files)
          shared-cols (get-shared-cols file-metrics)
          nil-cols (apply union (map #(set (:with-nil-cols %)) file-metrics))]
      (map (partial subset-file-metrics shared-cols nil-cols) file-metrics))))

Retrieve classification metrics from a tree based classifier. Metric ordering is relative to the usefulness in classifying.

(defn- parse-classifier-nodes
  [classifier metrics]
  (->> classifier
       (map #(re-find #"label=\"(\w+)\"" %))
       (map second)
       (remove nil?)
       (filter #(contains? (set metrics) %))
       (apply ordered-set)))

Classify VCF files with INFO metrics using a decision tree classifier.

(defn classify-decision-tree
  (letfn [(prep-one-dataset [rows i]
            (map #(conj (vec %) (str i)) rows))
          (prep-dataset [metrics]
            (make-dataset "ds" (conj (-> metrics first :cols vec)
                                     {:c (map str (range (count metrics)))})
                          (apply concat (map-indexed #(prep-one-dataset (:rows %2) %1) metrics))
                          {:class :c}))]
    (let [ds (prep-dataset metrics)
          c (-> (make-classifier :decision-tree :c45)
                (classifier-train ds))]
      (vec (parse-classifier-nodes c (-> metrics first :cols))))))

Merge multiple classification approaches into a set of final metrics. in-metrics contains ordered best metric classifiers from the different approaches. Returns interleaved metrics ranked by present in these classifiers.

(defn merge-classified-metrics
  (loop [cur in-metrics
         final (ordered-set)]
    (if (every? empty? cur)
      {:top-metrics (vec final)}
      (recur (map rest cur)
             (reduce #(conj %1 %2) final (remove nil? (map first cur)))))))

Apply machine learning/classification approaches to distinguish useful metrics distinguishing VCF files.

(defn ml-on-vcf-metrics
  [ref-file vcf-files]
  (letfn [(run-classifier [remove-nil-cols]
            (-> (get-vcf-classifier-metrics ref-file vcf-files :remove-nil-cols remove-nil-cols)
    (merge-classified-metrics (map run-classifier [true false]))))

Handle useful comparisons from multiple variation calling approaches. High level API to consolidate pairwise variant comparisons.

(ns bcbio.variation.multiple
  (:use [clojure.set :only [union]]
        [ordered.map :only [ordered-map]]
        [bcbio.variation.annotation :only [add-variant-annotations]]
        [bcbio.variation.callable :only [get-callable-checker is-callable? has-callers?]]
        [bcbio.variation.combine :only [combine-variants]]
        [bcbio.variation.metrics :only [nonref-passes-filter?]]
        [bcbio.variation.variantcontext :only [parse-vcf get-vcf-retriever
                                               get-vcf-iterator write-vcf-w-template]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.run.broad :as broad]))

Utility functions

(defn remove-mod-name [x & {:keys [mods] :or {mods ["recal"]}}]
  "Removes modification names from an approach name."
  (reduce (fn [final mod]
            (string/replace final (str "-" mod) ))
          x mods))

Lookup map of comparisons by method names. - ignore: a list of method names to ignore when creating the lookup map. - remove-mods?: Flag to remove naming modifications. This will replace original comparisons with recalibrated.

(defn prep-cmp-name-lookup
  [cmps & {:keys [ignore remove-mods?] :or {ignore #{}}}]
  (reduce (fn [m x]
            (let [names (map #(let [n (get-in x [% :name])]
                                (if-not remove-mods? n
                                        (remove-mod-name n :mods [(get-in x [% :mod])])))
                             [:c1 :c2])]
              (if (some #(contains? ignore %) names) m
                  (assoc m names x))))
(defn- not-target? [target-name xs]
  (not (contains? (set (map remove-mod-name xs)) target-name)))

Prepare multi-overlap sets

Retrieve all called items from a variant context 'set' attribute.

(defn get-vc-set-calls
  [vc calls]
  (when-let [set-val (get-in vc [:attributes "set"])]
    (if (= set-val "Intersection")
      (set (map :name calls))
      (->> (string/split set-val #"-")
           (remove #(.startsWith % "filter"))
           (map #(string/split % #"AND"))
           (apply concat)

Select samples based on name of a 'set' from CombineVariants.

(defmulti select-variant-by-set
  (fn [_ _ set-name] (keyword set-name)))
(defmethod select-variant-by-set :Intersection
  [vcf-in ref set-name]
  (let [file-info {:out-vcf (itx/add-file-part vcf-in set-name nil)}
        args ["-R" ref
              "-o" :out-vcf
              "--variant" vcf-in
              "-select" (format "set == '%s'" set-name)]]
    (broad/run-gatk "SelectVariants" args file-info {:out [:out-vcf]})
    (:out-vcf file-info)))
(defmethod select-variant-by-set :default
  ^{:doc "Select non-intersection names, handling GATK special cases like intersection
          and filtered."}
  [vcf-in ref set-name]
  (letfn [(in-set? [vc]
            (contains? (get-vc-set-calls vc [{:name set-name}])
    (let [out-file (itx/add-file-part vcf-in set-name nil)]
      (when (itx/needs-run? out-file)
        (with-open [in-iter (get-vcf-iterator vcf-in ref)]
          (write-vcf-w-template vcf-in {:out out-file}
                                (map :vc (filter in-set? (parse-vcf in-iter)))

Create VCF of the intersection of all concordant calls.

(defn- gen-all-concordant
  [cmps-by-name ref out-dir config & {:keys [do-include? base-ext]
                                      :or {base-ext "multiall"}}]
  (let [concordant-map (reduce (fn [m [k v]]
                                 (if (or (nil? do-include?) (do-include? k))
                                   (assoc m (get-in v [:c-files :concordant]) (string/join "AND" k))
                               (ordered-map) cmps-by-name)
        union-vcf (combine-variants (keys concordant-map) ref :merge-type :full :out-dir out-dir
                                    :name-map concordant-map :base-ext base-ext)]
    {:union union-vcf
     :intersection (select-variant-by-set union-vcf ref "Intersection")}))

Generate false positives, dispatching differently when the target is from recalling.

(defmulti gen-target-fps
  (fn [_ _ _ _ call _ _]
    (if (get call :recall false) :recall :default)))
(defmethod gen-target-fps :recall
  ^{:doc "False positive generation for combine call sets resulting from recalling.
          Report calls with low support across inputs callsets."}
  [target-cmps target-name _ target-overlap-vcf call ref out-dir]
  (let [calls (vec (set (mapcat (juxt :c1 :c2) (vals target-cmps))))
        out-file (itx/add-file-part target-overlap-vcf "lowcounts" out-dir)
        freq (get call :fp-freq 0.25)
        thresh (Math/ceil (* freq (dec (count calls))))]
    (letfn [(is-lowcount-fp? [vc]
              (-> (get-vc-set-calls vc calls)
                  (disj target-name)
                  (<= thresh)))]
      (when (itx/needs-run? out-file)
        (with-open [in-iter (get-vcf-iterator target-overlap-vcf ref)]
          (write-vcf-w-template target-overlap-vcf {:out out-file}
                                (map :vc (filter is-lowcount-fp? (parse-vcf in-iter)))
(defmethod gen-target-fps :default
  ^{:doc "False positive generation for single call sets: report discordant variants
          callable in other samples."}
  [target-cmps target-name other-conc-vcf _ _ ref out-dir]
  (letfn [(check-shared [fetch any-callable]
            (fn [x]
              (and (nonref-passes-filter? x)
                   (if (has-callers? any-callable)
                     (is-callable? any-callable (:chr x) (:start x) (:end x))
                     (not (empty? (variants-in-region fetch (:chr x)
                                                      (:start x) (:end x))))))))
          (get-shared-discordant [xs fetch any-callable]
            (let [pass-and-shared? (check-shared fetch any-callable)]
              (map :vc (filter pass-and-shared? xs))))]
    (let [disc-vcfs (remove nil? (map (fn [v]
                                        (get-in v [:c-files
                                                   (keyword (format "%s-discordant" target-name))]))
                                      (vals target-cmps)))
          disc-vcf (-> (combine-variants disc-vcfs ref :merge-type :full :out-dir out-dir
                                         :base-ext (format "dis%s" target-name))
                       (select-variant-by-set ref "Intersection"))
          out-file (itx/add-file-part disc-vcf "shared")
          align-bams (->> (vals target-cmps)
                          (map (juxt :c1 :c2))
                          (map :align)
                          (remove nil?))]
      (with-open [disc-iter (get-vcf-iterator disc-vcf ref)
                  other-retriever (get-vcf-retriever ref other-conc-vcf)
                  call-source (get-callable-checker align-bams ref
                                                    :out-dir (str (fs/parent out-dir)))]
        (when (itx/needs-run? out-file)
          (write-vcf-w-template disc-vcf {:out out-file}
                                (get-shared-discordant (parse-vcf disc-iter)
                                                       other-retriever call-source)

Create files of false negatives and positives from target-name.

(defn- gen-target-problems
  [target-name target-call cmps-by-name true-p-vcf target-overlap-vcf ref out-dir config]
  (let [notarget-concordant (gen-all-concordant cmps-by-name ref out-dir config
                                                :do-include? (partial not-target? target-name)
                                                :base-ext (format "multino%s" target-name))]
     (-> (combine-variants [true-p-vcf (:intersection notarget-concordant)]
                           ref :merge-type :full :out-dir out-dir
                           :name-map {true-p-vcf "truep"
                                      (:intersection notarget-concordant) target-name}
                           :base-ext (format "multiall-no%s" target-name))
         (select-variant-by-set ref target-name))
     :false-positives (gen-target-fps (remove #(not-target? target-name (first %))
                                      target-name (:union notarget-concordant)
                                      target-overlap-vcf target-call
                                      ref out-dir)}))

Provide high level concordance overlap comparisons for multiple call approaches. Organizes relative to the given target name generating: - VCF of calls concordant in all methods: intersection of all concordant calls. These are true positives. - VCF of calls discordant in the target method, but concordant in the remainder: the intersection of all concordant pairs not including target-name minus the overall intersection of concordants. These are false negatives. - VCF of non-ref calls discordant in the target method and called in any of the other methods. We restrict to shared calls to avoid penalizing unique calls. These are false positives.

(defn multiple-overlap-analysis
  [cmps config target-name & {:keys [dirname ignore] :or {dirname "multiple"
                                                          ignore #{}}}]
  (let [cmps-by-name (prep-cmp-name-lookup (if (map? cmps) (vals cmps) cmps)
                                           :ignore (union ignore #{"all" "validate"}))
        out-dir (str (fs/file (get-in config [:dir :prep] (get-in config [:dir :out]))
        ref (-> cmps-by-name vals first :exp :ref)
        target-call (->> cmps-by-name
                         (remove #(not-target? target-name (first %)))
                         ((juxt :c1 :c2))
                         (filter #(= (remove-mod-name (:name %)) target-name))
    (when-not (fs/exists? out-dir)
      (fs/mkdirs out-dir))
    (let [all-overlap (gen-all-concordant cmps-by-name ref out-dir config)
          true-p-vcf (:intersection all-overlap)
          target-overlaps (-> all-overlap
                              (select-variant-by-set ref target-name))
          target-problems (gen-target-problems target-name target-call cmps-by-name
                                               true-p-vcf target-overlaps ref out-dir config)]
      (ordered-map :true-positives true-p-vcf
                   :false-negatives (:false-negatives target-problems)
                   :false-positives (:false-positives target-problems)
                   :target-overlaps target-overlaps))))

Perform high level pipeline comparison of a target with multiple experiments.

(defn pipeline-compare-multiple
  [cmps finalizer exp config]
  (let [analysis (multiple-overlap-analysis cmps config (:target finalizer)
                                            :ignore (set (get finalizer :ignore #{})))]
    {:c-files analysis
     :c1 {:name (:target finalizer)}
     :c2 {:name "all"}
     :exp exp :dir (config :dir)}))

Compare multiple sample input files, allowing flexible configuration of concordance/discordance logic for comparison two sets of calls.

(ns bcbio.variation.multisample
  (:use [clojure.java.io]
        [clojure.set :only [intersection]]
        [ordered.map :only [ordered-map]])
  (:require [clojure.string :as string]
            [bcbio.run.itx :as itx]
            [bcbio.variation.variantcontext :as gvc]))

Check if the input VCF file has multiple genotyped samples.

(defn multiple-samples?
  [in-file & {:keys [sample]}]
  (let [samples (-> in-file gvc/get-vcf-header .getGenotypeSamples)]
    (or (> (count samples) 1)
        (and (not (nil? sample))
             (not (contains? (set samples) sample))))))

Retrieve basename for output display, handling multiple sample inputs.

(defn get-out-basename
  [exp call in-files]
  (let [sample-name (or (:sample exp)
                        (-> in-files first gvc/get-vcf-header .getGenotypeSamples first
                            (str "multi")))]
    (format "%s-%s" sample-name (:name call))))

Retrieve output files for a variant to variant comparison

(defn- get-cmp-outfiles
  [c1 c2 exp config]
  (let [out-dir (get-in config [:dir :out])
        base-out (str (file out-dir (format "%s-%s.vcf"
                                            (get-out-basename exp c1 [(:file c1)])
                                            (:name c2))))
        out-files (into (ordered-map :concordant (itx/add-file-part base-out "concordant"))
                        (map (fn [c]
                               [(keyword (str (:name c) "-discordant"))
                                (itx/add-file-part base-out (str (:name c) "-discordant"))])
                             [c1 c2]))]

Compare two genotyping calls for a single sample, returning details about match: - concordant: 100% match between alleles - phasing-mismatch: Alleles match but phasing information does not. - nocall-mismatch: Alleles mismatch due to a no-call in one of the genotypes. - partial-mismatch: Alleles match in at least one position but mismatch elsewhere. - discordant: No recoverable match characteristics

(defn compare-genotypes
  [g1 g2]
  (letfn [(has-nocall? [g]
            (some #(.isNoCall %) (:alleles g)))
          (phase-mismatch? [g1 g2]
            (and (or (:phased? g1) (:phased? g2))
                 (= (:alleles g1) (reverse (:alleles g2)))))
          (atleast-one-match? [g1 g2]
            (seq (intersection (set (:alleles g1)) (set (:alleles g2)))))
          (nocall-mismatch? [g1 g2]
            (and (or (has-nocall? g1) (has-nocall? g2))
                 (atleast-one-match? g1 g2)))
          (num-matches [g1 g2]
            (count (intersection
                    (set (map #(.getDisplayString %) (:alleles g1)))
                    (set (map #(.getDisplayString %) (:alleles g2))))))]
     (= (:alleles g1) (:alleles g2)) :concordant
     (phase-mismatch? g1 g2) :phasing-mismatch
     (nocall-mismatch? g1 g2) :phasing-nocall
     (atleast-one-match? g1 g2) :partial-mismatch
     :else :discordant)))

Check if variants have the same position and reference allele.

(defn- same-vc-coords?
  [& xs]
  (apply = (map (juxt :chr :start :end :ref-allele) xs)))

Flexible comparison of variants, assuming pre-checking of vc1 and vc2 to overlap in the same genomic region.

(defmulti compare-vcs
  (fn [vc1 vc2 params]
     (not (nil? (:compare-approach params))) (keyword (:compare-approach params))
     (or (> (:num-samples vc1) 1)
         (> (:num-samples vc2) 1)) :multiple
     :else :default)))
(defmethod compare-vcs :multiple
  ^{:doc "Compare variant contexts handling multiple sample comparisons."}
  [vc1 vc2 params]
  (letfn [(calc-genotype-score [g1 g2]
            (case (compare-genotypes g1 g2)
              :concordant 1.0
              :phasing-mismatch 1.0
              :nocall-mismatch 0.5
              :partial-mismatch 0.5
    (when (same-vc-coords? vc1 vc2)
      (let [score-thresh (get params :multiple-thresh 1.0)
            vc2-cmps (into {} (map (juxt :sample-name identity) (:genotypes vc2)))
            score (/ (apply + (map #(calc-genotype-score % (get vc2-cmps (:sample-name %)))
                                   (:genotypes vc1)))
                     (:num-samples vc1))]
        (>= score score-thresh)))))
(defmethod compare-vcs :approximate
  ^{:doc "Provide approximate comparisons between variants, handling cases
          like het versus homozygous variant calls and indels with
          different overlapping calls. The goal is to identify almost-match
          cases which are useful for variant evidence."}
  [vc1 vc2 params]
  (when (= (:type vc1) (:type vc2))
    (compare-vcs vc1 vc2 (assoc params :compare-approach
                                (str "approximate-" (-> vc1 :type string/lower-case))))))
(defmethod compare-vcs :approximate-indel
  ^{:doc "Approximate comparisons for indels, allowing overlapping
          indels to count as concordant."}
  [vc1 vc2 params]
  {:pre [(every? #(= 1 (:num-samples %)) [vc1 vc2])]}
  (letfn [(all-alleles [x]
            (map #(.getBaseString %) (cons (:ref-allele x) (-> x :genotypes first :alleles))))]
    (let [vc1-alleles (all-alleles vc1)
          vc2-alleles (all-alleles vc2)]
      (and (contains? (set (range (dec (:start vc1)) (:end vc1))) (dec (:start vc2)))
           (or (every? #(= 1 (count (first %))) [vc1-alleles vc2-alleles])
               (every? #(> (count (first %)) 1) [vc1-alleles vc2-alleles]))))))
(defmethod compare-vcs :approximate-snp
  ^{:doc "Approximate comparisons for SNPs, allowing matching het/hom calls."}
  [vc1 vc2 params]
  {:pre [(every? #(= 1 (:num-samples %)) [vc1 vc2])]}
  (when (same-vc-coords? vc1 vc2)
    (not (empty?
          (->> [vc1 vc2]
               (map #(-> % :genotypes first :alleles set))
               (apply intersection))))))
(defmethod compare-vcs :default
  ^{:doc "Provide exact comparisons for variants, requiring identical
          base coordinates and reference and identical allele calls."}
  [vc1 vc2 params]
  {:pre [(every? #(= 1 (:num-samples %)) [vc1 vc2])]}
  (when (same-vc-coords? vc1 vc2) 
    (apply = (map #(-> % :genotypes first :alleles) [vc1 vc2]))))

Top level comparison of variant contexts: check if any vc2s match vc1. Flexible handles different comparisons with compare-vcs

(defn find-concordant-vcs
  [vc1 vc2-checks params]
  (letfn [(are-concordant? [vc1 vc2]
            (when (compare-vcs vc1 vc2 params)
    (let [vc2-groups (group-by (partial are-concordant? vc1) vc2-checks)]
      [(get vc2-groups true [])
       (get vc2-groups nil [])])))
(defn- add-cmp-kw
  [xs kw]
  (partition 2 (interleave (repeat kw) xs)))

Compare variant context with second list of variants. Assumes sorted vc2-iter by position, returning any variants in the region that don't match.

(defn- compare-vc-w-iter
  [vc1 vc2-iter cmp-kws params]
  {:pre [(= 3 (count cmp-kws))]}
  (letfn [(less-than-vc? [vc1 vc2]
            (and (= (:chr vc2) (:chr vc1))
                 (<= (:start vc2) (:end vc1))))]
    (let [[cur-vc2-iter rest-vc2-iter] (split-with (partial less-than-vc? vc1) vc2-iter)
          [vc2-extras vc2-checks] (split-with #(< (:start %) (:start vc1)) cur-vc2-iter)
          [vc2-matches vc2-continues] (find-concordant-vcs vc1 vc2-checks params)]
      {:cur-cmps (concat (add-cmp-kw vc2-extras (last cmp-kws))
                         [(if (seq vc2-matches)
                            [(first cmp-kws) vc1]
                            [(second cmp-kws) vc1])])
       :cur-vc2-iter (concat vc2-continues rest-vc2-iter)})))

Lazy comparison of two sets of variants. Assumes identical ordering.

(defn- compare-two-vc-iters
  [vc1-iter vc2-iter cmp-kws params]
   (if-let [vc1 (first vc1-iter)]
     (let [{:keys [cur-cmps cur-vc2-iter]} (compare-vc-w-iter vc1 vc2-iter cmp-kws params)]
       (concat cur-cmps (compare-two-vc-iters (rest vc1-iter) cur-vc2-iter cmp-kws params)))
     (add-cmp-kw vc2-iter (last cmp-kws)))))

Compare two variant input files, with flexible matching conditions. TODO: restrict comparison by intervals.

(defn compare-two-vcf-flexible
  [c1 c2 exp config]
  (let [out-files (get-cmp-outfiles c1 c2 exp config)]
    (when (itx/needs-run? (vals out-files))
      (with-open [c1-iter (gvc/get-vcf-iterator (:file c1) (:ref exp))
                  c2-iter (gvc/get-vcf-iterator (:file c2) (:ref exp))]
        (gvc/write-vcf-w-template (:file c1) out-files
                                  (compare-two-vc-iters (gvc/parse-vcf c1-iter)
                                                        (gvc/parse-vcf c2-iter)
                                                        (keys out-files)
                                                        (get exp :params {}))
                                  (:ref exp))))
    {:c-files out-files :c1 c1 :c2 c2 :exp exp :dir (:dir config)}))

Prepare a VCF file for comparison by normalizing chromosome names, sort order, sample name, and genotype representation. This handles the work of making slightly different representations match, enabling VCF comparisons. Currently implemented for human only, with hooks to generalize for other organisms.

(ns bcbio.variation.normalize
  (:import [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder GenotypeBuilder]
           [org.broadinstitute.sting.utils.codecs.vcf VCFHeader]
           [org.broad.tribble.readers AsciiLineReader])
  (:use [clojure.java.io]
        [bcbio.variation.variantcontext :only [write-vcf-w-template
                                               get-vcf-iterator parse-vcf
        [bcbio.align.ref :only [get-seq-dict get-seq-name-map extract-sequence]]
        [ordered.map :only (ordered-map)]
        [ordered.set :only (ordered-set)])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [lonocloud.synthread :as ->]
            [bcbio.run.itx :as itx]))

Chromosome name remapping

Provide mapping from variant chromosome names to reference keyed on the organism name. Currently only a human GRCh37 remap.

(defmulti chr-name-remap (fn [type & args] type))

Function to retrieve hg19 information. Requires korma and mysql connector.

(defn- get-hg19-map
  (defdb db (mysql {:db "hg19"
                    :user "genome"
                    :host "genome-mysql.cse.ucsc.edu"}))
  (defentity ucscToEnsembl)
  (->> (select ucscToEnsembl)
       (map (juxt :ucsc :ensembl))
       (into {}))))

Cached version of hg19 map to avoid having to make database connections

(def hg19-map
  {"chrM" "MT" "chrMT" "MT" "chrUn_gl000211" "GL000211", "chrUn_gl000222" "GL000222",
   "chrUn_gl000233" "GL000233", "chrUn_gl000244" "GL000244", "chrUn_gl000212" "GL000212",
   "chrUn_gl000223" "GL000223", "chrUn_gl000234" "GL000234", "chrUn_gl000245" "GL000245",
   "chrUn_gl000213" "GL000213", "chrUn_gl000224" "GL000224", "chrUn_gl000235" "GL000235",
   "chrUn_gl000246" "GL000246", "chr6_mcf_hap5" "HSCHR6_MHC_MCF", "chrUn_gl000214" "GL000214",
   "chrUn_gl000225" "GL000225", "chrUn_gl000236" "GL000236", "chrUn_gl000247" "GL000247",
   "chr1" "1", "chr6_cox_hap2" "HSCHR6_MHC_COX", "chrUn_gl000215" "GL000215",
   "chrUn_gl000226" "GL000226", "chrUn_gl000237" "GL000237", "chrUn_gl000248" "GL000248",
   "chr2" "2", "chrUn_gl000216" "GL000216", "chrUn_gl000227" "GL000227",
   "chrUn_gl000238" "GL000238", "chrUn_gl000249" "GL000249", "chr3" "3",
   "chrUn_gl000217" "GL000217", "chrUn_gl000228" "GL000228", "chrUn_gl000239" "GL000239",
   "chr9_gl000201_random" "GL000201", "chr4" "4", "chr11_gl000202_random" "GL000202",
   "chrUn_gl000218" "GL000218", "chrUn_gl000229" "GL000229", "chr9_gl000200_random" "GL000200",
   "chr19_gl000209_random" "GL000209", "chr5" "5", "chrUn_gl000219" "GL000219",
   "chr1_gl000192_random" "GL000192", "chr18_gl000207_random" "GL000207", "chr6" "6",
   "chr21_gl000210_random" "GL000210", "chr17_gl000206_random" "GL000206",
   "chr9_gl000199_random" "GL000199", "chr1_gl000191_random" "GL000191",
   "chr4_gl000194_random" "GL000194", "chr19_gl000208_random" "GL000208",
   "chr17_gl000205_random" "GL000205", "chr7" "7", "chr9_gl000198_random" "GL000198",
   "chr8_gl000197_random" "GL000197", "chr4_gl000193_random" "GL000193",
   "chr17_gl000204_random" "GL000204", "chr8" "8", "chrX" "X", "chr8_gl000196_random" "GL000196",
   "chr7_gl000195_random" "GL000195", "chr20" "20", "chr9" "9", "chrY" "Y",
   "chr17_gl000203_random" "GL000203", "chr10" "10", "chr21" "21", "chr6_dbb_hap3" "HSCHR6_MHC_DBB",
   "chr11" "11", "chr22" "22", "chr6_ssto_hap7" "HSCHR6_MHC_SSTO", "chr17_ctg5_hap1" "HSCHR17_1",
   "chr12" "12", "chr13" "13", "chr14" "14", "chr15" "15", "chr16" "16",
   "chr6_mann_hap4" "HSCHR6_MHC_MANN", "chr17" "17", "chr18" "18", "chr19" "19",
   "chr6_qbl_hap6" "HSCHR6_MHC_QBL", "chr6_apd_hap1" "HSCHR6_MHC_APD",
   "chrUn_gl000240" "GL000240", "chrUn_gl000230" "GL000230", "chrUn_gl000241" "GL000241",
   "chr4_ctg9_hap1" "HSCHR4_1", "chrUn_gl000220" "GL000220", "chrUn_gl000231" "GL000231",
   "chrUn_gl000242" "GL000242", "chrUn_gl000221" "GL000221", "chrUn_gl000232" "GL000232",
   "chrUn_gl000243" "GL000243"})

Convert any non-versioned names into the representative version in ref-dict.

(defn- fix-non-version-names
  [base-map ref-dict]
  (letfn [(find-best-match [x check]
            (first (filter #(.startsWith % x) check)))]
    (reduce (fn [coll [k v]]
              (assoc coll k
                     (if (contains? ref-dict v)
                       (find-best-match v (keys ref-dict)))))
            {} base-map)))

Add alternative key variations: - underscore to dash in hg19 names - chr added to all GRCh37 names instead of hg19 names

(defn- add-alt-keys
  [base-map modtype]
  {:pre [(= modtype :underscore)]}
  (reduce (fn [coll [k v]]
            (-> coll
                (assoc k v)
                (assoc (string/replace k "_" "-") v)
                (assoc (str "chr" v) v)))
          {} base-map))

Fix GRCh37/hg19 name mappings to handle common problem cases.

(defn prep-rename-map
  [map-key ref-file]
  (let [remappers {:GRCh37 hg19-map}]
    (-> (get remappers map-key)
        (fix-non-version-names (get-seq-name-map ref-file))
        (add-alt-keys :underscore))))

Retrieve a list of all chromosome names from a reference FASTA file.

(defn- chrs-from-fasta-file
  (map #(.getSequenceName %) (-> ref-file get-seq-dict .getSequences)))
(defmethod chr-name-remap :GRCh37
  [map-key ref-file orig-ref-file]
  (let [rename-map (prep-rename-map map-key ref-file)
        ref-chrs (set (chrs-from-fasta-file ref-file))
        vcf-chrs (when (and orig-ref-file (not= orig-ref-file ref-file))
                   (chrs-from-fasta-file orig-ref-file))]
    (letfn [(maybe-remap-name [x]
              (let [remap-x (get rename-map x)]
                (if (and remap-x (contains? ref-chrs remap-x))
                  remap-x x)))]
      (if vcf-chrs
        (zipmap vcf-chrs
                (map maybe-remap-name vcf-chrs))

Resort and normalize variants

Build a new variant context with updated sample name and normalized alleles. Based on :prep-allele-count in the configuration updates haploid allele calls. This normalizes the representation in Mitochondrial and Y chromosomes which are haploid but are often represented as diploid with a single call.

(defn- fix-vc
  [sample config orig]
  (letfn [(update-genotype-sample [vc sample]
            (if (and (not (nil? sample))
                     (= 1 (count (.getGenotypes vc))))
              (let [g (first (.getGenotypes vc))]
                [(-> (GenotypeBuilder. g)
                     (.name sample)
              (.getGenotypes vc)))
          (normalize-allele-calls [g]
            {:pre [(or (nil? (:prep-allele-count config))
                       (contains? (set [1 (:prep-allele-count config)]) (count (.getAlleles g))))]}
            (if (or (nil? (:prep-allele-count config))
                    (= (count (.getAlleles g)) (:prep-allele-count config)))
              (-> (GenotypeBuilder. g)
                  (.alleles (repeat (:prep-allele-count config)
                                    (first (.getAlleles g))))
    (-> orig
        (assoc :vc
          (-> (VariantContextBuilder. (:vc orig))
              (.genotypes (map normalize-allele-calls (update-genotype-sample (:vc orig) sample)))

Check if a variant has a non-informative no-call genotype.

(defn- no-call-genotype?
  [vc config]
  (let [to-remove (cond
                   (:prep-sv-genotype config) #{}
                   (:remove-refcalls config) #{"NO_CALL" "MIXED" "HOM_REF"}
                   :else #{"NO_CALL" "MIXED"})]
    (if-not (= 1 (:num-samples vc)) false
              (contains? to-remove (-> vc :genotypes first :type))
              (catch Exception e
                (println (:chr vc) (:start vc))
                (throw e))))))

Check a VCF input line for identical REF and ALT calls

(defn nochange-alt?
  (let [parts (string/split line #"\t")]
    (= (nth parts 3) (nth parts 4))))

Sort stream of line inputs by position. Requires loading the entire file into memory during the sort-by phase so will not work on massive files. Should be feasible with files split by chromosome.

(defn- sort-by-position
  (letfn [(add-position [line]
            (let [[chrom start] (take 2 (string/split line #"\t"))]
              [[chrom (Integer/parseInt start)] line]))]
    (->> line-seq
         (map add-position)
         (sort-by first)
         (map second))))

Provide genotype calls for structural variants to a single ref call. Structural variants often don't have proper genotype references since individual haplotypes are not called. This makes them a single reference if not specified or mixed.

(defn- normalize-sv-genotype
  [config sample orig]
  (letfn [(maybe-fix-vc [g alt-allele]
            (case (:type g)
              "MIXED" (-> (GenotypeBuilder. (:genotype g))
                          (.alleles) (remove #(.isNoCall %) (:alleles g))
              ("UNAVAILABLE" "NO_CALL") (-> (GenotypeBuilder. (:genotype g))
                                            (.alleles [alt-allele])
              (:genotype g)))
          (ref-vc-genotype [gs alt-allele]
            (case (count gs)
              0 [(-> (GenotypeBuilder.)
                     (.name sample)
                     (.alleles [alt-allele])
              1 [(maybe-fix-vc (first gs) alt-allele)]
              (map :genotype gs)))]
    (if (:prep-sv-genotype config)
      (let [new-gs (ref-vc-genotype (:genotypes orig)
                                    (first (:alt-alleles orig)))]
        (-> orig
            (assoc :vc
              (-> (VariantContextBuilder. (:vc orig))
                  (.genotypes new-gs)
            (assoc :genotypes (map from-genotype new-gs))))

Provide VariantContexts ordered by chromosome and normalized.

(defn- ordered-vc-iter
  [rdr vcf-decoder sample config]
  (->> rdr
       (#(if (:prep-sort-pos config) (sort-by-position %) %))
       (remove nochange-alt?)
       (map vcf-decoder)
       (remove #(no-call-genotype? % config))
       (map (partial normalize-sv-genotype config sample))
       (map (partial fix-vc sample config))
       (map :vc)))

Provide fixes to VCF input lines that do not require VariantContext parsing. Fixes: - INFO lines with empty attributes (starting with ';'), found in Complete Genomics VCF files - Chromosome renaming.

(defn- fix-vcf-line
  [line chr-map config]
  (letfn [(empty-attribute-info [info]
            (if (.startsWith info ";")
              (subs info 1)
          (fix-info [xs]
            (assoc xs 7 (empty-attribute-info (nth xs 7))))
          (fix-chrom [new xs]
            (assoc xs 0 new))]
    (let [parts (string/split line #"\t")
          cur-chrom (or (get chr-map (first parts)) (first parts))]
      {:chrom cur-chrom
       :line (->> parts
                  (fix-chrom cur-chrom)
                  (string/join "\t"))})))

Split input VCF into separate files by chromosome, returning a map of file names.

(defn- vcf-by-chrom
  [vcf-file ref-file orig-ref-file tmp-dir config]
  (letfn [(ref-chr-files [ref-file]
            (into (ordered-map)
                  (map (fn [x] [x (str (fs/file tmp-dir (str "prep" x ".vcf")))])
                       (chrs-from-fasta-file ref-file))))
          (write-by-chrom [ref-wrtrs chr-map line]
            (let [line-info (fix-vcf-line line chr-map config)]
              (if-let [wtr (get ref-wrtrs (:chrom line-info))]
                (.write wtr (str (:line line-info) "\n"))
                (throw (Exception. (format "Could not find remapping of chromosome %s in reference: %s"
                                           (:chrom line-info) (keys ref-wrtrs)))))))]
    (let [ref-chrs (ref-chr-files ref-file)
          ref-wrtrs (zipmap (keys ref-chrs) (map writer (vals ref-chrs)))
          chr-map (chr-name-remap (:prep-org config) ref-file orig-ref-file)]
      (with-open [rdr (reader vcf-file)]
        (->> rdr
             (drop-while #(.startsWith % "#"))
             (map (partial write-by-chrom ref-wrtrs chr-map))
        (doseq [x (vals ref-wrtrs)]
          (.close x)))

Top level functionality to manage inputs and writing.

Update header information, removing contig and adding sample names.

(defn- update-header
  [sample config]
  (letfn [(clean-metadata [header]
            (apply ordered-set (remove #(= "contig" (.getKey %)) (.getMetaDataInInputOrder header))))]
    (fn [_ header]
      (let [cur-samples (.getGenotypeSamples header)
            new-samples (if (and (:fix-sample-header config)
                                 (not (nil? sample))
                                 (< (count cur-samples) 2))
                          (ordered-set sample)
        (VCFHeader. (clean-metadata header) new-samples)))))

Update a VCF file with one item to have the given sample name.

(defn fix-vcf-sample
  [in-file sample ref]
  (let [out-file (itx/add-file-part in-file "samplefix")]
    (when (itx/needs-run? out-file)
      (with-open [vcf-iter (get-vcf-iterator in-file ref)]
        (write-vcf-w-template in-file {:out out-file}
                              (map #(:vc (fix-vc sample {} %)) (parse-vcf vcf-iter))
                              ref :header-update-fn (update-header sample {}))))

Check a VCF file for alternative alleles that match reference, removing them. This avoids issues where callers output alleles that match expected reference causing GATK errors.

(defn remove-ref-alts
  [in-file ref-file]
  (letfn [(alt-matches-ref? [vc]
            (when-let [ref-seq (extract-sequence ref-file (:chr vc) (:start vc) (:end vc))]
              (contains? (set (map #(.getBaseString %) (:alt-alleles vc))) ref-seq)))]
    (let [out-file (itx/add-file-part in-file "callprep")]
      (when (itx/needs-run? out-file)
        (with-open [vcf-iter (get-vcf-iterator in-file ref-file)]
          (write-vcf-w-template in-file {:out out-file}
                                (map :vc (remove alt-matches-ref? (parse-vcf vcf-iter)))

Write VCF file with correctly ordered and cleaned variants.

(defn- write-prepped-vcf
  [vcf-file out-info ref-file orig-ref-file sample config]
  (itx/with-temp-dir [tmp-dir (fs/parent (:out out-info))]
    (let [reader-by-chr (into (ordered-map) (map (fn [[k v]] [k (reader v)])
                                                 (vcf-by-chrom vcf-file ref-file orig-ref-file
                                                               tmp-dir config)))]
      (with-open [vcf-reader (AsciiLineReader. (input-stream vcf-file))]
        (let [vcf-decoder (get-vcf-line-parser vcf-reader)]
          (write-vcf-w-template vcf-file out-info
                                 (for [rdr (vals reader-by-chr)]
                                   (ordered-vc-iter rdr vcf-decoder sample config)))
                                :header-update-fn (update-header sample config))))
      (doseq [x (vals reader-by-chr)]
        (.close x)))))

Prepare VCF for comparison by normalizing high level attributes Assumes by position sorting of variants in the input VCF. Chromosomes do not require a specific order, but positions internal to a chromosome do. Currently configured for human preparation.

(defn prep-vcf
  [in-vcf-file ref-file sample & {:keys [out-dir out-fname config orig-ref-file]
                                  :or {config {}}}]
  (let [config (merge-with #(if (nil? %1) %2 %1) config
                           {:prep-org :GRCh37 :prep-allele-count 2
                            :prep-sort-pos false :prep-sv-genotype false
                            :fix-sample-header false
                            :remove-refcalls true})
        base-name (if (nil? out-fname) (itx/remove-zip-ext in-vcf-file) out-fname)
        out-file (itx/add-file-part base-name "prep" out-dir)]
    (when (itx/needs-run? out-file)
      (write-prepped-vcf in-vcf-file {:out out-file}
                         ref-file orig-ref-file
                         sample config))

Choose a reference genome for a variant file from set of choices.

(defn pick-best-ref
  [vcf refs]
  (letfn [(get-vcf-contig [fname]
            (with-open [rdr (reader fname)]
              (->> (line-seq rdr)
                   (drop-while #(.startsWith % "#"))
                   (#(string/split % #"\t"))
          (has-contig? [contig ref-file]
             (set (keys (get-seq-name-map ref-file)))
    (let [test-contig (get-vcf-contig vcf)]
      (first (filter (partial has-contig? test-contig) refs)))))

Remove problem characters

Handle cleanup for VCF files before feeding to any verifying parser.

Given a VCF line, retrieve the reference base prior to the variant. Used to include the required reference padding in indels missing them.

(defn- get-prev-pad
  (let [chr-map (prep-rename-map :GRCh37 ref-file)
        get-ref-chrom (fn [chrom]
                        (get chr-map chrom chrom))]
    (fn [xs]
      (let [before-start (dec (Integer/parseInt (second xs)))]
          (or (extract-sequence ref-file (get-ref-chrom (first xs)) before-start before-start) "N")))))))

Check reference and alt alleles for lack of a padding base on indels. The VCF spec requires this and GATK will parse incorrectly when a variant lacks a shared padding base for indels.

(defn- maybe-add-indel-pad-base
  [ref-file prev-pad xs]
  (letfn [(get-ref-alts [xs]
            [(nth xs 3) (string/split (nth xs 4) #",")])
          (indel? [xs]
            (let [[vc-ref vc-alts] (get-ref-alts xs)]
              (some #(and (not (.startsWith % "<"))
                          (not= (count vc-ref) (count %)))
          (is-5pad-n? [xs]
            (let [[vc-ref vc-alts] (get-ref-alts xs)]
              (every? #(and (.startsWith vc-ref "N") (.startsWith % "N")) vc-alts)))
          (fix-5pad-n [xs]
            (let [[vc-ref vc-alts] (get-ref-alts xs)]
              (-> xs
                  (assoc 3 (str (prev-pad xs) (subs vc-ref 1)))
                  (assoc 4 (string/join ","
                                        (map #(str (prev-pad xs) (subs % 1)) vc-alts))))))
          (no-pad? [xs]
            (let [[vc-ref vc-alts] (get-ref-alts xs)]
              (some #(and (not (.startsWith % "<"))
                          (not= (first vc-ref) (first %)))
          (fix-nopad [xs]
            (let [[vc-ref vc-alts] (get-ref-alts xs)]
              (-> xs
                  (assoc 1 (dec (Integer/parseInt (second xs))))
                  (assoc 3 (str (prev-pad xs) vc-ref))
                  (assoc 4 (string/join ","
                                        (map #(str (prev-pad xs) %) vc-alts))))))]
    (if (empty? xs) []
        (-> xs
            (->/as cur-xs
              (->/when (and (indel? cur-xs) (is-5pad-n? cur-xs))
            (->/as cur-xs
              (->/when (and (indel? cur-xs) (no-pad? cur-xs))

Remove calls where the reference base does not match expected reference allele.

(defn- remove-bad-ref
  [ref-file xs]
  (letfn [(is-bad-ref? [xs]
            (let [check-bases #{"A" "C" "G" "T"}
                  [chrom start _ vc-ref] (take 4 xs)
                  real-ref (extract-sequence ref-file chrom (Integer/parseInt start)
                                             (Integer/parseInt start))]
              (and (= 1 (count vc-ref))
                   (not (nil? real-ref))
                   (contains? check-bases (string/upper-case real-ref))
                   (contains? check-bases (string/upper-case vc-ref))
                   (not= (string/upper-case vc-ref) (string/upper-case real-ref)))))]
     (empty? xs) []
     (is-bad-ref? xs) []
     :else xs)))

Clean VCF file which GATK parsers cannot handle due to illegal characters. Fixes: - Gap characters (-) found in REF or ALT indels. - Fixes indels without reference padding or N padding. - Removes spaces in INFO fields. - Handles Illumina special case of SNPs with MAXGT and POLY calls. Uses the MAXGT calls which make no prior assumptions about polymorphism

(defn clean-problem-vcf
  [in-vcf-file ref-file sample & {:keys [out-dir]}]
  (letfn [(fix-bad-alt-header [x]
            (str "##ALT=<ID" (string/replace-first x "##ALT=Type" "") ">"))
          (rename-samples [xs want]
            (let [idx (ffirst (filter (fn [[i x]] (.startsWith x want))
                                      (map-indexed vector xs)))]
               idx (assoc (vec xs) idx want)
               (contains? #{0 1} (count xs)) [want]
               (.contains (first xs) "_MAXGT") (cons want (rest xs))
               :else xs)))
          (fix-sample-names [x]
            (if (> (count (string/split x #"\t")) 8)
              (let [[stay-parts samples] (split-at 9 (string/split x #"\t"))
                    fix-samples (if (contains? (set samples) sample)
                                  (rename-samples samples sample))]
                (string/join "\t" (concat stay-parts fix-samples)))
          (clean-header [x]
             (.startsWith x "##ALT=Type=") (fix-bad-alt-header x)
             (.startsWith x "##FORMAT=<ID=GL,Number=.,Type=String") ""
             (.startsWith x "#CHROM") (fix-sample-names x)
             :else x))
          (remove-gap [n xs]
            (assoc xs n
                   (-> (nth xs n)
                       (string/replace "-" "")
                       (string/replace "." ""))))
          (has-duplicate-alts? [alt]
            (let [alts (string/split alt #",")]
              (not= (count alts) (count (set alts)))))
          (remove-problem-alts [xs]
            (let [ref (nth xs 3)
                  alt (nth xs 4)]
               (empty? xs) []
               (= ref alt) []
               (= "." alt) []
               (has-duplicate-alts? alt) []
               :else xs)))
          (fix-info-spaces [xs]
            (assoc xs 7
                   (string/replace (nth xs 7) " " "_")))
          (clean-line [line]
            (if (.startsWith line "#")
              (clean-header line)
              (->> (string/split line #"\t")
                   (remove-gap 3)
                   (remove-gap 4)
                   (remove-bad-ref ref-file)
                   (maybe-add-indel-pad-base ref-file (get-prev-pad ref-file))
                   (string/join "\t"))))]
    (let [out-file (itx/add-file-part in-vcf-file "preclean" out-dir)]
      (when (itx/needs-run? out-file)
        (itx/with-tx-file [tx-out-file out-file]
          (with-open [rdr (reader in-vcf-file)
                      wtr (writer tx-out-file)]
             (map #(.write wtr (str % "\n"))
                  (remove empty? (map clean-line (line-seq rdr))))))))

Support phased haplotype comparisons between variant calls. Compares a phased set of calls versus haploid reference calls.

The comparison logic is:

  • Group calls into regions based on phasing
  • For each phase region:
    • Determine which set of haploid alleles to compare with the reference
    • With each position in this haploid:
      • Compare to reference allele
      • If mismatch and alternate allele matches reference, then phasing error
      • If mismatch and neither allele matches, then calling error
(ns bcbio.variation.phasing
  (:import [org.broadinstitute.sting.utils.interval IntervalUtils IntervalSetRule]
           [org.broadinstitute.sting.utils GenomeLocParser GenomeLoc])
  (:use [bcbio.variation.callable :only [get-bed-source features-in-region
                                         limit-bed-intervals get-bed-iterator]]
        [bcbio.variation.filter.intervals :only [intersection-of-bed-files
        [bcbio.variation.structural :only [prep-itree get-itree-overlap
                                           remove-itree-vc get-itree-all]]
        [bcbio.variation.variantcontext :only [parse-vcf get-vcf-retriever get-vcf-iterator
                                               variants-in-region merge-headers
        [bcbio.align.ref :only [get-seq-dict]]
        [ordered.map :only [ordered-map]])
  (:require [fs.core :as fs]
            [bcbio.run.itx :as itx]))

Find phased haplotypes in VCF

Check for phasing on a single genotype variant context based on: - variant has a single allele - variant has phasing specified (VCF | notation) - variant range overlaps previous variant (overlapping indels)

(defn- is-phased?
  [vc prev-vc bed-s]
  {:pre [(= 1 (:num-samples vc))]}
  (letfn [(safe-same-regions? [[a b]]
            (if (not-any? nil? [a b]) (= a b) true))
          (same-regions? [prev cur]
            (if (nil? bed-s)
               (map #((juxt :chr :start :end)
                      (first (features-in-region bed-s (:chr %) (:start %) (:end %))))
                    [prev cur]))))]
    (let [g (-> vc :genotypes first)]
      (and (= (:chr vc) (:chr prev-vc))
           (same-regions? prev-vc vc)
           (or (= 1 (count (:alleles g)))
               (.isPhased (:genotype g))
               (<= (:start vc) (:end prev-vc)))))))

Separate phased haplotypes provided in diploid input genome. We split at each phase break, returning a lazy list of variant contexts grouped into phases.

(defn parse-phased-haplotypes
  [vcf-iter ref-file & {:keys [intervals]}]
  (let [prev (atom nil)
        bed-source (when intervals
                     (get-bed-source intervals ref-file))]
    (letfn [(split-at-phased [vc]
              (let [continue-phase (or (nil? @prev)
                                       (is-phased? vc @prev bed-source))]
                (reset! prev vc)
      (partition-by split-at-phased (parse-vcf vcf-iter)))))

Compare phased variants

Retrieve the item with the highest count in the supplied list. We break ties by sorting by the actual list items

(defn highest-count
  (->> (frequencies xs)
       (sort-by val >)
       (partition-by second)
       (sort-by first)

Convenience function to get alleles for a single genotype variant context.

(defn- get-alleles
  {:pre [(= 1 (:num-samples vc))]}
  (-> vc :genotypes first :alleles))

Determine allele index where the variant context matches haploid reference.

(defn- matching-allele
  [vc ref-vcs]
  {:pre [(every? #(= 1 (:num-samples %)) ref-vcs)
         (= 1 (:num-samples vc))]}
   (= 1 (count (get-alleles vc))) 0
   (empty? ref-vcs) (.indexOf (get-alleles vc) (:ref-allele vc))
   :else (highest-count
          (remove neg?
                  (map #(.indexOf (get-alleles vc) (-> % get-alleles first)) ref-vcs)))))

Compare the haploid allele of a variant against the expected call.

(defn cmp-allele-to-expected
  [vc e-vc i]
  (letfn [(is-ref-allele? [x]
            (apply = (map #(.getDisplayString (% x)) [:cmp :ref])))
          (get-cmp-allele [i x]
            (when (< i (count (get-alleles x)))
              {:ref (:ref-allele x)
               :cmp (nth (get-alleles x) i)}))
          (get-all-alleles [x]
            (map #(get-cmp-allele % x) (range (count (get-alleles x)))))]
    (let [e-allele (when-not (nil? e-vc)
                     (get-cmp-allele 0 e-vc))
          call-hap (when-not (or (nil? i) (nil? vc) (neg? i))
                     (get-cmp-allele i vc))]
       (nil? call-hap) :discordant
       (and (is-ref-allele? call-hap)
            (or (nil? e-allele)
                (= e-allele call-hap))) :ref-concordant
       (nil? e-allele) :discordant
       (= e-allele call-hap) :concordant
       (some (partial = e-allele) (get-all-alleles vc)) :phasing-error
       :else :discordant))))

Retrieve the type of a set of variants involved in a comparison.

  • :indel -- insertions or deletions of more than 1bp
  • :snp -- Single nucleotide changes or single basepair changes
  • :unknown -- Other classs of variations (structural)
(defn get-variant-type
  (letfn [(is-indel? [x]
            (contains? #{"MIXED" "INDEL"} (:type x)))
          (is-multi-indel? [x]
            (and (is-indel? x)
                 (not-every? #(contains? #{0 1} %)
                             (map #(-> % .getDisplayString count) (cons (:ref-allele x)
                                                                     (:alt-alleles x))))))
          (is-snp? [x]
            (= "SNP" (:type x)))]
     (some is-multi-indel? vcs) :indel
     (some is-indel? vcs) :indel
     (every? is-snp? (remove nil? vcs)) :snp
     :else :unknown)))

Determine if the variant has a non-matching heterozygous alternative allele.

(defn- nomatch-het-alt?
  [vc e-vc]
  {:pre [(not (nil? vc))]}
  (let [match-allele-i (matching-allele vc (if (nil? e-vc) [] [e-vc]))
        no-match-alleles (remove nil? (map-indexed
                                       (fn [i x] (if-not (= i match-allele-i) x))
                                       (get-alleles vc)))]
    (and (= "HET" (-> vc :genotypes first :type))
         (not-every? #(.isReference %) no-match-alleles))))
(defn- deleted-bases
  (letfn [(is-deletion? [vc]
            (and (= (:type vc) "INDEL")
                 (pos? (.length (:ref-allele vc)))))]
    (if (is-deletion? vc)
      (map vector (repeat (:chr vc)) (range (:start vc) (inc (:end vc))))

Provide metrics for comparison of haploid expected alleles to variant calls.

(defn- comparison-metrics
  [cmp-itree i e-vc]
  (let [cmp-vc (->> (get-itree-overlap cmp-itree (:chr e-vc) (:start e-vc) (:end e-vc))
                    (filter #(= (:start %) (:start e-vc)))
    {:comparison (cmp-allele-to-expected cmp-vc e-vc i)
     :variant-type (get-variant-type [cmp-vc e-vc])
     :nomatch-het-alt (when-not (nil? cmp-vc) (nomatch-het-alt? cmp-vc e-vc))
     :start (if (nil? cmp-vc) (:start e-vc) (:start cmp-vc))
     :end (:end cmp-vc)
     :end-ref (:end e-vc)
     :deleted (deleted-bases e-vc)
     :vc (:vc cmp-vc)
     :ref-vc (:vc e-vc)}))

Provide scoring metrics for a phased region against expected haplotype variants. - Fetch all expected variants in the phased region. - Iterate over expected variants comparing to the called variants: - Keep IntervalTree of called variants, removing variants as evaluated. - Keep coordinates of expected deletion regions. - Add discordant variants for extra calls not in expected variants, avoiding variants in deleted regions.

(defn- score-phased-region
  [expect-get vcs]
  (let [vc-itree (atom (prep-itree vcs :start :end))]
    (letfn [(get-ref-vcs [x]
              (variants-in-region expect-get (:chr x) (:start x) (:end x)))
            (ref-match-allele [x]
              (matching-allele x (variants-in-region expect-get (:chr x) (:start x) (:end x))))
              {:pre [(= 1 (count (keys itree)))]}
              (let [[chr tree] (first itree)]
                (let [start (-> tree .min .getStart)]
                  (->> (variants-in-region expect-get chr start (dec (-> tree .max .getEnd)))
                       (remove #(< (:start %) start))
                       (sort-by :start)))))
            (compare-and-update [cmp-i info e-vc]
              (let [cmp (comparison-metrics @vc-itree cmp-i e-vc)]
                (reset! vc-itree (remove-itree-vc @vc-itree (:chr e-vc)
                                                  (:start cmp) (:end cmp)))
                (-> info
                    (assoc :out (cons cmp (:out info)))
                    (assoc :deleted (concat (:deleted info) (:deleted cmp))))))
            (in-deleted-region? [regions vc]
              (contains? regions [(:chr vc) (:start vc)]))
            (add-unmapped-cmps [cmp-i info]
              (concat (:out info)
                      (map (fn [vc] {:comparison (cmp-allele-to-expected vc nil cmp-i)
                                     :variant-type (get-variant-type [vc])
                                     :nomatch-het-alt (nomatch-het-alt? vc nil)
                                     :start (:start vc)
                                     :vc (:vc vc)
                                     :ref-vc nil})
                           (remove (partial in-deleted-region? (set (:deleted info)))
                                   (get-itree-all @vc-itree)))))]
      (let [cmp-allele-i (highest-count (remove #(or (nil? %) (neg? %))
                                                (map ref-match-allele vcs)))]
        (->> (reduce (partial compare-and-update cmp-allele-i)
                     {:deleted [] :out []}
                     (get-regional-expected-vcs @vc-itree))
             (add-unmapped-cmps cmp-allele-i)
             (sort-by :start))))))

Score a called VCF against expected haploid variants based on phased regions. Partitions phased regions into blocks of two concurrent regions. For each block: - Evaluate second region with standard scoring: expected to called - Collect expected variants in the intervening region between phased blocks, report those missing in the comparison input as errors.

(defn score-phased-calls
  [call-vcf-iter expect-get ref-file & {:keys [intervals]}]
  (let [prev (atom nil)]
    (letfn [(get-intervene-expect [region1 region2]
              (let [vc1 (last region1)
                    vc2 (first region2)
                    filter-end (if (nil? vc1) (dec (:start vc2)) (:end vc1))
                    vcs (cond
                         (nil? vc1)
                         (variants-in-region expect-get (:chr vc2) 0 (dec (:start vc2)))
                         (not= (:chr vc1) (:chr vc2))
                         (concat (variants-in-region expect-get (:chr vc1) (inc (:end vc1)) 1e10)
                                 (variants-in-region expect-get (:chr vc2) 0 (dec (:start vc2))))
                         (variants-in-region expect-get (:chr vc1) (inc (:end vc1))
                                             (dec (:start vc2))))]
                (->> vcs
                     (remove #(< (:start %) filter-end))
                     (map (fn [x] {:comparison :discordant
                                   :variant-type (get-variant-type [x])
                                   :nomatch-het-alt false
                                   :start (:start x)
                                   :vc nil
                                   :ref-vc (:vc x)}))
                     (sort-by :start))))
            (score-phased-and-intervene [region]
              (let [out (concat (get-intervene-expect @prev region)
                                (when (not= (:chr (first region)) "finished_sentinel")
                                  (score-phased-region expect-get region)))]
                (reset! prev region)
      (map score-phased-and-intervene
           (concat (parse-phased-haplotypes call-vcf-iter ref-file
                                            :intervals intervals)
                   [[{:chr "finished_sentinel" :start 1}]])))))

Summarize phased comparisons

Write concordant and discordant variants to VCF output files.

(defn- write-concordance-output
  [vc-info to-capture sample-name base-info other-info out-dir ref]
  (let [base-dir (if (nil? out-dir) (fs/parent (:file base-info)) out-dir)
        gen-file-name (fn [x] (str (fs/file base-dir (format "%s-%s-%s-%s.vcf"
                                                             sample-name (:name base-info)
                                                             (:name other-info) (name x)))))
        out-files (apply ordered-map (flatten (map (juxt identity gen-file-name)
    (if-not (fs/exists? base-dir)
      (fs/mkdirs base-dir))
    (when (itx/needs-run? (vals out-files))
      (write-vcf-w-template (:file base-info) out-files
                            (->> (flatten vc-info)
                                 (map (juxt :comparison :vc))
                                 (filter #(contains? (set to-capture) (first %))))
                            :header-update-fn (merge-headers (:file other-info))))

Provide counts for comparison: entire region plus user specified regions

(defn count-comparison-bases
  [total-bed call-bed ref-file]
  (letfn [(feature-size [x]
             (instance? GenomeLoc x) (- (.getStop x) (dec (.getStart x)))
             :else (- (.getEnd x) (dec (.getStart x)))))
          (count-bases [xs]
            (apply + (map feature-size xs)))
          (merge-intervals [x y]
            (intersection-of-bed-files [x y] ref-file (GenomeLocParser. (get-seq-dict ref-file))))]
    (if (nil? total-bed)
      {:percent 0.0 :compared 0 :total 0}
      (with-open [bed-iter (get-bed-iterator total-bed ref-file)]
        (let [total (count-bases bed-iter)
              compared (if (or (nil? call-bed) (= total-bed call-bed)) total
                           (count-bases (merge-intervals total-bed call-bed)))]
          {:percent (* 100.0 (/ compared total))
           :compared compared
           :total total})))))

Collect summary metrics for concordant/discordant and phasing calls

(defn- get-phasing-metrics
  [vc-info exp-interval-file call-interval-file ref-file]
  (letfn [(count-nomatch-het-alt [xs]
            (count (filter #(and (contains? #{:concordant :ref-concordant} (:comparison %))
                                 (:nomatch-het-alt %))
                           (flatten vc-info))))
          (blank-count-dict []
            {:snp 0 :indel 0})
          (add-current-count [coll x]
            (let [cur-val (map x [:comparison :variant-type])]
              (assoc-in coll cur-val (inc (get-in coll cur-val)))))]
    (reduce add-current-count
            {:haplotype-blocks (count vc-info)
             :total-bases (count-comparison-bases exp-interval-file call-interval-file ref-file)
             :nonmatch-het-alt (count-nomatch-het-alt vc-info)
             :concordant (blank-count-dict)
             :ref-concordant (blank-count-dict)
             :discordant (blank-count-dict)
             :phasing-error (blank-count-dict)}
            (flatten vc-info))))

Entry point for phased haploid VCF comparisons

Compare two VCF files including phasing with a haplotype reference Handle grading special case as well as standard comparisons.

(defmulti compare-two-vcf-phased
  (fn [_ exp _] (keyword (get exp :approach "compare"))))

Convert comparison into ready to write keywords for grading. Deals with discordant comparisons where the competition call is missing.

(defn- convert-cmp-to-grade
  (if (and (= (:comparison cmp) :discordant)
           (nil? (:vc cmp)))
    (-> cmp
        (assoc :comparison :discordant-missing)
        (assoc :vc (:ref-vc cmp)))

Associate a grading category with each variant based on comparison.

(defn- add-grading-info
  (letfn [(assign-discordant-cat [{:keys [vc ref-vc]}]
            (println vc)
            (println ref-vc))]
    (let [cat (case (:comparison cmp)
                :ref-concordant "concordant"
                :concordant "concordant"
                :discordant (assign-discordant-cat cmp) 
                :discordant-missing "discordant-nocall"
                :phasing-error "discordant-phasing")])

Retrieve the comparison file, filtering by intervals if present.

(defn- get-compare-file
  [in-file cmp-name exp intervals]
  (if (nil? intervals)
    (select-by-sample (:sample exp) in-file nil (:ref exp)
                      :intervals intervals :ext (str "cmp" cmp-name))))
(defmethod compare-two-vcf-phased :grade
  [phased-calls exp config]
  {:pre [(= 1 (count (get phased-calls true)))
         (= 1 (count (get phased-calls false)))]}
  (let [ref (first (get phased-calls true))
        call (first (get phased-calls false))
        call-intervals (when-let [f (get call :intervals (:intervals exp))]
                         (limit-bed-intervals f call exp config))]
    (with-open [ref-get (get-vcf-retriever (:ref exp)
                                           (get-compare-file (:file ref) (:name call)
                                                             exp call-intervals))
                call-vcf-iter (get-vcf-iterator (:file call) (:ref exp))]
      (let [compared-calls (score-phased-calls call-vcf-iter ref-get (:ref exp)
                                               :intervals call-intervals)]
        {:c-files (write-concordance-output (->> compared-calls
                                                 (map convert-cmp-to-grade)
                                                 (map add-grading-info))
                                            [:concordant :discordant
                                             :discordant-missing :phasing-error]
                                            (:sample exp) call ref
                                            (get-in config [:dir :out]) (:ref exp))
         :metrics (get-phasing-metrics compared-calls (:intervals exp)
                                       call-intervals (:ref exp))
         :c1 (assoc call :intervals call-intervals)
         :c2 ref :sample (:sample exp) :exp exp :dir (:dir config)}))))

Convert stream of variant context haploid comparison to standard, keyed by :concordant and :discordant-name keywords.

(defn- convert-cmps-to-compare
  [cmps name1 name2]
  (letfn [(update-keyword [coll x]
            (let [ref-x (-> x
                            (assoc :vc (:ref-vc x))
                            (dissoc :ref-vc))
                  [dis-kw1 dis-kw2] (map #(keyword (format "%s-discordant" %)) [name1 name2])]
              (case (:comparison x)
                :concordant (conj coll ref-x)
                (:discordant :phasing-error) (-> coll
                                                 (conj (assoc x :comparison dis-kw2))
                                                 (conj (assoc ref-x :comparison dis-kw1)))
          (update-keyword-hapblock [xs]
            (remove #(or (nil? %) (nil? (:vc %)))
                    (reduce update-keyword [] xs)))]
    (map update-keyword-hapblock cmps)))
(defmethod compare-two-vcf-phased :compare
  [phased-calls exp config]
  {:pre [(= 2 (count (flatten (vals phased-calls))))
         (pos? (count (get phased-calls true)))]}
  (let [cmp1 (first (get phased-calls true))
        cmp2 (if-let [nophased (get phased-calls false)]
               (first nophased)
               (second (get phased-calls true)))
        to-capture (concat [:concordant]
                           (map #(keyword (format "%s-discordant" (:name %)))
                                [cmp1 cmp2]))
        cmp-intervals (when-let [f (get cmp2 :intervals (:intervals exp))]
                        (limit-bed-intervals f cmp2 exp config))]
    (with-open [vcf1-get (get-vcf-retriever (:ref exp)
                                            (get-compare-file (:file cmp1) (:name cmp2)
                                                              exp cmp-intervals))
                vcf2-iter (get-vcf-iterator (:file cmp2) (:ref exp))]
      {:c-files (-> (score-phased-calls vcf2-iter vcf1-get
                                        (:ref exp) :intervals cmp-intervals)
                    (convert-cmps-to-compare (:name cmp1) (:name cmp2))
                    (write-concordance-output to-capture (:sample exp) cmp1 cmp2
                                              (get-in config [:dir :out]) (:ref exp)))
       :c1 cmp1 :c2 cmp2 :sample (:sample exp) :exp exp :dir (:dir config)})))

Utility functions

Is the provided VCF file a haploid genome (one genotype or all homozygous). Samples the first set of variants, checking for haploid calls.

(defn is-haploid?
  [vcf-file ref-file]
  (let [sample-size 1000]
    (letfn [(is-vc-haploid? [vc]
              (when-not (= 0 (:num-samples vc))
                (= 1 (apply max (map #(count (:alleles %)) (:genotypes vc))))))]
      (with-open [vcf-iter (get-vcf-iterator vcf-file ref-file)]
        (let [vcf-iter (parse-vcf vcf-iter)]
          (if-not (empty? vcf-iter)
            (every? is-vc-haploid? (take sample-size vcf-iter))

Recall batched sets of variants containing no-call regions. Combined variant calls from batches contain regions called in some samples but not others. The approach: - Split sample into called and no-call variants - Re-call the no-call variants using the UnifiedGenotyper - Merge previously called and re-called into final set. http://www.broadinstitute.org/gsa/wiki/index.php/Mergingbatchedcall_sets

(ns bcbio.variation.recall
  (:import [org.broadinstitute.sting.utils.variantcontext
            VariantContextBuilder GenotypesContext]
           [org.broadinstitute.sting.utils.codecs.vcf VCFHeader])
  (:use [clojure.java.io]
        [ordered.map :only [ordered-map]]
        [ordered.set :only [ordered-set]]
        [bcbio.variation.callable :only [get-callable-checker is-callable?]]
        [bcbio.variation.combine :only [combine-variants fix-minimal-combined]]
        [bcbio.variation.config :only [load-config]]
        [bcbio.variation.filter.intervals :only [select-by-sample]]
        [bcbio.variation.haploid :only [diploid-calls-to-haploid]]
        [bcbio.variation.multisample :only [multiple-samples?]]
        [bcbio.variation.normalize :only [fix-vcf-sample remove-ref-alts]]
        [bcbio.variation.phasing :only [is-haploid?]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.run.broad :as broad]
            [bcbio.variation.filter.attr :as attr]
            [bcbio.variation.variantcontext :as gvc]))
(defn- set-header-to-sample [sample _ header]
  (VCFHeader. (.getMetaDataInInputOrder header) (ordered-set sample)))

Provide split VCFs of call and no-call variants for the given sample.

(defn- split-nocalls
  [in-vcf sample ref out-dir]
  (letfn [(sample-only-vc [vc]
            (-> (VariantContextBuilder. vc)
                (.genotypes (GenotypesContext/create
                             (into-array [(-> vc .getGenotypes (.get sample))])))
                (.attributes {})
          (split-nocall-vc [vc]
            (when (empty? (:filters vc))
              (let [cur-vc (sample-only-vc (:vc vc))]
                [(if (.isNoCall (-> cur-vc .getGenotypes (.get sample))) :nocall :called)
    (let [sample-str (if (.contains in-vcf sample) "" (str sample "-"))
          out {:called (itx/add-file-part in-vcf (str sample-str "called") out-dir)
               :nocall (itx/add-file-part in-vcf (str sample-str "nocall") out-dir)}]
      (when (itx/needs-run? (vals out))
        (with-open [in-vcf-iter (gvc/get-vcf-iterator in-vcf ref)]
          (gvc/write-vcf-w-template in-vcf out
                                    (remove nil? (map split-nocall-vc (gvc/parse-vcf in-vcf-iter)))
                                    :header-update-fn (partial set-header-to-sample sample))))

Do UnifiedGenotyper calling at known variation alleles.

(defn call-at-known-alleles
  [site-vcf align-bam ref & {:keys [cores]}]
  (let [file-info {:out-vcf (itx/add-file-part site-vcf "wrefs")}
        annotations ["DepthPerAlleleBySample"]
        args (concat ["-R" ref
                      "-o" :out-vcf
                      "-I" align-bam
                      "--alleles" site-vcf
                      "-L" site-vcf
                      "--genotyping_mode" "GENOTYPE_GIVEN_ALLELES"
                      "--output_mode" "EMIT_ALL_SITES"
                      "-stand_call_conf" "0.0"
                      "-stand_emit_conf" "0.0"
                      "--max_deletion_fraction" "2.0"
                      "--min_indel_count_for_genotyping" "3"
                      "--genotype_likelihoods_model" "BOTH"]
                     (if cores ["-nt" (str cores)] [])
                     (reduce #(concat %1 ["-A" %2]) [] annotations))]
    (broad/index-bam align-bam)
    (broad/run-gatk "UnifiedGenotyper" args file-info {:out [:out-vcf]})
    (:out-vcf file-info)))

Recall variations at no-calls in a sample using UnifiedGenotyper.

(defn recall-nocalls
  [in-vcf sample call-name align-bam ref & {:keys [out-dir cores]}]
  (let [sample-str (if (.contains in-vcf call-name) "" (str call-name "-"))
        out-file (itx/add-file-part in-vcf (str sample-str "wrefs") out-dir)]
    (when (itx/needs-run? out-file)
      (let [{:keys [called nocall]} (split-nocalls in-vcf sample ref out-dir)
            prep-nocall (remove-ref-alts nocall ref)
            orig-nocall (call-at-known-alleles prep-nocall align-bam ref :cores cores)
            fix-nocall (fix-vcf-sample orig-nocall sample ref)
            ready-nocall (if (is-haploid? called ref)
                           (diploid-calls-to-haploid fix-nocall ref)
            combine-out (combine-variants [called ready-nocall] ref :merge-type :full
                                          :quiet-out? true)]
        (fs/rename combine-out out-file)
        (fs/rename (str combine-out ".idx") (str out-file ".idx"))))

Retrieve inputs VCFs not involved in preparing a recall VCF. Avoid double pulling inputs with the same initial call files.

(defn- no-recall-vcfs
  [all-vcfs vcf-configs]
  (let [include-names (->> vcf-configs
                           (group-by :file)
                           (map second)
                           (map first)
                           (map :name)
    (->> (interleave all-vcfs vcf-configs)
         (partition 2)
         (map (fn [[x c]] (when (contains? include-names (:name c)) x)))
         (remove nil?))))

Filtering with recall testing

Does the supplied variant have a single supporting call

(defn- single-support?
  (when-let [callers (get-in vc [:attributes "set"])]
    (->> (string/split callers #"-")
         (map string/lower-case)
         (remove #(.startsWith % "filter"))
         (remove #(contains? #{"intersection" "combo"} %))
         (= 1))))
(defn- variant-id [vc]
  [(:chr vc) (:start vc) (:ref-allele vc) (first (:alt-alleles vc))])

Test if a recalled posterior likelihood supports a variant call.

(defn- supports-variant?
  [thresh vc]
  (when-let [pl (attr/get-vc-attr vc "PL" nil)]
    (< pl thresh)))

Retrieve set of variants that a GATK UnifiedGenotyper recaller can't identify.

(defn- get-norecall-variants
  [in-file bam-file sample ref]
  (let [support-thresh -7.5
        recall-file (-> (gvc/select-variants in-file single-support? "singles" ref)
                        (call-at-known-alleles bam-file ref))]
    (with-open [in-vcf-iter (gvc/get-vcf-iterator recall-file ref)]
      (->> (gvc/parse-vcf in-vcf-iter)
           (remove (partial supports-variant? support-thresh))
           (map variant-id)

Filter problematic single support calls by ability to recall at defined sites. Single method support calls can be especially difficult to filter as concordant and discordant sites have similar metrics. Testing ability to recall is a useful mechanism to help identify ones with little read support.

(defn filter-by-recalling
  [in-file bam-file exp]
  (let [norecall (get-norecall-variants in-file bam-file (:sample exp) (:ref exp))]
    (letfn [(can-recall? [vc]
              (not (contains? norecall (variant-id vc))))]
      (gvc/select-variants in-file can-recall? "recallfilter" (:ref exp)))))

Pick consensus variants

Retrieve variant alleles for the sample, sorted in a stable order.

(defn- get-sample-call
  [sample vc]
  (let [allele-order (->> (:alt-alleles vc)
                          (sort-by #(.getBaseString %))
                          (cons (:ref-allele vc))
                          (map-indexed vector)
                          (map reverse)
                          (map vec)
                          (into {}))
        g (->> (:genotypes vc)
                   (filter #(= sample (:sample-name %)))
    {:sample-name sample
     :qual (:qual vc)
     :vc-type (:type vc)
     :call-type (:type g)
     :ref-allele (:ref-allele vc)
     :alleles (sort-by allele-order (:alleles g))
     :attributes (select-keys (:attributes g) ["PL" "DP" "AD" "PVAL"])
     :has-likelihood (if (seq (get-in g [:attributes "PL"])) 1 0)
     :attr-count (+ (if (seq (get-in g [:attributes "PL"])) 1 0)
                    (if (seq (get-in g [:attributes "PVAL"])) 1 0)
                    (if (seq (get-in g [:attributes "AD"])) 1 0)
                    (if (get-in g [:attributes "DP"]) 1 0))
     :pl (attr/get-vc-attr vc "PL" nil)}))

Retrieve alleles with best support from multiple inputs. Use posterior likelihoods and quality scores to rank results with the same alleles and counts. We rank by total number of calls identified. We break ties in favor of homozygous calls if there isn't a consensus on het/hom variant calls, since we've failed to establish power for calling more difficult hets.

(defn- best-supported-alleles
  (letfn [(safe-sum [xs k]
            (apply + (remove nil? (map k xs))))
          (sum-plus-call-type [i xs]
            (let [pls (safe-sum xs :pl)
                  represent-x (last (sort-by #(vector (:has-likelihood %)
                                                      (or (:pl %) (- Integer/MIN_VALUE))
                                                      (:attr-count %))
                  call-code (if (= "HET" (:call-type represent-x)) 0 1)]
              [(count xs) call-code (- pls) i represent-x]))]
    (->> alleles
         (group-by :alleles)
         (map second)
         (map-indexed sum-plus-call-type)
         last ; Best item
         last ; Extract the alleles)))

Update a variant context with consensus genotype from multiple inputs. Calculates the consensus set of calls, swapping calls to that if it exists. If there is no consensus default to the existing allele call.

(defn- update-vc-w-consensus
  [vc sample input-vc-getter]
  (let [match-fn (juxt :start :ref-allele)
        most-likely (->> (gvc/variants-in-region input-vc-getter vc)
                         (filter #(= (match-fn %) (match-fn vc)))
                         (map (partial get-sample-call sample))
    (when most-likely
      (-> (VariantContextBuilder. (:vc vc))
          (.alleles (set (cons (:ref-allele vc) (:alleles most-likely))))
          (.genotypes (gvc/create-genotypes [most-likely] :attrs {"PL" "PVAL" "DP" "AD"}))

Recall variants in a combined set of variants based on consensus of all inputs.

(defn- recall-w-consensus
  [base-vcf input-vcfs sample ref-file]
  (let [out-file (itx/add-file-part base-vcf "consensus")]
    (when (itx/needs-run? out-file)
      (with-open [in-vcf-iter (gvc/get-vcf-iterator base-vcf ref-file)
                  input-vc-getter (apply gvc/get-vcf-retriever (cons ref-file input-vcfs))]
        (gvc/write-vcf-w-template base-vcf {:out out-file}
                                  (remove nil? (map #(update-vc-w-consensus % sample input-vc-getter)
                                                    (gvc/parse-vcf in-vcf-iter)))
                                  :header-update-fn (partial set-header-to-sample sample))))

Retrieve a minimal merged file with calls from input VCFs.

(defn- get-min-merged
  [vcfs exp out-dir intervals]
  (-> (combine-variants vcfs (:ref exp) :merge-type :minimal :intervals intervals
                        :out-dir out-dir :check-ploidy? false
                        :name-map (zipmap vcfs (map :name (:calls exp))))
      (fix-minimal-combined vcfs (:ref exp))))

Recall missing calls, handling merging or consensus based approaches

(defmulti recall-vcf
  (fn [in-info & _]
    (if (some nil? [in-info (:bam in-info) (:file in-info)])
      (keyword (get in-info :approach :consensus)))))
(defmethod recall-vcf :gatk-ug
  ^{:doc "Provide recalling of nocalls using GATK's UnifiedGenotyper"}
  [in-info vcfs exp out-dir intervals]
  (-> [(:file in-info) (get-min-merged vcfs exp out-dir intervals)]
      (combine-variants (:ref exp) :merge-type :full :intervals intervals
                        :out-dir out-dir :check-ploidy? false)
      (recall-nocalls (:sample exp) (:name in-info) (:bam in-info) (:ref exp)
                      :out-dir out-dir)))
(defmethod recall-vcf :consensus
  ^{:doc "Provide recalling of nocalls based on consensus from all inputs."}
  [in-info vcfs exp out-dir intervals]
  (-> vcfs
      (get-min-merged exp out-dir intervals)
      (recall-w-consensus (no-recall-vcfs vcfs (:calls exp))
                          (:sample exp) (:ref exp))))

Create merged VCF files with no-call/ref-calls for each of the inputs. Works at a higher level than recall-nocalls and does the work of preparing a set of all merged variants, then re-calling at non-missing positions.

(defn create-merged
  [vcfs align-bams exp & {:keys [out-dir intervals cores]}]
  (map (fn [[v b vcf-config]]
         (if (get vcf-config :recall false)
           (let [base-info {:name (:name vcf-config)
                            :approach (get-in exp [:params :recall-approach] :consensus)
                            :file v :bam b}
                 merged (recall-vcf base-info vcfs exp out-dir intervals)]
             (if (get vcf-config :remove-refcalls true)
               (select-by-sample (:sample exp) merged nil (:ref exp)
                                 :remove-refcalls true :ext "cleaned")
       (map vector vcfs align-bams (:calls exp))))

Split VCF line into shared attributes and sample specific genotypes. By default removes shared attributes which are no longer valid for split file.

(defn- split-vcf-sample-line
  ([line remove-info-attrs?]
     (let [parts (string/split line #"\t")
           orig-shared (vec (take 9 parts))
           shared (if remove-info-attrs? (assoc orig-shared 7 ".") orig-shared)]
       (for [s (drop 9 parts)] (conj shared s))))
     (split-vcf-sample-line line true)))

Split a multi-sample file to individual samples: writing the header.

(defn- split-vcf-to-samples-header
  [vcf-iter out-files]
  (letfn [(not-chrom? [l] (not (.startsWith l "#CHROM")))]
    (let [std-header (string/join "\n" (take-while not-chrom? vcf-iter))]
      (doseq [[i xs] (map-indexed vector (-> (drop-while not-chrom? vcf-iter)
                                             (split-vcf-sample-line false)))]
        (spit (get out-files i)
              (str std-header "\n" (string/join "\t" xs) "\n"))))))

Split multi-sample file to individual samples: variant lines Avoids opening all output handles, instead writing to individual files. Blocks writes into groups to reduce opening file penalties.

(defn split-vcf-to-samples-variants
  [vcf-iter out-files]
  (let [block-size 1000]
    (doseq [lines (partition-all block-size (drop-while #(.startsWith % "#") vcf-iter))]
      (let [sample-lines (reduce (fn [coll l]
                                   (reduce (fn [inner-coll [i xs]]
                                             (assoc inner-coll i (conj (get inner-coll i [])
                                                                       (string/join "\t" xs))))
                                           coll (map-indexed vector (split-vcf-sample-line l))))
                                 {} lines)]
        (doseq [[i xs] sample-lines]
          (spit (get out-files i)
                (str (string/join "\n" xs) "\n")
                :append true))))))

Create individual sample variant files from input VCF.

(defn split-vcf-to-samples
  [vcf-file & {:keys [out-dir]}]
  (let [samples (-> vcf-file gvc/get-vcf-header .getGenotypeSamples)
        out-files (into (ordered-map) (map (fn [x] [x (itx/add-file-part vcf-file x out-dir)])
    (when (itx/needs-run? (vals out-files))
      (with-open [rdr (reader vcf-file)]
        (itx/with-tx-files [tx-out-files out-files (keys out-files) []]
          (let [line-iter (line-seq rdr)]
            (split-vcf-to-samples-header line-iter (vec (vals tx-out-files)))
            (split-vcf-to-samples-variants line-iter (vec (vals tx-out-files)))))))

Split multiple sample inputs into individual samples before processing. This helps reduce the load on selecting from huge multi-sample files. Returns a list of configured calls with multi-samples set to individually separated input files.

(defn- split-config-multi
  [calls ref out-dir]
  (let [multi-files (filter multiple-samples? (set (map :file calls)))
        cur-samples (set (map :name calls))]
     (reduce (fn [coll vcf]
               (reduce (fn [inner-coll [sample split-vcf]]
                         (if (contains? cur-samples sample)
                           (assoc-in inner-coll [sample :file] split-vcf)
                       coll (split-vcf-to-samples vcf :out-dir out-dir)))
             (into (ordered-map) (map (fn [x] [(:name x) x]) calls))

Provide a cleaned VCF file without sample genotype information.

(defn- remove-sample-info
  [in-vcf out-dir]
  (letfn [(split-variant-line [line]
            (->> (string/split line #"\t")
                 (take 8)
                 (string/join "\t")))
          (process-line [line]
             (.startsWith line "##fileformat") line
             (.startsWith line "##INFO") line
             (.startsWith line "#CHROM") (split-variant-line line)
             (.startsWith line "#") nil
             :else (split-variant-line line)))]
    (let [out-file (itx/add-file-part in-vcf "nosamples" out-dir)]
      (when (itx/needs-run? out-file)
        (with-open [rdr (reader in-vcf)
                    wtr (writer out-file)]
          (doseq [line (map process-line (line-seq rdr))]
            (when line
              (.write wtr (str line "\n"))))))

Combine large numbers of variants via batches to avoid memory issues.

(defn batch-combine-variants
  [vcfs ref & {:keys [merge-type out-dir intervals unsafe name-map
                      base-ext check-ploidy? quiet-out? batch-size]
               :or {merge-type :unique
                    unsafe false
                    name-map {}
                    check-ploidy? true
                    batch-size 100}}]
  (letfn [(combine-w-args [xs]
            (combine-variants xs ref :merge-type merge-type :out-dir out-dir
                              :intervals intervals :unsafe unsafe :name-map name-map
                              :base-ext base-ext :check-ploidy? check-ploidy?
                              :quiet-out? quiet-out?))]
    (let [batch-vcfs (map combine-w-args (partition-all batch-size vcfs))]
      (combine-w-args batch-vcfs))))

Perform recalling on all specific inputs in an experiment

(defn- do-recall-exp
  [exp out-dir config]
  (let [recall-vcfs (map (fn [call]
                           (recall-nocalls (:file call) (:sample exp) (:name call) (:align call)
                                           (:ref exp) :out-dir out-dir
                                           :cores (get-in config [:resources :cores])))
                         (split-config-multi (:calls exp) (:ref exp) out-dir))
        clean-multi (map #(remove-sample-info % out-dir)
                         (filter multiple-samples? (set (map :file (:calls exp)))))]
    (batch-combine-variants (concat clean-multi recall-vcfs) (:ref exp) :merge-type :full
                            :quiet-out? true :check-ploidy? false)))

Convert no-calls into callable reference and real no-calls. Older functionality to re-call as reference when region is callable. Prefer recall-nocalls

(defn convert-no-calls-w-callability
  [in-vcf align-bam ref & {:keys [out-dir intervals num-alleles]}]
  (letfn [(maybe-callable-vc [vc call-source]
            {:pre (= 1 (:num-samples vc))}
            (let [g (-> vc :genotypes first)]
              (if (.isNoCall (-> g :alleles first))
                (if (is-callable? call-source (:chr vc) (:start vc) (:end vc))
                  (gvc/genotypes->refcall vc)
                  (-> (VariantContextBuilder. (:vc vc))
                      (.filters #{"NotCallable"})
                (:vc vc))))
          (convert-vcs [vcf-source call-source]
            (for [vc (gvc/parse-vcf vcf-source)]
              [:out (maybe-callable-vc vc call-source)]))]
    (let [out-file (itx/add-file-part in-vcf "wrefs")]
      (when (itx/needs-run? out-file)
        (with-open [in-vcf-iter (gvc/get-vcf-iterator in-vcf ref)
                    call-source (get-callable-checker align-bam ref :out-dir out-dir
                                                      :intervals intervals)]
          (gvc/write-vcf-w-template in-vcf {:out out-file}
                                    (convert-vcs in-vcf-iter call-source) ref)))
(defn -main [config-file]
  (let [config (load-config config-file)
        out-dir (get-in config [:dir :out])]
    (doseq [exp (:experiments config)]
      (do-recall-exp exp out-dir config))))

Establish a connection with a remote service for file management.

(ns bcbio.variation.remote.client
  (:use [clojure.java.io])
  (:require [clojure.string :as string]
            [blend.galaxy.core :as galaxy]
            [clj-genomespace.core :as gs]))
(defrecord RemoteClient [type conn username server])

Retrieve a remote client using authentication credentials creds is a map containing the :type of connection to establish and client specific authentication details.

(defmulti get-client
  (fn [creds]
    (:type creds)))
(def gs-default-server "http://www.genomespace.org/")
(defmethod get-client :gs
  ^{:doc "Retrieve a GenomeSpace client connection"}
  (let [{:keys [username password client allow-offline?]} creds
        gs-client (cond
                   (and client (gs/logged-in? client)) client
                   (and username password) (try (gs/get-client username :password password)
                                                (catch Exception e
                                                  (when-not allow-offline?
                                                    (throw e))))
                   :else nil)
        username (when gs-client
                   (gs/get-username gs-client))]
    (RemoteClient. :gs gs-client username gs-default-server)))
(defmethod get-client :galaxy
  ^{:doc "Retrieve a Galaxy client connection."}
  (let [{:keys [url api-key client allow-offline?]} creds
        galaxy-client (cond
                       (not (nil? client)) client
                       (and url api-key) (galaxy/get-client url api-key)
                       :else nil)
        user-info (try (galaxy/get-user-info galaxy-client)
                       (catch Exception e
                         (when-not allow-offline?
                           (throw e))))
        username (when user-info
                   (or (:username user-info) (:email user-info)))]
    (RemoteClient. :galaxy (when user-info galaxy-client) username url)))

Top level API for working with remote filestores.

(ns bcbio.variation.remote.core
  (:require [bcbio.variation.remote.file :as file]
            [bcbio.variation.remote.client :as client]))
(def get-client client/get-client)
(def list-dirs file/list-dirs)
(def list-files file/list-files)
(def get-file file/get-file)
(def put-file file/put-file)

List, retrieve and push files from a remote filestore.

(ns bcbio.variation.remote.file
  (:use [clojure.java.io]
        [bcbio.variation.api.shared :only [web-config url->dir]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [blend.galaxy.core :as galaxy]
            [clj-genomespace.core :as gs]
            [bcbio.run.itx :as itx]))


Provide list of files currently under download.

(def ^{:doc 
       :private true}
  download-queue (atom #{}))

Generalized remote download to local cache directory. download-fn takes the client, remote name and local name, and handles the download step for a specific remote service.

(defn- download-to-local
  [fname rclient to-fileinfo-fn download-fn & {:keys [out-dir]}]
  (let [cache-dir (or out-dir (get-in @web-config [:dir :cache]))
        finfo (to-fileinfo-fn fname)
        local-file (str (file cache-dir (url->dir (:server rclient)) (:local-stub finfo)))
        local-dir (str (fs/parent local-file))]
    (when-not (fs/exists? local-file)
      (when-not (fs/exists? local-dir)
        (fs/mkdirs local-dir))
      (when-not (contains? @download-queue local-file)
        (itx/with-tx-file [out-file-tx local-file]
          (swap! download-queue conj local-file)
            (download-fn rclient finfo out-file-tx)
             (swap! download-queue disj local-file))))))

Retrieve files by name, transparently handling remote files.

(defmulti get-file
  (fn [fname & args]
    (let [parts (string/split fname #":" 2)]
      (when (= 2 (count parts))
        (keyword (first parts))))))
(defmethod get-file :gs
  ^{:doc "Retrieve a file from GenomeSpace to the local cache"}
  [fname rclient & {:keys [out-dir]}]
  (letfn [(fileinfo-gs [file-id]
            (let [remote-name (second (string/split file-id #":" 2))]
              {:dirname (str (fs/parent remote-name))
               :fname (fs/base-name remote-name)
               :local-stub (if (.startsWith remote-name "/")
                             (subs remote-name 1)
          (download-gs [rclient file-info out-file]
            (gs/download (:conn rclient) (:dirname file-info)
                         (:fname file-info) out-file))]
    (download-to-local fname rclient fileinfo-gs download-gs :out-dir out-dir)))
(defn- split-galaxy-id
  (when file-id
    (-> file-id
        (string/split #":" 2)
        (string/split #"/"))))
(defmethod get-file :galaxy
  ^{:doc "Retrieve a file from Galaxy to the local cache"}
  [fname rclient & {:keys [out-dir]}]
  (letfn [(fileinfo-galaxy [file-id]
            (let [[history-id ds-id] (split-galaxy-id file-id)
                  ds (galaxy/get-dataset-by-id (:conn rclient) history-id ds-id)
                  safe-username (string/replace (:username rclient) " " "-")]
              {:local-stub (str (file safe-username history-id (:name ds)))
               :ds ds}))
          (download-galaxy [rclient file-info out-file]
            (galaxy/download-dataset (:conn rclient) (:ds file-info) out-file))]
    (download-to-local fname rclient fileinfo-galaxy download-galaxy :out-dir out-dir)))
(defmethod get-file :default
  ^{:doc "Get local file: no-op, just return the file."}
  [fname _]


List directories available on the remote server. Returns map of directory :id and display :name.

(defmulti list-dirs
  (fn [rclient & args]
    (:type rclient)))
(defmethod list-dirs :gs
  ^{:doc "Retrieve available directories from GenomeSpace under the parent directory."}
  [rclient dirname]
  (map (fn [x] {:id x :name x})
       (gs/list-dirs (:conn rclient) dirname)))
(defmethod list-dirs :galaxy
  ^{:doc "Retrieve available histories from Galaxy connection."}
  [rclient _]
  (galaxy/list-histories (:conn rclient)))

List files in a remote directory of a specified type.

(defmulti list-files
  (fn [rclient & args]
    (:type rclient)))
(defmethod list-files :gs
  ^{:doc "List available files in GenomeSpace directory by type."}
  [rclient rdir ftype]
  (let [remote-dir (or (:id rdir) ".")]
     (map (fn [finfo]
            {:id (str "gs:" (:dirname finfo) "/" (:name finfo))
             :tags (remove nil?
                           [(first (drop 3 (string/split (:dirname finfo) #"/" 4)))])
             :folder (:dirname finfo)
             :filename (:name finfo)
             :size (:size finfo)
             :created-on (:date finfo)})
          (gs/list-files (:conn rclient) remote-dir (name ftype)))
     (mapcat #(list-files rclient % ftype) (list-dirs rclient remote-dir)))))
(defmethod list-files :galaxy
  ^{:doc "List available files from a Galaxy history."}
  [rclient hid ftype]
  (let [history-id (if (contains? hid :id) (:id hid) hid)
        history-name (or (:name hid) "")]
    (->> (galaxy/get-datasets-by-type (:conn rclient) ftype :history-id history-id)
         (remove #(:deleted %))
         (map (fn [ds]
                {:id (str "galaxy:" (:history-id ds) "/" (:id ds))
                 :tags [history-name]
                 :folder history-name
                 :filename (:name ds)
                 :size (:file-size ds)
                 :created-on nil})))))
(defmethod list-files :default
  ^{:doc "Retrieval of pre-downloaded files in our local cache."}
  [_ dir-info ftype])


Upload a file to a remote repository.

(defmulti put-file
  (fn [rclient & args]
    (:type rclient)))
(defmethod put-file :gs
  ^{:doc "Push file to GenomeSpace in the specified upload directory."}
  [rclient local-file params]
  (let [remote-dir (str (fs/file (fs/parent (last (string/split (:input-file params) #":" 2)))
                                 (:tag params)))]
    (gs/upload (:conn rclient) remote-dir local-file)
(defmethod put-file :galaxy
  ^{:doc "Push file to the current Galaxy history, using a remotely available URL."}
  [rclient local-file params]
  (let [host-info (:host-info params)
        history-id (first (split-galaxy-id (:input-file params)))
        provide-url ((:expose-fn params) local-file (:server rclient))]
    (galaxy/upload-to-history (:conn rclient) provide-url
                              (get params :dbkey :hg19)
                              (:file-type params)
                              :history-id history-id
                              :display-name (fs/base-name local-file))))

Parse and provide detailed information from GATKReport outputs.

(ns bcbio.variation.report
  (:use [ordered.map :only [ordered-map]]
        [clojure.math.combinatorics :only [cartesian-product]]
        [bcbio.variation.variantcontext :only [parse-vcf get-vcf-retriever
        [bcbio.variation.callable :only [get-callable-checker is-callable?]]
        [bcbio.variation.evaluate :only [organize-gatk-report-table]]
        [bcbio.variation.metrics :only [ml-on-vcf-metrics passes-filter? nonref-passes-filter?]])
  (:require [clojure.string :as string]
            [clojure.data.csv :as csv]
            [doric.core :as doric]
            [lonocloud.synthread :as ->]
            [bcbio.variation.grade :as grade]))

Retrieve high level concordance metrics from GATK VariantEval report.

(defn concordance-report-metrics
  [sample in-file]
  (letfn [(sample-in-row? [x]
            (and (= (:Sample x) sample)
                 (= (:Novelty x) "all")
                 (= (:Filter x) "called")))]
    (->> (organize-gatk-report-table in-file "GenotypeConcordance" sample-in-row?)
         (map (fn [x] [(keyword (:variable x)) (:value x)]))
         (into {}))))

Count variants that pass an optional checker function.

(defn count-variants
  [f ref-file check?]
  (with-open [vcf-iter (get-vcf-iterator f ref-file)]
    (count (filter check? (parse-vcf vcf-iter)))))

Provide metrics to distinguish types of discordance in a comparison. These identify variants which differ due to being missing in one variant call versus calls present in both with different genotypes. It also pulls out variants in diploids that differ due to het/hom calls.

(defn shared-discordant
  [file1 file2 ref-file]
  (with-open [file1-iter (get-vcf-iterator file1 ref-file)
              vcf-retriever (get-vcf-retriever ref-file file2)]
    (reduce (fn [coll vc]
              (let [other-vcs (variants-in-region vcf-retriever
                                                  (:chr vc) (:start vc) (:end vc))
                    vc-type (if-not (empty? other-vcs) :total :unique)]
                (-> coll
                    (assoc vc-type (inc (get coll vc-type)))
                    (assoc :hethom ((if (and (= :total vc-type)
                                             (grade/hethom-discordant? vc other-vcs))
                                      inc identity)
                                    (get coll :hethom))))))
            {:total 0 :unique 0 :hethom 0}
            (parse-vcf file1-iter))))

Calculate count of variant in input file without coverage in the comparison.

(defn nocoverage-count
  [in-vcf ref-file compare-kw compared]
  (let [out-dir (get-in compared [:dir :prep] (get-in compared [:dir :out]))
        align-file (get-in compared [compare-kw :align]
                           (get-in compared [:exp :align]))]
    (when-not (nil? align-file)
      (with-open [call-source (get-callable-checker align-file (-> compared :exp :ref)
                                                    :out-dir out-dir)]
        (count-variants in-vcf ref-file
                        #(and (passes-filter? %)
                              (not (is-callable? call-source (:chr %) (:start %) (:end %)))))))))

Retrieve expected summary level from configuration

(defn get-summary-level
  (letfn [(level-from-string [x]
            (case (when-not (nil? x) (string/lower-case x))
              "quick" :quick
              "full" :full
          (get-string-level [config to-try]
            (loop [cur-try to-try]
              (if (empty? cur-try) nil
                  (let [cur-level (get-in config (first cur-try))]
                    (if-not (nil? cur-level) cur-level
                            (recur (rest cur-try)))))))]
    (let [to-check (cartesian-product [:exp :c1 :c2 :call] [:summary-level])]
      (level-from-string (get-string-level config to-check)))))

Retrieve structural variation metrics from SV concordance files.

(defn get-sv-metrics
  [compared ref-file]
  (when-let [finfo (seq (->> (:c-files compared)
                             (drop-while #(not= (first %) :sv-concordant))
                             (take 3)))]
    (reduce (fn [coll [kw vcf-file]]
              (assoc coll kw
                      :total (count-variants vcf-file ref-file passes-filter?))))
            (ordered-map) finfo)))

Provide counts in a file, split by type of variation.

(defn- all-vrn-counts
  [fname cmp-kw compared]
  (letfn [(vrn-type-passes-filter? [vrn-type]
            (fn [vc]
              (and (passes-filter? vc)
                   (contains? vrn-type (:type vc)))))]
    (let [sum-level (get-summary-level compared)
          ref-file (get-in compared [:exp :ref])
          base {:total (count-variants fname ref-file passes-filter?)}]
      (if (= sum-level :quick) base
          (assoc base
            :nocoverage (nocoverage-count fname ref-file cmp-kw compared)
            :snp (count-variants fname ref-file
                                 (vrn-type-passes-filter? #{"SNP"}))
            :indel (count-variants fname ref-file
                                   (vrn-type-passes-filter? #{"INDEL"})))))))

Prepare grading based metrics to break down reasons for discordant calls

(defn- grading-discordance-metrics
  (println vcf-file))

Add detailed discordance metrics to the summary information.

(defn- add-discordance-metrics
  [m compared]
  (let [ref-file (get-in compared [:exp :ref])
        c-files (-> compared :c-files vals)]
    (-> m
        (assoc :discordant1 (all-vrn-counts (second c-files) :c2 compared))
        (->/when (> (count c-files) 2)
          (assoc :discordant2 (all-vrn-counts (nth c-files 2) :c1 compared))
          (assoc :discordant_both (apply shared-discordant (conj (vec (take 2 (rest c-files)))

Add detailed concordance metrics to summary information.

(defn- add-concordance-metrics
  [m compared]
  (let [ref-file (get-in compared [:exp :ref])
        c-files (-> compared :c-files vals)]
    (-> m
        (assoc :genotype_concordance (-> compared :metrics :percent_overall_genotype_concordance))
        (assoc :callable_concordance (-> compared :callable-metrics
        (assoc :nonref_discrepency (-> compared :metrics :percent_non_reference_discrepancy_rate))
        (assoc :nonref_sensitivity (-> compared :metrics :percent_non_reference_sensitivity))
        (assoc :concordant (all-vrn-counts (first c-files) nil compared))
        (assoc :nonref_concordant (count-variants (first c-files) ref-file

Provide one-line summary of similarity metrics for a VCF comparison.

(defn top-level-metrics
  (let [sum-level (get-summary-level compared)
        ref-file (get-in compared [:exp :ref])
        c-files (-> compared :c-files vals)]
    (-> (ordered-map)
        (assoc :sample (-> compared :exp :sample))
        (assoc :ftypes (take 3 (-> compared :c-files keys)))
        (add-concordance-metrics compared)
        (add-discordance-metrics compared)
        (->/when-let [sv-metrics (get-sv-metrics compared ref-file)]
          (assoc :sv sv-metrics))
        (->/when (= sum-level :full)
          (assoc :ml_metrics (ml-on-vcf-metrics ref-file (take 2 c-files)))))))

Calculate an overall accuracy score from input metrics. The accuracy logic is: (#correctly aligned bases - (1*(simple substitutions and indels) + 2*(larger errors)) / #correctly aligned bases)

(defn calc-accuracy
  [metrics error-items]
  (letfn [(get-penalty [[error-type call-type]]
            (case call-type
              :snp 1
              :indel 2
              :sv 2))]
    (let [error-items (cartesian-product error-items [:snp :indel :sv])
          error-score (apply + (map #(* (get-in metrics % 0) (get-penalty %)) error-items))
          total-bases (get-in metrics [:total-bases :compared] 1)]
       (* 100.0 (/ (- total-bases error-score) total-bases))))))

Summary table of high level variables and scoring metrics for comparison.

(defn prep-scoring-table
  [metrics sv-stats]
  (let [to-write (ordered-map :accuracy "Accuracy score"
                              :accuracy-phasing "Accuracy score, including phasing"
                              [:total-bases :percent] "Completeness"
                              [:total-bases :compared] "Total bases scored"
                              [:total-bases :total] "Possible evaluation bases"
                              [:discordant :snp] "Discordant SNPs"
                              [:discordant :indel] "Discordant indels"
                              [:discordant :sv] "Discordant structural variants"
                              [:phasing-error :snp] "Phasing Error SNPs"
                              [:phasing-error :indel] "Phasing Error indels"
                              :haplotype-blocks "Phased haplotype blocks"
                              ;:nonmatch-het-alt "Non-matching heterozygous alternative alleles")
        sv-metrics (assoc-in metrics [:discordant :sv]
                             (-> sv-stats vals second (get :total 0)))
        s-metrics (-> sv-metrics
                      (assoc :accuracy (calc-accuracy sv-metrics [:discordant]))
                      (assoc :accuracy-phasing (calc-accuracy sv-metrics
                                                              [:discordant :phasing-error])))
        need-percents {:accuracy 6
                       :accuracy-phasing 6
                       [:total-bases :percent] 2}]
    (letfn [(prep-row [[k x]]
              (let [val (if (coll? k) (get-in s-metrics k) (get s-metrics k))]
                {:metric x
                 :value (if (contains? need-percents k)
                          (format (str "%." (get need-percents k) "f") val)
      (map prep-row to-write))))

Write high level metrics table in readable format.

(defn- write-scoring-table
  [metrics sv-stats wrtr]
  (when-not (or (nil? metrics)
                (nil? (get-in metrics [:total-bases :total])))
    (.write wrtr (str (doric/table [:metric :value] (prep-scoring-table metrics sv-stats))

Summary table of metrics for assessing the score of a variant comparison.

(defn- write-concordance-metrics
  [metrics wrtr]
  (letfn [(metrics-info [ftype-i & kvs]
            (if (<= (count (:ftypes metrics)) ftype-i)
              (let [cur-name (name (nth (:ftypes metrics) ftype-i))]
                (apply concat
                       (map (fn [[k v]] [k (str cur-name ": " v)]) (partition 2 kvs))))))]
    (let [to-write (apply ordered-map
                          (concat [:genotype_concordance "Overall genotype concordance"
                                   :callable_concordance "Callable genotype concordance"
                                   :nonref_discrepency "Non-reference discrepancy rate"
                                   :nonref_sensitivity "Non-reference sensitivity"]
                                  (metrics-info 0
                                                [:concordant :total] "total"
                                                :nonref_concordant "non-reference"
                                                [:concordant :snp] "SNPs"
                                                [:concordant :indel] "indels")
                                  (metrics-info 1
                                                [:discordant1 :total] "total"
                                                [:discordant1 :nocoverage] "unique"
                                                [:discordant1 :snp] "SNPs"
                                                [:discordant1 :indel] "indels")
                                  (metrics-info 2
                                                [:discordant2 :total] "total"
                                                [:discordant2 :nocoverage] "unique"
                                                [:discordant2 :snp] "SNPs"
                                                [:discordant2 :indel] "indels")
                                  [[:discordant_both :total] "Shared discordant"
                                   [:discordant_both :hethom] "het/hom discordant"
                                   [:ml_metrics :top-metrics] "Classification metrics"]))]
      (letfn [(get-value [[k metric]]
                (when-let [val (if (coll? k) (get-in metrics k) (get metrics k))]
                  {:metric metric :value val}))]
        (.write wrtr (str (doric/table [:metric :value] (remove nil? (map get-value to-write)))

Summary table of structural variation comparions.

(defn- write-sv-metrics
  [sv-metrics wrtr]
  (letfn [(get-values [[base xs]]
            (map (fn [[inner-kw val]]
                   {:metric (str (name base) ": " (name inner-kw))
                    :value val})
    (.write wrtr "** Structural variation\n")
    (.write wrtr (str (doric/table [:metric :value]
                                   (->> (map get-values sv-metrics)
                                        (remove nil?)))

Classification metrics

Summary table of classification metrics from GATK variant recalibration.

(defn- write-classification-metrics
  [cmp-info wrtr]
  (letfn [(get-metric-counts [in-vcf]
            (with-open [vcf-iter (get-vcf-iterator in-vcf (get-in cmp-info [:exp :ref]))]
              (reduce (fn [coll vc]
                        (let [culprit (get-in vc [:attributes "culprit"])]
                          (if (or (nil? culprit) (= (count (:filters vc)) 0)) coll
                              (assoc coll culprit (inc (get coll culprit 0))))))
                      {} (parse-vcf vcf-iter))))
          (get-recal-metrics [in-vcf]
            (sort-by :count >
                     (map (fn [[m c]] {:metric m :count c}) (get-metric-counts in-vcf))))]
    (.write wrtr "** GATK recalibration filter metrics\n")
    (doseq [call (map (partial get cmp-info) [:c1 :c2])]
      (when (= (:mod call) "recal")
        (.write wrtr (str (doric/table [:metric :count]
                                       (get-recal-metrics (:file call)))

Top level reports

Write a summary text file with tables of useful concordance metrics.

(defn write-summary-txt
  [wtr comparisons]
  (doseq [x comparisons]
    (.write wtr (format "* %s : %s vs %s\n" (get-in x [:exp :sample])
                        (get-in x [:c1 :name]) (get-in x [:c2 :name])))
    (write-scoring-table (:metrics x) (get-in x [:summary :sv]) wtr)
    (write-concordance-metrics (:summary x) wtr)
    (when-let [sv-info (get-in x [:summary :sv])]
      (write-sv-metrics sv-info wtr))
    (when (get-in x [:c1 :mod])
      (write-classification-metrics x wtr))))

Retrieve values for CSV output checking for lots of special cases. - Display all values for :total lines - For :snp and :indel lines, only show values with dictionaries that have this detailed info. - Always display initial sample and call values (i <= 3)

(defn- get-summary-csv-vals
  [header cmp cur-type]
  (for [v (map-indexed (fn [i k]
                         (let [v (get cmp k)]
                            (= :type k) (name cur-type)
                            (and (not= :total cur-type)
                                 (> i 3)
                                 (not (map? v))) nil
                            :else v)))
    (if (map? v) (get v cur-type) v)))

Write a top level summary CSV file with useful concordance metrics.

(defn write-summary-csv
  [wtr comparisons]
  (doseq [[i [x cmp-orig]] (map-indexed vector (map (juxt identity :summary) comparisons))]
    (let [header (concat [:sample :call1 :call2 :type] (nnext (keys cmp-orig)))
          cmp (-> cmp-orig
                  (dissoc :ftypes)
                  (assoc :call1 (-> x :c1 :name))
                  (assoc :call2 (-> x :c2 :name)))]
      (when (= i 0)
        (csv/write-csv wtr [(map name header)]))
      (csv/write-csv wtr (for [cur-type [:total :snp :indel]]
                           (get-summary-csv-vals header cmp cur-type))))))

Write a CSV file with locations of useful files.

(defn write-files-csv
  [wtr comparisons config]
  (csv/write-csv wtr [["call1" "call2" "type" "fname"]])
  (doseq [x comparisons]
    (doseq [[k f] (:c-files x)]
      (csv/write-csv wtr [[(get-in x [:c1 :name]) (get-in x [:c2 :name]) (name k)
                           (string/replace-first f (str (get-in config [:dir :out]) "/") "")]]))))

Handle structural variations for larger insertions, deletions and genome rearrangements.

(ns bcbio.variation.structural
  (:import [org.broadinstitute.sting.utils.codecs.vcf VCFCodec]
           [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder
           [org.broad.tribble.readers AsciiLineReader PositionalBufferedStream]
           [net.sf.picard.util IntervalTree])
  (:use [clojure.set :only [intersection]]
        [ordered.map :only [ordered-map]]
        [bcbio.variation.variantcontext :only [get-vcf-iterator parse-vcf merge-headers
                                               from-vc write-vcf-w-template]]
        [bcbio.variation.callable :only [get-bed-source features-in-region]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]))

Default maximum indel size for exact comparisons. Based on assessment by Gavin Oliver: http://f1000r.es/MsY1QZ

(def ^{:private true
       :doc }
  max-indel 30)

Interval tree lookup

Retrieve an Interval with the specified start/end keywords.

(defn prep-itree
  [vc-iter start-kw end-kw]
  (reduce (fn [coll vc]
            (assoc coll (:chr vc)
                   (doto (get coll (:chr vc) (IntervalTree.))
                     (.put (get vc start-kw) (inc (get vc end-kw)) vc))))
          (ordered-map) vc-iter))

Convert IntervalTree Iterator into clojure seq. Catch deleted sequences and continue ignoring the deleted node.

(defn- itree-seq
   (when (.hasNext iter)
       (cons (.getValue (.next iter)) (itree-seq iter))
       (catch java.util.ConcurrentModificationException e
         (itree-seq iter))))))

Lazy sequence of items that overlap a region in a nested IntervalTree.

(defn get-itree-overlap
  [itree chrom start end]
  (let [chr-itree (get itree chrom)]
    (if (nil? chr-itree)
      (itree-seq (.overlappers chr-itree start end)))))

Lazy sequence of all items in an IntervalTree.

(defn get-itree-all
   (for [item (vals itree)]
     (sort-by :start
              (itree-seq (.iterator item))))))

Remove variant context from an IntervalTree

(defn remove-itree-vc
  [itree chr start end]
  (if (not-any? nil? [chr start end])
    (assoc itree chr
           (doto (get itree chr)
             (.remove start (inc end))))

Structural variation helpers

Determine the type of a structural variant. Expected types are:

- DEL: Deletion
- INS: Insertion
- DUP: Duplication
- INV: Inversion
- BND: Breakpoint end; paired with second variant
- CNV: Copy number variation
- nil: Not a structural variant.
(defn get-sv-type
  [vc params]
  (letfn [(max-allele-size [vc]
            (apply max (map #(.length %) (cons (:ref-allele vc) (:alt-alleles vc)))))
          (indel-type [vc]
            (if (> (.length (:ref-allele vc))
                   (apply max (map #(.length %) (:alt-alleles vc)))) :DEL :INS))
          (sv-type-from-symbol [allele]
            (->> allele
                 (re-find #"^<(\w+)(:|>)" )
          (alt-sv-type [vc]
            (let [allele (-> vc :alt-alleles first .getDisplayString)]
               (.startsWith allele "<") (sv-type-from-symbol allele)
               (or (.contains allele "[")
                   (.contains allele "]")) :BND)))]
     (and (= "INDEL" (:type vc))
          (> (max-allele-size vc) (or (:max-indel params) max-indel))) (indel-type vc)
     (= "SYMBOLIC" (:type vc)) (alt-sv-type vc)
     :else nil)))

Retrieve normalized integer values from an attribute.

(defn value-from-attr
  ([vc attr-name]
     (value-from-attr vc attr-name 0))
  ([vc attr-name attr-index]
      (-> vc
          (get attr-name (repeat (inc attr-index) "0"))
          (#(if (string? %) [%] %))
          (nth attr-index)

Concordance checking

Check if coordinates from two structural variants overlap. Considered an overlap if the two confidence intervals have shared bases.

(defmulti sv-ends-overlap?
  (fn [[end1 end2]] (type end1)))
(defmethod sv-ends-overlap? clojure.lang.PersistentVector
  [[[s1 e1] [s2 e2]]]
  (seq (intersection (set (range s1 (inc e1)))
                     (set (range s2 (inc e2))))))
(defmethod sv-ends-overlap? java.lang.String
  [[end1 end2]]
  (= end1 end2))
(defn- length-from-svlen [x] (value-from-attr x "SVLEN"))

Length of variation from INFO annotations, handling SVLEN and END.

(defn- length-from-info
  (max (length-from-svlen x)
       (- (value-from-attr x "END") (:start x))))

Length of insertion variation, handling ALT allele, INSEQ and well-known named insertions.

(defn- insertion-length
  (letfn [(get-insseq [x]
            (-> x :attributes (get "INSSEQ")))
          (length-by-insert-name [alt-allele]
             (.startsWith alt-allele "<INS:ME:") (-> alt-allele
                                                     (subs 1 (dec (count alt-allele)))
                                                     (string/split #":")
             (= alt-allele "<INS>") nil
             :else (throw (Exception. (str "Unknown insert allele" alt-allele)))))
          (get-allele-insert [x]
            (let [alt-allele (-> x :alt-alleles first .getDisplayString)]
              (if (.startsWith alt-allele "<")
                (length-by-insert-name alt-allele)
                (dec (count alt-allele)))))]
    (if-let [seq (get-insseq x)]
      (count seq)
      (if-let [named-insert (get-allele-insert x)]
        (length-from-info x)))))

Length of deletion variations, handling SVLEN and allele specifications.

(defn- deletion-length
  (let [svlen (length-from-svlen vc)]
    (if (pos? svlen)
      (- (-> vc :ref-allele .length)
         (apply min (map #(.length %) (:alt-alleles vc)))))))

Retrieve length of a structural variant for different variation types.

(defn- get-sv-length
  (case (:sv-type vc)
         :DEL (deletion-length vc)
         :INS (insertion-length vc)
         :INV (length-from-info vc)
         :DUP (length-from-info vc)
         :CNV (length-from-info vc)
         :BND 0
         (throw (Exception. (str "Structural variant type not handled: "
                                 (:sv-type vc))))))

Retrieve start and end with confidence intervals for a variation.

(defn- get-ci-start-end
  [vc params & {:keys [allow-named?]}]
  (letfn [(get-ci-range [orig attr default-ci]
            (let [left-ci (value-from-attr vc attr 0)
                  right-ci (value-from-attr vc attr 1)]
              [(- orig (if (pos? left-ci) left-ci default-ci))
               (+ orig (if (pos? right-ci) right-ci default-ci))]))
          (get-default-ci [length]
            (let [default (if-let [x (-> (:default-cis params) first second)] x 0)
                  by-length (when-not (string? length)
                              (second (first (drop-while #(< (first %) length)
                                                         (:default-cis params)))))]
              (if-not (nil? by-length) by-length default)))]
    (let [start (:start vc)
          length (get-sv-length vc)
          default-ci (get-default-ci length)
          end (cond
               (and allow-named? (string? length)) length
               (string? length) (:end vc)
               :else (max (+ start length) (:end vc)))]
      [(get-ci-range start "CIPOS" default-ci)
       (if (string? end) end
           (get-ci-range end "CIEND" default-ci))])))

Check for concordance of variants based on reported length: handles deletions, inversions. insertions and duplications.

(defn- sv-len-concordant?
  [sv1 sv2 params]
  (every? sv-ends-overlap?
          (partition 2 (interleave (get-ci-start-end sv1 params :allow-named? true)
                                   (get-ci-start-end sv2 params :allow-named? true)))))

Check if structural variants are concordant.

(defn sv-concordant?
  [params sv1 sv2]
  (and (apply = (map :sv-type [sv1 sv2]))
       (case (:sv-type sv1)
         (:DEL :INS :INV :DUP) (sv-len-concordant? sv1 sv2 params)
         (:BND :UNASSEMBLED_EVENT) false
         (throw (Exception. (str "Structural variant type not handled: "
                                 (:sv-type sv1)))))))

Parsing structural variants

Parse VCF file returning structural variants with confidence intervals. The :out-format keyword specifies how to return the parsed structural variants: - :itree -- Interval tree for variant lookup by chromosome and start/end. - default -- List of variants (non-lazy).

(defn parse-vcf-sv
  [vcf-file ref-file & {:keys [out-format interval-file params]
                        :or {params {}}}]
  (letfn [(updated-sv-vc [cur-vc]
            (when-let [sv-type (get-sv-type cur-vc params)]
              (let [[start-cis end-cis] (get-ci-start-end (assoc cur-vc :sv-type sv-type)
                (-> cur-vc
                    (assoc :start-ci (first start-cis))
                    (assoc :end-ci (second end-cis))
                    (assoc :sv-type sv-type)))))
          (in-intervals? [bed-source vc]
            (or (nil? bed-source)
                (seq (features-in-region bed-source (:chr vc) (:start-ci vc) (:end-ci vc)))))]
    (with-open [vcf-iter (get-vcf-iterator vcf-file ref-file)]
      (let [vs-iter (filter (partial in-intervals? (when interval-file
                                                     (get-bed-source interval-file ref-file)))
                            (keep updated-sv-vc (parse-vcf vcf-iter)))]
        (case out-format
          :itree (prep-itree vs-iter :start-ci :end-ci)
          (vec vs-iter))))))

Compare two structural variant files, returning variant contexts keyed by concordance.

(defn- find-concordant-svs
  [fname1 fname2 disc-kwds ref interval-file params]
  (let [cmp-tree (atom (parse-vcf-sv fname2 ref :out-format :itree :interval-file interval-file
                                     :params params))]
    (letfn [(check-sv-concordance [vc]
              (let [matches (filter (partial sv-concordant? params vc)
                                    (get-itree-overlap @cmp-tree (:chr vc)
                                                       (:start-ci vc) (inc (:end-ci vc))))]
                (doseq [m-vc matches]
                  (reset! cmp-tree (remove-itree-vc @cmp-tree (:chr m-vc)
                                                    (:start m-vc) (:end m-vc))))
                (if-let [match (first matches)]
                  [:sv-concordant (:vc match)]
                  [(:1 disc-kwds) (:vc vc)])))
            (remaining-cmp-svs [itree]
              (partition 2
                         (interleave (repeat (:2 disc-kwds)) (map :vc (get-itree-all itree)))))]
       (map check-sv-concordance (parse-vcf-sv fname1 ref :interval-file interval-file
                                               :params params))
       (remaining-cmp-svs @cmp-tree)))))

Retrieve list of non-structural variants in the provided input file.

(defn find-non-svs
  [kwd vcf-source params ignore-regions]
  (->> (parse-vcf vcf-source)
       (filter #(nil? (get-sv-type % params)))
       (remove #(contains? ignore-regions [(:chr %) (:start %)]))
       (map :vc)
       (interleave (repeat kwd))
       (partition 2)))

Write output file containing only non-structural variants

(defn write-non-svs
  [in-file ref params]
  (let [out-file (itx/add-file-part in-file "nosv")]
    (with-open [vcf-iter (get-vcf-iterator in-file ref)]
    (write-vcf-w-template in-file {:out out-file}
                          (find-non-svs :out vcf-iter params #{})

Produce set of coordinates resulting from large SV deletions. Useful for avoiding double comparisons where we match SVs and then match smaller variations: we want to avoid those in a SV

(defn- sv-deletion-coords
  [in-file ref params]
  (letfn [(deletion-positions [vc]
            (let [[[s _] [_ e]] (get-ci-start-end (assoc vc :sv-type :DEL) {})]
              (map #(list (:chr vc) %) (range s e))))]
    (with-open [vcf-iter (get-vcf-iterator in-file ref)]
      (->> (parse-vcf vcf-iter)
           (filter #(= :DEL (get-sv-type % params)))
           (mapcat deletion-positions)

Compare structural variants, producing concordant and discordant outputs

(defn compare-sv
  [c1 c2 ref & {:keys [out-dir interval-file params]
                :or {params {}}}]
  (let [base-out (str (fs/file (if (nil? out-dir) (fs/parent (:file c1)) out-dir)
                               (str (-> c1 :file fs/base-name (string/split #"-") first)
        disc-kwds {:1 (keyword (str "sv-" (:name c1) "-discordant"))
                   :2 (keyword (str "sv-" (:name c2) "-discordant"))}
        out-files (ordered-map
                   :sv-concordant (format base-out (:name c1) (:name c2) "svconcordance")
                   (:1 disc-kwds) (format base-out (:name c1) (:name c2) "svdiscordance")
                   (:2 disc-kwds) (format base-out (:name c2) (:name c1) "svdiscordance")
                   :nosv1 (itx/add-file-part (:file c1) "nosv" out-dir)
                   :nosv2 (itx/add-file-part (:file c2) "nosv" out-dir))]
    (when (itx/needs-run? (vals out-files))
      (with-open [vcf1-iter (get-vcf-iterator (:file c1) ref)
                  vcf2-iter (get-vcf-iterator (:file c2) ref)]
        (write-vcf-w-template (:file c1) out-files
                               (find-concordant-svs (:file c1) (:file c2) disc-kwds
                                                    ref interval-file params)
                               (find-non-svs :nosv1 vcf1-iter params #{})
                               (find-non-svs :nosv2 vcf2-iter params
                                             (sv-deletion-coords (:file c1) ref params)))
                              ref :header-update-fn (merge-headers (:file c2))))
      ;; Remove SV VCF indexes since they use alternative Codecs
      (doseq [fname (vals out-files)]
        (let [x (str fname ".idx")]
          (if (fs/exists? x)
            (fs/delete x)))))

Handle input decomposition running SV detection through the standard pipeline.

(defn compare-sv-pipeline
  [c1 c2 exp config]
  (let [out-dir (get-in config [:dir :prep] (get-in config [:dir :out]))
        intervals (get c1 :intervals (get c2 :intervals (:intervals exp)))
        default-params {:max-indel max-indel
                        :default-cis [[200 10] [500 100] [1000 200] [1e6 500]]}
        params (merge default-params (:params exp))
        out-files (compare-sv c1 c2 (:ref exp) :out-dir out-dir
                              :interval-file intervals :params params)]
    [(assoc c1 :file (:nosv1 out-files))
     (assoc c2 :file (:nosv2 out-files))
     (-> out-files
         (dissoc :nosv1)
         (dissoc :nosv2))]))

Utility functions

Lazy stream of structural variants overlapping in both inputs.

(defn- find-overlapping-svs
  [f1 f2 ref params]
  (letfn [(find-overlaps [cmp-tree vc]
            (when-let [cmp-vc (first (get-itree-overlap cmp-tree (:chr vc)
                                                        (:start-ci vc) (inc (:end-ci vc))))]
              [:out1 (:vc vc) :out2 (:vc cmp-vc)]))]
    (let [cmp-tree (parse-vcf-sv f2 ref :out-format :itree :params params)]
      (->> (parse-vcf-sv f1 ref :params params)
           (map (partial find-overlaps cmp-tree))
           (remove nil?)
           (partition 2)))))

Prepare VCF files of only overlapping structural variants present in both.

(defn overlapping-svs
  [f1 f2 ref params]
  (let [out-files {:out1 (itx/add-file-part f1 "overlap")
                   :out2 (itx/add-file-part f2 "overlap")}]
    (when (itx/needs-run? (vals out-files))
      (write-vcf-w-template f1 out-files (find-overlapping-svs f1 f2 ref params) ref
                            :header-update-fn (merge-headers f2)))

Prepare annotated VCF files to use as background for variant calling and recalibration. Batch variant calling and recalibration with GATK improves resulting calls. This provides a ready to use set of calls to batch with a single sample using 1000 genomes data.

(ns bcbio.variation.utils.background
  (:use [clojure.java.io]
        [bcbio.variation.config :only [load-config]]
        [bcbio.variation.combine :only [combine-variants]]
        [bcbio.variation.filter.intervals :only [select-by-sample]]
        [bcbio.variation.annotation :only [add-gatk-annotations]])
  (:require [clojure.java.shell :as shell]
            [clojure.string :as string]
            [fs.core :as fs]
            [aws.sdk.s3 :as s3]
            [bcbio.run.itx :as itx]))

Combine and annotate VCFs

Download BAM file and index for a sample from 1000 genomes FTP.

(defn- download-sample-bam
  [sample ftp-config out-dir]
  (letfn [(download [url fname]
            (when-not (fs/exists? fname)
              (println "Downloading" url "to" fname)
              (shell/with-sh-dir out-dir
                (shell/sh "wget" "-O" fname url))))]
    (let [dl-url (format (:bam-url ftp-config) sample sample)
          local-file (str (fs/file out-dir (string/replace (fs/base-name dl-url) ".*" "")))]
      (download dl-url local-file)
      (download (str dl-url ".bai") (str local-file ".bai"))

Check if a file exists, also checking for gzipped versions.

(defn- gzip-needs-run?
  (every? itx/needs-run? [x (str x ".gz")]))

Annotate genome sample VCFs with GATK metrics.

(defn- annotate-sample
  [sample-info ref ftp-config prep-dir out-dir]
  (let [final-file (str (fs/file out-dir (format "%s-annotated.vcf" (:sample sample-info))))]
    (when (gzip-needs-run? final-file)
      (let [sample-bam (download-sample-bam (:sample sample-info) ftp-config prep-dir)
            ann-vcf (add-gatk-annotations (:file sample-info) sample-bam ref)]
        (fs/rename ann-vcf final-file)
        (itx/remove-path sample-bam)))

Combine sample VCFs split by chromosome.

(defn- combine-samples
  [sample-info ref out-dir]
  (letfn [(combine-sample [[name xs]]
            {:sample name
             :file (combine-variants (map :file xs) ref :merge-type :full
                                     :out-dir out-dir)})]
    (map combine-sample
         (group-by :sample (flatten sample-info)))))

Subset VCF by sample

Download chromosome VCF from 1000 genomes for processing.

(defn- download-chrom-vcf
  [chrom ftp-config out-dir]
  (letfn [(download-vcf [url fname]
            (println "Downloading" url "to" fname)
            (shell/with-sh-dir out-dir
              (shell/sh "wget" "-O" fname url)
              (shell/sh "gunzip" (str (fs/base-name fname)))))]
    (let [dl-url (format (:vcf-url ftp-config) chrom)
          local-file (str (fs/file out-dir (fs/base-name dl-url)))
          final-file (itx/remove-zip-ext local-file)]
      (when-not (fs/exists? final-file)
        (download-vcf dl-url local-file))

Select samples from input 1000 genomes chromosome VCF.

(defn- select-samples-at-chrom
  [chrom samples ref ftp-config out-dir]
  (let [sample-info (map (fn [x] {:sample x
                                  :file (str (fs/file out-dir (format "%s-%s.vcf" x chrom)))})
    (when (apply itx/needs-run? (map :file sample-info))
      (let [chrom-vcf (download-chrom-vcf chrom ftp-config out-dir)]
        (doseq [sample samples]
          (select-by-sample sample chrom-vcf chrom ref :out-dir out-dir
                            :remove-refcalls true))
        (itx/remove-path chrom-vcf)))

Create combined background file

Prepare combined VCF file with background information from multiple inputs.

(defn prep-combined-background
  [vcfs config]
  (letfn [(maybe-bgzip-vcf [x]
            (first (filter fs/exists? [x (str x ".gz")])))]
    (let [out-dir (get-in config [:dir :out])
          out-file (str (fs/file out-dir (get-in config [:upload :combined-vcf])))]
      (when (gzip-needs-run? out-file)
        (-> (combine-variants (map maybe-bgzip-vcf vcfs) (:ref config)
                              :merge-type :full :out-dir out-dir)
            (fs/rename out-file)))

Tabix prep and upload

Prep VCF for tabix access by bgzipping and indexing.

(defn- tabix-prep-vcf
  (let [out-dir (str (fs/parent vcf))
        vcf-gz (str vcf ".gz")
        tbi-gz (str vcf-gz ".tbi")]
    (shell/with-sh-dir out-dir
      (when (itx/needs-run? vcf-gz)
        (shell/sh "bgzip" (str (fs/base-name vcf))))
      (when (itx/needs-run? tbi-gz)
        (shell/sh "tabix" "-p" "vcf" (str (fs/base-name vcf-gz)))))
    [vcf-gz tbi-gz]))

Upload prepared sample VCF bgzipped and tabix indexed.

(defmulti upload-result-vcf
  (fn [_ config] (keyword (get-in config [:upload :target]))))
(defmethod upload-result-vcf :s3
  [vcf config]
  (let [cred {:access-key (System/getenv "AWS_ACCESS_KEY_ID")
              :secret-key (System/getenv "AWS_SECRET_ACCESS_KEY")}
        bucket (get-in config [:upload :bucket])]
    (when-not (s3/bucket-exists? cred bucket)
      (s3/create-bucket cred bucket)
      (s3/update-bucket-acl cred bucket (s3/grant :all-users :read)))
    (doseq [fname (tabix-prep-vcf vcf)]
      (let [s3-key (format "%s/%s" (get-in config [:upload :folder])
                           (str (fs/base-name fname)))]
        (when-not (s3/object-exists? cred bucket s3-key)
          (s3/put-object cred bucket s3-key (file fname))
          (s3/update-object-acl cred bucket s3-key
                                (s3/grant :all-users :read)))
        (println s3-key)))))
(defn make-work-dirs [config]
  (doseq [dir-name (-> config :dir keys)]
    (let [cur-dir (get-in config [:dir dir-name])]
      (when-not (fs/exists? cur-dir)
        (fs/mkdirs cur-dir)))))
(defn -main [config-file]
  (let [config (load-config config-file)]
    (make-work-dirs config)
    (let [prep-dir (get-in config [:dir :prep])
          samples (map #(select-samples-at-chrom % (:genomes config) (:ref config)
                                                 (:ftp config) prep-dir)
                                        (get-in config [:ftp :chromosomes]))
          combo-samples (combine-samples samples (:ref config) prep-dir)
          ann-samples (map #(annotate-sample % (:ref config) (:ftp config)
                                             prep-dir (get-in config [:dir :out]))
                           (sort-by :sample combo-samples))]
      (doseq [ready-vcf (cons (prep-combined-background ann-samples config)
        (upload-result-vcf ready-vcf config)))))

Summarize a set of calls derived from multiple inputs to help with identifying filtering patterns.

(ns bcbio.variation.utils.callsummary
  (:use [clojure.java.io]
        [bcbio.variation.config :only [load-config]]
        [bcbio.variation.variantcontext :only [parse-vcf get-vcf-iterator
                                               get-vcf-retriever variants-in-region]])
  (:require [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]))
(defn- get-prepped-fname
  [call exp config]
  (let [dirname (get-in config [:dir :prep])]
    (str (file dirname (format "%s-%s-nomnp.vcf" (:sample exp) (:name call))))))

Report details on a variant based on items found in inputs.

(defn report-vrn-summary
  [wtr vc retriever fname-map]
  (letfn [(get-alt-alleles [vc]
            (map #(.getBaseString %) (:alt-alleles vc)))
          (get-match-variants [vc]
            (filter #(= (:start %) (:start vc))
                    (variants-in-region retriever (:chr vc) (:start vc) (:end vc))))]
    (let [hits (get-match-variants vc)]
      (.write wtr (str (string/join ","
                                    [(:chr vc) (:start vc)
                                     (.getBaseString (:ref-allele vc))
                                     (string/join ";" (get-alt-alleles vc))
                                     (string/join ";" (sort (vec (set (map #(get fname-map (:fname %))
                                     (string/join ";" (set (mapcat get-alt-alleles hits)))])

Annotate input VCF with summary details from input files.

(defn annotate-with-callsummary
  [in-file config-file]
  (let [config (load-config config-file)
        exp (-> config :experiments first)
        orig-files (filter fs/exists? (map #(get-prepped-fname % exp config) (:calls exp)))
        fname-map (zipmap orig-files (map :name (:calls exp)))
        retriever (apply get-vcf-retriever (cons (:ref exp) orig-files))
        out-file (str (itx/file-root in-file) ".csv")]
    (with-open [vrn-iter (get-vcf-iterator in-file (:ref exp))
                wtr (writer out-file)]
      (doseq [vc (filter #(empty? (:filters %)) (parse-vcf vrn-iter))]
        (report-vrn-summary wtr vc retriever fname-map)))

Add metrics from Complete Genomics masterVar file to a VCF. This updates a converted VCF from Complete Genomics with metrics information allowing assessment and filtering.

(ns bcbio.variation.utils.cgmetrics
  (:import [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder]
           [org.broadinstitute.sting.utils.codecs.vcf VCFHeader VCFInfoHeaderLine
            VCFHeaderLineCount VCFHeaderLineType])
  (:use [clojure.java.io]
        [ordered.set :only (ordered-set)]
        [bcbio.variation.normalize :only [hg19-map]]
        [bcbio.variation.variantcontext :only [parse-vcf write-vcf-w-template
  (:require [clojure.data.csv :as csv]
            [bcbio.run.itx :as itx]))

Get lookup dictionary of CG variant metrics by position.

(defn- get-masterVar-metrics
  (letfn [(variant-score [line name]
            (let [alleles ["1" "2"]]
              (/ (apply + (map #(Float/parseFloat (get line (format "allele%sVarScore%s" % name)))
                 (count alleles))))
          (allele-balance [line]
            (/ (Float/parseFloat (get line "referenceAlleleReadCount"))
               (Float/parseFloat (get line "totalReadCount"))))]
    (with-open [rdr (reader in-file)]
      (let [csv-iter (drop-while #(< (count %) 3)
                                 (csv/read-csv rdr :separator \tab))
            header (first csv-iter)]
        (reduce (fn [coll xs]
                  (let [line (zipmap header xs)]
                    (assoc coll [(get hg19-map (get line "chromosome"))
                                 (inc (Integer/parseInt (get line "begin")))]
                           {:depth (get line "totalReadCount")
                            :qual-eaf (variant-score line "EAF")
                            :qual-vaf (variant-score line "VAF")
                            :ab (allele-balance line)})))
                {} (rest csv-iter))))))

Provide iterator of variants with CG metrics added

(defn- add-cgmetrics-iter
  [vcf-source metrics]
  (letfn [(update-cgmetrics [vc x]
            (-> (VariantContextBuilder. (:vc vc))
                (.attributes (assoc (:attributes vc)
                               "DPCALL" (:depth x)
                               "AB" (:ab x)
                               "QUALEAF" (:qual-eaf x)
                               "QUALVAF" (:qual-vaf x)))
    (map (fn [vc]
           (if-let [cur-metrics (get metrics [(:chr vc) (:start vc)])]
             (update-cgmetrics vc cur-metrics)
             (:vc vc)))
         (parse-vcf vcf-source))))

Add CG metrics definitions to the VCF input header.

(defn- add-cgmetrics-header
  [_ header]
  (let [new #{(VCFInfoHeaderLine. "DPCALL" 1
                                  VCFHeaderLineType/Integer "Total depth used for calls")
              (VCFInfoHeaderLine. "QUALEAF" 1
                                  "Variant quality under equal allele fraction model (EAF)")
              (VCFInfoHeaderLine. "QUALVAF" 1
                                  "Variant quality under maximum likelihood variable allele fraction model (VAF)")
              (VCFInfoHeaderLine. "AB" 1
                                  VCFHeaderLineType/Float "Allele Balance")}]
    (VCFHeader. (apply ordered-set (concat (.getMetaDataInInputOrder header) new))
                (.getGenotypeSamples header))))

Add metrics from Complete Genomics masterVar file to VCF.

(defn add-cgmetrics
  [vcf-file mastervar-file ref-file & {:keys [out-dir]}]
  (let [out-file (itx/add-file-part vcf-file "cgmetrics" out-dir)]
    (when (itx/needs-run? out-file)
      (with-open [vcf-iter (get-vcf-iterator vcf-file ref-file)]
        (write-vcf-w-template vcf-file {:out out-file}
                              (add-cgmetrics-iter vcf-iter
                                                  (get-masterVar-metrics mastervar-file))
                              ref-file :header-update-fn add-cgmetrics-header)))
(ns bcbio.variation.utils.core
  (:require [bcbio.variation.utils.callsummary :as callsummary]
            [bcbio.variation.utils.gms :as gms]
            [bcbio.variation.utils.illumina :as illumina]
            [bcbio.variation.utils.popfreq :as popfreq]
            [bcbio.variation.utils.summarize :as summarize]
            [bcbio.variation.utils.svmerge :as svmerge]))
(defn -main [cur-type & args]
  (apply (case (keyword cur-type)
           :callsummary callsummary/annotate-with-callsummary
           :gms gms/prepare-gms-vcfs-from-config
           :illumina illumina/prep-illumina-variants
           :popfreq popfreq/annotate-with-popfreq
           :summarize summarize/vcf-to-table-config
           :svmerge svmerge/into-calls)

Build reference Genomic Mappability Score (GMS) variant file. Uses full GMS files to generate VCF of potentially problematic low-GMS regions: http://sourceforge.net/apps/mediawiki/gma-bio/index.php

(ns bcbio.variation.utils.gms
  (:import [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder Allele]
           [org.broadinstitute.sting.utils.variantcontext.writer VariantContextWriterFactory]
           [org.broadinstitute.sting.utils.codecs.vcf VCFHeader
            VCFInfoHeaderLine VCFHeaderLineCount VCFHeaderLineType])
  (:use [clojure.java.io]
        [ordered.map :only [ordered-map]]
        [bcbio.align.ref :only [get-seq-dict]]
        [bcbio.variation.combine :only [combine-variants]]
        [bcbio.variation.config :only [load-config]]
        [bcbio.variation.normalize :only [hg19-map]]
        [bcbio.variation.utils.background :only [make-work-dirs]])
  (:require [clojure.java.shell :as shell]
            [clojure.string :as string]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]))

Download GMS data for all technologies at a chromosome.

(defn- download-chrom-gms-data
  [chrom ftp-config out-dir]
  (letfn [(download-gms [chrom tech]
            (let [dl-url (format (:gms-url ftp-config) (:genome-build ftp-config)
                                 tech chrom)
                  final-file (itx/add-file-part (itx/remove-zip-ext (fs/base-name dl-url))
                                                tech out-dir)
                  dl-file (str final-file ".gz")]
              (when (itx/needs-run? final-file)
                (shell/with-sh-dir out-dir
                  (println (format "Downloading %s to %s" dl-url dl-file))
                  (shell/sh "wget" "-O" dl-file dl-url)
                  (shell/sh "gunzip" dl-file)))
    (into (ordered-map)
          (map (juxt identity (partial download-gms chrom))
               (:technologies ftp-config)))))

Retrieve chromosome, position and GMS score for line in a GMS file

(defn- parse-gms-line
  (let [[chrom pos base _ _ score] (string/split line #"\t")]
    {:chrom chrom
     :pos (Integer/parseInt pos)
     :base base
     :score (Float/parseFloat score)}))
(defn- low-gms-score? [config gms-data]
  (let [thresh (get config :max-gms-score 50.0)]
    (and (> (:score gms-data) 0.0)
         (< (:score gms-data) thresh))))

Prepare variant context from set of GMS scores

(defn- gms-scores-to-vc
  [techs scores]
  (let [contig (get hg19-map (-> scores first :chrom))
        all-pos (filter pos? (map :pos scores))
        pos (if (= 1 (count (set all-pos)))
              (first all-pos)
              (throw (Exception. (str "Multiple positions found: " all-pos))))
        base (->> (map :base scores)
                  (filter #(not= % "*"))
    (when-not (or (zero? pos) (nil? base))
      (-> (VariantContextBuilder. contig contig pos pos [(Allele/create base true)])
          (.attributes (reduce (fn [coll [tech score]]
                                 (assoc coll (str "GMS_" tech) (format "%.1f" (:score score))))
                               {} (map vector techs scores)))
(defn- get-vcf-header [techs]
  (VCFHeader. (set
               (map #(VCFInfoHeaderLine. (format "GMS_%s" %) 1
                                         (format "Genome Mappability Score: %s" %))

Prepare an output VCF of low GMS values at the provided chromosome.

(defn- prepare-vcf-at-chrom
  [chrom ftp-config ref out-dir]
  (let [out-file (file out-dir (format "lowgms-scores-%s.vcf" chrom))]
    (when (itx/needs-run? out-file)
      (let [gms-files (download-chrom-gms-data chrom ftp-config out-dir)
            readers (map reader (vals gms-files))]
        (with-open [writer (VariantContextWriterFactory/create (file out-file)
                                                               (get-seq-dict ref))]
          (.writeHeader writer (get-vcf-header (keys gms-files)))
          (loop [line-iters (->> (map line-seq readers)
                                 (map (fn [x] (drop-while #(.startsWith % "#") x))))]
            (when-not (or (empty? (first line-iters))
                          (some nil? (map first line-iters)))
              (let [cur-gms (map (comp parse-gms-line first) line-iters)]
                (when (some (partial low-gms-score? ftp-config) cur-gms)
                  (when-let [vc (gms-scores-to-vc (keys gms-files) cur-gms)]
                    (.add writer vc))))
              (recur (map rest line-iters))))
          (doseq [x readers]
            (.close x)))
        (doseq [x (vals gms-files)]
          (itx/remove-path x))))
    (str out-file)))

Prepare individual chromosome VCF files with low GMS data by sequencing technology.

(defn prepare-gms-vcfs
  (let [ref (:ref config)
        out-dir (get-in config [:dir :out])
        gms-by-chrom (doall (map #(prepare-vcf-at-chrom % (:ftp config) ref out-dir)
                                  (get-in config [:ftp :chromosomes])))]
    (println gms-by-chrom)
    (combine-variants gms-by-chrom ref :merge-type :full :out-dir out-dir
                      :quiet-out? true))
(defn prepare-gms-vcfs-from-config [config-file]
  (let [config (load-config config-file)]
    (make-work-dirs config)
    (prepare-gms-vcfs config)))

Automate converting Illumina variant calls into GATK-ready format. - Select MAXGT calls from Illumina SNP file (no prior assumption of a variant) - Add sample names to SNP and Indel headers. - Remove illegal gap characters from indel files. - Convert into GRCh37 sorted coordinates. - Merge SNP and Indels into single callset.

(ns bcbio.variation.utils.illumina
  (:require [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.variation.combine :refer [gatk-normalize]]))
(defn- get-illumina-vcf
  [base-dir base-name]
  (-> (fs/file base-dir "Variations" (str base-name "*.vcf"))

Prepare Illumina variants from a standard directory structure. - base-dir: Directory containing Illumina information (will have subdirs like Assembly, Consensus and Variations) - sample-name: The name to include in updated VCF headers - ref-file: Reference file we want to sort to - orig-ref-file: Original reference file (hg19 for Illumina)

(defn prep-illumina-variants
  [base-dir sample-name ref-file orig-ref-file]
  (let [base-dir (fs/expand-home base-dir)
        out-file (str (fs/file base-dir "Variations" (str sample-name ".vcf")))]
    (when (itx/needs-run? out-file)
      (itx/with-temp-dir [out-dir base-dir]
        (let [call {:name "iprep" :file [(get-illumina-vcf base-dir "SNPs")
                                         (get-illumina-vcf base-dir "Indels")]
                    :preclean true :prep true :normalize true
                    :ref orig-ref-file}
              exp {:sample sample-name :ref ref-file}
              out-info (gatk-normalize call exp [] out-dir
                                       (fn [_ x] (println x)))]
          (fs/rename (:file out-info) out-file))))

Associate population allele frequency with a list of variants. Annotates the original file with population frequencies based on rs IDs. Arguments: - original VCF file - attribute ID to include population frequencies in file (ie. GMAF) - Description of new frequency for VCF header - population VCF file - attribute ID to use for frequencies from population file (ie. AF) - reference genome FASTA file

(ns bcbio.variation.utils.popfreq
  (:import [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder]
           [org.broadinstitute.sting.utils.codecs.vcf VCFHeader VCFInfoHeaderLine
            VCFHeaderLineCount VCFHeaderLineType])
  (:use [ordered.set :only [ordered-set]]
        [bcbio.variation.config :only [load-config]]
        [bcbio.variation.combine :only [combine-variants]]
        [bcbio.variation.variantcontext :only [parse-vcf write-vcf-w-template
  (:require [bcbio.run.itx :as itx]))

Retrieve all rsIDs from the input vcf-file

(defn get-rsids
  [vcf-file ref]
  (with-open [vcf-iter (get-vcf-iterator vcf-file ref)]
    (set (remove nil? (map :id (parse-vcf vcf-iter))))))

Retrieve allele frequencies from population VCF for IDs of interest.

(defn get-allele-freqs
  [vcf-file ref want-ids targets]
  (println "Retrieving allele freqs" (count want-ids) targets)
  (with-open [vcf-iter (get-vcf-iterator vcf-file ref)]
    (reduce (fn [coll vc]
              (if (and (not (nil? (:id vc)))
                       (contains? want-ids (:id vc)))
                (assoc coll (:id vc) (zipmap (map :new-id targets)
                                             (map #(get-in vc [:attributes (:orig-id %)] 0.0)
            {} (parse-vcf vcf-iter))))

Lazy generator of variant contexts with added population frequencies.

(defn add-pop-freqs
  [vcf-iter allele-freqs ann-ids]
  (letfn [(update-allele-freq [vc new-freqs]
            (-> (VariantContextBuilder. (:vc vc))
                (.attributes (reduce (fn [coll cur-id]
                                       (assoc coll cur-id (get new-freqs cur-id 0.0)))
                                     (:attributes vc) ann-ids))
    (map #(update-allele-freq % (get allele-freqs (:id %) {}))
         (parse-vcf vcf-iter))))

Add new population frequency information to the VCF input header if needed.

(defn- add-popfreq-header
  (letfn [(header-has-id? [header test-id]
            (contains? (set (map #(when (= "INFO" (.getKey %))
                                    (.getID %)) (.getMetaDataInInputOrder header)))
    (fn [_ header]
      (let [new (->> new-ids
                     (remove #(header-has-id? header (:new-id %)))
                     (map #(VCFInfoHeaderLine. (:new-id %) 1
                                               VCFHeaderLineType/Float (:desc %)))
        (VCFHeader. (apply ordered-set (concat (.getMetaDataInInputOrder header) new))
                    (.getGenotypeSamples header))))))
(defn- add-annotations
  [call ref out-dir]
  (let [orig-vcf (if (coll? (:file call))
                   (combine-variants (:file call) ref :out-dir out-dir :merge-type :full)
                   (:file call))
        out-file (itx/add-file-part orig-vcf "popfreq" out-dir)
        allele-freqs (get-allele-freqs (get-in call [:annotate :file]) ref
                                       (get-rsids orig-vcf ref)
                                       (get-in call [:annotate :targets]))]
    (if (itx/needs-run? out-file)
      (with-open [vcf-iter (get-vcf-iterator orig-vcf ref)]
        (write-vcf-w-template orig-vcf {:out out-file}
                              (add-pop-freqs vcf-iter allele-freqs
                                             (map :new-id (get-in call [:annotate :targets])))
                              (add-popfreq-header (get-in call [:annotate :targets])))))
(defn annotate-with-popfreq [config-file]
  (let [config (load-config config-file)]
    (doseq [exp (:experiments config)]
      (doseq [call (:calls exp)]
        (add-annotations call (:ref exp) (get-in config [:dir :out]))))))

Parse and organize results from Sanger validation into VCF

(ns bcbio.variation.utils.sanger
  (:import [org.broadinstitute.sting.utils.variantcontext Allele
            VariantContextBuilder GenotypeBuilder GenotypesContext]
            VCFHeader VCFInfoHeaderLine VCFHeaderLineCount VCFHeaderLineType
  (:require [clojure.string :as string]
            [clojure.set :as set]
            [clojure.java.io :as io]
            [incanter.excel :as excel]
            [incanter.core :as icore]
            [fs.core :as fs]
            [bcbio.align.ref :refer [get-seq-dict]]
            [bcbio.run.itx :as itx]))

Validates if one end passes and validates our variant, and the other failed to map.

(defn- validates-only-one?
  [for rev]
  (or (and (.startsWith for "Pass")
           (.startsWith rev "Pass")
           (= for rev))
      (and (.startsWith for "Pass")
           (= rev "NA"))
      (and (.startsWith rev "Pass")
           (= for "NA"))))

Validates if both reads pass and match our expected result.

(defn- validates?
  [for rev]
  (or (and (.startsWith for "Pass")
           (.startsWith rev "Pass")
           (= for rev))))

Supports the reference sequence if both fail to validate and at least one call identifies the reference.

(defn- supports-ref?
  [for rev for-val rev-val]
  (and (= for "Fail") (= rev "Fail")
       (not (empty? (set/intersection #{"," "."} (into #{} [for-val rev-val]))))))
(defn- get-validate-allele
  [for rev]
  (-> (if (.startsWith for "Pass") for rev)
      (string/split #",")
      (string/replace "Pass_" )))

Map a row to variant information

(defn- row->varinfo
  [key ref alt group for rev for-val rev-val]
  (let [val (cond (validates-only-one? for rev) (get-validate-allele for rev)
                  (supports-ref? for rev for-val rev-val) ref
                  :else nil)]
    (when val
      (let [[chrom start] (string/split key #"_")
            alts (string/split alt #",")]
        {:chrom chrom
         :start (dec (Integer/parseInt start))
         :ref ref
         :alts alts
         :orig [for rev]
         :group group
         :val val}))))

Return confirmed calls from Sanger calls in input XLS file.

(defn- read-sanger-xls
  (->> (excel/read-xls (str in-file))
       (icore/$map row->varinfo ["key" "vcf ref" "vcf allele" "set"
                                 "forward validated" "reverse validated"
                                 "CE validation forward" "CE validation reverse"])
       (remove nil?)))
(defn- get-header [sample-name]
  (VCFHeader. #{(VCFInfoHeaderLine. "valgroup" 1 VCFHeaderLineType/String
                                    "Validation group")
                (VCFFormatHeaderLine. "GT" 1 VCFHeaderLineType/String

Retrieve chromosome and real position

(defn- chrom-coord
  (let [name->index (into {} (map-indexed (fn [i x] [(.getSequenceName x) i])
                                          (.getSequences ref-dict)))]
    (fn [x]
      [(name->index (:chrom x)) (:start x)])))

Convert a dictionary of validation information into a VariantContext.

(defn- val->vc
  [sample-name x]
  (println x)
  (-> (VariantContextBuilder. (:chrom x) (:chrom x)
                              (inc (:start x)) (+ (count (:ref x)) (:start x))
                              (cons (Allele/create (:ref x) true)
                                    (map #(Allele/create % false) (:alts x))))
      (.attributes {"valgroup" (:group x)})
      (.genotypes (GenotypesContext/create
                    [(GenotypeBuilder/create sample-name
                                             [(Allele/create (:val x) (= (:ref x) (:val x)))])])))

Read directory of Sanger Illumina validation results in Excel format. Convert into VCF with validated and reference variants.

(defn sanger->vcf
  [sanger-dir sample-name ref-file]
  (let [seq-dict (get-seq-dict ref-file)
        out-file (str (fs/file sanger-dir (str sample-name "-sanger-validate.vcf")))]
    (when true ;(itx/needs-run? out-file)
      (with-open [writer (VariantContextWriterFactory/create (io/file out-file)
        (.writeHeader writer (get-header sample-name))
        (doseq [vc (->> (mapcat read-sanger-xls (fs/glob (fs/file sanger-dir "*.xls*")))
                        (sort-by (chrom-coord seq-dict))
                        (map (partial val->vc sample-name)))]
          (.add writer vc))))))

Collapse a multi-sample VCF file into a CSV, R data.frame ready, parameter summary.

(ns bcbio.variation.utils.summarize
  (:use [clojure.java.io]
        [ordered.map :only [ordered-map]]
        [bcbio.variation.callable :only [get-bed-source features-in-region]]
        [bcbio.variation.config :only [load-config]]
        [bcbio.variation.metrics :only [passes-filter?]]
        [bcbio.variation.variantcontext :only [parse-vcf get-vcf-iterator]])
  (:require [clojure.string :as string]
            [clojure.data.csv :as csv]
            [incanter.stats :as istats]
            [bcbio.run.itx :as itx]))
(defn- to-float [x]
    (Float/parseFloat x)
    (catch Exception e (float x))))

Provide sample information from variant genotypes.

(defn- flatten-vc-samples
  [out vc attrs]
  (let [variant-types ["HET" "HOM_VAR"]]
    (letfn [(add-variant-totals [out gs]
              (let [counts (frequencies (map :type gs))]
                (reduce (fn [coll [k v]] (assoc coll k v))
                        out (map (fn [k] [k (get counts k 0)])
            (get-attr-avg [k gs]
              (istats/mean (->> gs
                                (filter #(contains? (set variant-types) (:type %)))
                                (map #(get-in % [:attributes k]))
                                (remove nil?)
                                (map #(to-float %)))))
            (add-attr-avgs [out gs attrs]
              (reduce (fn [coll k] (assoc coll (str k "_sample_mean")
                                          (get-attr-avg k gs)))
                      out attrs))]
      (-> out
          (add-variant-totals (:genotypes vc))
          (add-attr-avgs (:genotypes vc) attrs)))))

Prepare attributes for feeding into flattened table

(defmulti prep-attribute
  (fn [attr value default] attr))

Handle set attributes, where we want to report the total sets identified.

(defmethod prep-attribute "set"
  [_ value default]
   (= value "Intersection") default
   (= value "FilteredInAll") 0
   (nil? value) 0
   :else (count (string/split value #"\-"))))

Handle remaining attributes. For multi-allele VCFs return the first value.

(defmethod prep-attribute :default
  [_ value _]
  (if (instance? java.util.ArrayList value)
    (first value)

Extract attributes of interest from INFO field of variant.

(defn- flatten-vc-attrs
  [out vc attrs defaults]
  (reduce (fn [coll k] (assoc coll k (prep-attribute k (get-in vc [:attributes k])
                                                     (get defaults (keyword k)))))
          out attrs))

Check for presence of the variant in predefined intervals.

(defn- flatten-vc-intervals
  [out vc intervals]
  (letfn [(check-intervals [vc bed-s]
            (if (empty? (features-in-region bed-s (:chr vc) (:start vc) (:end vc))) 0 1))]
    (reduce (fn [coll interval]
              (assoc coll (:name interval) (check-intervals vc (:source interval))))
            out intervals)))

Provide tabular variant representation with provided attributes and sample information.

(defn- flatten-vc
  [config vc]
  (-> (reduce (fn [coll k] (assoc coll k (get vc k)))
              (ordered-map) [:chr :start :id :type :qual])
      (flatten-vc-intervals vc (get config :intervals []))
      (flatten-vc-attrs vc (:attrs config) (get config :attrs-defaults {}))
      (flatten-vc-samples vc (:sample-attrs config))))
(defn- add-interval-retrievers
  [config ref]
  (letfn [(add-int-retriever [coll]
            (assoc coll :source (get-bed-source (:file coll) ref)))]
    (assoc config :intervals (map add-int-retriever (:intervals config)))))

Convert a VCF input to flattened CSV table with provided attributes.

(defn vcf-to-table
  [vcf ref config]
  (let [out-file (str (itx/file-root vcf) "-variantsum.csv")]
    (when (itx/needs-run? out-file)
      (itx/with-tx-files [tx-out-files {:out out-file} [:out] []]
        (with-open [vcf-iter (get-vcf-iterator vcf ref)
                    wtr (writer (:out tx-out-files))]
          (doseq [[i out] (map-indexed vector
                                       (map (partial flatten-vc (add-interval-retrievers config ref))
                                            (filter passes-filter? (parse-vcf vcf-iter))))]
            (when (= i 0)
              (csv/write-csv wtr [(map name (keys out))]))
            (csv/write-csv wtr [(vals out)])
            (.flush wtr)))))

Prep a set of VCF to table conversions from input configuration file.

(defn vcf-to-table-config
  (let [config (load-config config-file)]
      (for [exp (:experiments config)]
        (for [call (:calls exp)]
          (vcf-to-table (:file call) (:ref exp) (:summary call))))))))

Merge structural variants into a set of smaller SNP/indel calls. Different software detects larger structural variants, requiring a final preparation step combining structural and standard calls, removing any smaller calls which overlap large insertions and deletions. This is currently tuned for fosmid merging and reconstruction but has knobs to generalize for diploid merging with appropriate phasing of variants.

(ns bcbio.variation.utils.svmerge
  (:require [clojure.java.io :as io]
            [fs.core :as fs]
            [bcbio.run.itx :as itx]
            [bcbio.variation.combine :as combine]
            [bcbio.variation.filter.intervals :as intervals]
            [bcbio.variation.normalize :as normalize]
            [bcbio.variation.structural :as structural]
            [bcbio.variation.variantcontext :as gvc]))

Create a BED file of structural variant regions from input VCF.

(defn- sv->bed
  [sv-file ref-file]
  (let [out-file (str (itx/file-root sv-file) "-regions.bed")]
    (with-open [wtr (io/writer out-file)]
      (doseq [vc (structural/parse-vcf-sv sv-file ref-file)]
        (when (contains? #{:DEL :INS} (:sv-type vc))
          (.write wtr (format "%s\t%s\t%s\n" (:chr vc) (dec (:start-ci vc)) (:end-ci vc))))))

Merge structural variants into calls, updating variants and BED regions to assess.

(defn into-calls
  [call-file region-file sv-file ref-file]
  (let [out-files {:calls (itx/add-file-part call-file "wsvs")
                   :regions (itx/add-file-part region-file "wsvs")}]
    (when (itx/needs-run? (vals out-files))
      (let [sample (first (intervals/get-sample-names call-file))
            svready-file (normalize/prep-vcf sv-file ref-file sample
                                             :config {:prep-sv-genotype true
                                                      :fix-sample-header true
                                                      :prep-allele-count 1})
            sv-bed (sv->bed svready-file ref-file)
            call-safesv (-> (intervals/select-by-sample sample call-file nil ref-file
                                                        :ext "safesv"
                                                        :exclude-intervals sv-bed))]
        (-> (combine/combine-variants [call-safesv svready-file] ref-file
                                      :merge-type :full)
            (fs/rename (:calls out-files)))
        (-> (intervals/combine-multiple-intervals region-file [] ref-file
                                                  :combine-rule :union
                                                  :more-beds [sv-bed])
            (fs/rename (:regions out-files)))))

Combine calls from a multiple technology comparison to produce a set of final variants plus a list for validation. The inputs are: - Target technology: The outlier technology for picking additional targets. This should be well understood enough to set threshold for validation. - Validation info: Details for prepping a set of variants for validation - thresholds: min and max thresholds for validation - approach: validation along the full range of thresholds, or validate top variants - count: total number of variants for validation. Produces: - Final calls - calls that overlap in all of the technologies - calls that overlap in all but the target, where the target technology quality is below the validation threshold. - Validate calls - calls that overlap in all but the target and fall below configurable threshold. These are either sampled from the distribution or picked off the top.

(ns bcbio.variation.validate
  (:use [ordered.map :only [ordered-map]]
        [bcbio.variation.combine :only [combine-variants]]
        [bcbio.variation.multiple :only [prep-cmp-name-lookup
        [bcbio.variation.report :only [count-variants]]
        [bcbio.variation.variantcontext :only [parse-vcf get-vcf-iterator
  (:require [bcbio.run.broad :as broad]
            [bcbio.run.itx :as itx]))

Base functionality for subsetting a file with SelectVariants.

(defn- select-by-general
  [select-args ext in-vcf ref]
  (let [file-info {:out-vcf (itx/add-file-part in-vcf ext)}
        args (concat ["-R" ref
                      "--variant" in-vcf
                      "-o" :out-vcf]
    (broad/run-gatk "SelectVariants" args file-info {:out [:out-vcf]})
    (:out-vcf file-info)))

Subset a VCF file with specific hard filters.

(defn select-by-filters
  [filters in-vcf ext ref]
  (select-by-general (interleave (repeat "--select_expressions") filters)
                     ext in-vcf ref))

Validation targets by random sampling

Subset a VCF file with a random number of variants.

(defn select-by-random
  [n in-vcf ref]
  (let [total (count-variants in-vcf ref (fn [x] true)) 
        frac (if (pos? total) (float (/ n total)) 0.0)]
    (select-by-general ["--select_random_fraction" frac] "randsubset" in-vcf ref)))

Select set of variants to validate from total set of potentials.

(defmulti get-to-validate
  (fn [in-vcf finalizer ref] (keyword (get-in finalizer [:params :validate :approach]))))
(defmethod get-to-validate :random
  [in-vcf finalizer ref]
  (select-by-random (get-in finalizer [:params :validate :count]) in-vcf ref))

Provide function to extract metric used in sorting from a variant context. Returns a list with the first being the count of items found in set overlap and the second the metric to sort by. Currently only handles a single metric for sorting.

(defn- extract-sort-metrics
  {:pre [(= 1 (count (get-in finalizer [:params :validate :top-metric])))]}
  (let [metric (get-in finalizer [:params :validate :top-metric 0 :name])
        mod (get-in finalizer [:params :validate :top-metric 0 :mod])]
    (fn [vc]
      (let [base (-> vc :attributes (get metric "-1000.0") (Float/parseFloat))]
        [(count (re-seq (re-pattern (:target finalizer))
                           (-> vc :attributes (get "set" ""))))
         (if mod (* mod base) base)]))))

Retrieve top variants sorted by metrics of interest.

(defn- get-top-variants
  [vcf-file finalizer ref]
  (with-open [vcf-iter (get-vcf-iterator vcf-file ref)]
    (let [metric-gettr (extract-sort-metrics finalizer)]
       (map (juxt :chr :start)
            (take (get-in finalizer [:params :validate :count])
                   (sort-by metric-gettr (parse-vcf vcf-iter)))))))))
(defmethod get-to-validate :top
  [in-vcf finalizer ref]
  (let [out-file (itx/add-file-part in-vcf "topsubset")]
    (when (itx/needs-run? out-file)
      (let [to-keep (get-top-variants in-vcf finalizer ref)]
        (with-open [vcf-iter (get-vcf-iterator in-vcf ref)]
          (write-vcf-w-template in-vcf {:out out-file}
                                (map :vc
                                     (filter #(contains? to-keep ((juxt :chr :start) %))
                                             (parse-vcf vcf-iter)))

Prepare files of calls: finalized and validation targets.

(defn get-final-and-tovalidate
  [cmps finalizer config]
  (let [cmps-by-name (prep-cmp-name-lookup (vals cmps) :remove-mods? true
                                           :ignore #{"all" "validate"})
        ref (-> cmps-by-name vals first :exp :ref)
        multi-prep (multiple-overlap-analysis cmps-by-name config (:target finalizer)
                                              :dirname "validate")]
     :final (if-let [keep-filters (get-in finalizer [:params :filters :keep])]
              (combine-variants [(:true-positives multi-prep)
                                 (select-by-filters keep-filters (:false-negatives multi-prep)
                                                    "keepsubset" ref)]
                                ref :merge-type :full)
              (:true-positives multi-prep))
     :validate (get-to-validate
                (let [orig (:target-overlaps multi-prep)]
                  (if-let [val-filters (get-in finalizer [:params :filters :validate])]
                    (select-by-filters val-filters orig "checksubset" ref)
                finalizer ref))))

High level pipeline entry for producing final and to-validate call sets.

(defn pipeline-validate
  [cmps finalizer exp config]
  {:c-files (get-final-and-tovalidate cmps finalizer config)
   :c1 {:name (:target finalizer)}
   :c2 {:name "validate"}
   :exp exp :dir (:dir config)})

Helper functions to retrieve information from GATK VariantContext objects, which represent variant data stored in VCF files.

(ns bcbio.variation.variantcontext
  (:import [org.broad.tribble.index IndexFactory]
           [org.broad.tribble AbstractFeatureReader]
           [org.broad.tribble.readers AsciiLineReader]
            VCFCodec VCFUtils VCFHeader VCFFilterHeaderLine]
           [org.broadinstitute.sting.utils.variantcontext VariantContextBuilder
            GenotypeBuilder GenotypesContext]
           [org.broadinstitute.sting.utils.variantcontext.writer VariantContextWriterFactory
           [org.broadinstitute.sting.gatk.refdata.tracks RMDTrackBuilder]
           [org.broadinstitute.sting.gatk.arguments ValidationExclusion$TYPE]
           [org.apache.log4j Logger]
           [java.util EnumSet])
  (:use [clojure.java.io]
        [clojure.set :only [intersection union]]
        [lazymap.core :only [lazy-hash-map]]
        [ordered.set :only [ordered-set]]
        [bcbio.align.ref :only [get-seq-dict]])
  (:require [clojure.string :as string]
            [lonocloud.synthread :as ->]
            [bcbio.run.itx :as itx]))

Represent VariantContext objects

Provide simple map-based access to important attributes of VariantContexts. There are 3 useful levels of abstraction:

  • VariantContext: Details about a variation. This captures a single line in a VCF file
  • Genotype: An individual genotype for a sample, at a variant position.
  • Allele: The actual alleles at a genotype.

Represent a sample genotype including alleles. :genotype stores the original java genotype object for direct access.

(defn from-genotype
   :sample-name (.getSampleName g)
   :qual (.getPhredScaledQual g)
   :type (-> g .getType .name)
   :phased? (.isPhased g)
   :attributes (merge {"DP" (.getDP g) "AD" (vec (.getAD g))
                       "GQ" (.getGQ g) "PL" (vec (.getPL g))}
                      (into {} (.getExtendedAttributes g)))
   :alleles (vec (.getAlleles g))
   :genotype g))

Provide a top level map of information from a variant context. :vc stores the original java VariantContext object for direct access.

(defn from-vc
   :chr (.getChr vc)
   :start (.getStart vc)
   :end (.getEnd vc)
   :id (when (.hasID vc) (.getID vc))
   :ref-allele (.getReference vc)
   :alt-alleles (vec (.getAlternateAlleles vc))
   :type (-> vc .getType .name)
   :filters (set (.getFilters vc))
   :attributes (into {} (.getAttributes vc))
   :qual (.getPhredScaledQual vc)
   :num-samples (.getNSamples vc)
   :genotypes (map from-genotype
                   (-> vc .getGenotypes .toArray vec))
   :vc vc))

Parsing VCF files

Create a Tribble FeatureSource for VCF file. Handles indexing and parsing of VCF into VariantContexts. We treat gzipped files as tabix indexed VCFs.

(defn get-vcf-source
  [in-file ref-file & {:keys [ensure-safe codec]}]
  (let [cur-codec (if (nil? codec) (VCFCodec.) codec)]
    (if (.endsWith in-file ".gz")
      (AbstractFeatureReader/getFeatureReader in-file cur-codec false)
      (let [validate (when (false? ensure-safe)
            idx (.loadIndex (RMDTrackBuilder. (get-seq-dict ref-file) nil validate)
                            (file in-file) cur-codec)]
        (AbstractFeatureReader/getFeatureReader (.getAbsolutePath (file in-file)) cur-codec idx)))))

Create an iterator over VCF VariantContexts.

(defn get-vcf-iterator
  [in-file ref-file & {:keys [ensure-safe codec]}]
  (.iterator (get-vcf-source in-file ref-file :ensure-safe ensure-safe
                             :codec codec)))

Retrieve variants located in potentially multiple variant files

(defn variants-in-region
  ([retriever vc]
     (variants-in-region retriever (:chr vc) (:start vc) (:end vc)))
  ([retriever space start end]
     (letfn [(get-vcs-in-source [[source fname]]
               (with-open [vcf-iter (.query source space start end)]
                 (doall (map #(assoc (from-vc %) :fname fname) (iterator-seq vcf-iter)))))]
       (mapcat get-vcs-in-source (map vector (:sources retriever) (:fnames retriever))))))

Look for matching variants present in any of the variant files.

(defn has-variants?
  [retriever space start end ref alt]
  (some #(and (= start (:start %))
              (= end (:end %))
              (= ref (:ref-allele %))
              (seq (intersection (set (:alt-alleles %)) (set alt))))
        (variants-in-region retriever space start end)))
(defrecord VariantRetriever [sources fnames]
  (close [_]
    (doseq [x sources]
      (.close x))))

Indexed variant file retrieval for zero to multiple files with clean handle closing.

(defn get-vcf-retriever
  [ref & vcf-files]
  (let [fnames (remove nil? vcf-files)]
    (VariantRetriever. (map #(get-vcf-source % ref) fnames)

Lazy iterator of VariantContext information from VCF file.

(defn parse-vcf
  (map from-vc (iterator-seq (.iterator vcf-source))))

Retrieve parser to do line-by-line parsing of VCF files.

(defn get-vcf-line-parser
  (let [codec (VCFCodec.)]
    (.readHeader codec vcf-reader)
    (fn [line]
      (from-vc (.decode codec line)))))
(defn- line-vcf-parser
  (let [parser (with-open [rdr (AsciiLineReader. (input-stream vcf))]
                 (get-vcf-line-parser rdr))]
    (map parser (drop-while #(.startsWith % "#") (line-seq (reader vcf))))))

Retrieve header from input VCF file.

(defn get-vcf-header
  (with-open [vcf-reader (AsciiLineReader. (input-stream vcf-file))]
    (.readHeader (VCFCodec.) vcf-reader)))

Writing VCF files

(defn merge-headers
  [& merge-files]
  (fn [_ header]
    (VCFHeader. (VCFUtils/smartMergeHeaders (cons header (map get-vcf-header merge-files))
                                            (Logger/getLogger ))
                (.getGenotypeSamples header))))

Update a header with new INFO and FILTER metadata.

(defn header-w-md
  [header new-md]
  (VCFHeader. (apply ordered-set (concat (.getMetaDataInInputOrder header) new-md))
              (.getGenotypeSamples header)))

Write VCF output files starting with an original input template VCF. Handles writing to multiple VCF files simultaneously with the different file handles represented as keywords. This allows lazy splitting of VCF files: vc-iter is a lazy sequence of (writer-keyword variant-context). out-file-map is a map of writer-keywords to output filenames.

(defn write-vcf-w-template
  [tmpl-file out-file-map vc-iter ref & {:keys [header-update-fn]}]
  (letfn [(make-vcf-writer [f ref]
            (VariantContextWriterFactory/create (file f) (get-seq-dict ref)
                                                (EnumSet/of Options/INDEX_ON_THE_FLY
          (convert-to-output [info]
            [(if (and (coll? info) (= 2 (count info))) (first info) :out)
             (if (coll? info) (last info) info)])]
    (itx/with-tx-files [tx-out-files out-file-map (keys out-file-map) [".idx"]]
      (let [tmpl-header (get-vcf-header tmpl-file)
            writer-map (zipmap (keys tx-out-files)
                               (map #(make-vcf-writer % ref) (vals tx-out-files)))]
        (doseq [[key out-vcf] writer-map]
          (.writeHeader out-vcf (if-not (nil? header-update-fn)
                                  (header-update-fn key tmpl-header)
        (doseq [[fkey item] (map convert-to-output vc-iter)]
          (let [ready-vc (if (and (map? item) (contains? item :vc)) (:vc item) item)]
            (when-not (nil? ready-vc)
              (.add (get writer-map fkey) ready-vc))))
        (doseq [x (vals writer-map)]
          (.close x))))))
(defn- add-filter-header
  [fname fdesc]
  (fn [_ header]
    (header-w-md header
                 #{(VCFFilterHeaderLine. fname fdesc)})))
(defn- maybe-add-filter
  [fname passes? vc]
  (if (passes? vc)
    (:vc vc)
    (-> (VariantContextBuilder. (:vc vc))
        (.filters (union #{fname} (:filters vc)))

Write VCF file from input using a filter function.

(defn write-vcf-from-filter
  [vcf ref out-part fname fdesc passes?]
  (let [out-file (itx/add-file-part vcf out-part)]
    (when (itx/needs-run? out-file)
      (with-open [vcf-iter (get-vcf-iterator vcf ref)]
        (write-vcf-w-template vcf {:out out-file}
                              (map (partial maybe-add-filter fname passes?) (parse-vcf vcf-iter))
                              :header-update-fn (add-filter-header fname fdesc))))

Select variants from an input file with supplied filter.

(defn select-variants
  [in-file passes? file-out-part ref-file & {:keys [out-dir]}]
  (let [out-file (itx/add-file-part in-file file-out-part out-dir)]
    (when (itx/needs-run? out-file)
      (with-open [in-iter (get-vcf-iterator in-file ref-file)]
        (write-vcf-w-template in-file {:out out-file}
                              (map :vc (filter passes? (parse-vcf in-iter)))

Genotype manipulation

Create Genotype objects for a samples with defined alleles, optionally including attributes from the parent genotype. gs is a list of genotype dictionaries, with the alleles modified to the new values desired. This converts them into GATK-ready objects.

(defn create-genotypes
  [gs & {:keys [attrs]}]
  (let [all-attrs [["PL" seq (fn [x _ v] (.PL x (int-array v)))]
                   ["PVAL" identity (fn [x k v] (.attribute x k v))]
                   ["DP" identity (fn [x _ v] (.DP x v))]
                   ["AD" seq (fn [x _ v] (.AD x (int-array v)))]]]
    (letfn [(alleles->genotype [g]
              (-> (GenotypeBuilder. (:sample-name g) (:alleles g))
                  (->/for [[attr val-fn add-fn] all-attrs]
                    (->/when (contains? attrs attr)
                      (->/when-let [val (val-fn (get-in g [:attributes attr]))]
                        (#(add-fn % attr val)))))
      (->> gs
           (map alleles->genotype)

Convert variant context genotypes into all reference calls (0/0).

(defn genotypes->refcall
  [vc & {:keys [attrs num-alleles]}]
  (letfn [(make-refcall [g]
            (assoc g :alleles
                   (repeat (if (nil? num-alleles)
                             (count (:alleles g))
                           (:ref-allele vc))))]
    (-> (VariantContextBuilder. (:vc vc))
        (.genotypes (create-genotypes (map make-refcall (:genotypes vc))
                                      :attrs attrs))
(defn -main [vcf ref approach]
  (with-open [vcf-iter (get-vcf-iterator vcf ref)]
    (letfn [(item-iter []
              (case approach
                "line" (map :vc (line-vcf-parser vcf))
                "gatk" (iterator-seq (.iterator vcf-iter))
                "orig" (map :vc (parse-vcf vcf-iter))))]
      (write-vcf-w-template vcf {:out "vctest.vcf"} (item-iter) ref)
      ;; (doseq [[i x] (map-indexed vector (item-iter))]
      ;;   (when (= 0 (mod i 10000))
      ;;      (println x))))))

Simple walker to parse a VCF file and display distribution of call quality scores

(ns bcbio.variation.vcfwalker
  (:import [bcbio.variation BaseVariantWalker])
  (:use [bcbio.variation.variantcontext :only [from-vc]])
  (:require ;[incanter.charts :as icharts]
            [incanter.core :as icore])
   :name bcbio.variation.vcfwalker.VcfSimpleStatsWalker
   :extends bcbio.variation.BaseVariantWalker))

Retrieve VariantContexts and extract the variant quality score.

(defn -map
  [this tracker ref context]
  (if-not (nil? tracker)
    (for [vc (map from-vc
                    (.getValues tracker (.variants (.invrns this))
                                (.getLocation context)))]
      (:qual vc))))

Initialize an empty list to collect our quality information

(defn -reduceInit

Add current quality information to the collected list.

(defn -reduce
  [this cur coll]
  (if-not (nil? cur)
    (vec (flatten [coll cur]))

Plot histogram of quality scores.

(defn -onTraversalDone
  [this result]
  (println result))

Provide basic persistence of user files and processes in local DB.

(ns bcbio.variation.web.db
  (:import [com.mchange.v2.c3p0 ComboPooledDataSource])
  (:require [clojure.string :as string]
            [clojure.java.jdbc :as sql]
            [fs.core :as fs]))
(defn get-sqlite-db [fname & {:as opts}]
  "Retrieve SQLite database connection"
   {:classname "org.sqlite.JDBC"
    :subprotocol "sqlite"
    :subname fname}
(defn get-sqlite-db-pool [fname]
  (let [spec (get-sqlite-db fname)]
    {:datasource (doto (ComboPooledDataSource.)
                   (.setDriverClass (:classname spec))
                   (.setJdbcUrl (str "jdbc:" (:subprotocol spec) ":"
                                     (:subname spec))))}))
(defn- create-user-tables []
  (sql/create-table :analysis
                    [:analysis_id :text "PRIMARY KEY"]
                    [:username :text]
                    [:type :text]
                    [:description :text]
                    [:location :text]
                    [:created :timestamp "NOT NULL" "DEFAULT CURRENT_TIMESTAMP"])
  (sql/create-table :files
                    [:analysis_id :text]
                    [:name :text]
                    [:location :text]))

Prepare input database for storing user and file information in SQLite.

(defn prepare-web-db
  (when-not (fs/exists? (fs/parent db-file))
    (fs/mkdirs (fs/parent db-file)))
  (when-not (fs/exists? db-file)
    (sql/with-connection (get-sqlite-db db-file :create true)

Retrieve list of analyses run for a specific user and analysis type

(defn get-analyses
  [username atype db-file]
  (sql/with-connection (get-sqlite-db db-file)
    (sql/with-query-results rows
      ["SELECT * FROM analysis WHERE username = ? AND type = ? ORDER BY created DESC"
       username atype]
      (vec (map #(assoc % :created (java.sql.Timestamp. (:created %))) rows)))))

Add an analysis and associated files to the database.

(defmulti add-analysis
  (fn [info _] (:type info)))
(defmethod add-analysis :scoring
  [info db-file]
  (letfn [(get-analysis-files [info]
            (map (fn [[k f]]
                   {:analysis_id (:analysis_id info)
                    :name (name k)
                    :location (string/replace f (str (:location info) "/") "")})
                 (:files info)))]
    (sql/with-connection (get-sqlite-db db-file)
       (sql/insert-record :analysis (-> info
                                        (dissoc :files)
                                        (assoc :created (java.sql.Timestamp. (.getTime (java.util.Date.))))))
       (doseq [x (get-analysis-files info)]
         (sql/insert-record :files x))))))

Perform X Prize scoring workflow, handling comparison of contestant input with reference.

(ns bcbio.variation.workflow.xprize
  (:import [java.util UUID])
  (:use [clojure.java.io]
        [bcbio.variation.config :only [traceback-to-log load-config]]
        [bcbio.variation.compare :only [variant-comparison-from-config]]
        [bcbio.variation.combine :only [combine-variants]]
        [bcbio.variation.normalize :only [pick-best-ref]]
        [bcbio.variation.report :only [prep-scoring-table]])
  (:require [clj-yaml.core :as yaml]
            [doric.core :as doric]
            [fs.core :as fs]
            [hiccup.core :as hiccup]
            [net.cgrand.enlive-html :as html]))

Create configuration for processing inputs using references supplied in config.

(defn create-work-config
  [work-info config]
  (if-not (fs/exists? (:dir work-info))
    (fs/mkdirs (:dir work-info)))
  (let [config-file (str (fs/file (:dir work-info) "process.yaml"))
        ref (first (filter #(= (:sample %) (:comparison-genome work-info))
                           (:ref config)))
        contestant-vcf (if-let [x (:variant-file work-info)]
                         (str (fs/file x))
                         (:default-compare ref))]
    (->> {:dir {:out (str (fs/file (:dir work-info) "grading"))
                :prep (str (fs/file (:dir work-info) "grading" "prep"))}
          :experiments [{:sample (:sample ref)
                         :ref (:genome ref)
                         :intervals (:intervals ref)
                         :approach "grade"
                         :calls [{:name "reference"
                                  :file (:variants ref)
                                  :normalize false}
                                 {:name "contestant"
                                  :prep true
                                  :preclean true
                                  :remove-refcalls true
                                  :ref (pick-best-ref contestant-vcf (cons (:genome ref)
                                                                           (:genome-alts ref)))
                                  :file contestant-vcf
                                  :intervals (if-let [x (:region-file work-info)]
                                               (str (fs/file x))
                                               (:intervals ref))}]}]}
         (spit config-file))

Output text summary file with scoring information.

(defn- write-scoring-summary
  [work-info comparison]
  (let [summary-file (str (fs/file (:dir work-info)
                                   (format "%s-scoring.tsv"
                                           (get-in comparison [:summary :sample]))))]
    (with-open [wtr (writer summary-file)]
      (doseq [x (prep-scoring-table (:metrics comparison)
                                    (get-in comparison [:summary :sv]))]
        (.write wtr (format "%s\t%s\n" (:metric x) (:value x)))))

Generate a summary table of scoring results.

(defn- html-summary-table
  (let [scoring-table (prep-scoring-table (:metrics comparison)
                                          (get-in comparison [:summary :sv]))]
    (apply str
           (-> (str (doric/table ^{:format doric/html} [:metric :value] scoring-table))
               (html/transform [:table] (html/set-attr :class "table table-condensed"))

Generate summary of scoring results for display.

(defn- write-html-scoring-summary
  [work-info comparison]
  (let [out-file (str (file (:dir work-info) "scoring-summary.html"))]
    (spit out-file
           [:h3 "Summary"]
           [:div {:id "score-table"}
            (html-summary-table comparison)]
           [:h3 "Variant files in VCF format"]
           [:div {:id "variant-file-download"}
             (for [[key txt] [["concordant" "Concordant variants"]
                              ["discordant" "Discordant variants"]
                              ["discordant-missing" "Missing variants"]
                              ["phasing" "Variants with phasing errors"]]]
               [:li [:a {:href (format "/dataset/%s/%s" (:id work-info) key)} txt]])]]))

Merge standard and structural variant outputs into final set of upload files.

(defn- prepare-final-files
  (letfn [(merge-files-into [comparison orig-kw addin-kw]
            (let [ref (get-in comparison [:exp :ref])
                  orig (get-in comparison [:c-files orig-kw])
                  addin (get-in comparison [:c-files addin-kw])
                  combine-vcf (combine-variants [orig addin] ref :merge-type :full
                                                :quiet-out? true)]
              (fs/rename combine-vcf orig)
              (fs/rename (str combine-vcf ".idx") (str orig ".idx"))))]
      (merge-files-into comparison :concordant :sv-concordant)
      (merge-files-into comparison :discordant :sv-contestant-discordant)
      (merge-files-into comparison :discordant-missing :sv-reference-discordant)))

Run X Prize scoring analysis from provided work information.

(defn run-scoring-analysis*
  [work-info rclient config-file]
  (let [comparison (first (variant-comparison-from-config config-file))]
    (prepare-final-files comparison)
    {:work-info work-info
     :comparison (-> comparison
                     (assoc-in [:c-files :summary] (write-scoring-summary work-info comparison))
                     (assoc-in [:c-files :summary-html] (write-html-scoring-summary work-info comparison)))}))

Safe running of X Prize workflow with exception catching.

(defn run-scoring-analysis
  [work-info rclient input-config]
  (prn input-config)
  (let [config-file (create-work-config work-info input-config)]
      (run-scoring-analysis* work-info rclient config-file)
      (catch Exception e
          (traceback-to-log e (load-config config-file))
          (throw e))))))

Prep directory for scoring analysis.

(defn prep-scoring
  [params config]
  (let [tmp-dir (file (get-in config [:dir :work]) "score")
        work-id (str (UUID/randomUUID))
        cur-dir (file tmp-dir work-id)]
    (fs/mkdirs cur-dir)
    (-> params
        (assoc :id work-id)
        (assoc :dir (str cur-dir)))))