From 75cf8f8c2a12e2dfdb365d9248a50d137976bb5f Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 2 Jul 2012 01:01:25 -0400 Subject: [PATCH 001/168] Initial per-sample line filtering. --- scripts/vcf_sample_filter.py | 42 ++++++++++++++++++++++++++++++++++++ vcf/parser.py | 34 ++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 scripts/vcf_sample_filter.py diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py new file mode 100644 index 0000000..fb987ff --- /dev/null +++ b/scripts/vcf_sample_filter.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +import sys + +import vcf +#from parser import Reader, Writer + +class SampleFilter(object): + def __init__(self, infile, outfile, arg=None): + self.parser = Reader(filename=infile) + self.samples = self.parser.samples + self.outfile = outfile + if arg is not None: + self.set_filters(arg) + self.write() + else: + print "Samples:" + for idx, val in enumerate(self.list_samples()): + print "{0}: {1}".format(idx, val) + + def list_samples(self): + return self.samples + + def set_filters(self, filters, invert=False): + filters = filters.split(",") + if invert: + #filters = + pass + + self.parser.set_sample_filter(filters) + + def write(self): + #writer = Writer(stream=self.outfile, template=self.parser) + test_row = self.parser.next() + print test_row.samples + +if __name__ == "__main__": + if len(sys.argv) < 4: + print "Usage: script.py infile outfile [filt1,filt2]" + if len(sys.argv) < 3: + raise SystemExit + + filt = SampleFilter(*sys.argv[1:]) diff --git a/vcf/parser.py b/vcf/parser.py index adafbd0..11c8454 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -172,7 +172,7 @@ def read_meta(self, meta_string): class Reader(object): """ Reader for a VCF v 4.0 file, an iterator returning ``_Record objects`` """ - def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False): + def __init__(self, fsock=None, filename=None, compressed=False, 
prepend_chr=False, samp_filter=None): """ Create a new Reader for a VCF file. You must specify either fsock (stream) or filename. Gzipped streams @@ -215,6 +215,7 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self._prepend_chr = prepend_chr self._parse_metainfo() self._format_cache = {} + self.set_sample_filter(samp_filter) def __iter__(self): return self @@ -319,6 +320,34 @@ def _parse_info(self, info_str): return retdict + def set_sample_filter(self, samp_filter): + self._samp_filter = None + if samp_filter is None: + return None + if isinstance(samp_filter, basestring): + samp_filter = samp_filter.split(",") + # if filters aren't ints, try to convert to sample indices + try: + samp_filter = [int(x) for x in samp_filter] + except ValueError: + try: + samp_filter = [self._sample_indexes[samp] for samp in samp_filter] + except KeyError: + # TODO raise RuntimeWarning about sample not found + pass + self._samp_filter = samp_filter + + def _filter_samples(self, samples): + if self._samp_filter is None: + return samples + filt = self._samp_filter + self.samples = [val for idx,val in enumerate(self.samples) if idx not in filt] + samples = [val for idx,val in enumerate(samples) if idx not in filt] + # FIXME this loop doesn't alter the originals + #for samplist in (self.samples, samples): + #samplist = [val for idx,val in enumerate(samplist) if idx not in filt] + return samples + def _parse_sample_format(self, samp_fmt): """ Parse the format of the calls in this _Record """ samp_fmt = make_calldata_tuple(samp_fmt.split(':')) @@ -351,6 +380,9 @@ def _parse_samples(self, samples, samp_fmt, site): samp_fmt = self._format_cache[samp_fmt] + # filter samples + samples = self._filter_samples(samples) + if cparse: return cparse.parse_samples( self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site) From 18deb2a523eaa127b8a04e50639205e840a205c8 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 2 Jul 2012 13:56:25 -0400 
Subject: [PATCH 002/168] Improved samp filter performance, allow invert. --- scripts/vcf_sample_filter.py | 39 ++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index fb987ff..a3b2fb6 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -1,16 +1,18 @@ #!/usr/bin/env python import sys +import warnings -import vcf +from vcf import Reader, Writer #from parser import Reader, Writer class SampleFilter(object): - def __init__(self, infile, outfile, arg=None): + def __init__(self, infile, outfile, filters=None, **kwarg): self.parser = Reader(filename=infile) self.samples = self.parser.samples + self.smp_idx = dict([(v,k) for k,v in enumerate(self.samples)]) self.outfile = outfile - if arg is not None: - self.set_filters(arg) + if filters is not None: + self.set_filters(filters, **kwarg) self.write() else: print "Samples:" @@ -20,11 +22,29 @@ def __init__(self, infile, outfile, arg=None): def list_samples(self): return self.samples - def set_filters(self, filters, invert=False): - filters = filters.split(",") + def set_filters(self, filters, invert=False, **kwarg): + filt_l = filters.split(",") + filt_s = set(filt_l) + if len(filt_s) < len(filt_l): + warnings.warn("Non-unique filters, ignoring", RuntimeWarning) + def filt2idx(item): + """Convert filter to valid sample index""" + try: + item = int(item) + except ValueError: + # not an idx, check if it's a value + return self.smp_idx.get(item) + else: + # is int, check if it's an idx + if item < len(self.samples): + return item + filters = set(filter(lambda x: x is not None, map(filt2idx, filt_s))) + if len(filters) < len(filt_s): + # TODO print the filters that were ignored + warnings.warn("Invalid filters, ignoring", RuntimeWarning) + if invert: - #filters = - pass + filters = set(xrange(len(self.samples))).difference(filters) self.parser.set_sample_filter(filters) @@ -34,9 +54,12 @@ def 
write(self): print test_row.samples if __name__ == "__main__": + # TODO implement argparse if len(sys.argv) < 4: print "Usage: script.py infile outfile [filt1,filt2]" if len(sys.argv) < 3: raise SystemExit filt = SampleFilter(*sys.argv[1:]) + print "now invert:" + filt2 = SampleFilter(*sys.argv[1:], invert=True) From 8477e6f0ba788f02b8673e1d852a7c1c52f3e837 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 2 Jul 2012 16:28:52 -0400 Subject: [PATCH 003/168] Args can be provided all at once or in sequence. The latter style (filt3) allows semi-interactive at Python prompt. --- scripts/vcf_sample_filter.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index a3b2fb6..821b152 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -6,14 +6,17 @@ #from parser import Reader, Writer class SampleFilter(object): - def __init__(self, infile, outfile, filters=None, **kwarg): + def __init__(self, infile, outfile=None, filters=None, invert=False): self.parser = Reader(filename=infile) self.samples = self.parser.samples self.smp_idx = dict([(v,k) for k,v in enumerate(self.samples)]) self.outfile = outfile + self.invert = invert + self.filters = filters if filters is not None: - self.set_filters(filters, **kwarg) - self.write() + self.set_filters() + if outfile is not None: + self.write() else: print "Samples:" for idx, val in enumerate(self.list_samples()): @@ -22,8 +25,12 @@ def __init__(self, infile, outfile, filters=None, **kwarg): def list_samples(self): return self.samples - def set_filters(self, filters, invert=False, **kwarg): - filt_l = filters.split(",") + def set_filters(self, filters=None, invert=False): + if filters is not None: + self.filters = filters + if invert: + self.invert = invert + filt_l = self.filters.split(",") filt_s = set(filt_l) if len(filt_s) < len(filt_l): warnings.warn("Non-unique filters, ignoring", 
RuntimeWarning) @@ -43,12 +50,15 @@ def filt2idx(item): # TODO print the filters that were ignored warnings.warn("Invalid filters, ignoring", RuntimeWarning) - if invert: + if self.invert: filters = set(xrange(len(self.samples))).difference(filters) self.parser.set_sample_filter(filters) - def write(self): + def write(self, outfile=None): + if outfile is not None: + self.outfile = outfile + print "outfile:", self.outfile #writer = Writer(stream=self.outfile, template=self.parser) test_row = self.parser.next() print test_row.samples @@ -63,3 +73,7 @@ def write(self): filt = SampleFilter(*sys.argv[1:]) print "now invert:" filt2 = SampleFilter(*sys.argv[1:], invert=True) + print "now sequential:" + filt3 = SampleFilter(sys.argv[1]) + filt3.set_filters(sys.argv[3]) + filt3.write(sys.argv[2]) From 73376c8347bfb61c713085822e73bbf9643ac109 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 2 Jul 2012 17:44:16 -0400 Subject: [PATCH 004/168] Reduced amount of sample filter code in parser. --- scripts/vcf_sample_filter.py | 2 +- vcf/parser.py | 29 +++++------------------------ 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index 821b152..c8aa723 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -53,7 +53,7 @@ def filt2idx(item): if self.invert: filters = set(xrange(len(self.samples))).difference(filters) - self.parser.set_sample_filter(filters) + self.parser._set_sample_filter(filters) def write(self, outfile=None): if outfile is not None: diff --git a/vcf/parser.py b/vcf/parser.py index 11c8454..7c6a8f7 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -172,7 +172,7 @@ def read_meta(self, meta_string): class Reader(object): """ Reader for a VCF v 4.0 file, an iterator returning ``_Record objects`` """ - def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False, samp_filter=None): + def __init__(self, fsock=None, filename=None, 
compressed=False, prepend_chr=False): """ Create a new Reader for a VCF file. You must specify either fsock (stream) or filename. Gzipped streams @@ -210,12 +210,12 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self.formats = None self.samples = None self._sample_indexes = None + self._samp_filter = None self._header_lines = [] self._tabix = None self._prepend_chr = prepend_chr self._parse_metainfo() self._format_cache = {} - self.set_sample_filter(samp_filter) def __iter__(self): return self @@ -320,32 +320,13 @@ def _parse_info(self, info_str): return retdict - def set_sample_filter(self, samp_filter): - self._samp_filter = None - if samp_filter is None: - return None - if isinstance(samp_filter, basestring): - samp_filter = samp_filter.split(",") - # if filters aren't ints, try to convert to sample indices - try: - samp_filter = [int(x) for x in samp_filter] - except ValueError: - try: - samp_filter = [self._sample_indexes[samp] for samp in samp_filter] - except KeyError: - # TODO raise RuntimeWarning about sample not found - pass + def _set_sample_filter(self, samp_filter): self._samp_filter = samp_filter def _filter_samples(self, samples): - if self._samp_filter is None: - return samples filt = self._samp_filter self.samples = [val for idx,val in enumerate(self.samples) if idx not in filt] samples = [val for idx,val in enumerate(samples) if idx not in filt] - # FIXME this loop doesn't alter the originals - #for samplist in (self.samples, samples): - #samplist = [val for idx,val in enumerate(samplist) if idx not in filt] return samples def _parse_sample_format(self, samp_fmt): @@ -377,11 +358,11 @@ def _parse_samples(self, samples, samp_fmt, site): # check whether we already know how to parse this format if samp_fmt not in self._format_cache: self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt) - samp_fmt = self._format_cache[samp_fmt] # filter samples - samples = self._filter_samples(samples) + if 
self._samp_filter is not None: + samples = self._filter_samples(samples) if cparse: return cparse.parse_samples( From 362bbab37cafaa920f6796fcfedf9ba1acf2f721 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 2 Jul 2012 22:00:16 -0400 Subject: [PATCH 005/168] Actually write out sample-filtered file. --- scripts/vcf_sample_filter.py | 20 ++++++++++---------- vcf/parser.py | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index c8aa723..d08e933 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -58,10 +58,9 @@ def filt2idx(item): def write(self, outfile=None): if outfile is not None: self.outfile = outfile - print "outfile:", self.outfile - #writer = Writer(stream=self.outfile, template=self.parser) - test_row = self.parser.next() - print test_row.samples + writer = Writer(open(self.outfile, "w"), self.parser) + for row in self.parser: + writer.write_record(row) if __name__ == "__main__": # TODO implement argparse @@ -71,9 +70,10 @@ def write(self, outfile=None): raise SystemExit filt = SampleFilter(*sys.argv[1:]) - print "now invert:" - filt2 = SampleFilter(*sys.argv[1:], invert=True) - print "now sequential:" - filt3 = SampleFilter(sys.argv[1]) - filt3.set_filters(sys.argv[3]) - filt3.write(sys.argv[2]) + #print "now invert:" + #filt2 = SampleFilter(*sys.argv[1:], invert=True) + #print "now sequential:" + #filt3 = SampleFilter(sys.argv[1]) + #if len(sys.argv) > 3: + #filt3.set_filters(sys.argv[3]) + #filt3.write(sys.argv[2]) diff --git a/vcf/parser.py b/vcf/parser.py index 7c6a8f7..df44dde 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -324,10 +324,10 @@ def _set_sample_filter(self, samp_filter): self._samp_filter = samp_filter def _filter_samples(self, samples): - filt = self._samp_filter - self.samples = [val for idx,val in enumerate(self.samples) if idx not in filt] - samples = [val for idx,val in enumerate(samples) if idx not in filt] - 
return samples + filt = set(self._samp_filter) + self.samples = [val for idx,val in enumerate(self.samples) + if idx not in filt] + return [val for idx,val in enumerate(samples) if idx not in filt] def _parse_sample_format(self, samp_fmt): """ Parse the format of the calls in this _Record """ From d71b2cd6c184c33931d3b3d9aa464a3bf0176859 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 2 Jul 2012 22:09:40 -0400 Subject: [PATCH 006/168] Switched Writer \r\n to os.linesep. --- vcf/parser.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index df44dde..5026572 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -1,10 +1,11 @@ +import codecs import collections -import re import csv import gzip -import sys import itertools -import codecs +import os +import re +import sys try: from collections import OrderedDict @@ -522,15 +523,15 @@ def fetch(self, chrom, start, end=None): class Writer(object): - """ VCF Writer """ + """VCF Writer. On Windows Python 2, open stream with 'wb'.""" fixed_fields = "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT".split() # Reverse keys and values in header field count dictionary counts = dict((v,k) for k,v in field_counts.iteritems()) - def __init__(self, stream, template, lineterminator="\r\n"): - self.writer = csv.writer(stream, delimiter="\t", lineterminator=lineterminator) + def __init__(self, stream, template, eol=os.linesep): + self.writer = csv.writer(stream, delimiter="\t", lineterminator=eol) self.template = template self.stream = stream From bce2c47774d675481f4d50a4200ac4aafb1ed05f Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Tue, 3 Jul 2012 13:59:58 -0400 Subject: [PATCH 007/168] Fixed sample name list update/printing. 
--- scripts/vcf_sample_filter.py | 4 +++- vcf/parser.py | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index d08e933..5d039f0 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -53,7 +53,9 @@ def filt2idx(item): if self.invert: filters = set(xrange(len(self.samples))).difference(filters) - self.parser._set_sample_filter(filters) + # sample_filter is a property that updates parser.samples + self.parser.sample_filter = filters + print "Keeping these samples:", self.parser.samples def write(self, outfile=None): if outfile is not None: diff --git a/vcf/parser.py b/vcf/parser.py index 5026572..279b155 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -211,7 +211,7 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self.formats = None self.samples = None self._sample_indexes = None - self._samp_filter = None + self.sample_filter = None self._header_lines = [] self._tabix = None self._prepend_chr = prepend_chr @@ -321,13 +321,22 @@ def _parse_info(self, info_str): return retdict - def _set_sample_filter(self, samp_filter): + @property + def sample_filter(self): + return self._samp_filter + + @sample_filter.setter + def sample_filter(self, samp_filter): self._samp_filter = samp_filter + # not None or empty list + if samp_filter: + self.samples = [val for idx,val in enumerate(self.samples) + if idx not in set(samp_filter)] + # XXX could update self._sample indexes or use it as history + def _filter_samples(self, samples): filt = set(self._samp_filter) - self.samples = [val for idx,val in enumerate(self.samples) - if idx not in filt] return [val for idx,val in enumerate(samples) if idx not in filt] def _parse_sample_format(self, samp_fmt): From 67744c0855b3cee1eec21833273784ce7a27d0b2 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Fri, 6 Jul 2012 18:58:32 -0400 Subject: [PATCH 008/168] Moved all sample filtering 
to filter script. --- scripts/vcf_sample_filter.py | 31 ++++++++++++++++++++++++++----- vcf/parser.py | 23 ----------------------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index 5d039f0..56e76ee 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -7,9 +7,33 @@ class SampleFilter(object): def __init__(self, infile, outfile=None, filters=None, invert=False): + # Methods to add to Reader + def get_filter(self): + return self._samp_filter + + def set_filter(self, filt): + self._samp_filter = filt + if filt: + self.samples = [val for idx,val in enumerate(self.samples) + if idx not in set(filt)] + + def filter_samples(fn): + """Decorator function to filter sample parameter""" + def filt(self, samples, *args): + samples = [val for idx,val in enumerate(samples) + if idx not in set(self.sample_filter)] + return fn(self, samples, *args) + return filt + + # Add property to Reader for filter list + Reader.sample_filter = property(get_filter, set_filter) + # Modify Reader._parse_samples to filter samples + Reader._parse_samples = filter_samples(Reader._parse_samples) self.parser = Reader(filename=infile) + # Store initial samples and indices self.samples = self.parser.samples self.smp_idx = dict([(v,k) for k,v in enumerate(self.samples)]) + # Properties for filter/writer self.outfile = outfile self.invert = invert self.filters = filters @@ -19,12 +43,9 @@ def __init__(self, infile, outfile=None, filters=None, invert=False): self.write() else: print "Samples:" - for idx, val in enumerate(self.list_samples()): + for idx, val in enumerate(self.samples): print "{0}: {1}".format(idx, val) - def list_samples(self): - return self.samples - def set_filters(self, filters=None, invert=False): if filters is not None: self.filters = filters @@ -53,7 +74,7 @@ def filt2idx(item): if self.invert: filters = set(xrange(len(self.samples))).difference(filters) - # sample_filter is a 
property that updates parser.samples + # `sample_filter` setter updates `samples` self.parser.sample_filter = filters print "Keeping these samples:", self.parser.samples diff --git a/vcf/parser.py b/vcf/parser.py index 279b155..21fe696 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -211,7 +211,6 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self.formats = None self.samples = None self._sample_indexes = None - self.sample_filter = None self._header_lines = [] self._tabix = None self._prepend_chr = prepend_chr @@ -321,24 +320,6 @@ def _parse_info(self, info_str): return retdict - @property - def sample_filter(self): - return self._samp_filter - - @sample_filter.setter - def sample_filter(self, samp_filter): - self._samp_filter = samp_filter - # not None or empty list - if samp_filter: - self.samples = [val for idx,val in enumerate(self.samples) - if idx not in set(samp_filter)] - # XXX could update self._sample indexes or use it as history - - - def _filter_samples(self, samples): - filt = set(self._samp_filter) - return [val for idx,val in enumerate(samples) if idx not in filt] - def _parse_sample_format(self, samp_fmt): """ Parse the format of the calls in this _Record """ samp_fmt = make_calldata_tuple(samp_fmt.split(':')) @@ -370,10 +351,6 @@ def _parse_samples(self, samples, samp_fmt, site): self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt) samp_fmt = self._format_cache[samp_fmt] - # filter samples - if self._samp_filter is not None: - samples = self._filter_samples(samples) - if cparse: return cparse.parse_samples( self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site) From a048ec0a749eb7c01b28e94c6280f4112852e000 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Fri, 6 Jul 2012 20:15:00 -0400 Subject: [PATCH 009/168] Implemented argparse. 
--- scripts/vcf_sample_filter.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index 56e76ee..d19a626 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import argparse import sys import warnings @@ -47,6 +48,7 @@ def filt(self, samples, *args): print "{0}: {1}".format(idx, val) def set_filters(self, filters=None, invert=False): + """Convert filters from string to list of indices, set on Reader""" if filters is not None: self.filters = filters if invert: @@ -82,21 +84,24 @@ def write(self, outfile=None): if outfile is not None: self.outfile = outfile writer = Writer(open(self.outfile, "w"), self.parser) + print "Writing to '{0}'".format(self.outfile) for row in self.parser: writer.write_record(row) if __name__ == "__main__": - # TODO implement argparse - if len(sys.argv) < 4: - print "Usage: script.py infile outfile [filt1,filt2]" - if len(sys.argv) < 3: - raise SystemExit + parser = argparse.ArgumentParser() + parser.add_argument("file", type=str, + help="VCF file to filter") + parser.add_argument("-f", "--filter", type=str, + help="Comma-separated list of sample indices or names to filter") + parser.add_argument("--invert", action="store_true", + help="Keep rather than discard the filtered samples") + parser.add_argument("-o", "--outfile", type=str, + help="File to write out filtered samples") + # TODO implement quiet (silent if both outfile and filter are specified) + parser.add_argument("-q", "--quiet", action="store_true", + help="Less output") - filt = SampleFilter(*sys.argv[1:]) - #print "now invert:" - #filt2 = SampleFilter(*sys.argv[1:], invert=True) - #print "now sequential:" - #filt3 = SampleFilter(sys.argv[1]) - #if len(sys.argv) > 3: - #filt3.set_filters(sys.argv[3]) - #filt3.write(sys.argv[2]) + args = parser.parse_args() + + SampleFilter(args.file, args.outfile, args.filter, 
args.invert) From 19ce645f7a5783cb0ff43f21c7a7815d719776ac Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Fri, 6 Jul 2012 23:24:08 -0400 Subject: [PATCH 010/168] Tweak args, pep8, move empty outfile warning. --- scripts/vcf_sample_filter.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index d19a626..5124643 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -4,7 +4,7 @@ import warnings from vcf import Reader, Writer -#from parser import Reader, Writer + class SampleFilter(object): def __init__(self, infile, outfile=None, filters=None, invert=False): @@ -15,13 +15,13 @@ def get_filter(self): def set_filter(self, filt): self._samp_filter = filt if filt: - self.samples = [val for idx,val in enumerate(self.samples) + self.samples = [val for idx, val in enumerate(self.samples) if idx not in set(filt)] def filter_samples(fn): """Decorator function to filter sample parameter""" def filt(self, samples, *args): - samples = [val for idx,val in enumerate(samples) + samples = [val for idx, val in enumerate(samples) if idx not in set(self.sample_filter)] return fn(self, samples, *args) return filt @@ -33,15 +33,14 @@ def filt(self, samples, *args): self.parser = Reader(filename=infile) # Store initial samples and indices self.samples = self.parser.samples - self.smp_idx = dict([(v,k) for k,v in enumerate(self.samples)]) + self.smp_idx = dict([(v, k) for k, v in enumerate(self.samples)]) # Properties for filter/writer self.outfile = outfile self.invert = invert self.filters = filters if filters is not None: self.set_filters() - if outfile is not None: - self.write() + self.write() else: print "Samples:" for idx, val in enumerate(self.samples): @@ -57,6 +56,7 @@ def set_filters(self, filters=None, invert=False): filt_s = set(filt_l) if len(filt_s) < len(filt_l): warnings.warn("Non-unique filters, ignoring", RuntimeWarning) + def filt2idx(item): 
"""Convert filter to valid sample index""" try: @@ -83,6 +83,8 @@ def filt2idx(item): def write(self, outfile=None): if outfile is not None: self.outfile = outfile + if self.outfile is None: + raise IOError("write() called with no outfile") writer = Writer(open(self.outfile, "w"), self.parser) print "Writing to '{0}'".format(self.outfile) for row in self.parser: @@ -92,11 +94,12 @@ def write(self, outfile=None): parser = argparse.ArgumentParser() parser.add_argument("file", type=str, help="VCF file to filter") - parser.add_argument("-f", "--filter", type=str, - help="Comma-separated list of sample indices or names to filter") + parser.add_argument("-f", metavar="filters", type=str, + help="Comma-separated list of sample indices or names \ + to filter") parser.add_argument("--invert", action="store_true", help="Keep rather than discard the filtered samples") - parser.add_argument("-o", "--outfile", type=str, + parser.add_argument("-o", metavar="outfile", type=str, help="File to write out filtered samples") # TODO implement quiet (silent if both outfile and filter are specified) parser.add_argument("-q", "--quiet", action="store_true", From 95fc70b0bac79aca2ace82221ef78c0e66bd592f Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Sat, 7 Jul 2012 01:45:34 -0400 Subject: [PATCH 011/168] Fixed argparse arg names. 
--- scripts/vcf_sample_filter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index 5124643..228d450 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -107,4 +107,5 @@ def write(self, outfile=None): args = parser.parse_args() - SampleFilter(args.file, args.outfile, args.filter, args.invert) + SampleFilter(infile=args.file, outfile=args.o, + filters=args.f, invert=args.invert) From 67afb27fe7711cee0e41cac934fc2933cf195235 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Sat, 7 Jul 2012 02:02:18 -0400 Subject: [PATCH 012/168] Changed default out to sys.stdout --- scripts/vcf_sample_filter.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index 228d450..deb20e6 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -78,28 +78,29 @@ def filt2idx(item): # `sample_filter` setter updates `samples` self.parser.sample_filter = filters - print "Keeping these samples:", self.parser.samples + sys.stderr.write("Keeping these samples: {0}\n".format(self.parser.samples)) def write(self, outfile=None): if outfile is not None: self.outfile = outfile if self.outfile is None: - raise IOError("write() called with no outfile") - writer = Writer(open(self.outfile, "w"), self.parser) - print "Writing to '{0}'".format(self.outfile) + _out = sys.stdout + else: + _out = open(self.outfile, "wb") + writer = Writer(_out, self.parser) + sys.stderr.write("Writing to '{0}'\n".format(self.outfile)) for row in self.parser: writer.write_record(row) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("file", type=str, - help="VCF file to filter") - parser.add_argument("-f", metavar="filters", type=str, + parser.add_argument("file", help="VCF file to filter") + parser.add_argument("-f", metavar="filters", help="Comma-separated list of 
sample indices or names \ to filter") parser.add_argument("--invert", action="store_true", help="Keep rather than discard the filtered samples") - parser.add_argument("-o", metavar="outfile", type=str, + parser.add_argument("-o", metavar="outfile", help="File to write out filtered samples") # TODO implement quiet (silent if both outfile and filter are specified) parser.add_argument("-q", "--quiet", action="store_true", From 33d2b5cc17c73185e666ff8ccbb7fbcc13a2d247 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Sat, 7 Jul 2012 02:54:48 -0400 Subject: [PATCH 013/168] Added unit test for sample filtering script. --- vcf/test/test_vcf.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 809c237..bfa5cd7 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -3,6 +3,7 @@ import doctest import os import commands +import subprocess from StringIO import StringIO import vcf @@ -633,6 +634,27 @@ def testOpenFilenameGzipped(self): self.assertEqual(self.samples, r.samples) +class TestSampleFilter(unittest.TestCase): + def testListSamples(self): + s, out = commands.getstatusoutput('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf') + self.assertEqual(s, 0) + expected_out = """Samples: +0: NA00001 +1: NA00002 +2: NA00003""" + self.assertEqual(out, expected_out) + + def testWithFilter(self): + out = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf -f 1,2', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] + buf = StringIO() + buf.write(out) + buf.seek(0) + #print(buf.getvalue()) + reader = vcf.Reader(buf) + self.assertEqual(reader.samples, ['NA00001']) + #print(reader.next()) + + class TestFilter(unittest.TestCase): @@ -760,6 +782,7 @@ def test_trim(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestWriter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix)) 
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) From 792d685b90e3b29e55daf9457ce0685b6e26da98 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Sat, 7 Jul 2012 02:59:48 -0400 Subject: [PATCH 014/168] Added authorship statement. --- scripts/vcf_sample_filter.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index deb20e6..5314d2e 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -1,4 +1,9 @@ #!/usr/bin/env python + +# Author: Lenna X. Peterson +# github.com/lennax +# arklenna at gmail dot com + import argparse import sys import warnings From d78a94594597c04c300839a68ea2fc5097e9a13b Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Sat, 7 Jul 2012 03:02:24 -0400 Subject: [PATCH 015/168] Added sample filter to list of scripts in setup. --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bca3a0d..4ecec0a 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,8 @@ setup( name='PyVCF', packages=['vcf', 'vcf.test'], - scripts=['scripts/vcf_melt', 'scripts/vcf_filter.py'], + scripts=['scripts/vcf_melt', 'scripts/vcf_filter.py', + 'scripts/vcf_sample_filter.py'], author='James Casbon and @jdoughertyii', author_email='casbon@gmail.com', description='Variant Call Format (VCF) parser for Python', From 75c4775e7b4e7737bbd53afbe93c7797dd4f1428 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 9 Jul 2012 13:34:56 -0400 Subject: [PATCH 016/168] Moved sample filter object to src dir. 
--- scripts/vcf_sample_filter.py | 94 ++-------------------------------- vcf/__init__.py | 3 +- vcf/sample_filter.py | 98 ++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 92 deletions(-) create mode 100644 vcf/sample_filter.py diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index 5314d2e..6ff2bd3 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -5,98 +5,10 @@ # arklenna at gmail dot com import argparse -import sys -import warnings -from vcf import Reader, Writer +from vcf import SampleFilter -class SampleFilter(object): - def __init__(self, infile, outfile=None, filters=None, invert=False): - # Methods to add to Reader - def get_filter(self): - return self._samp_filter - - def set_filter(self, filt): - self._samp_filter = filt - if filt: - self.samples = [val for idx, val in enumerate(self.samples) - if idx not in set(filt)] - - def filter_samples(fn): - """Decorator function to filter sample parameter""" - def filt(self, samples, *args): - samples = [val for idx, val in enumerate(samples) - if idx not in set(self.sample_filter)] - return fn(self, samples, *args) - return filt - - # Add property to Reader for filter list - Reader.sample_filter = property(get_filter, set_filter) - # Modify Reader._parse_samples to filter samples - Reader._parse_samples = filter_samples(Reader._parse_samples) - self.parser = Reader(filename=infile) - # Store initial samples and indices - self.samples = self.parser.samples - self.smp_idx = dict([(v, k) for k, v in enumerate(self.samples)]) - # Properties for filter/writer - self.outfile = outfile - self.invert = invert - self.filters = filters - if filters is not None: - self.set_filters() - self.write() - else: - print "Samples:" - for idx, val in enumerate(self.samples): - print "{0}: {1}".format(idx, val) - - def set_filters(self, filters=None, invert=False): - """Convert filters from string to list of indices, set on Reader""" - if filters is not None: 
- self.filters = filters - if invert: - self.invert = invert - filt_l = self.filters.split(",") - filt_s = set(filt_l) - if len(filt_s) < len(filt_l): - warnings.warn("Non-unique filters, ignoring", RuntimeWarning) - - def filt2idx(item): - """Convert filter to valid sample index""" - try: - item = int(item) - except ValueError: - # not an idx, check if it's a value - return self.smp_idx.get(item) - else: - # is int, check if it's an idx - if item < len(self.samples): - return item - filters = set(filter(lambda x: x is not None, map(filt2idx, filt_s))) - if len(filters) < len(filt_s): - # TODO print the filters that were ignored - warnings.warn("Invalid filters, ignoring", RuntimeWarning) - - if self.invert: - filters = set(xrange(len(self.samples))).difference(filters) - - # `sample_filter` setter updates `samples` - self.parser.sample_filter = filters - sys.stderr.write("Keeping these samples: {0}\n".format(self.parser.samples)) - - def write(self, outfile=None): - if outfile is not None: - self.outfile = outfile - if self.outfile is None: - _out = sys.stdout - else: - _out = open(self.outfile, "wb") - writer = Writer(_out, self.parser) - sys.stderr.write("Writing to '{0}'\n".format(self.outfile)) - for row in self.parser: - writer.write_record(row) - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("file", help="VCF file to filter") @@ -108,8 +20,8 @@ def write(self, outfile=None): parser.add_argument("-o", metavar="outfile", help="File to write out filtered samples") # TODO implement quiet (silent if both outfile and filter are specified) - parser.add_argument("-q", "--quiet", action="store_true", - help="Less output") + #parser.add_argument("-q", "--quiet", action="store_true", + #help="Less output") args = parser.parse_args() diff --git a/vcf/__init__.py b/vcf/__init__.py index 2935c73..586820e 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -59,7 +59,7 @@ >>> print record.INFO['AF'] [0.5] -There are a number of 
convienience methods and properties for each ``Record`` allowing you to +There are a number of convenience methods and properties for each ``Record`` allowing you to examine properties of interest:: >>> print record.num_called, record.call_rate, record.num_unknown @@ -176,5 +176,6 @@ from vcf.parser import VCFReader, VCFWriter from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT +from vcf.sample_filter import SampleFilter VERSION = '0.5.0' diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py new file mode 100644 index 0000000..38acaec --- /dev/null +++ b/vcf/sample_filter.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +# Author: Lenna X. Peterson +# github.com/lennax +# arklenna at gmail dot com + +import sys +import warnings + + +from parser import Reader, Writer + + +class SampleFilter(object): + def __init__(self, infile, outfile=None, filters=None, invert=False): + # Methods to add to Reader + def get_filter(self): + return self._samp_filter + + def set_filter(self, filt): + self._samp_filter = filt + if filt: + self.samples = [val for idx, val in enumerate(self.samples) + if idx not in set(filt)] + + def filter_samples(fn): + """Decorator function to filter sample parameter""" + def filt(self, samples, *args): + samples = [val for idx, val in enumerate(samples) + if idx not in set(self.sample_filter)] + return fn(self, samples, *args) + return filt + + # Add property to Reader for filter list + Reader.sample_filter = property(get_filter, set_filter) + # Modify Reader._parse_samples to filter samples + Reader._parse_samples = filter_samples(Reader._parse_samples) + self.parser = Reader(filename=infile) + # Store initial samples and indices + self.samples = self.parser.samples + self.smp_idx = dict([(v, k) for k, v in enumerate(self.samples)]) + # Properties for filter/writer + self.outfile = outfile + self.invert = invert + self.filters = filters + if filters is not None: + self.set_filters() + self.write() + else: + 
print "Samples:" + for idx, val in enumerate(self.samples): + print "{0}: {1}".format(idx, val) + + def set_filters(self, filters=None, invert=False): + """Convert filters from string to list of indices, set on Reader""" + if filters is not None: + self.filters = filters + if invert: + self.invert = invert + filt_l = self.filters.split(",") + filt_s = set(filt_l) + if len(filt_s) < len(filt_l): + warnings.warn("Non-unique filters, ignoring", RuntimeWarning) + + def filt2idx(item): + """Convert filter to valid sample index""" + try: + item = int(item) + except ValueError: + # not an idx, check if it's a value + return self.smp_idx.get(item) + else: + # is int, check if it's an idx + if item < len(self.samples): + return item + filters = set(filter(lambda x: x is not None, map(filt2idx, filt_s))) + if len(filters) < len(filt_s): + # TODO print the filters that were ignored + warnings.warn("Invalid filters, ignoring", RuntimeWarning) + + if self.invert: + filters = set(xrange(len(self.samples))).difference(filters) + + # `sample_filter` setter updates `samples` + self.parser.sample_filter = filters + sys.stderr.write("Keeping these samples: {0}\n".format(self.parser.samples)) + + def write(self, outfile=None): + if outfile is not None: + self.outfile = outfile + if self.outfile is None: + _out = sys.stdout + else: + _out = open(self.outfile, "wb") + sys.stderr.write("Writing to '{0}'\n".format(self.outfile)) + writer = Writer(_out, self.parser) + for row in self.parser: + writer.write_record(row) From 0047032811b732668a45b6f63599b229c5cda20c Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 9 Jul 2012 14:43:21 -0400 Subject: [PATCH 017/168] Using logging for easy quiet mode. 
--- scripts/vcf_sample_filter.py | 26 ++++++++++++++++++-------- vcf/sample_filter.py | 10 ++++------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/scripts/vcf_sample_filter.py b/scripts/vcf_sample_filter.py index 6ff2bd3..d71e6a3 100644 --- a/scripts/vcf_sample_filter.py +++ b/scripts/vcf_sample_filter.py @@ -5,6 +5,7 @@ # arklenna at gmail dot com import argparse +import logging from vcf import SampleFilter @@ -12,18 +13,27 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("file", help="VCF file to filter") + parser.add_argument("-o", metavar="outfile", + help="File to write out filtered samples") parser.add_argument("-f", metavar="filters", help="Comma-separated list of sample indices or names \ to filter") - parser.add_argument("--invert", action="store_true", + parser.add_argument("-i", "--invert", action="store_true", help="Keep rather than discard the filtered samples") - parser.add_argument("-o", metavar="outfile", - help="File to write out filtered samples") - # TODO implement quiet (silent if both outfile and filter are specified) - #parser.add_argument("-q", "--quiet", action="store_true", - #help="Less output") + parser.add_argument("-q", "--quiet", action="store_true", + help="Less output") args = parser.parse_args() - SampleFilter(infile=args.file, outfile=args.o, - filters=args.f, invert=args.invert) + if args.quiet: + log_level = logging.WARNING + else: + log_level = logging.INFO + logging.basicConfig(format='%(message)s', level=log_level) + + sf = SampleFilter(infile=args.file, outfile=args.o, + filters=args.f, invert=args.invert) + if args.f is None: + print "Samples:" + for idx, val in enumerate(sf.samples): + print "{0}: {1}".format(idx, val) diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py index 38acaec..c9d4f31 100644 --- a/vcf/sample_filter.py +++ b/vcf/sample_filter.py @@ -4,6 +4,7 @@ # github.com/lennax # arklenna at gmail dot com +import logging import sys import warnings @@ 
-46,10 +47,6 @@ def filt(self, samples, *args): if filters is not None: self.set_filters() self.write() - else: - print "Samples:" - for idx, val in enumerate(self.samples): - print "{0}: {1}".format(idx, val) def set_filters(self, filters=None, invert=False): """Convert filters from string to list of indices, set on Reader""" @@ -83,7 +80,8 @@ def filt2idx(item): # `sample_filter` setter updates `samples` self.parser.sample_filter = filters - sys.stderr.write("Keeping these samples: {0}\n".format(self.parser.samples)) + logging.info("Keeping these samples: {0}\n".format(self.parser.samples)) + return self.parser.samples def write(self, outfile=None): if outfile is not None: @@ -92,7 +90,7 @@ def write(self, outfile=None): _out = sys.stdout else: _out = open(self.outfile, "wb") - sys.stderr.write("Writing to '{0}'\n".format(self.outfile)) + logging.info("Writing to '{0}'\n".format(self.outfile)) writer = Writer(_out, self.parser) for row in self.parser: writer.write_record(row) From 6b1fa897a7ee2339e40d2f9aae4d1fbad946d4df Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 9 Jul 2012 15:39:32 -0400 Subject: [PATCH 018/168] Unit test for sample filter module. 
--- vcf/sample_filter.py | 7 +++++++ vcf/test/test_vcf.py | 28 +++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py index c9d4f31..b963948 100644 --- a/vcf/sample_filter.py +++ b/vcf/sample_filter.py @@ -35,6 +35,7 @@ def filt(self, samples, *args): # Add property to Reader for filter list Reader.sample_filter = property(get_filter, set_filter) # Modify Reader._parse_samples to filter samples + self._orig_parse_samples = Reader._parse_samples Reader._parse_samples = filter_samples(Reader._parse_samples) self.parser = Reader(filename=infile) # Store initial samples and indices @@ -88,9 +89,15 @@ def write(self, outfile=None): self.outfile = outfile if self.outfile is None: _out = sys.stdout + elif hasattr(self.outfile, 'write'): + _out = self.outfile else: _out = open(self.outfile, "wb") logging.info("Writing to '{0}'\n".format(self.outfile)) writer = Writer(_out, self.parser) for row in self.parser: writer.write_record(row) + + def undo_monkey_patch(self): + delattr(Reader, 'sample_filter') + Reader._parse_samples = self._orig_parse_samples diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index bfa5cd7..36662b0 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -635,7 +635,7 @@ def testOpenFilenameGzipped(self): class TestSampleFilter(unittest.TestCase): - def testListSamples(self): + def testCLIListSamples(self): s, out = commands.getstatusoutput('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf') self.assertEqual(s, 0) expected_out = """Samples: @@ -644,7 +644,7 @@ def testListSamples(self): 2: NA00003""" self.assertEqual(out, expected_out) - def testWithFilter(self): + def testCLIWithFilter(self): out = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf -f 1,2', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] buf = StringIO() buf.write(out) @@ -652,7 +652,29 @@ def testWithFilter(self): 
#print(buf.getvalue()) reader = vcf.Reader(buf) self.assertEqual(reader.samples, ['NA00001']) - #print(reader.next()) + rec = reader.next() + self.assertEqual(len(rec.samples), 1) + + def testSampleFilterModule(self): + # init filter with filename, get list of samples + filt = vcf.SampleFilter('vcf/test/example-4.1.vcf') + self.assertEqual(filt.samples, ['NA00001', 'NA00002', 'NA00003']) + # set filter, check which samples will be kept + filtered = filt.set_filters(filters="0", invert=True) + self.assertEqual(filtered, ['NA00001']) + # write filtered file to StringIO + buf = StringIO() + filt.write(buf) + buf.seek(0) + #print(buf.getvalue()) + # undo monkey patch + filt.undo_monkey_patch() + # read output + reader = vcf.Reader(buf) + self.assertEqual(reader.samples, ['NA00001']) + print(dir(reader)) + rec = reader.next() + self.assertEqual(len(rec.samples), 1) class TestFilter(unittest.TestCase): From 817f5e9fd140ac35c99f57ee045446861889912c Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 9 Jul 2012 15:58:32 -0400 Subject: [PATCH 019/168] Docs/test for undo_monkey_patch --- vcf/sample_filter.py | 11 ++++++++--- vcf/test/test_vcf.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py index b963948..6e5e66e 100644 --- a/vcf/sample_filter.py +++ b/vcf/sample_filter.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # Author: Lenna X. Peterson # github.com/lennax # arklenna at gmail dot com @@ -13,6 +11,13 @@ class SampleFilter(object): + """ + Modifies the vcf Reader to filter each row by sample as it is parsed. + When using the class, be sure to call `undo_monkey_patch()` to restore + the original functionality to the Reader. 
+ + """ + def __init__(self, infile, outfile=None, filters=None, invert=False): # Methods to add to Reader def get_filter(self): @@ -99,5 +104,5 @@ def write(self, outfile=None): writer.write_record(row) def undo_monkey_patch(self): - delattr(Reader, 'sample_filter') Reader._parse_samples = self._orig_parse_samples + delattr(Reader, 'sample_filter') diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 36662b0..ab07f2e 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -669,10 +669,10 @@ def testSampleFilterModule(self): #print(buf.getvalue()) # undo monkey patch filt.undo_monkey_patch() + self.assertTrue('sample_filter' not in dir(vcf.Reader)) # read output reader = vcf.Reader(buf) self.assertEqual(reader.samples, ['NA00001']) - print(dir(reader)) rec = reader.next() self.assertEqual(len(rec.samples), 1) From 0b0d8093fc951cdde37627ad0e7897e8ebf7ca3d Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 9 Jul 2012 16:20:46 -0400 Subject: [PATCH 020/168] Changed tests to use subprocess returncode. 
--- vcf/test/test_vcf.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index ab07f2e..63f740c 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -636,16 +636,19 @@ def testOpenFilenameGzipped(self): class TestSampleFilter(unittest.TestCase): def testCLIListSamples(self): - s, out = commands.getstatusoutput('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf') - self.assertEqual(s, 0) - expected_out = """Samples: -0: NA00001 -1: NA00002 -2: NA00003""" - self.assertEqual(out, expected_out) + proc = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = proc.communicate() + self.assertEqual(proc.returncode, 0) + self.assertFalse(err) + expected_out = ['Samples:', '0: NA00001', '1: NA00002', '2: NA00003'] + self.assertEqual(out.splitlines(), expected_out) def testCLIWithFilter(self): - out = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf -f 1,2', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] + proc = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf -f 1,2 --quiet', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = proc.communicate() + self.assertEqual(proc.returncode, 0) + self.assertTrue(out) + self.assertFalse(err) buf = StringIO() buf.write(out) buf.seek(0) From 746ece940739e93aa45d43ca0101432d5a74df19 Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 9 Jul 2012 16:40:25 -0400 Subject: [PATCH 021/168] Destructor undoes patch; warn if 0 samples kept --- vcf/sample_filter.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py index 6e5e66e..8470158 100644 --- a/vcf/sample_filter.py +++ b/vcf/sample_filter.py @@ -54,6 +54,12 @@ def filt(self, samples, *args): self.set_filters() self.write() + def 
__del__(self): + try: + self.undo_monkey_patch() + except AttributeError: + pass + def set_filters(self, filters=None, invert=False): """Convert filters from string to list of indices, set on Reader""" if filters is not None: @@ -86,6 +92,8 @@ def filt2idx(item): # `sample_filter` setter updates `samples` self.parser.sample_filter = filters + if len(self.parser.samples) == 0: + warnings.warn("Number of samples to keep is zero", RuntimeWarning) logging.info("Keeping these samples: {0}\n".format(self.parser.samples)) return self.parser.samples From 30321c502710a5b9735737b803c59df56806955e Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 9 Jul 2012 16:47:41 -0400 Subject: [PATCH 022/168] Recommend explicit use of del. --- vcf/sample_filter.py | 8 ++++---- vcf/test/test_vcf.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py index 8470158..2a80c5d 100644 --- a/vcf/sample_filter.py +++ b/vcf/sample_filter.py @@ -13,8 +13,8 @@ class SampleFilter(object): """ Modifies the vcf Reader to filter each row by sample as it is parsed. - When using the class, be sure to call `undo_monkey_patch()` to restore - the original functionality to the Reader. + After using this class, call del on its instance to remove filtering + and restore the original functionality to the Reader. 
""" @@ -56,7 +56,7 @@ def filt(self, samples, *args): def __del__(self): try: - self.undo_monkey_patch() + self._undo_monkey_patch() except AttributeError: pass @@ -111,6 +111,6 @@ def write(self, outfile=None): for row in self.parser: writer.write_record(row) - def undo_monkey_patch(self): + def _undo_monkey_patch(self): Reader._parse_samples = self._orig_parse_samples delattr(Reader, 'sample_filter') diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 63f740c..3bdee43 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -670,8 +670,8 @@ def testSampleFilterModule(self): filt.write(buf) buf.seek(0) #print(buf.getvalue()) - # undo monkey patch - filt.undo_monkey_patch() + # undo monkey patch by destroying instance + del filt self.assertTrue('sample_filter' not in dir(vcf.Reader)) # read output reader = vcf.Reader(buf) From 49f889731b69ea9a42e2425743cfe2d910d35bab Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Mon, 9 Jul 2012 17:01:01 -0400 Subject: [PATCH 023/168] Added empty filter list; del is now less critical. --- vcf/sample_filter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py index 2a80c5d..b156b45 100644 --- a/vcf/sample_filter.py +++ b/vcf/sample_filter.py @@ -13,8 +13,6 @@ class SampleFilter(object): """ Modifies the vcf Reader to filter each row by sample as it is parsed. - After using this class, call del on its instance to remove filtering - and restore the original functionality to the Reader. 
""" @@ -39,6 +37,7 @@ def filt(self, samples, *args): # Add property to Reader for filter list Reader.sample_filter = property(get_filter, set_filter) + Reader._samp_filter = [] # Modify Reader._parse_samples to filter samples self._orig_parse_samples = Reader._parse_samples Reader._parse_samples = filter_samples(Reader._parse_samples) From e63960ccdc065f8c439105dc9609d1ba91adaa95 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Tue, 27 Nov 2012 08:38:33 +0000 Subject: [PATCH 024/168] apply 0.6.0 release which seemed to get commited off of a branch --- docs/HISTORY.rst | 10 ++++++++++ vcf/__init__.py | 2 +- vcf/model.py | 42 +++++++++++++++++++++++++++--------------- vcf/test/prof.py | 2 +- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index 396ffa7..085e24c 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -17,6 +17,16 @@ New features should have test code sent with them. Changes ======= +0.6.0 Release +------------- + +* Backwards incompatible change: _Call.data is now a + namedtuple (previously it was a dict) +* Optional cython version, much improved performance. +* Improvements to writer (thanks @cmclean) +* Improvements to inheritance of classes (thanks @lennax) + + 0.5.0 Release ------------- diff --git a/vcf/__init__.py b/vcf/__init__.py index 2935c73..f7aa7ca 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -177,4 +177,4 @@ from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT -VERSION = '0.5.0' +VERSION = '0.6.0' diff --git a/vcf/model.py b/vcf/model.py index 9a27f87..9748784 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -2,6 +2,7 @@ import collections import sys + class _Call(object): """ A genotype call, a cell entry in a VCF file""" @@ -20,7 +21,7 @@ def __init__(self, site, sample, data): self.called = self.gt_nums is not None except AttributeError: self.gt_nums = None - # FIXME how do we know if a non GT call is called? 
+ #62 a call without a genotype is not defined as called or not self.called = None def __repr__(self): @@ -70,10 +71,14 @@ def gt_type(self): if self.called: alleles = self.gt_alleles if all(X == alleles[0] for X in alleles[1:]): - if alleles[0] == "0": return 0 - else: return 2 - else: return 1 - else: return None + if alleles[0] == "0": + return 0 + else: + return 2 + else: + return 1 + else: + return None @property def phased(self): @@ -145,7 +150,7 @@ def __str__(self): return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__ def __cmp__(self, other): - return cmp( (self.CHROM, self.POS), (other.CHROM, other.POS)) + return cmp((self.CHROM, self.POS), (other.CHROM, other.POS)) def add_format(self, fmt): self.FORMAT = self.FORMAT + ':' + fmt @@ -199,7 +204,6 @@ def aaf(self): # skip if more than one alternate allele. assumes bi-allelic if len(self.ALT) > 1: return None - hom_ref = self.num_hom_ref het = self.num_het hom_alt = self.num_hom_alt num_chroms = float(2.0 * self.num_called) @@ -244,7 +248,8 @@ def get_unknowns(self): @property def is_snp(self): """ Return whether or not the variant is a SNP """ - if len(self.REF) > 1: return False + if len(self.REF) > 1: + return False for alt in self.ALT: if alt is None or alt.type != "SNV": return False @@ -257,7 +262,8 @@ def is_indel(self): """ Return whether or not the variant is an INDEL """ is_sv = self.is_sv - if len(self.REF) > 1 and not is_sv: return True + if len(self.REF) > 1 and not is_sv: + return True for alt in self.ALT: if alt is None: return True @@ -284,7 +290,8 @@ def is_sv(self): def is_transition(self): """ Return whether or not the SNP is a transition """ # if multiple alts, it is unclear if we have a transition - if len(self.ALT) > 1: return False + if len(self.ALT) > 1: + return False if self.is_snp: # just one alt allele @@ -294,14 +301,17 @@ def is_transition(self): (self.REF == "C" and alt_allele == "T") or (self.REF == "T" and alt_allele == "C")): return True - 
else: return False - else: return False + else: + return False + else: + return False @property def is_deletion(self): """ Return whether or not the INDEL is a deletion """ # if multiple alts, it is unclear if we have a transition - if len(self.ALT) > 1: return False + if len(self.ALT) > 1: + return False if self.is_indel: # just one alt allele @@ -310,8 +320,10 @@ def is_deletion(self): return True if len(self.REF) > len(alt_allele): return True - else: return False - else: return False + else: + return False + else: + return False @property def var_type(self): diff --git a/vcf/test/prof.py b/vcf/test/prof.py index 62c72fe..953d169 100755 --- a/vcf/test/prof.py +++ b/vcf/test/prof.py @@ -1,4 +1,4 @@ -import vcf +import vcf as vcf import cProfile import timeit import pstats From fb835a2a3023116e8477412949eb10d7459f6a39 Mon Sep 17 00:00:00 2001 From: Marco Falcioni Date: Wed, 14 Nov 2012 11:48:29 -0800 Subject: [PATCH 025/168] Changed the rule to split records into columns According to the specification the columns must be tab separated. I encountered an VCF file from NCBI that has spaces in the INFO column, which caused PyVCF to fail. http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index f274e9c..6d938b6 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -437,7 +437,7 @@ def _parse_alt(self, str): def next(self): '''Return the next record in the file.''' line = self.reader.next() - row = re.split('\t| +', line) + row = re.split('\t+', line) chrom = row[0] if self._prepend_chr: chrom = 'chr' + chrom From b6c085b74ce5c2acd6a785452e6f0f9062b1789d Mon Sep 17 00:00:00 2001 From: James Casbon Date: Tue, 27 Nov 2012 08:22:53 +0000 Subject: [PATCH 026/168] add strict whitespace option to allow for well formed VCFs with spaces in sample names. 
--- vcf/parser.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 6d938b6..96ddba1 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -172,12 +172,19 @@ def read_meta(self, meta_string): class Reader(object): """ Reader for a VCF v 4.0 file, an iterator returning ``_Record objects`` """ - def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False): + def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False, + strict_whitespace=False): """ Create a new Reader for a VCF file. You must specify either fsock (stream) or filename. Gzipped streams or files are attempted to be recogized by the file extension, or gzipped can be forced with ``compressed=True`` + + 'prepend_chr=True' will put 'chr' before all the CHROM values, useful + for different sources. + + 'strict_whitespace=True' will split records on tabs only (as with VCF + spec) which allows you to parse files with spaces in the sample names. 
""" super(Reader, self).__init__() @@ -218,6 +225,11 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self._parse_metainfo() self._format_cache = {} + if strict_whitespace: + self._separator = '\t' + else: + self._separator = '\t| +' + def __iter__(self): return self @@ -437,7 +449,7 @@ def _parse_alt(self, str): def next(self): '''Return the next record in the file.''' line = self.reader.next() - row = re.split('\t+', line) + row = re.split(self._separator, line) chrom = row[0] if self._prepend_chr: chrom = 'chr' + chrom From 3cd09d5c637d368a65916de70c7a8ba80d936f31 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Tue, 27 Nov 2012 08:46:30 +0000 Subject: [PATCH 027/168] 0.6.1 release --- README.rst | 16 ++++++++-------- docs/HISTORY.rst | 10 ++++++++++ vcf/__init__.py | 2 +- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 83792ce..52bd780 100644 --- a/README.rst +++ b/README.rst @@ -14,7 +14,7 @@ There main interface is the class: ``Reader``. It takes a file-like object and acts as a reader:: >>> import vcf - >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'rb')) + >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'r')) >>> for record in vcf_reader: ... print record Record(CHROM=20, POS=14370, REF=G, ALT=[A]) @@ -49,7 +49,7 @@ one-entry Python lists (see, e.g., ``Record.ALT``). Semicolon-delimited lists of key=value pairs are converted to Python dictionaries, with flags being given a ``True`` value. 
Integers and floats are handled exactly as you'd expect:: - >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'rb')) + >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'r')) >>> record = vcf_reader.next() >>> print record.POS 14370 @@ -68,7 +68,7 @@ examine properties of interest:: >>> print record.nucl_diversity, record.aaf 0.6 0.5 >>> print record.get_hets() - [Call(sample=NA00002, GT=1|0, HQ=[51, 51], DP=8, GQ=48)] + [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))] >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion True False True False >>> print record.var_type, record.var_subtype @@ -101,7 +101,7 @@ call data in ``data``:: >>> print call.sample NA00001 >>> print call.data - {'GT': '0|0', 'HQ': [58, 50], 'DP': 3, 'GQ': 49} + CallData(GT=0|0, GQ=49, DP=3, HQ=[58, 50]) Please note that as of release 0.4.0, attributes known to have single values (such as ``DP`` and ``GQ`` above) are returned as values. Other attributes are returned @@ -134,7 +134,7 @@ For example:: ALT records are actually classes, so that you can interrogate them:: - >>> reader = vcf.Reader(file('vcf/test/example-4.1-bnd.vcf')) + >>> reader = vcf.Reader(open('vcf/test/example-4.1-bnd.vcf')) >>> _ = reader.next(); row = reader.next() >>> print row Record(CHROM=1, POS=2, REF=T, ALT=[T[2:3[]) @@ -146,14 +146,14 @@ Random access is supported for files with tabix indexes. Simply call fetch for region you are interested in:: >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz') - >>> for record in vcf_reader.fetch('20', 1110696, 1230237): + >>> for record in vcf_reader.fetch('20', 1110696, 1230237): # doctest: +SKIP ... 
print record Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T]) Record(CHROM=20, POS=1230237, REF=T, ALT=[None]) Or extract a single row:: - >>> print vcf_reader.fetch('20', 1110696) + >>> print vcf_reader.fetch('20', 1110696) # doctest: +SKIP Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T]) @@ -161,7 +161,7 @@ The ``Writer`` class provides a way of writing a VCF file. Currently, you must template ``Reader`` which provides the metadata:: >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz') - >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'), vcf_reader) + >>> vcf_writer = vcf.Writer(open('/dev/null', 'w'), vcf_reader) >>> for record in vcf_reader: ... vcf_writer.write_record(record) diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index 085e24c..1e61871 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -17,6 +17,16 @@ New features should have test code sent with them. Changes ======= +0.6.1 Release +------------- + +* Add strict whitespace mode for well formed VCFs with spaces + in sample names (thanks Marco) +* Ignore blank lines in files (thanks Martijn) +* Tweaks for handling missing data (thanks Sean) +* bcftools tests (thanks Martijn) +* record.FILTER is always a list + 0.6.0 Release ------------- diff --git a/vcf/__init__.py b/vcf/__init__.py index f7aa7ca..a8be533 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -177,4 +177,4 @@ from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT -VERSION = '0.6.0' +VERSION = '0.6.1' From f554810ea2510dc969fd2dcc8776433d06a08348 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Mon, 3 Dec 2012 08:09:34 -0500 Subject: [PATCH 028/168] Allow flexibility in parsing INFO values specified as integers in the header: also allow float values. 
--- vcf/parser.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 96ddba1..4d54e74 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -310,7 +310,12 @@ def _parse_info(self, info_str): if entry_type == 'Integer': vals = entry[1].split(',') - val = self._map(int, vals) + try: + val = self._map(int, vals) + # Allow specified integers to be flexibly parsed as floats. + # Handles cases with incorrectly specified header types. + except ValueError: + val = self._map(float, vals) elif entry_type == 'Float': vals = entry[1].split(',') val = self._map(float, vals) @@ -392,7 +397,10 @@ def _parse_samples(self, samples, samp_fmt, site): if entry_num == 1 or ',' not in vals: if entry_type == 'Integer': - sampdat[i] = int(vals) + try: + sampdat[i] = int(vals) + except ValueError: + sampdat[i] = float(vals) elif entry_type == 'Float': sampdat[i] = float(vals) else: @@ -406,7 +414,10 @@ def _parse_samples(self, samples, samp_fmt, site): vals = vals.split(',') if entry_type == 'Integer': - sampdat[i] = _map(int, vals) + try: + sampdat[i] = _map(int, vals) + except ValueError: + sampdat[i] = _map(float, vals) elif entry_type == 'Float' or entry_type == 'Numeric': sampdat[i] = _map(float, vals) else: From b79302060ba68732870c123abe15936331518ee2 Mon Sep 17 00:00:00 2001 From: Sean Davis Date: Wed, 5 Dec 2012 12:11:06 -0500 Subject: [PATCH 029/168] Fixes #78 --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index 4d54e74..51e8af1 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -328,7 +328,7 @@ def _parse_info(self, info_str): val = True try: - if self.infos[ID].num == 1 and entry_type != 'String': + if self.infos[ID].num == 1 and entry_type not in ( 'String', 'Flag'): val = val[0] except KeyError: pass From c957aab97a2dec018fe18e24010aa3a1ce11c2ba Mon Sep 17 00:00:00 2001 From: James Casbon Date: Thu, 6 Dec 2012 21:06:01 +0000 Subject: [PATCH 
030/168] 0.6.2 version bump --- tox.ini | 1 + vcf/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 16847bb..771e15f 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,7 @@ commands = deps = argparse ordereddict + cython pysam [testenv:py27] diff --git a/vcf/__init__.py b/vcf/__init__.py index a8be533..cdd1545 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -177,4 +177,4 @@ from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT -VERSION = '0.6.1' +VERSION = '0.6.2' From 95fd749220d3cf3dedda77624f7e9f1e544c01b7 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Thu, 6 Dec 2012 21:07:23 +0000 Subject: [PATCH 031/168] history update --- docs/HISTORY.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index 1e61871..3147631 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -17,6 +17,11 @@ New features should have test code sent with them. Changes ======= +0.6.2 Release +------------- + +* issues #78, #79 (thanks Sean, Brad) + 0.6.1 Release ------------- From 8acaeb3a151a18385748d82ea1fa7310d0a9da4e Mon Sep 17 00:00:00 2001 From: chapmanb Date: Wed, 26 Dec 2012 07:51:19 -0500 Subject: [PATCH 032/168] Correctly format contig output lines from writer, making output VCFs compatible with GATK. 
Fixes #74 --- vcf/parser.py | 6 +++++- vcf/test/test_vcf.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 51e8af1..97a202d 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -549,12 +549,16 @@ def __init__(self, stream, template, lineterminator="\r\n"): two = '##{key}=\n' four = '##{key}=\n' + contig_format = '##contig=\n' _num = self._fix_field_count for (key, vals) in template.metadata.iteritems(): if key in SINGULAR_METADATA: vals = [vals] for val in vals: - stream.write('##{0}={1}\n'.format(key, val)) + if key == "contig": + stream.write(contig_format.format(**val)) + else: + stream.write('##{0}={1}\n'.format(key, val)) for line in template.infos.itervalues(): stream.write(four.format(key="INFO", *line, num=_num(line.num))) for line in template.formats.itervalues(): diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index e4c3426..a88f4ef 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -218,7 +218,11 @@ def testWrite(self): for record in records: writer.write_record(record) out.seek(0) - print (out.getvalue()) + out_str = out.getvalue() + for line in out_str.split("\n"): + if line.startswith("##contig"): + assert " Date: Thu, 10 Jan 2013 16:35:40 +0100 Subject: [PATCH 033/168] Correctly write meta lines with dictionary value Write meta lines with a dictionary-like value as ##meta=<key=value,...> instead of as the Python dictionary string representation. This is a fix for jamescasbon#83 and a generalization of jamescasbon#81. A regression compared to jamescasbon#81 is that the order of fields in a `contig` line is no longer defined.
--- vcf/parser.py | 7 ++++--- vcf/test/test_vcf.py | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 97a202d..4dd18f4 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -549,14 +549,15 @@ def __init__(self, stream, template, lineterminator="\r\n"): two = '##{key}=\n' four = '##{key}=\n' - contig_format = '##contig=\n' _num = self._fix_field_count for (key, vals) in template.metadata.iteritems(): if key in SINGULAR_METADATA: vals = [vals] for val in vals: - if key == "contig": - stream.write(contig_format.format(**val)) + if isinstance(val, dict): + values = ','.join('{0}={1}'.format(key, value) + for key, value in val.items()) + stream.write('##{0}=<{1}>\n'.format(key, values)) else: stream.write('##{0}={1}\n'.format(key, val)) for line in template.infos.itervalues(): diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index a88f4ef..73a2c22 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -221,7 +221,7 @@ def testWrite(self): out_str = out.getvalue() for line in out_str.split("\n"): if line.startswith("##contig"): - assert " Date: Thu, 10 Jan 2013 23:46:50 +0100 Subject: [PATCH 034/168] Preserve order in meta lines with dictionary value --- vcf/parser.py | 2 +- vcf/test/test_vcf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 4dd18f4..cbec08d 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -158,7 +158,7 @@ def read_meta_hash(self, meta_string): # Removing initial hash marks and final equal sign key = items[0][2:-1] hashItems = items[1].split(',') - val = dict(item.split("=") for item in hashItems) + val = OrderedDict(item.split("=") for item in hashItems) return key, val def read_meta(self, meta_string): diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 73a2c22..be060c0 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -273,7 +273,7 @@ def testWrite(self): out_str = out.getvalue() 
for line in out_str.split("\n"): if line.startswith("##PEDIGREE"): - assert line.startswith('##PEDIGREE=<'), "Found dictionary in meta line: {0}".format(line) + self.assertEquals(line, '##PEDIGREE=') if line.startswith("##SAMPLE"): assert line.startswith('##SAMPLE=<'), "Found dictionary in meta line: {0}".format(line) From 3256c66306b8432eac1ffc7f004f13aa551564d1 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Wed, 16 Jan 2013 11:08:36 +0000 Subject: [PATCH 035/168] add missing cparse implementation of #79 --- vcf/cparse.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index 4a473d7..682e6a7 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -48,7 +48,10 @@ def parse_samples( if entry_num == 1 or ',' not in vals: if entry_type == INTEGER: - sampdat[j] = int(vals) + try: + sampdat[j] = int(vals) + except ValueError: + sampdat[j] = float(vals) elif entry_type == FLOAT or entry_type == NUMERIC: sampdat[j] = float(vals) else: @@ -62,7 +65,10 @@ def parse_samples( vals = vals.split(',') if entry_type == INTEGER: - sampdat[j] = _map(int, vals) + try: + sampdat[j] = _map(int, vals) + except ValueError: + sampdat[j] = map(float, vals) elif entry_type == FLOAT or entry_type == NUMERIC: sampdat[j] = _map(float, vals) else: From 6a64d4b2e27821e77ce495f5899a059415001b0f Mon Sep 17 00:00:00 2001 From: James Casbon Date: Wed, 16 Jan 2013 11:40:59 +0000 Subject: [PATCH 036/168] version bump to 0.6.3 --- .gitignore | 1 + docs/HISTORY.rst | 6 ++++++ vcf/__init__.py | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b9e4fea..a18ec95 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ docs/_build .DS_Store vcf/cparse.c vcf/cparse.so +.coverage diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index 3147631..fc3f2b3 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -17,6 +17,12 @@ New features should have test code sent with them. 
Changes ======= +0.6.3 Release +------------- + +* cython port of #79 +* correct writing of meta lines #84 + 0.6.2 Release ------------- diff --git a/vcf/__init__.py b/vcf/__init__.py index cdd1545..7ab38ee 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -177,4 +177,4 @@ from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT -VERSION = '0.6.2' +VERSION = '0.6.3' From 53548b6a73220e328cb0d891ad25ecf0f1eab4d0 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Thu, 17 Jan 2013 10:01:18 +0000 Subject: [PATCH 037/168] Update .travis.yml Fix the travis build, hopefully --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index cdbf63a..47b1002 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ python: - "3.2" - "pypy" install: - - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors pysam argparse ordereddict; fi" - - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pip install --use-mirrors pysam; fi" + - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam argparse ordereddict; fi" + - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam; fi" - python setup.py install script: python setup.py test From 4ce6aff3f6930660bcfe52cdf0e08ff53a9ec969 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 28 Jan 2013 12:33:52 +0000 Subject: [PATCH 038/168] handle String INFO fields with multiple values --- vcf/parser.py | 5 +++-- vcf/test/example-4.1-info-multiple-values.vcf | 7 +++++++ vcf/test/test_vcf.py | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 vcf/test/example-4.1-info-multiple-values.vcf diff --git a/vcf/parser.py b/vcf/parser.py index cbec08d..6d42f88 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -323,12 +323,13 @@ def _parse_info(self, info_str): val = True elif 
entry_type == 'String': try: - val = entry[1] + vals = entry[1].split(',') # commas are reserved characters indicating multiple values + val = self._map(str, vals) except IndexError: val = True try: - if self.infos[ID].num == 1 and entry_type not in ( 'String', 'Flag'): + if self.infos[ID].num == 1 and entry_type not in ( 'Flag', ): val = val[0] except KeyError: pass diff --git a/vcf/test/example-4.1-info-multiple-values.vcf b/vcf/test/example-4.1-info-multiple-values.vcf new file mode 100644 index 0000000..6faf95e --- /dev/null +++ b/vcf/test/example-4.1-info-multiple-values.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT +Pf3D7_01_v3 401 . C T 53.99 PASS RepeatCopies=19.3,47.4,14.0;RepeatSize=42,14,56;RepeatConsensus=TCTTATCTTCTTACTTTTCATTCCTTACTCTTACTTACTTAC,TTACTCTTACTTAC,TTACTCTTACTTACTTACTCTTACTTACTTACTCTTACTTACTTACTCTTATCTTC diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index be060c0..072cfd2 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -571,6 +571,22 @@ def test_qual(self): self.assertEqual(expected, qual) self.assertEqual(type(expected), qtype) + def test_info_multiple_values(self): + reader = vcf.Reader(fh('example-4.1-info-multiple-values.vcf')) + var = reader.next() + # check Float type INFO field with multiple values + expected = [19.3, 47.4, 14.0] + actual = var.INFO['RepeatCopies'] + self.assertEqual(expected, actual) + # check Integer type INFO field with multiple values + expected = [42, 14, 56] + actual = var.INFO['RepeatSize'] + self.assertEqual(expected, actual) + # check String type INFO field with multiple values + expected = ['TCTTATCTTCTTACTTTTCATTCCTTACTCTTACTTACTTAC', 'TTACTCTTACTTAC', 'TTACTCTTACTTACTTACTCTTACTTACTTACTCTTACTTACTTACTCTTATCTTC'] + actual = var.INFO['RepeatConsensus'] + self.assertEqual(expected, actual) + class TestCall(unittest.TestCase): From 3540bb7feb13e21cb993770966d5d86992d95a44 Mon Sep 17 00:00:00 2001 From: 
bow Date: Wed, 30 Jan 2013 17:33:34 +0100 Subject: [PATCH 039/168] Update writer unit tests to test call data equality Samples written by the writer should have the exact same data before and after they are parsed. Previously this was not tested, since call data equality testing only checks for the sample name, genotype, and record (and not other data fields). --- vcf/test/test_vcf.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index be060c0..5bf1e6b 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -232,6 +232,11 @@ def testWrite(self): for l, r in zip(records, reader2): self.assertEquals(l.samples, r.samples) + # test for call data equality, since equality on the sample calls + # may not always mean their data are all equal + for l_call, r_call in zip(l.samples, r.samples): + self.assertEqual(l_call.data, r_call.data) + class TestBcfToolsOutputWriter(unittest.TestCase): @@ -256,6 +261,11 @@ def testWrite(self): for l, r in zip(records, reader2): self.assertEquals(l.samples, r.samples) + # test for call data equality, since equality on the sample calls + # may not always mean their data are all equal + for l_call, r_call in zip(l.samples, r.samples): + self.assertEqual(l_call.data, r_call.data) + class TestWriterDictionaryMeta(unittest.TestCase): From e3e54843fb6b0b88b97a4d5ea936d9b64e69a2d6 Mon Sep 17 00:00:00 2001 From: bow Date: Tue, 29 Jan 2013 01:21:13 +0100 Subject: [PATCH 040/168] Fix bug that removes sample data when GT field is not present Some programs (e.g. bcftools) may output VCF files whose samples do not have a GT field value. When dealing with files like these, PyVCF's writer previously removed all non-GT data and replaced it with './.', since the `_format_sample` function returned immediately upon failing to find GT data. This fix addresses the issue, so that the Writer keeps any non-GT data intact.
--- vcf/parser.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index cbec08d..a2e7eaa 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -620,9 +620,30 @@ def _format_info(self, info): return ';'.join([self._stringify_pair(x,y) for x, y in info.iteritems()]) def _format_sample(self, fmt, sample): - if getattr(sample.data, 'GT', None) is None: - return "./." - return ':'.join([self._stringify(x) for x in sample.data]) + try: + # Try to get the GT value first. + gt = getattr(sample.data, 'GT') + # PyVCF stores './.' GT values as None, so we need to revert it back + # to './.' when writing. + if gt is None: + gt = './.' + except AttributeError: + # Failing that, try to check whether 'GT' is specified in the FORMAT + # field. If yes, use the recommended empty value ('./.') + if 'GT' in fmt: + gt = './.' + # Otherwise use an empty string as the value + else: + gt = '' + # If gt is an empty string (i.e. not stored), write all other data + if not gt: + return ':'.join([self._stringify(x) for x in sample.data]) + # Otherwise use the GT values from above and combine it with the rest of + # the data. + # Note that this follows the VCF spec, where GT is always the first + # item whenever it is present. 
+ else: + return ':'.join([gt] + [self._stringify(x) for x in sample.data[1:]]) def _stringify(self, x, none='.', delim=','): if type(x) == type([]): From 10b26fc4d5733d7e7f97336009449ec12160c1c2 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 26 Feb 2013 16:44:07 +0100 Subject: [PATCH 041/168] Record with empty list of samples instead of None --- vcf/model.py | 2 +- vcf/test/1kg.sites.vcf | 200 +++++++++++++++++++++++++++++++++++++++++ vcf/test/test_vcf.py | 12 +++ 3 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 vcf/test/1kg.sites.vcf diff --git a/vcf/model.py b/vcf/model.py index 9748784..7d28506 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -133,7 +133,7 @@ def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, self.alleles = [self.REF] self.alleles.extend(self.ALT) #: list of ``_Calls`` for each sample ordered as in source VCF - self.samples = samples + self.samples = samples or [] self._sample_indexes = sample_indexes def __eq__(self, other): diff --git a/vcf/test/1kg.sites.vcf b/vcf/test/1kg.sites.vcf new file mode 100644 index 0000000..857a944 --- /dev/null +++ b/vcf/test/1kg.sites.vcf @@ -0,0 +1,200 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##reference=GRCh37 +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10583 rs58108140 G A 100 PASS AVGPOST=0.7707;RSQ=0.4319;LDAF=0.2327;ERATE=0.0161;AN=2184;VT=SNP;AA=.;THETA=0.0046;AC=314;SNPSOURCE=LOWCOV;AF=0.14;ASN_AF=0.13;AMR_AF=0.17;AFR_AF=0.04;EUR_AF=0.21 +1 10611 rs189107123 C G 100 PASS AN=2184;THETA=0.0077;VT=SNP;AA=.;AC=41;ERATE=0.0048;SNPSOURCE=LOWCOV;AVGPOST=0.9330;LDAF=0.0479;RSQ=0.3475;AF=0.02;ASN_AF=0.01;AMR_AF=0.03;AFR_AF=0.01;EUR_AF=0.02 +1 13302 rs180734498 C T 100 PASS 
THETA=0.0048;AN=2184;AC=249;VT=SNP;AA=.;RSQ=0.6281;LDAF=0.1573;SNPSOURCE=LOWCOV;AVGPOST=0.8895;ERATE=0.0058;AF=0.11;ASN_AF=0.02;AMR_AF=0.08;AFR_AF=0.21;EUR_AF=0.14 +1 13327 rs144762171 G C 100 PASS AVGPOST=0.9698;AN=2184;VT=SNP;AA=.;RSQ=0.6482;AC=59;SNPSOURCE=LOWCOV;ERATE=0.0012;LDAF=0.0359;THETA=0.0204;AF=0.03;ASN_AF=0.02;AMR_AF=0.03;AFR_AF=0.02;EUR_AF=0.04 +1 13957 rs201747181 TC T 28 PASS AA=TC;AC=35;AF=0.02;AFR_AF=0.02;AMR_AF=0.02;AN=2184;ASN_AF=0.01;AVGPOST=0.8711;ERATE=0.0065;EUR_AF=0.02;LDAF=0.0788;RSQ=0.2501;THETA=0.0100;VT=INDEL +1 13980 rs151276478 T C 100 PASS AN=2184;AC=45;ERATE=0.0034;THETA=0.0139;RSQ=0.3603;LDAF=0.0525;VT=SNP;AA=.;AVGPOST=0.9221;SNPSOURCE=LOWCOV;AF=0.02;ASN_AF=0.02;AMR_AF=0.02;AFR_AF=0.01;EUR_AF=0.02 +1 30923 rs140337953 G T 100 PASS AC=1584;AA=T;AN=2184;RSQ=0.5481;VT=SNP;THETA=0.0162;SNPSOURCE=LOWCOV;ERATE=0.0183;LDAF=0.6576;AVGPOST=0.7335;AF=0.73;ASN_AF=0.89;AMR_AF=0.80;AFR_AF=0.48;EUR_AF=0.73 +1 46402 rs199681827 C CTGT 31 PASS AA=.;AC=8;AF=0.0037;AFR_AF=0.01;AN=2184;ASN_AF=0.0017;AVGPOST=0.8325;ERATE=0.0072;LDAF=0.0903;RSQ=0.0960;THETA=0.0121;VT=INDEL +1 47190 rs200430748 G GA 192 PASS AA=G;AC=29;AF=0.01;AFR_AF=0.06;AMR_AF=0.0028;AN=2184;AVGPOST=0.9041;ERATE=0.0041;LDAF=0.0628;RSQ=0.2883;THETA=0.0153;VT=INDEL +1 51476 rs187298206 T C 100 PASS ERATE=0.0021;AA=C;AC=18;AN=2184;VT=SNP;THETA=0.0103;LDAF=0.0157;SNPSOURCE=LOWCOV;AVGPOST=0.9819;RSQ=0.5258;AF=0.01;ASN_AF=0.01;AMR_AF=0.01;AFR_AF=0.01;EUR_AF=0.01 +1 51479 rs116400033 T A 100 PASS RSQ=0.7414;AVGPOST=0.9085;AA=T;AN=2184;THETA=0.0131;AC=235;VT=SNP;LDAF=0.1404;SNPSOURCE=LOWCOV;ERATE=0.0012;AF=0.11;ASN_AF=0.0035;AMR_AF=0.16;AFR_AF=0.03;EUR_AF=0.22 +1 51914 rs190452223 T G 100 PASS ERATE=0.0004;AVGPOST=0.9985;THETA=0.0159;AA=T;AN=2184;VT=SNP;SNPSOURCE=LOWCOV;AC=1;RSQ=0.4089;LDAF=0.0012;AF=0.0005;ASN_AF=0.0017 +1 51935 rs181754315 C T 100 PASS THETA=0.0126;AA=C;AN=2184;RSQ=0.1888;AVGPOST=0.9972;LDAF=0.0015;VT=SNP;AC=0;SNPSOURCE=LOWCOV;ERATE=0.0006;AF=0 +1 51954 rs185832753 G C 100 
PASS LDAF=0.0021;AA=G;AN=2184;RSQ=0.4692;AVGPOST=0.9975;VT=SNP;SNPSOURCE=LOWCOV;THETA=0.0029;ERATE=0.0006;AC=2;AF=0.0009;AMR_AF=0.01 +1 52058 rs62637813 G C 100 PASS AA=C;ERATE=0.0057;AN=2184;AVGPOST=0.9264;VT=SNP;RSQ=0.4882;AC=64;SNPSOURCE=LOWCOV;LDAF=0.0620;THETA=0.0069;AF=0.03;ASN_AF=0.0017;AMR_AF=0.04;AFR_AF=0.02;EUR_AF=0.05 +1 52144 rs190291950 T A 100 PASS THETA=0.0093;ERATE=0.0013;LDAF=0.0156;AA=T;AN=2184;VT=SNP;RSQ=0.5220;AVGPOST=0.9811;SNPSOURCE=LOWCOV;AC=21;AF=0.01;ASN_AF=0.0035;AMR_AF=0.01;AFR_AF=0.01;EUR_AF=0.01 +1 52185 rs201374420 TTAA T 244 PASS AA=.;AC=10;AF=0.0046;AFR_AF=0.0020;AMR_AF=0.02;AN=2184;ASN_AF=0.0035;AVGPOST=0.9840;ERATE=0.0037;LDAF=0.0124;RSQ=0.4271;THETA=0.0232;VT=INDEL +1 52238 rs150021059 T G 100 PASS THETA=0.0132;AA=G;AN=2184;RSQ=0.6256;VT=SNP;ERATE=0.0026;AVGPOST=0.8617;SNPSOURCE=LOWCOV;AC=1941;LDAF=0.8423;AF=0.89;ASN_AF=0.99;AMR_AF=0.93;AFR_AF=0.64;EUR_AF=0.95 +1 53234 rs199502715 CAT C 227 PASS AA=CAT;AC=10;AF=0.0046;AFR_AF=0.02;AMR_AF=0.0028;AN=2184;AVGPOST=0.9936;ERATE=0.0007;LDAF=0.0074;RSQ=0.6237;THETA=0.0119;VT=INDEL +1 54353 rs140052487 C A 100 PASS THETA=0.0026;AA=C;AN=2184;AC=16;VT=SNP;RSQ=0.5074;SNPSOURCE=LOWCOV;AVGPOST=0.9844;LDAF=0.0146;ERATE=0.0058;AF=0.01;ASN_AF=0.01;AMR_AF=0.0028;AFR_AF=0.02;EUR_AF=0.0013 +1 54421 rs146477069 A G 100 PASS ERATE=0.0013;AN=2184;AC=220;VT=SNP;RSQ=0.7869;AVGPOST=0.9461;AA=A;THETA=0.0025;SNPSOURCE=LOWCOV;LDAF=0.1190;AF=0.10;ASN_AF=0.25;AMR_AF=0.12;AFR_AF=0.03;EUR_AF=0.02 +1 54490 rs141149254 G A 100 PASS ERATE=0.0004;THETA=0.0074;AA=G;AN=2184;VT=SNP;RSQ=0.8366;AVGPOST=0.9646;AC=175;SNPSOURCE=LOWCOV;LDAF=0.0929;AF=0.08;ASN_AF=0.0035;AMR_AF=0.12;AFR_AF=0.03;EUR_AF=0.15 +1 54676 rs2462492 C T 100 PASS LDAF=0.1528;RSQ=0.6989;AA=T;AN=2184;AC=267;VT=SNP;AVGPOST=0.8998;SNPSOURCE=LOWCOV;THETA=0.0110;ERATE=0.0037;AF=0.12;ASN_AF=0.02;AMR_AF=0.20;AFR_AF=0.09;EUR_AF=0.18 +1 54753 rs143174675 T G 100 PASS 
AA=T;AN=2184;RSQ=0.6820;AC=65;VT=SNP;THETA=0.0080;ERATE=0.0016;SNPSOURCE=LOWCOV;AVGPOST=0.9697;LDAF=0.0399;AF=0.03;AMR_AF=0.04;AFR_AF=0.07;EUR_AF=0.03 +1 55164 rs3091274 C A 100 PASS AN=2184;VT=SNP;ERATE=0.0045;AA=A;THETA=0.0162;SNPSOURCE=LOWCOV;AC=1955;RSQ=0.6373;AVGPOST=0.8686;LDAF=0.8489;AF=0.90;ASN_AF=0.99;AMR_AF=0.94;AFR_AF=0.65;EUR_AF=0.96 +1 55249 rs200769871 C CTATGG 443 PASS AA=C;AC=151;AF=0.07;AFR_AF=0.03;AMR_AF=0.08;AN=2184;ASN_AF=0.16;AVGPOST=0.9073;ERATE=0.0063;EUR_AF=0.02;LDAF=0.0968;RSQ=0.5891;THETA=0.0038;VT=INDEL +1 55299 rs10399749 C T 100 PASS RSQ=0.7602;LDAF=0.2954;AN=2184;VT=SNP;ERATE=0.0051;AA=c;AC=554;SNPSOURCE=LOWCOV;AVGPOST=0.8845;THETA=0.0070;AF=0.25;ASN_AF=0.33;AMR_AF=0.21;AFR_AF=0.39;EUR_AF=0.13 +1 55313 rs182462964 A T 100 PASS ERATE=0.0004;RSQ=0.6112;AVGPOST=0.9994;AN=2184;VT=SNP;THETA=0.0057;AA=A;SNPSOURCE=LOWCOV;AC=1;LDAF=0.0008;AF=0.0005;AFR_AF=0.0020 +1 55326 rs3107975 T C 100 PASS AA=C;ERATE=0.0074;AN=2184;THETA=0.0085;VT=SNP;SNPSOURCE=LOWCOV;AVGPOST=0.9622;AC=90;RSQ=0.6901;LDAF=0.0562;AF=0.04;ASN_AF=0.07;AMR_AF=0.02;AFR_AF=0.07;EUR_AF=0.01 +1 55330 rs185215913 G A 100 PASS ERATE=0.0005;AA=G;AN=2184;VT=SNP;THETA=0.0086;AVGPOST=0.9988;LDAF=0.0011;SNPSOURCE=LOWCOV;AC=1;RSQ=0.4701;AF=0.0005;AFR_AF=0.0020 +1 55367 rs190850374 G A 100 PASS ERATE=0.0004;THETA=0.0044;AA=G;AN=2184;VT=SNP;LDAF=0.0029;RSQ=0.3860;SNPSOURCE=LOWCOV;AVGPOST=0.9961;AC=2;AF=0.0009;AMR_AF=0.01 +1 55388 rs182711216 C T 100 PASS THETA=0.0102;ERATE=0.0005;AA=C;AVGPOST=0.9983;AN=2184;LDAF=0.0010;VT=SNP;RSQ=0.2348;SNPSOURCE=LOWCOV;AC=1;AF=0.0005;ASN_AF=0.0017 +1 55394 rs2949420 T A 100 PASS AC=18;AN=2184;VT=SNP;AA=A;RSQ=0.4995;AVGPOST=0.9784;LDAF=0.0171;SNPSOURCE=LOWCOV;ERATE=0.0012;THETA=0.0063;AF=0.01;AMR_AF=0.01;AFR_AF=0.0041;EUR_AF=0.02 +1 55416 rs193242050 G A 100 PASS AA=G;AN=2184;AVGPOST=0.9944;VT=SNP;LDAF=0.0064;AC=9;THETA=0.0019;RSQ=0.6553;SNPSOURCE=LOWCOV;ERATE=0.0006;AF=0.0041;AFR_AF=0.02 +1 55427 rs183189405 T C 100 PASS 
THETA=0.0054;AA=T;AN=2184;VT=SNP;AVGPOST=0.9969;LDAF=0.0020;SNPSOURCE=LOWCOV;AC=1;RSQ=0.2759;ERATE=0.0007;AF=0.0005;AFR_AF=0.0020 +1 55816 rs187434873 G A 100 PASS AN=2184;THETA=0.0119;VT=SNP;AC=10;RSQ=0.4578;AA=A;SNPSOURCE=LOWCOV;AVGPOST=0.9844;LDAF=0.0108;ERATE=0.0007;AF=0.0046;AMR_AF=0.01;EUR_AF=0.01 +1 55850 rs191890754 C G 100 PASS AVGPOST=0.9921;AA=G;AN=2184;VT=SNP;RSQ=0.4083;THETA=0.0045;LDAF=0.0056;AC=5;SNPSOURCE=LOWCOV;ERATE=0.0006;AF=0.0023;EUR_AF=0.01 +1 55852 rs184233019 G C 100 PASS THETA=0.0137;AA=G;AN=2184;RSQ=0.5433;ERATE=0.0009;LDAF=0.0046;VT=SNP;AVGPOST=0.9953;AC=5;SNPSOURCE=LOWCOV;AF=0.0023;AMR_AF=0.01;EUR_AF=0.0013 +1 56644 rs143342222 A C 100 PASS AN=2184;AVGPOST=0.9962;LDAF=0.0040;ERATE=0.0024;VT=SNP;AA=A;RSQ=0.5700;AC=5;SNPSOURCE=LOWCOV;THETA=0.0117;AF=0.0023;AFR_AF=0.01 +1 57952 rs189727433 A C 100 PASS AA=C;ERATE=0.0085;AN=2184;LDAF=0.7878;VT=SNP;THETA=0.0076;RSQ=0.4712;AC=1902;SNPSOURCE=LOWCOV;AVGPOST=0.7578;AF=0.87;ASN_AF=0.98;AMR_AF=0.91;AFR_AF=0.64;EUR_AF=0.91 +1 58814 rs114420996 G A 100 PASS AC=223;THETA=0.0032;AA=G;AN=2184;RSQ=0.9087;LDAF=0.1074;VT=SNP;SNPSOURCE=LOWCOV;ERATE=0.0006;AVGPOST=0.9777;AF=0.10;ASN_AF=0.03;AMR_AF=0.17;AFR_AF=0.20;EUR_AF=0.06 +1 59040 rs149755937 T C 100 PASS AVGPOST=0.9710;AC=115;AA=T;AN=2184;RSQ=0.8248;VT=SNP;ERATE=0.0017;THETA=0.0025;SNPSOURCE=LOWCOV;LDAF=0.0613;AF=0.05;ASN_AF=0.03;AMR_AF=0.15;AFR_AF=0.0041;EUR_AF=0.06 +1 60726 rs192328835 C A 100 PASS AVGPOST=0.9092;AN=2184;RSQ=0.5988;ERATE=0.0081;AC=144;VT=SNP;THETA=0.0045;AA=A;SNPSOURCE=LOWCOV;LDAF=0.0959;AF=0.07;ASN_AF=0.05;AMR_AF=0.10;AFR_AF=0.11;EUR_AF=0.03 +1 61442 rs74970982 A G 100 PASS LDAF=0.9152;AA=G;AN=2184;VT=SNP;ERATE=0.0026;RSQ=0.4867;AVGPOST=0.9004;SNPSOURCE=LOWCOV;THETA=0.0013;AC=2084;AF=0.95;ASN_AF=1.00;AMR_AF=0.97;AFR_AF=0.84;EUR_AF=0.99 +1 61462 rs56992750 T A 100 PASS THETA=0.0023;LDAF=0.0378;RSQ=0.7396;AA=T;AN=2184;AVGPOST=0.9773;VT=SNP;AC=68;SNPSOURCE=LOWCOV;ERATE=0.0012;AF=0.03;AMR_AF=0.02;AFR_AF=0.13 +1 61743 rs184286948 G C 100 
PASS AVGPOST=0.9939;LDAF=0.0047;AA=G;AN=2184;VT=SNP;ERATE=0.0011;SNPSOURCE=LOWCOV;AC=4;THETA=0.0016;RSQ=0.4838;AF=0.0018;AMR_AF=0.01;EUR_AF=0.0026 +1 61987 rs76735897 A G 100 PASS THETA=0.0015;AN=2184;AC=569;VT=SNP;AA=A;RSQ=0.7192;AVGPOST=0.8533;LDAF=0.2944;SNPSOURCE=LOWCOV;ERATE=0.0012;AF=0.26;ASN_AF=0.07;AMR_AF=0.31;AFR_AF=0.25;EUR_AF=0.39 +1 61989 rs77573425 G C 100 PASS RSQ=0.7254;AVGPOST=0.8584;AA=G;AN=2184;LDAF=0.2849;VT=SNP;AC=555;THETA=0.0019;SNPSOURCE=LOWCOV;ERATE=0.0007;AF=0.25;ASN_AF=0.07;AMR_AF=0.31;AFR_AF=0.22;EUR_AF=0.39 +1 61993 rs190553843 C T 100 PASS AC=7;RSQ=0.6106;AA=C;THETA=0.0143;AN=2184;ERATE=0.0009;VT=SNP;AVGPOST=0.9953;SNPSOURCE=LOWCOV;LDAF=0.0050;AF=0.0032;AFR_AF=0.01 +1 62156 rs181864839 C T 100 PASS ERATE=0.0005;AA=C;AN=2184;AVGPOST=0.9979;LDAF=0.0015;VT=SNP;THETA=0.0094;SNPSOURCE=LOWCOV;AC=1;RSQ=0.4561;AF=0.0005;AFR_AF=0.0020 +1 62157 rs10399597 G A 100 PASS AVGPOST=0.9945;AA=G;AN=2184;ERATE=0.0025;VT=SNP;RSQ=0.5217;AC=5;THETA=0.0066;SNPSOURCE=LOWCOV;LDAF=0.0050;AF=0.0023;AFR_AF=0.01 +1 62162 rs140556834 G A 100 PASS AA=G;AN=2184;AC=8;LDAF=0.0057;VT=SNP;THETA=0.0018;ERATE=0.0017;RSQ=0.6089;AVGPOST=0.9948;SNPSOURCE=LOWCOV;AF=0.0037;AMR_AF=0.0028;AFR_AF=0.01;EUR_AF=0.0013 +1 63276 rs185977555 G A 100 PASS RSQ=0.2744;AA=G;AN=2184;AVGPOST=0.9947;VT=SNP;ERATE=0.0010;SNPSOURCE=LOWCOV;AC=1;THETA=0.0010;LDAF=0.0031;AF=0.0005;AFR_AF=0.0020 +1 63297 rs188886746 G A 100 PASS ERATE=0.0005;AVGPOST=0.9986;AA=G;AN=2184;VT=SNP;AC=0;SNPSOURCE=LOWCOV;RSQ=0.2459;THETA=0.0024;LDAF=0.0008;AF=0 +1 63671 rs116440577 G A 100 PASS AA=G;AN=2184;ERATE=0.0047;LDAF=0.1773;VT=SNP;THETA=0.0072;AC=369;SNPSOURCE=LOWCOV;RSQ=0.8980;AVGPOST=0.9652;AF=0.17;ASN_AF=0.05;AMR_AF=0.22;AFR_AF=0.35;EUR_AF=0.11 +1 63735 rs201888535 CCTA C 455 PASS AA=CCTA;AC=829;AF=0.38;AFR_AF=0.13;AMR_AF=0.33;AN=2184;ASN_AF=0.69;AVGPOST=0.7654;ERATE=0.0047;EUR_AF=0.34;LDAF=0.4128;RSQ=0.6424;THETA=0.0062;VT=INDEL +1 64649 rs181431124 A C 100 PASS 
RSQ=0.6975;AN=2184;VT=SNP;AA=.;ERATE=0.0008;AVGPOST=0.9918;SNPSOURCE=LOWCOV;AC=21;THETA=0.0024;LDAF=0.0114;AF=0.01;AMR_AF=0.01;EUR_AF=0.03 +1 66162 rs62639105 A T 100 PASS THETA=0.0026;ERATE=0.0166;LDAF=0.3089;AN=2184;VT=SNP;AA=.;AC=544;SNPSOURCE=LOWCOV;RSQ=0.5681;AVGPOST=0.7777;AF=0.25;ASN_AF=0.07;AMR_AF=0.30;AFR_AF=0.23;EUR_AF=0.38 +1 66176 rs28552463 T A 100 PASS AN=2184;RSQ=0.4451;VT=SNP;AA=.;THETA=0.0095;LDAF=0.0631;AC=70;SNPSOURCE=LOWCOV;ERATE=0.0061;AVGPOST=0.9210;AF=0.03;ASN_AF=0.0017;AMR_AF=0.01;AFR_AF=0.13;EUR_AF=0.0013 +1 66219 rs181028663 A T 100 PASS LDAF=0.1137;ERATE=0.0074;AN=2184;VT=SNP;AA=.;AC=68;THETA=0.0059;RSQ=0.2946;AVGPOST=0.8268;SNPSOURCE=LOWCOV;AF=0.03;ASN_AF=0.08;AMR_AF=0.04;AFR_AF=0.01;EUR_AF=0.01 +1 66331 rs186063952 A C 100 PASS THETA=0.0126;AVGPOST=0.7656;RSQ=0.1616;AN=2184;LDAF=0.1387;ERATE=0.0093;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AC=42;AF=0.02;ASN_AF=0.0035;AMR_AF=0.01;AFR_AF=0.07 +1 66442 rs192044252 T A 100 PASS RSQ=0.1763;AVGPOST=0.7894;AN=2184;THETA=0.0031;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AC=36;ERATE=0.0107;LDAF=0.1241;AF=0.02;ASN_AF=0.0035;AMR_AF=0.03;AFR_AF=0.02;EUR_AF=0.01 +1 66457 rs13328655 T A 100 PASS ERATE=0.0085;AN=2184;VT=SNP;AA=.;AC=31;AVGPOST=0.8340;LDAF=0.0957;RSQ=0.1836;SNPSOURCE=LOWCOV;THETA=0.0024;AF=0.01;ASN_AF=0.01;AMR_AF=0.01;AFR_AF=0.03;EUR_AF=0.01 +1 66507 rs12401368 T A 100 PASS ERATE=0.0197;AN=2184;VT=SNP;AA=.;THETA=0.0122;SNPSOURCE=LOWCOV;AC=170;RSQ=0.2110;LDAF=0.2457;AVGPOST=0.6536;AF=0.08;ASN_AF=0.07;AMR_AF=0.09;AFR_AF=0.05;EUR_AF=0.09 +1 67179 rs149952626 C G 100 PASS AVGPOST=0.9946;AN=2184;VT=SNP;AA=.;THETA=0.0046;SNPSOURCE=LOWCOV;ERATE=0.0012;AC=11;RSQ=0.6333;LDAF=0.0069;AF=0.01;ASN_AF=0.02 +1 67181 rs77662731 A G 100 PASS AVGPOST=0.9817;THETA=0.0096;ERATE=0.0013;AN=2184;RSQ=0.8542;AC=104;LDAF=0.0529;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AF=0.05;AMR_AF=0.02;AFR_AF=0.20 +1 69511 rs75062661 A G 100 PASS 
LDAF=0.6051;AC=1424;ERATE=0.0237;AN=2184;RSQ=0.5669;VT=SNP;AA=.;AVGPOST=0.7173;SNPSOURCE=LOWCOV;THETA=0.0052;AF=0.65;ASN_AF=0.87;AMR_AF=0.65;AFR_AF=0.33;EUR_AF=0.70 +1 69534 rs190717287 T C 100 PASS AVGPOST=0.9986;LDAF=0.0013;AN=2184;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AC=1;RSQ=0.4002;THETA=0.0016;ERATE=0.0006;AF=0.0005;ASN_AF=0.0017 +1 69536 rs200013390 C T 100 PASS AA=.;AC=0;AF=0;AN=2184;AVGPOST=0.9986;ERATE=0.0006;LDAF=0.0008;RSQ=0.0677;SNPSOURCE=EXOME;THETA=0.0087;VT=SNP +1 72119 rs199639004 G GTA 158 PASS AA=.;AC=8;AF=0.0037;AMR_AF=0.0028;AN=2184;ASN_AF=0.01;AVGPOST=0.9589;ERATE=0.0026;EUR_AF=0.0013;LDAF=0.0243;RSQ=0.2268;THETA=0.0016;VT=INDEL +1 72148 rs182862337 C T 100 PASS AN=2184;RSQ=0.2794;THETA=0.0130;VT=SNP;AA=.;LDAF=0.0019;AVGPOST=0.9971;SNPSOURCE=LOWCOV;AC=1;ERATE=0.0007;AF=0.0005;AMR_AF=0.0028 +1 72297 rs200651397 G GTAT 160 PASS AA=G;AC=19;AF=0.01;AMR_AF=0.02;AN=2184;ASN_AF=0.01;AVGPOST=0.9383;ERATE=0.0055;EUR_AF=0.01;LDAF=0.0399;RSQ=0.3194;THETA=0.0064;VT=INDEL +1 73841 rs143773730 C T 100 PASS ERATE=0.0303;THETA=0.0044;AN=2184;AVGPOST=0.8178;RSQ=0.5832;VT=SNP;AA=.;SNPSOURCE=LOWCOV;LDAF=0.2588;AC=425;AF=0.19;ASN_AF=0.15;AMR_AF=0.22;AFR_AF=0.17;EUR_AF=0.23 +1 77462 rs188023513 G A 100 PASS LDAF=0.1685;AN=2184;AVGPOST=0.8149;VT=SNP;AA=.;RSQ=0.4624;AC=198;THETA=0.0100;SNPSOURCE=LOWCOV;ERATE=0.0222;AF=0.09;ASN_AF=0.11;AMR_AF=0.12;AFR_AF=0.08;EUR_AF=0.07 +1 77470 rs192898053 T C 100 PASS LDAF=0.0047;AN=2184;VT=SNP;AA=.;ERATE=0.0011;AVGPOST=0.9918;THETA=0.0025;RSQ=0.1818;SNPSOURCE=LOWCOV;AC=1;AF=0.0005;AFR_AF=0.0020 +1 77874 rs184538873 G A 100 PASS THETA=0.0068;LDAF=0.0516;AN=2184;VT=SNP;AA=.;AVGPOST=0.9594;ERATE=0.0011;AC=87;SNPSOURCE=LOWCOV;RSQ=0.6970;AF=0.04;ASN_AF=0.01;AMR_AF=0.12;AFR_AF=0.0041;EUR_AF=0.04 +1 77961 rs78385339 G A 100 PASS AVGPOST=0.9114;AN=2184;VT=SNP;AA=.;THETA=0.0072;RSQ=0.6667;ERATE=0.0011;SNPSOURCE=LOWCOV;AC=192;LDAF=0.1180;AF=0.09;ASN_AF=0.20;AMR_AF=0.14;AFR_AF=0.01;EUR_AF=0.03 +1 79033 rs62641298 A G 100 PASS 
AVGPOST=0.7371;THETA=0.0022;LDAF=0.7962;AN=2184;ERATE=0.0054;VT=SNP;AA=.;AC=1961;SNPSOURCE=LOWCOV;RSQ=0.3963;AF=0.90;ASN_AF=0.98;AMR_AF=0.95;AFR_AF=0.65;EUR_AF=0.97 +1 79050 rs62641299 G T 100 PASS AC=1871;AN=2184;THETA=0.0031;RSQ=0.3928;VT=SNP;AA=.;AVGPOST=0.6803;SNPSOURCE=LOWCOV;LDAF=0.7318;ERATE=0.0107;AF=0.86;ASN_AF=0.98;AMR_AF=0.93;AFR_AF=0.54;EUR_AF=0.94 +1 79137 rs143777184 A T 100 PASS AN=2184;AC=55;ERATE=0.0009;AVGPOST=0.9773;LDAF=0.0324;VT=SNP;AA=.;THETA=0.0091;SNPSOURCE=LOWCOV;RSQ=0.7309;AF=0.03;AMR_AF=0.01;AFR_AF=0.10 +1 79417 rs184768190 C T 100 PASS ERATE=0.0005;THETA=0.0166;AN=2184;RSQ=0.5026;AVGPOST=0.9975;VT=SNP;AA=.;LDAF=0.0022;SNPSOURCE=LOWCOV;AC=2;AF=0.0009;ASN_AF=0.0035 +1 79772 rs147215883 C G 100 PASS LDAF=0.1066;AN=2184;THETA=0.0138;RSQ=0.7199;VT=SNP;AA=.;AVGPOST=0.9271;AC=176;ERATE=0.0011;SNPSOURCE=LOWCOV;AF=0.08;ASN_AF=0.07;AMR_AF=0.06;AFR_AF=0.10;EUR_AF=0.09 +1 79872 rs189224661 T G 100 PASS THETA=0.0054;AN=2184;LDAF=0.0057;VT=SNP;AA=.;ERATE=0.0017;AC=9;AVGPOST=0.9956;SNPSOURCE=LOWCOV;RSQ=0.6548;AF=0.0041;AFR_AF=0.02 +1 80454 rs144226842 G C 100 PASS RSQ=0.6549;LDAF=0.0035;AN=2184;AVGPOST=0.9975;VT=SNP;AA=.;AC=5;SNPSOURCE=LOWCOV;THETA=0.0110;ERATE=0.0015;AF=0.0023;ASN_AF=0.01 +1 81949 rs181567186 T C 100 PASS AN=2184;ERATE=0.0009;VT=SNP;AA=.;AVGPOST=0.9948;LDAF=0.0030;SNPSOURCE=LOWCOV;AC=1;THETA=0.0052;RSQ=0.2129;AF=0.0005;ASN_AF=0.0017 +1 82163 rs139113303 G A 100 PASS AN=2184;LDAF=0.0375;ERATE=0.0009;VT=SNP;AA=.;RSQ=0.7842;AC=66;THETA=0.0053;SNPSOURCE=LOWCOV;AVGPOST=0.9761;AF=0.03;ASN_AF=0.0017;AMR_AF=0.01;AFR_AF=0.0020;EUR_AF=0.08 +1 82249 rs1851945 A G 100 PASS THETA=0.0137;LDAF=0.0712;AVGPOST=0.9150;AN=2184;VT=SNP;AA=.;RSQ=0.4689;AC=75;ERATE=0.0116;SNPSOURCE=LOWCOV;AF=0.03;ASN_AF=0.03;AMR_AF=0.04;AFR_AF=0.02;EUR_AF=0.04 +1 82609 rs149189449 C G 100 PASS ERATE=0.0005;AN=2184;LDAF=0.0364;VT=SNP;AA=.;AC=68;AVGPOST=0.9822;RSQ=0.8408;SNPSOURCE=LOWCOV;THETA=0.0024;AF=0.03;AMR_AF=0.02;AFR_AF=0.0020;EUR_AF=0.08 +1 82676 rs185237834 T G 100 
PASS LDAF=0.1144;AN=2184;AVGPOST=0.9264;VT=SNP;AA=.;RSQ=0.7176;AC=198;THETA=0.0025;SNPSOURCE=LOWCOV;ERATE=0.0056;AF=0.09;ASN_AF=0.07;AMR_AF=0.08;AFR_AF=0.12;EUR_AF=0.10 +1 82734 rs4030331 T C 100 PASS AN=2184;THETA=0.0008;VT=SNP;AA=.;ERATE=0.0158;RSQ=0.6316;AVGPOST=0.8280;LDAF=0.2433;SNPSOURCE=LOWCOV;AC=435;AF=0.20;ASN_AF=0.15;AMR_AF=0.28;AFR_AF=0.24;EUR_AF=0.17 +1 82957 rs189774606 C T 100 PASS RSQ=0.5163;AN=2184;VT=SNP;AA=.;LDAF=0.0072;THETA=0.0028;AC=9;AVGPOST=0.9918;SNPSOURCE=LOWCOV;ERATE=0.0012;AF=0.0041;AMR_AF=0.01;AFR_AF=0.01 +1 83084 rs181193408 T A 100 PASS AN=2184;AVGPOST=0.8261;VT=SNP;AA=.;RSQ=0.5750;AC=1914;LDAF=0.8278;SNPSOURCE=LOWCOV;ERATE=0.0061;THETA=0.0064;AF=0.88;ASN_AF=0.99;AMR_AF=0.92;AFR_AF=0.58;EUR_AF=0.96 +1 83088 rs186081601 G C 100 PASS ERATE=0.0013;AN=2184;LDAF=0.0043;VT=SNP;AA=.;AVGPOST=0.9922;RSQ=0.1618;THETA=0.0019;SNPSOURCE=LOWCOV;AC=1;AF=0.0005;AFR_AF=0.0020 +1 83771 rs189906733 T G 100 PASS RSQ=0.6473;AN=2184;AVGPOST=0.9871;VT=SNP;AA=.;AC=24;ERATE=0.0011;SNPSOURCE=LOWCOV;THETA=0.0043;LDAF=0.0158;AF=0.01;AMR_AF=0.01;AFR_AF=0.04;EUR_AF=0.0013 +1 83977 rs180759811 A G 100 PASS AN=2184;ERATE=0.0009;VT=SNP;AA=.;THETA=0.0059;LDAF=0.0038;RSQ=0.2074;SNPSOURCE=LOWCOV;AC=1;AVGPOST=0.9932;AF=0.0005;AFR_AF=0.0020 +1 84002 rs28850140 G A 100 PASS THETA=0.0050;ERATE=0.0211;AN=2184;AC=236;VT=SNP;AA=.;AVGPOST=0.8144;LDAF=0.1921;SNPSOURCE=LOWCOV;RSQ=0.4810;AF=0.11;ASN_AF=0.12;AMR_AF=0.15;AFR_AF=0.07;EUR_AF=0.11 +1 84005 rs202079949 AG A 78 PASS AA=.;AC=52;AF=0.02;AFR_AF=0.02;AMR_AF=0.03;AN=2184;ASN_AF=0.01;AVGPOST=0.9360;ERATE=0.0049;EUR_AF=0.04;LDAF=0.0514;RSQ=0.4690;THETA=0.0005;VT=INDEL +1 84010 rs186443818 G A 100 PASS AVGPOST=0.9169;AN=2184;VT=SNP;AA=.;AC=97;THETA=0.0087;LDAF=0.0789;SNPSOURCE=LOWCOV;ERATE=0.0061;RSQ=0.5318;AF=0.04;ASN_AF=0.03;AMR_AF=0.05;AFR_AF=0.03;EUR_AF=0.06 +1 84079 rs190867312 T C 100 PASS 
ERATE=0.0021;AN=2184;AC=6;VT=SNP;AA=.;LDAF=0.0049;AVGPOST=0.9956;RSQ=0.5906;SNPSOURCE=LOWCOV;THETA=0.0016;AF=0.0027;AMR_AF=0.0028;AFR_AF=0.01 +1 84139 rs183605470 A T 100 PASS THETA=0.0023;AC=28;AN=2184;RSQ=0.6469;VT=SNP;AA=.;LDAF=0.0180;SNPSOURCE=LOWCOV;AVGPOST=0.9835;ERATE=0.0006;AF=0.01;ASN_AF=0.0017;AMR_AF=0.07;AFR_AF=0.0041 +1 84156 rs188652299 A C 100 PASS THETA=0.0009;AVGPOST=0.9936;ERATE=0.0014;RSQ=0.3359;AN=2184;VT=SNP;AA=.;LDAF=0.0044;SNPSOURCE=LOWCOV;AC=3;AF=0.0014;AMR_AF=0.0028;AFR_AF=0.0041 +1 84244 rs191297051 A C 100 PASS LDAF=0.1204;AN=2184;VT=SNP;AA=.;AVGPOST=0.9398;RSQ=0.7828;THETA=0.0025;ERATE=0.0018;SNPSOURCE=LOWCOV;AC=222;AF=0.10;ASN_AF=0.08;AMR_AF=0.08;AFR_AF=0.14;EUR_AF=0.11 +1 84295 rs183209871 G A 100 PASS LDAF=0.0067;AVGPOST=0.9946;AN=2184;THETA=0.0038;VT=SNP;AA=.;AC=9;SNPSOURCE=LOWCOV;ERATE=0.0007;RSQ=0.6599;AF=0.0041;AMR_AF=0.01;EUR_AF=0.01 +1 84346 rs187855973 T C 100 PASS THETA=0.0044;AN=2184;AVGPOST=0.9981;VT=SNP;AA=.;LDAF=0.0014;SNPSOURCE=LOWCOV;AC=1;ERATE=0.0007;RSQ=0.3659;AF=0.0005;EUR_AF=0.0013 +1 84453 rs191379015 C G 100 PASS LDAF=0.0021;RSQ=0.2866;AN=2184;VT=SNP;AA=.;THETA=0.0018;ERATE=0.0008;SNPSOURCE=LOWCOV;AC=1;AVGPOST=0.9968;AF=0.0005;AMR_AF=0.0028 +1 84705 rs183470350 T G 100 PASS LDAF=0.0033;AVGPOST=0.9943;AN=2184;VT=SNP;AA=.;THETA=0.0030;RSQ=0.2658;SNPSOURCE=LOWCOV;AC=2;ERATE=0.0007;AF=0.0009;AMR_AF=0.0028;EUR_AF=0.0013 +1 85063 rs187802690 T C 100 PASS THETA=0.0093;AN=2184;VT=SNP;AA=.;ERATE=0.0051;LDAF=0.0255;RSQ=0.6868;AVGPOST=0.9806;AC=38;SNPSOURCE=LOWCOV;AF=0.02;ASN_AF=0.01;AMR_AF=0.02;AFR_AF=0.01;EUR_AF=0.02 +1 85597 rs192472955 A C 100 PASS AC=145;AVGPOST=0.9322;AN=2184;LDAF=0.0880;VT=SNP;AA=.;RSQ=0.6993;SNPSOURCE=LOWCOV;THETA=0.0020;ERATE=0.0022;AF=0.07;AMR_AF=0.07;AFR_AF=0.11;EUR_AF=0.09 +1 85622 rs185273034 A T 100 PASS ERATE=0.0005;AVGPOST=0.9963;AN=2184;RSQ=0.5194;VT=SNP;AA=.;THETA=0.0174;LDAF=0.0034;SNPSOURCE=LOWCOV;AC=4;AF=0.0018;AFR_AF=0.01 +1 85892 rs147185795 A G 100 PASS 
AVGPOST=0.9936;RSQ=0.7759;AN=2184;VT=SNP;AA=.;LDAF=0.0122;SNPSOURCE=LOWCOV;AC=21;THETA=0.0116;ERATE=0.0007;AF=0.01;AMR_AF=0.0028;AFR_AF=0.04 +1 86000 rs140628094 A C 100 PASS AN=2184;LDAF=0.0062;VT=SNP;AA=.;AC=10;THETA=0.0018;ERATE=0.0008;RSQ=0.7700;SNPSOURCE=LOWCOV;AVGPOST=0.9968;AF=0.0046;AFR_AF=0.02 +1 86018 rs142878000 C G 100 PASS ERATE=0.0036;RSQ=0.7867;AVGPOST=0.9429;AN=2184;AC=213;LDAF=0.1166;VT=SNP;AA=.;THETA=0.0030;SNPSOURCE=LOWCOV;AF=0.10;ASN_AF=0.08;AMR_AF=0.08;AFR_AF=0.12;EUR_AF=0.11 +1 86028 rs114608975 T C 100 PASS ERATE=0.0005;AC=73;AN=2184;RSQ=0.8713;VT=SNP;AA=.;THETA=0.0108;SNPSOURCE=LOWCOV;AVGPOST=0.9841;LDAF=0.0388;AF=0.03;AMR_AF=0.02;AFR_AF=0.0041;EUR_AF=0.08 +1 86064 rs190167736 G A 100 PASS ERATE=0.0004;AN=2184;VT=SNP;AA=.;THETA=0.0081;SNPSOURCE=LOWCOV;AC=1;RSQ=0.5628;AVGPOST=0.9992;LDAF=0.0008;AF=0.0005;AFR_AF=0.0020 +1 86065 rs116504101 G C 100 PASS ERATE=0.0005;LDAF=0.0398;AN=2184;AVGPOST=0.9846;VT=SNP;AA=.;AC=76;THETA=0.0057;RSQ=0.8725;SNPSOURCE=LOWCOV;AF=0.03;AMR_AF=0.02;AFR_AF=0.01;EUR_AF=0.09 +1 86282 rs192830046 T G 100 PASS LDAF=0.0036;AN=2184;VT=SNP;AA=.;SNPSOURCE=LOWCOV;ERATE=0.0012;RSQ=0.2764;AVGPOST=0.9941;AC=2;THETA=0.0034;AF=0.0009;AMR_AF=0.0028;EUR_AF=0.0013 +1 86303 rs2949417 G T 100 PASS THETA=0.0021;RSQ=0.8008;LDAF=0.1194;AN=2184;AC=214;VT=SNP;AA=.;AVGPOST=0.9465;SNPSOURCE=LOWCOV;ERATE=0.0007;AF=0.10;ASN_AF=0.08;AMR_AF=0.08;AFR_AF=0.12;EUR_AF=0.11 +1 86331 rs115209712 A G 100 PASS THETA=0.0047;AN=2184;VT=SNP;AA=.;AC=216;LDAF=0.1195;RSQ=0.8119;ERATE=0.0008;AVGPOST=0.9495;SNPSOURCE=LOWCOV;AF=0.10;ASN_AF=0.08;AMR_AF=0.08;AFR_AF=0.12;EUR_AF=0.11 +1 86982 rs184970101 G A 100 PASS THETA=0.0050;AN=2184;AVGPOST=0.9979;LDAF=0.0015;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AC=1;ERATE=0.0006;RSQ=0.3541;AF=0.0005;AFR_AF=0.0020 +1 87021 rs188486692 T C 100 PASS AN=2184;RSQ=0.4348;VT=SNP;AA=.;THETA=0.0112;AVGPOST=0.9687;ERATE=0.0011;SNPSOURCE=LOWCOV;AC=19;LDAF=0.0221;AF=0.01;AMR_AF=0.01;AFR_AF=0.0041;EUR_AF=0.02 +1 87114 rs200095900 CT C 192 PASS 
AA=.;AC=8;AF=0.0037;AFR_AF=0.02;AN=2184;AVGPOST=0.9976;ERATE=0.0010;LDAF=0.0042;RSQ=0.7479;THETA=0.0149;VT=INDEL +1 87190 rs1524602 G A 100 PASS AN=2184;LDAF=0.2822;VT=SNP;AA=.;RSQ=0.7549;THETA=0.0148;AC=540;SNPSOURCE=LOWCOV;ERATE=0.0096;AVGPOST=0.8739;AF=0.25;ASN_AF=0.29;AMR_AF=0.35;AFR_AF=0.38;EUR_AF=0.08 +1 87360 rs180907504 C T 100 PASS THETA=0.0014;AN=2184;ERATE=0.0025;RSQ=0.3869;VT=SNP;AA=.;AC=14;LDAF=0.0170;SNPSOURCE=LOWCOV;AVGPOST=0.9768;AF=0.01;ASN_AF=0.02;AMR_AF=0.0028;AFR_AF=0.01 +1 87409 rs139490478 C T 100 PASS AN=2184;AC=80;RSQ=0.8364;AVGPOST=0.9797;THETA=0.0075;VT=SNP;AA=.;ERATE=0.0011;SNPSOURCE=LOWCOV;LDAF=0.0438;AF=0.04;ASN_AF=0.0017;AMR_AF=0.02;AFR_AF=0.01;EUR_AF=0.09 +1 87590 rs185279164 G A 100 PASS THETA=0.0068;ERATE=0.0005;AN=2184;VT=SNP;AA=.;LDAF=0.0026;RSQ=0.6866;SNPSOURCE=LOWCOV;AC=4;AVGPOST=0.9982;AF=0.0018;AFR_AF=0.01 +1 87647 rs146836579 T C 100 PASS AN=2184;AC=111;THETA=0.0041;VT=SNP;AA=.;LDAF=0.0558;AVGPOST=0.9811;SNPSOURCE=LOWCOV;ERATE=0.0015;RSQ=0.8636;AF=0.05;AMR_AF=0.03;AFR_AF=0.20 +1 87755 rs140735660 G A 100 PASS ERATE=0.0027;RSQ=0.5060;AN=2184;AC=16;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AVGPOST=0.9847;LDAF=0.0138;THETA=0.0069;AF=0.01;AFR_AF=0.03 +1 87970 rs189643077 T C 100 PASS ERATE=0.0005;AN=2184;RSQ=0.5846;VT=SNP;AA=.;AVGPOST=0.9976;THETA=0.0053;LDAF=0.0025;SNPSOURCE=LOWCOV;AC=3;AF=0.0014;AFR_AF=0.01 +1 87978 rs182297743 G A 100 PASS AVGPOST=0.9963;THETA=0.0074;AN=2184;VT=SNP;AA=.;LDAF=0.0023;RSQ=0.2883;SNPSOURCE=LOWCOV;AC=1;ERATE=0.0006;AF=0.0005;AMR_AF=0.0028 +1 88136 rs59529791 G A 100 PASS RSQ=0.8406;AN=2184;VT=SNP;AA=.;ERATE=0.0010;THETA=0.0059;AVGPOST=0.9778;SNPSOURCE=LOWCOV;AC=106;LDAF=0.0548;AF=0.05;AMR_AF=0.03;AFR_AF=0.20 +1 88169 rs940550 C T 100 PASS RSQ=0.7811;AN=2184;VT=SNP;AA=.;THETA=0.0055;LDAF=0.2576;ERATE=0.0018;SNPSOURCE=LOWCOV;AC=506;AVGPOST=0.8932;AF=0.23;ASN_AF=0.29;AMR_AF=0.33;AFR_AF=0.33;EUR_AF=0.08 +1 88172 rs940551 G A 100 PASS 
RSQ=0.7703;AVGPOST=0.9669;LDAF=0.0483;AN=2184;ERATE=0.0009;VT=SNP;AA=.;THETA=0.0027;SNPSOURCE=LOWCOV;AC=86;AF=0.04;ASN_AF=0.01;AMR_AF=0.03;AFR_AF=0.01;EUR_AF=0.09 +1 88177 rs143215837 G C 100 PASS ERATE=0.0004;AN=2184;LDAF=0.0456;VT=SNP;AA=.;AVGPOST=0.9686;AC=82;THETA=0.0089;SNPSOURCE=LOWCOV;RSQ=0.7787;AF=0.04;AMR_AF=0.03;AFR_AF=0.01;EUR_AF=0.09 +1 88188 rs148331237 C A 100 PASS THETA=0.0039;AN=2184;VT=SNP;AA=.;AC=9;LDAF=0.0085;SNPSOURCE=LOWCOV;ERATE=0.0007;RSQ=0.5212;AVGPOST=0.9910;AF=0.0041;AMR_AF=0.0028;EUR_AF=0.01 +1 88236 rs186918018 C T 100 PASS AVGPOST=0.9904;AN=2184;ERATE=0.0031;RSQ=0.5511;VT=SNP;AA=.;THETA=0.0087;LDAF=0.0097;SNPSOURCE=LOWCOV;AC=11;AF=0.01;AMR_AF=0.02;AFR_AF=0.0041;EUR_AF=0.0026 +1 88250 rs191950833 T A 100 PASS LDAF=0.0013;AN=2184;RSQ=0.1387;VT=SNP;AA=.;AC=0;THETA=0.0019;SNPSOURCE=LOWCOV;ERATE=0.0007;AVGPOST=0.9974;AF=0 +1 88316 rs113759966 G A 100 PASS LDAF=0.0531;AN=2184;VT=SNP;AA=.;THETA=0.0071;RSQ=0.7791;AVGPOST=0.9644;ERATE=0.0008;AC=87;SNPSOURCE=LOWCOV;AF=0.04;ASN_AF=0.0017;AMR_AF=0.04;AFR_AF=0.01;EUR_AF=0.09 +1 88324 rs183326616 A G 100 PASS ERATE=0.0004;AN=2184;AVGPOST=0.9996;VT=SNP;AA=.;RSQ=0.7073;LDAF=0.0006;SNPSOURCE=LOWCOV;AC=1;THETA=0.0092;AF=0.0005;AFR_AF=0.0020 +1 88338 rs55700207 G A 100 PASS THETA=0.0035;RSQ=0.7967;AN=2184;ERATE=0.0034;LDAF=0.1019;VT=SNP;AA=.;AC=186;SNPSOURCE=LOWCOV;AVGPOST=0.9507;AF=0.09;ASN_AF=0.03;AMR_AF=0.15;AFR_AF=0.14;EUR_AF=0.05 +1 88370 rs185487977 G A 100 PASS AVGPOST=0.9957;LDAF=0.0035;AN=2184;ERATE=0.0009;RSQ=0.4507;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AC=4;THETA=0.0043;AF=0.0018;AMR_AF=0.0028;EUR_AF=0.0040 +1 88376 rs189954431 T G 100 PASS RSQ=0.6404;AVGPOST=0.9994;AN=2184;VT=SNP;AA=.;THETA=0.0057;SNPSOURCE=LOWCOV;AC=1;ERATE=0.0003;LDAF=0.0008;AF=0.0005;AFR_AF=0.0020 +1 88388 rs182344336 C T 100 PASS THETA=0.0048;ERATE=0.0005;RSQ=0.3843;AVGPOST=0.9977;LDAF=0.0016;AN=2184;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AC=1;AF=0.0005;ASN_AF=0.0017 +1 88429 rs146027550 T C 100 PASS 
LDAF=0.0083;AN=2184;AC=13;RSQ=0.6097;VT=SNP;AA=.;ERATE=0.0010;AVGPOST=0.9922;SNPSOURCE=LOWCOV;THETA=0.0069;AF=0.01;AMR_AF=0.01;AFR_AF=0.02;EUR_AF=0.0013 +1 88710 rs186575039 C G 100 PASS ERATE=0.0005;AC=73;THETA=0.0022;AN=2184;VT=SNP;AA=.;AVGPOST=0.9774;LDAF=0.0389;SNPSOURCE=LOWCOV;RSQ=0.8058;AF=0.03;AMR_AF=0.01;AFR_AF=0.01;EUR_AF=0.08 +1 89165 rs192631277 A C 100 PASS RSQ=0.2647;AN=2184;ERATE=0.0009;VT=SNP;AA=.;AVGPOST=0.9969;LDAF=0.0020;SNPSOURCE=LOWCOV;AC=1;THETA=0.0043;AF=0.0005;ASN_AF=0.0017 +1 89744 rs184101761 A G 100 PASS THETA=0.0068;ERATE=0.0004;AVGPOST=0.9985;LDAF=0.0016;AN=2184;RSQ=0.5853;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AC=2;AF=0.0009;AFR_AF=0.0041 +1 89794 rs188661839 T C 100 PASS AN=2184;THETA=0.0130;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AC=1;ERATE=0.0006;RSQ=0.3812;AVGPOST=0.9984;LDAF=0.0012;AF=0.0005;AFR_AF=0.0020 +1 89946 rs138808727 A T 100 PASS RSQ=0.7414;LDAF=0.1417;AN=2184;ERATE=0.0009;AC=236;VT=SNP;AA=.;THETA=0.0100;AVGPOST=0.9001;SNPSOURCE=LOWCOV;AF=0.11;ASN_AF=0.0035;AMR_AF=0.16;AFR_AF=0.02;EUR_AF=0.22 +1 91190 rs143856811 G A 100 PASS AN=2184;AC=77;ERATE=0.0009;VT=SNP;AA=.;LDAF=0.0447;SNPSOURCE=LOWCOV;RSQ=0.7517;THETA=0.0113;AVGPOST=0.9690;AF=0.04;ASN_AF=0.0017;AMR_AF=0.02;AFR_AF=0.02;EUR_AF=0.08 +1 91228 rs139873689 A G 100 PASS AN=2184;AVGPOST=0.9924;AC=8;RSQ=0.5097;VT=SNP;AA=.;ERATE=0.0010;THETA=0.0012;SNPSOURCE=LOWCOV;LDAF=0.0070;AF=0.0037;AFR_AF=0.02 +1 91536 rs77418980 G T 100 PASS AC=695;AN=2184;ERATE=0.0025;AVGPOST=0.7792;VT=SNP;AA=.;THETA=0.0018;LDAF=0.3255;RSQ=0.6634;SNPSOURCE=LOWCOV;AF=0.32;ASN_AF=0.34;AMR_AF=0.30;AFR_AF=0.04;EUR_AF=0.50 +1 91581 rs151118460 G A 100 PASS AVGPOST=0.7763;THETA=0.0078;AN=2184;VT=SNP;AA=.;AC=716;ERATE=0.0035;LDAF=0.3353;RSQ=0.6618;SNPSOURCE=LOWCOV;AF=0.33;ASN_AF=0.37;AMR_AF=0.30;AFR_AF=0.04;EUR_AF=0.50 +1 91605 rs141083882 C T 100 PASS AC=105;LDAF=0.0597;AN=2184;RSQ=0.7792;VT=SNP;AA=.;ERATE=0.0010;AVGPOST=0.9660;SNPSOURCE=LOWCOV;THETA=0.0070;AF=0.05;AMR_AF=0.02;AFR_AF=0.20 +1 92633 rs149776517 C T 100 PASS 
THETA=0.0054;AN=2184;VT=SNP;AA=.;AC=44;AVGPOST=0.9592;ERATE=0.0008;LDAF=0.0366;SNPSOURCE=LOWCOV;RSQ=0.5870;AF=0.02;AMR_AF=0.02;EUR_AF=0.05 +1 92858 rs147061536 G T 100 PASS AC=248;THETA=0.0212;RSQ=0.7567;AN=2184;ERATE=0.0046;VT=SNP;AA=.;SNPSOURCE=LOWCOV;AVGPOST=0.9072;LDAF=0.1433;AF=0.11;ASN_AF=0.01;AMR_AF=0.15;AFR_AF=0.05;EUR_AF=0.22 +1 92875 rs193157612 T C 100 PASS THETA=0.0048;AVGPOST=0.9957;AN=2184;LDAF=0.0040;ERATE=0.0009;VT=SNP;AA=.;RSQ=0.4901;SNPSOURCE=LOWCOV;AC=4;AF=0.0018;EUR_AF=0.01 +1 94421 rs200856736 TC T 90 PASS AA=TC;AC=253;AF=0.12;AFR_AF=0.01;AMR_AF=0.20;AN=2184;ASN_AF=0.26;AVGPOST=0.7183;ERATE=0.0117;EUR_AF=0.03;LDAF=0.2244;RSQ=0.3175;THETA=0.0159;VT=INDEL +1 94986 rs185004859 C T 100 PASS ERATE=0.0166;AN=2184;AC=100;THETA=0.0227;VT=SNP;AA=.;LDAF=0.0872;AVGPOST=0.9040;SNPSOURCE=LOWCOV;RSQ=0.4650;AF=0.05;ASN_AF=0.03;AMR_AF=0.04;AFR_AF=0.07;EUR_AF=0.04 +1 94991 rs188832636 G A 100 PASS THETA=0.0048;LDAF=0.0041;AN=2184;AVGPOST=0.9944;ERATE=0.0009;VT=SNP;AA=.;SNPSOURCE=LOWCOV;RSQ=0.4157;AC=3;AF=0.0014;AMR_AF=0.0028;EUR_AF=0.0026 +1 98583 rs141344361 T A 100 PASS AVGPOST=0.9463;AC=248;THETA=0.0099;AN=2184;VT=SNP;AA=.;RSQ=0.8090;LDAF=0.1336;ERATE=0.0008;SNPSOURCE=LOWCOV;AF=0.11;ASN_AF=0.30;AMR_AF=0.16;AFR_AF=0.01;EUR_AF=0.02 +1 98929 rs12184306 A G 100 PASS RSQ=0.6226;AVGPOST=0.8784;AN=2184;VT=SNP;AA=.;ERATE=0.0045;LDAF=0.1723;SNPSOURCE=LOWCOV;AC=264;THETA=0.0070;AF=0.12;ASN_AF=0.16;AMR_AF=0.08;AFR_AF=0.16;EUR_AF=0.09 +1 98946 rs191775802 C G 100 PASS AVGPOST=0.9945;ERATE=0.0013;AN=2184;LDAF=0.0046;VT=SNP;AA=.;RSQ=0.4807;SNPSOURCE=LOWCOV;AC=4;THETA=0.0097;AF=0.0018;AFR_AF=0.01 +1 98974 rs12184307 A G 100 PASS AVGPOST=0.8921;AN=2184;AC=224;THETA=0.0130;VT=SNP;AA=.;LDAF=0.1405;RSQ=0.6149;SNPSOURCE=LOWCOV;ERATE=0.0012;AF=0.10;ASN_AF=0.14;AMR_AF=0.07;AFR_AF=0.14;EUR_AF=0.06 +1 99671 rs146209971 A T 100 PASS THETA=0.0199;AN=2184;AC=13;RSQ=0.4401;VT=SNP;AA=.;ERATE=0.0010;AVGPOST=0.9802;SNPSOURCE=LOWCOV;LDAF=0.0158;AF=0.01;AMR_AF=0.02;AFR_AF=0.0020;EUR_AF=0.01 
+1 99687 rs139153227 C T 100 PASS THETA=0.0211;LDAF=0.0470;AN=2184;VT=SNP;AA=.;ERATE=0.0010;RSQ=0.6276;AVGPOST=0.9548;AC=64;SNPSOURCE=LOWCOV;AF=0.03;AMR_AF=0.03;AFR_AF=0.01;EUR_AF=0.07 +1 99719 rs183898652 C T 100 PASS AN=2184;RSQ=0.5856;VT=SNP;AA=.;AC=10;ERATE=0.0018;SNPSOURCE=LOWCOV;LDAF=0.0076;AVGPOST=0.9925;THETA=0.0251;AF=0.0046;AMR_AF=0.01;AFR_AF=0.01;EUR_AF=0.0013 +1 100676 rs188226172 A T 100 PASS THETA=0.0212;LDAF=0.0047;AN=2184;VT=SNP;AA=.;RSQ=0.2646;SNPSOURCE=LOWCOV;ERATE=0.0012;AC=2;AVGPOST=0.9925;AF=0.0009;AMR_AF=0.0028;AFR_AF=0.0020 +1 103905 rs142403309 A G 100 PASS AN=2184;THETA=0.0131;ERATE=0.0025;AVGPOST=0.8782;AC=220;VT=SNP;AA=.;LDAF=0.1434;SNPSOURCE=LOWCOV;RSQ=0.5994;AF=0.10;ASN_AF=0.10;AMR_AF=0.14;AFR_AF=0.15;EUR_AF=0.05 +1 106544 rs180741296 C G 100 PASS AC=205;AVGPOST=0.5776;AN=2184;VT=SNP;AA=.;LDAF=0.3120;SNPSOURCE=LOWCOV;ERATE=0.0061;THETA=0.0372;RSQ=0.1442;AF=0.09;ASN_AF=0.11;AMR_AF=0.13;AFR_AF=0.11;EUR_AF=0.05 +1 109107 rs201432136 G GT 67 PASS AA=G;AC=63;AF=0.03;AFR_AF=0.01;AMR_AF=0.04;AN=2184;ASN_AF=0.03;AVGPOST=0.8840;ERATE=0.0122;EUR_AF=0.04;LDAF=0.0890;RSQ=0.3660;THETA=0.0210;VT=INDEL +1 111513 rs199911222 C CTA 249 PASS AA=.;AC=58;AF=0.03;AFR_AF=0.09;AMR_AF=0.03;AN=2184;ASN_AF=0.0017;AVGPOST=0.9145;ERATE=0.0024;EUR_AF=0.0013;LDAF=0.0665;RSQ=0.4694;THETA=0.0292;VT=INDEL diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 696cdfe..c0641b0 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -205,6 +205,17 @@ def test_issue_49(self): pass +class Test1kgSites(unittest.TestCase): + + def test_reader(self): + """The samples attribute should be the empty list.""" + reader = vcf.Reader(fh('1kg.sites.vcf', 'r')) + + self.assertEqual(reader.samples, []) + for record in reader: + self.assertEqual(record.samples, []) + + class TestGatkOutputWriter(unittest.TestCase): def testWrite(self): @@ -841,6 +852,7 @@ def test_trim(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) 
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kgSites)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression)) From 9d7f44f71b817a83378b459491ae3193c4b5a170 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 26 Feb 2013 17:34:41 +0100 Subject: [PATCH 042/168] Only write FORMAT if it is in the template Also, don't write any additional tab characters at the end of the record. --- vcf/parser.py | 13 ++++++++----- vcf/test/test_vcf.py | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 10c6268..fc85ca8 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -220,6 +220,7 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self.samples = None self._sample_indexes = None self._header_lines = [] + self._column_headers = [] self._tabix = None self._prepend_chr = prepend_chr self._parse_metainfo() @@ -274,7 +275,8 @@ def _parse_metainfo(self): line = self.reader.next() - fields = re.split('\t| +', line) + fields = re.split('\t| +', line[1:]) + self._column_headers = fields[:9] self.samples = fields[9:] self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) @@ -538,8 +540,6 @@ def fetch(self, chrom, start, end=None): class Writer(object): """ VCF Writer """ - fixed_fields = "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT".split() - # Reverse keys and values in header field count dictionary counts = dict((v,k) for k,v in field_counts.iteritems()) @@ -574,13 +574,16 @@ def __init__(self, stream, template, lineterminator="\r\n"): def _write_header(self): # TODO: write INFO, etc - self.writer.writerow(self.fixed_fields + 
self.template.samples) + self.stream.write('#' + '\t'.join(self.template._column_headers + + self.template.samples) + '\n') def write_record(self, record): """ write a record to the file """ ffs = self._map(str, [record.CHROM, record.POS, record.ID, record.REF]) \ + [self._format_alt(record.ALT), record.QUAL or '.', self._format_filter(record.FILTER), - self._format_info(record.INFO), record.FORMAT] + self._format_info(record.INFO)] + if record.FORMAT: + ffs.append(record.FORMAT) samples = [self._format_sample(record.FORMAT, sample) for sample in record.samples] diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index c0641b0..40de4df 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -192,6 +192,8 @@ class Test1kg(unittest.TestCase): def testParse(self): reader = vcf.Reader(fh('1kg.vcf.gz', 'rb')) + assert 'FORMAT' in reader._column_headers + self.assertEqual(len(reader.samples), 629) for _ in reader: pass @@ -211,10 +213,30 @@ def test_reader(self): """The samples attribute should be the empty list.""" reader = vcf.Reader(fh('1kg.sites.vcf', 'r')) + assert 'FORMAT' not in reader._column_headers + self.assertEqual(reader.samples, []) for record in reader: self.assertEqual(record.samples, []) + def test_writer(self): + """FORMAT should not be written if not present in the template and no + extra tab character should be printed if there are no FORMAT fields.""" + reader = vcf.Reader(fh('1kg.sites.vcf', 'r')) + out = StringIO() + writer = vcf.Writer(out, reader, lineterminator='\n') + + for record in reader: + writer.write_record(record) + out.seek(0) + out_str = out.getvalue() + for line in out_str.split('\n'): + if line.startswith('##'): + continue + if line.startswith('#CHROM'): + assert 'FORMAT' not in line + assert not line.endswith('\t') + class TestGatkOutputWriter(unittest.TestCase): From 4fb0c86b505548d11d382526c4f7b718828dcdc0 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Sat, 16 Mar 2013 11:19:37 +0100 Subject: [PATCH 043/168] 
Adhere to `strict_whitespace` in parsing column headers Fixes parsing of sample names with space characters in `strict_whitespace` mode. Suggested by Lee Lichtenstein and Manaswi Gupta. --- vcf/parser.py | 12 ++++++------ vcf/test/test_vcf.py | 9 +++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index fc85ca8..c1da964 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -205,6 +205,11 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals if sys.version > '3': self._reader = codecs.getreader('ascii')(self._reader) + if strict_whitespace: + self._separator = '\t' + else: + self._separator = '\t| +' + self.reader = (line.strip() for line in self._reader if line.strip()) #: metadata fields from header (string or hash, depending) @@ -226,11 +231,6 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self._parse_metainfo() self._format_cache = {} - if strict_whitespace: - self._separator = '\t' - else: - self._separator = '\t| +' - def __iter__(self): return self @@ -275,7 +275,7 @@ def _parse_metainfo(self): line = self.reader.next() - fields = re.split('\t| +', line[1:]) + fields = re.split(self._separator, line[1:]) self._column_headers = fields[:9] self.samples = fields[9:] self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 40de4df..71df788 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -321,6 +321,14 @@ def testWrite(self): assert line.startswith('##SAMPLE=<'), "Found dictionary in meta line: {0}".format(line) +class TestSamplesSpace(unittest.TestCase): + filename = 'samples-space.vcf' + samples = ['NA 00001', 'NA 00002', 'NA 00003'] + def test_samples(self): + self.reader = vcf.Reader(fh(self.filename), strict_whitespace=True) + self.assertEqual(self.reader.samples, self.samples) + + class TestRecord(unittest.TestCase): def test_num_calls(self): @@ -875,6 
+883,7 @@ def test_trim(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kgSites)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamplesSpace)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression)) From 0fd74aac844414852f3606af2105b8e146b03f28 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Sat, 16 Mar 2013 11:24:20 +0100 Subject: [PATCH 044/168] Forgot to add test file --- vcf/test/samples-space.vcf | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 vcf/test/samples-space.vcf diff --git a/vcf/test/samples-space.vcf b/vcf/test/samples-space.vcf new file mode 100644 index 0000000..8c9bb9e --- /dev/null +++ b/vcf/test/samples-space.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.0 +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA 00001 NA 00002 NA 00003 +20 14370 rs6054257 G A 29 PASS . GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 ./. +20 76766 rs6054257 C T 29 PASS . GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 ./. From 46f83b1e936e23f7b3bad3a78635b580d0fb787a Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Thu, 6 Jun 2013 14:30:09 -0400 Subject: [PATCH 045/168] * adding support for contigs in the VCF header. 
--- vcf/parser.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index c1da964..7972223 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -67,6 +67,7 @@ _Alt = collections.namedtuple('Alt', ['id', 'desc']) _Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc']) _SampleInfo = collections.namedtuple('SampleInfo', ['samples', 'gt_bases', 'gt_types', 'gt_phases']) +_Contig = collections.namedtuple('Contig', ['id', 'length']) class _vcf_metadata_parser(object): @@ -93,6 +94,10 @@ def __init__(self): Type=(?P.+), Description="(?P.*)" >''', re.VERBOSE) + self.contig_pattern = re.compile(r'''\#\#contig=< + ID=(?P[^,]+), + length=(?P-?\d+) + >''', re.VERBOSE) self.meta_pattern = re.compile(r'''##(?P.+?)=(?P.+)''') def vcf_field_count(self, num_str): @@ -152,6 +157,20 @@ def read_format(self, format_string): match.group('type'), match.group('desc')) return (match.group('id'), form) + + def read_contig(self, contig_string): + '''Read a meta-contigrmation INFO line.''' + match = self.contig_pattern.match(contig_string) + if not match: + raise SyntaxError( + "One of the contig lines is malformed: %s" % contig_string) + + length = self.vcf_field_count(match.group('length')) + + contig = _Contig(match.group('id'), length) + + return (match.group('id'), contig) + def read_meta_hash(self, meta_string): items = re.split("[<>]", meta_string) @@ -222,6 +241,8 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self.alts = None #: FORMAT fields from header self.formats = None + #: contig fields from header + self.contigs = None self.samples = None self._sample_indexes = None self._header_lines = [] @@ -239,7 +260,7 @@ def _parse_metainfo(self): The end user shouldn't have to use this. 
She can access the metainfo directly with ``self.metadata``.''' - for attr in ('metadata', 'infos', 'filters', 'alts', 'formats'): + for attr in ('metadata', 'infos', 'filters', 'alts', 'contigs', 'formats'): setattr(self, attr, OrderedDict()) parser = _vcf_metadata_parser() @@ -263,6 +284,10 @@ def _parse_metainfo(self): elif line.startswith('##FORMAT'): key, val = parser.read_format(line) self.formats[key] = val + + elif line.startswith('##contig'): + key, val = parser.read_contig(line) + self.contigs[key] = val else: key, val = parser.read_meta(line) From 33f0711b16031fca9df07599be7639c1e46d021c Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Thu, 6 Jun 2013 16:56:53 -0400 Subject: [PATCH 046/168] * ignore the rest of the contig information --- vcf/parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vcf/parser.py b/vcf/parser.py index 7972223..fe7ee8f 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -97,6 +97,7 @@ def __init__(self): self.contig_pattern = re.compile(r'''\#\#contig=< ID=(?P[^,]+), length=(?P-?\d+) + .* >''', re.VERBOSE) self.meta_pattern = re.compile(r'''##(?P.+?)=(?P.+)''') From c276e7b296ad89c4a79bc1bff6e44ca558d4df0d Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 11 Jul 2013 15:26:14 +0100 Subject: [PATCH 047/168] tests and fix for gatk header issue --- vcf/parser.py | 2 +- vcf/test/gatk_26_meta.vcf | 3 +++ vcf/test/test_vcf.py | 9 +++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 vcf/test/gatk_26_meta.vcf diff --git a/vcf/parser.py b/vcf/parser.py index fe7ee8f..b4efa86 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -178,7 +178,7 @@ def read_meta_hash(self, meta_string): # Removing initial hash marks and final equal sign key = items[0][2:-1] hashItems = items[1].split(',') - val = OrderedDict(item.split("=") for item in hashItems) + val = OrderedDict(item.split("=", 1) for item in hashItems) return key, val def read_meta(self, meta_string): diff --git a/vcf/test/gatk_26_meta.vcf 
b/vcf/test/gatk_26_meta.vcf new file mode 100644 index 0000000..2f7ec78 --- /dev/null +++ b/vcf/test/gatk_26_meta.vcf @@ -0,0 +1,3 @@ +##fileformat=VCFv4.1 +##GATKCommandLine= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 71df788..5650908 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -870,6 +870,15 @@ def test_trim(self): +class TestGATKMeta(unittest.TestCase): + + def test_meta(self): + # expect no exceptions raised + reader = vcf.Reader(fh('gatk_26_meta.vcf')) + assert 'GATKCommandLine' in reader.metadata + assert reader.metadata['GATKCommandLine'][0]['CommandLineOptions'] == '"analysis_type=LeftAlignAndTrimVariants"' + + suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFreebayesOutput)) From b19f2bdc6b7cd4353e06fb6d8fc5f2644ce72800 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 11 Jul 2013 16:55:49 +0100 Subject: [PATCH 048/168] added test and fix for commas inside quoted value --- vcf/parser.py | 32 ++++++++++++++++++++++++++++++-- vcf/test/gatk_26_meta.vcf | 1 + vcf/test/test_vcf.py | 1 + 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index b4efa86..2848576 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -177,8 +177,36 @@ def read_meta_hash(self, meta_string): items = re.split("[<>]", meta_string) # Removing initial hash marks and final equal sign key = items[0][2:-1] - hashItems = items[1].split(',') - val = OrderedDict(item.split("=", 1) for item in hashItems) + # N.B., items can have quoted values, so cannot just split on comma + val = OrderedDict() + state = 0 + k = '' + v = '' + for c in items[1]: + if state == 0: # reading item key + if c == '=': + state = 1 # end of key, start reading value + else: + k += c # extend key + elif state == 1: # reading item value + if v == '' and c == '"': + v += c # include quote mark in value + 
state = 2 # start reading quoted value + elif c == ',': + val[k] = v # store parsed item + state = 0 # read next key + k = '' + v = '' + else: + v += c + elif state == 2: # reading quoted item value + if c == '"': + v += c # include quote mark in value + state = 1 # end quoting + else: + v += c + if k != '': + val[k] = v return key, val def read_meta(self, meta_string): diff --git a/vcf/test/gatk_26_meta.vcf b/vcf/test/gatk_26_meta.vcf index 2f7ec78..1dd2e56 100644 --- a/vcf/test/gatk_26_meta.vcf +++ b/vcf/test/gatk_26_meta.vcf @@ -1,3 +1,4 @@ ##fileformat=VCFv4.1 ##GATKCommandLine= +##GATKCommandLine= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 5650908..658669e 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -877,6 +877,7 @@ def test_meta(self): reader = vcf.Reader(fh('gatk_26_meta.vcf')) assert 'GATKCommandLine' in reader.metadata assert reader.metadata['GATKCommandLine'][0]['CommandLineOptions'] == '"analysis_type=LeftAlignAndTrimVariants"' + assert reader.metadata['GATKCommandLine'][1]['CommandLineOptions'] == '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"' From 51fac4ba22f89f87a4c8c361570f3b4465995e8a Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 11 Jul 2013 17:06:18 +0100 Subject: [PATCH 049/168] whitespace? 
--- vcf/parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vcf/parser.py b/vcf/parser.py index 2848576..d1fe275 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -183,6 +183,7 @@ def read_meta_hash(self, meta_string): k = '' v = '' for c in items[1]: + if state == 0: # reading item key if c == '=': state = 1 # end of key, start reading value From 2c9166529424ffaa29c7fa8883d6670040e84da5 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Fri, 12 Jul 2013 12:36:08 +0200 Subject: [PATCH 050/168] Fix contig test case for new contig header parsing --- vcf/test/test_vcf.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 71df788..56f3e29 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -48,10 +48,7 @@ def test_vcf_4_1(self): self.assertEqual(reader.metadata['fileformat'], 'VCFv4.1') # contigs were added in vcf4.1 - # probably need to add a reader.contigs attribute - assert 'contig' in reader.metadata - assert 'ID' in reader.metadata['contig'][0] - assert reader.metadata['contig'][0]['ID'] == '20' + self.assertEqual(reader.contigs['20'].length, 62435964) # test we can walk the file at least for r in reader: From d2f96d8408576a69cd5837846a5e63f194217128 Mon Sep 17 00:00:00 2001 From: Sergei Lebedev Date: Fri, 12 Jul 2013 15:04:09 +0400 Subject: [PATCH 051/168] Added pickling support for '_Record' and '_CallData' -- closes #108 --- vcf/model.py | 11 +++++++++++ vcf/test/test_vcf.py | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/vcf/model.py b/vcf/model.py index 7d28506..17672f7 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -35,6 +35,13 @@ def __eq__(self, other): and self.sample == other.sample and self.gt_type == other.gt_type) + def __getstate__(self): + return dict((attr, getattr(self, attr)) for attr in self.__slots__) + + def __setstate__(self, state): + for attr in self.__slots__: + setattr(self, attr, state.get(attr)) + def gt_phase_char(self): return "/" if 
not self.phased else "|" @@ -540,4 +547,8 @@ def __str__(self): for (x, y) in zip(self._fields, self)]) return "CallData(" + dat + ')' + def __reduce__(self): + args = super(CallData, self).__reduce__() + return make_calldata_tuple, (fields, ) + return CallData diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 71df788..0930847 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -3,6 +3,7 @@ import doctest import os import commands +import cPickle from StringIO import StringIO import vcf @@ -638,6 +639,11 @@ def test_info_multiple_values(self): actual = var.INFO['RepeatConsensus'] self.assertEqual(expected, actual) + def test_pickle(self): + reader = vcf.Reader(fh('example-4.0.vcf')) + for var in reader: + assert cPickle.loads(cPickle.dumps(var)) == var + class TestCall(unittest.TestCase): @@ -688,6 +694,7 @@ def test_gt_types(self): elif var.POS == 1234567: self.assertEqual([None,1,2], gt_types) + class TestTabix(unittest.TestCase): def setUp(self): From 76afe7766623c0981b2d6a0c8cf4e2634c37ada3 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Mon, 15 Jul 2013 08:17:45 +0100 Subject: [PATCH 052/168] add python 3.3 testing, HISTORY updates --- .travis.yml | 1 + docs/HISTORY.rst | 11 +++++++++++ tox.ini | 5 ++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 47b1002..a1cb3a4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ python: - "2.6" - "2.7" - "3.2" + - "3.3" - "pypy" install: - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam argparse ordereddict; fi" diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index fc3f2b3..15aba83 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -17,6 +17,17 @@ New features should have test code sent with them. 
Changes ======= +0.6.4 Release +------------- + +* Handle INFO fields with multiple values, thanks +* Support writing records without GT data #88, thanks @bow +* Pickleable call data #112, thanks @superbobry +* Write files without FORMAT #95 thanks Martijn +* Strict whitespace mode, thanks Martijn, Lee Lichtenstein and Manawsi Gupta +* Add support for contigs in header, thanks @gcnh and Martijn +* Fix GATK header parsing, thanks @alimanfoo + 0.6.3 Release ------------- diff --git a/tox.ini b/tox.ini index 771e15f..52e1085 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py26, py27, py32 +envlist = py26, py27, py32, py33 [testenv] commands = @@ -27,3 +27,6 @@ deps = deps = cython +[testenv:py32] +deps = + cython From 2dd86220a746be3e940cbcfe12e1a3a53e05317d Mon Sep 17 00:00:00 2001 From: James Casbon Date: Mon, 15 Jul 2013 17:57:41 +0100 Subject: [PATCH 053/168] version 0.6.4 --- vcf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/__init__.py b/vcf/__init__.py index 7ab38ee..3e49b09 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -177,4 +177,4 @@ from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT -VERSION = '0.6.3' +VERSION = '0.6.4' From 67b21a1350bfe8280a28f1886a9895962d41cdd4 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Wed, 7 Aug 2013 16:47:53 +0200 Subject: [PATCH 054/168] Differentiate between no filtering and PASS Fixes jamescasbon#114 --- scripts/vcf_filter.py | 2 +- vcf/model.py | 5 ++++- vcf/parser.py | 4 +++- vcf/test/mixed-filtering.vcf | 24 ++++++++++++++++++++++++ vcf/test/test_vcf.py | 15 +++++++++++++++ 5 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 vcf/test/mixed-filtering.vcf diff --git a/scripts/vcf_filter.py b/scripts/vcf_filter.py index 9a08629..fd32b39 100644 --- a/scripts/vcf_filter.py +++ b/scripts/vcf_filter.py @@ -162,7 +162,7 @@ def addfilt(filt): if output_record: # 
use PASS only if other filter names appear in the FILTER column #FIXME: is this good idea? - if record.FILTER == '.' and not drop_filtered: record.FILTER = 'PASS' + if record.FILTER is None and not drop_filtered: record.FILTER = 'PASS' output.write_record(record) if __name__ == '__main__': main() diff --git a/vcf/model.py b/vcf/model.py index 17672f7..a975a82 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -163,7 +163,10 @@ def add_format(self, fmt): self.FORMAT = self.FORMAT + ':' + fmt def add_filter(self, flt): - self.FILTER.append(flt) + if self.FILTER is None: + self.FILTER = [flt] + else: + self.FILTER.append(flt) def add_info(self, info, value=True): self.INFO[info] = value diff --git a/vcf/parser.py b/vcf/parser.py index d1fe275..2ad2368 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -541,7 +541,9 @@ def next(self): qual = None filt = row[6] - if filt == 'PASS' or filt == '.': + if filt == '.': + filt = None + elif filt == 'PASS': filt = [] else: filt = filt.split(';') diff --git a/vcf/test/mixed-filtering.vcf b/vcf/test/mixed-filtering.vcf new file mode 100644 index 0000000..f02e839 --- /dev/null +++ b/vcf/test/mixed-filtering.vcf @@ -0,0 +1,24 @@ +##fileformat=VCFv4.1 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 47 . 
NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +20 1234567 microsat1 GTC G,GTCT 50 q10;q50 NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index b3fcf92..798b4db 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -327,6 +327,20 @@ def test_samples(self): self.assertEqual(self.reader.samples, self.samples) +class TestMixedFiltering(unittest.TestCase): + filename = 'mixed-filtering.vcf' + def test_mixed_filtering(self): + """ + Test mix of FILTER values (pass, filtered, no filtering). + """ + reader = vcf.Reader(fh(self.filename)) + self.assertEqual(next(reader).FILTER, []) + self.assertEqual(next(reader).FILTER, ['q10']) + self.assertEqual(next(reader).FILTER, []) + self.assertEqual(next(reader).FILTER, None) + self.assertEqual(next(reader).FILTER, ['q10', 'q50']) + + class TestRecord(unittest.TestCase): def test_num_calls(self): @@ -898,6 +912,7 @@ def test_meta(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kgSites)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamplesSpace)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestMixedFiltering)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression)) From cc70525e66cc4041e822478a48566939e6160945 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Wed, 14 Aug 2013 09:52:32 +0200 Subject: [PATCH 055/168] Allow fields in contig definition before length This is a bit of a hack and should really be generalized to proper parsing of all header lines. 
--- vcf/parser.py | 1 + vcf/test/gonl.chr20.release4.gtc.vcf | 120 +++++++++++++++++++++++++++ vcf/test/test_vcf.py | 13 +++ 3 files changed, 134 insertions(+) create mode 100644 vcf/test/gonl.chr20.release4.gtc.vcf diff --git a/vcf/parser.py b/vcf/parser.py index d1fe275..a6eb99f 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -96,6 +96,7 @@ def __init__(self): >''', re.VERBOSE) self.contig_pattern = re.compile(r'''\#\#contig=< ID=(?P[^,]+), + .* length=(?P-?\d+) .* >''', re.VERBOSE) diff --git a/vcf/test/gonl.chr20.release4.gtc.vcf b/vcf/test/gonl.chr20.release4.gtc.vcf new file mode 100644 index 0000000..03588bf --- /dev/null +++ b/vcf/test/gonl.chr20.release4.gtc.vcf @@ -0,0 +1,120 @@ +##fileformat=VCFv4.1 +##ApplyRecalibration="analysis_type=ApplyRecalibration input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/target/gpfs2/gcc/resources/hg19/indices/human_g1k_v37.fa rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=INFO log_to_file=null help=false input=[(RodBinding name=input source=/target/gpfs2/gcc/home/lfrancioli/gonl/projects/trio-analysis/results/snps/UG_raw_biallelic/gonl.biallelic.vcf)] recal_file=/target/gpfs2/gcc/home/lfrancioli/gonl/projects/trio-analysis/intermediate/snps/vqsr_1kg_phase1/gonl.biallelic.vcf.1kg_phase1.2.recal 
tranches_file=/target/gpfs2/gcc/home/lfrancioli/gonl/projects/trio-analysis/intermediate/snps/vqsr_1kg_phase1/gonl.biallelic.vcf.1kg_phase1.2.tranches out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub ts_filter_level=99.5 ignore_filter=null mode=SNP filter_mismatching_base_and_quals=false" +##CombineVariants="analysis_type=CombineVariants input_file=[] sample_metadata=[] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=[1:123000001-126000000] excludeIntervals=null reference_sequence=/humgen/1kg/reference/human_g1k_v37.fasta rodBind=[/humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/AFR/AFR.phase1.chr1.42.raw.snps.vcf, /humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/ASN/ASN.phase1.chr1.42.raw.snps.vcf, /humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/AMR/AMR.phase1.chr1.42.raw.snps.vcf, /humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/EUR/EUR.phase1.chr1.42.raw.snps.vcf, /humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/AFR.admix/AFR.admix.phase1.chr1.42.raw.snps.vcf, /humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/ASN.admix/ASN.admix.phase1.chr1.42.raw.snps.vcf, /humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/AMR.admix/AMR.admix.phase1.chr1.42.raw.snps.vcf, /humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/EUR.admix/EUR.admix.phase1.chr1.42.raw.snps.vcf, /humgen/1kg/processing/production_wgs_phase1/consensus_wgs/v2b/calls/chr1/ALL/ALL.phase1.chr1.42.raw.snps.vcf] rodToIntervalTrackName=null BTI_merge_rule=UNION nonDeterministicRandomSeed=false DBSNP=null downsampling_type=null downsample_to_fraction=null downsample_to_coverage=null baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false 
defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 interval_merging=ALL read_group_black_list=null processingTracker=null restartProcessingTracker=false processingTrackerStatusFile=null processingTrackerID=-1 allow_intervals_with_unindexed_bam=false disable_experimental_low_memory_sharding=false logging_level=INFO log_to_file=null help=false out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub genotypemergeoption=PRIORITIZE filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED rod_priority_list=ALL,AFR.admix,AMR.admix,EUR.admix,ASN.admix,AFR,AMR,EUR,ASN printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=pop assumeIdenticalSamples=false minimumN=1 masterMerge=false mergeInfoWithMaxAC=true" +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##INFO= +##INFO= +##SelectVariants="analysis_type=SelectVariants input_file=[] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=[1:1-5000001] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/target/gpfs2/gcc/home/lfrancioli/gonl/resources/hg19/indices/human_g1k_v37.fa rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant 
source=/target/gpfs2/gcc/home/lfrancioli/results/trio-analysis/ug_initial/gonl.1_1-5000001.vcf) discordance=(RodBinding name= source=UNBOUND) concordance=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sample_name=[] sample_expressions=null sample_file=null exclude_sample_name=[] exclude_sample_file=[] select_expressions=[] excludeNonVariants=false excludeFiltered=false restrictAllelesTo=BIALLELIC keepOriginalAC=false mendelianViolation=false mendelianViolationQualThreshold=0.0 select_random_number=0 select_random_fraction=0.0 remove_fraction_genotypes=0.0 selectTypeToInclude=[] keepIDs=null outMVFile=null filter_mismatching_base_and_quals=false" +##SetFilterPASS="analysis_type=SetFilterPASS input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null tag=NA read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/target/gpfs2/gcc/resources/hg19/indices/human_g1k_v37.fa nonDeterministicRandomSeed=false disableRandomization=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 enable_experimental_downsampling=false baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant 
source=/target/gpfs2/gcc/home/lfrancioli/gonl/projects/trio-analysis/intermediate/snps/vqsr_1kg_phase1/gonl.biallelic.vqsr_1kg_phase1.2.99.5.vcf) sites=[(RodBinding name=sites source=/target/gpfs2/gcc/home/lfrancioli/resources/1000GP_hg19/EUR.wgs.project_consensus_vqsr2b.20101123.snps.low_coverage.sites.vcf)] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub filter_mismatching_base_and_quals=false" +##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[/humgen/1kg/phase1_cleaned_bams/bams/chr1/CHB.phase1.chr1.42.cleaned.bam, /humgen/1kg/phase1_cleaned_bams/bams/chr1/CHS.phase1.chr1.42.cleaned.bam, /humgen/1kg/phase1_cleaned_bams/bams/chr1/CLM.phase1.chr1.42.cleaned.bam, /humgen/1kg/phase1_cleaned_bams/bams/chr1/JPT.phase1.chr1.42.cleaned.bam, /humgen/1kg/phase1_cleaned_bams/bams/chr1/MXL.phase1.chr1.42.cleaned.bam, /humgen/1kg/phase1_cleaned_bams/bams/chr1/PUR.phase1.chr1.42.cleaned.bam] sample_metadata=[] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=[1:123000001-126000000] excludeIntervals=null reference_sequence=/humgen/1kg/reference/human_g1k_v37.fasta rodBind=[/humgen/1kg/processing/production_wgs_phase1/consensus/ALL.phase1.wgs.unionBC1.pass.sites.vcf, /humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf] rodToIntervalTrackName=null BTI_merge_rule=UNION nonDeterministicRandomSeed=false DBSNP=null downsampling_type=null downsample_to_fraction=null downsample_to_coverage=50 baq=CALCULATE_AS_NECESSARY baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 interval_merging=ALL read_group_black_list=null processingTracker=null restartProcessingTracker=false processingTrackerStatusFile=null 
processingTrackerID=-1 allow_intervals_with_unindexed_bam=false disable_experimental_low_memory_sharding=false logging_level=INFO log_to_file=null help=false genotype_likelihoods_model=SNP p_nonref_model=EXACT heterozygosity=0.0010 pcr_error_rate=1.0E-4 genotyping_mode=GENOTYPE_GIVEN_ALLELES output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=4.0 standard_min_confidence_threshold_for_emitting=4.0 noSLOD=false assume_single_sample_reads=null abort_at_too_much_coverage=-1 min_base_quality_score=17 min_mapping_quality_score=20 max_deletion_fraction=0.05 min_indel_count_for_genotyping=5 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10.0 indelGapOpenPenalty=45.0 indelHaplotypeSize=80 doContextDependentGapPenalties=true getGapPenaltiesFromData=false indel_recal_file=indel.recal_data.csv indelDebug=false dovit=false GSA_PRODUCTION_ONLY=false exactCalculation=LINEAR_EXPERIMENTAL ignoreSNPAlleles=false output_all_callable_bases=false genotype=false out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub debug_file=null metrics_file=null annotation=[]" +##VariantAnnotator="analysis_type=VariantAnnotator input_file=[] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=[1:1-5000001] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/target/gpfs2/gcc/home/lfrancioli/gonl/resources/hg19/indices/human_g1k_v37.fa rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[/target/gpfs2/gcc/home/lfrancioli/gonl/resources/UnifiedGenotyper/GoNL.ped] 
pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant source=/target/gpfs2/gcc/home/lfrancioli/results/trio-analysis/snps/gonl.1_1-5000001.biallelic.vcf) snpEffFile=(RodBinding name= source=UNBOUND) dbsnp=(RodBinding name= source=UNBOUND) comp=[] resource=[] out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub annotation=[TransmissionDisequilibriumTest, InbreedingCoeff, AlleleDosage, ChromosomeCounts] excludeAnnotation=[] group=[] expression=[] useAllAnnotations=false list=false vcfContainsOnlyIndels=false MendelViolationGenotypeQualityThreshold=0.0 requireStrictAlleleMatch=false filter_mismatching_base_and_quals=false" +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##reference=file:///target/gpfs2/gcc/resources/hg19/indices/human_g1k_v37.fa +##source=SelectVariants +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +20 60309 . 
G T 991.76 PASS AC=4;AN=996;GTC=494,4,0 +20 60573 . T C 124.17 PASS AC=1;AN=996;GTC=497,1,0 +20 60828 . T G 807.71 PASS AC=6;AN=996;GTC=492,6,0 +20 61098 rs6078030 C T 51254.56 PASS AC=225;AN=996;GTC=304,163,31 +20 61270 . A C 2414.84 PASS AC=20;AN=992;GTC=476,20,0 +20 61289 . A C 419.41 TruthSensitivityTranche99.70to99.80 AC=71;AN=960;GTC=411,67,2 +20 61682 . C T 12.27 PASS AC=1;AN=996;GTC=497,1,0 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index b3fcf92..f1824fa 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -236,6 +236,18 @@ def test_writer(self): assert not line.endswith('\t') +class TestGoNL(unittest.TestCase): + + def testParse(self): + reader = vcf.Reader(fh('gonl.chr20.release4.gtc.vcf')) + for _ in reader: + pass + + def test_contig_line(self): + reader = vcf.Reader(fh('gonl.chr20.release4.gtc.vcf')) + self.assertEqual(reader.contigs['1'].length, 249250621) + + class TestGatkOutputWriter(unittest.TestCase): def testWrite(self): @@ -897,6 +909,7 @@ def test_meta(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kgSites)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGoNL)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamplesSpace)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) From 831c023afdd4034d9d8d599b361914d4baf12394 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Thu, 19 Sep 2013 21:37:17 +0200 Subject: [PATCH 056/168] Test if contig lines are output by writer --- vcf/test/test_vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index c3a2e2f..9d0e204 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -271,6 +271,7 @@ def testWrite(self): 
self.assertEquals(reader.samples, reader2.samples) self.assertEquals(reader.formats, reader2.formats) self.assertEquals(reader.infos, reader2.infos) + self.assertEquals(reader.contigs, reader2.contigs) for l, r in zip(records, reader2): self.assertEquals(l.samples, r.samples) From bb72c5b126b3799e7285d755ee87704ad49971dd Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Thu, 19 Sep 2013 21:38:16 +0200 Subject: [PATCH 057/168] Output contig lines in writer --- vcf/parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vcf/parser.py b/vcf/parser.py index 20a12a8..aac102b 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -627,6 +627,8 @@ def __init__(self, stream, template, lineterminator="\r\n"): stream.write(two.format(key="FILTER", *line)) for line in template.alts.itervalues(): stream.write(two.format(key="ALT", *line)) + for line in template.contigs.itervalues(): + stream.write('##contig=\n'.format(*line)) self._write_header() From c4c69255f95d8776909a6edc54c005e6c7448eb8 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Fri, 20 Sep 2013 12:10:59 +0200 Subject: [PATCH 058/168] Test parsing and writing INFO with type Character --- vcf/test/info-type-character.vcf | 8 ++++++++ vcf/test/test_vcf.py | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 vcf/test/info-type-character.vcf diff --git a/vcf/test/info-type-character.vcf b/vcf/test/info-type-character.vcf new file mode 100644 index 0000000..77b24a7 --- /dev/null +++ b/vcf/test/info-type-character.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample +chr1 100 id1 G A . . 
FLOAT_1=123.456;CHAR_1=Y;FLOAT_N=123.456;CHAR_N=Y GT 0/1 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 9d0e204..bcc39a9 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -248,6 +248,31 @@ def test_contig_line(self): self.assertEqual(reader.contigs['1'].length, 249250621) +class TestInfoTypeCharacter(unittest.TestCase): + def test_parse(self): + reader = vcf.Reader(fh('info-type-character.vcf')) + record = next(reader) + self.assertEqual(record.INFO['FLOAT_1'], 123.456) + self.assertEqual(record.INFO['CHAR_1'], 'Y') + self.assertEqual(record.INFO['FLOAT_N'], [123.456]) + self.assertEqual(record.INFO['CHAR_N'], ['Y']) + + def test_write(self): + reader = vcf.Reader(fh('info-type-character.vcf')) + out = StringIO() + writer = vcf.Writer(out, reader) + + records = list(reader) + + for record in records: + writer.write_record(record) + out.seek(0) + reader2 = vcf.Reader(out) + + for l, r in zip(records, reader2): + self.assertEquals(l.INFO, r.INFO) + + class TestGatkOutputWriter(unittest.TestCase): def testWrite(self): From e3fc03a5819aa67943ae53a1913cdc5674aaa43a Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Fri, 20 Sep 2013 12:12:01 +0200 Subject: [PATCH 059/168] Fix parsing INFO lines with type Character This fixes GitHub issue #120 (thanks @AndrewUzilov). 
--- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index aac102b..0aeaeb7 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -379,7 +379,7 @@ def _parse_info(self, info_str): val = self._map(float, vals) elif entry_type == 'Flag': val = True - elif entry_type == 'String': + elif entry_type in ('String', 'Character'): try: vals = entry[1].split(',') # commas are reserved characters indicating multiple values val = self._map(str, vals) From 58ef505e72b88f082bdfb98f3cdebe69db82becf Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 24 Sep 2013 10:55:26 +0200 Subject: [PATCH 060/168] Add TestInfoTypeCharacter to test suite --- vcf/test/test_vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index bcc39a9..9b18baf 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -942,6 +942,7 @@ def test_meta(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamtoolsOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBcfToolsOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutputWriter)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestInfoTypeCharacter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBcfToolsOutputWriter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestWriterDictionaryMeta)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix)) From 60398055dd86caf4d17d11a1be32faa6a99b22bc Mon Sep 17 00:00:00 2001 From: Peter Krusche Date: Tue, 5 Nov 2013 17:18:28 +0000 Subject: [PATCH 061/168] Fixed exception when reading single breakends --- vcf/model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index a975a82..54794fc 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -473,9 +473,15 @@ class _Breakend(_AltRecord): def __init__(self, chr, pos, orientation, 
remoteOrientation, connectingSequence, withinMainAssembly, **kwargs): super(_Breakend, self).__init__(type="BND", **kwargs) #: The chromosome of breakend's mate. - self.chr = str(chr) + if chr is not None: + self.chr = str(chr) + else: + self.chr = None # Single breakend #: The coordinate of breakend's mate. - self.pos = int(pos) + if pos is not None: + self.pos = int(pos) + else: + self.pos = None #: The orientation of breakend's mate. If the sequence 3' of the breakend's mate is connected, True, else if the sequence 5' of the breakend's mate is connected, False. self.remoteOrientation = remoteOrientation #: If the breakend mate is within the assembly, True, else False if the breakend mate is on a contig in an ancillary assembly file. From 4892aabf96568bb72b0f3c41c1a7d5d88a8c81c4 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Sat, 16 Nov 2013 21:06:42 +0100 Subject: [PATCH 062/168] Do not maintain the order of INFO fields within records Using an ordinary dict instead of an OrderedDict for the INFO fields makes parsing faster. The INFO fields are sorted by the VCF writer where all fields defined in the VCF header go first and in the same order, followed by the remaining fields in alphabetical order. Note that this makes writing slower. We lose two things: 1. Getting the INFO fields in original order from a record when using PyVCF as a library (but I don't think most users are expecting this anyway). 2. Preserving the original order of the INFO fields when writing (but the order is predictable). The following are some simple benchmarks, starting with just parsing: 1. Without this change (using OrderedDict): In [1]: %timeit list(vcf.Reader(open('vcf/test/1kg.sites.vcf'))) 100 loops, best of 3: 15 ms per loop 2. With this change (using dict): In [1]: %timeit list(vcf.Reader(open('vcf/test/1kg.sites.vcf'))) 100 loops, best of 3: 10 ms per loop Now parsing the same file and writing it back to VCF: 1.
Without this change (using OrderedDict, no sorting): In [1]: %%timeit ...: reader = vcf.Reader(open('vcf/test/1kg.sites.vcf')) ...: writer = vcf.Writer(open(os.devnull, 'w'), reader) ...: for record in reader: ...: writer.write_record(record) ...: 10 loops, best of 3: 22.7 ms per loop 2. With half this change (using dict, no sorting): In [1]: %%timeit ...: reader = vcf.Reader(open('vcf/test/1kg.sites.vcf')) ...: writer = vcf.Writer(open(os.devnull, 'w'), reader) ...: for record in reader: ...: writer.write_record(record) ...: 100 loops, best of 3: 16.5 ms per loop 3. With this change (using dict, sorting during write): In [6]: %%timeit ...: reader = vcf.Reader(open('vcf/test/1kg.sites.vcf')) ...: writer = vcf.Writer(open(os.devnull, 'w'), reader) ...: for record in reader: ...: writer.write_record(record) ...: 100 loops, best of 3: 17.7 ms per loop Fixes GitHub issue #96. --- vcf/parser.py | 14 ++++++++++++-- vcf/test/test_vcf.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 0aeaeb7..5706ccf 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -350,7 +350,7 @@ def _parse_info(self, info_str): return {} entries = info_str.split(';') - retdict = OrderedDict() + retdict = {} for entry in entries: entry = entry.split('=') @@ -606,6 +606,12 @@ def __init__(self, stream, template, lineterminator="\r\n"): self.template = template self.stream = stream + # Order keys for INFO fields defined in the header (undefined fields + # get a maximum key). + self.info_order = collections.defaultdict( + lambda: len(template.infos), + dict(zip(template.infos.iterkeys(), itertools.count()))) + two = '##{key}=\n' four = '##{key}=\n' _num = self._fix_field_count @@ -681,7 +687,11 @@ def _format_filter(self, flt): def _format_info(self, info): if not info: return '.' 
- return ';'.join([self._stringify_pair(x,y) for x, y in info.iteritems()]) + def order_key(field): + # Order by header definition first, alphabetically second. + return self.info_order[field], field + return ';'.join(self._stringify_pair(f, info[f]) for f in + sorted(info, key=order_key)) def _format_sample(self, fmt, sample): try: diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 9b18baf..730e975 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -248,6 +248,43 @@ def test_contig_line(self): self.assertEqual(reader.contigs['1'].length, 249250621) +class TestInfoOrder(unittest.TestCase): + + def _assert_order(self, definitions, fields): + """ + Elements common to both lists should be in the same order. Elements + only in `fields` should be last and in alphabetical order. + """ + used_definitions = [d for d in definitions if d in fields] + self.assertEqual(used_definitions, fields[:len(used_definitions)]) + self.assertEqual(fields[len(used_definitions):], + sorted(fields[len(used_definitions):])) + + def test_writer(self): + """ + Order of INFO fields should be compatible with the order of their + definition in the header and undefined fields should be last and in + alphabetical order. 
+ """ + reader = vcf.Reader(fh('1kg.sites.vcf', 'r')) + out = StringIO() + writer = vcf.Writer(out, reader, lineterminator='\n') + + for record in reader: + writer.write_record(record) + out.seek(0) + out_str = out.getvalue() + + definitions = [] + for line in out_str.split('\n'): + if line.startswith('##INFO='): + definitions.append(line.split('ID=')[1].split(',')[0]) + if not line or line.startswith('#'): + continue + fields = [f.split('=')[0] for f in line.split('\t')[7].split(';')] + self._assert_order(definitions, fields) + + class TestInfoTypeCharacter(unittest.TestCase): def test_parse(self): reader = vcf.Reader(fh('info-type-character.vcf')) From 0bd567cb65e8be44287a2cc02944c49c7cec165e Mon Sep 17 00:00:00 2001 From: James Boocock Date: Mon, 25 Nov 2013 13:20:10 +1300 Subject: [PATCH 063/168] Fixed tox.ini error regarding duplicate test section. File "/usr/local/bin/tox", line 9, in load_entry_point('tox==1.6.1', 'console_scripts', 'tox')() File "/usr/local/lib/python2.7/dist-packages/tox/_cmdline.py", line 25, in main config = parseconfig(args, 'tox') File "/usr/local/lib/python2.7/dist-packages/tox/_config.py", line 44, in parseconfig parseini(config, inipath) File "/usr/local/lib/python2.7/dist-packages/tox/_config.py", line 187, in __init__ self._cfg = py.iniconfig.IniConfig(config.toxinipath) File "/usr/local/lib/python2.7/dist-packages/py/_iniconfig.py", line 67, in __init__ self._raise(lineno, 'duplicate section %r'%(section, )) File "/usr/local/lib/python2.7/dist-packages/py/_iniconfig.py", line 75, in _raise raise ParseError(self.path, lineno, msg) py._iniconfig.ParseError: /home/smilefreak/MerrimanSelectionPipeline/PyVCF/tox.ini:30: duplicate section 'testenv:py32' --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 52e1085..8a56c17 100644 --- a/tox.ini +++ b/tox.ini @@ -27,6 +27,6 @@ deps = deps = cython -[testenv:py32] +[testenv:py33] deps = cython From e50d7506196768e61220cea6376bfb544785f260 
Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Mon, 25 Nov 2013 11:35:10 +0100 Subject: [PATCH 064/168] Fix incorrect and missing reserved INFO/FORMAT fields The reserved HOMSEQ INFO field was incorrectly defined as Integer and some other fields listed in the VCF spec were missing. Fixes #130 --- vcf/parser.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 5706ccf..8996d37 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -29,25 +29,30 @@ RESERVED_INFO = { 'AA': 'String', 'AC': 'Integer', 'AF': 'Float', 'AN': 'Integer', 'BQ': 'Float', 'CIGAR': 'String', 'DB': 'Flag', 'DP': 'Integer', - 'END': 'Integer', 'H2': 'Flag', 'MQ': 'Float', 'MQ0': 'Integer', - 'NS': 'Integer', 'SB': 'String', 'SOMATIC': 'Flag', 'VALIDATED': 'Flag', - - # VCF 4.1 Additions - 'IMPRECISE':'Flag', 'NOVEL':'Flag', 'END':'Integer', 'SVTYPE':'String', - 'CIPOS':'Integer','CIEND':'Integer','HOMLEN':'Integer','HOMSEQ':'Integer', - 'BKPTID':'String','MEINFO':'String','METRANS':'String','DGVID':'String', - 'DBVARID':'String','MATEID':'String','PARID':'String','EVENT':'String', - 'CILEN':'Integer','CN':'Integer','CNADJ':'Integer','CICN':'Integer', - 'CICNADJ':'Integer' + 'END': 'Integer', 'H2': 'Flag', 'H3': 'Flag', 'MQ': 'Float', + 'MQ0': 'Integer', 'NS': 'Integer', 'SB': 'String', 'SOMATIC': 'Flag', + 'VALIDATED': 'Flag', '1000G': 'Flag', + + # Keys used for structural variants + 'IMPRECISE': 'Flag', 'NOVEL': 'Flag', 'SVTYPE': 'String', + 'SVLEN': 'Integer', 'CIPOS': 'Integer', 'CIEND': 'Integer', + 'HOMLEN': 'Integer', 'HOMSEQ': 'String', 'BKPTID': 'String', + 'MEINFO': 'String', 'METRANS': 'String', 'DGVID': 'String', + 'DBVARID': 'String', 'DBRIPID': 'String', 'MATEID': 'String', + 'PARID': 'String', 'EVENT': 'String', 'CILEN': 'Integer', + 'DPADJ': 'Integer', 'CN': 'Integer', 'CNADJ': 'Integer', + 'CICN': 'Integer', 'CICNADJ': 'Integer' } RESERVED_FORMAT = { 'GT': 'String', 'DP': 'Integer', 'FT': 
'String', 'GL': 'Float', - 'GQ': 'Float', 'HQ': 'Float', + 'GLE': 'String', 'PL': 'Integer', 'GP': 'Float', 'GQ': 'Integer', + 'HQ': 'Integer', 'PS': 'Integer', 'PQ': 'Integer', 'EC': 'Integer', + 'MQ': 'Integer', - # VCF 4.1 Additions - 'CN':'Integer','CNQ':'Float','CNL':'Float','NQ':'Integer','HAP':'Integer', - 'AHAP':'Integer' + # Keys used for structural variants + 'CN': 'Integer', 'CNQ': 'Float', 'CNL': 'Float', 'NQ': 'Integer', + 'HAP': 'Integer', 'AHAP': 'Integer' } # Spec is a bit weak on which metadata lines are singular, like fileformat From cfd7091f826b3ede56b91e3d686f34d5e757827d Mon Sep 17 00:00:00 2001 From: mgymrek Date: Fri, 29 Nov 2013 15:37:25 -0500 Subject: [PATCH 065/168] Added method to return alt. allele frequencies when there is more than one alternate allele --- vcf/model.py | 19 ++++++++++++++++--- vcf/test/test_vcf.py | 4 ++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 54794fc..370e586 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -208,17 +208,30 @@ def num_unknown(self): @property def aaf(self): """ The allele frequency of the alternate allele. - NOTE 1: Punt if more than one alternate allele. + NOTE 1: Return a list of frequencies if more than one alternate allele NOTE 2: Denominator calc'ed from _called_ genotypes. """ - # skip if more than one alternate allele. assumes bi-allelic if len(self.ALT) > 1: - return None + return self.multi_aaf het = self.num_het hom_alt = self.num_hom_alt num_chroms = float(2.0 * self.num_called) return float(het + 2 * hom_alt) / float(num_chroms) + @property + def multi_aaf(self): + """ + The allele frequency of alternate alleles for multi-allelic loci. + Return a list of frequencies for each alternate allele. 
+ """ + num_chroms = 2.0 * self.num_called + allele_counts = collections.defaultdict(int) + for s in self.samples: + if s.gt_type is not None: + allele_counts[s.gt_alleles[0]] += 1 + allele_counts[s.gt_alleles[1]] += 1 + return [allele_counts[str(i)]/num_chroms for i in range(1, len(self.ALT)+1)] + @property def nucl_diversity(self): """ diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 730e975..a882d5b 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -449,11 +449,11 @@ def test_aaf(self): if var.POS == 17330: self.assertEqual(1.0/6.0, aaf) if var.POS == 1110696: - self.assertEqual(None, aaf) + self.assertEqual([2.0/6.0, 4.0/6.0], aaf) if var.POS == 1230237: self.assertEqual(0.0/6.0, aaf) elif var.POS == 1234567: - self.assertEqual(None, aaf) + self.assertEqual([2.0/4.0, 1.0/4.0], aaf) def test_pi(self): reader = vcf.Reader(fh('example-4.0.vcf')) From 50a2fcb2255f4d7a613ae97d8ed1269ad656b7f3 Mon Sep 17 00:00:00 2001 From: mgymrek Date: Mon, 2 Dec 2013 13:55:12 -0500 Subject: [PATCH 066/168] made aaf a list, changed to use Counter --- vcf/model.py | 15 +++++++-------- vcf/test/test_vcf.py | 6 +++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 370e586..1cf94a8 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -207,16 +207,15 @@ def num_unknown(self): @property def aaf(self): - """ The allele frequency of the alternate allele. - NOTE 1: Return a list of frequencies if more than one alternate allele - NOTE 2: Denominator calc'ed from _called_ genotypes. + """ A list of allele frequencies of alternate alleles. + NOTE: Denominator calc'ed from _called_ genotypes. 
""" if len(self.ALT) > 1: return self.multi_aaf het = self.num_het hom_alt = self.num_hom_alt num_chroms = float(2.0 * self.num_called) - return float(het + 2 * hom_alt) / float(num_chroms) + return [float(het + 2 * hom_alt) / float(num_chroms)] @property def multi_aaf(self): @@ -225,11 +224,11 @@ def multi_aaf(self): Return a list of frequencies for each alternate allele. """ num_chroms = 2.0 * self.num_called - allele_counts = collections.defaultdict(int) + allele_counts = collections.Counter() for s in self.samples: if s.gt_type is not None: - allele_counts[s.gt_alleles[0]] += 1 - allele_counts[s.gt_alleles[1]] += 1 + allele_counts.update([s.gt_alleles[0]]) + allele_counts.update([s.gt_alleles[1]]) return [allele_counts[str(i)]/num_chroms for i in range(1, len(self.ALT)+1)] @property @@ -247,7 +246,7 @@ def nucl_diversity(self): # skip if more than one alternate allele. assumes bi-allelic if len(self.ALT) > 1: return None - p = self.aaf + p = self.aaf[0] q = 1.0 - p num_chroms = float(2.0 * self.num_called) return float(num_chroms / (num_chroms - 1.0)) * (2.0 * p * q) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index a882d5b..5a3c3ec 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -445,13 +445,13 @@ def test_aaf(self): for var in reader: aaf = var.aaf if var.POS == 14370: - self.assertEqual(3.0/6.0, aaf) + self.assertEqual([3.0/6.0], aaf) if var.POS == 17330: - self.assertEqual(1.0/6.0, aaf) + self.assertEqual([1.0/6.0], aaf) if var.POS == 1110696: self.assertEqual([2.0/6.0, 4.0/6.0], aaf) if var.POS == 1230237: - self.assertEqual(0.0/6.0, aaf) + self.assertEqual([0.0/6.0], aaf) elif var.POS == 1234567: self.assertEqual([2.0/4.0, 1.0/4.0], aaf) From 36b4b68c6b76b219509cec2efb45cf3dd85ff7eb Mon Sep 17 00:00:00 2001 From: mgymrek Date: Mon, 2 Dec 2013 14:03:54 -0500 Subject: [PATCH 067/168] Changed aaf to use collections.Counter. 
Made aaf return a list with frequencies for all alternate alleles --- README.rst | 2 +- vcf/__init__.py | 2 +- vcf/model.py | 13 ------------- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/README.rst b/README.rst index 52bd780..86e9fda 100644 --- a/README.rst +++ b/README.rst @@ -66,7 +66,7 @@ examine properties of interest:: >>> print record.num_hom_ref, record.num_het, record.num_hom_alt 1 1 1 >>> print record.nucl_diversity, record.aaf - 0.6 0.5 + 0.6 [0.5] >>> print record.get_hets() [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))] >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion diff --git a/vcf/__init__.py b/vcf/__init__.py index 3e49b09..13420df 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -67,7 +67,7 @@ >>> print record.num_hom_ref, record.num_het, record.num_hom_alt 1 1 1 >>> print record.nucl_diversity, record.aaf - 0.6 0.5 + 0.6 [0.5] >>> print record.get_hets() [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))] >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion diff --git a/vcf/model.py b/vcf/model.py index 1cf94a8..dd60df5 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -210,19 +210,6 @@ def aaf(self): """ A list of allele frequencies of alternate alleles. NOTE: Denominator calc'ed from _called_ genotypes. """ - if len(self.ALT) > 1: - return self.multi_aaf - het = self.num_het - hom_alt = self.num_hom_alt - num_chroms = float(2.0 * self.num_called) - return [float(het + 2 * hom_alt) / float(num_chroms)] - - @property - def multi_aaf(self): - """ - The allele frequency of alternate alleles for multi-allelic loci. - Return a list of frequencies for each alternate allele. 
- """ num_chroms = 2.0 * self.num_called allele_counts = collections.Counter() for s in self.samples: From 5497120af5e50f2031135da21afd7c813300d8e5 Mon Sep 17 00:00:00 2001 From: bow Date: Tue, 3 Dec 2013 11:55:38 +0100 Subject: [PATCH 068/168] Add custom equality function as walk_together argument --- vcf/test/test_vcf.py | 36 ++++++++++++++++++++++++++++++++++++ vcf/test/walk_refcall.vcf | 22 ++++++++++++++++++++++ vcf/utils.py | 20 ++++++++++++++------ 3 files changed, 72 insertions(+), 6 deletions(-) create mode 100644 vcf/test/walk_refcall.vcf diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 730e975..daba688 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -950,6 +950,42 @@ def test_walk(self): assert recs[0] is not None assert recs[1] is not None + # case with working custom equality function + + # without custom function, exception should be raised + + reader1 = vcf.Reader(fh('example-4.0.vcf')) + reader2 = vcf.Reader(fh('walk_refcall.vcf')) + self.assertRaisesRegexp(AttributeError, "'NoneType' object has no " + "attribute 'type'", next, utils.walk_together(reader1, reader2)) + + # with custom function, iteration works + + reader1 = vcf.Reader(fh('example-4.0.vcf')) + reader2 = vcf.Reader(fh('walk_refcall.vcf')) + + def custom_eq(rec1, rec2): + # check for equality only on CHROM, POS, and REF + if rec1 is None or rec2 is None: + return False + return rec1.CHROM == rec2.CHROM and rec1.POS == rec2.POS and \ + rec1.REF == rec2.REF + + nrecs, ncomps = 0, 0 + for x in utils.walk_together(reader1, reader2, eq_func=custom_eq): + assert len(x) == 2 + # avoid assert() when one record is None + if x[0] is not None and x[1] is not None: + assert (custom_eq(x[0], x[1]) and custom_eq(x[1], x[0])) + ncomps += 1 + # still increment counter to ensure iteration is finished for all + # records + nrecs += 1 + # check number of records total + assert nrecs == 5 + # check how many records found in all files + assert ncomps == 4 + def test_trim(self): 
tests = [('TAA GAA', 'T G'), ('TA TA', 'T T'), diff --git a/vcf/test/walk_refcall.vcf b/vcf/test/walk_refcall.vcf new file mode 100644 index 0000000..e93aeff --- /dev/null +++ b/vcf/test/walk_refcall.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 14370 rs6054257 G . 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 0|0:48:8:51,51 0/0:43:5:.,. +20 17330 . T . 3.0 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|0:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A . 1e+03 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 0|0:21:6:23,27 0|0:2:0:18,2 0/0:35:4 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 diff --git a/vcf/utils.py b/vcf/utils.py index ec6f686..0ab04ca 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -2,14 +2,25 @@ Utilities for VCF files. """ +import operator -def walk_together(*readers): + +def walk_together(*readers, **kwargs): """ Simultaneously iteratate two or more VCF readers and return lists of concurrent records from each reader, with None if no record present. Caller must check the inputs are sorted in the same way and use the same reference otherwise behaviour is undefined. 
""" + # if defined, custom equality functions must take the same arguments + # as operator.eq + if 'eq_func' in kwargs: + eq_func = kwargs['eq_func'] + # by default, we use the equality operator (==), which compares + # equality in CHROM, POS, REF, and ALT + else: + eq_func = operator.eq + # if one of the VCFs has no records, StopIteration is # raised immediately, so we need to check for that and # deal appropriately @@ -23,15 +34,12 @@ def walk_together(*readers): while True: min_next = min([x for x in nexts if x is not None]) - # this line uses equality on Records, which checks the ALTs - # not sure what to do with records that have overlapping but different - # variation - yield [x if x is None or x == min_next else None for x in nexts] + yield [x if x is None or eq_func(x, min_next) else None for x in nexts] # update nexts that we just yielded for i, n in enumerate(nexts): - if n is not None and n == min_next: + if n is not None and eq_func(n, min_next): try: nexts[i] = readers[i].next() except StopIteration: From 226f56af4d81c5a488a28e619f1974e32a1ebf16 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 3 Dec 2013 13:26:36 +0100 Subject: [PATCH 069/168] Add dependency on collections.Counter implementation for Python 2.6 As per my suggestion in PR jamescasbon#131, defaultdict(int) was changed to Counter(). However, I didn't realize it was only added in Python 2.7 and we target Python 2.6. This approach follows what we already did for collections.OrderedDict. 
--- .travis.yml | 2 +- setup.py | 7 +++++-- tox.ini | 5 +++-- vcf/model.py | 7 ++++++- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index a1cb3a4..f54f5da 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: - "3.3" - "pypy" install: - - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam argparse ordereddict; fi" + - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam argparse counter ordereddict; fi" - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam; fi" - python setup.py install script: python setup.py test diff --git a/setup.py b/setup.py index bca3a0d..1ea709c 100644 --- a/setup.py +++ b/setup.py @@ -16,9 +16,12 @@ except ImportError: requires.append('argparse') - +import collections +try: + collections.Counter +except AttributeError: + requires.append('counter') try: - import collections collections.OrderedDict except AttributeError: requires.append('ordereddict') diff --git a/tox.ini b/tox.ini index 8a56c17..0c07a88 100644 --- a/tox.ini +++ b/tox.ini @@ -14,6 +14,7 @@ commands = [testenv:py26] deps = argparse + counter ordereddict cython pysam @@ -24,9 +25,9 @@ deps = cython [testenv:py32] -deps = +deps = cython [testenv:py33] -deps = +deps = cython diff --git a/vcf/model.py b/vcf/model.py index dd60df5..dc34319 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -2,6 +2,11 @@ import collections import sys +try: + from collections import Counter +except ImportError: + from counter import Counter + class _Call(object): """ A genotype call, a cell entry in a VCF file""" @@ -211,7 +216,7 @@ def aaf(self): NOTE: Denominator calc'ed from _called_ genotypes. 
""" num_chroms = 2.0 * self.num_called - allele_counts = collections.Counter() + allele_counts = Counter() for s in self.samples: if s.gt_type is not None: allele_counts.update([s.gt_alleles[0]]) From 322c2121cc0175e6faa662648170028070b0d500 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Wed, 4 Dec 2013 14:35:25 +0100 Subject: [PATCH 070/168] Fix unit tests on Python 2.6 and add missing tests to the suite --- vcf/test/test_vcf.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index ba9a8a3..8fcd2a8 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -956,8 +956,8 @@ def test_walk(self): reader1 = vcf.Reader(fh('example-4.0.vcf')) reader2 = vcf.Reader(fh('walk_refcall.vcf')) - self.assertRaisesRegexp(AttributeError, "'NoneType' object has no " - "attribute 'type'", next, utils.walk_together(reader1, reader2)) + self.assertRaises(AttributeError, next, + utils.walk_together(reader1, reader2)) # with custom function, iteration works @@ -1009,24 +1009,26 @@ def test_meta(self): assert reader.metadata['GATKCommandLine'][1]['CommandLineOptions'] == '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"' - +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFreebayesOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamtoolsOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBcfToolsOutput)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutputWriter)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestInfoTypeCharacter)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBcfToolsOutputWriter)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestWriterDictionaryMeta)) 
-suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kgSites)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGoNL)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestInfoOrder)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestInfoTypeCharacter)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutputWriter)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBcfToolsOutputWriter)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestWriterDictionaryMeta)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamplesSpace)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestMixedFiltering)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUtils)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGATKMeta)) From b8c0af7ee8382297aaf41409a359410263df5f0c Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Wed, 4 Dec 2013 14:38:34 +0100 Subject: [PATCH 071/168] Fix comparison of _Record objects on Python 3 --- vcf/model.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vcf/model.py b/vcf/model.py 
index dc34319..09013dd 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -148,6 +148,11 @@ def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, self.samples = samples or [] self._sample_indexes = sample_indexes + # For Python 2 + def __cmp__(self, other): + return cmp((self.CHROM, self.POS), (other.CHROM, other.POS)) + + # For Python 3 def __eq__(self, other): """ _Records are equal if they describe the same variant (same position, alleles) """ return (self.CHROM == other.CHROM and @@ -155,15 +160,16 @@ def __eq__(self, other): self.REF == other.REF and self.ALT == other.ALT) + # For Python 3 + def __lt__(self, other): + return (self.CHROM, self.POS) < (other.CHROM, other.POS) + def __iter__(self): return iter(self.samples) def __str__(self): return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__ - def __cmp__(self, other): - return cmp((self.CHROM, self.POS), (other.CHROM, other.POS)) - def add_format(self, fmt): self.FORMAT = self.FORMAT + ':' + fmt From 23d1fc0d663f764b955110bc79023cbbf0eb57af Mon Sep 17 00:00:00 2001 From: bow Date: Wed, 4 Dec 2013 16:27:07 +0100 Subject: [PATCH 072/168] Update __eq__ operators to return False for comparison with different types --- vcf/model.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 09013dd..552e1c4 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -36,6 +36,8 @@ def __eq__(self, other): """ Two _Calls are equal if their _Records are equal and the samples and ``gt_type``s are the same """ + if not isinstance(other, self.__class__): + return False return (self.site == other.site and self.sample == other.sample and self.gt_type == other.gt_type) @@ -155,6 +157,10 @@ def __cmp__(self, other): # For Python 3 def __eq__(self, other): """ _Records are equal if they describe the same variant (same position, alleles) """ + # a _Record is never equal with a non-Record + # do the check here to avoid 
AttributeError (i.e. None does not have CHROM) + if not isinstance(other, self.__class__): + return False return (self.CHROM == other.CHROM and self.POS == other.POS and self.REF == other.REF and @@ -447,6 +453,8 @@ def __str__(self): raise NotImplementedError def __eq__(self, other): + if not isinstance(other, self.__class__): + return False return self.type == other.type @@ -473,8 +481,9 @@ def __len__(self): def __eq__(self, other): if isinstance(other, basestring): return self.sequence == other - else: - return super(_Substitution, self).__eq__(other) and self.sequence == other.sequence + elif not isinstance(other, self.__class__): + return False + return super(_Substitution, self).__eq__(other) and self.sequence == other.sequence class _Breakend(_AltRecord): @@ -523,6 +532,8 @@ def __str__(self): return self.connectingSequence + remoteTag def __eq__(self, other): + if not isinstance(other, self.__class__): + return False return super(_Breakend, self).__eq__(other) \ and self.chr == other.chr \ and self.pos == other.pos \ From 1a103cd04d3f65640f82ae2ff1eb4c24ea7360b0 Mon Sep 17 00:00:00 2001 From: bow Date: Wed, 4 Dec 2013 16:34:46 +0100 Subject: [PATCH 073/168] Add tests for updated equality behavior --- vcf/test/test_vcf.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 8fcd2a8..5bbe824 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -425,6 +425,11 @@ def test_num_calls(self): var.num_het + var.num_unknown) self.assertEqual(len(var.samples), num_calls) + def test_dunder_eq(self): + rec = vcf.Reader(fh('example-4.0.vcf')).next() + self.assertFalse(rec == None) + self.assertFalse(None == rec) + def test_call_rate(self): reader = vcf.Reader(fh('example-4.0.vcf')) for var in reader: @@ -733,6 +738,13 @@ def test_pickle(self): class TestCall(unittest.TestCase): + def test_dunder_eq(self): + reader = vcf.Reader(fh('example-4.0.vcf')) + var = reader.next() + example_call = 
var.samples[0] + self.assertFalse(example_call == None) + self.assertFalse(None == example_call) + def test_phased(self): reader = vcf.Reader(fh('example-4.0.vcf')) for var in reader: From 8dcca20936c81a639893cf1eee5f8336d594acb7 Mon Sep 17 00:00:00 2001 From: bow Date: Wed, 4 Dec 2013 16:28:22 +0100 Subject: [PATCH 074/168] Update walk_together test to accommodate __eq__ behavior change --- vcf/test/test_vcf.py | 45 ++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 5bbe824..d3b6c64 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -964,17 +964,28 @@ def test_walk(self): # case with working custom equality function - # without custom function, exception should be raised + # without custom function, most records in these files + # are different since the default equality checks + # for ALT values reader1 = vcf.Reader(fh('example-4.0.vcf')) reader2 = vcf.Reader(fh('walk_refcall.vcf')) - self.assertRaises(AttributeError, next, - utils.walk_together(reader1, reader2)) - # with custom function, iteration works + # counters for distinct records and overlapping records + ndist_def, nover_def = 0, 0 + for x in utils.walk_together(reader1, reader2): + assert len(x) == 2 + if x[0] is not None and x[1] is not None: + assert (x[0] == x[1] and x[1] == x[0]) + nover_def += 1 + ndist_def += 1 + # check how many overlapping records + assert nover_def == 1 + # check how many distinct records + assert ndist_def == 8 - reader1 = vcf.Reader(fh('example-4.0.vcf')) - reader2 = vcf.Reader(fh('walk_refcall.vcf')) + # with custom function that does not check ALT, + # we see more overlaps and less distinct records def custom_eq(rec1, rec2): # check for equality only on CHROM, POS, and REF @@ -983,20 +994,22 @@ def custom_eq(rec1, rec2): return rec1.CHROM == rec2.CHROM and rec1.POS == rec2.POS and \ rec1.REF == rec2.REF - nrecs, ncomps = 0, 0 + reader1 = 
vcf.Reader(fh('example-4.0.vcf')) + reader2 = vcf.Reader(fh('walk_refcall.vcf')) + + ndist_cust, nover_cust = 0, 0 for x in utils.walk_together(reader1, reader2, eq_func=custom_eq): assert len(x) == 2 - # avoid assert() when one record is None if x[0] is not None and x[1] is not None: assert (custom_eq(x[0], x[1]) and custom_eq(x[1], x[0])) - ncomps += 1 - # still increment counter to ensure iteration is finished for all - # records - nrecs += 1 - # check number of records total - assert nrecs == 5 - # check how many records found in all files - assert ncomps == 4 + nover_cust += 1 + ndist_cust += 1 + assert nover_cust == 4 + assert ndist_cust == 5 + + # final check just to be absolutely sure + assert ndist_def != ndist_cust + assert nover_def != nover_cust def test_trim(self): tests = [('TAA GAA', 'T G'), From d39ffa0737c4b3f478972aaf84753fc48be2da77 Mon Sep 17 00:00:00 2001 From: mgymrek Date: Tue, 10 Dec 2013 19:55:57 -0500 Subject: [PATCH 075/168] Adding method to compute heterozygosity for a site --- README.rst | 4 ++-- vcf/__init__.py | 4 ++-- vcf/model.py | 12 ++++++++++++ vcf/test/test_vcf.py | 15 +++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 86e9fda..eed7808 100644 --- a/README.rst +++ b/README.rst @@ -65,8 +65,8 @@ examine properties of interest:: 3 1.0 0 >>> print record.num_hom_ref, record.num_het, record.num_hom_alt 1 1 1 - >>> print record.nucl_diversity, record.aaf - 0.6 [0.5] + >>> print record.nucl_diversity, record.aaf, record.heterozygosity + 0.6 [0.5], 0.5 >>> print record.get_hets() [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))] >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion diff --git a/vcf/__init__.py b/vcf/__init__.py index 13420df..647106a 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -66,8 +66,8 @@ 3 1.0 0 >>> print record.num_hom_ref, record.num_het, record.num_hom_alt 1 1 1 - >>> print record.nucl_diversity, 
record.aaf - 0.6 [0.5] + >>> print record.nucl_diversity, record.aaf, record.heterozygosity + 0.6 [0.5] 0.5 >>> print record.get_hets() [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))] >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion diff --git a/vcf/model.py b/vcf/model.py index 09013dd..b5a37e6 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -249,6 +249,18 @@ def nucl_diversity(self): num_chroms = float(2.0 * self.num_called) return float(num_chroms / (num_chroms - 1.0)) * (2.0 * p * q) + @property + def heterozygosity(self): + """ + Heterozygosity of a site. Heterozygosity gives the probability that + two randomly chosen chromosomes from the population have different + alleles, giving a measurement of the degree of polymorphism in a population. + + If there are i alleles with frequency p_i, H-1-sum_i(p_i^2) + """ + allele_freqs = [1-sum(self.aaf)] + self.aaf + return 1 - sum(map(lambda x: x**2, allele_freqs)) + def get_hom_refs(self): """ The list of hom ref genotypes""" return [s for s in self.samples if s.gt_type == 0] diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 8fcd2a8..713656c 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -470,6 +470,21 @@ def test_pi(self): elif var.POS == 1234567: self.assertEqual(None, pi) + def test_heterozygosity(self): + reader = vcf.Reader(fh('example-4.0.vcf')) + for var in reader: + het = var.heterozygosity + if var.POS == 14370: + self.assertEqual(0.5, het) + if var.POS == 17330: + self.assertEqual(1-((1.0/6)**2 + (5.0/6)**2), het) + if var.POS == 1110696: + self.assertEqual(4.0/9.0, het) + if var.POS == 1230237: + self.assertEqual(0.0, het) + elif var.POS == 1234567: + self.assertEqual(5.0/8.0, het) + def test_is_snp(self): reader = vcf.Reader(fh('example-4.0.vcf')) for r in reader: From cfade353527233e027b625f0d76da3db2e8f75ba Mon Sep 17 00:00:00 2001 From: mgymrek Date: Tue, 10 Dec 2013 19:57:59 -0500 Subject: [PATCH 076/168] added 
heterozygosity method, fixed typo in docstring --- vcf/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index b5a37e6..5d5acd6 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -254,9 +254,9 @@ def heterozygosity(self): """ Heterozygosity of a site. Heterozygosity gives the probability that two randomly chosen chromosomes from the population have different - alleles, giving a measurement of the degree of polymorphism in a population. + alleles, giving a measure of the degree of polymorphism in a population. - If there are i alleles with frequency p_i, H-1-sum_i(p_i^2) + If there are i alleles with frequency p_i, H=1-sum_i(p_i^2) """ allele_freqs = [1-sum(self.aaf)] + self.aaf return 1 - sum(map(lambda x: x**2, allele_freqs)) From 1bd477a141a749c7c401f392681ce76f075ce10a Mon Sep 17 00:00:00 2001 From: mgymrek Date: Tue, 10 Dec 2013 19:59:54 -0500 Subject: [PATCH 077/168] fixed small typo in readme.rst for heterozygosity... --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index eed7808..a60c0c8 100644 --- a/README.rst +++ b/README.rst @@ -66,7 +66,7 @@ examine properties of interest:: >>> print record.num_hom_ref, record.num_het, record.num_hom_alt 1 1 1 >>> print record.nucl_diversity, record.aaf, record.heterozygosity - 0.6 [0.5], 0.5 + 0.6 [0.5] 0.5 >>> print record.get_hets() [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))] >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion From 9c3822d63f161cdeaad6c6ebfb0d359a8981736e Mon Sep 17 00:00:00 2001 From: bow Date: Sat, 11 Jan 2014 11:43:37 +0100 Subject: [PATCH 078/168] Ensure spurious line ending characters on records are stripped away --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index 8996d37..222f1a8 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -523,7 +523,7 @@ def _parse_alt(self, 
str): def next(self): '''Return the next record in the file.''' - line = self.reader.next() + line = self.reader.next().rstrip() row = re.split(self._separator, line) chrom = row[0] if self._prepend_chr: From a60ef2f88da920f237a5565016626290facbfe32 Mon Sep 17 00:00:00 2001 From: bow Date: Sat, 11 Jan 2014 11:52:38 +0100 Subject: [PATCH 079/168] Fix so conversion to Py3 works --- vcf/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 222f1a8..26948cb 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -523,8 +523,8 @@ def _parse_alt(self, str): def next(self): '''Return the next record in the file.''' - line = self.reader.next().rstrip() - row = re.split(self._separator, line) + line = self.reader.next() + row = re.split(self._separator, line.rstrip()) chrom = row[0] if self._prepend_chr: chrom = 'chr' + chrom From a06f583577b09a1af68933a0d6d4194d3ebcf161 Mon Sep 17 00:00:00 2001 From: Zhaorong Ma Date: Wed, 5 Feb 2014 11:11:37 -0500 Subject: [PATCH 080/168] Changed the default line ending in vcf.Writer() to '\n'. 
--- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index 8996d37..8c56ab8 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -606,7 +606,7 @@ class Writer(object): # Reverse keys and values in header field count dictionary counts = dict((v,k) for k,v in field_counts.iteritems()) - def __init__(self, stream, template, lineterminator="\r\n"): + def __init__(self, stream, template, lineterminator="\n"): self.writer = csv.writer(stream, delimiter="\t", lineterminator=lineterminator) self.template = template self.stream = stream From 7c2710358ebd76c2bc00d2e7fe09889f0a6abbbe Mon Sep 17 00:00:00 2001 From: James Casbon Date: Thu, 6 Feb 2014 16:02:29 +0000 Subject: [PATCH 081/168] version 0.6.5 --- docs/HISTORY.rst | 14 ++++++++++++++ vcf/__init__.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index 15aba83..2b03818 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -17,6 +17,20 @@ New features should have test code sent with them. 
Changes ======= +0.6.5 Release +------------- + +* Better contig handling (#115, #116, #119 thanks Martijn) +* INFO lines with type character (#120, #121 thanks @AndrewUzilov, Martijn) +* Single breakends fix (#126 thanks @pkrushe) +* Speedup by losing ordering of INFO (#128 thanks Martijn) +* HOMSEQ and other missing fields in INFO (#130 thanks Martijn) +* Add aaf property, (thanks @mgymrek #131) +* Custom equality for walk_together, thanks bow #132 +* Change default line encoding to '\n' +* Improved __eq__ (#134, thanks bow) + + 0.6.4 Release ------------- diff --git a/vcf/__init__.py b/vcf/__init__.py index 647106a..ec89aee 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -177,4 +177,4 @@ from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT -VERSION = '0.6.4' +VERSION = '0.6.5' From 616f31093ef63b3edc0bf059e89324485dfc19b0 Mon Sep 17 00:00:00 2001 From: datagram Date: Thu, 6 Feb 2014 01:08:21 -0800 Subject: [PATCH 082/168] Fix for issue #140, add vcf_record_sort_key arg - Added 'vcf_record_sort_key' to allow user to specify arbitrary chromosome ordering. - Fixed issue #140 by making sure to emit all records from the current chromosome before moving on to the next one. This takes care of the problem in most typical cases (eg. when all files have records for all contigs), but not in some edge cases, in which case the 'vcf_record_sort_key' arg can be used to fully solve the problem by explicitly defining the chromosome order. --- vcf/utils.py | 85 ++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/vcf/utils.py b/vcf/utils.py index 0ab04ca..c4dea1a 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -2,51 +2,50 @@ Utilities for VCF files. """ -import operator +def walk_together(*readers, **kwargs): + """ Simultaneously iteratate two or more VCF readers and return + lists of concurrent records from each + reader, with None if no record present. 
Caller must check the + inputs are sorted in the same way and use the same reference + otherwise behaviour is undefined. + + Args: + vcf_record_sort_key: function that takes a VCF record and returns a tuple that can be used as the key for comparing and sorting VCF records across all given VCFReaders. The tuple's 1st element should be the contig name. + """ + if 'vcf_record_sort_key' in kwargs: + get_key = kwargs['vcf_record_sort_key'] + else: + get_key = lambda r: (r.CHROM, r.POS) + + nexts = [] + for reader in readers: + try: + nexts.append(reader.next()) + except StopIteration: + nexts.append(None) + min_k = (None,) # keep track of the previous min key's contig + while True: + kdict = {i: get_key(x) for i,x in enumerate(nexts) if x is not None} + keys_with_prev_contig = [k for k in kdict.values() if k[0] == min_k[0]] + if any(keys_with_prev_contig): + # finish all records from previous contig + min_k = min(keys_with_prev_contig) + else: + # move on to the next contig + min_k = min(kdict.values()) + + min_k_idxs = set([i for i, k in kdict.items() if k == min_k]) + yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))] -def walk_together(*readers, **kwargs): - """ Simultaneously iteratate two or more VCF readers and return - lists of concurrent records from each - reader, with None if no record present. Caller must check the - inputs are sorted in the same way and use the same reference - otherwise behaviour is undefined. 
- """ - # if defined, custom equality functions must take the same arguments - # as operator.eq - if 'eq_func' in kwargs: - eq_func = kwargs['eq_func'] - # by default, we use the equality operator (==), which compares - # equality in CHROM, POS, REF, and ALT - else: - eq_func = operator.eq - - # if one of the VCFs has no records, StopIteration is - # raised immediately, so we need to check for that and - # deal appropriately - nexts = [] - for reader in readers: - try: - nexts.append(reader.next()) - except StopIteration: - nexts.append(None) - - while True: - min_next = min([x for x in nexts if x is not None]) - - yield [x if x is None or eq_func(x, min_next) else None for x in nexts] - - # update nexts that we just yielded - for i, n in enumerate(nexts): - - if n is not None and eq_func(n, min_next): - try: - nexts[i] = readers[i].next() - except StopIteration: - nexts[i] = None - - if all([x is None for x in nexts]): - break + for i in min_k_idxs: + try: + nexts[i] = readers[i].next() + except StopIteration: + nexts[i] = None + + if all([x is None for x in nexts]): + break def trim_common_suffix(*sequences): From 2de70ce85f1fb15511ee4f5302ee13224cd1b0eb Mon Sep 17 00:00:00 2001 From: Ben Weisburd Date: Thu, 6 Feb 2014 13:35:25 -0800 Subject: [PATCH 083/168] Fixed spacing and wrapping in utils.py, removed test for old walk_together arg (eq function), fixed edge case in _AltRecord --- vcf/model.py | 4 +- vcf/test/test_vcf.py | 72 +++++++++++++++++----------------- vcf/utils.py | 93 ++++++++++++++++++++++++-------------------- 3 files changed, 89 insertions(+), 80 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 21eabc5..266b941 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -465,9 +465,7 @@ def __str__(self): raise NotImplementedError def __eq__(self, other): - if not isinstance(other, self.__class__): - return False - return self.type == other.type + return self.type == getattr(other, 'type', None) class _Substitution(_AltRecord): diff --git 
a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index dcc54f4..9853ced 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -20,7 +20,7 @@ class TestVcfSpecs(unittest.TestCase): def test_vcf_4_0(self): reader = vcf.Reader(fh('example-4.0.vcf')) - assert reader.metadata['fileformat'] == 'VCFv4.0' + self.assertEqual(reader.metadata['fileformat'], 'VCFv4.0') # test we can walk the file at least for r in reader: @@ -81,21 +81,21 @@ def test_vcf_4_1_bnd(self): for a in r.ALT: print(a) if r.ID == "bnd1": - assert len(r.ALT) == 1 - assert r.ALT[0].type == "BND" - assert r.ALT[0].chr == "2" - assert r.ALT[0].pos == 3 - assert r.ALT[0].orientation == False - assert r.ALT[0].remoteOrientation == True - assert r.ALT[0].connectingSequence == "T" + self.assertEqual(len(r.ALT), 1) + self.assertEqual(r.ALT[0].type, "BND") + self.assertEqual(r.ALT[0].chr, "2") + self.assertEqual(r.ALT[0].pos, 3) + self.assertEqual(r.ALT[0].orientation, False) + self.assertEqual(r.ALT[0].remoteOrientation, True) + self.assertEqual(r.ALT[0].connectingSequence, "T") if r.ID == "bnd4": - assert len(r.ALT) == 1 - assert r.ALT[0].type == "BND" - assert r.ALT[0].chr == "1" - assert r.ALT[0].pos == 2 - assert r.ALT[0].orientation == True - assert r.ALT[0].remoteOrientation == False - assert r.ALT[0].connectingSequence == "G" + self.assertEqual(len(r.ALT), 1) + self.assertEqual(r.ALT[0].type, "BND") + self.assertEqual(r.ALT[0].chr, "1") + self.assertEqual(r.ALT[0].pos, 2) + self.assertEqual(r.ALT[0].orientation, True) + self.assertEqual(r.ALT[0].remoteOrientation, False) + self.assertEqual(r.ALT[0].connectingSequence, "G") for c in r: print(c) assert c @@ -165,7 +165,7 @@ def testParse(self): n+=1 for x in r: assert x - assert n == self.n_calls + self.assertEqual(n, self.n_calls) class TestSamtoolsOutput(unittest.TestCase): @@ -748,7 +748,7 @@ def test_info_multiple_values(self): def test_pickle(self): reader = vcf.Reader(fh('example-4.0.vcf')) for var in reader: - assert 
cPickle.loads(cPickle.dumps(var)) == var + self.assertEqual(cPickle.loads(cPickle.dumps(var)), var) class TestCall(unittest.TestCase): @@ -836,7 +836,7 @@ def testFetchSite(self): if not self.run: return site = self.reader.fetch('20', 14370) - assert site.POS == 14370 + self.assertEqual(site.POS, 14370) site = self.reader.fetch('20', 14369) assert site is None @@ -878,7 +878,7 @@ def testApplyFilter(self): return s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 test/example-4.0.vcf sq') #print(out) - assert s == 0 + self.assertEqual(s, 0) buf = StringIO() buf.write(out) buf.seek(0) @@ -900,7 +900,7 @@ def testApplyFilter(self): n += 1 else: assert 'sq30' not in r.FILTER - assert n == 2 + self.assertEqual(n, 2) def testApplyMultipleFilters(self): @@ -908,7 +908,7 @@ def testApplyMultipleFilters(self): return s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 ' '--genotype-quality 50 test/example-4.0.vcf sq mgq') - assert s == 0 + self.assertEqual(s, 0) #print(out) buf = StringIO() buf.write(out) @@ -954,10 +954,11 @@ def test_walk(self): n = 0 for x in utils.walk_together(reader1, reader2, reader3): - assert len(x) == 3 - assert (x[0] == x[1]) and (x[1] == x[2]) + self.assertEqual(len(x), 3) + self.assertEqual(x[0], x[1]) + self.assertEqual(x[1], x[2]) n+= 1 - assert n == 5 + self.assertEqual(n, 5) # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left @@ -1014,17 +1015,18 @@ def custom_eq(rec1, rec2): ndist_cust, nover_cust = 0, 0 for x in utils.walk_together(reader1, reader2, eq_func=custom_eq): - assert len(x) == 2 + self.assertEqual(len(x), 2) + # avoid assert() when one record is None if x[0] is not None and x[1] is not None: assert (custom_eq(x[0], x[1]) and custom_eq(x[1], x[0])) - nover_cust += 1 - ndist_cust += 1 - assert nover_cust == 4 - assert ndist_cust == 5 - - # final check just to be absolutely sure - assert ndist_def != ndist_cust - assert 
nover_def != nover_cust + ncomps += 1 + # still increment counter to ensure iteration is finished for all + # records + nrecs += 1 + # check number of records total + self.assertEqual(nrecs, 5) + # check how many records found in all files + self.assertEqual(ncomps, 4) def test_trim(self): tests = [('TAA GAA', 'T G'), @@ -1045,8 +1047,8 @@ def test_meta(self): # expect no exceptions raised reader = vcf.Reader(fh('gatk_26_meta.vcf')) assert 'GATKCommandLine' in reader.metadata - assert reader.metadata['GATKCommandLine'][0]['CommandLineOptions'] == '"analysis_type=LeftAlignAndTrimVariants"' - assert reader.metadata['GATKCommandLine'][1]['CommandLineOptions'] == '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"' + self.assertEqual(reader.metadata['GATKCommandLine'][0]['CommandLineOptions'], '"analysis_type=LeftAlignAndTrimVariants"') + self.assertEqual(reader.metadata['GATKCommandLine'][1]['CommandLineOptions'], '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"') suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs)) diff --git a/vcf/utils.py b/vcf/utils.py index c4dea1a..09a6668 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -2,50 +2,59 @@ Utilities for VCF files. """ -def walk_together(*readers, **kwargs): - """ Simultaneously iteratate two or more VCF readers and return - lists of concurrent records from each - reader, with None if no record present. Caller must check the - inputs are sorted in the same way and use the same reference - otherwise behaviour is undefined. - - Args: - vcf_record_sort_key: function that takes a VCF record and returns a tuple that can be used as the key for comparing and sorting VCF records across all given VCFReaders. The tuple's 1st element should be the contig name. 
- """ - if 'vcf_record_sort_key' in kwargs: - get_key = kwargs['vcf_record_sort_key'] - else: - get_key = lambda r: (r.CHROM, r.POS) - - nexts = [] - for reader in readers: - try: - nexts.append(reader.next()) - except StopIteration: - nexts.append(None) +def walk_together(*readers, **kwargs): + """ + Simultaneously iteratate over two or more VCF readers. For each + genomic position with a variant, return a list of size equal to the number + of VCF readers. This list contains the VCF record from readers that have + this variant, and None for readers that don't have it. + The caller must make sure that inputs are sorted in the same way and use the + same reference otherwise behaviour is undefined. + + Args: + vcf_record_sort_key: function that takes a VCF record and returns a + tuple that can be used as a key for comparing and sorting VCF + records across all readers. This tuple defines what it means for two + variants to be equal (eg. whether it's only their position or also + their allele values), and implicitly determines the chromosome + ordering since the tuple's 1st element is typically the chromosome + name (or calculated from it). 
+ """ + if 'vcf_record_sort_key' in kwargs: + get_key = kwargs['vcf_record_sort_key'] + else: + get_key = lambda r: (r.CHROM, r.POS) #, r.REF, r.ALT) + + nexts = [] + for reader in readers: + try: + nexts.append(reader.next()) + except StopIteration: + nexts.append(None) + + min_k = (None,) # keep track of the previous min key's contig + while True: + next_idx_to_k = dict( + (i, get_key(r)) for i, r in enumerate(nexts) if r is not None) + keys_with_prev_contig = [ + k for k in next_idx_to_k.values() if k[0] == min_k[0]] + + if any(keys_with_prev_contig): + min_k = min(keys_with_prev_contig) # finish previous contig + else: + min_k = min(next_idx_to_k.values()) # move on to next contig - min_k = (None,) # keep track of the previous min key's contig - while True: - kdict = {i: get_key(x) for i,x in enumerate(nexts) if x is not None} - keys_with_prev_contig = [k for k in kdict.values() if k[0] == min_k[0]] - if any(keys_with_prev_contig): - # finish all records from previous contig - min_k = min(keys_with_prev_contig) - else: - # move on to the next contig - min_k = min(kdict.values()) - - min_k_idxs = set([i for i, k in kdict.items() if k == min_k]) - yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))] + min_k_idxs = set([i for i, k in next_idx_to_k.items() if k == min_k]) + yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))] - for i in min_k_idxs: - try: - nexts[i] = readers[i].next() - except StopIteration: - nexts[i] = None - - if all([x is None for x in nexts]): - break + for i in min_k_idxs: + try: + nexts[i] = readers[i].next() + except StopIteration: + nexts[i] = None + + if all([r is None for r in nexts]): + break def trim_common_suffix(*sequences): From d7563dc80a1ef645e1c22a860183ec3472c835ec Mon Sep 17 00:00:00 2001 From: Ben Weisburd Date: Thu, 6 Feb 2014 14:16:19 -0800 Subject: [PATCH 084/168] Fixed edge case where all inputs are empty, simplified logic --- vcf/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 
deletions(-) diff --git a/vcf/utils.py b/vcf/utils.py index 09a6668..456e5fa 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -33,7 +33,7 @@ def walk_together(*readers, **kwargs): nexts.append(None) min_k = (None,) # keep track of the previous min key's contig - while True: + while any([r is not None for r in nexts]): next_idx_to_k = dict( (i, get_key(r)) for i, r in enumerate(nexts) if r is not None) keys_with_prev_contig = [ @@ -52,9 +52,6 @@ def walk_together(*readers, **kwargs): nexts[i] = readers[i].next() except StopIteration: nexts[i] = None - - if all([r is None for r in nexts]): - break def trim_common_suffix(*sequences): From ce4d20f1b92fb9a3c5d9e21f6b04bf3bde566f94 Mon Sep 17 00:00:00 2001 From: datagram Date: Fri, 7 Feb 2014 03:20:20 -0800 Subject: [PATCH 085/168] finished fixing edge case where 'other' is None --- vcf/model.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 266b941..c6e8f42 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -36,11 +36,9 @@ def __eq__(self, other): """ Two _Calls are equal if their _Records are equal and the samples and ``gt_type``s are the same """ - if not isinstance(other, self.__class__): - return False - return (self.site == other.site - and self.sample == other.sample - and self.gt_type == other.gt_type) + return (self.site == getattr(other, "site", None) + and self.sample == getattr(other, "sample", None) + and self.gt_type == getattr(other, "gt_type", None)) def __getstate__(self): return dict((attr, getattr(self, attr)) for attr in self.__slots__) @@ -152,23 +150,19 @@ def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, # For Python 2 def __cmp__(self, other): - return cmp((self.CHROM, self.POS), (other.CHROM, other.POS)) + return cmp((self.CHROM, self.POS), (getattr(other, "CHROM", None), getattr(other, "POS", None))) # For Python 3 def __eq__(self, other): """ _Records are equal if they describe the 
same variant (same position, alleles) """ - # a _Record is never equal with a non-Record - # do the check here to avoid AttributeError (i.e. None does not have CHROM) - if not isinstance(other, self.__class__): - return False - return (self.CHROM == other.CHROM and - self.POS == other.POS and - self.REF == other.REF and - self.ALT == other.ALT) + return (self.CHROM == getattr(other, "CHROM", None) and + self.POS == getattr(other, "POS", None) and + self.REF == getattr(other, "REF", None) and + self.ALT == getattr(other, "ALT", None)) # For Python 3 def __lt__(self, other): - return (self.CHROM, self.POS) < (other.CHROM, other.POS) + return (self.CHROM, self.POS) < (getattr(other, "CHROM", None), getattr(other, "POS", None)) def __iter__(self): return iter(self.samples) @@ -545,12 +539,12 @@ def __eq__(self, other): if not isinstance(other, self.__class__): return False return super(_Breakend, self).__eq__(other) \ - and self.chr == other.chr \ - and self.pos == other.pos \ - and self.remoteOrientation == other.remoteOrientation \ - and self.withinMainAssembly == other.withinMainAssembly \ - and self.orientation == other.orientation \ - and self.connectingSequence == other.connectingSequence + and self.chr == getattr(other, "chr", None) \ + and self.pos == getattr(other, "pos", None) \ + and self.remoteOrientation == getattr(other, "remoteOrientation", None) \ + and self.withinMainAssembly == getattr(other, "withinMainAssembly", None) \ + and self.orientation == getattr(other, "orientation", None) \ + and self.connectingSequence == getattr(other, "connectingSequence", None) class _SingleBreakend(_Breakend): From d51db2303913b725612a29ad07b34968e947cfa3 Mon Sep 17 00:00:00 2001 From: datagram Date: Fri, 7 Feb 2014 03:21:19 -0800 Subject: [PATCH 086/168] Test data for testing the fix for issue #140 --- vcf/test/issue-140-file1.vcf | 35 +++++++++++++++++++++++++++++++++++ vcf/test/issue-140-file2.vcf | 34 ++++++++++++++++++++++++++++++++++ vcf/test/issue-140-file3.vcf 
| 25 +++++++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 vcf/test/issue-140-file1.vcf create mode 100644 vcf/test/issue-140-file2.vcf create mode 100644 vcf/test/issue-140-file3.vcf diff --git a/vcf/test/issue-140-file1.vcf b/vcf/test/issue-140-file1.vcf new file mode 100644 index 0000000..8ee2de2 --- /dev/null +++ b/vcf/test/issue-140-file1.vcf @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr1 10 . G GGT . PASS DP=91;SS=1;SSC=2;GPV=3.0109E-23;SPV=5.8324E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:13:22:62.86%:2,11,1,21 +chr1 20 . GT G . PASS DP=77;SS=1;SSC=2;GPV=2.4504E-29;SPV=6.0772E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:28:5:22:81.48%:0,5,1,21 +chr2 30 . AC A . PASS DP=22;SS=1;SSC=7;GPV=1.3117E-10;SPV=1.9481E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:10:2:8:80%:0,2,0,8 +chr2 40 . AAAC A . PASS DP=42;SS=1;SSC=12;GPV=7.3092E-18;SPV=6.278E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:4:9:69.23%:4,0,9,0 +chr3 50 . TC T . PASS DP=41;SS=1;SSC=2;GPV=9.8874E-23;SPV=5.3659E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:1:21:95.45%:1,0,15,6 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 90 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chr20 110 . GA G . PASS DP=83;SS=1;SSC=13;GPV=1E0;SPV=4.0806E-2 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:35:5:28:84.85%:4,1,12,16 +chrX 120 . 
G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 +chrX 130 . T TAA . PASS DP=19;SS=1;SSC=1;GPV=1.1285E-5;SPV=7.2172E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:8:2:5:71.43%:0,2,0,5 +chrY 140 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrY 150 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 +chrM 160 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrM 170 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 diff --git a/vcf/test/issue-140-file2.vcf b/vcf/test/issue-140-file2.vcf new file mode 100644 index 0000000..7852133 --- /dev/null +++ b/vcf/test/issue-140-file2.vcf @@ -0,0 +1,34 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr1 10 . G GGT . PASS DP=91;SS=1;SSC=2;GPV=3.0109E-23;SPV=5.8324E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:13:22:62.86%:2,11,1,21 +chr1 20 . GT G . PASS DP=77;SS=1;SSC=2;GPV=2.4504E-29;SPV=6.0772E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:28:5:22:81.48%:0,5,1,21 +chr2 30 . AC A . PASS DP=22;SS=1;SSC=7;GPV=1.3117E-10;SPV=1.9481E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:10:2:8:80%:0,2,0,8 +chr2 41 . AAAC A . PASS DP=42;SS=1;SSC=12;GPV=7.3092E-18;SPV=6.278E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:4:9:69.23%:4,0,9,0 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . 
PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 91 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chr20 110 . GA G . PASS DP=83;SS=1;SSC=13;GPV=1E0;SPV=4.0806E-2 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:35:5:28:84.85%:4,1,12,16 +chrX 120 . G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 +chrX 130 . T TAA . PASS DP=19;SS=1;SSC=1;GPV=1.1285E-5;SPV=7.2172E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:8:2:5:71.43%:0,2,0,5 +chrY 140 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrY 149 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 +chrM 160 . G GTTT . PASS DP=62;SS=1;SSC=0;GPV=3.4914E-15;SPV=9.571E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:36:2:19:90.48%:1,1,15,4 +chrM 170 . T TGAAG . PASS DP=28;SS=1;SSC=12;GPV=1.7583E-10;SPV=5.5797E-2 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:13:5:8:61.54%:4,1,2,6 diff --git a/vcf/test/issue-140-file3.vcf b/vcf/test/issue-140-file3.vcf new file mode 100644 index 0000000..754f6b6 --- /dev/null +++ b/vcf/test/issue-140-file3.vcf @@ -0,0 +1,25 @@ +##fileformat=VCFv4.1 +##source=VarScan2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL +chr3 50 . TC T . PASS DP=41;SS=1;SSC=2;GPV=9.8874E-23;SPV=5.3659E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:1:21:95.45%:1,0,15,6 +chr10 60 . T TTAA . PASS DP=27;SS=1;SSC=2;GPV=1.4382E-14;SPV=5.5556E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:12:0:12:100%:0,0,0,12 +chr10 70 . C CTG . 
PASS DP=40;SS=1;SSC=7;GPV=3.6006E-9;SPV=1.9922E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:10:6:4:40%:0,6,0,4 +chr11 80 . AGTT A . PASS DP=86;SS=1;SSC=0;GPV=4.1554E-34;SPV=8.5795E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:32:4:28:87.5%:1,3,0,28 +chr11 90 . GA G . PASS DP=41;SS=1;SSC=3;GPV=1.9197E-12;SPV=4.089E-1 GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:15:5:9:64.29%:1,4,0,9 +chr20 100 . TTTTG T . PASS DP=23;SS=1;SSC=1;GPV=2.9149E-12;SPV=6.5217E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:8:0:8:100%:0,0,7,1 +chrX 120 . G GA . PASS DP=61;SS=1;SSC=1;GPV=1.6967E-25;SPV=7.0485E-1 GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:22:3:19:86.36%:0,3,1,18 From 28dfe376695c8a2496f7cdc5e39187a85f5d5ed8 Mon Sep 17 00:00:00 2001 From: datagram Date: Fri, 7 Feb 2014 03:24:00 -0800 Subject: [PATCH 087/168] Added tests for walk_together with more complex inputs --- vcf/test/test_vcf.py | 65 ++++++++++---------------------------------- 1 file changed, 14 insertions(+), 51 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 9853ced..efa633e 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -961,13 +961,11 @@ def test_walk(self): self.assertEqual(n, 5) # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left - expected = 'llrrttrl' reader1 = vcf.Reader(fh('walk_left.vcf')) reader2 = vcf.Reader(fh('example-4.0.vcf')) for ex, recs in zip(expected, utils.walk_together(reader1, reader2)): - if ex == 'l': assert recs[0] is not None assert recs[1] is None @@ -978,55 +976,20 @@ def test_walk(self): assert recs[0] is not None assert recs[1] is not None - # case with working custom equality function - - # without custom function, most records in these files - # are different since the default equality checks - # for ALT values - - reader1 = vcf.Reader(fh('example-4.0.vcf')) - reader2 = vcf.Reader(fh('walk_refcall.vcf')) - - # counters for distinct records and overlapping records - ndist_def, nover_def = 0, 0 - for x in utils.walk_together(reader1, reader2): - assert len(x) == 
2 - if x[0] is not None and x[1] is not None: - assert (x[0] == x[1] and x[1] == x[0]) - nover_def += 1 - ndist_def += 1 - # check how many overlapping records - assert nover_def == 1 - # check how many distinct records - assert ndist_def == 8 - - # with custom function that does not check ALT, - # we see more overlaps and less distinct records - - def custom_eq(rec1, rec2): - # check for equality only on CHROM, POS, and REF - if rec1 is None or rec2 is None: - return False - return rec1.CHROM == rec2.CHROM and rec1.POS == rec2.POS and \ - rec1.REF == rec2.REF - - reader1 = vcf.Reader(fh('example-4.0.vcf')) - reader2 = vcf.Reader(fh('walk_refcall.vcf')) - - ndist_cust, nover_cust = 0, 0 - for x in utils.walk_together(reader1, reader2, eq_func=custom_eq): - self.assertEqual(len(x), 2) - # avoid assert() when one record is None - if x[0] is not None and x[1] is not None: - assert (custom_eq(x[0], x[1]) and custom_eq(x[1], x[0])) - ncomps += 1 - # still increment counter to ensure iteration is finished for all - # records - nrecs += 1 - # check number of records total - self.assertEqual(nrecs, 5) - # check how many records found in all files - self.assertEqual(ncomps, 4) + # test files with many chromosomes, set 'vcf_record_sort_key' to define chromosome order + chr_order = map(str, range(1, 30)) + ['X', 'Y', 'M'] + get_key = lambda r: (chr_order.index(r.CHROM.replace('chr','')), r.POS) + reader1 = vcf.Reader(fh('issue-140-file1.vcf')) + reader2 = vcf.Reader(fh('issue-140-file2.vcf')) + reader3 = vcf.Reader(fh('issue-140-file3.vcf')) + expected = "66642577752767662466" # each char is an integer bit flag - like file permissions + for ex, recs in zip(expected, utils.walk_together(reader1, reader2, reader3, vcf_record_sort_key = get_key)): + ex = int(ex) + for i, flag in enumerate([0x4, 0x2, 0x1]): + if ex & flag: + self.assertNotEqual(recs[i], None) + else: + self.assertEqual(recs[i], None) def test_trim(self): tests = [('TAA GAA', 'T G'), From 
734daf4037727927600476decc79d6d929f3a3d4 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Mon, 10 Feb 2014 10:55:56 +0000 Subject: [PATCH 088/168] bump version --- docs/HISTORY.rst | 5 +++++ vcf/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index 2b03818..723e841 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -17,6 +17,11 @@ New features should have test code sent with them. Changes ======= +0.6.6 Release +------------- + +* better walk together record ordering (Thanks @datagram, #141) + 0.6.5 Release ------------- diff --git a/vcf/__init__.py b/vcf/__init__.py index ec89aee..acfb87a 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -177,4 +177,4 @@ from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT -VERSION = '0.6.5' +VERSION = '0.6.6' From d1a9fdc56a9e52e798a7b00315dd5de01e6d8e99 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Fri, 21 Feb 2014 10:17:45 +0000 Subject: [PATCH 089/168] fix missing .pyx --- MANIFEST.in | 1 + docs/HISTORY.rst | 5 +++++ vcf/__init__.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..44f678a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include vcf *.pyx diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index 723e841..defff0d 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -17,6 +17,11 @@ New features should have test code sent with them. 
Changes ======= +0.6.7 Release +------------- + +* Include missing .pyx files + 0.6.6 Release ------------- diff --git a/vcf/__init__.py b/vcf/__init__.py index acfb87a..875e2d4 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -177,4 +177,4 @@ from vcf.filters import Base as Filter from vcf.parser import RESERVED_INFO, RESERVED_FORMAT -VERSION = '0.6.6' +VERSION = '0.6.7' From cbe8d906a3fd7491bbd6ce63a4f980a8e258b58d Mon Sep 17 00:00:00 2001 From: Lenna Peterson Date: Sat, 22 Feb 2014 18:40:27 -0500 Subject: [PATCH 090/168] Restore subprocess import to test --- vcf/test/test_vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 3bd788b..c7526ed 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -5,6 +5,7 @@ import commands import cPickle from StringIO import StringIO +import subprocess import vcf from vcf import utils From 097f2d0097bc7d5fb2b48a2d42514175ae72041f Mon Sep 17 00:00:00 2001 From: mgymrek Date: Thu, 6 Mar 2014 16:59:10 -0500 Subject: [PATCH 091/168] making alternate allele frequency work in the case of non-diploid alleles --- vcf/model.py | 7 ++++--- vcf/test/example-4.1-ploidy.vcf | 20 ++++++++++++++++++++ vcf/test/test_vcf.py | 7 ++++++- 3 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 vcf/test/example-4.1-ploidy.vcf diff --git a/vcf/model.py b/vcf/model.py index c6e8f42..11c29d7 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -221,12 +221,13 @@ def aaf(self): """ A list of allele frequencies of alternate alleles. NOTE: Denominator calc'ed from _called_ genotypes. 
""" - num_chroms = 2.0 * self.num_called + num_chroms = 0.0 allele_counts = Counter() for s in self.samples: if s.gt_type is not None: - allele_counts.update([s.gt_alleles[0]]) - allele_counts.update([s.gt_alleles[1]]) + for a in s.gt_alleles: + allele_counts.update([a]) + num_chroms += 1 return [allele_counts[str(i)]/num_chroms for i in range(1, len(self.ALT)+1)] @property diff --git a/vcf/test/example-4.1-ploidy.vcf b/vcf/test/example-4.1-ploidy.vcf new file mode 100644 index 0000000..464c017 --- /dev/null +++ b/vcf/test/example-4.1-ploidy.vcf @@ -0,0 +1,20 @@ +##fileformat=VCFv4.1 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +X 60034 rs186434315 T A 100 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0:48:1:51,51 1|0:48:8:51,51 1/1/1:43:5:.,. 
\ No newline at end of file diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index c7526ed..ec9ecff 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -458,8 +458,13 @@ def test_aaf(self): self.assertEqual([2.0/6.0, 4.0/6.0], aaf) if var.POS == 1230237: self.assertEqual([0.0/6.0], aaf) - elif var.POS == 1234567: + if var.POS == 1234567: self.assertEqual([2.0/4.0, 1.0/4.0], aaf) + reader = vcf.Reader(fh('example-4.1-ploidy.vcf')) + for var in reader: + aaf = var.aaf + if var.POS == 60034: + self.assertEqual([4.0/6.0], aaf) def test_pi(self): reader = vcf.Reader(fh('example-4.0.vcf')) From 9a51b243b40a415941badc4e128d629b21e53a95 Mon Sep 17 00:00:00 2001 From: mgymrek Date: Thu, 6 Mar 2014 17:02:48 -0500 Subject: [PATCH 092/168] fixing small typo in elif in test case for aaf --- vcf/test/test_vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index ec9ecff..80a901d 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -458,7 +458,7 @@ def test_aaf(self): self.assertEqual([2.0/6.0, 4.0/6.0], aaf) if var.POS == 1230237: self.assertEqual([0.0/6.0], aaf) - if var.POS == 1234567: + elif var.POS == 1234567: self.assertEqual([2.0/4.0, 1.0/4.0], aaf) reader = vcf.Reader(fh('example-4.1-ploidy.vcf')) for var in reader: From 608078a5fabd9e5e3ee1680f2306141a952609a5 Mon Sep 17 00:00:00 2001 From: mgymrek Date: Thu, 6 Mar 2014 17:11:21 -0500 Subject: [PATCH 093/168] adding one more test case for non-diploids --- vcf/test/example-4.1-ploidy.vcf | 3 ++- vcf/test/test_vcf.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/vcf/test/example-4.1-ploidy.vcf b/vcf/test/example-4.1-ploidy.vcf index 464c017..4b9f048 100644 --- a/vcf/test/example-4.1-ploidy.vcf +++ b/vcf/test/example-4.1-ploidy.vcf @@ -17,4 +17,5 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -X 60034 rs186434315 T A 100 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 
0:48:1:51,51 1|0:48:8:51,51 1/1/1:43:5:.,. \ No newline at end of file +X 60034 rs186434315 T A 100 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0:48:1:51,51 1|0:48:8:51,51 1/1/1:43:5:.,. +X 60378 rs185512268 C A 100 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0:48:1:51,51 1|0:48:8:51,51 1/1/1:43:5:.,. \ No newline at end of file diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 80a901d..947f554 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -465,6 +465,8 @@ def test_aaf(self): aaf = var.aaf if var.POS == 60034: self.assertEqual([4.0/6.0], aaf) + elif var.POS == 60387: + self.assertEqual([1.0/3.0], aaf) def test_pi(self): reader = vcf.Reader(fh('example-4.0.vcf')) From 4952f63c4866912f25a0d8f775697d1df3bb0e43 Mon Sep 17 00:00:00 2001 From: mgymrek Date: Thu, 6 Mar 2014 17:42:17 -0500 Subject: [PATCH 094/168] updating ploidy vcf example file --- vcf/test/example-4.1-ploidy.vcf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/test/example-4.1-ploidy.vcf b/vcf/test/example-4.1-ploidy.vcf index 4b9f048..6704048 100644 --- a/vcf/test/example-4.1-ploidy.vcf +++ b/vcf/test/example-4.1-ploidy.vcf @@ -18,4 +18,4 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 X 60034 rs186434315 T A 100 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0:48:1:51,51 1|0:48:8:51,51 1/1/1:43:5:.,. -X 60378 rs185512268 C A 100 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0:48:1:51,51 1|0:48:8:51,51 1/1/1:43:5:.,. \ No newline at end of file +X 60378 rs185512268 C A 100 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0:48:1:51,51 1:48:8:51,51 0:43:5:.,. \ No newline at end of file From eeb892cfbb294621a4b9ed1d7bc7caf6c3751b7f Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Mon, 12 May 2014 16:45:37 -0700 Subject: [PATCH 095/168] Marks skipped tests as skipped, not passed. 
Decorates tests that are potentially skipped, as well as broken tests that are always skipped, as being skipped, rather than indicating falsely that these tests have passed (the result of premature return statements prior to any assertions in the tests). This introduces another dependency for Python 2.6, the unittest2 module, which back-ported this functionality from Python 2.7 and Python 3. --- .travis.yml | 2 +- tox.ini | 1 + vcf/test/test_vcf.py | 22 ++++++++++++++-------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index f54f5da..00b087c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: - "3.3" - "pypy" install: - - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam argparse counter ordereddict; fi" + - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam argparse counter ordereddict unittest2; fi" - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam; fi" - python setup.py install script: python setup.py test diff --git a/tox.ini b/tox.ini index 0c07a88..d8e584d 100644 --- a/tox.ini +++ b/tox.ini @@ -18,6 +18,7 @@ deps = ordereddict cython pysam + unittest2 [testenv:py27] deps = diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index c7526ed..2ff8920 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1,5 +1,9 @@ from __future__ import print_function import unittest +try: + unittest.skip +except AttributeError: + import unittest2 as unittest import doctest import os import commands @@ -7,6 +11,11 @@ from StringIO import StringIO import subprocess +try: + import pysam +except ImportError: + pysam = None + import vcf from vcf import utils @@ -814,12 +823,9 @@ class TestTabix(unittest.TestCase): def setUp(self): self.reader = vcf.Reader(fh('tb.vcf.gz', 'rb')) - self.run = vcf.parser.pysam is 
not None - + @unittest.skipUnless(pysam, "test requires installation of PySAM.") def testFetchRange(self): - if not self.run: - return lines = list(self.reader.fetch('20', 14370, 14370)) self.assertEquals(len(lines), 1) self.assertEqual(lines[0].POS, 14370) @@ -833,9 +839,9 @@ def testFetchRange(self): lines = list(self.reader.fetch('20', 1110695, 1234567)) self.assertEquals(len(lines), 3) + + @unittest.skipUnless(pysam, "test requires installation of PySAM.") def testFetchSite(self): - if not self.run: - return site = self.reader.fetch('20', 14370) self.assertEqual(site.POS, 14370) @@ -920,9 +926,9 @@ def testSampleFilterModule(self): class TestFilter(unittest.TestCase): + @unittest.skip("test currently broken") def testApplyFilter(self): # FIXME: broken with distribute - return s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 test/example-4.0.vcf sq') #print(out) self.assertEqual(s, 0) @@ -950,9 +956,9 @@ def testApplyFilter(self): self.assertEqual(n, 2) + @unittest.skip("test currently broken") def testApplyMultipleFilters(self): # FIXME: broken with distribute - return s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 ' '--genotype-quality 50 test/example-4.0.vcf sq mgq') self.assertEqual(s, 0) From f3d6a35abbf1dfc866cbc4ad35eb999949a9102a Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Mon, 12 May 2014 17:01:55 -0700 Subject: [PATCH 096/168] Skips fragile tests broken for Python 3. 
--- vcf/test/test_vcf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 2ff8920..2eef51b 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -10,6 +10,7 @@ import cPickle from StringIO import StringIO import subprocess +import sys try: import pysam @@ -19,6 +20,8 @@ import vcf from vcf import utils +IS_PYTHON2 = sys.version_info[0] == 2 + suite = doctest.DocTestSuite(vcf) @@ -878,6 +881,7 @@ def testOpenFilenameGzipped(self): class TestSampleFilter(unittest.TestCase): + @unittest.skipUnless(IS_PYTHON2, "test broken for Python 3") def testCLIListSamples(self): proc = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = proc.communicate() @@ -886,6 +890,7 @@ def testCLIListSamples(self): expected_out = ['Samples:', '0: NA00001', '1: NA00002', '2: NA00003'] self.assertEqual(out.splitlines(), expected_out) + @unittest.skipUnless(IS_PYTHON2, "test broken for Python 3") def testCLIWithFilter(self): proc = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf -f 1,2 --quiet', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = proc.communicate() From 0e757e1ef3cccf33fda7d21b2390ecaba6eb59ac Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Mon, 12 May 2014 17:45:06 -0700 Subject: [PATCH 097/168] Skips broken test for PyPy. 
--- vcf/test/test_vcf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 2eef51b..ab73b8a 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -21,6 +21,7 @@ from vcf import utils IS_PYTHON2 = sys.version_info[0] == 2 +IS_NOT_PYPY = 'PyPy' not in sys.version suite = doctest.DocTestSuite(vcf) @@ -906,6 +907,7 @@ def testCLIWithFilter(self): rec = reader.next() self.assertEqual(len(rec.samples), 1) + @unittest.skipUnless(IS_NOT_PYPY, "test broken for PyPy") def testSampleFilterModule(self): # init filter with filename, get list of samples filt = vcf.SampleFilter('vcf/test/example-4.1.vcf') From 49be99b677e1fa72697f182a438802114f3b4b89 Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Tue, 13 May 2014 14:30:50 -0700 Subject: [PATCH 098/168] Decorate the TestTabix case rather than its tests. --- vcf/test/test_vcf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index ab73b8a..85354bb 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -822,13 +822,13 @@ def test_gt_types(self): self.assertEqual([None,1,2], gt_types) +@unittest.skipUnless(pysam, "test requires installation of PySAM.") class TestTabix(unittest.TestCase): def setUp(self): self.reader = vcf.Reader(fh('tb.vcf.gz', 'rb')) - @unittest.skipUnless(pysam, "test requires installation of PySAM.") def testFetchRange(self): lines = list(self.reader.fetch('20', 14370, 14370)) self.assertEquals(len(lines), 1) @@ -844,7 +844,6 @@ def testFetchRange(self): self.assertEquals(len(lines), 3) - @unittest.skipUnless(pysam, "test requires installation of PySAM.") def testFetchSite(self): site = self.reader.fetch('20', 14370) self.assertEqual(site.POS, 14370) From 4fba62c16e78cb158bd443a2a3857bc5a9b437ee Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Tue, 13 May 2014 23:49:12 -0700 Subject: [PATCH 099/168] Reader.fetch uses zero-based, half-open coordinates. 
These changes make the behavior of Reader.fetch consistent with pysam.Tabixfile, which uses the zero-based, half-open coordinate system for Tabixfile.fetch. See http://www.cgat.org/~andreas/documentation/pysam/api.html#pysam.Tabixfile.fetch Previously, PyVCF's Reader.fetch declared no particular coordinate system. Since the method quietly deducted 1 from the start position, apparently it assumed users were going to input a one-based coordinate there. However, users familiar with pysam's Tabixfile for other formats get an unexpected surprise when variants ahead of the start coordinate start getting returned by Reader.fetch. As _Record.start and _Record.end are in the ZBHO coordinate system, it adds to the consistency that fetch take start and end coordinates in ZBHO, so the same _Record instance could be retrieved using its .CHROM, .start, and .end coordinates. This change also removes the prior behavior of fetch of returning a single _Record instance if given only chrom and start coordinates, by implicitly doing a Tabixfile.fetch(chrom, start-1, start). The new behavior when omitting the end parameter is to return an iterator of _Record instances starting at start and continuing through the end of the chromosome chrom. Again, this is the behavior consistent with pysam.Tabixfile.fetch, and is what users ought to expect. This change also allows the user to omit both the start and end positions. In this case, an iterable of _Record instances for all records for the particular chromosome chrom will be returned, which again, is consistent with Tabixfile.fetch. This behavior also resolves Issue #123 "Cannot fetch() whole chromosome".
--- vcf/parser.py | 46 ++++++++++++++++++++++++--------------- vcf/test/test_vcf.py | 52 +++++++++++++++++++++++++++++--------------- 2 files changed, 64 insertions(+), 34 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index de39353..39d1f8a 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -165,7 +165,7 @@ def read_format(self, format_string): match.group('type'), match.group('desc')) return (match.group('id'), form) - + def read_contig(self, contig_string): '''Read a meta-contigrmation INFO line.''' match = self.contig_pattern.match(contig_string) @@ -321,7 +321,7 @@ def _parse_metainfo(self): elif line.startswith('##FORMAT'): key, val = parser.read_format(line) self.formats[key] = val - + elif line.startswith('##contig'): key, val = parser.read_contig(line) self.contigs[key] = val @@ -569,14 +569,36 @@ def next(self): return record - def fetch(self, chrom, start, end=None): - """ fetch records from a Tabix indexed VCF, requires pysam - if start and end are specified, return iterator over positions - if end not specified, return individual ``_Call`` at start or None + def fetch(self, chrom, start=None, end=None): + """ Fetches records from a tabix-indexed VCF file and returns an + iterable of ``_Record`` instances + + chrom must be specified. + + The start and end coordinates are in the zero-based, + half-open coordinate system, similar to ``_Record.start`` and + ``_Record.end``. The very first base of a chromosome is + index 0, and the region includes bases up to, but not + including the base at the end coordinate. For example + ``fetch('4', 10, 20)`` would include all variants + overlapping a 10 base pair region from the 11th base + through the 20th base (which is at index 19) of chromosome + 4. It would not include the 21st base (at index 20). See + http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms + for more information on the zero-based, half-open coordinate + system.
+ + If end is omitted, all variants from start until the end of + the chromosome chrom will be included. + + If start and end are omitted, all variants on chrom will be + returned. + + requires pysam + """ if not pysam: raise Exception('pysam not available, try "pip install pysam"?') - if not self.filename: raise Exception('Please provide a filename (or a "normal" fsock)') @@ -586,16 +608,6 @@ def fetch(self, chrom, start, end=None): if self._prepend_chr and chrom[:3] == 'chr': chrom = chrom[3:] - # not sure why tabix needs position -1 - start = start - 1 - - if end is None: - self.reader = self._tabix.fetch(chrom, start, start + 1) - try: - return self.next() - except StopIteration: - return None - self.reader = self._tabix.fetch(chrom, start, end) return self diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 85354bb..e9d9756 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -823,35 +823,53 @@ def test_gt_types(self): @unittest.skipUnless(pysam, "test requires installation of PySAM.") -class TestTabix(unittest.TestCase): +class TestFetch(unittest.TestCase): def setUp(self): self.reader = vcf.Reader(fh('tb.vcf.gz', 'rb')) - def testFetchRange(self): - lines = list(self.reader.fetch('20', 14370, 14370)) - self.assertEquals(len(lines), 1) - self.assertEqual(lines[0].POS, 14370) + def assertFetchedExpectedPositions( + self, fetched_variants, expected_positions): + fetched_positions = [var.POS for var in fetched_variants] + self.assertEqual(fetched_positions, expected_positions) + + + def testNoVariantsInRange(self): + fetched_variants = self.reader.fetch('20', 14370, 17329) + self.assertFetchedExpectedPositions(fetched_variants, []) - lines = list(self.reader.fetch('20', 14370, 17330)) - self.assertEquals(len(lines), 2) - self.assertEqual(lines[0].POS, 14370) - self.assertEqual(lines[1].POS, 17330) + def testNoVariantsForZeroLengthInterval(self): + fetched_variants = self.reader.fetch('20', 14369, 14369) + 
self.assertFetchedExpectedPositions(fetched_variants, []) + + + def testFetchRange(self): + fetched_variants = self.reader.fetch('20', 14369, 14370) + self.assertFetchedExpectedPositions(fetched_variants, [14370]) - lines = list(self.reader.fetch('20', 1110695, 1234567)) - self.assertEquals(len(lines), 3) + fetched_variants = self.reader.fetch('20', 14369, 17330) + self.assertFetchedExpectedPositions( + fetched_variants, [14370, 17330]) + fetched_variants = self.reader.fetch('20', 1110695, 1234567) + self.assertFetchedExpectedPositions( + fetched_variants, [1110696, 1230237, 1234567]) - def testFetchSite(self): - site = self.reader.fetch('20', 14370) - self.assertEqual(site.POS, 14370) - site = self.reader.fetch('20', 14369) - assert site is None + def testFetchesFromStartIfStartOnlySpecified(self): + fetched_variants = self.reader.fetch('20', 1110695) + self.assertFetchedExpectedPositions( + fetched_variants, [1110696, 1230237, 1234567]) + def testFetchesAllFromChromIfOnlyChromSpecified(self): + fetched_variants = self.reader.fetch('20') + self.assertFetchedExpectedPositions( + fetched_variants, + [14370, 17330, 1110696, 1230237, 1234567] + ) class TestOpenMethods(unittest.TestCase): @@ -1090,7 +1108,7 @@ def test_meta(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestMixedFiltering)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFetch)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) From 2e4498d882f96ba028a9e808c7ec043e28d0ce17 Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Wed, 14 May 2014 09:23:18 -0700 Subject: [PATCH 100/168] Fixes 
fetch documentation in package docstring. This corrects several lines that relate to the changes to fetch brought in by Pull Request #156. --- vcf/__init__.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/vcf/__init__.py b/vcf/__init__.py index d13ae33..149d25a 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -143,20 +143,32 @@ >>> print bnd.withinMainAssembly, bnd.orientation, bnd.remoteOrientation, bnd.connectingSequence True False True T -Random access is supported for files with tabix indexes. Simply call fetch for the -region you are interested in:: +The Reader supports retrieval of records within designated regions for +files with tabix indexes via the fetch method. Pass in a chromosome, +and, optionally, start and end coordinates, for the regions of +interest:: >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz') - >>> for record in vcf_reader.fetch('20', 1110696, 1230237): # doctest: +SKIP + >>> # fetch all records on chromosome 20 from base 1110696 through 1230237 + >>> for record in vcf_reader.fetch('20', 1110695, 1230237): # doctest: +SKIP ... print record Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T]) Record(CHROM=20, POS=1230237, REF=T, ALT=[None]) -Or extract a single row:: +Note that the start and end coordinates are in the zero-based, half-open +coordinate system, similar to ``_Record.start`` and ``_Record.end``. The +very first base of a chromosome is index 0, and the region includes +bases up to, but not including the base at the end coordinate. For +example:: - >>> print vcf_reader.fetch('20', 1110696) # doctest: +SKIP - Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T]) + >>> # fetch all records on chromosome 4 from base 11 through 20 + >>> vcf_reader.fetch('4', 10, 20) # doctest: +SKIP +would include all records overlapping a 10 base pair region from the +11th base through the 20th base (which is at index 19) of chromosome +4. It would not include the 21st base (at index 20).
(See +http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms for more +information on the zero-based, half-open coordinate system.) The ``Writer`` class provides a way of writing a VCF file. Currently, you must specify a template ``Reader`` which provides the metadata:: From 2d522b5251bec4a3e6060ddb7d7aa6cde81e3887 Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Wed, 14 May 2014 09:30:10 -0700 Subject: [PATCH 101/168] Removes setup import from distutils that overrides setuptools setup. --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 321d69a..f063a53 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ from setuptools import setup -from distutils.core import setup from distutils.extension import Extension try: From 2451c160f95ee5744d89e2ce18f4ce6556410c17 Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Thu, 15 May 2014 14:02:49 -0700 Subject: [PATCH 102/168] Tidies up Python 2.6 dependencies Moves all dependencies to the install_requires argument to setup. 
--- setup.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/setup.py b/setup.py index f063a53..e2e56c9 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ from setuptools import setup from distutils.extension import Extension +import sys try: from Cython.Distutils import build_ext @@ -7,23 +8,14 @@ except: CYTHON = False -requires = [] +IS_PYTHON26 = sys.version_info[:2] == (2, 6) -# python 2.6 does not have argparse -try: - import argparse -except ImportError: - requires.append('argparse') +DEPENDENCIES = ['setuptools'] + +if IS_PYTHON26: + DEPENDENCIES.extend(['argparse', 'counter', 'ordereddict', + 'unittest2']) -import collections -try: - collections.Counter -except AttributeError: - requires.append('counter') -try: - collections.OrderedDict -except AttributeError: - requires.append('ordereddict') # get the version without an import VERSION = "Undefined" @@ -53,8 +45,7 @@ description='Variant Call Format (VCF) parser for Python', long_description=DOC, test_suite='vcf.test.test_vcf.suite', - install_requires=['distribute'], - requires=requires, + install_requires=DEPENDENCIES, entry_points = { 'vcf.filters': [ 'site_quality = vcf.filters:SiteQuality', From 606659010bf65a3e35e3f9ee5aa14b248a56899e Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Thu, 15 May 2014 15:21:59 -0700 Subject: [PATCH 103/168] Updates PyPI trove classifiers. 
--- setup.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e2e56c9..fb4e512 100644 --- a/setup.py +++ b/setup.py @@ -62,10 +62,18 @@ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: BSD License', + 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', + 'Programming Language :: Cython', 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.6' + 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Topic :: Scientific/Engineering :: Bio-Informatics', ], keywords='bioinformatics', use_2to3=True, From 2ef6a4facd2b18a5419cc9512377d51f3d4e6360 Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Thu, 15 May 2014 18:02:52 -0700 Subject: [PATCH 104/168] Use requirements files to consolidate dependencies. 
--- .travis.yml | 3 +-- requirements/common-requirements.txt | 3 +++ requirements/pypy-requirements.txt | 1 + requirements/python2.6-requirements.txt | 5 +++++ tox.ini | 22 +++++----------------- 5 files changed, 15 insertions(+), 19 deletions(-) create mode 100644 requirements/common-requirements.txt create mode 100644 requirements/pypy-requirements.txt create mode 100644 requirements/python2.6-requirements.txt diff --git a/.travis.yml b/.travis.yml index 00b087c..1e1b142 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,6 @@ python: - "3.3" - "pypy" install: - - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam argparse counter ordereddict unittest2; fi" - - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam; fi" + - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install -r requirements/python2.6-requirements.txt; elif [[ $TRAVIS_PYTHON_VERSION == 'pypy' ]]; then pip install -r requirements/pypy-requirements.txt; else pip install -r requirements/common-requirements.txt; fi" - python setup.py install script: python setup.py test diff --git a/requirements/common-requirements.txt b/requirements/common-requirements.txt new file mode 100644 index 0000000..ea364d9 --- /dev/null +++ b/requirements/common-requirements.txt @@ -0,0 +1,3 @@ +cython +pysam +setuptools diff --git a/requirements/pypy-requirements.txt b/requirements/pypy-requirements.txt new file mode 100644 index 0000000..49fe098 --- /dev/null +++ b/requirements/pypy-requirements.txt @@ -0,0 +1 @@ +setuptools diff --git a/requirements/python2.6-requirements.txt b/requirements/python2.6-requirements.txt new file mode 100644 index 0000000..27c9bc2 --- /dev/null +++ b/requirements/python2.6-requirements.txt @@ -0,0 +1,5 @@ +-r common-requirements.txt +argparse +counter +ordereddict +unittest2 diff --git a/tox.ini b/tox.ini index d8e584d..953a9dc 100644 --- a/tox.ini +++ 
b/tox.ini @@ -7,28 +7,16 @@ envlist = py26, py27, py32, py33 [testenv] +deps = + -rrequirements/common-requirements.txt commands = rm -rf {toxinidir}/build python setup.py test [testenv:py26] deps = - argparse - counter - ordereddict - cython - pysam - unittest2 - -[testenv:py27] -deps = - pysam - cython - -[testenv:py32] -deps = - cython + -rrequirements/python2.6-requirements.txt -[testenv:py33] +[testenv:pypy] deps = - cython + -rrequirements/pypy-requirements.txt From 47acb567ddd51e29fb64feddf641593191f2222b Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Sun, 18 May 2014 22:51:51 -0700 Subject: [PATCH 105/168] Adds _Record.affected_start and .affected_end. These coordinates should represent the zero-based, half-open region of the reference sequence affected by all the events included in ALT. These coordinates allow the user to identify precisely which bases are altered by the events in the record. Provides more thorough documentation on the coordinate schemes for _Record.POS, .start, and .end. --- vcf/model.py | 94 ++++++++++++++++- vcf/test/test_vcf.py | 235 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 326 insertions(+), 3 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index c6e8f42..f0c8a97 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -125,10 +125,45 @@ class _Record(object): INFO and FORMAT are available as properties. The list of genotype calls is in the ``samples`` property. + + Regarding the coordinates associated with each instance: + + - ``POS``, per VCF specification, is the one-based index + (the first base of the contig has an index of 1) of the first + base of the ``REF`` sequence. + - The ``start`` and ``end`` denote the coordinates of the entire + ``REF`` sequence in the zero-based, half-open coordinate + system (see + http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms), + where the first base of the contig has an index of 0, and the + interval runs up to, but does not include, the base at the + ``end`` index. 
This indexing scheme is analogous to Python + slice notation. + - The ``affected_start`` and ``affected_end`` coordinates are + also in the zero-based, half-open coordinate system. These + coordinates indicate the precise region of the reference + genome actually affected by the events denoted in ``ALT`` + (i.e., the minimum ``affected_start`` and maximum + ``affected_end``). + + - For SNPs and structural variants, the affected region + includes all bases of ``REF``, including the first base + (i.e., ``affected_start = start = POS - 1``). + - For deletions, the region includes all bases of ``REF`` + except the first base, which flanks upstream the actual + deletion event, per VCF specification. + - For insertions, the ``affected_start`` and ``affected_end`` + coordinates represent a 0 bp-length region between the two + flanking bases (i.e., ``affected_start`` = + ``affected_end``). This is analogous to Python slice + notation (see http://stackoverflow.com/a/2947881/38140). + Neither the upstream nor downstream flanking bases are + included in the region. """ def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes, samples=None): self.CHROM = CHROM + #: the one-based coordinate of the first nucleotide in ``REF`` self.POS = POS self.ID = ID self.REF = REF @@ -137,9 +172,9 @@ def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, self.FILTER = FILTER self.INFO = INFO self.FORMAT = FORMAT - #: 0-based start coordinate + #: zero-based, half-open start coordinate of ``REF`` self.start = self.POS - 1 - #: 1-based end coordinate + #: zero-based, half-open end coordinate of ``REF`` self.end = self.start + len(self.REF) #: list of alleles.
[0] = REF, [1:] = ALTS self.alleles = [self.REF] @@ -148,6 +183,61 @@ def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, self.samples = samples or [] self._sample_indexes = sample_indexes + # Setting affected_start and affected_end here for Sphinx + # autodoc purposes... + #: zero-based, half-open start coordinate of affected region of reference genome + self.affected_start = None + #: zero-based, half-open end coordinate of affected region of reference genome (not included in the region) + self.affected_end = None + self._set_start_and_end() + + + def _set_start_and_end(self): + self.affected_start = self.affected_end = self.POS + for alt in self.ALT: + if alt is None: + start, end = self._compute_coordinates_for_none_alt() + elif alt.type == 'SNV': + start, end = self._compute_coordinates_for_snp() + elif alt.type == 'MNV': + start, end = self._compute_coordinates_for_indel() + else: + start, end = self._compute_coordinates_for_sv() + self.affected_start = min(self.affected_start, start) + self.affected_end = max(self.affected_end, end) + + + def _compute_coordinates_for_none_alt(self): + start = self.POS - 1 + end = start + len(self.REF) + return (start, end) + + + def _compute_coordinates_for_snp(self): + if len(self.REF) > 1: + start = self.POS + end = start + (len(self.REF) - 1) + else: + start = self.POS - 1 + end = self.POS + return (start, end) + + + def _compute_coordinates_for_indel(self): + if len(self.REF) > 1: + start = self.POS + end = start + (len(self.REF) - 1) + else: + start = end = self.POS + return (start, end) + + + def _compute_coordinates_for_sv(self): + start = self.POS - 1 + end = start + len(self.REF) + return (start, end) + + # For Python 2 def __cmp__(self, other): return cmp((self.CHROM, self.POS), (getattr(other, "CHROM", None), getattr(other, "POS", None))) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index e9d9756..66a5834 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -18,7 +18,7 
@@ pysam = None import vcf -from vcf import utils +from vcf import model, utils IS_PYTHON2 = sys.version_info[0] == 2 IS_NOT_PYPY = 'PyPy' not in sys.version @@ -765,6 +765,239 @@ def test_pickle(self): self.assertEqual(cPickle.loads(cPickle.dumps(var)), var) + def assert_has_expected_coordinates( + self, + record, + expected_coordinates, + expected_affected_coordinates + ): + self.assertEqual( + (record.start, record.end), + expected_coordinates + ) + self.assertEqual( + (record.affected_start, record.affected_end), + expected_affected_coordinates + ) + + + def test_coordinates_for_snp(self): + record = model._Record( + '1', + 10, + 'id1', + 'C', + [model._Substitution('A')], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 10), (9, 10)) + + + def test_coordinates_for_insertion(self): + record = model._Record( + '1', + 10, + 'id2', + 'C', + [model._Substitution('CTA')], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 10), (10, 10)) + + + def test_coordinates_for_deletion(self): + record = model._Record( + '1', + 10, + 'id3', + 'CTA', + [model._Substitution('C')], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 12), (10, 12)) + + + def test_coordinates_for_None_alt(self): + record = model._Record( + '1', + 10, + 'id4', + 'C', + [None], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 10), (9, 10)) + + + def test_coordinates_for_multiple_snps(self): + record = model._Record( + '1', + 10, + 'id5', + 'C', + [ + model._Substitution('A'), + model._Substitution('G'), + model._Substitution('T') + ], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 10), (9, 10)) + + + def test_coordinates_for_insert_and_snp(self): + record = model._Record( + '1', + 10, + 'id6', + 'C', + [ + model._Substitution('GTA'), + model._Substitution('G'), + 
], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 10), (9, 10)) + record = model._Record( + '1', + 10, + 'id7', + 'C', + [ + model._Substitution('G'), + model._Substitution('GTA'), + ], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 10), (9, 10)) + + + def test_coordinates_for_snp_and_deletion(self): + record = model._Record( + '1', + 10, + 'id8', + 'CTA', + [ + model._Substitution('C'), + model._Substitution('CTG'), + ], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 12), (10, 12)) + record = model._Record( + '1', + 10, + 'id9', + 'CTA', + [ + model._Substitution('CTG'), + model._Substitution('C'), + ], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 12), (10, 12)) + + + def test_coordinates_for_insertion_and_deletion(self): + record = model._Record( + '1', + 10, + 'id10', + 'CT', + [ + model._Substitution('CA'), + model._Substitution('CTT'), + ], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 11), (10, 11)) + record = model._Record( + '1', + 10, + 'id11', + 'CT', + [ + model._Substitution('CTT'), + model._Substitution('CA'), + ], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 11), (10, 11)) + + + def test_coordinates_for_breakend(self): + record = model._Record( + '1', + 10, + 'id12', + 'CTA', + [model._Breakend('1', 500, False, True, 'GGTC', True)], + None, + None, + {}, + None, + {}, + None + ) + self.assert_has_expected_coordinates(record, (9, 12), (9, 12)) + + class TestCall(unittest.TestCase): def test_dunder_eq(self): From 2f0d57706e67c7d2b13d1e527cae2201018ffc14 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Wed, 25 Jun 2014 22:41:27 +0200 Subject: [PATCH 106/168] Allow flag INFO field to be declared as string As reported in #164, we previously 
crashed on flag INFO fields declared as strings (and the number of values declared as 1). This is indeed not according to spec, but we should probably allow it anyway. --- vcf/parser.py | 1 + vcf/test/string_as_flag.vcf | 8 ++++++++ vcf/test/test_vcf.py | 11 +++++++++++ 3 files changed, 20 insertions(+) create mode 100644 vcf/test/string_as_flag.vcf diff --git a/vcf/parser.py b/vcf/parser.py index 39d1f8a..244e8f5 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -390,6 +390,7 @@ def _parse_info(self, info_str): vals = entry[1].split(',') # commas are reserved characters indicating multiple values val = self._map(str, vals) except IndexError: + entry_type = 'Flag' val = True try: diff --git a/vcf/test/string_as_flag.vcf b/vcf/test/string_as_flag.vcf new file mode 100644 index 0000000..afa3b0d --- /dev/null +++ b/vcf/test/string_as_flag.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test +chr2 21 . A G . . GT . +chr2 24 . G T . . AB GT . +chr2 48 . C T . . CD GT . +chr2 75 . T C . . AB;CD GT . 
diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index b096e47..2017653 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -262,6 +262,16 @@ def test_contig_line(self): self.assertEqual(reader.contigs['1'].length, 249250621) +class TestStringAsFlag(unittest.TestCase): + + def test_string_as_flag(self): + """A flag INFO field is declared as string (not allowed by the spec, + but seen in practice).""" + reader = vcf.Reader(fh('string_as_flag.vcf', 'r')) + for _ in reader: + pass + + class TestInfoOrder(unittest.TestCase): def _assert_order(self, definitions, fields): @@ -1339,6 +1349,7 @@ def test_meta(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kgSites)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGoNL)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStringAsFlag)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestInfoOrder)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestInfoTypeCharacter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutputWriter)) From d927381018650767f63242ac86aa31294a7c8939 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Wed, 25 Jun 2014 22:45:31 +0200 Subject: [PATCH 107/168] Don't crash when FORMAT is set to the missing value (.) It is not valid according to the spec, but issue #164 shows a VCF file where the FORMAT column contains just a dot character. We have no way of interpreting the subsequent genotype columns in that case, so this patch ignores them. 
--- vcf/parser.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vcf/parser.py b/vcf/parser.py index 39d1f8a..99baf4e 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -559,6 +559,9 @@ def next(self): fmt = row[8] except IndexError: fmt = None + else: + if fmt == '.': + fmt = None record = _Record(chrom, pos, ID, ref, alt, qual, filt, info, fmt, self._sample_indexes) From e7d350b23bd1cff87cce768af44f296d3f868108 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Sun, 6 Jul 2014 18:32:15 +0200 Subject: [PATCH 108/168] Don't crash on metadata lines without value The spec actually does not allow for metadata lines without value, but we shouldn't crash on them. Fixes #168 --- vcf/parser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index d957e31..9e4f739 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -220,9 +220,13 @@ def read_meta_hash(self, meta_string): def read_meta(self, meta_string): if re.match("##.+=<", meta_string): return self.read_meta_hash(meta_string) - else: - match = self.meta_pattern.match(meta_string) - return match.group('key'), match.group('val') + match = self.meta_pattern.match(meta_string) + if not match: + # Spec only allows key=value, but we try to be liberal and + # interpret anything else as key=none (and all values are parsed + # as strings). + return meta_string.lstrip('#'), 'none' + return match.group('key'), match.group('val') class Reader(object): From c8f3f8d5fe9bdee2049e45bc09d01037e30d1aee Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 9 Sep 2014 17:23:01 +0200 Subject: [PATCH 109/168] Temporarily fix pysam on 0.7.8 (0.8.0 fails on Python 3) Before we figure out what causes this, let's have a working test suite by fixing pysam on the latest working release. 
Traceback: Traceback (most recent call last): File "/home/travis/build/jamescasbon/PyVCF/build/lib.linux-x86_64-3.3/vcf/test/test_vcf.py", line 1109, in testNoVariantsInRange fetched_variants = self.reader.fetch('20', 14370, 17329) File "/home/travis/build/jamescasbon/PyVCF/build/lib.linux-x86_64-3.3/vcf/parser.py", line 623, in fetch self.reader = self._tabix.fetch(chrom, start, end) File "ctabix.pyx", line 345, in pysam.ctabix.Tabixfile.fetch (pysam/ctabix.c:4241) TypeError: expected bytes, str found See #175 --- requirements/common-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common-requirements.txt b/requirements/common-requirements.txt index ea364d9..7bbf965 100644 --- a/requirements/common-requirements.txt +++ b/requirements/common-requirements.txt @@ -1,3 +1,3 @@ cython -pysam +pysam==0.7.8 setuptools From eafd842064d5f9ddbfc5fa1c7fa4bd3761feafdf Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 9 Sep 2014 14:02:36 +0200 Subject: [PATCH 110/168] Partial support for VCFv4.2 - Add R as an INFO field count (number of alleles including reference). - Support the optional Source and Version keys on INFO metainformation. Thanks a lot @travc for contributing these fixes!
See #172 --- vcf/parser.py | 14 ++++++---- vcf/test/example-4.2.vcf | 56 ++++++++++++++++++++++++++++++++++++++++ vcf/test/test_vcf.py | 20 ++++++++++++++ 3 files changed, 85 insertions(+), 5 deletions(-) create mode 100644 vcf/test/example-4.2.vcf diff --git a/vcf/parser.py b/vcf/parser.py index 9e4f739..6d668af 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -63,12 +63,13 @@ # Conversion between value in file and Python value field_counts = { '.': None, # Unknown number of values - 'A': -1, # Equal to the number of alleles in a given record + 'A': -1, # Equal to the number of alternate alleles in a given record 'G': -2, # Equal to the number of genotypes in a given record + 'R': -3, # Equal to the number of alleles including reference in a given record } -_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc']) +_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version']) _Filter = collections.namedtuple('Filter', ['id', 'desc']) _Alt = collections.namedtuple('Alt', ['id', 'desc']) _Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc']) @@ -82,9 +83,11 @@ def __init__(self): super(_vcf_metadata_parser, self).__init__() self.info_pattern = re.compile(r'''\#\#INFO=< ID=(?P[^,]+), - Number=(?P-?\d+|\.|[AG]), + Number=(?P-?\d+|\.|[AGR]), Type=(?PInteger|Float|Flag|Character|String), Description="(?P[^"]*)" + (?:,Source="(?P[^"]*)")? + (?:,Version="?(?P[^"]*)"?)? 
>''', re.VERBOSE) self.filter_pattern = re.compile(r'''\#\#FILTER=< ID=(?P[^,]+), @@ -96,7 +99,7 @@ def __init__(self): >''', re.VERBOSE) self.format_pattern = re.compile(r'''\#\#FORMAT=< ID=(?P.+), - Number=(?P-?\d+|\.|[AG]), + Number=(?P-?\d+|\.|[AGR]), Type=(?P.+), Description="(?P.*)" >''', re.VERBOSE) @@ -126,7 +129,8 @@ def read_info(self, info_string): num = self.vcf_field_count(match.group('number')) info = _Info(match.group('id'), num, - match.group('type'), match.group('desc')) + match.group('type'), match.group('desc'), + match.group('source'), match.group('version')) return (match.group('id'), info) diff --git a/vcf/test/example-4.2.vcf b/vcf/test/example-4.2.vcf new file mode 100644 index 0000000..d649fc3 --- /dev/null +++ b/vcf/test/example-4.2.vcf @@ -0,0 +1,56 @@ +##fileformat=VCFv4.2 +##FILTER= +##samtoolsVersion=1.0-17-gfaf4dd6+htslib-1.0-11-g830ea73 +##samtoolsCommand=samtools mpileup -u -t DP,DPR,DV,DP4,INFO/DPR,SP -f /data/archive/reference/Anopheles-arabiensis-Dongola_SCAFFOLDS_AaraD1.fa -r KB704451:0004153102-0004172483 huge_list_of_bam_files_removed +##reference=file:///data/archive/reference/Anopheles-arabiensis-Dongola_SCAFFOLDS_AaraD1.fa +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##bcftools_callVersion=1.0-55-gc661821+htslib-1.0-11-g830ea73 +##bcftools_callCommand=call -m -vM -f GQ,GP +##SnpSiftVersion="SnpSift 3.6c (build 2014-05-20), by Pablo Cingolani" +##SnpSiftCmd="SnpSift varType - " +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT LUPI059 MINE001 OKJ042 LUPI001 LUPI007 LUPI024 LUPI056 LUPI071 LUPI074 LUPI082 MINE040 MINE100 MINE101 MINE105 MINE111 OKJ017 OKJ045 OKJ070 SAGA066 SAGA107 SAGA131 SAGA133 SAGA134 SAGA141 
2012L_LUPI_002 2012L_LUPI_015 2012L_LUPI_017 2012L_LUPI_018 2012L_LUPI_035 2012L_LUPI_062 2012L_LUPI_065 2012L_LUPI_077 2012L_LUPI_083 2012L_LUPI_116 2012L_LUPI_013 2012L_LUPI_041 2012L_LUPI_068 2012L_LUPI_096 2012L_LUPI_098 2012L_LUPI_101 2012L_LUPI_103 2012_LUPI_156 2012_LUPI_157 2012_LUPI_161 2012_LUPI_171 2012_LUPI_173 2012_LUPI_180 2012L_LUPI_010 2012L_LUPI_012 2012L_LUPI_021 2012L_LUPI_045 2012L_LUPI_047 2012L_LUPI_060 2012L_LUPI_061 2012L_LUPI_067 2012_LUPI_125 2012_LUPI_129 2012_LUPI_146 2012_LUPI_178 2012_LUPI_211 2012_LUPI_277 2012_LUPI_278 2012_LUPI_279 2012_LUPI_284 +KB704451 4157846 . N A,C 167.0 . DP=10;VDB=1.17174e-06;SGB=1.26353;MQ0F=0;DPR=0,6,4;AC=10,4;AN=14;DP4=0,0,10,0;MQ=60;SNP;VARTYPE=SNP,SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 1/2:74,23,14,57,0,54:4:4:0:0,0,4,0:0,3,1:144,56,16,90,0,57:16 1/2:26,26,26,3,3,0:1:1:0:0,0,1,0:0,0,1:95,58,28,36,2,3:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/1:26,3,0,26,3,26:1:1:0:0,0,1,0:0,1,0:96,36,2,60,3,29:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/1:26,3,0,26,3,26:1:1:0:0,0,1,0:0,1,0:96,36,2,60,3,29:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/2:26,26,26,3,3,0:1:1:0:0,0,1,0:0,0,1:95,58,28,36,2,3:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 
./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/1:26,3,0,26,3,26:1:1:0:0,0,1,0:0,1,0:96,36,2,60,3,29:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/2:26,26,26,3,3,0:1:1:0:0,0,1,0:0,0,1:95,58,28,36,2,3:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 
./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 +KB704451 4157870 . T C 275.0 . DP=243;VDB=0.00023935;SGB=29.4468;RPB=0.0368658;MQB=0.979612;MQSB=0.268441;BQB=0.99223;MQ0F=0;DPR=213,19;ICB=0.85092;HOB=0.0287274;AC=6;AN=118;DP4=201,12,19,1;MQ=53;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/0:0,66,255:22:0:0:20,2,0,0:22,0:0,75,279:75 0/0:0,12,120:4:0:0:4,0,0,0:4,0:0,21,144:21 1/1:193,36,0:12:12:0:0,0,12,0:0,12:168,20,0:20 0/0:0,9,95:3:0:0:3,0,0,0:3,0:0,18,119:18 0/1:78,0,110:7:3:0:3,1,3,0:4,3:68,0,125:68 0/0:0,3,40:1:0:0:1,0,0,0:1,0:0,12,64:12 0/0:0,6,72:2:0:0:2,0,0,0:2,0:0,15,96:15 0/0:0,9,90:3:0:0:3,0,0,0:3,0:0,18,114:18 0/0:0,12,122:4:0:0:4,0,0,0:4,0:0,21,146:21 0/0:0,9,97:3:0:0:3,0,0,0:3,0:0,18,121:18 0/0:0,15,122:5:0:0:5,0,0,0:5,0:0,24,146:24 0/0:0,6,71:2:0:0:2,0,0,0:2,0:0,15,95:15 0/0:0,6,58:2:0:0:2,0,0,0:2,0:0,15,82:15 0/0:0,18,155:6:0:0:6,0,0,0:6,0:0,27,179:27 0/0:0,3,39:1:0:0:1,0,0,0:1,0:0,12,63:12 0/1:35,3,0:1:1:0:0,0,1,0:0,1:23,0,12:12 0/0:0,9,87:3:0:0:3,0,0,0:3,0:0,18,111:18 0/1:47,0,104:6:2:0:4,0,2,0:4,2:37,0,119:37 0/0:0,21,160:7:0:0:7,0,0,0:7,0:0,30,184:30 0/0:0,6,35:2:0:0:2,0,0,0:2,0:0,15,59:15 0/0:0,12,98:4:0:0:4,0,0,0:4,0:0,21,122:21 0/0:0,6,70:2:0:0:2,0,0,0:2,0:0,15,94:15 0/0:0,6,66:2:0:0:2,0,0,0:2,0:0,15,90:15 0/0:0,12,122:4:0:0:4,0,0,0:4,0:0,21,146:21 0/0:0,3,29:1:0:0:0,1,0,0:1,0:0,12,53:12 0/0:0,6,72:2:0:0:2,0,0,0:2,0:0,15,96:15 0/0:0,9,76:3:0:0:3,0,0,0:3,0:0,18,100:18 0/0:0,15,136:5:0:0:5,0,0,0:5,0:0,24,160:24 0/0:0,30,182:10:0:0:10,0,0,0:10,0:0,39,206:39 0/0:0,6,66:2:0:0:2,0,0,0:2,0:0,15,90:15 0/0:0,6,69:2:0:0:2,0,0,0:2,0:0,15,93:15 0/0:0,27,152:9:0:0:9,0,0,0:9,0:0,36,176:36 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,94:2:0:0:2,0,0,0:2,0:0,15,118:15 0/0:0,21,195:7:0:0:5,2,0,0:7,0:0,30,219:30 0/0:0,9,92:3:0:0:2,1,0,0:3,0:0,18,116:18 0/1:33,0,18:2:1:0:0,1,1,0:1,1:23,0,33:23 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,12,59:12 0/0:0,9,91:3:0:0:3,0,0,0:3,0:0,18,115:18 0/0:0,3,36:1:0:0:1,0,0,0:1,0:0,12,60:12 
0/0:0,30,212:10:0:0:9,1,0,0:10,0:0,39,236:39 0/0:0,9,89:3:0:0:3,0,0,0:3,0:0,18,113:18 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,21,195:7:0:0:7,0,0,0:7,0:0,30,219:30 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,97:3:0:0:2,1,0,0:3,0:0,18,121:18 0/0:0,9,93:3:0:0:3,0,0,0:3,0:0,18,117:18 0/0:0,9,116:3:0:0:3,0,0,0:3,0:0,18,140:18 0/0:0,6,71:2:0:0:1,1,0,0:2,0:0,15,95:15 0/0:0,9,89:3:0:0:3,0,0,0:3,0:0,18,113:18 0/0:0,33,175:11:0:0:11,0,0,0:11,0:0,42,199:42 0/0:0,6,63:2:0:0:2,0,0,0:2,0:0,15,87:15 0/0:0,21,145:7:0:0:7,0,0,0:7,0:0,30,169:30 0/0:0,3,39:1:0:0:0,1,0,0:1,0:0,12,63:12 0/0:0,9,84:3:0:0:3,0,0,0:3,0:0,18,108:18 0/0:0,3,13:1:0:0:1,0,0,0:1,0:0,12,37:12 0/0:0,3,23:1:0:0:1,0,0,0:1,0:0,12,47:12 0/0:0,12,106:4:0:0:4,0,0,0:4,0:0,21,130:21 0/0:0,3,36:1:0:0:1,0,0,0:1,0:0,12,60:12 0/0:0,9,94:3:0:0:3,0,0,0:3,0:0,18,118:18 0/0:0,6,67:2:0:0:2,0,0,0:2,0:0,15,91:15 0/0:2,5,27:2:1:0:1,0,0,1:1,0:0,12,49:12 +KB704451 4157877 . G A 999.0 . DP=250;VDB=6.58963e-09;SGB=31.659;RPB=0.0227135;MQB=0.410318;MQSB=0.139343;BQB=0.0767891;MQ0F=0;DPR=188,48;ICB=0.990841;HOB=0.00761276;AC=17;AN=118;DP4=176,12,45,3;MQ=55;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/1:159,0,202:22:9:0:12,1,8,1:13,9:154,0,212:127 0/0:0,12,120:4:0:0:4,0,0,0:4,0:0,16,134:16 0/0:0,51,207:17:0:0:17,0,0,0:17,0:0,55,221:55 0/0:0,9,98:3:0:0:3,0,0,0:3,0:0,13,112:13 0/1:123,0,61:8:5:0:3,0,4,1:3,5:118,0,71:71 0/0:0,3,38:1:0:0:0,1,0,0:1,0:0,7,52:7 0/1:68,0,29:3:2:0:1,0,1,1:1,2:63,0,39:39 0/0:0,6,69:2:0:0:2,0,0,0:2,0:0,10,83:10 0/0:0,12,119:4:0:0:4,0,0,0:4,0:0,16,133:16 0/1:34,0,34:2:1:0:1,0,1,0:1,1:29,0,44:29 0/1:24,0,99:5:1:0:4,0,1,0:4,1:19,0,109:19 0/1:34,0,28:2:1:0:1,0,1,0:1,1:29,0,38:29 0/0:0,6,58:2:0:0:2,0,0,0:2,0:0,10,72:10 0/1:122,0,57:7:4:0:3,0,4,0:3,4:117,0,67:67 0/0:0,3,41:1:0:0:1,0,0,0:1,0:0,7,55:7 0/0:0,3,29:1:0:0:1,0,0,0:1,0:0,7,43:7 0/0:0,12,105:4:0:0:4,0,0,0:4,0:0,16,119:16 0/0:0,18,144:6:0:0:6,0,0,0:6,0:0,22,158:22 0/1:118,0,63:8:5:0:3,0,5,0:3,5:113,0,73:73 
0/0:0,6,34:2:0:0:2,0,0,0:2,0:0,10,48:10 0/0:0,15,131:5:0:0:5,0,0,0:5,0:0,19,145:19 0/0:0,6,72:2:0:0:2,0,0,0:2,0:0,10,86:10 0/0:0,6,89:2:0:0:2,0,0,0:2,0:0,10,103:10 1/1:124,12,0:4:4:0:0,0,4,0:0,4:112,4,2:4 0/0:0,3,34:1:0:0:0,1,0,0:1,0:0,7,48:7 0/0:0,6,73:2:0:0:2,0,0,0:2,0:0,10,87:10 0/0:0,9,91:3:0:0:3,0,0,0:3,0:0,13,105:13 0/0:0,15,138:5:0:0:5,0,0,0:5,0:0,19,152:19 0/0:0,30,179:10:0:0:10,0,0,0:10,0:0,34,193:34 0/0:0,6,65:2:0:0:2,0,0,0:2,0:0,10,79:10 0/0:0,6,70:2:0:0:2,0,0,0:2,0:0,10,84:10 0/0:0,27,155:9:0:0:9,0,0,0:9,0:0,31,169:31 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,94:2:0:0:2,0,0,0:2,0:0,10,108:10 0/0:0,15,161:5:0:0:3,2,0,0:5,0:0,19,175:19 0/0:0,6,72:2:0:0:1,1,0,0:2,0:0,10,86:10 0/0:0,6,65:2:0:0:1,1,0,0:2,0:0,10,79:10 0/1:36,3,0:1:1:0:0,0,1,0:0,1:29,0,7:7 0/0:0,9,93:3:0:0:3,0,0,0:3,0:0,13,107:13 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,7,48:7 0/1:87,0,137:10:4:0:5,1,4,0:6,4:82,0,147:82 0/1:57,0,26:3:2:0:1,0,2,0:1,2:52,0,36:35 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:139,0,73:7:4:0:3,0,4,0:3,4:134,0,83:83 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,75:2:0:0:1,1,0,0:2,0:0,10,89:10 0/1:90,9,0:3:3:0:0,0,3,0:0,3:79,2,3:3 0/0:0,9,98:3:0:0:3,0,0,0:3,0:0,13,112:13 0/0:0,6,72:2:0:0:1,1,0,0:2,0:0,10,86:10 0/0:0,9,88:3:0:0:3,0,0,0:3,0:0,13,102:13 0/0:0,33,173:11:0:0:11,0,0,0:11,0:0,37,187:37 0/0:0,6,57:2:0:0:2,0,0,0:2,0:0,10,71:10 0/0:0,15,125:5:0:0:5,0,0,0:5,0:0,19,139:19 0/0:0,6,61:2:0:0:1,1,0,0:2,0:0,10,75:10 0/1:24,0,51:3:1:0:2,0,1,0:2,1:19,0,61:19 0/0:0,3,30:1:0:0:1,0,0,0:1,0:0,7,44:7 0/0:0,3,23:1:0:0:1,0,0,0:1,0:0,7,37:7 0/0:0,12,105:4:0:0:4,0,0,0:4,0:0,16,119:16 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,7,49:7 0/1:25,0,61:3:1:0:2,0,1,0:2,1:20,0,71:20 0/0:0,6,67:2:0:0:2,0,0,0:2,0:0,10,81:10 0/0:0,3,8:1:0:0:0,1,0,0:1,0:0,7,22:7 +KB704451 4157907 . A C 278.0 . 
DP=295;VDB=0.241276;SGB=26.7514;RPB=0.676983;MQB=0.997838;MQSB=0.136536;BQB=0.45683;MQ0F=0;DPR=264,15;ICB=0.00518819;HOB=0.00237812;AC=4;AN=116;DP4=233,31,14,1;MQ=59;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/0:0,90,255:30:0:0:25,5,0,0:30,0:0,101,283:101 0/0:0,30,201:10:0:0:9,1,0,0:10,0:0,41,229:41 0/1:157,0,188:18:8:0:10,0,7,1:10,8:145,0,205:127 0/1:75,0,90:5:2:0:2,1,2,0:3,2:63,0,107:63 0/0:0,30,201:10:0:0:9,1,0,0:10,0:0,41,229:41 0/0:0,6,80:2:0:0:1,1,0,0:2,0:0,17,108:17 0/0:0,12,134:4:0:0:3,1,0,0:4,0:0,23,162:23 0/0:0,3,33:1:0:0:1,0,0,0:1,0:0,14,61:14 0/0:0,21,160:7:0:0:7,0,0,0:7,0:0,32,188:32 0/0:0,12,135:4:0:0:2,2,0,0:4,0:0,23,163:23 0/0:0,15,148:5:0:0:5,0,0,0:5,0:0,26,176:26 0/0:0,9,82:3:0:0:3,0,0,0:3,0:0,20,110:20 0/1:70,0,19:4:3:0:1,0,3,0:1,3:58,0,36:36 0/0:0,24,246:8:0:0:7,1,0,0:8,0:0,35,274:35 0/0:0,18,147:6:0:0:6,0,0,0:6,0:0,29,175:29 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,82:3:0:0:3,0,0,0:3,0:0,20,110:20 0/1:59,0,62:5:2:0:3,0,2,0:3,2:47,0,79:47 0/0:0,33,192:11:0:0:11,0,0,0:11,0:0,44,220:44 0/0:0,9,94:3:0:0:3,0,0,0:3,0:0,20,122:20 0/0:0,24,198:8:0:0:7,1,0,0:8,0:0,35,226:35 0/0:0,12,120:4:0:0:4,0,0,0:4,0:0,23,148:23 0/0:0,15,165:5:0:0:3,2,0,0:5,0:0,26,193:26 0/0:0,24,172:8:0:0:8,0,0,0:8,0:0,35,200:35 0/0:0,3,31:1:0:0:0,1,0,0:1,0:0,14,59:14 0/0:0,6,64:2:0:0:2,0,0,0:2,0:0,17,92:17 0/0:0,6,66:2:0:0:2,0,0,0:2,0:0,17,94:17 0/0:0,15,118:5:0:0:5,0,0,0:5,0:0,26,146:26 0/0:0,33,178:11:0:0:11,0,0,0:11,0:0,44,206:44 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,14,63:14 0/0:0,9,74:3:0:0:2,1,0,0:3,0:0,20,102:20 0/0:0,21,168:7:0:0:5,2,0,0:7,0:0,32,196:32 0/0:0,3,40:1:0:0:1,0,0,0:1,0:0,14,68:14 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,89:2:0:0:2,0,0,0:2,0:0,17,117:17 0/0:0,12,130:4:0:0:2,2,0,0:4,0:0,23,158:23 0/0:0,9,78:3:0:0:1,2,0,0:3,0:0,20,106:20 0/0:0,6,65:2:0:0:1,1,0,0:2,0:0,17,93:17 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,14,63:14 0/0:0,6,55:2:0:0:2,0,0,0:2,0:0,17,83:17 0/0:0,3,29:1:0:0:1,0,0,0:1,0:0,14,57:14 0/0:0,36,194:12:0:0:11,1,0,0:12,0:0,47,222:47 
0/0:0,12,110:4:0:0:4,0,0,0:4,0:0,23,138:23 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,18,182:6:0:0:6,0,0,0:6,0:0,29,210:29 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,3,34:1:0:0:0,1,0,0:1,0:0,14,62:14 0/0:0,6,68:2:0:0:2,0,0,0:2,0:0,17,96:17 0/0:0,9,107:3:0:0:3,0,0,0:3,0:0,20,135:20 0/0:0,6,67:2:0:0:1,1,0,0:2,0:0,17,95:17 0/0:0,6,68:2:0:0:2,0,0,0:2,0:0,17,96:17 0/0:0,27,184:9:0:0:9,0,0,0:9,0:0,38,212:38 0/0:0,9,85:3:0:0:3,0,0,0:3,0:0,20,113:20 0/0:0,12,111:4:0:0:4,0,0,0:4,0:0,23,139:23 0/0:0,6,77:2:0:0:1,1,0,0:2,0:0,17,105:17 0/0:0,12,108:4:0:0:3,1,0,0:4,0:0,23,136:23 0/0:0,3,27:1:0:0:1,0,0,0:1,0:0,14,55:14 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,64:2:0:0:2,0,0,0:2,0:0,17,92:17 0/0:0,3,33:1:0:0:1,0,0,0:1,0:0,14,61:14 0/0:0,12,125:4:0:0:4,0,0,0:4,0:0,23,153:23 0/0:0,9,98:3:0:0:2,1,0,0:3,0:0,20,126:20 0/0:0,6,46:2:0:0:2,0,0,0:2,0:0,17,74:17 +KB704451 4157909 . T G 278.0 . DP=295;VDB=0.184881;SGB=22.7413;RPB=0.646301;MQB=0.998034;MQSB=0.200514;BQB=0.321842;MQ0F=0;DPR=247,15;ICB=0.00558284;HOB=0.00255102;AC=4;AN=112;DP4=218,29,15,1;MQ=59;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/0:0,87,255:29:0:0:24,5,0,0:29,0:0,97,282:97 0/0:0,27,183:9:0:0:9,0,0,0:9,0:0,37,210:37 0/1:156,0,167:19:8:0:11,0,7,1:11,8:145,0,183:127 0/1:75,0,107:5:2:0:2,1,2,0:3,2:64,0,123:64 0/0:0,27,191:9:0:0:8,1,0,0:9,0:0,37,218:37 0/0:0,6,80:2:0:0:1,1,0,0:2,0:0,16,107:16 0/0:0,12,119:4:0:0:3,1,0,0:4,0:0,22,146:22 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,13,61:13 0/0:0,15,126:5:0:0:5,0,0,0:5,0:0,25,153:25 0/0:0,12,132:4:0:0:2,2,0,0:4,0:0,22,159:22 0/0:0,12,133:4:0:0:4,0,0,0:4,0:0,22,160:22 0/0:0,6,67:2:0:0:2,0,0,0:2,0:0,16,94:16 0/1:79,9,0:3:3:0:0,0,3,0:0,3:60,0,8:8 0/0:0,21,230:7:0:0:6,1,0,0:7,0:0,31,257:31 0/0:0,18,144:6:0:0:6,0,0,0:6,0:0,28,171:28 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,53:2:0:0:2,0,0,0:2,0:0,16,80:16 0/1:59,0,64:5:2:0:3,0,2,0:3,2:48,0,80:48 0/0:0,33,180:11:0:0:11,0,0,0:11,0:0,43,207:43 0/0:0,12,110:4:0:0:4,0,0,0:4,0:0,22,137:22 
0/0:0,24,190:8:0:0:7,1,0,0:8,0:0,34,217:34 0/0:0,12,110:4:0:0:4,0,0,0:4,0:0,22,137:22 0/0:0,15,164:5:0:0:3,2,0,0:5,0:0,25,191:25 0/0:0,24,161:8:0:0:8,0,0,0:8,0:0,34,188:34 0/0:0,3,32:1:0:0:0,1,0,0:1,0:0,13,59:13 0/0:0,6,63:2:0:0:2,0,0,0:2,0:0,16,90:16 0/0:0,6,65:2:0:0:2,0,0,0:2,0:0,16,92:16 0/0:0,15,121:5:0:0:5,0,0,0:5,0:0,25,148:25 0/0:0,30,174:10:0:0:10,0,0,0:10,0:0,40,201:40 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,13,61:13 0/0:0,6,63:2:0:0:2,0,0,0:2,0:0,16,90:16 0/0:0,21,164:7:0:0:5,2,0,0:7,0:0,31,191:31 0/0:0,3,37:1:0:0:1,0,0,0:1,0:0,13,64:13 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,89:2:0:0:2,0,0,0:2,0:0,16,116:16 0/0:0,12,128:4:0:0:2,2,0,0:4,0:0,22,155:22 0/0:0,9,94:3:0:0:1,2,0,0:3,0:0,19,121:19 0/0:0,6,63:2:0:0:1,1,0,0:2,0:0,16,90:16 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,56:2:0:0:2,0,0,0:2,0:0,16,83:16 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,13,61:13 0/0:0,36,193:12:0:0:11,1,0,0:12,0:0,46,220:46 0/0:0,12,108:4:0:0:4,0,0,0:4,0:0,22,135:22 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,18,168:6:0:0:6,0,0,0:6,0:0,28,195:28 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,3,31:1:0:0:0,1,0,0:1,0:0,13,58:13 0/0:0,6,47:2:0:0:2,0,0,0:2,0:0,16,74:16 0/0:8,11,65:2:1:0:1,0,1,0:1,0:0,13,84:13 0/0:0,6,64:2:0:0:1,1,0,0:2,0:0,16,91:16 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,13,61:13 0/0:0,27,177:9:0:0:9,0,0,0:9,0:0,37,204:37 0/0:0,6,50:2:0:0:2,0,0,0:2,0:0,16,77:16 0/0:0,12,101:4:0:0:4,0,0,0:4,0:0,22,128:22 0/0:0,6,65:2:0:0:1,1,0,0:2,0:0,16,92:16 0/0:0,12,100:4:0:0:3,1,0,0:4,0:0,22,127:22 0/0:0,3,31:1:0:0:1,0,0,0:1,0:0,13,58:13 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,84:3:0:0:3,0,0,0:3,0:0,19,111:19 0/0:0,3,32:1:0:0:1,0,0,0:1,0:0,13,59:13 0/0:0,12,104:4:0:0:4,0,0,0:4,0:0,22,131:22 0/0:0,6,66:2:0:0:1,1,0,0:2,0:0,16,93:16 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 +KB704451 4157927 . G A 4.88727 . 
DP=334;VDB=0.38;SGB=3.29913;RPB=0.454248;MQB=0.970588;MQSB=0.546099;BQB=0.215686;MQ0F=0;DPR=306,2;ICB=0.000310486;HOB=0.000153894;AC=1;AN=114;DP4=265,41,2,0;MQ=59;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/0:0,105,255:35:0:0:31,4,0,0:35,0:0,122,295:122 0/0:0,45,255:15:0:0:14,1,0,0:15,0:0,62,295:62 0/0:0,60,255:20:0:0:17,3,0,0:20,0:0,77,295:77 0/0:0,21,242:7:0:0:6,1,0,0:7,0:0,38,282:38 0/0:0,24,207:8:0:0:6,2,0,0:8,0:0,41,247:41 0/0:0,6,71:2:0:0:2,0,0,0:2,0:0,23,112:23 0/0:0,24,215:8:0:0:6,2,0,0:8,0:0,41,255:41 0/0:0,6,70:2:0:0:1,1,0,0:2,0:0,23,111:23 0/0:0,30,191:10:0:0:10,0,0,0:10,0:0,47,231:47 0/0:0,15,163:5:0:0:3,2,0,0:5,0:0,32,203:32 0/0:0,15,151:5:0:0:5,0,0,0:5,0:0,32,191:32 0/0:0,6,70:2:0:0:2,0,0,0:2,0:0,23,111:23 0/0:0,12,102:4:0:0:4,0,0,0:4,0:0,29,142:29 0/0:0,24,255:8:0:0:6,2,0,0:8,0:0,41,295:41 0/0:0,21,189:7:0:0:7,0,0,0:7,0:0,38,229:38 0/0:0,3,35:1:0:0:0,1,0,0:1,0:0,20,76:20 0/0:0,3,40:1:0:0:1,0,0,0:1,0:0,20,81:20 0/0:0,12,126:4:0:0:3,1,0,0:4,0:0,29,166:29 0/0:0,39,255:13:0:0:12,1,0,0:13,0:0,56,295:56 0/0:0,21,206:7:0:0:6,1,0,0:7,0:0,38,246:38 0/0:0,30,238:10:0:0:8,2,0,0:10,0:0,47,278:47 0/0:0,18,145:6:0:0:6,0,0,0:6,0:0,35,185:35 0/0:0,24,244:8:0:0:6,2,0,0:8,0:0,41,284:41 0/0:0,24,195:8:0:0:7,1,0,0:8,0:0,41,235:41 0/0:0,3,27:1:0:0:0,1,0,0:1,0:0,20,68:20 0/0:0,6,62:2:0:0:2,0,0,0:2,0:0,23,103:23 0/0:0,6,64:2:0:0:2,0,0,0:2,0:0,23,105:23 0/0:0,15,123:5:0:0:5,0,0,0:5,0:0,32,163:32 0/0:0,33,184:11:0:0:11,0,0,0:11,0:0,50,224:50 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,20,76:20 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,20,76:20 0/0:0,18,165:6:0:0:4,2,0,0:6,0:0,35,205:35 0/0:0,3,38:1:0:0:1,0,0,0:1,0:0,20,79:20 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,117:3:0:0:3,0,0,0:3,0:0,26,157:26 0/0:0,12,121:4:0:0:2,2,0,0:4,0:0,29,161:29 0/0:0,9,95:3:0:0:1,2,0,0:3,0:0,26,135:26 0/0:0,3,41:1:0:0:1,0,0,0:1,0:0,20,82:20 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,60:2:0:0:2,0,0,0:2,0:0,23,101:23 0/0:0,3,25:1:0:0:1,0,0,0:1,0:0,20,66:20 0/0:0,36,213:12:0:0:11,1,0,0:12,0:0,53,253:53 
0/0:0,15,152:5:0:0:4,1,0,0:5,0:0,32,192:32 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,21,215:7:0:0:6,1,0,0:7,0:0,38,255:38 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,3,31:1:0:0:0,1,0,0:1,0:0,20,72:20 0/0:0,6,60:2:0:0:2,0,0,0:2,0:0,23,101:23 0/0:0,9,101:3:0:0:3,0,0,0:3,0:0,26,141:26 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,60:2:0:0:2,0,0,0:2,0:0,23,101:23 0/0:0,27,179:9:0:0:9,0,0,0:9,0:0,44,219:44 0/0:0,9,92:3:0:0:3,0,0,0:3,0:0,26,132:26 0/0:0,12,112:4:0:0:4,0,0,0:4,0:0,29,152:29 0/0:0,6,58:2:0:0:1,1,0,0:2,0:0,23,99:23 0/0:0,15,123:5:0:0:4,1,0,0:5,0:0,32,163:32 0/0:0,3,25:1:0:0:1,0,0,0:1,0:0,20,66:20 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,78:3:0:0:3,0,0,0:3,0:0,26,118:26 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,20,75:20 0/1:47,0,51:3:2:0:1,0,2,0:1,2:29,0,74:29 0/0:0,9,80:3:0:0:2,1,0,0:3,0:0,26,120:26 0/0:0,6,53:2:0:0:2,0,0,0:2,0:0,23,94:23 +KB704451 4157938 . ATTT ATTTT 650.0 . INDEL;IDV=18;IMF=0.428571;DP=361;VDB=0.773794;SGB=32.6744;MQSB=0.993251;MQ0F=0.00831025;DPR=115,60;ICB=0.929833;HOB=0.0258;AC=23;AN=100;DP4=98,17,48,12;MQ=59;INS;VARTYPE=INS GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/1:124,0,7:19:16:0:3,0,13,3:3,16:123,0,13:13 0/1:14,3,0:1:1:0:0,0,1,0:0,1:12,1,5:4 0/0:0,9,55:3:0:0:3,0,0,0:3,0:0,9,62:9 0/0:0,21,146:7:0:0:5,2,0,0:7,0:0,21,153:21 0/0:0,18,126:6:0:0:4,2,0,0:6,0:0,18,133:18 0/0:0,6,56:2:0:0:2,0,0,0:2,0:0,7,63:7 0/0:0,9,87:3:0:0:2,1,0,0:3,0:0,9,94:9 1/1:46,9,0:3:3:0:0,0,1,2:0,3:40,4,1:4 0/0:0,27,155:9:0:0:8,1,0,0:9,0:0,27,162:27 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,57:2:0:0:2,0,0,0:2,0:0,7,64:7 0/1:24,3,0:1:1:0:0,0,1,0:0,1:22,1,5:5 0/0:0,3,32:1:0:0:1,0,0,0:1,0:1,5,40:5 0/0:0,9,84:3:0:0:2,1,0,0:3,0:0,9,91:9 0/0:0,15,111:5:0:0:5,0,0,0:5,0:0,15,118:15 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:32,6,0:2:2:0:0,0,1,1:0,2:28,2,3:3 0/0:0,6,55:2:0:0:2,0,0,0:2,0:0,7,62:7 0/1:63,0,61:7:4:0:3,0,3,1:3,4:62,0,67:61 0/0:0,24,154:8:0:0:7,1,0,0:8,0:0,24,161:24 0/1:16,0,94:7:2:0:5,0,2,0:5,2:15,0,100:15 
0/1:61,0,101:9:4:0:4,1,3,1:5,4:60,0,107:60 0/0:0,3,31:1:0:0:1,0,0,0:1,0:1,5,39:5 0/1:13,3,0:1:1:0:0,0,0,1:0,1:11,1,5:4 0/1:16,0,47:3:1:0:2,0,1,0:2,1:15,0,53:15 0/0:0,3,32:1:0:0:1,0,0,0:1,0:1,5,40:5 0/1:46,0,57:5:2:0:3,0,2,0:3,2:45,0,63:45 0/1:50,0,12:6:5:0:1,0,5,0:1,5:49,0,18:18 0/0:0,3,30:1:0:0:1,0,0,0:1,0:1,5,38:5 0/0:0,3,4:1:0:0:1,0,0,0:1,0:1,5,12:4 0/1:18,0,98:5:1:0:2,2,1,0:4,1:17,0,104:17 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:15,0,26:2:1:0:1,0,1,0:1,1:14,0,32:14 0/1:20,0,26:2:1:0:1,0,0,1:1,1:19,0,32:19 0/0:0,6,60:2:0:0:1,1,0,0:2,0:0,7,67:7 0/0:0,3,32:1:0:0:1,0,0,0:1,0:1,5,40:5 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:33,3,0:1:1:0:0,0,1,0:0,1:31,1,5:5 0/1:82,6,0:7:6:0:1,0,5,1:1,6:78,2,3:3 0/0:0,15,126:5:0:0:4,1,0,0:5,0:0,15,133:15 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:34,6,0:2:2:0:0,0,2,0:0,2:30,2,3:3 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,50:2:0:0:1,1,0,0:2,0:0,7,57:7 0/0:0,3,28:1:0:0:1,0,0,0:1,0:1,5,36:5 0/0:0,6,57:2:0:0:2,0,0,0:2,0:0,7,64:7 0/0:0,18,124:6:0:0:5,1,0,0:6,0:0,18,131:18 0/1:53,6,0:2:2:0:0,0,2,0:0,2:49,2,3:3 0/0:0,12,96:4:0:0:4,0,0,0:4,0:0,12,103:12 0/1:25,3,0:1:1:0:0,0,1,0:0,1:23,1,5:5 1/1:61,9,0:3:3:0:0,0,2,1:0,3:55,4,1:4 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,3,31:1:0:0:0,1,0,0:1,0:1,5,39:5 0/0:0,6,56:2:0:0:2,0,0,0:2,0:0,7,63:7 0/0:0,3,29:1:0:0:1,0,0,0:1,0:1,5,37:5 0/0:0,3,32:1:0:0:1,0,0,0:1,0:1,5,40:5 0/0:0,9,87:3:0:0:2,1,0,0:3,0:0,9,94:9 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 +KB704451 4157940 . TTGTGTGTGTGTGT TTGTGTGTGTGTGTGTGT,TTTCTGTGTGTGTGTGT 999.0 . 
INDEL;IDV=7;IMF=0.5;DP=366;VDB=0.0431342;SGB=14.7456;MQSB=0.996953;MQ0F=0.010929;DPR=86,41,8;ICB=0.963728;HOB=0.02;AC=21,6;AN=90;DP4=70,16,39,10;MQ=58;INS;VARTYPE=INS,INS GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/2:60,60,60,3,3,0:1:1:0:0,0,1,0:0,0,1:54,55,61,2,5,9:4 1/1:255,18,0,255,18,255:6:6:0:0,0,3,3:0,6,0:248,11,0,252,19,263:11 0/0:0,6,62,6,62,62:2:0:0:2,0,0,0:2,0,0:1,7,70,11,71,78:6 0/1:9,0,238,27,241,255:7:1:0:4,2,1,0:6,1,0:8,0,245,31,249,270:8 0/0:0,12,185,12,185,185:4:0:0:3,1,0,0:4,0,0:0,12,192,16,193,200:11 0/0:0,6,110,6,110,110:2:0:0:2,0,0,0:2,0,0:1,7,118,11,119,126:6 0/1:1,0,158,10,161,165:4:1:0:2,1,1,0:3,1,0:3,2,167,16,171,182:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,24,255,24,255,255:8:0:0:7,1,0,0:8,0,0:0,24,262,28,263,270:23 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/1:37,0,54,40,57,94:2:1:0:1,0,1,0:1,1,0:36,0,60,44,64,108:35 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/1:117,6,0,117,6,117:2:2:0:0,0,1,1:0,2,0:113,3,3,118,10,128:2 0/0:0,18,237,18,237,237:6:0:0:5,1,0,0:6,0,0:0,18,244,22,245,252:17 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/2:120,120,120,6,6,0:2:2:0:0,0,1,1:0,0,2:111,112,119,2,6,7:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/1:255,21,0,255,21,255:7:7:0:0,0,5,2:0,7,0:247,14,0,252,22,263:13 0/2:19,31,181,0,157,154:5:1:0:3,1,1,0:4,0,1:14,27,183,0,160,164:14 0/1:120,0,255,141,255,255:10:3:0:6,1,2,1:7,3,0:119,0,261,145,262,269:119 0/0:0,15,212,15,212,212:5:0:0:5,0,0,0:5,0,0:0,15,219,19,220,227:14 0/0:0,15,243,15,243,243:5:0:0:4,1,0,0:5,0,0:0,15,250,19,251,258:14 1/1:40,9,0,40,9,40:3:3:0:0,0,3,0:0,3,0:34,4,2,39,12,50:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,6,110,6,110,110:2:0:0:2,0,0,0:2,0,0:1,7,118,11,119,126:6 0/1:54,0,54,57,57,111:2:1:0:1,0,1,0:1,1,0:53,0,60,61,64,125:51 0/0:0,9,139,9,139,139:3:0:0:3,0,0,0:3,0,0:0,10,146,14,147,154:8 1/1:243,15,0,243,15,243:5:5:0:0,0,4,1:0,5,0:236,9,0,241,16,251:8 
./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/1:55,1,0,58,3,57:2:1:0:1,0,1,0:1,1,0:54,1,7,62,11,72:5 0/0:0,9,170,9,170,170:3:0:0:1,2,0,0:3,0,0:0,10,177,14,178,185:8 0/1:60,3,0,60,3,60:1:1:0:0,0,1,0:0,1,0:58,2,5,63,9,73:4 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/1:60,3,0,60,3,60:1:1:0:0,0,0,1:0,1,0:58,2,5,63,9,73:4 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/1:60,3,0,60,3,60:1:1:0:0,0,1,0:0,1,0:58,2,5,63,9,73:4 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,15,206,15,206,206:5:0:0:4,1,0,0:5,0,0:0,15,213,19,214,221:14 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/2:134,105,96,40,0,34:5:5:0:0,0,5,0:0,3,2:125,97,95,36,0,41:35 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/1:58,6,0,58,6,58:2:2:0:0,0,2,0:0,2,0:54,3,3,59,10,69:2 0/0:0,6,102,6,102,102:2:0:0:1,1,0,0:2,0,0:1,7,110,11,111,118:6 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/1:98,0,88,104,94,191:4:2:0:1,1,2,0:2,2,0:97,0,94,108,101,205:92 0/2:35,35,35,3,3,0:1:1:0:0,0,1,0:0,0,1:29,30,36,2,5,9:4 0/0:0,6,110,6,110,110:2:0:0:2,0,0,0:2,0,0:1,7,118,11,119,126:6 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/2:45,45,45,3,3,0:1:1:0:0,0,1,0:0,0,1:39,40,46,2,5,9:4 0/0:0,3,60,3,60,60:1:0:0:0,1,0,0:1,0,0:2,5,69,9,70,77:4 0/0:0,6,110,6,110,110:2:0:0:2,0,0,0:2,0,0:1,7,118,11,119,126:6 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/0:0,6,120,6,120,120:2:0:0:1,1,0,0:2,0,0:1,7,128,11,129,136:6 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 diff --git a/vcf/test/test_vcf.py 
b/vcf/test/test_vcf.py index 2017653..bb4ce37 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -114,6 +114,26 @@ def test_vcf_4_1_bnd(self): print(c) assert c + def test_vcf_4_2(self): + reader = vcf.Reader(fh('example-4.2.vcf')) + self.assertEqual(reader.metadata['fileformat'], 'VCFv4.2') + + # If INFO contains no Source and Version keys, they should be None. + self.assertEqual(reader.infos['DP'].source, None) + self.assertEqual(reader.infos['DP'].version, None) + + # According to spec, INFO Version key is required to be double quoted, + # but at least SAMtools 1.0 does not quote it. So we want to be + # forgiving here. + self.assertEqual(reader.infos['VDB'].source, None) + self.assertEqual(reader.infos['VDB'].version, '3') + + # test we can walk the file at least + for r in reader: + for c in r: + assert c + + class TestGatkOutput(unittest.TestCase): filename = 'gatk.vcf' From f6e955ff2c47be3d41a29297734ebc7ad09848c5 Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Mon, 8 Sep 2014 23:12:36 -0700 Subject: [PATCH 111/168] Bugfix: SNP records with N as ALT now noted as SNPs. The VCF 4.0 and newer specifications say the ALT field is a comma separated list that includes "base Strings made up of the bases A,C,G,T,N". Notably, the last case was not handled by `Record.is_snp`, causing it to erroneously report `False` for records with "N" as the ALT. 
--- vcf/model.py | 2 +- vcf/test/test_vcf.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/vcf/model.py b/vcf/model.py index 68281ec..3d787ef 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -376,7 +376,7 @@ def is_snp(self): for alt in self.ALT: if alt is None or alt.type != "SNV": return False - if alt not in ['A', 'C', 'G', 'T']: + if alt not in ['A', 'C', 'G', 'T', 'N']: return False return True diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index bb4ce37..ecb0ddb 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -561,6 +561,24 @@ def test_is_snp(self): elif var.POS == 1234567: self.assertEqual(False, is_snp) + + def test_is_snp_for_n_alt(self): + record = model._Record( + '1', + 10, + 'id1', + 'C', + [model._Substitution('N')], + None, + None, + {}, + None, + {}, + None + ) + self.assertTrue(record.is_snp) + + def test_is_indel(self): reader = vcf.Reader(fh('example-4.0.vcf')) for var in reader: From 0a993e13007cc8adbaf17bd61612ccaafd4f983d Mon Sep 17 00:00:00 2001 From: Chris Lasher Date: Mon, 8 Sep 2014 23:17:51 -0700 Subject: [PATCH 112/168] Run tests for Python 3.4. --- .travis.yml | 1 + tox.ini | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1e1b142..1fdfd54 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - "2.7" - "3.2" - "3.3" + - "3.4" - "pypy" install: - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install -r requirements/python2.6-requirements.txt; elif [[ $TRAVIS_PYTHON_VERSION == 'pypy' ]]; then pip install -r requirements/pypy-requirements.txt; else pip install -r requirements/common-requirements.txt; fi" diff --git a/tox.ini b/tox.ini index 953a9dc..64a7ab4 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py32, py33 +envlist = py26, py27, py32, py33, py34 [testenv] deps = From e8a05d9beafb85c2552e934229479b179c18fafe Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Sat, 13 Sep 2014 09:26:18 +0200 Subject: [PATCH 113/168] Add Python 3.4 trove classifier --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index fb4e512..a266207 100644 --- a/setup.py +++ b/setup.py @@ -73,6 +73,7 @@ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', 'Topic :: Scientific/Engineering :: Bio-Informatics', ], keywords='bioinformatics', From 82d8288eeaa812992671f3490a5103694eaad738 Mon Sep 17 00:00:00 2001 From: awenger Date: Tue, 16 Sep 2014 17:11:46 -0700 Subject: [PATCH 114/168] Add test cases for uncalled genotypes support * Remember the ploidity of uncalled genotypes such that the sample genotypes written by PyVCF.Writer match the sample genotypes read by PyVCF.Reader. * For uncalled _Calls, gt_nums and gt_bases are None; gt_alleles is a list of "None" with a length of _Call.ploidity. --- vcf/test/test_vcf.py | 62 +++++++++++++++++++++++++++++++++ vcf/test/uncalled_genotypes.vcf | 7 ++++ 2 files changed, 69 insertions(+) create mode 100644 vcf/test/uncalled_genotypes.vcf diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index ecb0ddb..4d219d8 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1379,6 +1379,67 @@ def test_meta(self): self.assertEqual(reader.metadata['GATKCommandLine'][1]['CommandLineOptions'], '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"') + +class TestUncalledGenotypes(unittest.TestCase): + """Test the handling of uncalled (., ./.) sample genotypes.""" + + def test_read_uncalled(self): + """Test that uncalled genotypes are properly read into + gt_nums, gt_bases, ploidity, and gt_alleles properties + of _Call objects. 
For uncalled _Call objects: + + - gt_nums should be None + - gt_bases should be None + - ploidity should match the input ploidity + - gt_alleles should be a list of None's with length + matching the ploidity""" + + reader = vcf.Reader(fh('uncalled_genotypes.vcf')) + for var in reader: + gt_bases = [s.gt_bases for s in var.samples] + gt_nums = [s.gt_nums for s in var.samples] + ploidity = [s.ploidity for s in var.samples] + gt_alleles = [s.gt_alleles for s in var.samples] + + if var.POS == 14370: + self.assertEqual(['0|0', None, '1/1'], gt_nums) + self.assertEqual(['G|G', None, 'A/A'], gt_bases) + self.assertEqual([2,2,2], ploidity) + self.assertEqual([['0','0'], [None,None], ['1','1']], gt_alleles) + elif var.POS == 17330: + self.assertEqual([None, '0|1', '0/0'], gt_nums) + self.assertEqual([None, 'T|A', 'T/T'], gt_bases) + self.assertEqual([3,2,2], ploidity) + self.assertEqual([[None,None,None], ['0','1'], ['0','0']], gt_alleles) + elif var.POS == 1234567: + self.assertEqual(['0/1', '0/2', None], gt_nums) + self.assertEqual(['GTC/G', 'GTC/GTCT', None], gt_bases) + self.assertEqual([2,2,1], ploidity) + self.assertEqual([['0','1'], ['0','2'], [None]], gt_alleles) + + + def test_write_uncalled(self): + """Test that uncalled genotypes are written just as + they were read in the input file.""" + + reader = vcf.Reader(fh('uncalled_genotypes.vcf')) + + # Write all reader records to a stream. + out = StringIO() + writer = vcf.Writer(out, reader, lineterminator='\n') + for record in reader: + writer.write_record(record) + + + # Compare the written stream to the input reader line-by-line. 
+ out.seek(0) + out_lines = out.getvalue().split('\n') + in_lines = [l.rstrip('\n') for l in fh('uncalled_genotypes.vcf')] + for (in_line, out_line) in zip(in_lines, out_lines): + self.assertEqual(in_line,out_line) + + + suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFreebayesOutput)) @@ -1404,3 +1465,4 @@ def test_meta(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUtils)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGATKMeta)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUncalledGenotypes)) diff --git a/vcf/test/uncalled_genotypes.vcf b/vcf/test/uncalled_genotypes.vcf new file mode 100644 index 0000000..2032097 --- /dev/null +++ b/vcf/test/uncalled_genotypes.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT AB00001 AB00002 AB00003 +20 14370 rs6054257 G A 29 PASS NS=3 GT 0|0 ./. 1/1 +20 17330 . T A 3 q10 NS=3 GT ././. 0|1 0/0 +20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3 GT 0/1 0/2 . From 6f7b3d9ca59633293de2879af42ab32a4e5a36aa Mon Sep 17 00:00:00 2001 From: awenger Date: Tue, 16 Sep 2014 17:48:53 -0700 Subject: [PATCH 115/168] Close file handles in TestUncalledGenotypes tests Warning about open file handles muddle the output of unit tests and are a potentially confusing factor to those interpreting the tests. 
--- vcf/test/test_vcf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 4d219d8..ce26863 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1416,6 +1416,7 @@ def test_read_uncalled(self): self.assertEqual(['GTC/G', 'GTC/GTCT', None], gt_bases) self.assertEqual([2,2,1], ploidity) self.assertEqual([['0','1'], ['0','2'], [None]], gt_alleles) + reader._reader.close() def test_write_uncalled(self): @@ -1429,12 +1430,15 @@ def test_write_uncalled(self): writer = vcf.Writer(out, reader, lineterminator='\n') for record in reader: writer.write_record(record) + reader._reader.close() # Compare the written stream to the input reader line-by-line. out.seek(0) out_lines = out.getvalue().split('\n') - in_lines = [l.rstrip('\n') for l in fh('uncalled_genotypes.vcf')] + in_file = fh('uncalled_genotypes.vcf') + in_lines = [l.rstrip('\n') for l in in_file] + in_file.close() for (in_line, out_line) in zip(in_lines, out_lines): self.assertEqual(in_line,out_line) From 14e4837511600d20a38f9e49f70a19ddc1abeb76 Mon Sep 17 00:00:00 2001 From: awenger Date: Tue, 16 Sep 2014 18:43:05 -0700 Subject: [PATCH 116/168] Add support for uncalled genotypes --- vcf/cparse.pyx | 5 ++++- vcf/model.py | 26 +++++++++++++------------- vcf/parser.py | 5 ++++- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index 682e6a7..a3cb4b3 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -36,7 +36,10 @@ def parse_samples( vals = sampvals[j] # short circuit the most common - if vals == '.' 
or vals == './.': + if samp_fmt._fields[j] == 'GT': + sampdat[j] = vals + continue + elif vals == '.': sampdat[j] = None continue diff --git a/vcf/model.py b/vcf/model.py index 3d787ef..c1d5710 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -1,17 +1,19 @@ from abc import ABCMeta, abstractmethod import collections import sys +import re try: from collections import Counter except ImportError: from counter import Counter +allele_delimiter = re.compile(r'''[|/]''') # to split a genotype into alleles class _Call(object): """ A genotype call, a cell entry in a VCF file""" - __slots__ = ['site', 'sample', 'data', 'gt_nums', 'called'] + __slots__ = ['site', 'sample', 'data', 'gt_nums', 'gt_alleles', 'called', 'ploidity'] def __init__(self, site, sample, data): #: The ``_Record`` for this ``_Call`` @@ -20,14 +22,18 @@ def __init__(self, site, sample, data): self.sample = sample #: Dictionary of data from the VCF file self.data = data - try: - self.gt_nums = self.data.GT - #: True if the GT is not ./. - self.called = self.gt_nums is not None - except AttributeError: - self.gt_nums = None + + if hasattr(self.data, 'GT'): + self.gt_alleles = [(al if al != '.' 
else None) for al in allele_delimiter.split(self.data.GT)] + self.ploidity = len(self.gt_alleles) + self.called = all([al != None for al in self.gt_alleles]) + self.gt_nums = self.data.GT if self.called else None + else: #62 a call without a genotype is not defined as called or not + self.gt_alleles = None + self.ploidity = None self.called = None + self.gt_nums = None def __repr__(self): return "Call(sample=%s, %s)" % (self.sample, str(self.data)) @@ -50,12 +56,6 @@ def __setstate__(self, state): def gt_phase_char(self): return "/" if not self.phased else "|" - @property - def gt_alleles(self): - '''The numbers of the alleles called at a given sample''' - # grab the numeric alleles of the gt string; tokenize by phasing - return self.gt_nums.split(self.gt_phase_char()) - @property def gt_bases(self): '''The actual genotype alleles. diff --git a/vcf/parser.py b/vcf/parser.py index 6d668af..1284ddf 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -459,7 +459,10 @@ def _parse_samples(self, samples, samp_fmt, site): for i, vals in enumerate(sample.split(':')): # short circuit the most common - if vals == '.' or vals == './.': + if samp_fmt._fields[i] == 'GT': + sampdat[i] = vals + continue + elif vals == ".": sampdat[i] = None continue From 80a638cf0a7df39fa3765797f281428d929fbc10 Mon Sep 17 00:00:00 2001 From: awenger Date: Tue, 16 Sep 2014 18:44:09 -0700 Subject: [PATCH 117/168] Simplify _format_sample logic The sample.data.GT attribute is no longer set to None for uncalled calls, which means that _format_sample can now rely on obtaining the original sample genotype. --- vcf/parser.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 1284ddf..3c36c31 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -726,28 +726,14 @@ def order_key(field): sorted(info, key=order_key)) def _format_sample(self, fmt, sample): - try: - # Try to get the GT value first. 
- gt = getattr(sample.data, 'GT') - # PyVCF stores './.' GT values as None, so we need to revert it back - # to './.' when writing. - if gt is None: - gt = './.' - except AttributeError: - # Failing that, try to check whether 'GT' is specified in the FORMAT - # field. If yes, use the recommended empty value ('./.') - if 'GT' in fmt: - gt = './.' - # Otherwise use an empty string as the value - else: - gt = '' - # If gt is an empty string (i.e. not stored), write all other data + if hasattr(sample.data, 'GT'): + gt = sample.data.GT + else: + gt = './.' if 'GT' in fmt else '' + if not gt: return ':'.join([self._stringify(x) for x in sample.data]) - # Otherwise use the GT values from above and combine it with the rest of - # the data. - # Note that this follows the VCF spec, where GT is always the first - # item whenever it is present. + # Following the VCF spec, GT is always the first item whenever it is present. else: return ':'.join([gt] + [self._stringify(x) for x in sample.data[1:]]) From 28725da42749ff5d3fa1f5e6ecaa636fe9493f86 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Fri, 10 Oct 2014 20:38:02 +0200 Subject: [PATCH 118/168] Tolerate equals sign in INFO field value Fixes #181 --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index 3c36c31..411d94d 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -367,7 +367,7 @@ def _parse_info(self, info_str): retdict = {} for entry in entries: - entry = entry.split('=') + entry = entry.split('=', 1) ID = entry[0] try: entry_type = self.infos[ID].type From 2fceb0c2aaa4e864523883c021d81f89d711ab15 Mon Sep 17 00:00:00 2001 From: David Caplan Date: Fri, 24 Oct 2014 10:50:00 -0400 Subject: [PATCH 119/168] fix double quoting issue when writing VCFs --- vcf/parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index 411d94d..1f72b64 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -634,7 +634,9 @@ class 
Writer(object): counts = dict((v,k) for k,v in field_counts.iteritems()) def __init__(self, stream, template, lineterminator="\n"): - self.writer = csv.writer(stream, delimiter="\t", lineterminator=lineterminator) + self.writer = csv.writer(stream, delimiter="\t", + lineterminator=lineterminator, + quotechar='', quoting=csv.QUOTE_NONE) self.template = template self.stream = stream From 35ebae14f1f53b75b01ab212b4083598f2edc10a Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Mon, 10 Nov 2014 16:27:21 +0100 Subject: [PATCH 120/168] Blacklist pysam 0.8.0 in unit tests (fails on Python 3) The issue in 0.8.0 seems to be fixed in 0.8.1, so it's now safe to just blacklist 0.8.0 specifically. See #175 --- requirements/common-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common-requirements.txt b/requirements/common-requirements.txt index 7bbf965..876b75e 100644 --- a/requirements/common-requirements.txt +++ b/requirements/common-requirements.txt @@ -1,3 +1,3 @@ cython -pysam==0.7.8 +pysam!=0.8.0 setuptools From 2c8d94fc4b9a649df0ddbba4543051e7af40a7ad Mon Sep 17 00:00:00 2001 From: chapmanb Date: Sun, 15 Feb 2015 19:38:21 -0500 Subject: [PATCH 121/168] Support ##contig headers with only ID attributes. Generated by bcftools 1.2 when inputs have no ##contig information --- vcf/parser.py | 18 +++++++++--------- vcf/test/contig_idonly.vcf | 5 +++++ vcf/test/test_vcf.py | 11 +++++++++++ 3 files changed, 25 insertions(+), 9 deletions(-) create mode 100644 vcf/test/contig_idonly.vcf diff --git a/vcf/parser.py b/vcf/parser.py index 1f72b64..bc51ee9 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -104,16 +104,17 @@ def __init__(self): Description="(?P.*)" >''', re.VERBOSE) self.contig_pattern = re.compile(r'''\#\#contig=< - ID=(?P[^,]+), - .* - length=(?P-?\d+) + ID=(?P[^>,]+) + (,.*length=(?P-?\d+))? 
.* >''', re.VERBOSE) self.meta_pattern = re.compile(r'''##(?P.+?)=(?P.+)''') def vcf_field_count(self, num_str): """Cast vcf header numbers to integer or None""" - if num_str not in field_counts: + if num_str is None: + return None + elif num_str not in field_counts: # Fixed, specified number return int(num_str) else: @@ -176,14 +177,10 @@ def read_contig(self, contig_string): if not match: raise SyntaxError( "One of the contig lines is malformed: %s" % contig_string) - length = self.vcf_field_count(match.group('length')) - contig = _Contig(match.group('id'), length) - return (match.group('id'), contig) - def read_meta_hash(self, meta_string): items = re.split("[<>]", meta_string) # Removing initial hash marks and final equal sign @@ -668,7 +665,10 @@ def __init__(self, stream, template, lineterminator="\n"): for line in template.alts.itervalues(): stream.write(two.format(key="ALT", *line)) for line in template.contigs.itervalues(): - stream.write('##contig=\n'.format(*line)) + if line.length: + stream.write('##contig=\n'.format(*line)) + else: + stream.write('##contig=\n'.format(*line)) self._write_header() diff --git a/vcf/test/contig_idonly.vcf b/vcf/test/contig_idonly.vcf new file mode 100644 index 0000000..5e5a6ad --- /dev/null +++ b/vcf/test/contig_idonly.vcf @@ -0,0 +1,5 @@ +##fileformat=VCFv4.2 +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index ce26863..3782d12 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -133,6 +133,17 @@ def test_vcf_4_2(self): for c in r: assert c + def test_contig_idonly(self): + """Test VCF inputs with ##contig inputs containing only IDs. 
produced by bcftools 1.2+ + """ + reader = vcf.Reader(fh("contig_idonly.vcf")) + for cid, contig in reader.contigs.items(): + if cid == "1": + assert contig.length is None + elif cid == "2": + assert contig.length == 2000 + elif cid == "3": + assert contig.length == 3000 class TestGatkOutput(unittest.TestCase): From 5864f83bdbdc431fba543daf65a1a9604ef028fb Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Sat, 14 Mar 2015 22:17:22 +0100 Subject: [PATCH 122/168] Allow for whitespace after commas in metadata lines Fixes #192 --- vcf/parser.py | 20 ++++++------ vcf/test/metadata-whitespace.vcf | 56 ++++++++++++++++++++++++++++++++ vcf/test/test_vcf.py | 22 +++++++++++++ 3 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 vcf/test/metadata-whitespace.vcf diff --git a/vcf/parser.py b/vcf/parser.py index bc51ee9..2124798 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -82,25 +82,25 @@ class _vcf_metadata_parser(object): def __init__(self): super(_vcf_metadata_parser, self).__init__() self.info_pattern = re.compile(r'''\#\#INFO=< - ID=(?P[^,]+), - Number=(?P-?\d+|\.|[AGR]), - Type=(?PInteger|Float|Flag|Character|String), + ID=(?P[^,]+),\s* + Number=(?P-?\d+|\.|[AGR]),\s* + Type=(?PInteger|Float|Flag|Character|String),\s* Description="(?P[^"]*)" - (?:,Source="(?P[^"]*)")? - (?:,Version="?(?P[^"]*)"?)? + (?:,\s*Source="(?P[^"]*)")? + (?:,\s*Version="?(?P[^"]*)"?)? 
>''', re.VERBOSE) self.filter_pattern = re.compile(r'''\#\#FILTER=< - ID=(?P[^,]+), + ID=(?P[^,]+),\s* Description="(?P[^"]*)" >''', re.VERBOSE) self.alt_pattern = re.compile(r'''\#\#ALT=< - ID=(?P[^,]+), + ID=(?P[^,]+),\s* Description="(?P[^"]*)" >''', re.VERBOSE) self.format_pattern = re.compile(r'''\#\#FORMAT=< - ID=(?P.+), - Number=(?P-?\d+|\.|[AGR]), - Type=(?P.+), + ID=(?P.+),\s* + Number=(?P-?\d+|\.|[AGR]),\s* + Type=(?P.+),\s* Description="(?P.*)" >''', re.VERBOSE) self.contig_pattern = re.compile(r'''\#\#contig=< diff --git a/vcf/test/metadata-whitespace.vcf b/vcf/test/metadata-whitespace.vcf new file mode 100644 index 0000000..c163f9a --- /dev/null +++ b/vcf/test/metadata-whitespace.vcf @@ -0,0 +1,56 @@ +##fileformat=VCFv4.2 +##FILTER= +##samtoolsVersion=1.0-17-gfaf4dd6+htslib-1.0-11-g830ea73 +##samtoolsCommand=samtools mpileup -u -t DP,DPR,DV,DP4,INFO/DPR,SP -f /data/archive/reference/Anopheles-arabiensis-Dongola_SCAFFOLDS_AaraD1.fa -r KB704451:0004153102-0004172483 huge_list_of_bam_files_removed +##reference=file:///data/archive/reference/Anopheles-arabiensis-Dongola_SCAFFOLDS_AaraD1.fa +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##bcftools_callVersion=1.0-55-gc661821+htslib-1.0-11-g830ea73 +##bcftools_callCommand=call -m -vM -f GQ,GP +##SnpSiftVersion="SnpSift 3.6c (build 2014-05-20), by Pablo Cingolani" +##SnpSiftCmd="SnpSift varType - " +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT LUPI059 MINE001 OKJ042 LUPI001 LUPI007 LUPI024 LUPI056 LUPI071 LUPI074 LUPI082 MINE040 MINE100 MINE101 MINE105 MINE111 OKJ017 OKJ045 OKJ070 SAGA066 SAGA107 SAGA131 SAGA133 SAGA134 SAGA141 2012L_LUPI_002 2012L_LUPI_015 2012L_LUPI_017 2012L_LUPI_018 
2012L_LUPI_035 2012L_LUPI_062 2012L_LUPI_065 2012L_LUPI_077 2012L_LUPI_083 2012L_LUPI_116 2012L_LUPI_013 2012L_LUPI_041 2012L_LUPI_068 2012L_LUPI_096 2012L_LUPI_098 2012L_LUPI_101 2012L_LUPI_103 2012_LUPI_156 2012_LUPI_157 2012_LUPI_161 2012_LUPI_171 2012_LUPI_173 2012_LUPI_180 2012L_LUPI_010 2012L_LUPI_012 2012L_LUPI_021 2012L_LUPI_045 2012L_LUPI_047 2012L_LUPI_060 2012L_LUPI_061 2012L_LUPI_067 2012_LUPI_125 2012_LUPI_129 2012_LUPI_146 2012_LUPI_178 2012_LUPI_211 2012_LUPI_277 2012_LUPI_278 2012_LUPI_279 2012_LUPI_284 +KB704451 4157846 . N A,C 167.0 . DP=10;VDB=1.17174e-06;SGB=1.26353;MQ0F=0;DPR=0,6,4;AC=10,4;AN=14;DP4=0,0,10,0;MQ=60;SNP;VARTYPE=SNP,SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 1/2:74,23,14,57,0,54:4:4:0:0,0,4,0:0,3,1:144,56,16,90,0,57:16 1/2:26,26,26,3,3,0:1:1:0:0,0,1,0:0,0,1:95,58,28,36,2,3:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/1:26,3,0,26,3,26:1:1:0:0,0,1,0:0,1,0:96,36,2,60,3,29:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/1:26,3,0,26,3,26:1:1:0:0,0,1,0:0,1,0:96,36,2,60,3,29:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/2:26,26,26,3,3,0:1:1:0:0,0,1,0:0,0,1:95,58,28,36,2,3:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 
./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/1:26,3,0,26,3,26:1:1:0:0,0,1,0:0,1,0:96,36,2,60,3,29:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/2:26,26,26,3,3,0:1:1:0:0,0,1,0:0,0,1:95,58,28,36,2,3:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 +KB704451 4157870 . T C 275.0 . 
DP=243;VDB=0.00023935;SGB=29.4468;RPB=0.0368658;MQB=0.979612;MQSB=0.268441;BQB=0.99223;MQ0F=0;DPR=213,19;ICB=0.85092;HOB=0.0287274;AC=6;AN=118;DP4=201,12,19,1;MQ=53;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/0:0,66,255:22:0:0:20,2,0,0:22,0:0,75,279:75 0/0:0,12,120:4:0:0:4,0,0,0:4,0:0,21,144:21 1/1:193,36,0:12:12:0:0,0,12,0:0,12:168,20,0:20 0/0:0,9,95:3:0:0:3,0,0,0:3,0:0,18,119:18 0/1:78,0,110:7:3:0:3,1,3,0:4,3:68,0,125:68 0/0:0,3,40:1:0:0:1,0,0,0:1,0:0,12,64:12 0/0:0,6,72:2:0:0:2,0,0,0:2,0:0,15,96:15 0/0:0,9,90:3:0:0:3,0,0,0:3,0:0,18,114:18 0/0:0,12,122:4:0:0:4,0,0,0:4,0:0,21,146:21 0/0:0,9,97:3:0:0:3,0,0,0:3,0:0,18,121:18 0/0:0,15,122:5:0:0:5,0,0,0:5,0:0,24,146:24 0/0:0,6,71:2:0:0:2,0,0,0:2,0:0,15,95:15 0/0:0,6,58:2:0:0:2,0,0,0:2,0:0,15,82:15 0/0:0,18,155:6:0:0:6,0,0,0:6,0:0,27,179:27 0/0:0,3,39:1:0:0:1,0,0,0:1,0:0,12,63:12 0/1:35,3,0:1:1:0:0,0,1,0:0,1:23,0,12:12 0/0:0,9,87:3:0:0:3,0,0,0:3,0:0,18,111:18 0/1:47,0,104:6:2:0:4,0,2,0:4,2:37,0,119:37 0/0:0,21,160:7:0:0:7,0,0,0:7,0:0,30,184:30 0/0:0,6,35:2:0:0:2,0,0,0:2,0:0,15,59:15 0/0:0,12,98:4:0:0:4,0,0,0:4,0:0,21,122:21 0/0:0,6,70:2:0:0:2,0,0,0:2,0:0,15,94:15 0/0:0,6,66:2:0:0:2,0,0,0:2,0:0,15,90:15 0/0:0,12,122:4:0:0:4,0,0,0:4,0:0,21,146:21 0/0:0,3,29:1:0:0:0,1,0,0:1,0:0,12,53:12 0/0:0,6,72:2:0:0:2,0,0,0:2,0:0,15,96:15 0/0:0,9,76:3:0:0:3,0,0,0:3,0:0,18,100:18 0/0:0,15,136:5:0:0:5,0,0,0:5,0:0,24,160:24 0/0:0,30,182:10:0:0:10,0,0,0:10,0:0,39,206:39 0/0:0,6,66:2:0:0:2,0,0,0:2,0:0,15,90:15 0/0:0,6,69:2:0:0:2,0,0,0:2,0:0,15,93:15 0/0:0,27,152:9:0:0:9,0,0,0:9,0:0,36,176:36 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,94:2:0:0:2,0,0,0:2,0:0,15,118:15 0/0:0,21,195:7:0:0:5,2,0,0:7,0:0,30,219:30 0/0:0,9,92:3:0:0:2,1,0,0:3,0:0,18,116:18 0/1:33,0,18:2:1:0:0,1,1,0:1,1:23,0,33:23 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,12,59:12 0/0:0,9,91:3:0:0:3,0,0,0:3,0:0,18,115:18 0/0:0,3,36:1:0:0:1,0,0,0:1,0:0,12,60:12 0/0:0,30,212:10:0:0:9,1,0,0:10,0:0,39,236:39 0/0:0,9,89:3:0:0:3,0,0,0:3,0:0,18,113:18 
./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,21,195:7:0:0:7,0,0,0:7,0:0,30,219:30 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,97:3:0:0:2,1,0,0:3,0:0,18,121:18 0/0:0,9,93:3:0:0:3,0,0,0:3,0:0,18,117:18 0/0:0,9,116:3:0:0:3,0,0,0:3,0:0,18,140:18 0/0:0,6,71:2:0:0:1,1,0,0:2,0:0,15,95:15 0/0:0,9,89:3:0:0:3,0,0,0:3,0:0,18,113:18 0/0:0,33,175:11:0:0:11,0,0,0:11,0:0,42,199:42 0/0:0,6,63:2:0:0:2,0,0,0:2,0:0,15,87:15 0/0:0,21,145:7:0:0:7,0,0,0:7,0:0,30,169:30 0/0:0,3,39:1:0:0:0,1,0,0:1,0:0,12,63:12 0/0:0,9,84:3:0:0:3,0,0,0:3,0:0,18,108:18 0/0:0,3,13:1:0:0:1,0,0,0:1,0:0,12,37:12 0/0:0,3,23:1:0:0:1,0,0,0:1,0:0,12,47:12 0/0:0,12,106:4:0:0:4,0,0,0:4,0:0,21,130:21 0/0:0,3,36:1:0:0:1,0,0,0:1,0:0,12,60:12 0/0:0,9,94:3:0:0:3,0,0,0:3,0:0,18,118:18 0/0:0,6,67:2:0:0:2,0,0,0:2,0:0,15,91:15 0/0:2,5,27:2:1:0:1,0,0,1:1,0:0,12,49:12 +KB704451 4157877 . G A 999.0 . DP=250;VDB=6.58963e-09;SGB=31.659;RPB=0.0227135;MQB=0.410318;MQSB=0.139343;BQB=0.0767891;MQ0F=0;DPR=188,48;ICB=0.990841;HOB=0.00761276;AC=17;AN=118;DP4=176,12,45,3;MQ=55;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/1:159,0,202:22:9:0:12,1,8,1:13,9:154,0,212:127 0/0:0,12,120:4:0:0:4,0,0,0:4,0:0,16,134:16 0/0:0,51,207:17:0:0:17,0,0,0:17,0:0,55,221:55 0/0:0,9,98:3:0:0:3,0,0,0:3,0:0,13,112:13 0/1:123,0,61:8:5:0:3,0,4,1:3,5:118,0,71:71 0/0:0,3,38:1:0:0:0,1,0,0:1,0:0,7,52:7 0/1:68,0,29:3:2:0:1,0,1,1:1,2:63,0,39:39 0/0:0,6,69:2:0:0:2,0,0,0:2,0:0,10,83:10 0/0:0,12,119:4:0:0:4,0,0,0:4,0:0,16,133:16 0/1:34,0,34:2:1:0:1,0,1,0:1,1:29,0,44:29 0/1:24,0,99:5:1:0:4,0,1,0:4,1:19,0,109:19 0/1:34,0,28:2:1:0:1,0,1,0:1,1:29,0,38:29 0/0:0,6,58:2:0:0:2,0,0,0:2,0:0,10,72:10 0/1:122,0,57:7:4:0:3,0,4,0:3,4:117,0,67:67 0/0:0,3,41:1:0:0:1,0,0,0:1,0:0,7,55:7 0/0:0,3,29:1:0:0:1,0,0,0:1,0:0,7,43:7 0/0:0,12,105:4:0:0:4,0,0,0:4,0:0,16,119:16 0/0:0,18,144:6:0:0:6,0,0,0:6,0:0,22,158:22 0/1:118,0,63:8:5:0:3,0,5,0:3,5:113,0,73:73 0/0:0,6,34:2:0:0:2,0,0,0:2,0:0,10,48:10 0/0:0,15,131:5:0:0:5,0,0,0:5,0:0,19,145:19 
0/0:0,6,72:2:0:0:2,0,0,0:2,0:0,10,86:10 0/0:0,6,89:2:0:0:2,0,0,0:2,0:0,10,103:10 1/1:124,12,0:4:4:0:0,0,4,0:0,4:112,4,2:4 0/0:0,3,34:1:0:0:0,1,0,0:1,0:0,7,48:7 0/0:0,6,73:2:0:0:2,0,0,0:2,0:0,10,87:10 0/0:0,9,91:3:0:0:3,0,0,0:3,0:0,13,105:13 0/0:0,15,138:5:0:0:5,0,0,0:5,0:0,19,152:19 0/0:0,30,179:10:0:0:10,0,0,0:10,0:0,34,193:34 0/0:0,6,65:2:0:0:2,0,0,0:2,0:0,10,79:10 0/0:0,6,70:2:0:0:2,0,0,0:2,0:0,10,84:10 0/0:0,27,155:9:0:0:9,0,0,0:9,0:0,31,169:31 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,94:2:0:0:2,0,0,0:2,0:0,10,108:10 0/0:0,15,161:5:0:0:3,2,0,0:5,0:0,19,175:19 0/0:0,6,72:2:0:0:1,1,0,0:2,0:0,10,86:10 0/0:0,6,65:2:0:0:1,1,0,0:2,0:0,10,79:10 0/1:36,3,0:1:1:0:0,0,1,0:0,1:29,0,7:7 0/0:0,9,93:3:0:0:3,0,0,0:3,0:0,13,107:13 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,7,48:7 0/1:87,0,137:10:4:0:5,1,4,0:6,4:82,0,147:82 0/1:57,0,26:3:2:0:1,0,2,0:1,2:52,0,36:35 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:139,0,73:7:4:0:3,0,4,0:3,4:134,0,83:83 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,75:2:0:0:1,1,0,0:2,0:0,10,89:10 0/1:90,9,0:3:3:0:0,0,3,0:0,3:79,2,3:3 0/0:0,9,98:3:0:0:3,0,0,0:3,0:0,13,112:13 0/0:0,6,72:2:0:0:1,1,0,0:2,0:0,10,86:10 0/0:0,9,88:3:0:0:3,0,0,0:3,0:0,13,102:13 0/0:0,33,173:11:0:0:11,0,0,0:11,0:0,37,187:37 0/0:0,6,57:2:0:0:2,0,0,0:2,0:0,10,71:10 0/0:0,15,125:5:0:0:5,0,0,0:5,0:0,19,139:19 0/0:0,6,61:2:0:0:1,1,0,0:2,0:0,10,75:10 0/1:24,0,51:3:1:0:2,0,1,0:2,1:19,0,61:19 0/0:0,3,30:1:0:0:1,0,0,0:1,0:0,7,44:7 0/0:0,3,23:1:0:0:1,0,0,0:1,0:0,7,37:7 0/0:0,12,105:4:0:0:4,0,0,0:4,0:0,16,119:16 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,7,49:7 0/1:25,0,61:3:1:0:2,0,1,0:2,1:20,0,71:20 0/0:0,6,67:2:0:0:2,0,0,0:2,0:0,10,81:10 0/0:0,3,8:1:0:0:0,1,0,0:1,0:0,7,22:7 +KB704451 4157907 . A C 278.0 . 
DP=295;VDB=0.241276;SGB=26.7514;RPB=0.676983;MQB=0.997838;MQSB=0.136536;BQB=0.45683;MQ0F=0;DPR=264,15;ICB=0.00518819;HOB=0.00237812;AC=4;AN=116;DP4=233,31,14,1;MQ=59;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/0:0,90,255:30:0:0:25,5,0,0:30,0:0,101,283:101 0/0:0,30,201:10:0:0:9,1,0,0:10,0:0,41,229:41 0/1:157,0,188:18:8:0:10,0,7,1:10,8:145,0,205:127 0/1:75,0,90:5:2:0:2,1,2,0:3,2:63,0,107:63 0/0:0,30,201:10:0:0:9,1,0,0:10,0:0,41,229:41 0/0:0,6,80:2:0:0:1,1,0,0:2,0:0,17,108:17 0/0:0,12,134:4:0:0:3,1,0,0:4,0:0,23,162:23 0/0:0,3,33:1:0:0:1,0,0,0:1,0:0,14,61:14 0/0:0,21,160:7:0:0:7,0,0,0:7,0:0,32,188:32 0/0:0,12,135:4:0:0:2,2,0,0:4,0:0,23,163:23 0/0:0,15,148:5:0:0:5,0,0,0:5,0:0,26,176:26 0/0:0,9,82:3:0:0:3,0,0,0:3,0:0,20,110:20 0/1:70,0,19:4:3:0:1,0,3,0:1,3:58,0,36:36 0/0:0,24,246:8:0:0:7,1,0,0:8,0:0,35,274:35 0/0:0,18,147:6:0:0:6,0,0,0:6,0:0,29,175:29 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,82:3:0:0:3,0,0,0:3,0:0,20,110:20 0/1:59,0,62:5:2:0:3,0,2,0:3,2:47,0,79:47 0/0:0,33,192:11:0:0:11,0,0,0:11,0:0,44,220:44 0/0:0,9,94:3:0:0:3,0,0,0:3,0:0,20,122:20 0/0:0,24,198:8:0:0:7,1,0,0:8,0:0,35,226:35 0/0:0,12,120:4:0:0:4,0,0,0:4,0:0,23,148:23 0/0:0,15,165:5:0:0:3,2,0,0:5,0:0,26,193:26 0/0:0,24,172:8:0:0:8,0,0,0:8,0:0,35,200:35 0/0:0,3,31:1:0:0:0,1,0,0:1,0:0,14,59:14 0/0:0,6,64:2:0:0:2,0,0,0:2,0:0,17,92:17 0/0:0,6,66:2:0:0:2,0,0,0:2,0:0,17,94:17 0/0:0,15,118:5:0:0:5,0,0,0:5,0:0,26,146:26 0/0:0,33,178:11:0:0:11,0,0,0:11,0:0,44,206:44 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,14,63:14 0/0:0,9,74:3:0:0:2,1,0,0:3,0:0,20,102:20 0/0:0,21,168:7:0:0:5,2,0,0:7,0:0,32,196:32 0/0:0,3,40:1:0:0:1,0,0,0:1,0:0,14,68:14 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,89:2:0:0:2,0,0,0:2,0:0,17,117:17 0/0:0,12,130:4:0:0:2,2,0,0:4,0:0,23,158:23 0/0:0,9,78:3:0:0:1,2,0,0:3,0:0,20,106:20 0/0:0,6,65:2:0:0:1,1,0,0:2,0:0,17,93:17 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,14,63:14 0/0:0,6,55:2:0:0:2,0,0,0:2,0:0,17,83:17 0/0:0,3,29:1:0:0:1,0,0,0:1,0:0,14,57:14 0/0:0,36,194:12:0:0:11,1,0,0:12,0:0,47,222:47 
0/0:0,12,110:4:0:0:4,0,0,0:4,0:0,23,138:23 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,18,182:6:0:0:6,0,0,0:6,0:0,29,210:29 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,3,34:1:0:0:0,1,0,0:1,0:0,14,62:14 0/0:0,6,68:2:0:0:2,0,0,0:2,0:0,17,96:17 0/0:0,9,107:3:0:0:3,0,0,0:3,0:0,20,135:20 0/0:0,6,67:2:0:0:1,1,0,0:2,0:0,17,95:17 0/0:0,6,68:2:0:0:2,0,0,0:2,0:0,17,96:17 0/0:0,27,184:9:0:0:9,0,0,0:9,0:0,38,212:38 0/0:0,9,85:3:0:0:3,0,0,0:3,0:0,20,113:20 0/0:0,12,111:4:0:0:4,0,0,0:4,0:0,23,139:23 0/0:0,6,77:2:0:0:1,1,0,0:2,0:0,17,105:17 0/0:0,12,108:4:0:0:3,1,0,0:4,0:0,23,136:23 0/0:0,3,27:1:0:0:1,0,0,0:1,0:0,14,55:14 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,64:2:0:0:2,0,0,0:2,0:0,17,92:17 0/0:0,3,33:1:0:0:1,0,0,0:1,0:0,14,61:14 0/0:0,12,125:4:0:0:4,0,0,0:4,0:0,23,153:23 0/0:0,9,98:3:0:0:2,1,0,0:3,0:0,20,126:20 0/0:0,6,46:2:0:0:2,0,0,0:2,0:0,17,74:17 +KB704451 4157909 . T G 278.0 . DP=295;VDB=0.184881;SGB=22.7413;RPB=0.646301;MQB=0.998034;MQSB=0.200514;BQB=0.321842;MQ0F=0;DPR=247,15;ICB=0.00558284;HOB=0.00255102;AC=4;AN=112;DP4=218,29,15,1;MQ=59;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/0:0,87,255:29:0:0:24,5,0,0:29,0:0,97,282:97 0/0:0,27,183:9:0:0:9,0,0,0:9,0:0,37,210:37 0/1:156,0,167:19:8:0:11,0,7,1:11,8:145,0,183:127 0/1:75,0,107:5:2:0:2,1,2,0:3,2:64,0,123:64 0/0:0,27,191:9:0:0:8,1,0,0:9,0:0,37,218:37 0/0:0,6,80:2:0:0:1,1,0,0:2,0:0,16,107:16 0/0:0,12,119:4:0:0:3,1,0,0:4,0:0,22,146:22 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,13,61:13 0/0:0,15,126:5:0:0:5,0,0,0:5,0:0,25,153:25 0/0:0,12,132:4:0:0:2,2,0,0:4,0:0,22,159:22 0/0:0,12,133:4:0:0:4,0,0,0:4,0:0,22,160:22 0/0:0,6,67:2:0:0:2,0,0,0:2,0:0,16,94:16 0/1:79,9,0:3:3:0:0,0,3,0:0,3:60,0,8:8 0/0:0,21,230:7:0:0:6,1,0,0:7,0:0,31,257:31 0/0:0,18,144:6:0:0:6,0,0,0:6,0:0,28,171:28 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,53:2:0:0:2,0,0,0:2,0:0,16,80:16 0/1:59,0,64:5:2:0:3,0,2,0:3,2:48,0,80:48 0/0:0,33,180:11:0:0:11,0,0,0:11,0:0,43,207:43 0/0:0,12,110:4:0:0:4,0,0,0:4,0:0,22,137:22 
0/0:0,24,190:8:0:0:7,1,0,0:8,0:0,34,217:34 0/0:0,12,110:4:0:0:4,0,0,0:4,0:0,22,137:22 0/0:0,15,164:5:0:0:3,2,0,0:5,0:0,25,191:25 0/0:0,24,161:8:0:0:8,0,0,0:8,0:0,34,188:34 0/0:0,3,32:1:0:0:0,1,0,0:1,0:0,13,59:13 0/0:0,6,63:2:0:0:2,0,0,0:2,0:0,16,90:16 0/0:0,6,65:2:0:0:2,0,0,0:2,0:0,16,92:16 0/0:0,15,121:5:0:0:5,0,0,0:5,0:0,25,148:25 0/0:0,30,174:10:0:0:10,0,0,0:10,0:0,40,201:40 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,13,61:13 0/0:0,6,63:2:0:0:2,0,0,0:2,0:0,16,90:16 0/0:0,21,164:7:0:0:5,2,0,0:7,0:0,31,191:31 0/0:0,3,37:1:0:0:1,0,0,0:1,0:0,13,64:13 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,89:2:0:0:2,0,0,0:2,0:0,16,116:16 0/0:0,12,128:4:0:0:2,2,0,0:4,0:0,22,155:22 0/0:0,9,94:3:0:0:1,2,0,0:3,0:0,19,121:19 0/0:0,6,63:2:0:0:1,1,0,0:2,0:0,16,90:16 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,56:2:0:0:2,0,0,0:2,0:0,16,83:16 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,13,61:13 0/0:0,36,193:12:0:0:11,1,0,0:12,0:0,46,220:46 0/0:0,12,108:4:0:0:4,0,0,0:4,0:0,22,135:22 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,18,168:6:0:0:6,0,0,0:6,0:0,28,195:28 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,3,31:1:0:0:0,1,0,0:1,0:0,13,58:13 0/0:0,6,47:2:0:0:2,0,0,0:2,0:0,16,74:16 0/0:8,11,65:2:1:0:1,0,1,0:1,0:0,13,84:13 0/0:0,6,64:2:0:0:1,1,0,0:2,0:0,16,91:16 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,13,61:13 0/0:0,27,177:9:0:0:9,0,0,0:9,0:0,37,204:37 0/0:0,6,50:2:0:0:2,0,0,0:2,0:0,16,77:16 0/0:0,12,101:4:0:0:4,0,0,0:4,0:0,22,128:22 0/0:0,6,65:2:0:0:1,1,0,0:2,0:0,16,92:16 0/0:0,12,100:4:0:0:3,1,0,0:4,0:0,22,127:22 0/0:0,3,31:1:0:0:1,0,0,0:1,0:0,13,58:13 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,84:3:0:0:3,0,0,0:3,0:0,19,111:19 0/0:0,3,32:1:0:0:1,0,0,0:1,0:0,13,59:13 0/0:0,12,104:4:0:0:4,0,0,0:4,0:0,22,131:22 0/0:0,6,66:2:0:0:1,1,0,0:2,0:0,16,93:16 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 +KB704451 4157927 . G A 4.88727 . 
DP=334;VDB=0.38;SGB=3.29913;RPB=0.454248;MQB=0.970588;MQSB=0.546099;BQB=0.215686;MQ0F=0;DPR=306,2;ICB=0.000310486;HOB=0.000153894;AC=1;AN=114;DP4=265,41,2,0;MQ=59;SNP;VARTYPE=SNP GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/0:0,105,255:35:0:0:31,4,0,0:35,0:0,122,295:122 0/0:0,45,255:15:0:0:14,1,0,0:15,0:0,62,295:62 0/0:0,60,255:20:0:0:17,3,0,0:20,0:0,77,295:77 0/0:0,21,242:7:0:0:6,1,0,0:7,0:0,38,282:38 0/0:0,24,207:8:0:0:6,2,0,0:8,0:0,41,247:41 0/0:0,6,71:2:0:0:2,0,0,0:2,0:0,23,112:23 0/0:0,24,215:8:0:0:6,2,0,0:8,0:0,41,255:41 0/0:0,6,70:2:0:0:1,1,0,0:2,0:0,23,111:23 0/0:0,30,191:10:0:0:10,0,0,0:10,0:0,47,231:47 0/0:0,15,163:5:0:0:3,2,0,0:5,0:0,32,203:32 0/0:0,15,151:5:0:0:5,0,0,0:5,0:0,32,191:32 0/0:0,6,70:2:0:0:2,0,0,0:2,0:0,23,111:23 0/0:0,12,102:4:0:0:4,0,0,0:4,0:0,29,142:29 0/0:0,24,255:8:0:0:6,2,0,0:8,0:0,41,295:41 0/0:0,21,189:7:0:0:7,0,0,0:7,0:0,38,229:38 0/0:0,3,35:1:0:0:0,1,0,0:1,0:0,20,76:20 0/0:0,3,40:1:0:0:1,0,0,0:1,0:0,20,81:20 0/0:0,12,126:4:0:0:3,1,0,0:4,0:0,29,166:29 0/0:0,39,255:13:0:0:12,1,0,0:13,0:0,56,295:56 0/0:0,21,206:7:0:0:6,1,0,0:7,0:0,38,246:38 0/0:0,30,238:10:0:0:8,2,0,0:10,0:0,47,278:47 0/0:0,18,145:6:0:0:6,0,0,0:6,0:0,35,185:35 0/0:0,24,244:8:0:0:6,2,0,0:8,0:0,41,284:41 0/0:0,24,195:8:0:0:7,1,0,0:8,0:0,41,235:41 0/0:0,3,27:1:0:0:0,1,0,0:1,0:0,20,68:20 0/0:0,6,62:2:0:0:2,0,0,0:2,0:0,23,103:23 0/0:0,6,64:2:0:0:2,0,0,0:2,0:0,23,105:23 0/0:0,15,123:5:0:0:5,0,0,0:5,0:0,32,163:32 0/0:0,33,184:11:0:0:11,0,0,0:11,0:0,50,224:50 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,20,76:20 0/0:0,3,35:1:0:0:1,0,0,0:1,0:0,20,76:20 0/0:0,18,165:6:0:0:4,2,0,0:6,0:0,35,205:35 0/0:0,3,38:1:0:0:1,0,0,0:1,0:0,20,79:20 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,117:3:0:0:3,0,0,0:3,0:0,26,157:26 0/0:0,12,121:4:0:0:2,2,0,0:4,0:0,29,161:29 0/0:0,9,95:3:0:0:1,2,0,0:3,0:0,26,135:26 0/0:0,3,41:1:0:0:1,0,0,0:1,0:0,20,82:20 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,60:2:0:0:2,0,0,0:2,0:0,23,101:23 0/0:0,3,25:1:0:0:1,0,0,0:1,0:0,20,66:20 0/0:0,36,213:12:0:0:11,1,0,0:12,0:0,53,253:53 
0/0:0,15,152:5:0:0:4,1,0,0:5,0:0,32,192:32 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,21,215:7:0:0:6,1,0,0:7,0:0,38,255:38 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,3,31:1:0:0:0,1,0,0:1,0:0,20,72:20 0/0:0,6,60:2:0:0:2,0,0,0:2,0:0,23,101:23 0/0:0,9,101:3:0:0:3,0,0,0:3,0:0,26,141:26 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,60:2:0:0:2,0,0,0:2,0:0,23,101:23 0/0:0,27,179:9:0:0:9,0,0,0:9,0:0,44,219:44 0/0:0,9,92:3:0:0:3,0,0,0:3,0:0,26,132:26 0/0:0,12,112:4:0:0:4,0,0,0:4,0:0,29,152:29 0/0:0,6,58:2:0:0:1,1,0,0:2,0:0,23,99:23 0/0:0,15,123:5:0:0:4,1,0,0:5,0:0,32,163:32 0/0:0,3,25:1:0:0:1,0,0,0:1,0:0,20,66:20 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,9,78:3:0:0:3,0,0,0:3,0:0,26,118:26 0/0:0,3,34:1:0:0:1,0,0,0:1,0:0,20,75:20 0/1:47,0,51:3:2:0:1,0,2,0:1,2:29,0,74:29 0/0:0,9,80:3:0:0:2,1,0,0:3,0:0,26,120:26 0/0:0,6,53:2:0:0:2,0,0,0:2,0:0,23,94:23 +KB704451 4157938 . ATTT ATTTT 650.0 . INDEL;IDV=18;IMF=0.428571;DP=361;VDB=0.773794;SGB=32.6744;MQSB=0.993251;MQ0F=0.00831025;DPR=115,60;ICB=0.929833;HOB=0.0258;AC=23;AN=100;DP4=98,17,48,12;MQ=59;INS;VARTYPE=INS GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/1:124,0,7:19:16:0:3,0,13,3:3,16:123,0,13:13 0/1:14,3,0:1:1:0:0,0,1,0:0,1:12,1,5:4 0/0:0,9,55:3:0:0:3,0,0,0:3,0:0,9,62:9 0/0:0,21,146:7:0:0:5,2,0,0:7,0:0,21,153:21 0/0:0,18,126:6:0:0:4,2,0,0:6,0:0,18,133:18 0/0:0,6,56:2:0:0:2,0,0,0:2,0:0,7,63:7 0/0:0,9,87:3:0:0:2,1,0,0:3,0:0,9,94:9 1/1:46,9,0:3:3:0:0,0,1,2:0,3:40,4,1:4 0/0:0,27,155:9:0:0:8,1,0,0:9,0:0,27,162:27 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,57:2:0:0:2,0,0,0:2,0:0,7,64:7 0/1:24,3,0:1:1:0:0,0,1,0:0,1:22,1,5:5 0/0:0,3,32:1:0:0:1,0,0,0:1,0:1,5,40:5 0/0:0,9,84:3:0:0:2,1,0,0:3,0:0,9,91:9 0/0:0,15,111:5:0:0:5,0,0,0:5,0:0,15,118:15 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:32,6,0:2:2:0:0,0,1,1:0,2:28,2,3:3 0/0:0,6,55:2:0:0:2,0,0,0:2,0:0,7,62:7 0/1:63,0,61:7:4:0:3,0,3,1:3,4:62,0,67:61 0/0:0,24,154:8:0:0:7,1,0,0:8,0:0,24,161:24 0/1:16,0,94:7:2:0:5,0,2,0:5,2:15,0,100:15 
0/1:61,0,101:9:4:0:4,1,3,1:5,4:60,0,107:60 0/0:0,3,31:1:0:0:1,0,0,0:1,0:1,5,39:5 0/1:13,3,0:1:1:0:0,0,0,1:0,1:11,1,5:4 0/1:16,0,47:3:1:0:2,0,1,0:2,1:15,0,53:15 0/0:0,3,32:1:0:0:1,0,0,0:1,0:1,5,40:5 0/1:46,0,57:5:2:0:3,0,2,0:3,2:45,0,63:45 0/1:50,0,12:6:5:0:1,0,5,0:1,5:49,0,18:18 0/0:0,3,30:1:0:0:1,0,0,0:1,0:1,5,38:5 0/0:0,3,4:1:0:0:1,0,0,0:1,0:1,5,12:4 0/1:18,0,98:5:1:0:2,2,1,0:4,1:17,0,104:17 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:15,0,26:2:1:0:1,0,1,0:1,1:14,0,32:14 0/1:20,0,26:2:1:0:1,0,0,1:1,1:19,0,32:19 0/0:0,6,60:2:0:0:1,1,0,0:2,0:0,7,67:7 0/0:0,3,32:1:0:0:1,0,0,0:1,0:1,5,40:5 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:33,3,0:1:1:0:0,0,1,0:0,1:31,1,5:5 0/1:82,6,0:7:6:0:1,0,5,1:1,6:78,2,3:3 0/0:0,15,126:5:0:0:4,1,0,0:5,0:0,15,133:15 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/1:34,6,0:2:2:0:0,0,2,0:0,2:30,2,3:3 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,6,50:2:0:0:1,1,0,0:2,0:0,7,57:7 0/0:0,3,28:1:0:0:1,0,0,0:1,0:1,5,36:5 0/0:0,6,57:2:0:0:2,0,0,0:2,0:0,7,64:7 0/0:0,18,124:6:0:0:5,1,0,0:6,0:0,18,131:18 0/1:53,6,0:2:2:0:0,0,2,0:0,2:49,2,3:3 0/0:0,12,96:4:0:0:4,0,0,0:4,0:0,12,103:12 0/1:25,3,0:1:1:0:0,0,1,0:0,1:23,1,5:5 1/1:61,9,0:3:3:0:0,0,2,1:0,3:55,4,1:4 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 0/0:0,3,31:1:0:0:0,1,0,0:1,0:1,5,39:5 0/0:0,6,56:2:0:0:2,0,0,0:2,0:0,7,63:7 0/0:0,3,29:1:0:0:1,0,0,0:1,0:1,5,37:5 0/0:0,3,32:1:0:0:1,0,0,0:1,0:1,5,40:5 0/0:0,9,87:3:0:0:2,1,0,0:3,0:0,9,94:9 ./.:0,0,0:0:0:0:0,0,0,0:0,0:0,0,0:0 +KB704451 4157940 . TTGTGTGTGTGTGT TTGTGTGTGTGTGTGTGT,TTTCTGTGTGTGTGTGT 999.0 . 
INDEL;IDV=7;IMF=0.5;DP=366;VDB=0.0431342;SGB=14.7456;MQSB=0.996953;MQ0F=0.010929;DPR=86,41,8;ICB=0.963728;HOB=0.02;AC=21,6;AN=90;DP4=70,16,39,10;MQ=58;INS;VARTYPE=INS,INS GT:PL:DP:DV:SP:DP4:DPR:GP:GQ 0/2:60,60,60,3,3,0:1:1:0:0,0,1,0:0,0,1:54,55,61,2,5,9:4 1/1:255,18,0,255,18,255:6:6:0:0,0,3,3:0,6,0:248,11,0,252,19,263:11 0/0:0,6,62,6,62,62:2:0:0:2,0,0,0:2,0,0:1,7,70,11,71,78:6 0/1:9,0,238,27,241,255:7:1:0:4,2,1,0:6,1,0:8,0,245,31,249,270:8 0/0:0,12,185,12,185,185:4:0:0:3,1,0,0:4,0,0:0,12,192,16,193,200:11 0/0:0,6,110,6,110,110:2:0:0:2,0,0,0:2,0,0:1,7,118,11,119,126:6 0/1:1,0,158,10,161,165:4:1:0:2,1,1,0:3,1,0:3,2,167,16,171,182:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,24,255,24,255,255:8:0:0:7,1,0,0:8,0,0:0,24,262,28,263,270:23 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/1:37,0,54,40,57,94:2:1:0:1,0,1,0:1,1,0:36,0,60,44,64,108:35 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/1:117,6,0,117,6,117:2:2:0:0,0,1,1:0,2,0:113,3,3,118,10,128:2 0/0:0,18,237,18,237,237:6:0:0:5,1,0,0:6,0,0:0,18,244,22,245,252:17 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/2:120,120,120,6,6,0:2:2:0:0,0,1,1:0,0,2:111,112,119,2,6,7:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/1:255,21,0,255,21,255:7:7:0:0,0,5,2:0,7,0:247,14,0,252,22,263:13 0/2:19,31,181,0,157,154:5:1:0:3,1,1,0:4,0,1:14,27,183,0,160,164:14 0/1:120,0,255,141,255,255:10:3:0:6,1,2,1:7,3,0:119,0,261,145,262,269:119 0/0:0,15,212,15,212,212:5:0:0:5,0,0,0:5,0,0:0,15,219,19,220,227:14 0/0:0,15,243,15,243,243:5:0:0:4,1,0,0:5,0,0:0,15,250,19,251,258:14 1/1:40,9,0,40,9,40:3:3:0:0,0,3,0:0,3,0:34,4,2,39,12,50:3 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,6,110,6,110,110:2:0:0:2,0,0,0:2,0,0:1,7,118,11,119,126:6 0/1:54,0,54,57,57,111:2:1:0:1,0,1,0:1,1,0:53,0,60,61,64,125:51 0/0:0,9,139,9,139,139:3:0:0:3,0,0,0:3,0,0:0,10,146,14,147,154:8 1/1:243,15,0,243,15,243:5:5:0:0,0,4,1:0,5,0:236,9,0,241,16,251:8 
./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/1:55,1,0,58,3,57:2:1:0:1,0,1,0:1,1,0:54,1,7,62,11,72:5 0/0:0,9,170,9,170,170:3:0:0:1,2,0,0:3,0,0:0,10,177,14,178,185:8 0/1:60,3,0,60,3,60:1:1:0:0,0,1,0:0,1,0:58,2,5,63,9,73:4 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/1:60,3,0,60,3,60:1:1:0:0,0,0,1:0,1,0:58,2,5,63,9,73:4 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/1:60,3,0,60,3,60:1:1:0:0,0,1,0:0,1,0:58,2,5,63,9,73:4 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,15,206,15,206,206:5:0:0:4,1,0,0:5,0,0:0,15,213,19,214,221:14 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 1/2:134,105,96,40,0,34:5:5:0:0,0,5,0:0,3,2:125,97,95,36,0,41:35 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/1:58,6,0,58,6,58:2:2:0:0,0,2,0:0,2,0:54,3,3,59,10,69:2 0/0:0,6,102,6,102,102:2:0:0:1,1,0,0:2,0,0:1,7,110,11,111,118:6 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/1:98,0,88,104,94,191:4:2:0:1,1,2,0:2,2,0:97,0,94,108,101,205:92 0/2:35,35,35,3,3,0:1:1:0:0,0,1,0:0,0,1:29,30,36,2,5,9:4 0/0:0,6,110,6,110,110:2:0:0:2,0,0,0:2,0,0:1,7,118,11,119,126:6 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/2:45,45,45,3,3,0:1:1:0:0,0,1,0:0,0,1:39,40,46,2,5,9:4 0/0:0,3,60,3,60,60:1:0:0:0,1,0,0:1,0,0:2,5,69,9,70,77:4 0/0:0,6,110,6,110,110:2:0:0:2,0,0,0:2,0,0:1,7,118,11,119,126:6 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 0/0:0,3,60,3,60,60:1:0:0:1,0,0,0:1,0,0:2,5,69,9,70,77:4 0/0:0,6,120,6,120,120:2:0:0:1,1,0,0:2,0,0:1,7,128,11,129,136:6 ./.:0,0,0,0,0,0:0:0:0:0,0,0,0:0,0,0:0,0,0,0,0,0:0 diff --git a/vcf/test/test_vcf.py 
b/vcf/test/test_vcf.py index 3782d12..729d6ee 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -457,6 +457,27 @@ def test_samples(self): self.assertEqual(self.reader.samples, self.samples) +class TestMetadataWhitespace(unittest.TestCase): + filename = 'metadata-whitespace.vcf' + def test_metadata_whitespace(self): + """ + Test parsing metadata header lines with whitespace. + """ + self.reader = vcf.Reader(fh(self.filename)) + + # Pick one INFO line and assert that we parsed it correctly. + info_indel = self.reader.infos['INDEL'] + assert info_indel.id == 'INDEL' + assert info_indel.num == 0 + assert info_indel.type == 'Flag' + assert info_indel.desc == 'Indicates that the variant is an INDEL.' + + # Test we can walk the file at least. + for r in self.reader: + for c in r: + pass + + class TestMixedFiltering(unittest.TestCase): filename = 'mixed-filtering.vcf' def test_mixed_filtering(self): @@ -1470,6 +1491,7 @@ def test_write_uncalled(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBcfToolsOutputWriter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestWriterDictionaryMeta)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamplesSpace)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestMetadataWhitespace)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestMixedFiltering)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) From ec193b12b9443dc40b0ea49cde134fe60915b60f Mon Sep 17 00:00:00 2001 From: cariaso Date: Sun, 19 Apr 2015 17:53:49 -0400 Subject: [PATCH 123/168] Enable compression to be disabled for .gz filenames https://github.com/jamescasbon/PyVCF/issues/198#issuecomment-94317496 --- vcf/parser.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 2124798..bf55128 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ 
-233,7 +233,7 @@ def read_meta(self, meta_string): class Reader(object): """ Reader for a VCF v 4.0 file, an iterator returning ``_Record objects`` """ - def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False, + def __init__(self, fsock=None, filename=None, compressed=None, prepend_chr=False, strict_whitespace=False): """ Create a new Reader for a VCF file. @@ -256,9 +256,11 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals self._reader = fsock if filename is None and hasattr(fsock, 'name'): filename = fsock.name - compressed = compressed or filename.endswith('.gz') + if compressed is None: + compressed = filename.endswith('.gz') elif filename: - compressed = compressed or filename.endswith('.gz') + if compressed is None: + compressed = filename.endswith('.gz') self._reader = open(filename, 'rb' if compressed else 'rt') self.filename = filename if compressed: From df454bcebd3ee6b376f9331264cab5e64c43a453 Mon Sep 17 00:00:00 2001 From: Michele Mattioni Date: Fri, 24 Jul 2015 15:15:33 +0100 Subject: [PATCH 124/168] Bump the version to development mode --- vcf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/__init__.py b/vcf/__init__.py index 149d25a..88842b6 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -190,4 +190,4 @@ from vcf.parser import RESERVED_INFO, RESERVED_FORMAT from vcf.sample_filter import SampleFilter -VERSION = '0.6.7' +VERSION = '0.6.8.dev0' From 6500a9ac099e8f97f2313b86cb1a118fbb254840 Mon Sep 17 00:00:00 2001 From: Harriet Dashnow Date: Fri, 21 Aug 2015 16:17:07 +1000 Subject: [PATCH 125/168] Chang self.data comment. 
It returns namedtuple not dict --- vcf/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/model.py b/vcf/model.py index c1d5710..f523c24 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -20,7 +20,7 @@ def __init__(self, site, sample, data): self.site = site #: The sample name self.sample = sample - #: Dictionary of data from the VCF file + #: Namedtuple of data from the VCF file self.data = data if hasattr(self.data, 'GT'): From dc23dbe74fa71323885da0604a625cc71673708c Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 15 Sep 2015 10:43:35 +0200 Subject: [PATCH 126/168] Configurable encoding reading compressed VCF files The encoding used to read compressed VCF files was fixed to ASCII, but is now configurable with the optional `encoding` parameter to `vcf.Reader()`. This is really a stopgap solution to read compressed VCF files containing content that cannot be ASCII-decoded. More changes are need to properly handle encoding/decoding issues on both Python 2 and 3. Fixes #201 --- vcf/parser.py | 8 +++++--- vcf/test/issue-201.vcf.gz | Bin 0 -> 2639 bytes vcf/test/issue-201.vcf.gz.tbi | Bin 0 -> 129 bytes vcf/test/test_vcf.py | 20 ++++++++++++++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 vcf/test/issue-201.vcf.gz create mode 100644 vcf/test/issue-201.vcf.gz.tbi diff --git a/vcf/parser.py b/vcf/parser.py index bf55128..1c14694 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -234,7 +234,7 @@ class Reader(object): """ Reader for a VCF v 4.0 file, an iterator returning ``_Record objects`` """ def __init__(self, fsock=None, filename=None, compressed=None, prepend_chr=False, - strict_whitespace=False): + strict_whitespace=False, encoding='ascii'): """ Create a new Reader for a VCF file. You must specify either fsock (stream) or filename. 
Gzipped streams @@ -266,7 +266,7 @@ def __init__(self, fsock=None, filename=None, compressed=None, prepend_chr=False if compressed: self._reader = gzip.GzipFile(fileobj=self._reader) if sys.version > '3': - self._reader = codecs.getreader('ascii')(self._reader) + self._reader = codecs.getreader(encoding)(self._reader) if strict_whitespace: self._separator = '\t' @@ -295,6 +295,7 @@ def __init__(self, fsock=None, filename=None, compressed=None, prepend_chr=False self._prepend_chr = prepend_chr self._parse_metainfo() self._format_cache = {} + self.encoding = encoding def __iter__(self): return self @@ -617,7 +618,8 @@ def fetch(self, chrom, start=None, end=None): raise Exception('Please provide a filename (or a "normal" fsock)') if not self._tabix: - self._tabix = pysam.Tabixfile(self.filename) + self._tabix = pysam.Tabixfile(self.filename, + encoding=self.encoding) if self._prepend_chr and chrom[:3] == 'chr': chrom = chrom[3:] diff --git a/vcf/test/issue-201.vcf.gz b/vcf/test/issue-201.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..2af09fa65b70c3ecafbbb078b17aba0555cbc73c GIT binary patch literal 2639 zcmV-V3b6GbiwFb&00000{{{d;LjnLY3dNe;a@t51$FG^!$jwZGpW%09 zq;tR&1%fE_O*dHj_9l!^Eq3;iN+FLnz86}yYb?E$?LJ4i4PmrwQ3U#5y3wZ9cebJf z@g8vk+lVVX66e~GC@c~u+63?@YSsb-8KoFi0D)>C2pDZ6H6y}=Cq$T&x`+seq|!yo z2^TT7tuSH#y!qGbdL4w#JkjPxY0~M6> zV3?$kg4z_Y!mtgg5=Gzi*?2JS8;hAfo_A-1>B1OKjQK*JEsXBub~3IDmQ$4; z!(su8XwC|im=&=$1|ULl&Z~mB4bfXktv_EacWD_NB{g_y1%WWW z*GO2ufthUtuo~N5d4IXJS;v?n^=JhUKL1E5$ww_VtEcv=lWD$Htrv_k%uwW8p~xBG z726Gc&y79uk&?nEVh>)sPY4HufGVF5oF}5Vcg0ms8H~G=(RA2b^ydMf7g^{XZa#JnOn}2rqC8DpwegH9`Sf!jd^hW`Qy>!W5vL%N5YQS{2UARF!b`j^9g!n!#x*#DQ0Wpk?l5LK3$hfwU`1;RBY#LPAlcb&2A>q&_Ds4=`OA=V9cRzl2u!iDSVHhoq? 
z%q^=3T@dSP$2ub+=Aok3rh1cvnBg&8f3(g@h()3x8k??LLhQ?t5KHw?iiB9y#CJeA zDP}u022f-Juu6dIOOR@2 z9zw=qgzG-VsHYMn;<4>(KE;@h8dvICWeKuwunSKyras{u)b5iDPciD51W7K4eYInq zeTp#+6}>jq8=qp_;W1i&v@Sfwm?g??^2iVj%hgMcV5q($!a2cEEymtiJ&xyhQ={LT zO+S4yW^;|SIL?laUPCjqJ$GRSq2-GTZ>_Z3qe~0{@ULL$`IZJ+D#A*B{pLkn*cE=j z^e=O3hy`Y8xuIBf_OHNto3RjEZ`GJvY$51O|;C#@7 zyUaCB@sp3iv0ah5@p@^8+WepY`}cqU^Xyyh&Cv9NK%{lBbyvQ(wyK=Bv18r(bNnCf zKKH%8^xOn{RYK;3awThKnj0@L?)1g|XrzhBFij@IP8I#!RUrPOnIA=rt5`WYiit?a z-O*C7nmCd(3+IMb5Q=DtYz;ztsog*>LO@xWBz96YN7im;Ikx#|J9fAglXPYM&|p4j z(ZjFsX6bqUZ(1hEXXkpNu&f`Qy?u$j(hhXs=7HR~_jjUQAEZku5da1FFs6P`kq1-H z*=}AeS26A%^jS7zn{lq@&hV`#tzU7$?ckvtOHsJliH=Kj48n3+pQ|q_Q55x5MG||~ z!%4ugSU?W_9G8Zjhxco5)@qk=>ELgKN!r|dKT{_TOD8<&F_~bCruaFmkh<8GiV2m~ zNYo;v(j^rGWQ_CU5`DfhFi{2A{%oAAt!J~Sf$3r}o|V>*>1^`Q)5jHcA{}4J1Yt$a z4O~(G#jYy7Khbn&WxZ@yR`Tw9A{?gJmM7j zrRH&)lU`0naDO0-cXXVHuDQ0d zZ{IF0waKcHr&Ut5s|%@^@`#=sf2k!VSUD&jc@P09>c)$&6-kp-k)`7X_Oq5}>sQx~ zch82oTxwGSDdx@H$4zcSY>PZMslg|TNLF=V;davrwYktQ4-zaJI@3AqQvM!5mn><>qs9rTZajEB6KRi$D5>5 z0Z3ZqBhp=QTPv0hM|i=p9P4Ct&L-}e(js~$6je6Jb5LoC{pg9pzGou;qB>t>%HLDR xMMl&Y#1Fr*{y!yZ-+6*Z001A02m}BC000301^_}s0stET0{{R300000002!9=1Bkm literal 0 HcmV?d00001 diff --git a/vcf/test/issue-201.vcf.gz.tbi b/vcf/test/issue-201.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..23ffe4d9d4bb7aa713e12f966015680f9ad9e66b GIT binary patch literal 129 zcmb2|=3rp}f&Xj_PR>jWDGYCKZ4`8H5MaGv>%w)Rfvd+sY0}Bil{TU~4u?M6J9EGN z4c)~4Tz^faS#xe3{ZTsGSoi!Zse6-6SMAjQ{Dui^6p(oE{Z;-|-cpXBY13618066` Lk!E0qDP#ZufEFmv literal 0 HcmV?d00001 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 729d6ee..8a496c8 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1192,6 +1192,25 @@ def testFetchesAllFromChromIfOnlyChromSpecified(self): ) +@unittest.skipUnless(pysam, "test requires installation of PySAM.") +class TestIssue201(unittest.TestCase): + def setUp(self): + # This file contains some non-ASCII characters in a UTF-8 encoding. 
+ # https://github.com/jamescasbon/PyVCF/issues/201 + self.reader = vcf.Reader(fh('issue-201.vcf.gz', 'rb'), + encoding='utf-8') + + def testIterate(self): + for record in self.reader: + # Should not raise decoding errors. + pass + + def testFetch(self): + for record in self.reader.fetch(chrom='17'): + # Should not raise decoding errors. + pass + + class TestOpenMethods(unittest.TestCase): samples = 'NA00001 NA00002 NA00003'.split() @@ -1496,6 +1515,7 @@ def test_write_uncalled(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFetch)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue201)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) From 1cbbf58ea1ad745776d1ca5c55ee8151f7d725c4 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Thu, 8 Oct 2015 11:41:23 +0200 Subject: [PATCH 127/168] More robust parsing of meta-information lines Fixes #210 --- vcf/parser.py | 9 +++++---- vcf/test/parse-meta-line.vcf | 6 ++++++ vcf/test/test_vcf.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) create mode 100644 vcf/test/parse-meta-line.vcf diff --git a/vcf/parser.py b/vcf/parser.py index 1c14694..862fbee 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -182,15 +182,16 @@ def read_contig(self, contig_string): return (match.group('id'), contig) def read_meta_hash(self, meta_string): - items = re.split("[<>]", meta_string) - # Removing initial hash marks and final equal sign - key = items[0][2:-1] + # assert re.match("##.+=<", meta_string) + items = meta_string.split('=', 1) + # Removing initial hash marks + key = items[0].lstrip('#') # N.B., items can have quoted values, so cannot 
just split on comma val = OrderedDict() state = 0 k = '' v = '' - for c in items[1]: + for c in items[1].strip('[<>]'): if state == 0: # reading item key if c == '=': diff --git a/vcf/test/parse-meta-line.vcf b/vcf/test/parse-meta-line.vcf new file mode 100644 index 0000000..e3a2611 --- /dev/null +++ b/vcf/test/parse-meta-line.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.1 +##INFO= +##FORMAT= +##MYFIELD= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample +chr1 100 id1 G A . . NS=3 GT 0/1 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 8a496c8..63e972c 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -365,6 +365,38 @@ def test_write(self): self.assertEquals(l.INFO, r.INFO) +class TestParseMetaLine(unittest.TestCase): + def test_parse(self): + reader = vcf.Reader(fh('parse-meta-line.vcf')) + f = reader.metadata['MYFIELD'][0] + self.assertEqual(f['ID'], 'SomeField') + self.assertEqual(f['Version'], '3.4-0-g7e26428') + self.assertEqual(f['Date'], '"Wed Oct 07 09:11:47 CEST 2015"') + self.assertEqual(f['Options'], '"< 4 and > 3"') + next(reader) + + def test_write(self): + reader = vcf.Reader(fh('parse-meta-line.vcf')) + out = StringIO() + writer = vcf.Writer(out, reader) + + records = list(reader) + + for record in records: + writer.write_record(record) + out.seek(0) + reader2 = vcf.Reader(out) + + f = reader2.metadata['MYFIELD'][0] + self.assertEqual(f['ID'], 'SomeField') + self.assertEqual(f['Version'], '3.4-0-g7e26428') + self.assertEqual(f['Date'], '"Wed Oct 07 09:11:47 CEST 2015"') + self.assertEqual(f['Options'], '"< 4 and > 3"') + + for l, r in zip(records, reader2): + self.assertEquals(l.INFO, r.INFO) + + class TestGatkOutputWriter(unittest.TestCase): def testWrite(self): @@ -1506,6 +1538,7 @@ def test_write_uncalled(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStringAsFlag)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestInfoOrder)) 
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestInfoTypeCharacter)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestParseMetaLine)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutputWriter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBcfToolsOutputWriter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestWriterDictionaryMeta)) From 0253183e46d8a6a41ed6f73f58530d0f011792a7 Mon Sep 17 00:00:00 2001 From: alexjironkin Date: Fri, 23 Oct 2015 10:10:58 +0100 Subject: [PATCH 128/168] Precopiled patterns for improved performance. Patterns for row and ALT encoding are now pre-compiled instead of being compiled each time re.split and re.search is called. Increasing read performance. --- vcf/parser.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 862fbee..cc7a38c 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -273,6 +273,9 @@ def __init__(self, fsock=None, filename=None, compressed=None, prepend_chr=False self._separator = '\t' else: self._separator = '\t| +' + + self._row_pattern = re.compile(self._separator) + self._alt_pattern = re.compile('[\[\]]') self.reader = (line.strip() for line in self._reader if line.strip()) @@ -507,9 +510,9 @@ def _parse_samples(self, samples, samp_fmt, site): return samp_data def _parse_alt(self, str): - if re.search('[\[\]]', str) is not None: + if self._alt_pattern.search(str) is not None: # Paired breakend - items = re.split('[\[\]]', str) + items = self._alt_pattern.split(str) remoteCoords = items[1].split(':') chr = remoteCoords[0] if chr[0] == '<': @@ -537,7 +540,7 @@ def _parse_alt(self, str): def next(self): '''Return the next record in the file.''' line = self.reader.next() - row = re.split(self._separator, line.rstrip()) + row = self._row_pattern.split(line.rstrip()) chrom = row[0] if self._prepend_chr: chrom = 'chr' + chrom From d15a375f55dcbdd37e20bf948827c879e5e63af3 Mon Sep 17 
00:00:00 2001 From: redmar Date: Thu, 12 Nov 2015 14:27:04 +0100 Subject: [PATCH 129/168] Added vcf and testcases to demonstrate issue214 --- vcf/test/issue-214.vcf | 32 ++++++++++++++++++++++++++++++++ vcf/test/test_vcf.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 vcf/test/issue-214.vcf diff --git a/vcf/test/issue-214.vcf b/vcf/test/issue-214.vcf new file mode 100644 index 0000000..dbc5fac --- /dev/null +++ b/vcf/test/issue-214.vcf @@ -0,0 +1,32 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2 +1 456904 . T C,* 6162.77 . AC=1,1;AF=8.333e-03,8.333e-03;AN=120;DP=7693;FS=0.000;MLEAC=1,1;MLEAF=8.333e-03,8.333e-03;MQ=60.00;QD=31.36;SOR=0.976 GT:AD:DP:GQ:PL 0:106,0,0:106:99:0,1800,1800 0:110,0,0:110:99:0,1800,1800 +1 456940 . * C,T 6162.77 . AC=1,1;AF=8.333e-03,8.333e-03;AN=120;DP=7693;FS=0.000;MLEAC=1,1;MLEAF=8.333e-03,8.333e-03;MQ=60.00;QD=31.36;SOR=0.976 GT:AD:DP:GQ:PL 0:106,0,0:106:99:0,1800,1800 0:110,0,0:110:99:0,1800,1800 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 63e972c..8fbe35a 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -229,7 +229,35 @@ def testParse(self): for s in r.samples: s.phased - +class TestIssue214(unittest.TestCase): + """ See https://github.com/jamescasbon/PyVCF/issues/214 """ + + def test_issue_214_is_snp(self): + reader=vcf.Reader(fh('issue-214.vcf')) + r=reader.next() + self.assertTrue(r.is_snp) + + def test_issue_214_var_type(self): + reader=vcf.Reader(fh('issue-214.vcf')) + r=reader.next() + self.assertEqual(r.var_type,'snp') + + # Can the ref even be a spanning deletion? 
+ # Note, this does not trigger issue 214, but I've added it here for completeness + def test_issue_214_ref_is_del_is_snp(self): + reader=vcf.Reader(fh('issue-214.vcf')) + reader.next() + r=reader.next() + self.assertTrue(r.is_snp) + + # Can the ref even be a spanning deletion? + # Note, this does not trigger issue 214, but I've added it here for completeness + def test_issue_214_ref_is_del_var_type(self): + reader=vcf.Reader(fh('issue-214.vcf')) + reader.next() + r=reader.next() + self.assertEqual(r.var_type,'snp') + class Test1kg(unittest.TestCase): def testParse(self): @@ -1532,6 +1560,7 @@ def test_write_uncalled(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFreebayesOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamtoolsOutput)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBcfToolsOutput)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue214)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kgSites)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGoNL)) From 41bd5b8db06d9820b58f7da78425aa4c6d7445cd Mon Sep 17 00:00:00 2001 From: redmar Date: Thu, 12 Nov 2015 14:43:08 +0100 Subject: [PATCH 130/168] Resolved issue214 by adding '*' to the allowed characters --- vcf/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/model.py b/vcf/model.py index f523c24..33c77b2 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -376,7 +376,7 @@ def is_snp(self): for alt in self.ALT: if alt is None or alt.type != "SNV": return False - if alt not in ['A', 'C', 'G', 'T', 'N']: + if alt not in ['A', 'C', 'G', 'T', 'N', '*']: return False return True From 2fc8e8664b42d3b7a33d08d3bfecf1fa57d3083c Mon Sep 17 00:00:00 2001 From: alexjironkin Date: Fri, 13 Nov 2015 14:57:34 +0000 Subject: [PATCH 131/168] Additional switch to pre-compiled re patter. 
Missed 1 split to be replaced with _row_pattern.split. --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index cc7a38c..f7fc569 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -349,7 +349,7 @@ def _parse_metainfo(self): line = self.reader.next() - fields = re.split(self._separator, line[1:]) + fields = self._row_pattern.split(line[1:]) self._column_headers = fields[:9] self.samples = fields[9:] self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) From 240ae07a21e05c8c2496c1c3c5a25e7940620032 Mon Sep 17 00:00:00 2001 From: "B. Arman Aksoy" Date: Mon, 16 Nov 2015 17:32:01 -0500 Subject: [PATCH 132/168] handle empty sample columns better --- vcf/cparse.pyx | 2 +- vcf/parser.py | 4 ++-- vcf/test/strelka.vcf | 57 ++++++++++++++++++++++++++++++++++++++++++++ vcf/test/test_vcf.py | 11 +++++++-- 4 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 vcf/test/strelka.vcf diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index a3cb4b3..8a71d64 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -39,7 +39,7 @@ def parse_samples( if samp_fmt._fields[j] == 'GT': sampdat[j] = vals continue - elif vals == '.': + elif not vals or vals == '.': sampdat[j] = None continue diff --git a/vcf/parser.py b/vcf/parser.py index f7fc569..a5625d7 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -273,7 +273,7 @@ def __init__(self, fsock=None, filename=None, compressed=None, prepend_chr=False self._separator = '\t' else: self._separator = '\t| +' - + self._row_pattern = re.compile(self._separator) self._alt_pattern = re.compile('[\[\]]') @@ -466,7 +466,7 @@ def _parse_samples(self, samples, samp_fmt, site): if samp_fmt._fields[i] == 'GT': sampdat[i] = vals continue - elif vals == ".": + elif not vals or vals == ".": sampdat[i] = None continue diff --git a/vcf/test/strelka.vcf b/vcf/test/strelka.vcf new file mode 100644 index 0000000..b5aea76 --- /dev/null +++ b/vcf/test/strelka.vcf @@ -0,0 +1,57 @@ 
+##fileformat=VCFv4.1 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##content=strelka somatic indel calls +##fileDate=20151113 +##germlineIndelTheta=0.0001 +##germlineSnvTheta=0.001 +##priorSomaticIndelRate=1e-06 +##priorSomaticSnvRate=1e-06 +##reference=file:///b37.fasta +##source=strelka +##source_version=2.0.17.strelka1 +##startTime=Fri Nov 13 19:38:43 2015 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL.variant NORMAL.variant2 TUMOR.variant TUMOR.variant2 +1 1666175 . C T . PASS AC=0;AF=0.00;AN=0;NT=ref;QSS=28;QSS_NT=28;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1;set=variant AU:CU:DP:FDP:GU:SDP:SUBDP:TU 0,0:42,42:43:0:0,0:0:0:1,1 0,0:45,45:59:0:0,0:0:0:14,14 +1 3750492 . G A . PASS AC=0;AF=0.00;AN=0;NT=ref;QSS=38;QSS_NT=38;SGT=GG->AG;SOMATIC;TQSS=2;TQSS_NT=2;set=variant AU:CU:DP:FDP:GU:SDP:SUBDP:TU 0,0:0,0:116:0:116,116:0:0:0,0 6,6:0,0:96:0:90,91:0:0:0,0 +1 9117626 . G A . 
PASS AC=0;AF=0.00;AN=0;NT=ref;QSS=32;QSS_NT=32;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1;set=variant AU:CU:DP:FDP:GU:SDP:SUBDP:TU 0,0:0,0:165:0:165,166:0:0:0,0 6,6:0,0:132:0:126,127:0:0:0,0 \ No newline at end of file diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 8fbe35a..8ad3c03 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -231,7 +231,7 @@ def testParse(self): class TestIssue214(unittest.TestCase): """ See https://github.com/jamescasbon/PyVCF/issues/214 """ - + def test_issue_214_is_snp(self): reader=vcf.Reader(fh('issue-214.vcf')) r=reader.next() @@ -257,7 +257,7 @@ def test_issue_214_ref_is_del_var_type(self): reader.next() r=reader.next() self.assertEqual(r.var_type,'snp') - + class Test1kg(unittest.TestCase): def testParse(self): @@ -1553,6 +1553,12 @@ def test_write_uncalled(self): for (in_line, out_line) in zip(in_lines, out_lines): self.assertEqual(in_line,out_line) +class TestStrelka(unittest.TestCase): + + def test_strelka(self): + reader = vcf.Reader(fh('strelka.vcf')) + n = reader.next() + assert n is not None suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs)) @@ -1585,3 +1591,4 @@ def test_write_uncalled(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUtils)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGATKMeta)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUncalledGenotypes)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStrelka)) From 3c6d6744dee6844e3125f16a7f0ffb378fa13e3b Mon Sep 17 00:00:00 2001 From: Kaarel Date: Sat, 19 Dec 2015 23:01:12 +0200 Subject: [PATCH 133/168] Update README.rst --- README.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index a60c0c8..aa1e0bd 100644 --- a/README.rst +++ b/README.rst @@ -50,7 +50,7 @@ of key=value pairs are converted to Python dictionaries, with flags being given a ``True`` value. 
Integers and floats are handled exactly as you'd expect:: >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'r')) - >>> record = vcf_reader.next() + >>> record = next(vcf_reader) >>> print record.POS 14370 >>> print record.ALT @@ -82,7 +82,7 @@ fields. In case the FORMAT column does not exist, ``record.FORMAT`` is parsed sample column and ``record.genotype`` is a way of looking up genotypes by sample name:: - >>> record = vcf_reader.next() + >>> record = next(vcf_reader) >>> for sample in record.samples: ... print sample['GT'] 0|0 @@ -135,15 +135,14 @@ For example:: ALT records are actually classes, so that you can interrogate them:: >>> reader = vcf.Reader(open('vcf/test/example-4.1-bnd.vcf')) - >>> _ = reader.next(); row = reader.next() + >>> _ = next(reader); row = next(reader) >>> print row Record(CHROM=1, POS=2, REF=T, ALT=[T[2:3[]) >>> bnd = row.ALT[0] >>> print bnd.withinMainAssembly, bnd.orientation, bnd.remoteOrientation, bnd.connectingSequence True False True T -Random access is supported for files with tabix indexes. Simply call fetch for the -region you are interested in:: +Random access is supported for files with tabix indexes. This requires the pysam module as a dependency. 
Simply call fetch for the region you are interested in:: >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz') >>> for record in vcf_reader.fetch('20', 1110696, 1230237): # doctest: +SKIP From 0a237900e12188d23385323f6ac5b1ad3d3eeeb7 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 22 Dec 2015 18:18:50 +0100 Subject: [PATCH 134/168] Test more Python versions on Travis CI --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1fdfd54..b44e02c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,8 +6,11 @@ python: - "3.2" - "3.3" - "3.4" + - "3.5" + - "nightly" - "pypy" + - "pypy3" install: - - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install -r requirements/python2.6-requirements.txt; elif [[ $TRAVIS_PYTHON_VERSION == 'pypy' ]]; then pip install -r requirements/pypy-requirements.txt; else pip install -r requirements/common-requirements.txt; fi" + - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install -r requirements/python2.6-requirements.txt; elif [[ $TRAVIS_PYTHON_VERSION == 'pypy' ]] || [[ $TRAVIS_PYTHON_VERSION == 'pypy3' ]]; then pip install -r requirements/pypy-requirements.txt; else pip install -r requirements/common-requirements.txt; fi" - python setup.py install script: python setup.py test From 7448188c3ce7f5ab384d2786b06d9e2d28e93aa9 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 22 Dec 2015 18:20:54 +0100 Subject: [PATCH 135/168] Enable containerized builds on Travis CI --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index b44e02c..fc795a3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,6 @@ # Validate this file using http://lint.travis-ci.org/ language: python +sudo: false python: - "2.6" - "2.7" From 3385c4cc931b75e97eca2708f76a5211ceac37ff Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Tue, 22 Dec 2015 18:24:05 +0100 Subject: [PATCH 136/168] Enable pip caching on Travis CI --- .travis.yml | 3 +++ 
1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index fc795a3..658f857 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,9 @@ # Validate this file using http://lint.travis-ci.org/ language: python sudo: false +cache: + directories: + - $HOME/.cache/pip python: - "2.6" - "2.7" From 1d9b4d6681874525ceb4b8fc1688088eeae960e3 Mon Sep 17 00:00:00 2001 From: Kaarel Date: Wed, 23 Dec 2015 00:15:31 +0200 Subject: [PATCH 137/168] Update utils.py --- vcf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/utils.py b/vcf/utils.py index 456e5fa..2881dc2 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -28,7 +28,7 @@ def walk_together(*readers, **kwargs): nexts = [] for reader in readers: try: - nexts.append(reader.next()) + nexts.append(next(reader)) except StopIteration: nexts.append(None) From 7b298a7dfa18ffcda1c1d0f2cbfbeb0323795370 Mon Sep 17 00:00:00 2001 From: Kaarel Date: Wed, 23 Dec 2015 00:16:51 +0200 Subject: [PATCH 138/168] Update parser.py --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index a5625d7..e76150a 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -314,7 +314,7 @@ def _parse_metainfo(self): parser = _vcf_metadata_parser() - line = self.reader.next() + line = next(self.reader) while line.startswith('##'): self._header_lines.append(line) From f7119cd0603ab065b3c6703350629c5402350f56 Mon Sep 17 00:00:00 2001 From: Kaarel Date: Wed, 23 Dec 2015 00:18:05 +0200 Subject: [PATCH 139/168] Update test_vcf.py --- vcf/test/test_vcf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 8ad3c03..10b6c04 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -234,12 +234,12 @@ class TestIssue214(unittest.TestCase): def test_issue_214_is_snp(self): reader=vcf.Reader(fh('issue-214.vcf')) - r=reader.next() + r=next(reader) self.assertTrue(r.is_snp) def 
test_issue_214_var_type(self): reader=vcf.Reader(fh('issue-214.vcf')) - r=reader.next() + r=next(reader) self.assertEqual(r.var_type,'snp') # Can the ref even be a spanning deletion? From 4733c85d2d85a9c8e36722affbcd1466fafc1716 Mon Sep 17 00:00:00 2001 From: Kaarel Date: Wed, 23 Dec 2015 00:23:45 +0200 Subject: [PATCH 140/168] Update __init__.py --- vcf/__init__.py | 184 ------------------------------------------------ 1 file changed, 184 deletions(-) diff --git a/vcf/__init__.py b/vcf/__init__.py index 88842b6..c05058f 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -1,189 +1,5 @@ #!/usr/bin/env python -'''A VCFv4.0 and 4.1 parser for Python. -Online version of PyVCF documentation is available at http://pyvcf.rtfd.org/ - -The intent of this module is to mimic the ``csv`` module in the Python stdlib, -as opposed to more flexible serialization formats like JSON or YAML. ``vcf`` -will attempt to parse the content of each record based on the data types -specified in the meta-information lines -- specifically the ##INFO and -##FORMAT lines. If these lines are missing or incomplete, it will check -against the reserved types mentioned in the spec. Failing that, it will just -return strings. - -There main interface is the class: ``Reader``. It takes a file-like -object and acts as a reader:: - - >>> import vcf - >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'r')) - >>> for record in vcf_reader: - ... print record - Record(CHROM=20, POS=14370, REF=G, ALT=[A]) - Record(CHROM=20, POS=17330, REF=T, ALT=[A]) - Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T]) - Record(CHROM=20, POS=1230237, REF=T, ALT=[None]) - Record(CHROM=20, POS=1234567, REF=GTCT, ALT=[G, GTACT]) - - -This produces a great deal of information, but it is conveniently accessed. 
-The attributes of a Record are the 8 fixed fields from the VCF spec:: - - * ``Record.CHROM`` - * ``Record.POS`` - * ``Record.ID`` - * ``Record.REF`` - * ``Record.ALT`` - * ``Record.QUAL`` - * ``Record.FILTER`` - * ``Record.INFO`` - -plus attributes to handle genotype information: - - * ``Record.FORMAT`` - * ``Record.samples`` - * ``Record.genotype`` - -``samples`` and ``genotype``, not being the title of any column, are left lowercase. The format -of the fixed fields is from the spec. Comma-separated lists in the VCF are -converted to lists. In particular, one-entry VCF lists are converted to -one-entry Python lists (see, e.g., ``Record.ALT``). Semicolon-delimited lists -of key=value pairs are converted to Python dictionaries, with flags being given -a ``True`` value. Integers and floats are handled exactly as you'd expect:: - - >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'r')) - >>> record = vcf_reader.next() - >>> print record.POS - 14370 - >>> print record.ALT - [A] - >>> print record.INFO['AF'] - [0.5] - -There are a number of convenience methods and properties for each ``Record`` allowing you to -examine properties of interest:: - - >>> print record.num_called, record.call_rate, record.num_unknown - 3 1.0 0 - >>> print record.num_hom_ref, record.num_het, record.num_hom_alt - 1 1 1 - >>> print record.nucl_diversity, record.aaf, record.heterozygosity - 0.6 [0.5] 0.5 - >>> print record.get_hets() - [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))] - >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion - True False True False - >>> print record.var_type, record.var_subtype - snp ts - >>> print record.is_monomorphic - False - -``record.FORMAT`` will be a string specifying the format of the genotype -fields. In case the FORMAT column does not exist, ``record.FORMAT`` is -``None``. 
Finally, ``record.samples`` is a list of dictionaries containing the -parsed sample column and ``record.genotype`` is a way of looking up genotypes -by sample name:: - - >>> record = vcf_reader.next() - >>> for sample in record.samples: - ... print sample['GT'] - 0|0 - 0|1 - 0/0 - >>> print record.genotype('NA00001')['GT'] - 0|0 - -The genotypes are represented by ``Call`` objects, which have three attributes: the -corresponding Record ``site``, the sample name in ``sample`` and a dictionary of -call data in ``data``:: - - >>> call = record.genotype('NA00001') - >>> print call.site - Record(CHROM=20, POS=17330, REF=T, ALT=[A]) - >>> print call.sample - NA00001 - >>> print call.data - CallData(GT=0|0, GQ=49, DP=3, HQ=[58, 50]) - -Please note that as of release 0.4.0, attributes known to have single values (such as -``DP`` and ``GQ`` above) are returned as values. Other attributes are returned -as lists (such as ``HQ`` above). - -There are also a number of methods:: - - >>> print call.called, call.gt_type, call.gt_bases, call.phased - True 0 T|T True - -Metadata regarding the VCF file itself can be investigated through the -following attributes: - - * ``Reader.metadata`` - * ``Reader.infos`` - * ``Reader.filters`` - * ``Reader.formats`` - * ``Reader.samples`` - -For example:: - - >>> vcf_reader.metadata['fileDate'] - '20090805' - >>> vcf_reader.samples - ['NA00001', 'NA00002', 'NA00003'] - >>> vcf_reader.filters - OrderedDict([('q10', Filter(id='q10', desc='Quality below 10')), ('s50', Filter(id='s50', desc='Less than 50% of samples have data'))]) - >>> vcf_reader.infos['AA'].desc - 'Ancestral Allele' - -ALT records are actually classes, so that you can interrogate them:: - - >>> reader = vcf.Reader(open('vcf/test/example-4.1-bnd.vcf')) - >>> _ = reader.next(); row = reader.next() - >>> print row - Record(CHROM=1, POS=2, REF=T, ALT=[T[2:3[]) - >>> bnd = row.ALT[0] - >>> print bnd.withinMainAssembly, bnd.orientation, bnd.remoteOrientation, bnd.connectingSequence - 
True False True T - -The Reader supports retrieval of records within designated regions for -files with tabix indexes via the fetch method. Pass in a chromosome, -and, optionally, start and end coordinates, for the regions of -interest:: - - >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz') - >>> # fetch all records on chromosome 20 from base 1110696 through 1230237 - >>> for record in vcf_reader.fetch('20', 1110695, 1230237): # doctest: +SKIP - ... print record - Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T]) - Record(CHROM=20, POS=1230237, REF=T, ALT=[None]) - -Note that the start and end coordinates are in the zero-based, half-open -coordinate system, similar to ``_Record.start`` and ``_Record.end``. The -very first base of a chromosome is index 0, and the the region includes -bases up to, but not including the base at the end coordinate. For -example:: - - >>> # fetch all records on chromosome 4 from base 11 through 20 - >>> vcf_reader.fetch('4', 10, 20) # doctest: +SKIP - -would include all records overlapping a 10 base pair region from the -11th base of through the 20th base (which is at index 19) of chromosome -4. It would not include the 21st base (at index 20). (See -http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms for more -information on the zero-based, half-open coordinate system.) - -The ``Writer`` class provides a way of writing a VCF file. Currently, you must specify a -template ``Reader`` which provides the metadata:: - - >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz') - >>> vcf_writer = vcf.Writer(open('/dev/null', 'w'), vcf_reader) - >>> for record in vcf_reader: - ... vcf_writer.write_record(record) - - -An extensible script is available to filter vcf files in vcf_filter.py. VCF filters -declared by other packages will be available for use in this script. Please -see :doc:`FILTERS` for full description. 
- -''' from vcf.parser import Reader, Writer from vcf.parser import VCFReader, VCFWriter from vcf.filters import Base as Filter From 24fc5fe310a01093bc3cc52e32564de5fb128373 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Thu, 24 Dec 2015 11:11:54 +0100 Subject: [PATCH 141/168] More _.next() to next(_) changes --- vcf/parser.py | 4 ++-- vcf/test/test_vcf.py | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index e76150a..2cd8deb 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -347,7 +347,7 @@ def _parse_metainfo(self): self.metadata[key] = [] self.metadata[key].append(val) - line = self.reader.next() + line = next(self.reader) fields = self._row_pattern.split(line[1:]) self._column_headers = fields[:9] @@ -539,7 +539,7 @@ def _parse_alt(self, str): def next(self): '''Return the next record in the file.''' - line = self.reader.next() + line = next(self.reader) row = self._row_pattern.split(line.rstrip()) chrom = row[0] if self._prepend_chr: diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 10b6c04..20b71ad 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -246,16 +246,16 @@ def test_issue_214_var_type(self): # Note, this does not trigger issue 214, but I've added it here for completeness def test_issue_214_ref_is_del_is_snp(self): reader=vcf.Reader(fh('issue-214.vcf')) - reader.next() - r=reader.next() + next(reader) + r=next(reader) self.assertTrue(r.is_snp) # Can the ref even be a spanning deletion? 
# Note, this does not trigger issue 214, but I've added it here for completeness def test_issue_214_ref_is_del_var_type(self): reader=vcf.Reader(fh('issue-214.vcf')) - reader.next() - r=reader.next() + next(reader) + r=next(reader) self.assertEqual(r.var_type,'snp') class Test1kg(unittest.TestCase): @@ -562,7 +562,7 @@ def test_num_calls(self): self.assertEqual(len(var.samples), num_calls) def test_dunder_eq(self): - rec = vcf.Reader(fh('example-4.0.vcf')).next() + rec = next(vcf.Reader(fh('example-4.0.vcf'))) self.assertFalse(rec == None) self.assertFalse(None == rec) @@ -892,7 +892,7 @@ def test_qual(self): def test_info_multiple_values(self): reader = vcf.Reader(fh('example-4.1-info-multiple-values.vcf')) - var = reader.next() + var = next(reader) # check Float type INFO field with multiple values expected = [19.3, 47.4, 14.0] actual = var.INFO['RepeatCopies'] @@ -1149,7 +1149,7 @@ class TestCall(unittest.TestCase): def test_dunder_eq(self): reader = vcf.Reader(fh('example-4.0.vcf')) - var = reader.next() + var = next(reader) example_call = var.samples[0] self.assertFalse(example_call == None) self.assertFalse(None == example_call) @@ -1320,7 +1320,7 @@ def testCLIWithFilter(self): #print(buf.getvalue()) reader = vcf.Reader(buf) self.assertEqual(reader.samples, ['NA00001']) - rec = reader.next() + rec = next(reader) self.assertEqual(len(rec.samples), 1) @unittest.skipUnless(IS_NOT_PYPY, "test broken for PyPy") @@ -1342,7 +1342,7 @@ def testSampleFilterModule(self): # read output reader = vcf.Reader(buf) self.assertEqual(reader.samples, ['NA00001']) - rec = reader.next() + rec = next(reader) self.assertEqual(len(rec.samples), 1) @@ -1401,7 +1401,7 @@ class TestRegression(unittest.TestCase): def test_issue_16(self): reader = vcf.Reader(fh('issue-16.vcf')) - n = reader.next() + n = next(reader) assert n.QUAL == None def test_null_mono(self): @@ -1416,7 +1416,7 @@ def test_null_mono(self): out.seek(0) print(out.getvalue()) p2 = vcf.Reader(out) - rec = p2.next() + 
rec = next(p2) assert rec.samples @@ -1557,7 +1557,7 @@ class TestStrelka(unittest.TestCase): def test_strelka(self): reader = vcf.Reader(fh('strelka.vcf')) - n = reader.next() + n = next(reader) assert n is not None From 95c907ecfa5ac9ba93ff722331cee2d53c4f36d0 Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Thu, 24 Dec 2015 11:14:51 +0100 Subject: [PATCH 142/168] Misc doc updates --- README.rst | 26 ++++++++++++++++-------- docs/API.rst | 14 ++++++------- docs/HISTORY.rst | 51 ++++++++++++++++++++++++------------------------ docs/INTRO.rst | 3 +-- vcf/__init__.py | 6 ++++++ vcf/model.py | 15 +++++++------- 6 files changed, 65 insertions(+), 50 deletions(-) diff --git a/README.rst b/README.rst index aa1e0bd..67b5d1b 100644 --- a/README.rst +++ b/README.rst @@ -58,7 +58,7 @@ a ``True`` value. Integers and floats are handled exactly as you'd expect:: >>> print record.INFO['AF'] [0.5] -There are a number of convienience methods and properties for each ``Record`` allowing you to +There are a number of convenience methods and properties for each ``Record`` allowing you to examine properties of interest:: >>> print record.num_called, record.call_rate, record.num_unknown @@ -142,19 +142,31 @@ ALT records are actually classes, so that you can interrogate them:: >>> print bnd.withinMainAssembly, bnd.orientation, bnd.remoteOrientation, bnd.connectingSequence True False True T -Random access is supported for files with tabix indexes. This requires the pysam module as a dependency. Simply call fetch for the region you are interested in:: +The Reader supports retrieval of records within designated regions for files +with tabix indexes via the fetch method. This requires the pysam module as a +dependency. 
Pass in a chromosome, and, optionally, start and end coordinates, +for the regions of interest:: >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz') - >>> for record in vcf_reader.fetch('20', 1110696, 1230237): # doctest: +SKIP + >>> # fetch all records on chromosome 20 from base 1110696 through 1230237 + >>> for record in vcf_reader.fetch('20', 1110695, 1230237): # doctest: +SKIP ... print record Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T]) Record(CHROM=20, POS=1230237, REF=T, ALT=[None]) -Or extract a single row:: +Note that the start and end coordinates are in the zero-based, half-open +coordinate system, similar to ``_Record.start`` and ``_Record.end``. The very +first base of a chromosome is index 0, and the the region includes bases up +to, but not including the base at the end coordinate. For example:: - >>> print vcf_reader.fetch('20', 1110696) # doctest: +SKIP - Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T]) + >>> # fetch all records on chromosome 4 from base 11 through 20 + >>> vcf_reader.fetch('4', 10, 20) # doctest: +SKIP +would include all records overlapping a 10 base pair region from the 11th base +of through the 20th base (which is at index 19) of chromosome 4. It would not +include the 21st base (at index 20). (See +http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms for more +information on the zero-based, half-open coordinate system.) The ``Writer`` class provides a way of writing a VCF file. Currently, you must specify a template ``Reader`` which provides the metadata:: @@ -164,8 +176,6 @@ template ``Reader`` which provides the metadata:: >>> for record in vcf_reader: ... vcf_writer.write_record(record) - An extensible script is available to filter vcf files in vcf_filter.py. VCF filters declared by other packages will be available for use in this script. Please see :doc:`FILTERS` for full description. 
- diff --git a/docs/API.rst b/docs/API.rst index 7ffc21a..d688893 100644 --- a/docs/API.rst +++ b/docs/API.rst @@ -14,43 +14,43 @@ vcf.Writer :members: vcf.model._Record ------------ +----------------- .. autoclass:: vcf.model._Record :members: vcf.model._Call ---------- +--------------- .. autoclass:: vcf.model._Call :members: vcf.model._AltRecord ------------ +-------------------- .. autoclass:: vcf.model._AltRecord :members: vcf.model._Substitution ------------ +----------------------- .. autoclass:: vcf.model._Substitution :members: vcf.model._SV ------------ +------------- .. autoclass:: vcf.model._SV :members: vcf.model._SingleBreakend ------------ +------------------------- .. autoclass:: vcf.model._SingleBreakend :members: vcf.model._Breakend ------------ +------------------- .. autoclass:: vcf.parser._Breakend :members: diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst index defff0d..8a97d8d 100644 --- a/docs/HISTORY.rst +++ b/docs/HISTORY.rst @@ -2,7 +2,7 @@ Development =========== Please use the `PyVCF repository `_. -Pull requests gladly accepted. +Pull requests gladly accepted. Issues should be reported at the github issue tracker. Running tests @@ -10,7 +10,7 @@ Running tests Please check the tests by running them with:: - python setup.py test + python setup.py test New features should have test code sent with them. 
@@ -20,7 +20,7 @@ Changes 0.6.7 Release ------------- -* Include missing .pyx files +* Include missing .pyx files 0.6.6 Release ------------- @@ -56,17 +56,17 @@ Changes ------------- * cython port of #79 -* correct writing of meta lines #84 +* correct writing of meta lines #84 0.6.2 Release ------------- -* issues #78, #79 (thanks Sean, Brad) +* issues #78, #79 (thanks Sean, Brad) 0.6.1 Release ------------- -* Add strict whitespace mode for well formed VCFs with spaces +* Add strict whitespace mode for well formed VCFs with spaces in sample names (thanks Marco) * Ignore blank lines in files (thanks Martijn) * Tweaks for handling missing data (thanks Sean) @@ -76,9 +76,9 @@ Changes 0.6.0 Release ------------- -* Backwards incompatible change: _Call.data is now a +* Backwards incompatible change: _Call.data is now a namedtuple (previously it was a dict) -* Optional cython version, much improved performance. +* Optional cython version, much improved performance. * Improvements to writer (thanks @cmclean) * Improvements to inheritance of classes (thanks @lennax) @@ -86,14 +86,14 @@ Changes 0.5.0 Release ------------- -VCF 4.1 support: - * support missing genotype #28 (thanks @martijnvermaat) - * parseALT for svs #42, #48 (thanks @dzerbino) +* VCF 4.1 support: + - support missing genotype #28 (thanks @martijnvermaat) + - parseALT for svs #42, #48 (thanks @dzerbino) * `trim_common_suffix` method #22 (thanks @martijnvermaat) * Multiple metadata with the same key is stored (#52) -Writer improvements - * A/G in Number INFO fields #53 (thanks @lennax) - * Better output #55 (thanks @cmclean) +* Writer improvements: + - A/G in Number INFO fields #53 (thanks @lennax) + - Better output #55 (thanks @cmclean) * Allow malformed INFO fields #49 (thanks @ilyaminkin) * Added bayes factor error bias VCF filter * Added docs on vcf_melt @@ -103,14 +103,14 @@ Writer improvements 0.4.6 Release ------------- -* Performance improvements (#47) +* Performance improvements (#47) * Preserve 
order of INFO column (#46) 0.4.5 Release ------------- -* Support exponent syntax qual values (#43, #44) (thanks @martijnvermaat) -* Preserve order of header lines (#45) +* Support exponent syntax qual values (#43, #44) (thanks @martijnvermaat) +* Preserve order of header lines (#45) 0.4.4 Release ------------- @@ -139,15 +139,15 @@ Writer improvements 0.4.0 Release ------------- -* Package structure +* Package structure * add ``vcf.utils`` module with ``walk_together`` method -* samtools tests +* samtools tests * support Freebayes' non standard '.' for no call -* fix vcf_melt +* fix vcf_melt * support monomorphic sites, add ``is_monomorphic`` method, handle null QUALs -* filter support for files with monomorphic calls +* filter support for files with monomorphic calls * Values declared as single are no-longer returned in lists -* several performance improvements +* several performance improvements 0.3.0 Release @@ -170,14 +170,14 @@ Documentation release * Add shebang to vcf_filter.py -0.2 Release +0.2 Release ----------- * Replace genotype dictionary with a ``Call`` object * Methods on ``Record`` and ``Call`` (thanks @arq5x) * Shortcut parse_sample when genotype is None -0.1 Release +0.1 Release ----------- * Added test code @@ -188,7 +188,7 @@ Documentation release * Allow opening by filename as well as filesocket * Support fetching rows for tabixed indexed files * Performance improvements (see ``test/prof.py``) -* Added extensible filter script (see FILTERS.md), vcf_filter.py +* Added extensible filter script (see FILTERS.md), vcf_filter.py Contributions ============= @@ -197,4 +197,3 @@ Project started by @jdoughertyii and taken over by @jamescasbon on 12th January Contributions from @arq5x, @brentp, @martijnvermaat, @ian1roberts, @marcelm. This project was supported by `Population Genetics `_. - diff --git a/docs/INTRO.rst b/docs/INTRO.rst index b61e9a9..2b9a587 100644 --- a/docs/INTRO.rst +++ b/docs/INTRO.rst @@ -1,5 +1,4 @@ Introduction ============ -.. 
automodule:: vcf - +.. include:: ../README.rst diff --git a/vcf/__init__.py b/vcf/__init__.py index c05058f..75bee03 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -1,4 +1,10 @@ #!/usr/bin/env python +""" +A VCFv4.0 and 4.1 parser for Python. + +Online version of PyVCF documentation is available at http://pyvcf.rtfd.org/ +""" + from vcf.parser import Reader, Writer from vcf.parser import VCFReader, VCFWriter diff --git a/vcf/model.py b/vcf/model.py index 33c77b2..ef1edb7 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -330,7 +330,7 @@ def nucl_diversity(self): Derived from: \"Population Genetics: A Concise Guide, 2nd ed., p.45\" - John Gillespie. + John Gillespie. """ # skip if more than one alternate allele. assumes bi-allelic if len(self.ALT) > 1: @@ -467,13 +467,14 @@ def var_type(self): def var_subtype(self): """ Return the subtype of variant. + - For SNPs and INDELs, yeild one of: [ts, tv, ins, del] - - For SVs yield either "complex" or the SV type defined - in the ALT fields (removing the brackets). - E.g.: - -> DEL - -> INS:ME:L1 - -> DUP + - For SVs yield either "complex" or the SV type defined in the ALT + fields (removing the brackets). E.g.:: + + -> DEL + -> INS:ME:L1 + -> DUP The logic is meant to follow the rules outlined in the following paragraph at: From bfcedb9bad1a14074ac4526ffdb610611e073810 Mon Sep 17 00:00:00 2001 From: James Casbon Date: Fri, 18 Mar 2016 16:21:45 +0000 Subject: [PATCH 143/168] Cut release. 
--- setup.py | 2 +- vcf/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a266207..d8089c0 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,7 @@ 'Programming Language :: Cython', 'Programming Language :: Python', 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.6' + 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.2', diff --git a/vcf/__init__.py b/vcf/__init__.py index 75bee03..e1aae58 100644 --- a/vcf/__init__.py +++ b/vcf/__init__.py @@ -12,4 +12,4 @@ from vcf.parser import RESERVED_INFO, RESERVED_FORMAT from vcf.sample_filter import SampleFilter -VERSION = '0.6.8.dev0' +VERSION = '0.6.8' From 0b24f4d8b74b2f6fa6f29812ceed4f0645726813 Mon Sep 17 00:00:00 2001 From: Juan Medina Date: Wed, 15 Jun 2016 11:22:57 -0400 Subject: [PATCH 144/168] allows to match empty values. Fixes #234 --- vcf/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 2cd8deb..af6aeae 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -83,7 +83,7 @@ def __init__(self): super(_vcf_metadata_parser, self).__init__() self.info_pattern = re.compile(r'''\#\#INFO=< ID=(?P[^,]+),\s* - Number=(?P-?\d+|\.|[AGR]),\s* + Number=(?P(?:(-?\d+|\.|[AGR]))?),\s* Type=(?PInteger|Float|Flag|Character|String),\s* Description="(?P[^"]*)" (?:,\s*Source="(?P[^"]*)")? 
@@ -112,7 +112,7 @@ def __init__(self): def vcf_field_count(self, num_str): """Cast vcf header numbers to integer or None""" - if num_str is None: + if num_str is None or num_str == "": return None elif num_str not in field_counts: # Fixed, specified number From 0da96626427bcc23f280b819dfba28ebe7bab18a Mon Sep 17 00:00:00 2001 From: Juan Medina Date: Wed, 15 Jun 2016 15:43:48 -0400 Subject: [PATCH 145/168] Issue #234: simplifies regex and adds test case --- vcf/parser.py | 4 ++-- vcf/test/test_vcf.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index af6aeae..e23a66b 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -83,7 +83,7 @@ def __init__(self): super(_vcf_metadata_parser, self).__init__() self.info_pattern = re.compile(r'''\#\#INFO=< ID=(?P[^,]+),\s* - Number=(?P(?:(-?\d+|\.|[AGR]))?),\s* + Number=(?P-?\d+|\.|[AGR])?,\s* Type=(?PInteger|Float|Flag|Character|String),\s* Description="(?P[^"]*)" (?:,\s*Source="(?P[^"]*)")? 
@@ -112,7 +112,7 @@ def __init__(self): def vcf_field_count(self, num_str): """Cast vcf header numbers to integer or None""" - if num_str is None or num_str == "": + if num_str is None: return None elif num_str not in field_counts: # Fixed, specified number diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 20b71ad..afc90ec 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1271,6 +1271,22 @@ def testFetch(self): pass +class TestIssue234(unittest.TestCase): + """ See https://github.com/jamescasbon/PyVCF/issues/234 """ + + def test_vcf_metadata_parser_doesnt_break_with_empty_number_tags(self): + parser = vcf.parser._vcf_metadata_parser() + num_str = '##INFO= Date: Wed, 15 Jun 2016 15:51:46 -0400 Subject: [PATCH 146/168] Asserts that num attribute is None for #234 --- vcf/test/test_vcf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index afc90ec..2064bf8 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1280,7 +1280,8 @@ def test_vcf_metadata_parser_doesnt_break_with_empty_number_tags(self): num_str += 'could not be annotated to a coding region of a transcript ' num_str += 'using the supplied bed file">' try: - parser.read_info(num_str) + info = parser.read_info(num_str)[1] + self.assertIsNone(info.num) except SyntaxError: msg = "vcf.parser._vcf_metadata_parser shouldn't raise SyntaxError" msg += " if Number tag is empty." 
From 21a52d2d8f71516b02c3e119d745dfd8f097f02b Mon Sep 17 00:00:00 2001 From: rwness Date: Tue, 28 Jun 2016 23:16:55 -0400 Subject: [PATCH 147/168] Update parser.py I think this was a typo and gives a misleading error message --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index e23a66b..cff1adf 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -151,7 +151,7 @@ def read_alt(self, alt_string): match = self.alt_pattern.match(alt_string) if not match: raise SyntaxError( - "One of the FILTER lines is malformed: %s" % alt_string) + "One of the ALT lines is malformed: %s" % alt_string) alt = _Alt(match.group('id'), match.group('desc')) From f40482793fdbc9034cb905bef4aef8464ce02dbb Mon Sep 17 00:00:00 2001 From: bow Date: Thu, 21 Jul 2016 19:03:02 +0200 Subject: [PATCH 148/168] Fixes for GitHub issue #243 --- vcf/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index ef1edb7..5adf2b7 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -23,7 +23,7 @@ def __init__(self, site, sample, data): #: Namedtuple of data from the VCF file self.data = data - if hasattr(self.data, 'GT'): + if getattr(self.data, 'GT', None) is not None: self.gt_alleles = [(al if al != '.' 
else None) for al in allele_delimiter.split(self.data.GT)] self.ploidity = len(self.gt_alleles) self.called = all([al != None for al in self.gt_alleles]) @@ -279,7 +279,7 @@ def genotype(self, name): @property def num_called(self): """ The number of called samples""" - return sum(s.called for s in self.samples) + return sum(1 for s in self.samples if s.called) @property def call_rate(self): From c6ee46e1a81de389bca0b5b6791c10867efd2241 Mon Sep 17 00:00:00 2001 From: trijntje Date: Sun, 31 Jul 2016 12:17:54 +0200 Subject: [PATCH 149/168] Moved code into Reader._parse_filter so it can be reused for FT --- vcf/parser.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index cff1adf..9e40474 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -359,6 +359,15 @@ def _map(self, func, iterable, bad='.'): return [func(x) if x != bad else None for x in iterable] + def _parse_filter(self, filt_str): + '''Parse the FILTER field of a VCF entry into a Python list''' + if filt_str == '.': + return None + elif filt_str == 'PASS': + return [] + else: + return filt_str.split(';') + def _parse_info(self, info_str): '''Parse the INFO field of a VCF entry into a dictionary of Python types. 
@@ -562,13 +571,7 @@ def next(self): except ValueError: qual = None - filt = row[6] - if filt == '.': - filt = None - elif filt == 'PASS': - filt = [] - else: - filt = filt.split(';') + filt = self._parse_filter(row[6]) info = self._parse_info(row[7]) try: From d703113b98e0988b0502369e4e40938cb4516d41 Mon Sep 17 00:00:00 2001 From: trijntje Date: Sun, 31 Jul 2016 12:35:11 +0200 Subject: [PATCH 150/168] Added example file and 2 testcases --- vcf/test/FT.vcf | 50 ++++++++++++++++++++++++++++++++++++++++++++ vcf/test/test_vcf.py | 32 ++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 vcf/test/FT.vcf diff --git a/vcf/test/FT.vcf b/vcf/test/FT.vcf new file mode 100644 index 0000000..e42436a --- /dev/null +++ b/vcf/test/FT.vcf @@ -0,0 +1,50 @@ +##fileformat=VCFv4.2 +##ALT= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine.VariantFiltration= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##reference=file://../ref.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 +ref 63393 . C A 29454.60 . AC=5;AF=1.00;AN=5;DP=719;FS=0.000;MLEAC=5;MLEAF=1.00;MQ=60.00;QD=29.67;SOR=0.965 GT:AD:DP:GQ:PL 1:0,166:166:99:6740,0 1:0,142:142:99:5824,0 1:0,134:134:99:5616,0 1:0,122:122:99:4930,0 1:0,155:155:99:6371,0 +ref 65903 . AATTGCGCTG A 7340.57 PASS AC=1;AF=0.200;AN=5;DP=524;FS=0.000;MLEAC=1;MLEAF=0.200;MQ=60.00;QD=34.04;SOR=1.091 GT:AD:DP:FT:GQ:PL 1:0,164:164:PASS:99:7383,0 0:95,0:95:DP125;DP130:99:0,1800 0:88,0:88:DP125;DP130:99:0,1800 0:87,0:87:DP125;DP130:99:0,1800 0:89,0:89:DP125;DP130:99:0,1800 +ref 70837 . 
C A 4711.61 Q4800;Q5000 AC=1;AF=0.200;AN=5;DP=512;FS=0.000;MLEAC=1;MLEAF=0.200;MQ=60.00;QD=27.64;SOR=0.726 GT:AD:DP:FT:GQ:PL 0:121,0:121:DP125;DP130:99:0,1800 0:95,0:95:DP125;DP130:99:0,1800 1:0,120:120:DP125;DP130:99:4745,0 0:87,0:87:DP125;DP130:99:0,1800 0:89,0:89:DP125;DP130:99:0,1800 +ref 71448 . C T 31134.60 PASS AC=5;AF=1.00;AN=5;BaseQRankSum=2.22;ClippingRankSum=0.00;DP=768;FS=0.000;MLEAC=5;MLEAF=1.00;MQ=60.00;MQRankSum=0.00;QD=29.43;ReadPosRankSum=2.03;SOR=0.295 GT:AD:DP:FT:GQ:PL 1:0,147:147:PASS:99:5996,0 1:1,183:184:PASS:99:7501,0 1:0,113:113:DP125;DP130:99:4559,0 1:0,161:161:PASS:99:6436,0 1:0,163:163:PASS:99:6669,0 +ref 104257 . C T 5521.61 PASS AC=1;AF=0.200;AN=5;DP=506;FS=0.000;MLEAC=1;MLEAF=0.200;MQ=60.00;QD=29.45;SOR=0.854 GT:AD:DP:FT:GQ:PL 0:101,0:101:DP125;DP130:99:0,1800 0:109,0:109:DP125;DP130:99:0,1800 1:0,132:132:PASS:99:5555,0 0:67,0:67:DP125;DP130:99:0,1800 0:97,0:97:DP125;DP130:99:0,1800 +ref 140658 . C A 32467.60 PASS AC=5;AF=1.00;AN=5;BaseQRankSum=2.24;ClippingRankSum=0.00;DP=801;FS=0.000;MLEAC=5;MLEAF=1.00;MQ=60.00;MQRankSum=0.00;QD=29.65;ReadPosRankSum=1.27;SOR=0.346 GT:AD:DP:GQ:PL 1:0,170:170:99:6854,0 1:0,198:198:99:8098,0 1:0,136:136:99:5554,0 1:0,141:141:99:5661,0 1:1,155:156:99:6327,0 +ref 147463 . C A 4885.61 Q5000 AC=1;AF=0.200;AN=5;BaseQRankSum=-7.720e-01;ClippingRankSum=0.00;DP=503;FS=0.000;MLEAC=1;MLEAF=0.200;MQ=60.00;MQRankSum=0.00;QD=35.03;ReadPosRankSum=-6.950e-01;SOR=0.278 GT:AD:DP:FT:GQ:PL 0:97,0:97:DP125;DP130:99:0,1800 0:104,0:104:DP125;DP130:99:0,1800 0:84,0:84:DP125;DP130:99:0,1800 1:1,128:129:DP130:99:4919,0 0:89,0:89:DP125;DP130:99:0,1800 +ref 154578 . A G 32015.60 PASS AC=5;AF=1.00;AN=5;DP=776;FS=0.000;MLEAC=5;MLEAF=1.00;MQ=60.00;QD=25.82;SOR=0.902 GT:AD:DP:GQ:PL 1:0,152:152:99:6300,0 1:0,183:183:99:7608,0 1:0,137:137:99:5713,0 1:0,148:148:99:6040,0 1:0,156:156:99:6381,0 +ref 203200 . 
C T 30880.60 PASS AC=5;AF=1.00;AN=5;DP=752;FS=0.000;MLEAC=5;MLEAF=1.00;MQ=60.00;QD=29.65;SOR=0.878 GT:AD:DP:FT:GQ:PL 1:0,161:161:PASS:99:6708,0 1:0,185:185:PASS:99:7602,0 1:0,136:136:PASS:99:5602,0 1:0,126:126:DP130:99:5080,0 1:0,144:144:PASS:99:5915,0 +ref 231665 . C T 30074.60 PASS AC=5;AF=1.00;AN=5;DP=735;FS=0.000;MLEAC=5;MLEAF=1.00;MQ=60.00;QD=33.23;SOR=0.938 GT:AD:DP:FT:GQ:PL 1:0,191:191:PASS:99:7867,0 1:0,159:159:PASS:99:6431,0 1:0,130:130:PASS:99:5299,0 1:0,129:129:DP130:99:5290,0 1:0,126:126:DP130:99:5214,0 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 2064bf8..b0c3ef8 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1288,6 +1288,37 @@ def test_vcf_metadata_parser_doesnt_break_with_empty_number_tags(self): self.fail(msg) +class TestIssue246(unittest.TestCase): + """ See https://github.com/jamescasbon/PyVCF/issues/246 """ + + def test_FT_pass_two(self): + reader=vcf.Reader(fh('FT.vcf')) + next(reader) + r=next(reader) + target=[ + [], + ['DP125','DP130'], + ['DP125','DP130'], + ['DP125','DP130'], + ['DP125','DP130'] + ] + result=[call.data.FT for call in r.samples] + self.assertEqual(target,result) + + def test_FT_one_two(self): + reader=list(vcf.Reader(fh('FT.vcf'))) + r=reader[6] + target=[ + ['DP125','DP130'], + ['DP125','DP130'], + ['DP125','DP130'], + ['DP130'], + ['DP125','DP130'] + ] + result=[call.data.FT for call in r.samples] + self.assertEqual(target,result) + + class TestOpenMethods(unittest.TestCase): samples = 'NA00001 NA00002 NA00003'.split() @@ -1602,6 +1633,7 @@ def test_strelka(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFetch)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue201)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue234)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue246)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) 
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) From 2f5777efd5eedd3f534f5c4037fdf1c6416efde4 Mon Sep 17 00:00:00 2001 From: trijntje Date: Sun, 31 Jul 2016 12:54:05 +0200 Subject: [PATCH 151/168] Added FT as a special case when parsing format --- vcf/cparse.pyx | 17 +++++++++++++++++ vcf/parser.py | 10 +++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index 8a71d64..d6a93ae 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -9,6 +9,19 @@ INTEGER = 'Integer' FLOAT = 'Float' NUMERIC = 'Numeric' +def _parse_filter(filt_str): + '''Parse the FILTER field of a VCF entry into a Python list + + NOTE: this method has a python equivalent and care must be taken + to keep the two methods equivalent + ''' + if filt_str == '.': + return None + elif filt_str == 'PASS': + return [] + else: + return filt_str.split(';') + def parse_samples( list names, list samples, samp_fmt, list samp_fmt_types, list samp_fmt_nums, site): @@ -39,6 +52,10 @@ def parse_samples( if samp_fmt._fields[j] == 'GT': sampdat[j] = vals continue + # genotype filters are a special case + elif samp_fmt._fields[j] == 'FT': + sampdat[j] = _parse_filter(vals) + continue elif not vals or vals == '.': sampdat[j] = None continue diff --git a/vcf/parser.py b/vcf/parser.py index 9e40474..5e7816b 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -360,7 +360,11 @@ def _map(self, func, iterable, bad='.'): for x in iterable] def _parse_filter(self, filt_str): - '''Parse the FILTER field of a VCF entry into a Python list''' + '''Parse the FILTER field of a VCF entry into a Python list + + NOTE: this method has a cython equivalent and care must be taken + to keep the two methods equivalent + ''' if filt_str == '.': return None elif filt_str == 'PASS': @@ -475,6 +479,10 @@ def _parse_samples(self, samples, samp_fmt, site): if samp_fmt._fields[i] == 'GT': sampdat[i] = 
vals continue + # genotype filters are a special case + elif samp_fmt._fields[i] == 'FT': + sampdat[i] = self._parse_filter(vals) + continue elif not vals or vals == ".": sampdat[i] = None continue From 7d675ad367414acd255830764325cbfd715c3d1b Mon Sep 17 00:00:00 2001 From: Redmar Date: Mon, 1 Aug 2016 13:33:50 +0200 Subject: [PATCH 152/168] Implemented is_filt for _Call and _Record --- vcf/model.py | 29 +++++++++++++++++++++++++++++ vcf/test/test_vcf.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/vcf/model.py b/vcf/model.py index 5adf2b7..535286c 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -117,6 +117,22 @@ def is_het(self): return None return self.gt_type == 1 + @property + def is_filt(self): + """ Return True for filtered calls """ + try: # no FT annotation present for this variant + FT=self.data.FT + except AttributeError: + return False + if FT == None or FT == []: # FT is not set or set to PASS + return False + elif len(FT) > 0: # FT contains one or more filters + return True + else: # This should not happen + raise RuntimeError( + "Parsing error for FT annotation in {}, "\ + "please file a bug".format(self)) + class _Record(object): """ A set of calls at a site. Equivalent to a row in a VCF file. 
@@ -536,6 +552,19 @@ def is_monomorphic(self): """ Return True for reference calls """ return len(self.ALT) == 1 and self.ALT[0] is None + @property + def is_filt(self,call=None): + """ Return True if a variant has been filtered """ + FT=self.FILTER + if FT == None or FT == []: # FT is not set or set to PASS + return False + elif len(FT) > 0: # FT contains one or more filters + return True + else: # This should not happen + raise RuntimeError( + "Parsing error for FILTER annotation in {}, "\ + "please file a bug".format(self)) + class _AltRecord(object): '''An alternative allele record: either replacement string, SV placeholder, or breakend''' diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index b0c3ef8..4e62acb 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1319,6 +1319,41 @@ def test_FT_one_two(self): self.assertEqual(target,result) +class TestIsFilt(unittest.TestCase): + """ Test is_filt property for _Call and _Record """ + + def test_is_filt_record(self): + reader = vcf.Reader(fh('FT.vcf')) + target = [ + False, False, True, False, False, + False, True, False, False, False + ] + result = [record.is_filt for record in reader] + self.assertEqual(target,result) + + def test_is_filt_call_unset(self): + reader = vcf.Reader(fh('FT.vcf')) + record = next(reader) + target = [False]*5 + result = [call.is_filt for call in record] + self.assertEqual(target,result) + + def test_is_filt_call_pass_two(self): + reader = vcf.Reader(fh('FT.vcf')) + next(reader) + record = next(reader) + target = [False, True, True, True, True] + result = [call.is_filt for call in record] + self.assertEqual(target,result) + + def test_is_filt_call_one(self): + reader = list(vcf.Reader(fh('FT.vcf'))) + record = reader[6] + target = [True]*5 + result = [call.is_filt for call in record] + self.assertEqual(target,result) + + class TestOpenMethods(unittest.TestCase): samples = 'NA00001 NA00002 NA00003'.split() @@ -1634,6 +1669,7 @@ def test_strelka(self): 
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue201)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue234)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue246)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIsFilt)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) From 301f8fe4921c7cef3f4f1886fe4fba3fa693bbd6 Mon Sep 17 00:00:00 2001 From: Redmar Date: Sat, 13 Aug 2016 11:41:32 +0200 Subject: [PATCH 153/168] Minor code cleanup --- vcf/model.py | 24 ++++++++---------------- vcf/test/test_vcf.py | 14 +++++++------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 535286c..e6a8339 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -118,20 +118,16 @@ def is_het(self): return self.gt_type == 1 @property - def is_filt(self): + def is_filtered(self): """ Return True for filtered calls """ try: # no FT annotation present for this variant - FT=self.data.FT + filt = self.data.FT except AttributeError: return False - if FT == None or FT == []: # FT is not set or set to PASS + if filt is None or len(filt) == 0: # FT is not set or set to PASS return False - elif len(FT) > 0: # FT contains one or more filters + else: return True - else: # This should not happen - raise RuntimeError( - "Parsing error for FT annotation in {}, "\ - "please file a bug".format(self)) class _Record(object): @@ -553,17 +549,13 @@ def is_monomorphic(self): return len(self.ALT) == 1 and self.ALT[0] is None @property - def is_filt(self,call=None): + def is_filtered(self): """ Return True if a variant has been filtered """ - FT=self.FILTER - if FT == None or FT == []: # FT is not set or set to PASS + filt = self.FILTER + if filt is None or len(filt) == 0: # FILTER is not set or set to PASS return False - elif 
len(FT) > 0: # FT contains one or more filters + else: return True - else: # This should not happen - raise RuntimeError( - "Parsing error for FILTER annotation in {}, "\ - "please file a bug".format(self)) class _AltRecord(object): diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 4e62acb..a21b588 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1319,8 +1319,8 @@ def test_FT_one_two(self): self.assertEqual(target,result) -class TestIsFilt(unittest.TestCase): - """ Test is_filt property for _Call and _Record """ +class TestIsFiltered(unittest.TestCase): + """ Test is_filtered property for _Call and _Record """ def test_is_filt_record(self): reader = vcf.Reader(fh('FT.vcf')) @@ -1328,14 +1328,14 @@ def test_is_filt_record(self): False, False, True, False, False, False, True, False, False, False ] - result = [record.is_filt for record in reader] + result = [record.is_filtered for record in reader] self.assertEqual(target,result) def test_is_filt_call_unset(self): reader = vcf.Reader(fh('FT.vcf')) record = next(reader) target = [False]*5 - result = [call.is_filt for call in record] + result = [call.is_filtered for call in record] self.assertEqual(target,result) def test_is_filt_call_pass_two(self): @@ -1343,14 +1343,14 @@ def test_is_filt_call_pass_two(self): next(reader) record = next(reader) target = [False, True, True, True, True] - result = [call.is_filt for call in record] + result = [call.is_filtered for call in record] self.assertEqual(target,result) def test_is_filt_call_one(self): reader = list(vcf.Reader(fh('FT.vcf'))) record = reader[6] target = [True]*5 - result = [call.is_filt for call in record] + result = [call.is_filtered for call in record] self.assertEqual(target,result) @@ -1669,7 +1669,7 @@ def test_strelka(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue201)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue234)) 
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue246)) -suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIsFilt)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIsFiltered)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) From de3e2e9cb25511f6cd6dcd3a4e04e89b4f0b81d1 Mon Sep 17 00:00:00 2001 From: Redmar Date: Mon, 22 Aug 2016 11:21:51 +0200 Subject: [PATCH 154/168] Added support for writing FT annotations --- vcf/parser.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 5e7816b..029b18a 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -753,10 +753,26 @@ def _format_sample(self, fmt, sample): gt = './.' if 'GT' in fmt else '' if not gt: - return ':'.join([self._stringify(x) for x in sample.data]) + result = [] + for field in sample.data._fields: + value = getattr(sample.data,field) + if field == 'FT': + result.append(self._format_filter(value)) + else: + result.append(self._stringify(value)) + return ':'.join(result) # Following the VCF spec, GT is always the first item whenever it is present. 
else: - return ':'.join([gt] + [self._stringify(x) for x in sample.data[1:]]) + result = [] + for field in sample.data._fields: + value = getattr(sample.data,field) + if field == 'GT': + continue + if field == 'FT': + result.append(self._format_filter(value)) + else: + result.append(self._stringify(value)) + return ':'.join([gt] + result) def _stringify(self, x, none='.', delim=','): if type(x) == type([]): From ea86881578be594823055f95c16c72390a287a2e Mon Sep 17 00:00:00 2001 From: Redmar Date: Mon, 5 Sep 2016 08:39:08 +0200 Subject: [PATCH 155/168] Removed code duplication --- vcf/parser.py | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 029b18a..fbdaf25 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -752,27 +752,17 @@ def _format_sample(self, fmt, sample): else: gt = './.' if 'GT' in fmt else '' - if not gt: - result = [] - for field in sample.data._fields: - value = getattr(sample.data,field) - if field == 'FT': - result.append(self._format_filter(value)) - else: - result.append(self._stringify(value)) - return ':'.join(result) + result = [gt] if gt else [] # Following the VCF spec, GT is always the first item whenever it is present. 
- else: - result = [] - for field in sample.data._fields: - value = getattr(sample.data,field) - if field == 'GT': - continue - if field == 'FT': - result.append(self._format_filter(value)) - else: - result.append(self._stringify(value)) - return ':'.join([gt] + result) + for field in sample.data._fields: + value = getattr(sample.data,field) + if field == 'GT': + continue + if field == 'FT': + result.append(self._format_filter(value)) + else: + result.append(self._stringify(value)) + return ':'.join(result) def _stringify(self, x, none='.', delim=','): if type(x) == type([]): From abe72f5e4690a7068745cc3d6e8e91a3d720518c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 19 Oct 2016 14:18:28 -0700 Subject: [PATCH 156/168] Fix docstring spelling --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/parser.py b/vcf/parser.py index fbdaf25..00a7666 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -78,7 +78,7 @@ class _vcf_metadata_parser(object): - '''Parse the metadat in the header of a VCF file.''' + '''Parse the metadata in the header of a VCF file.''' def __init__(self): super(_vcf_metadata_parser, self).__init__() self.info_pattern = re.compile(r'''\#\#INFO=< From f4d719fb8584fd91ffb51ba6436baed9e553aec4 Mon Sep 17 00:00:00 2001 From: rwness Date: Wed, 9 Mar 2016 09:40:05 -0500 Subject: [PATCH 157/168] Correct indel definition 1. Corrected RECORD.is_indel to not call reference sites as indels 2. Corrected RECORD.is_deletion to not call reference sites as deletions 3. Added deletion site to vcf/test/example-4.0.vcf 4. Added deletion site to vcf/test/walk_left.vcf 5. Corrected tests to account for site 1230237 not being an indel or deletion 5. 
Added tests for new site 1231234 which is an actual deletion --- vcf/model.py | 4 ++-- vcf/test/example-4.0.vcf | 1 + vcf/test/test_vcf.py | 46 ++++++++++++++++++++++++++++++++++------ vcf/test/walk_left.vcf | 1 + 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index e6a8339..375a3f8 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -401,7 +401,7 @@ def is_indel(self): return True for alt in self.ALT: if alt is None: - return True + return False if alt.type != "SNV" and alt.type != "MNV": return False elif len(alt) != len(self.REF): @@ -452,7 +452,7 @@ def is_deletion(self): # just one alt allele alt_allele = self.ALT[0] if alt_allele is None: - return True + return False if len(self.REF) > len(alt_allele): return True else: diff --git a/vcf/test/example-4.0.vcf b/vcf/test/example-4.0.vcf index 27803a1..97fb07e 100644 --- a/vcf/test/example-4.0.vcf +++ b/vcf/test/example-4.0.vcf @@ -20,4 +20,5 @@ 20 17330 . T A 3.0 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 20 1110696 rs6040355 A G,T 1e+03 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +20 1231234 . AT A 46 PASS NS=3;DP=15;AA=A GT:GQ:DP:HQ 1|1:23:7:26,30 0|0:27:9:56,60 0|0:31:10:65,71 20 1234567 microsat1 GTCT G,GTACT . 
PASS NS=3;DP=9;AA=G GT:GQ:DP ./.:35:4 0/2:17:2 1/1:40:3 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index a21b588..a47f4fa 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -578,6 +578,8 @@ def test_call_rate(self): self.assertEqual(3.0/3.0, call_rate) if var.POS == 1230237: self.assertEqual(3.0/3.0, call_rate) + if var.POS == 1231234: + self.assertEqual(3.0/3.0, call_rate) elif var.POS == 1234567: self.assertEqual(2.0/3.0, call_rate) @@ -593,6 +595,8 @@ def test_aaf(self): self.assertEqual([2.0/6.0, 4.0/6.0], aaf) if var.POS == 1230237: self.assertEqual([0.0/6.0], aaf) + if var.POS == 1231234: + self.assertEqual([2.0/6.0], aaf) elif var.POS == 1234567: self.assertEqual([2.0/4.0, 1.0/4.0], aaf) reader = vcf.Reader(fh('example-4.1-ploidy.vcf')) @@ -615,6 +619,8 @@ def test_pi(self): self.assertEqual(None, pi) if var.POS == 1230237: self.assertEqual(0.0/6.0, pi) + if var.POS == 1231234: + self.assertEqual((6.0/(6.0-1))*(2.0*(1.0/3.0)*(2.0/3.0)) , pi) elif var.POS == 1234567: self.assertEqual(None, pi) @@ -630,6 +636,8 @@ def test_heterozygosity(self): self.assertEqual(4.0/9.0, het) if var.POS == 1230237: self.assertEqual(0.0, het) + if var.POS == 1231234: + self.assertEqual(4.0/9.0, het) elif var.POS == 1234567: self.assertEqual(5.0/8.0, het) @@ -650,6 +658,8 @@ def test_is_snp(self): self.assertEqual(True, is_snp) if var.POS == 1230237: self.assertEqual(False, is_snp) + if var.POS == 1231234: + self.assertEqual(False, is_snp) elif var.POS == 1234567: self.assertEqual(False, is_snp) @@ -682,6 +692,8 @@ def test_is_indel(self): if var.POS == 1110696: self.assertEqual(False, is_indel) if var.POS == 1230237: + self.assertEqual(False, is_indel) + if var.POS == 1231234: self.assertEqual(True, is_indel) elif var.POS == 1234567: self.assertEqual(True, is_indel) @@ -698,6 +710,8 @@ def test_is_transition(self): self.assertEqual(False, is_trans) if var.POS == 1230237: self.assertEqual(False, is_trans) + if var.POS == 1231234: + 
self.assertEqual(False, is_trans) elif var.POS == 1234567: self.assertEqual(False, is_trans) @@ -712,6 +726,8 @@ def test_is_deletion(self): if var.POS == 1110696: self.assertEqual(False, is_del) if var.POS == 1230237: + self.assertEqual(False, is_del) + if var.POS == 1231234: self.assertEqual(True, is_del) elif var.POS == 1234567: self.assertEqual(False, is_del) @@ -727,6 +743,8 @@ def test_var_type(self): if var.POS == 1110696: self.assertEqual("snp", type) if var.POS == 1230237: + self.assertEqual("unknown", type) + if var.POS == 1231234: self.assertEqual("indel", type) elif var.POS == 1234567: self.assertEqual("indel", type) @@ -759,6 +777,8 @@ def test_var_subtype(self): if var.POS == 1110696: self.assertEqual("unknown", subtype) if var.POS == 1230237: + self.assertEqual("unknown", subtype) + if var.POS == 1231234: self.assertEqual("del", subtype) elif var.POS == 1234567: self.assertEqual("unknown", subtype) @@ -807,6 +827,8 @@ def test_is_sv(self): self.assertEqual(False, is_sv) if var.POS == 1230237: self.assertEqual(False, is_sv) + if var.POS == 1231234: + self.assertEqual(False, is_sv) elif var.POS == 1234567: self.assertEqual(False, is_sv) @@ -838,6 +860,8 @@ def test_is_sv_precise(self): self.assertEqual(False, is_precise) if var.POS == 1230237: self.assertEqual(False, is_precise) + if var.POS == 1231234: + self.assertEqual(False, is_precise) elif var.POS == 1234567: self.assertEqual(False, is_precise) @@ -869,6 +893,8 @@ def test_sv_end(self): self.assertEqual(None, sv_end) if var.POS == 1230237: self.assertEqual(None, sv_end) + if var.POS == 1231234: + self.assertEqual(None, sv_end) elif var.POS == 1234567: self.assertEqual(None, sv_end) @@ -885,6 +911,8 @@ def test_qual(self): expected = 1e+03 if var.POS == 1230237: expected = 47 + if var.POS == 1231234: + expected = 46 elif var.POS == 1234567: expected = None self.assertEqual(expected, qual) @@ -1166,6 +1194,8 @@ def test_phased(self): self.assertEqual([True, True, False], phases) if var.POS == 
1230237: self.assertEqual([True, True, False], phases) + if var.POS == 1231234: + self.assertEqual([True, True, True], phases) elif var.POS == 1234567: self.assertEqual([False, False, False], phases) @@ -1181,6 +1211,8 @@ def test_gt_bases(self): self.assertEqual(['G|T', 'T|G', 'T/T'], gt_bases) elif var.POS == 1230237: self.assertEqual(['T|T', 'T|T', 'T/T'], gt_bases) + elif var.POS == 1231234: + self.assertEqual(['A|A', 'AT|AT', 'AT|AT'], gt_bases) elif var.POS == 1234567: self.assertEqual([None, 'GTCT/GTACT', 'G/G'], gt_bases) @@ -1198,6 +1230,8 @@ def test_gt_types(self): self.assertEqual([1,1,2], gt_types) elif var.POS == 1230237: self.assertEqual([0,0,0], gt_types) + elif var.POS == 1231234: + self.assertEqual([2,0,0], gt_types) elif var.POS == 1234567: self.assertEqual([None,1,2], gt_types) @@ -1235,20 +1269,20 @@ def testFetchRange(self): fetched_variants = self.reader.fetch('20', 1110695, 1234567) self.assertFetchedExpectedPositions( - fetched_variants, [1110696, 1230237, 1234567]) + fetched_variants, [1110696, 1230237, 1231234, 1234567]) def testFetchesFromStartIfStartOnlySpecified(self): fetched_variants = self.reader.fetch('20', 1110695) self.assertFetchedExpectedPositions( - fetched_variants, [1110696, 1230237, 1234567]) + fetched_variants, [1110696, 1230237, 1231234, 1234567]) def testFetchesAllFromChromIfOnlyChromSpecified(self): fetched_variants = self.reader.fetch('20') self.assertFetchedExpectedPositions( fetched_variants, - [14370, 17330, 1110696, 1230237, 1234567] + [14370, 17330, 1110696, 1230237, 1231234, 1234567] ) @@ -1517,10 +1551,10 @@ def test_walk(self): self.assertEqual(x[0], x[1]) self.assertEqual(x[1], x[2]) n+= 1 - self.assertEqual(n, 5) + self.assertEqual(n, 6) - # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left - expected = 'llrrttrl' + # artificial case 2 from the left, 2 from the right, 3 together, 1 from the right, 1 from the left + expected = 'llrrtttrl' reader1 = 
vcf.Reader(fh('walk_left.vcf')) reader2 = vcf.Reader(fh('example-4.0.vcf')) diff --git a/vcf/test/walk_left.vcf b/vcf/test/walk_left.vcf index c910432..aafb82b 100644 --- a/vcf/test/walk_left.vcf +++ b/vcf/test/walk_left.vcf @@ -21,4 +21,5 @@ 19 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:65,3 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:65,4 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:65,3 +20 1231234 . AT A 46 PASS NS=3;DP=15;AA=A GT:GQ:DP:HQ 1|1:23:7:26,30 0|0:27:9:56,60 0|0:31:10:65,71 21 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP ./.:35:4 0/2:17:2 1/1:40:3 From 8b54f4e672733a32a019e8509864ff24a3bd6b8b Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Thu, 26 Jan 2017 20:57:09 +0100 Subject: [PATCH 158/168] Undo some collateral damage to tests --- vcf/test/test_vcf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index a47f4fa..ec69920 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1269,20 +1269,20 @@ def testFetchRange(self): fetched_variants = self.reader.fetch('20', 1110695, 1234567) self.assertFetchedExpectedPositions( - fetched_variants, [1110696, 1230237, 1231234, 1234567]) + fetched_variants, [1110696, 1230237, 1234567]) def testFetchesFromStartIfStartOnlySpecified(self): fetched_variants = self.reader.fetch('20', 1110695) self.assertFetchedExpectedPositions( - fetched_variants, [1110696, 1230237, 1231234, 1234567]) + fetched_variants, [1110696, 1230237, 1234567]) def testFetchesAllFromChromIfOnlyChromSpecified(self): fetched_variants = self.reader.fetch('20') self.assertFetchedExpectedPositions( fetched_variants, - [14370, 17330, 1110696, 1230237, 1231234, 1234567] + [14370, 17330, 1110696, 1230237, 1234567] ) @@ -1351,7 +1351,7 @@ def test_FT_one_two(self): ] result=[call.data.FT 
for call in r.samples] self.assertEqual(target,result) - + class TestIsFiltered(unittest.TestCase): """ Test is_filtered property for _Call and _Record """ From 7bf793f9508b37549d466efe0686fa6116873cc6 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Thu, 26 Jan 2017 20:11:37 +0100 Subject: [PATCH 159/168] Always use a list for list-type fields Singleton lists - e.g. Number=A with a single allele - are now parsed into lists instead of being treated as single values. This is more consistent with the meaning of the field definition and thus easier for client code. Fixes #254. --- vcf/cparse.pyx | 8 +------- vcf/parser.py | 8 +------- vcf/test/issue-254.vcf | 9 +++++++++ vcf/test/test_vcf.py | 12 ++++++++++++ 4 files changed, 23 insertions(+), 14 deletions(-) create mode 100644 vcf/test/issue-254.vcf diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index d6a93ae..87f806d 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -65,8 +65,7 @@ def parse_samples( entry_num = samp_fmt_nums[j] # we don't need to split single entries - if entry_num == 1 or ',' not in vals: - + if entry_num == 1: if entry_type == INTEGER: try: sampdat[j] = int(vals) @@ -76,14 +75,9 @@ def parse_samples( sampdat[j] = float(vals) else: sampdat[j] = vals - - if entry_num != 1: - sampdat[j] = (sampdat[j]) - continue vals = vals.split(',') - if entry_type == INTEGER: try: sampdat[j] = _map(int, vals) diff --git a/vcf/parser.py b/vcf/parser.py index 00a7666..bb7c90c 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -491,8 +491,7 @@ def _parse_samples(self, samples, samp_fmt, site): entry_type = samp_fmt._types[i] # we don't need to split single entries - if entry_num == 1 or ',' not in vals: - + if entry_num == 1: if entry_type == 'Integer': try: sampdat[i] = int(vals) @@ -502,14 +501,9 @@ def _parse_samples(self, samples, samp_fmt, site): sampdat[i] = float(vals) else: sampdat[i] = vals - - if entry_num != 1: - sampdat[i] = (sampdat[i]) - continue vals = vals.split(',') - if entry_type == 'Integer': 
try: sampdat[i] = _map(int, vals) diff --git a/vcf/test/issue-254.vcf b/vcf/test/issue-254.vcf new file mode 100644 index 0000000..c17262d --- /dev/null +++ b/vcf/test/issue-254.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +21 4242421 . T A 30 . . GT:AO 0|0:0.1 0|1:0.2 0/0:0.3 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index ec69920..a09b0b9 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1353,6 +1353,17 @@ def test_FT_one_two(self): self.assertEqual(target,result) +class TestIssue254(unittest.TestCase): + """ See https://github.com/jamescasbon/PyVCF/issues/254 """ + + def test_remains_singleton_list(self): + reader = vcf.Reader(fh('issue-254.vcf')) + record = next(reader) + expected = [[0.1], [0.2], [0.3]] + actual = [call.data.AO for call in record.samples] + self.assertEqual(expected, actual) + + class TestIsFiltered(unittest.TestCase): """ Test is_filtered property for _Call and _Record """ @@ -1703,6 +1714,7 @@ def test_strelka(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue201)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue234)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue246)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue254)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIsFiltered)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter)) From 1fc5ca3b467e5c45a49a711d06b2f5308e0b835f Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Thu, 26 Jan 2017 20:22:19 +0100 Subject: [PATCH 160/168] Unify code paths' treatment of "Numeric" type --- vcf/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vcf/parser.py b/vcf/parser.py index bb7c90c..be640f4 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -497,7 +497,7 @@ def _parse_samples(self, samples, samp_fmt, site): sampdat[i] = int(vals) except ValueError: sampdat[i] = float(vals) - elif entry_type == 'Float': + elif entry_type == 'Float' or entry_type == 'Numeric': sampdat[i] = float(vals) else: sampdat[i] = vals From ef406459cccbf63976b937b0a097f2a03dcd19e1 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Sat, 28 Jan 2017 13:01:25 +0100 Subject: [PATCH 161/168] Unify tested versions, including Python 3.5/3.6 and PyPy --- .travis.yml | 1 + setup.py | 4 ++++ tox.ini | 6 +++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 658f857..b346129 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.6" - "nightly" - "pypy" - "pypy3" diff --git a/setup.py b/setup.py index d8089c0..a6e0595 100644 --- a/setup.py +++ b/setup.py @@ -74,6 +74,10 @@ 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Scientific/Engineering :: Bio-Informatics', ], keywords='bioinformatics', diff --git a/tox.ini b/tox.ini index 64a7ab4..394251d 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py32, py33, py34 +envlist = py26, py27, py32, py33, py34, py35, py36, pypy, pypy3 [testenv] deps = @@ -20,3 +20,7 @@ deps = [testenv:pypy] deps = -rrequirements/pypy-requirements.txt + +[testenv:pypy3] +deps = + -rrequirements/pypy-requirements.txt From 60ae36f93ffc82bcaad0ab08f5f07d7ebdfc1201 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Sat, 28 Jan 2017 13:02:09 +0100 Subject: [PATCH 162/168] Fix Tox warning by using clean command instead of rm --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 394251d..fb00aa3 100644 --- a/tox.ini +++ b/tox.ini @@ -10,8 +10,7 @@ envlist = py26, py27, py32, py33, py34, py35, py36, pypy, pypy3 deps = -rrequirements/common-requirements.txt commands = - rm -rf {toxinidir}/build - python setup.py test + python setup.py clean --all test [testenv:py26] deps = From cc005deb32c64b6c7648b39d3e2517c68a215a48 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Sat, 28 Jan 2017 13:04:49 +0100 Subject: [PATCH 163/168] Drop Python 2.6, since PySAM needs sysconfig --- .travis.yml | 3 +-- requirements/python2.6-requirements.txt | 5 ----- setup.py | 8 -------- tox.ini | 6 +----- 4 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 requirements/python2.6-requirements.txt diff --git a/.travis.yml b/.travis.yml index b346129..221bd18 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,6 @@ cache: directories: - $HOME/.cache/pip python: - - "2.6" - "2.7" - "3.2" - "3.3" @@ -16,6 +15,6 @@ python: - "pypy" - "pypy3" install: - - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install -r requirements/python2.6-requirements.txt; elif [[ $TRAVIS_PYTHON_VERSION == 'pypy' ]] || [[ $TRAVIS_PYTHON_VERSION == 'pypy3' ]]; then pip install -r requirements/pypy-requirements.txt; else pip install -r requirements/common-requirements.txt; fi" + - if [[ "$TRAVIS_PYTHON_VERSION" =~ ^pypy ]]; then pip install -r requirements/pypy-requirements.txt; else pip 
install -r requirements/common-requirements.txt; fi - python setup.py install script: python setup.py test diff --git a/requirements/python2.6-requirements.txt b/requirements/python2.6-requirements.txt deleted file mode 100644 index 27c9bc2..0000000 --- a/requirements/python2.6-requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ --r common-requirements.txt -argparse -counter -ordereddict -unittest2 diff --git a/setup.py b/setup.py index a6e0595..0bfd710 100644 --- a/setup.py +++ b/setup.py @@ -8,15 +8,8 @@ except: CYTHON = False -IS_PYTHON26 = sys.version_info[:2] == (2, 6) - DEPENDENCIES = ['setuptools'] -if IS_PYTHON26: - DEPENDENCIES.extend(['argparse', 'counter', 'ordereddict', - 'unittest2']) - - # get the version without an import VERSION = "Undefined" DOC = "" @@ -68,7 +61,6 @@ 'Programming Language :: Cython', 'Programming Language :: Python', 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.2', diff --git a/tox.ini b/tox.ini index fb00aa3..d6a9c09 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py32, py33, py34, py35, py36, pypy, pypy3 +envlist = py27, py32, py33, py34, py35, py36, pypy, pypy3 [testenv] deps = @@ -12,10 +12,6 @@ deps = commands = python setup.py clean --all test -[testenv:py26] -deps = - -rrequirements/python2.6-requirements.txt - [testenv:pypy] deps = -rrequirements/pypy-requirements.txt From d8839579d90c203425097bafb04a4f8fba747307 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Sat, 28 Jan 2017 16:59:11 +0100 Subject: [PATCH 164/168] Drop 3.2/3.3, since PySAM does not build with them --- .travis.yml | 2 -- setup.py | 2 -- tox.ini | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 221bd18..ad315b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,8 +6,6 @@ cache: - $HOME/.cache/pip python: - "2.7" - - "3.2" - - "3.3" - "3.4" - "3.5" - "3.6" diff --git a/setup.py b/setup.py index 0bfd710..b865b8d 100644 --- a/setup.py +++ b/setup.py @@ -63,8 +63,6 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', diff --git a/tox.ini b/tox.ini index d6a9c09..af7049e 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py27, py32, py33, py34, py35, py36, pypy, pypy3 +envlist = py27, py34, py35, py36, pypy, pypy3 [testenv] deps = From a9b1731ac555b32e0f146be46789c3c8668a4903 Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 1 Feb 2017 14:32:33 +0200 Subject: [PATCH 165/168] handle empty string as none --- vcf/parser.py | 4 ++-- vcf/test/bad-info-character.vcf | 8 ++++++++ vcf/test/test_vcf.py | 10 ++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 vcf/test/bad-info-character.vcf diff --git a/vcf/parser.py b/vcf/parser.py index be640f4..c3c3d08 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -354,9 +354,9 @@ def _parse_metainfo(self): self.samples = fields[9:] self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) - def _map(self, func, iterable, bad='.'): + def _map(self, func, iterable, bad=['.', '']): '''``map``, but make bad values None.''' - return [func(x) if x != bad else None + return [func(x) if x not in bad else None for x in iterable] def _parse_filter(self, filt_str): diff --git a/vcf/test/bad-info-character.vcf b/vcf/test/bad-info-character.vcf new file mode 100644 index 0000000..8b23ae4 --- /dev/null +++ b/vcf/test/bad-info-character.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample +chr1 100 id1 G A . . 
EMPTY=;DOT=.;NOTEMPTY=6 GT 0/1 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index a09b0b9..9dca0bd 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -393,6 +393,16 @@ def test_write(self): self.assertEquals(l.INFO, r.INFO) +class TestInfoBadInfoFields(unittest.TestCase): + def test_parse(self): + reader = vcf.Reader(fh('bad-info-character.vcf')) + record = next(reader) + self.assertEquals(record.INFO['DOT'], [None]) + self.assertEquals(record.INFO['EMPTY'], [None]) + self.assertEquals(record.INFO['NOTEMPTY'], ['6']) + pass + + class TestParseMetaLine(unittest.TestCase): def test_parse(self): reader = vcf.Reader(fh('parse-meta-line.vcf')) From 3b76ada9beda23553456919e8eb9cd34dcb36623 Mon Sep 17 00:00:00 2001 From: Eric Date: Thu, 2 Feb 2017 11:34:43 +0200 Subject: [PATCH 166/168] CR comments --- vcf/cparse.pyx | 4 ++-- vcf/test/bad-info-character.vcf | 14 +++++++++----- vcf/test/test_vcf.py | 13 +++++++++---- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx index 87f806d..334542a 100644 --- a/vcf/cparse.pyx +++ b/vcf/cparse.pyx @@ -1,8 +1,8 @@ from model import _Call -cdef _map(func, iterable, bad='.'): +cdef _map(func, iterable, bad=['.', '']): '''``map``, but make bad values None.''' - return [func(x) if x != bad else None + return [func(x) if x not in bad else None for x in iterable] INTEGER = 'Integer' diff --git a/vcf/test/bad-info-character.vcf b/vcf/test/bad-info-character.vcf index 8b23ae4..93b87e1 100644 --- a/vcf/test/bad-info-character.vcf +++ b/vcf/test/bad-info-character.vcf @@ -1,8 +1,12 @@ ##fileformat=VCFv4.1 -##INFO= -##INFO= -##INFO= -##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample -chr1 100 id1 G A . . EMPTY=;DOT=.;NOTEMPTY=6 GT 0/1 +chr1 100 id1 G A . . 
FLAG;EMPTY=;EMPTY_6=;EMPTY_N=;DOT=.;DOT_6=.;DOT_N=.;NOTEMPTY=6 GT 0/1 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 9dca0bd..0d107b2 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -393,13 +393,17 @@ def test_write(self): self.assertEquals(l.INFO, r.INFO) -class TestInfoBadInfoFields(unittest.TestCase): +class TestBadInfoFields(unittest.TestCase): def test_parse(self): reader = vcf.Reader(fh('bad-info-character.vcf')) record = next(reader) - self.assertEquals(record.INFO['DOT'], [None]) - self.assertEquals(record.INFO['EMPTY'], [None]) - self.assertEquals(record.INFO['NOTEMPTY'], ['6']) + self.assertEquals(record.INFO['DOT'], None) + self.assertEquals(record.INFO['DOT_6'], None) + self.assertEquals(record.INFO['DOT_N'], None) + self.assertEquals(record.INFO['EMPTY'], None) + self.assertEquals(record.INFO['EMPTY_6'], None) + self.assertEquals(record.INFO['EMPTY_N'], None) + self.assertEquals(record.INFO['NOTEMPTY'], [6]) pass @@ -1734,3 +1738,4 @@ def test_strelka(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGATKMeta)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUncalledGenotypes)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStrelka)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBadInfoFields)) From a2f4a4405e79b22bbe85ce987c33a56e0fbf56cc Mon Sep 17 00:00:00 2001 From: Martijn Vermaat Date: Mon, 6 Feb 2017 00:10:24 +0100 Subject: [PATCH 167/168] More testing for issue 264 --- vcf/test/bad-info-character.vcf | 18 ++++++++++-------- vcf/test/test_vcf.py | 18 +++++++++++------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/vcf/test/bad-info-character.vcf b/vcf/test/bad-info-character.vcf index 93b87e1..099470c 100644 --- a/vcf/test/bad-info-character.vcf +++ b/vcf/test/bad-info-character.vcf @@ -1,12 +1,14 @@ ##fileformat=VCFv4.1 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= 
+##INFO= +##INFO= +##INFO= +##INFO= ##INFO= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample -chr1 100 id1 G A . . FLAG;EMPTY=;EMPTY_6=;EMPTY_N=;DOT=.;DOT_6=.;DOT_N=.;NOTEMPTY=6 GT 0/1 +chr1 100 id1 G A . . FLAG;EMPTY_1=;EMPTY_3=;EMPTY_N=;DOT_1=.;DOT_3=.,.,.;DOT_N=.;NOTEMPTY_1=1;NOTEMPTY_3=1,2,3;NOTEMPTY_N=1 GT 0/1 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index 0d107b2..b2e3121 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -397,13 +397,17 @@ class TestBadInfoFields(unittest.TestCase): def test_parse(self): reader = vcf.Reader(fh('bad-info-character.vcf')) record = next(reader) - self.assertEquals(record.INFO['DOT'], None) - self.assertEquals(record.INFO['DOT_6'], None) - self.assertEquals(record.INFO['DOT_N'], None) - self.assertEquals(record.INFO['EMPTY'], None) - self.assertEquals(record.INFO['EMPTY_6'], None) - self.assertEquals(record.INFO['EMPTY_N'], None) - self.assertEquals(record.INFO['NOTEMPTY'], [6]) + self.assertEquals(record.INFO['DOT_1'], None) + self.assertEquals(record.INFO['DOT_3'], [None, None, None]) + self.assertEquals(record.INFO['DOT_N'], [None]) + self.assertEquals(record.INFO['EMPTY_1'], None) + # Perhaps EMPTY_3 should yield [None, None, None] but this is really a + # cornercase of unspecified behaviour. 
+ self.assertEquals(record.INFO['EMPTY_3'], [None]) + self.assertEquals(record.INFO['EMPTY_N'], [None]) + self.assertEquals(record.INFO['NOTEMPTY_1'], 1) + self.assertEquals(record.INFO['NOTEMPTY_3'], [1, 2, 3]) + self.assertEquals(record.INFO['NOTEMPTY_N'], [1]) pass From d91ec5ed4e85fc34dc6942eca70dd869d75d1931 Mon Sep 17 00:00:00 2001 From: Sam Brightman Date: Tue, 14 Feb 2017 16:32:52 +0100 Subject: [PATCH 168/168] Allow partially-called genotypes to be considered called --- vcf/model.py | 4 ++-- vcf/test/test_vcf.py | 10 ++++++++++ vcf/test/uncalled_genotypes.vcf | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 375a3f8..34a4d17 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -26,7 +26,7 @@ def __init__(self, site, sample, data): if getattr(self.data, 'GT', None) is not None: self.gt_alleles = [(al if al != '.' else None) for al in allele_delimiter.split(self.data.GT)] self.ploidity = len(self.gt_alleles) - self.called = all([al != None for al in self.gt_alleles]) + self.called = any(al is not None for al in self.gt_alleles) self.gt_nums = self.data.GT if self.called else None else: #62 a call without a genotype is not defined as called or not @@ -65,7 +65,7 @@ def gt_bases(self): if self.called: # lookup and return the actual DNA alleles try: - return self.gt_phase_char().join(str(self.site.alleles[int(X)]) for X in self.gt_alleles) + return self.gt_phase_char().join(str(self.site.alleles[int(X)] if X is not None else '.') for X in self.gt_alleles) except: sys.stderr.write("Allele number not found in list of alleles\n") else: diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index a09b0b9..deeff01 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1639,22 +1639,32 @@ def test_read_uncalled(self): gt_nums = [s.gt_nums for s in var.samples] ploidity = [s.ploidity for s in var.samples] gt_alleles = [s.gt_alleles for s in var.samples] + gt_type = [s.gt_type for s in var.samples] if var.POS 
== 14370: self.assertEqual(['0|0', None, '1/1'], gt_nums) self.assertEqual(['G|G', None, 'A/A'], gt_bases) self.assertEqual([2,2,2], ploidity) self.assertEqual([['0','0'], [None,None], ['1','1']], gt_alleles) + self.assertEqual([0, None, 2], gt_type) elif var.POS == 17330: self.assertEqual([None, '0|1', '0/0'], gt_nums) self.assertEqual([None, 'T|A', 'T/T'], gt_bases) self.assertEqual([3,2,2], ploidity) self.assertEqual([[None,None,None], ['0','1'], ['0','0']], gt_alleles) + self.assertEqual([None, 1, 0], gt_type) elif var.POS == 1234567: self.assertEqual(['0/1', '0/2', None], gt_nums) self.assertEqual(['GTC/G', 'GTC/GTCT', None], gt_bases) self.assertEqual([2,2,1], ploidity) self.assertEqual([['0','1'], ['0','2'], [None]], gt_alleles) + self.assertEqual([1, 1, None], gt_type) + elif var.POS == 1234568: + self.assertEqual(['./1', '0/.', None], gt_nums) + self.assertEqual(['./G', 'GTC/.', None], gt_bases) + self.assertEqual([2,2,1], ploidity) + self.assertEqual([[None,'1'], ['0',None], [None]], gt_alleles) + self.assertEqual([1, 1, None], gt_type) reader._reader.close() diff --git a/vcf/test/uncalled_genotypes.vcf b/vcf/test/uncalled_genotypes.vcf index 2032097..794aea7 100644 --- a/vcf/test/uncalled_genotypes.vcf +++ b/vcf/test/uncalled_genotypes.vcf @@ -5,3 +5,4 @@ 20 14370 rs6054257 G A 29 PASS NS=3 GT 0|0 ./. 1/1 20 17330 . T A 3 q10 NS=3 GT ././. 0|1 0/0 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3 GT 0/1 0/2 . +20 1234568 . GTC G,GTCT 50 PASS NS=3 GT ./1 0/. .