pax_global_header00006660000000000000000000000064134652163440014522gustar00rootroot0000000000000052 comment=a5762db4926dfecf7306e23b7d77a7416a1a3fce surpyvor-0.5/000077500000000000000000000000001346521634400132775ustar00rootroot00000000000000surpyvor-0.5/.gitignore000066400000000000000000000000531346521634400152650ustar00rootroot00000000000000.remote-sync.json dist/ surpyvor.egg-info/ surpyvor-0.5/LICENSE000066400000000000000000000020611346521634400143030ustar00rootroot00000000000000MIT License Copyright (c) 2018 Wouter De Coster Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. surpyvor-0.5/README.md000066400000000000000000000010201346521634400145470ustar00rootroot00000000000000# surpyvor A python wrapper around [SURVIVOR](https://github.com/fritzsedlazeck/SURVIVOR), with additional convenience functions. ## sub-commands: merge merging vcf files of SVs highsens get union of SV vcfs highconf get intersection of SV vcfs prf calculate precision, recall and F-measure upset Make upset plot for multiple SV vcf files Each sub-command has its own help information, accessible by running `surpyvor -h/--help` surpyvor-0.5/setup.py000066400000000000000000000025561346521634400150210ustar00rootroot00000000000000# Always prefer setuptools over distutils from setuptools import setup, find_packages # To use a consistent encoding from codecs import open from os import path here = path.abspath(path.dirname(__file__)) exec(open('surpyvor/version.py').read()) setup( name='surpyvor', version=__version__, description='Manipulate vcf files of structural variants using SURVIVOR', long_description=open(path.join(here, "README.md")).read(), long_description_content_type="text/markdown", url='https://github.com/wdecoster/surpyvor', author='Wouter De Coster', author_email='decosterwouter@gmail.com', license='MIT', classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics', 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', ], keywords='nanopore', packages=find_packages(), python_requires='>=3', install_requires=[], package_data={'surpyvor': []}, package_dir={'surpyvor': 'surpyvor'}, include_package_data=True, entry_points={ 'console_scripts': [ 'surpyvor=surpyvor.surpyvor:main', ], }, ) surpyvor-0.5/surpyvor/000077500000000000000000000000001346521634400152105ustar00rootroot00000000000000surpyvor-0.5/surpyvor/__init__.py000066400000000000000000000000501346521634400173140ustar00rootroot00000000000000import matplotlib matplotlib.use('Agg') surpyvor-0.5/surpyvor/parse_arguments.py000066400000000000000000000235171346521634400207710ustar00rootroot00000000000000from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from .version import __version__ import subprocess import sys from os import path def get_args(): parser = ArgumentParser(description="A wrapper around SURVIVOR, with convenience functions.", formatter_class=ArgumentDefaultsHelpFormatter,) parser.add_argument("-v", "--version", action="version", version='surpyvor: {}, SURVIVOR {}'.format( __version__, get_survivor_version()), help="Print version and quit.") subparsers = parser.add_subparsers(dest='command', title='[sub-commands]') merge = subparsers.add_parser("merge", help="merging vcf files of SVs", formatter_class=ArgumentDefaultsHelpFormatter) merge_req = merge.add_argument_group('required arguments') merge_req.add_argument("--variants", nargs='+', required=True, help="vcf files to merge") merge_opt = merge.add_argument_group('optional arguments') merge_opt.add_argument("-o", "--output", help="output file", default="stdout") merge_opt.add_argument("-d", "--distance", type=int, default=500, help="distance between variants to merge") merge_opt.add_argument("-l", "--minlength", type=int, default=50, help="Minimum length of variants to consider") merge_opt.add_argument("-c", "--callers", type=int, default=1, help="Minimum number of callers to support a variant") merge_opt.add_argument("-i", "--ignore_type", help="Ignore the type of the structural variant", action="store_true", default=False) merge_opt.add_argument("-s", "--strand", action="store_true", default=False, help="Take strand into account") merge_opt.add_argument("-e", "--estimate_distance", action="store_true", default=False, help="Estimate distance between calls") highsens = subparsers.add_parser("highsens", help="get union of SV vcfs", formatter_class=ArgumentDefaultsHelpFormatter) highsens_req = highsens.add_argument_group('required arguments') highsens_req.add_argument("--variants", nargs='+', required=True, help="vcf files to merge") highsens_opt = highsens.add_argument_group('optional arguments') highsens_opt.add_argument("-o", "--output", help="output file", default="stdout") highconf = subparsers.add_parser("highconf", help="get intersection of SV vcfs", formatter_class=ArgumentDefaultsHelpFormatter) highconf_req = highconf.add_argument_group('required arguments') highconf_req.add_argument("--variants", nargs='+', required=True, help="vcf files to merge") highconf_opt = highconf.add_argument_group('optional arguments') highconf_opt.add_argument("-o", "--output", help="output file", default="stdout") prf = subparsers.add_parser('prf', help="calculate precision, recall and F-measure", formatter_class=ArgumentDefaultsHelpFormatter) prf_req = prf.add_argument_group('required arguments') prf_req.add_argument("--truth", help="vcf containing truth set", required=True) prf_req.add_argument("--test", help="vcf containing test set", required=True) prf_opt = prf.add_argument_group('optional arguments') prf_opt.add_argument("-d", "--distance", help="maximum distance between test and truth call", default=500) prf_opt.add_argument("--minlength", help="Minimum length of SVs to be taken into account", default=50) prf_opt.add_argument("-i", "--ignore_type", help="Ignore the type of the structural variant", action="store_true", default=False) prf_opt.add_argument("--ignore_chroms", help="Chromosomes to ignore for prf calculation.", nargs='*', default=['chrEBV']) prf_opt.add_argument("--keepmerged", help="Save merged vcf file.", default=False) prf_opt.add_argument("--bar", help="Make stacked bar chart of SV lengths coloured by validation status", action="store_true") prf_opt.add_argument("--matrix", help="Make a confusion matrix.", action="store_true") venn = subparsers.add_parser('venn', help="Make venn diagram for 2 or 3 SV vcf files", formatter_class=ArgumentDefaultsHelpFormatter) venn_req = venn.add_argument_group('required arguments') venn_req.add_argument("--variants", help="vcfs containing structural variants", required=True, nargs="*") venn_opt = venn.add_argument_group('optional arguments') venn_opt.add_argument("--names", help="Names of datasets in --variants", nargs="*") venn_opt.add_argument("-d", "--distance", help="maximum distance between test and truth call", default=500) venn_opt.add_argument("--minlength", help="Minimum length of SVs to be taken into account", default=50) venn_opt.add_argument("-i", "--ignore_type", help="Ignore the type of the structural variant", action="store_true", default=False) venn_opt.add_argument("--keepmerged", help="Save merged vcf file") venn_opt.add_argument("--plotout", help="Name of output plot", default="venn.png") upset = subparsers.add_parser('upset', help="Make upset plot for multiple SV vcf files", formatter_class=ArgumentDefaultsHelpFormatter) upset_req = upset.add_argument_group('required arguments') upset_req.add_argument("--variants", help="vcfs containing structural variants", required=True, nargs="*") upset_opt = upset.add_argument_group('optional arguments') upset_opt.add_argument("--names", help="Names of datasets in --variants", nargs="*") upset_opt.add_argument("-d", "--distance", help="maximum distance between test and truth call", default=500) upset_opt.add_argument("--minlength", help="Minimum length of SVs to be taken into account", default=50) upset_opt.add_argument("-i", "--ignore_type", help="Ignore the type of the structural variant", action="store_true", default=False) upset_opt.add_argument("--keepmerged", help="Save merged vcf file") upset_opt.add_argument("--plotout", help="Name of output plot", default="UpSetPlot.png") args = parser.parse_args() validate_args(parser, args) return args def validate_args(parser, args): if not args.command: sys.stderr.write("INPUT ERROR: sub-command required\n\n") parser.print_help() sys.exit() if args.command in ['upset', 'venn']: if args.names: if not len(args.variants) == len(args.names): sys.exit("INPUT ERROR: " "Need to have same number of values in --names as --variants!") if args.command == 'venn': if len(args.variants) > 3: sys.exit("INPUT ERROR: " "Venn diagrams are only created for 2 or 3 vcf files!") if hasattr(args, 'variants'): for f in args.variants: if not path.isfile(f): sys.exit("File not found: {}".format(f)) if hasattr(args, 'truth'): if not path.isfile(args.truth): sys.exit("File not found: {}".format(args.truth)) if hasattr(args, 'test'): if not path.isfile(args.test): sys.exit("File not found: {}".format(args.test)) def get_survivor_version(): for line in subprocess.check_output(args="SURVIVOR", stderr=subprocess.STDOUT, universal_newlines=True).split('\n'): if line.startswith("Version:"): return line.strip().split(' ')[1] else: return "version not found" surpyvor-0.5/surpyvor/plots.py000066400000000000000000000040111346521634400167170ustar00rootroot00000000000000from cyvcf2 import VCF import matplotlib.pyplot as plt from surpyvor import utils import numpy as np from matplotlib_venn import venn2, venn3 from upsetplot import plot as upsetplot def bar_chart(vcf, outname="stacked_bar.png"): """ Make a stacked bar chart for length of the SV split by validation status This ignores zygosity. """ len_dict = {"True": [], "False": [], "Missed": []} for v in VCF(vcf): if not v.INFO.get('SVTYPE') == 'TRA' and abs(v.INFO.get('SVLEN')) >= 50: calls = [utils.is_variant(call) for call in v.gt_types] if calls == [True, True]: len_dict['True'].append(v.INFO.get('SVLEN')) elif calls == [False, True]: len_dict['False'].append(v.INFO.get('SVLEN')) elif calls == [True, False]: len_dict['Missed'].append(v.INFO.get('SVLEN')) plt.subplot(2, 1, 1) plt.hist(x=np.array(list(len_dict.values())), bins=[i for i in range(0, 2000, 10)], stacked=True, histtype='bar', label=list(len_dict.keys())) plt.xlabel('Length of structural variant') plt.ylabel('Number of variants') plt.legend(frameon=False, fontsize="small") plt.subplot(2, 1, 2) plt.hist(x=np.array(list(len_dict.values())), bins=[i for i in range(0, 20000, 100)], stacked=True, histtype='bar', label=list(len_dict.keys()), log=True) plt.xlabel('Length of structural variant') plt.ylabel('Number of variants') plt.legend(frameon=False, fontsize="small") plt.tight_layout() plt.savefig(outname) plt.close() def upset_plot(upsets, outname="UpSetPlot.png"): upsetplot(upsets, sort_by='cardinality') plt.savefig(outname) def venn_diagram(sets, labels, num_samples=2, outname="venn.png"): if num_samples == 2: venn = venn2 else: venn = venn3 venn(sets, set_labels=labels) plt.savefig(outname) plt.close() surpyvor-0.5/surpyvor/surpyvor.py000066400000000000000000000112651346521634400175000ustar00rootroot00000000000000from surpyvor import plots, utils, parse_arguments import subprocess import tempfile import sys import shlex import os def main(): args = parse_arguments.get_args() utils.test_dependencies() if args.command == "merge": sv_merge(samples=args.variants, distance=args.distance, callers=args.callers, require_type=not args.ignore_type, require_strand=args.strand, estimate_distance=args.estimate_distance, minlength=args.minlength, output=args.output) elif args.command == "highsens": sv_merge(samples=[utils.vcf_concat(args.variants)], distance=100, callers=1, require_type=True, require_strand=False, estimate_distance=False, minlength=50, output=args.output) elif args.command == "highconf": sv_merge(samples=args.variants, distance=500, callers=len(args.variants), require_type=True, require_strand=False, estimate_distance=False, minlength=50, output=args.output) elif args.command == 'prf': precision_recall_fmeasure(args) elif args.command == 'upset': upset(args) elif args.command == 'venn': venn(args) def sv_merge(samples, distance, callers, require_type, require_strand, estimate_distance, minlength, output): """ Executes SURVIVOR merge, with parameters: -samples.fofn (samples, list) -distance between calls (distance, int) -number of callers to support call (callers, int) -require variants to have sampe type (type, boolean) -require variants to be on same strand (strand, boolean) -estimate distance between calls (estimate_distance, boolean) -specify minimal size of SV event (minlength, int) """ fhf, fofn_f = tempfile.mkstemp() fhs, interm_out = tempfile.mkstemp(suffix=".vcf") with open(fofn_f, 'w') as fofn: for s in [utils.decompress(s) for s in samples]: fofn.write(s + "\n") survivor_cmd = "SURVIVOR merge {fof} {dist} {call} {typ} {str} {estm} {ml} {out}".format( fof=fofn_f, dist=distance, call=callers, typ=1 if require_type else -1, str=1 if require_strand else -1, estm=1 if estimate_distance else -1, ml=minlength, out=interm_out) print("Executing SURVIVOR...", end="", flush=True, file=sys.stderr) subprocess.call(shlex.split(survivor_cmd), stdout=subprocess.DEVNULL) print("DONE", file=sys.stderr) utils.vcf_sort(interm_out, output) os.close(fhf) os.close(fhs) def default_merge(args, variants): if args.keepmerged: vcf_out = args.keepmerged else: _, vcf_out = tempfile.mkstemp() sv_merge(samples=[utils.normalize_vcf(s) for s in variants], distance=args.distance, callers=1, require_type=not args.ignore_type, require_strand=False, estimate_distance=False, minlength=args.minlength, output=vcf_out) return vcf_out def precision_recall_fmeasure(args): vcf_out = default_merge(args, variants=[args.truth, args.test]) truth_set, test_set = utils.get_variant_identifiers(vcf=vcf_out, ignore_chroms=args.ignore_chroms) plots.venn_diagram((truth_set, test_set), labels=('Truth', 'Test')) tp = len(truth_set & test_set) precision = tp / len(test_set) print(f"Precision: {round(precision, ndigits=4)}") recall = tp / len(truth_set) print(f"Recall: {round(recall, ndigits=4)}") fmeasure = 2 * (precision * recall) / (precision + recall) print(f"F-measure: {round(fmeasure, ndigits=4)}") if args.bar: plots.bar_chart(vcf_out) if args.matrix: utils.confusion_matrix(vcf_out, names=['truth', 'test']) def upset(args): vcf_out = default_merge(args, args.variants) upsets = utils.make_sets(vcf=vcf_out, names=args.names or args.variants) plots.upset_plot(upsets, outname=args.plotout) def venn(args): vcf_out = default_merge(args, args.variants) sets = utils.get_variant_identifiers(vcf=vcf_out, ignore_chroms=[], num_samples=len(args.variants)) plots.venn_diagram(sets, labels=args.names or args.variants, num_samples=len(args.variants), outname=args.plotout) if __name__ == '__main__': main() surpyvor-0.5/surpyvor/utils.py000066400000000000000000000122261346521634400167250ustar00rootroot00000000000000import os import sys from shutil import which import tempfile from cyvcf2 import VCF import subprocess import shlex import pandas as pd import gzip from collections import defaultdict def is_variant(call): """Check if a variant position qualifies as a variant 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT""" if call == 1 or call == 3: return True else: return False def normalize_vcf(vcff): """Normalize a vcf by changing DUP to INS""" handle, name = tempfile.mkstemp(suffix='.vcf') out = open(name, 'w') if vcff.endswith('.gz'): vcf = gzip.open(vcff, 'rt') else: vcf = open(vcff) for line in vcf: out.write(line.replace('DUP', 'INS')) os.close(handle) return name def get_variant_identifiers(vcf, ignore_chroms, num_samples=2): """Get sets of variants for each sample in a merged vcf. Loop over the vcf file, adding a unique identifier to the respective list if the sample has a variant for that position return as set """ positions = [[] for _ in range(num_samples)] for v in VCF(vcf): if v.CHROM not in ignore_chroms: for index, call in enumerate(v.gt_types): if is_variant(call): positions[index].append( "{}:{}-{}".format(v.CHROM, v.start, v.INFO.get('SVTYPE'))) identifier_sets = [set(i) for i in positions] return identifier_sets def gt_types_to_binary_comparison(calls): """From an array of calls, check if a variant position qualifies as a variant. 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT Return string of 1s and 0s to represent position""" binary_calls = [] for call in calls: if call == 1 or call == 3: binary_calls.append(1) else: binary_calls.append(0) return ''.join([str(i) for i in binary_calls]) def make_sets(vcf, names): """From the merged SV file, return pd.Series of overlapping sets. Intended for making an upset plot""" calls = defaultdict(int) for v in VCF(vcf): calls[gt_types_to_binary_comparison(v.gt_types)] += 1 tf_array = [[True, False]] * len(list(calls.keys())[0]) index = pd.MultiIndex.from_product(tf_array, names=names) values = [calls[''.join([str(int(j)) for j in i])] for i in index] return pd.Series(values, index=index) def vcf_concat(vcffiles): _, concatenated = tempfile.mkstemp(suffix=".vcf") sample = [get_sample(f) for f in vcffiles][0] vcffiles = [reheader(f, sample=sample) for f in vcffiles] vcffiles = [compress_and_tabix(f) for f in vcffiles] c = subprocess.Popen(shlex.split("bcftools concat -a {}".format(' '.join(vcffiles))), stdout=subprocess.PIPE) subprocess.call(shlex.split("bcftools sort -o {}".format(concatenated)), stdin=c.stdout) return concatenated def get_sample(vcffile): vcf = VCF(vcffile) return vcf.samples[0] def reheader(vcf, sample): _, output = tempfile.mkstemp(suffix=".vcf") handle, samplef = tempfile.mkstemp() open(samplef, 'w').write(sample) os.close(handle) subprocess.call(shlex.split("bcftools reheader -s {} {} -o {}".format(samplef, vcf, output))) return output def compress_and_tabix(vcf): if vcf.endswith('.vcf'): handle, output = tempfile.mkstemp(suffix=".vcf.gz") subprocess.call(shlex.split("bgzip -c {}".format(vcf)), stdout=handle) subprocess.call(shlex.split("tabix -p vcf {}".format(output))) return output else: return vcf def decompress(vcf): """ Decompress output to temporary file if filename endswith .gz or .bgz """ if vcf.endswith(('.gz', '.bgz')): handle, output = tempfile.mkstemp(suffix=".vcf") subprocess.call(shlex.split("bgzip -cd {}".format(vcf)), stdout=handle) return output else: return vcf def test_dependencies(): for dependency in ['bcftools', 'bgzip', 'tabix', 'SURVIVOR']: if not which(dependency): sys.exit("ERROR: Could not find required executable '{}'.\n" "Make sure it is installed and in $PATH".format(dependency)) def vcf_sort(input, output): if output in ["stdout", "-"]: subprocess.call(shlex.split('bcftools sort {}'.format(input))) else: subprocess.call(shlex.split('bcftools sort {} -o {}'.format(input, output))) def confusion_matrix(vcff, names): """ First level of the dict is the "first" call, second level is the "second" sample 0: hom_ref 1: heterozygous 2: unknown/nocall 3: hom_alt """ zygosities = {0: {0: 0, 1: 0, 2: 0, 3: 0}, 1: {0: 0, 1: 0, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 0, 3: 0}, 3: {0: 0, 1: 0, 2: 0, 3: 0}, } for v in VCF(vcff): zygosities[v.gt_types[0]][v.gt_types[1]] += 1 zygs = [2, 0, 1, 3] df = pd.DataFrame(index=zygs, columns=zygs) for tr in zygs: for te in zygs: df.loc[tr, te] = zygosities[tr][te] df.columns = ['nocall', 'hom_ref', 'het', 'hom_alt'] df.columns.name = names[1] df.index = ['nocall', 'hom_ref', 'het', 'hom_alt'] df.index.name = names[0] print(df) surpyvor-0.5/surpyvor/version.py000066400000000000000000000000261346521634400172450ustar00rootroot00000000000000__version__ = "0.5.0"