Skip to content

Commit a62a930

Browse files
authored
Merge pull request #48 from J35P312/merge_fix
Add information for all callers in INFO for overlapping events
2 parents 8deb4e3 + d0a5b0a commit a62a930

File tree

8 files changed

+267
-32
lines changed

8 files changed

+267
-32
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
ext_modules = []
2020

2121
setup(name='svdb',
22-
version='2.5.3',
22+
version='2.6.0',
2323
url="https://github.com/J35P312/SVDB",
2424
author="Jesper Eisfeldt",
2525
author_email="jesper.eisfeldt@scilifelab.se",

svdb/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def make_query_calls (args, queries, keyword):
3636
query_module.main(args)
3737

3838
def main():
39-
version = "2.5.3"
39+
version = "2.6.0"
4040
parser = argparse.ArgumentParser(
4141
"""SVDB-{}, use the build module to construct databases, use the query module to query the database usign vcf files, or use the hist module to generate histograms""".format(version), add_help=False)
4242
parser.add_argument('--build', help="create a db",

svdb/merge_vcf_module.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,15 @@ def print_header(vcf_list, vcf_dictionary, args, command_line):
7474
for entry in sorted(header["INFO"]):
7575
print(header["INFO"][entry].strip())
7676
del header["INFO"]
77+
78+
for vcf in vcf_dictionary:
79+
print("##INFO=<ID={}_INFO,Number=.,Type=String,Description=\"pipe separated list of all details in the INFO column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
80+
print("##INFO=<ID={}_SAMPLE,Number=.,Type=String,Description=\"pipe separated list of all details in the SAMPLEs column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
81+
print("##INFO=<ID={}_CHROM,Number=.,Type=String,Description=\"pipe separated list of all details in the CHROM column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
82+
print("##INFO=<ID={}_POS,Number=.,Type=String,Description=\"pipe separated list of all details in the POS column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
83+
print("##INFO=<ID={}_QUAL,Number=.,Type=String,Description=\"pipe separated list of all details in the QUAL column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
84+
print("##INFO=<ID={}_FILTERS,Number=.,Type=String,Description=\"pipe separated list of all details in the FILTER column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
85+
7786
# print contigs according to the input order
7887
if reference != "":
7988
print(reference.strip())
@@ -101,6 +110,8 @@ def print_header(vcf_list, vcf_dictionary, args, command_line):
101110
if not args.notag:
102111
print("##INFO=<ID=VARID,Number=1,Type=String,Description=\"The variant ID of merged samples\">")
103112
print("##INFO=<ID=set,Number=1,Type=String,Description=\"Source VCF for the merged record in SVDB\">")
113+
print("##INFO=<ID=svdb_origin,Number=1,Type=String,Description=\"pipe separated list of the VCF for the merged record in SVDB\">")
114+
104115
print("##svdbcmdline={}".format(" ".join(command_line)))
105116
sample_print_order = {}
106117

svdb/merge_vcf_module_cython.py

Lines changed: 152 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,72 @@
66
def retrieve_key(line, key):
77
key += '='
88
if key in line:
9-
item = line.strip().split(key)[-1].split(";")[0]
10-
if len(item) == len(line.strip()):
9+
if ";{}".format(key) in line:
10+
item = line.strip().split( ";{}".format(key) )[-1].split(";")[0].split("\t")[0]
11+
12+
elif "\t{}".format(key) in line:
13+
item = line.strip().split( "\t{}".format(key) )[-1].split(";")[0].split("\t")[0]
14+
else:
1115
return False
12-
return item
1316

17+
return item
1418

19+
#Check if no merging should occur
20+
def skip_variant(chrA,chrB,type_A,type_B,vcf_line_A,vcf_line_B,pass_only,current_variant,analysed_variants,no_var):
21+
#The variant is already clustered/analysed
22+
if current_variant in analysed_variants:
23+
return True
24+
25+
# only treat variants on the same pair of chromosomes
26+
if chrA != chrB:
27+
return True
28+
29+
# dont merge variants of different type
30+
if type_A != type_B and not no_var:
31+
return True
32+
33+
# if the pass_only option is chosen, only variants marked PASS will be merged
34+
if pass_only:
35+
filter_tag = vcf_line_B[6]
36+
if filter_tag not in ['PASS', '.']:
37+
return True
38+
39+
40+
#Collect SAMPLE columns from all merged variants:
41+
def collect_sample(vcf_line,samples,sample_order,f):
42+
variant=vcf_line[2].replace(";","_").replace(":","_").replace("|","_")
43+
sample_data=[variant]
44+
45+
for sample in samples:
46+
if not sample in sample_order:
47+
continue
48+
if not f in sample_order[sample]:
49+
continue
50+
sample_position = sample_order[sample][f]
51+
52+
entries = vcf_line[8].split(":")
53+
sample_entries = vcf_line[9 + sample_position].split(":")
54+
sample_data.append(sample)
55+
for i, entry in enumerate(entries):
56+
sample_data.append("{}:{}".format( entry,sample_entries[i] ) )
57+
58+
return "|".join(sample_data).replace(",",":")
59+
60+
#collect INFO from all merged_variants
61+
def collect_info(vcf_line):
62+
INFO = vcf_line[7]
63+
INFO_content = INFO.split(";")
64+
variant=vcf_line[2].replace(";","_").replace(":","_").replace("|","_")
65+
all_info=[variant]
66+
67+
for content in INFO_content:
68+
tag = content.split("=")[0]
69+
if not ":" in content and not "|" in content:
70+
all_info.append( content.replace("=",":").replace(",",":") )
71+
72+
return "|".join(all_info)
73+
74+
#create a GATK-like set of all merged variants
1575
def determine_set_tag(priority_order, files):
1676
n_filtered = 0
1777
n_pass = 0
@@ -39,7 +99,7 @@ def determine_set_tag(priority_order, files):
3999
filtered.append("filterIn" + sample)
40100
return "-".join(filtered)
41101

42-
102+
#merge csq field of merged variants (for instance when merging BNDs)
43103
def merge_csq(info, csq):
44104
"""Merge the csq fields of bnd variants"""
45105
var_csq = info.split("CSQ=")[-1].split(";")[0]
@@ -116,18 +176,30 @@ def sort_format_field(line, samples, sample_order, sample_print_order, priority_
116176
# generate a union of the info fields
117177
info_union = []
118178
tags_in_info = []
179+
180+
# tags only to be copied from the file with highest priority (to avoid problems in downstream analyses
181+
blacklist=set(["SVLEN","END","SVTYPE"])
182+
183+
first=True
119184
for input_file in priority_order:
120185
if input_file not in files:
121186
continue
187+
122188
INFO = files[input_file].strip().split("\t")[7]
123189
INFO_content = INFO.split(";")
124190

125191
for content in INFO_content:
126192
tag = content.split("=")[0]
193+
194+
if not first and tag in blacklist:
195+
continue
196+
127197
if tag not in tags_in_info:
128198
tags_in_info.append(tag)
129199
info_union.append(content)
130200

201+
first=False
202+
131203
new_info = ";".join(info_union)
132204
line[7] = new_info
133205
return line
@@ -150,31 +222,47 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
150222
continue
151223

152224
merge = []
225+
#keep track of all csq of merged variants
153226
csq = []
154227

228+
#Keep track of all files in the cluster
155229
files = {}
156-
for j in range(i + 1, len(variants[chrA])):
157-
if j in analysed_variants:
158-
continue
230+
#keep track of the FILTER column of all merged files
231+
filters_tag = {}
232+
#keep track of SAMPLEs columns of all merged files
233+
samples_tag = {}
234+
#keep track of INFO column of all merged files
235+
info_tag = {}
236+
#quality
237+
qual_tag = {}
238+
#pos
239+
pos_tag = {}
240+
#chrom
241+
chrom_tag ={}
159242

160-
# if the pass_only option is chosen, only variants marked PASS will be merged
161-
if pass_only:
162-
filter_tag = variants[chrA][i][-1].split("\t")[6]
163-
if filter_tag not in ['PASS', '.']:
164-
break
243+
if args.priority:
244+
id=variants[chrA][i][-3]
245+
else:
246+
id=variants[chrA][i][-3].split(".vcf")[0].split("/")[-1]
165247

166-
# only treat varints on the same pair of chromosomes
167-
if not variants[chrA][i][0] == variants[chrA][j][0]:
168-
continue
248+
vcf_line_A=variants[chrA][i][-1].strip().split("\t")
249+
chrom_tag[id]=[ "{}|{}".format(vcf_line_A[2].replace(";","_").replace(":","_").replace("|","_"),vcf_line_A[0]) ]
250+
pos_tag[id]=[ "{}|{}".format(vcf_line_A[2].replace(";","_").replace(":","_").replace("|","_"),vcf_line_A[1]) ]
251+
qual_tag[id]=[ "{}|{}".format(vcf_line_A[2].replace(";","_").replace(":","_").replace("|","_"),vcf_line_A[5]) ]
252+
filters_tag[id]=[ "{}|{}".format(vcf_line_A[2].replace(";","_").replace(":","_").replace("|","_"),vcf_line_A[6]) ]
253+
samples_tag[id]=[collect_sample( vcf_line_A ,samples,sample_order,id)]
254+
info_tag[id]=[collect_info(vcf_line_A)]
255+
256+
for j in range(i + 1, len(variants[chrA])):
257+
vcf_line_B=variants[chrA][j][-1].strip().split("\t")
169258

170259
# if the pass_only option is chosen, only variants marked PASS will be merged
171260
if pass_only:
172-
filter_tag = variants[chrA][j][-1].split("\t")[6]
261+
filter_tag = vcf_line_A[6]
173262
if filter_tag not in ['PASS', '.']:
174-
continue
263+
break
175264

176-
# dont merge variants of different type
177-
if variants[chrA][i][1] != variants[chrA][j][1] and not no_var:
265+
if skip_variant(variants[chrA][i][0],variants[chrA][j][0],variants[chrA][i][1],variants[chrA][j][1],vcf_line_A,vcf_line_B,pass_only,j,analysed_variants,no_var):
178266
continue
179267

180268
# if no_intra is chosen, variants may only be merged if they belong to different input files
@@ -192,21 +280,34 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
192280
if match:
193281
# add similar variants to the merge list and remove them
194282
if args.priority:
195-
files[variants[chrA][j][-3]] = variants[chrA][j][-1]
196-
merge.append(
197-
variants[chrA][j][-1].split("\t")[2].replace(";", "_") + ":" + variants[chrA][j][-3])
283+
match_id=variants[chrA][j][-3]
198284
else:
199-
files[variants[chrA][j]
200-
[-3].split(".vcf")[0].split("/")[-1]] = variants[chrA][j][-1]
201-
merge.append(variants[chrA][j][-1].split("\t")[2].replace(
202-
";", "_") + ":" + variants[chrA][j][-3].split(".vcf")[0].split("/")[-1])
285+
match_id=variants[chrA][j][-3].split(".vcf")[0].split("/")[-1]
286+
287+
files[match_id] = variants[chrA][j][-1]
288+
merge.append(vcf_line_B[2].replace(";", "_") + ":" + match_id)
289+
if not match_id in filters_tag:
290+
filters_tag[match_id]=[]
291+
samples_tag[match_id]=[]
292+
info_tag[match_id]=[]
293+
chrom_tag[match_id]=[]
294+
pos_tag[match_id]=[]
295+
qual_tag[match_id]=[]
296+
297+
chrom_tag[match_id].append("{}|{}".format(vcf_line_B[2].replace(";","_").replace(":","_").replace("|","_"),variants[chrA][j][-1].split("\t")[0]) )
298+
pos_tag[match_id].append("{}|{}".format(vcf_line_B[2].replace(";","_").replace(":","_").replace("|","_"),variants[chrA][j][-1].split("\t")[1]) )
299+
qual_tag[match_id].append("{}|{}".format(vcf_line_B[2].replace(";","_").replace(":","_").replace("|","_"),variants[chrA][j][-1].split("\t")[5]) )
300+
filters_tag[match_id].append("{}|{}".format(vcf_line_B[2].replace(";","_").replace(":","_").replace("|","_"),variants[chrA][j][-1].split("\t")[6]) )
301+
302+
samples_tag[match_id].append(collect_sample(vcf_line_B ,samples,sample_order,match_id))
303+
info_tag[match_id].append( collect_info(vcf_line_B) )
203304

204305
if variants[chrA][i][0] != chrA and "CSQ=" in variants[chrA][j][-1]:
205-
info = variants[chrA][j][-1].split("\t")[7]
306+
info = vcf_line_B[7]
206307
csq.append(info.split("CSQ=")[-1].split(";")[0])
207308
analysed_variants.add(j)
208309

209-
line = variants[chrA][i][-1].split("\t")
310+
line = vcf_line_A
210311

211312
if csq:
212313
line[7] = merge_csq(line[7], csq)
@@ -230,6 +331,29 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
230331
if not args.notag:
231332
set_tag = determine_set_tag(priority_order, files)
232333
line[7] += ";set={}".format(set_tag)
334+
335+
#add chrom information of all merged variants
336+
callers=[]
337+
for tag in filters_tag:
338+
line[7]+=";{}_CHROM={}".format(tag,",".join(chrom_tag[tag]))
339+
callers.append(tag)
340+
#add pos information of all merged variants
341+
for tag in filters_tag:
342+
line[7]+=";{}_POS={}".format(tag,",".join(pos_tag[tag]))
343+
#add qual information of all merged variants
344+
for tag in filters_tag:
345+
line[7]+=";{}_QUAL={}".format(tag,",".join(qual_tag[tag]))
346+
#add filter of all merged variants
347+
for tag in filters_tag:
348+
line[7]+=";{}_FILTERS={}".format(tag,",".join(filters_tag[tag]))
349+
#add samples information for all merged variants
350+
for tag in samples_tag:
351+
line[7]+=";{}_SAMPLES={}".format(tag,",".join(samples_tag[tag]))
352+
#add info column for all merged variants
353+
for tag in samples_tag:
354+
line[7]+=";{}_INFO={}".format(tag,",".join(info_tag[tag]))
355+
line[7]+=";svdb_origin={}".format("|".join(callers))
356+
233357
to_be_printed[line[0]].append(line)
234358

235359
analysed_variants.add(i)

tests/test_dbscan.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from svdb.DBSCAN import main
55

66

7-
class TestReadVCFLine(unittest.TestCase):
7+
class TestDBSCAN(unittest.TestCase):
88

99
#test that distant points are not merged
1010
def test_distant_points(self):

tests/test_merge.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import unittest
2+
import numpy
3+
4+
from svdb.merge_vcf_module_cython import collect_info, skip_variant, retrieve_key, collect_sample
5+
6+
7+
class TestMerge(unittest.TestCase):
8+
9+
#check that we find and retrieve correct entry of info column
10+
def test_retrieve_key(self):
11+
line="Y\t13799001\tCNVnator_dup_810:concatenated_ACC5821A7_XXXXXX_R_CNVnator|CNVnator_dup_1313:concatenated_ACC5838A1_XXXXXX_R_CNVnator\tN\t<DUP>\t.\tPASS\tEND=13870000;SVTYPE=DUP;SVLEN=71000;IMPRECISE;natorRD=25.6613;natorP1=0.000938744;natorP2=1.33071e-34;natorP3=0.00215104;natorP4=2.21186e-33;natorQ0=1;VARID=CNVnator_dup_1313:concatenated_ACC5838A1_XXXXXX_R_CNVnator;set=Intersection;concatenated_ACC5821A7_XXXXXX_R_CNVnator_FILTERS=CNVnator_dup_810|PASS;concatenated_ACC5838A1_XXXXXX_R_CNVnator_FILTERS=CNVnator_dup_1313|PASS;concatenated_ACC5821A7_XXXXXX_R_CNVnator_SAMPLES=CNVnator_dup_810|concatenated_ACC5821A7_XXXXXX_R_CNVnator|GT:./1|CN:26;concatenated_ACC5838A1_XXXXXX_R_CNVnator_SAMPLES=CNVnator_dup_1313|concatenated_ACC5838A1_XXXXXX_R_CNVnator|GT:./1|CN:35;concatenated_ACC5821A7_XXXXXX_R_CNVnator_INFO=CNVnator_dup_810|END:13870000|SVTYPE:DUP|SVLEN:71000|IMPRECISE|natorRD:25.6613|natorP1:0.000938744|natorP2:1.33071e-34|natorP3:0.00215104|natorP4:2.21186e-33|natorQ0:1;concatenated_ACC5838A1_XXXXXX_R_CNVnator_INFO=CNVnator_dup_1313|END:13870000|SVTYPE:DUP|SVLEN:71000|IMPRECISE|natorRD:35.4121|natorP1:0.000300039|natorP2:0|natorP3:0.00099868|natorP4:0|natorQ0:1\tGT:CN"
12+
key="SVLEN"
13+
assert(retrieve_key(line, key) == "71000")
14+
15+
def test_retrieve_key2(self):
16+
line="Y\t13799001\tCNVnator_dup_810:concatenated_ACC5821A7_XXXXXX_R_CNVnator|CNVnator_dup_1313:concatenated_ACC5838A1_XXXXXX_R_CNVnator\tN\t<DUP>\t.\tPASS\tEND=13870000;SVTYPE=DUP;SVLEN=71000;IMPRECISE;natorRD=25.6613;natorP1=0.000938744;natorP2=1.33071e-34;natorP3=0.00215104;natorP4=2.21186e-33;natorQ0=1;VARID=CNVnator_dup_1313:concatenated_ACC5838A1_XXXXXX_R_CNVnator;set=Intersection;concatenated_ACC5821A7_XXXXXX_R_CNVnator_FILTERS=CNVnator_dup_810|PASS;concatenated_ACC5838A1_XXXXXX_R_CNVnator_FILTERS=CNVnator_dup_1313|PASS;concatenated_ACC5821A7_XXXXXX_R_CNVnator_SAMPLES=CNVnator_dup_810|concatenated_ACC5821A7_XXXXXX_R_CNVnator|GT:./1|CN:26;concatenated_ACC5838A1_XXXXXX_R_CNVnator_SAMPLES=CNVnator_dup_1313|concatenated_ACC5838A1_XXXXXX_R_CNVnator|GT:./1|CN:35;concatenated_ACC5821A7_XXXXXX_R_CNVnator_INFO=CNVnator_dup_810|END:13870000|SVTYPE:DUP|SVLEN:71000|IMPRECISE|natorRD:25.6613|natorP1:0.000938744|natorP2:1.33071e-34|natorP3:0.00215104|natorP4:2.21186e-33|natorQ0:1;concatenated_ACC5838A1_XXXXXX_R_CNVnator_INFO=CNVnator_dup_1313|END:13870000|SVTYPE:DUP|SVLEN:71000|IMPRECISE|natorRD:35.4121|natorP1:0.000300039|natorP2:0|natorP3:0.00099868|natorP4:0|natorQ0:1\tGT:CN"
17+
key="VLEN"
18+
assert(retrieve_key(line, key) == False)
19+
20+
#check that the info field is summarised properly
21+
def test_collect_info(self):
22+
info = ["chr1", "1" , "hej" , "." ,"<DEL>", "." , "PASS" ,"END=5;SVTYPE=DEL;TEST=1,2,3,4,5"]
23+
result=collect_info(info)
24+
assert (result=="hej|END:5|SVTYPE:DEL|TEST:1:2:3:4:5")
25+
26+
#check that sample columns are retrieved properly
27+
def test_collect_collect_sample(self):
28+
vcf_line = ["chr1", "1" , "hej" , "." ,"<DEL>", "." , "PASS" ,"END=5;SVTYPE=DEL;TEST=1,2,3,4,5","GT:CN","1/1:0"]
29+
samples=["bob"]
30+
sample_order={"bob":{"cnvnator_bob":0}}
31+
f="cnvnator_bob"
32+
result=collect_sample(vcf_line,samples,sample_order,f)
33+
assert (result=="hej|bob|GT:1/1|CN:0")
34+
35+
36+
#test the skip_variant filter
37+
def test_skip_variant_different_var(self):
38+
no_var=False
39+
analysed_variants=set([1,2])
40+
current_variant=3
41+
pass_only=False
42+
vcf_line_A=["chr1", "1" , "hej" , "." ,"<DEL>", "." , "PASS" ,"END=5;SVTYPE=DEL;TEST=1,2,3,4,5"]
43+
vcf_line_B=["chr1", "1" , "hej" , "." ,"<DEL>", "." , "FAIL" ,"END=5;SVTYPE=DEL;TEST=1,2,3,4,5"]
44+
type_A="DEL"
45+
type_B="DEL"
46+
chrA="chr1"
47+
chrB="chr1"
48+
49+
#Do not merge variants of different type
50+
result=skip_variant(chrA,chrB,"DUP",type_B,vcf_line_A,vcf_line_B,pass_only,current_variant,analysed_variants,no_var)
51+
assert (result)
52+
53+
#merge variants of different types if no_var is True
54+
result=skip_variant(chrA,chrB,"DUP",type_B,vcf_line_A,vcf_line_B,pass_only,current_variant,analysed_variants,True)
55+
assert (not result)
56+
57+
#Do not cluster already clustered variants
58+
result=skip_variant(chrA,chrB,type_A,type_B,vcf_line_A,vcf_line_B,pass_only,2,analysed_variants,True)
59+
assert (result)
60+
61+
#Do not cluster variants located on different chromosomes
62+
result=skip_variant(chrA,"X",type_A,type_B,vcf_line_A,vcf_line_B,pass_only,current_variant,analysed_variants,True)
63+
assert (result)
64+
65+
#Skip filtered variants (if pass_only =True)
66+
result=skip_variant(chrA,"X",type_A,type_B,vcf_line_A,vcf_line_B,True,current_variant,analysed_variants,True)
67+
assert (result)
68+
69+
70+
71+
72+

0 commit comments

Comments
 (0)