6
6
def retrieve_key(line, key):
    """Return the value of a ``key=value`` entry found in ``line``.

    The key only counts as a match when it is directly preceded by ``;`` or
    by a tab, so a key that is merely the tail of a longer tag (for example
    ``END`` inside ``CIEND``) is not picked up by mistake.

    Args:
        line: text to search.
        key: tag name without the trailing ``=``.

    Returns:
        The text after the last delimited occurrence of ``key=``, cut at the
        next ``;`` or tab, or ``False`` when no delimited occurrence exists.
    """
    key += '='

    # ";" is tried before tab, matching the precedence of the original branches.
    for delimiter in (";", "\t"):
        marker = delimiter + key
        if marker in line:
            value = line.strip().split(marker)[-1]
            return value.split(";")[0].split("\t")[0]

    # Every miss (key absent, or present without a delimiter in front of it)
    # yields False, so callers never see None or an unbound local.
    return False
14
18
19
+ #Check if no merging should occur
20
def skip_variant(chrA, chrB, type_A, type_B, vcf_line_A, vcf_line_B, pass_only, current_variant, analysed_variants, no_var):
    """Decide whether variant B must be left out of the cluster built around A.

    Args:
        chrA, chrB: chromosome labels compared for equality.
        type_A, type_B: variant types compared for equality.
        vcf_line_A: split VCF record of variant A (not read by this function).
        vcf_line_B: split VCF record of variant B; index 6 is its FILTER value.
        pass_only: when truthy, B must have FILTER ``PASS`` or ``.``.
        current_variant: identifier looked up in ``analysed_variants``.
        analysed_variants: container of variants that were already handled.
        no_var: when truthy, differing variant types do not block merging.

    Returns:
        ``True`` when B should be skipped, ``False`` otherwise.
    """
    # The variant is already clustered/analysed
    if current_variant in analysed_variants:
        return True

    # Only treat variants on the same pair of chromosomes
    if chrA != chrB:
        return True

    # Don't merge variants of different type unless no_var is set
    if type_A != type_B and not no_var:
        return True

    # With pass_only, only variants marked PASS (or unfiltered) are merged
    if pass_only and vcf_line_B[6] not in ('PASS', '.'):
        return True

    # Explicit result instead of falling off the end with None
    return False
38
+
39
+
40
+ #Collect SAMPLE columns from all merged variants:
41
def collect_sample(vcf_line, samples, sample_order, f):
    """Serialise the per-sample FORMAT values of one VCF record.

    Args:
        vcf_line: split VCF record; index 2 is the ID, index 8 the FORMAT
            column, and sample columns start at index 9.
        samples: sample names to look for, in output order.
        sample_order: mapping ``sample -> {file id -> sample column offset}``,
            where the offset is relative to index 9 of ``vcf_line``.
        f: file id used to pick the offset for each sample.

    Returns:
        A ``|``-joined string starting with the sanitised variant ID, followed
        for each matching sample by its name and one ``FORMAT:value`` item per
        FORMAT field. Every ``,`` in the result is replaced by ``:``.
    """
    # ";", ":" and "|" act as separators in the output, so mask them in the ID
    variant = vcf_line[2].replace(";", "_").replace(":", "_").replace("|", "_")
    sample_data = [variant]

    for sample in samples:
        if sample not in sample_order:
            continue
        if f not in sample_order[sample]:
            continue
        sample_position = sample_order[sample][f]

        entries = vcf_line[8].split(":")
        sample_entries = vcf_line[9 + sample_position].split(":")
        sample_data.append(sample)
        for i, entry in enumerate(entries):
            # A sample column may drop trailing FORMAT fields (e.g. "./.");
            # report those as missing (".") instead of raising IndexError.
            value = sample_entries[i] if i < len(sample_entries) else "."
            sample_data.append("{}:{}".format(entry, value))

    return "|".join(sample_data).replace(",", ":")
59
+
60
+ #collect INFO from all merged_variants
61
def collect_info(vcf_line):
    """Serialise the INFO column of one VCF record.

    Args:
        vcf_line: split VCF record; index 2 is the ID and index 7 the INFO
            column.

    Returns:
        A ``|``-joined string starting with the sanitised variant ID, followed
        by every INFO entry that contains neither ``:`` nor ``|``, with ``=``
        and ``,`` replaced by ``:``.
    """
    # ";", ":" and "|" act as separators in the output, so mask them in the ID
    variant = vcf_line[2].replace(";", "_").replace(":", "_").replace("|", "_")
    all_info = [variant]

    for content in vcf_line[7].split(";"):
        # An empty entry (trailing or doubled ";") would only add an empty item
        if not content:
            continue
        # Entries already containing an output separator cannot be embedded
        if ":" in content or "|" in content:
            continue
        all_info.append(content.replace("=", ":").replace(",", ":"))

    return "|".join(all_info)
73
+
74
+ #create a GATK-like set of all merged variants
15
75
def determine_set_tag (priority_order , files ):
16
76
n_filtered = 0
17
77
n_pass = 0
@@ -39,7 +99,7 @@ def determine_set_tag(priority_order, files):
39
99
filtered .append ("filterIn" + sample )
40
100
return "-" .join (filtered )
41
101
42
-
102
+ #merge csq field of merged variants (for instance when merging BNDs)
43
103
def merge_csq (info , csq ):
44
104
"""Merge the csq fields of bnd variants"""
45
105
var_csq = info .split ("CSQ=" )[- 1 ].split (";" )[0 ]
@@ -116,18 +176,30 @@ def sort_format_field(line, samples, sample_order, sample_print_order, priority_
116
176
# generate a union of the info fields
117
177
info_union = []
118
178
tags_in_info = []
179
+
180
+ # tags only to be copied from the file with the highest priority (to avoid problems in downstream analyses)
181
+ blacklist = set (["SVLEN" ,"END" ,"SVTYPE" ])
182
+
183
+ first = True
119
184
for input_file in priority_order :
120
185
if input_file not in files :
121
186
continue
187
+
122
188
INFO = files [input_file ].strip ().split ("\t " )[7 ]
123
189
INFO_content = INFO .split (";" )
124
190
125
191
for content in INFO_content :
126
192
tag = content .split ("=" )[0 ]
193
+
194
+ if not first and tag in blacklist :
195
+ continue
196
+
127
197
if tag not in tags_in_info :
128
198
tags_in_info .append (tag )
129
199
info_union .append (content )
130
200
201
+ first = False
202
+
131
203
new_info = ";" .join (info_union )
132
204
line [7 ] = new_info
133
205
return line
@@ -150,31 +222,47 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
150
222
continue
151
223
152
224
merge = []
225
+ #keep track of all csq of merged variants
153
226
csq = []
154
227
228
+ #Keep track of all files in the cluster
155
229
files = {}
156
- for j in range (i + 1 , len (variants [chrA ])):
157
- if j in analysed_variants :
158
- continue
230
+ #keep track of the FILTER column of all merged files
231
+ filters_tag = {}
232
+ #keep track of SAMPLEs columns of all merged files
233
+ samples_tag = {}
234
+ #keep track of INFO column of all merged files
235
+ info_tag = {}
236
+ #quality
237
+ qual_tag = {}
238
+ #pos
239
+ pos_tag = {}
240
+ #chrom
241
+ chrom_tag = {}
159
242
160
- # if the pass_only option is chosen, only variants marked PASS will be merged
161
- if pass_only :
162
- filter_tag = variants [chrA ][i ][- 1 ].split ("\t " )[6 ]
163
- if filter_tag not in ['PASS' , '.' ]:
164
- break
243
+ if args .priority :
244
+ id = variants [chrA ][i ][- 3 ]
245
+ else :
246
+ id = variants [chrA ][i ][- 3 ].split (".vcf" )[0 ].split ("/" )[- 1 ]
165
247
166
- # only treat varints on the same pair of chromosomes
167
- if not variants [chrA ][i ][0 ] == variants [chrA ][j ][0 ]:
168
- continue
248
+ vcf_line_A = variants [chrA ][i ][- 1 ].strip ().split ("\t " )
249
+ chrom_tag [id ]= [ "{}|{}" .format (vcf_line_A [2 ].replace (";" ,"_" ).replace (":" ,"_" ).replace ("|" ,"_" ),vcf_line_A [0 ]) ]
250
+ pos_tag [id ]= [ "{}|{}" .format (vcf_line_A [2 ].replace (";" ,"_" ).replace (":" ,"_" ).replace ("|" ,"_" ),vcf_line_A [1 ]) ]
251
+ qual_tag [id ]= [ "{}|{}" .format (vcf_line_A [2 ].replace (";" ,"_" ).replace (":" ,"_" ).replace ("|" ,"_" ),vcf_line_A [5 ]) ]
252
+ filters_tag [id ]= [ "{}|{}" .format (vcf_line_A [2 ].replace (";" ,"_" ).replace (":" ,"_" ).replace ("|" ,"_" ),vcf_line_A [6 ]) ]
253
+ samples_tag [id ]= [collect_sample ( vcf_line_A ,samples ,sample_order ,id )]
254
+ info_tag [id ]= [collect_info (vcf_line_A )]
255
+
256
+ for j in range (i + 1 , len (variants [chrA ])):
257
+ vcf_line_B = variants [chrA ][j ][- 1 ].strip ().split ("\t " )
169
258
170
259
# if the pass_only option is chosen, only variants marked PASS will be merged
171
260
if pass_only :
172
- filter_tag = variants [ chrA ][ j ][ - 1 ]. split ( " \t " ) [6 ]
261
+ filter_tag = vcf_line_A [6 ]
173
262
if filter_tag not in ['PASS' , '.' ]:
174
- continue
263
+ break
175
264
176
- # dont merge variants of different type
177
- if variants [chrA ][i ][1 ] != variants [chrA ][j ][1 ] and not no_var :
265
+ if skip_variant (variants [chrA ][i ][0 ],variants [chrA ][j ][0 ],variants [chrA ][i ][1 ],variants [chrA ][j ][1 ],vcf_line_A ,vcf_line_B ,pass_only ,j ,analysed_variants ,no_var ):
178
266
continue
179
267
180
268
# if no_intra is chosen, variants may only be merged if they belong to different input files
@@ -192,21 +280,34 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
192
280
if match :
193
281
# add similar variants to the merge list and remove them
194
282
if args .priority :
195
- files [variants [chrA ][j ][- 3 ]] = variants [chrA ][j ][- 1 ]
196
- merge .append (
197
- variants [chrA ][j ][- 1 ].split ("\t " )[2 ].replace (";" , "_" ) + ":" + variants [chrA ][j ][- 3 ])
283
+ match_id = variants [chrA ][j ][- 3 ]
198
284
else :
199
- files [variants [chrA ][j ]
200
- [- 3 ].split (".vcf" )[0 ].split ("/" )[- 1 ]] = variants [chrA ][j ][- 1 ]
201
- merge .append (variants [chrA ][j ][- 1 ].split ("\t " )[2 ].replace (
202
- ";" , "_" ) + ":" + variants [chrA ][j ][- 3 ].split (".vcf" )[0 ].split ("/" )[- 1 ])
285
+ match_id = variants [chrA ][j ][- 3 ].split (".vcf" )[0 ].split ("/" )[- 1 ]
286
+
287
+ files [match_id ] = variants [chrA ][j ][- 1 ]
288
+ merge .append (vcf_line_B [2 ].replace (";" , "_" ) + ":" + match_id )
289
+ if not match_id in filters_tag :
290
+ filters_tag [match_id ]= []
291
+ samples_tag [match_id ]= []
292
+ info_tag [match_id ]= []
293
+ chrom_tag [match_id ]= []
294
+ pos_tag [match_id ]= []
295
+ qual_tag [match_id ]= []
296
+
297
+ chrom_tag [match_id ].append ("{}|{}" .format (vcf_line_B [2 ].replace (";" ,"_" ).replace (":" ,"_" ).replace ("|" ,"_" ),variants [chrA ][j ][- 1 ].split ("\t " )[0 ]) )
298
+ pos_tag [match_id ].append ("{}|{}" .format (vcf_line_B [2 ].replace (";" ,"_" ).replace (":" ,"_" ).replace ("|" ,"_" ),variants [chrA ][j ][- 1 ].split ("\t " )[1 ]) )
299
+ qual_tag [match_id ].append ("{}|{}" .format (vcf_line_B [2 ].replace (";" ,"_" ).replace (":" ,"_" ).replace ("|" ,"_" ),variants [chrA ][j ][- 1 ].split ("\t " )[5 ]) )
300
+ filters_tag [match_id ].append ("{}|{}" .format (vcf_line_B [2 ].replace (";" ,"_" ).replace (":" ,"_" ).replace ("|" ,"_" ),variants [chrA ][j ][- 1 ].split ("\t " )[6 ]) )
301
+
302
+ samples_tag [match_id ].append (collect_sample (vcf_line_B ,samples ,sample_order ,match_id ))
303
+ info_tag [match_id ].append ( collect_info (vcf_line_B ) )
203
304
204
305
if variants [chrA ][i ][0 ] != chrA and "CSQ=" in variants [chrA ][j ][- 1 ]:
205
- info = variants [ chrA ][ j ][ - 1 ]. split ( " \t " ) [7 ]
306
+ info = vcf_line_B [7 ]
206
307
csq .append (info .split ("CSQ=" )[- 1 ].split (";" )[0 ])
207
308
analysed_variants .add (j )
208
309
209
- line = variants [ chrA ][ i ][ - 1 ]. split ( " \t " )
310
+ line = vcf_line_A
210
311
211
312
if csq :
212
313
line [7 ] = merge_csq (line [7 ], csq )
@@ -230,6 +331,29 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
230
331
if not args .notag :
231
332
set_tag = determine_set_tag (priority_order , files )
232
333
line [7 ] += ";set={}" .format (set_tag )
334
+
335
+ #add chrom information of all merged variants
336
+ callers = []
337
+ for tag in filters_tag :
338
+ line [7 ]+= ";{}_CHROM={}" .format (tag ,"," .join (chrom_tag [tag ]))
339
+ callers .append (tag )
340
+ #add pos information of all merged variants
341
+ for tag in filters_tag :
342
+ line [7 ]+= ";{}_POS={}" .format (tag ,"," .join (pos_tag [tag ]))
343
+ #add qual information of all merged variants
344
+ for tag in filters_tag :
345
+ line [7 ]+= ";{}_QUAL={}" .format (tag ,"," .join (qual_tag [tag ]))
346
+ #add filter of all merged variants
347
+ for tag in filters_tag :
348
+ line [7 ]+= ";{}_FILTERS={}" .format (tag ,"," .join (filters_tag [tag ]))
349
+ #add samples information for all merged variants
350
+ for tag in samples_tag :
351
+ line [7 ]+= ";{}_SAMPLES={}" .format (tag ,"," .join (samples_tag [tag ]))
352
+ #add info column for all merged variants
353
+ for tag in samples_tag :
354
+ line [7 ]+= ";{}_INFO={}" .format (tag ,"," .join (info_tag [tag ]))
355
+ line [7 ]+= ";svdb_origin={}" .format ("|" .join (callers ))
356
+
233
357
to_be_printed [line [0 ]].append (line )
234
358
235
359
analysed_variants .add (i )
0 commit comments