Merge pull request #48 from J35P312/merge_fix

J35P312 · web-flow · commit a62a9308c308 · 2022-03-29T17:04:33.000+02:00
Add information for all callers in INFO for overlapping events
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
     ext_modules = []
 
 setup(name='svdb',
-      version='2.5.3',
+      version='2.6.0',
       url="https://github.com/J35P312/SVDB",
       author="Jesper Eisfeldt",
       author_email="jesper.eisfeldt@scilifelab.se",
diff --git a/svdb/__main__.py b/svdb/__main__.py
@@ -36,7 +36,7 @@ def make_query_calls (args, queries, keyword):
         query_module.main(args)
 
 def main():
-    version = "2.5.3"
+    version = "2.6.0"
     parser = argparse.ArgumentParser(
         """SVDB-{}, use the build module to construct databases, use the query module to query the database usign vcf files, or use the hist module to generate histograms""".format(version), add_help=False)
     parser.add_argument('--build', help="create a db",
diff --git a/svdb/merge_vcf_module.py b/svdb/merge_vcf_module.py
@@ -74,6 +74,15 @@ def print_header(vcf_list, vcf_dictionary, args, command_line):
     for entry in sorted(header["INFO"]):
         print(header["INFO"][entry].strip())
     del header["INFO"]
+
+    for vcf in vcf_dictionary:  # one group of per-file INFO header lines for each entry of vcf_dictionary
+        print("##INFO=<ID={}_INFO,Number=.,Type=String,Description=\"pipe separated list of all details in the INFO column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
+        print("##INFO=<ID={}_SAMPLES,Number=.,Type=String,Description=\"pipe separated list of all details in the SAMPLEs column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))  # ID must equal the <file>_SAMPLES tag written to the merged records
+        print("##INFO=<ID={}_CHROM,Number=.,Type=String,Description=\"pipe separated list of all details in the CHROM column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
+        print("##INFO=<ID={}_POS,Number=.,Type=String,Description=\"pipe separated list of all details in the POS column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
+        print("##INFO=<ID={}_QUAL,Number=.,Type=String,Description=\"pipe separated list of all details in the QUAL column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
+        print("##INFO=<ID={}_FILTERS,Number=.,Type=String,Description=\"pipe separated list of all details in the FILTER column of file {}\">".format(vcf_dictionary[vcf],vcf_dictionary[vcf]))
+
     # print contigs according to the input order
     if reference != "":
         print(reference.strip())
@@ -101,6 +110,8 @@ def print_header(vcf_list, vcf_dictionary, args, command_line):
     if not args.notag:
         print("##INFO=<ID=VARID,Number=1,Type=String,Description=\"The variant ID of merged samples\">")
         print("##INFO=<ID=set,Number=1,Type=String,Description=\"Source VCF for the merged record in SVDB\">")
+        print("##INFO=<ID=svdb_origin,Number=1,Type=String,Description=\"pipe separated list of the VCF for the merged record in SVDB\">")
+
     print("##svdbcmdline={}".format(" ".join(command_line)))
     sample_print_order = {}
 
diff --git a/svdb/merge_vcf_module_cython.py b/svdb/merge_vcf_module_cython.py
@@ -6,12 +6,72 @@
 def retrieve_key(line, key):
     key += '='
     if key in line:
-        item = line.strip().split(key)[-1].split(";")[0]
-        if len(item) == len(line.strip()):
+        if ";{}".format(key) in line:  # the tag follows another ";"-separated entry
+            return line.strip().split( ";{}".format(key) )[-1].split(";")[0].split("\t")[0]
+
+        elif "\t{}".format(key) in line:  # the tag is the first entry of a tab-separated column
+            return line.strip().split( "\t{}".format(key) )[-1].split(";")[0].split("\t")[0]
+        else:  # "key=" only occurs as the tail of a longer tag (e.g. "VLEN=" inside "SVLEN=")
             return False
-    return item
 
+    return False  # tag absent: report "not found" instead of returning an unbound local
 
+#Check if no merging should occur
+def skip_variant(chrA,chrB,type_A,type_B,vcf_line_A,vcf_line_B,pass_only,current_variant,analysed_variants,no_var):
+    """Return True if variant B must not be merged with variant A, otherwise False."""
+    # the variant is already clustered/analysed
+    if current_variant in analysed_variants:
+        return True
+
+    # only treat variants on the same pair of chromosomes
+    if chrA != chrB:
+        return True
+
+    # don't merge variants of different type, unless no_var is set
+    if type_A != type_B and not no_var:
+        return True
+
+    # with pass_only, variant B is skipped unless its FILTER column (index 6) is PASS or "."
+    if pass_only and vcf_line_B[6] not in ('PASS', '.'):
+        return True
+
+    return False
+
+#Collect SAMPLE columns from all merged variants:
+def collect_sample(vcf_line,samples,sample_order,f):
+    """Summarise the sample columns that file f contributes to vcf_line as 'ID|sample|KEY:value|...'."""
+    # the variant ID leads the summary; ";", ":" and "|" in the ID are replaced with "_"
+    variant=vcf_line[2].replace(";","_").replace(":","_").replace("|","_")
+    sample_data=[variant]
+
+    for sample in samples:
+        # only samples registered for file f in sample_order are looked up in this record
+        if sample not in sample_order or f not in sample_order[sample]:
+            continue
+
+        format_keys = vcf_line[8].split(":")
+        sample_values = vcf_line[9 + sample_order[sample][f]].split(":")
+        sample_data.append(sample)
+        # zip stops at the shorter list, so a sample column with dropped trailing fields no longer raises IndexError
+        sample_data.extend("{}:{}".format(key, value) for key, value in zip(format_keys, sample_values))
+
+    return "|".join(sample_data).replace(",",":")
+
+#collect INFO from all merged_variants
+def collect_info(vcf_line):
+    """Summarise the INFO column of a split VCF record as 'ID|TAG:value|...'."""
+    # ";", ":" and "|" in the variant ID are replaced with "_"
+    variant_id = vcf_line[2].replace(";","_").replace(":","_").replace("|","_")
+    summary = [variant_id]
+
+    # INFO entries that already contain ":" or "|" are left out of the summary
+    for entry in vcf_line[7].split(";"):
+        if ":" in entry or "|" in entry:
+            continue
+        summary.append(entry.replace("=",":").replace(",",":"))
+    return "|".join(summary)
+
+#create a GATK-like set of all merged variants
 def determine_set_tag(priority_order, files):
     n_filtered = 0
     n_pass = 0
@@ -39,7 +99,7 @@ def determine_set_tag(priority_order, files):
                 filtered.append("filterIn" + sample)
         return "-".join(filtered)
 
-
+#merge csq field of merged variants (for instance when merging BNDs)
 def merge_csq(info, csq):
     """Merge the csq fields of bnd variants"""
     var_csq = info.split("CSQ=")[-1].split(";")[0]
@@ -116,18 +176,30 @@ def sort_format_field(line, samples, sample_order, sample_print_order, priority_
     # generate a union of the info fields
     info_union = []
     tags_in_info = []
+
+    # tags only to be copied from the file with highest priority (to avoid problems in downstream analyses)
+    blacklist=set(["SVLEN","END","SVTYPE"])
+
+    first=True
     for input_file in priority_order:
         if input_file not in files:
             continue
+
         INFO = files[input_file].strip().split("\t")[7]
         INFO_content = INFO.split(";")
 
         for content in INFO_content:
             tag = content.split("=")[0]
+
+            if not first and tag in blacklist:
+                continue
+
             if tag not in tags_in_info:
                 tags_in_info.append(tag)
                 info_union.append(content)
 
+        first=False
+
     new_info = ";".join(info_union)
     line[7] = new_info
     return line
@@ -150,31 +222,47 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
                 continue
 
             merge = []
+            #keep track of all csq of merged variants
             csq = []
 
+            #Keep track of all files in the cluster
             files = {}
-            for j in range(i + 1, len(variants[chrA])):
-                if j in analysed_variants:
-                    continue
+            #keep track of the FILTER column of all merged files
+            filters_tag = {}
+            #keep track of SAMPLEs columns of all merged files
+            samples_tag = {}
+            #keep track of INFO column of all merged files
+            info_tag = {}
+            #quality
+            qual_tag = {}
+            #pos
+            pos_tag = {}
+            #chrom
+            chrom_tag ={}
 
-                # if the pass_only option is chosen, only variants marked PASS will be merged
-                if pass_only:
-                    filter_tag = variants[chrA][i][-1].split("\t")[6]
-                    if filter_tag not in ['PASS', '.']:
-                        break
+            if args.priority:
+               id=variants[chrA][i][-3]
+            else:
+               id=variants[chrA][i][-3].split(".vcf")[0].split("/")[-1]
 
-                # only treat varints on the same pair of chromosomes
-                if not variants[chrA][i][0] == variants[chrA][j][0]:
-                    continue
+            vcf_line_A=variants[chrA][i][-1].strip().split("\t")
+            chrom_tag[id]=[ "{}|{}".format(vcf_line_A[2].replace(";","_").replace(":","_").replace("|","_"),vcf_line_A[0]) ]
+            pos_tag[id]=[ "{}|{}".format(vcf_line_A[2].replace(";","_").replace(":","_").replace("|","_"),vcf_line_A[1]) ]
+            qual_tag[id]=[ "{}|{}".format(vcf_line_A[2].replace(";","_").replace(":","_").replace("|","_"),vcf_line_A[5]) ]
+            filters_tag[id]=[ "{}|{}".format(vcf_line_A[2].replace(";","_").replace(":","_").replace("|","_"),vcf_line_A[6]) ]
+            samples_tag[id]=[collect_sample( vcf_line_A ,samples,sample_order,id)]
+            info_tag[id]=[collect_info(vcf_line_A)]
+
+            for j in range(i + 1, len(variants[chrA])):
+                vcf_line_B=variants[chrA][j][-1].strip().split("\t")
 
                 # if the pass_only option is chosen, only variants marked PASS will be merged
                 if pass_only:
-                    filter_tag = variants[chrA][j][-1].split("\t")[6]
+                    filter_tag = vcf_line_A[6]
                     if filter_tag not in ['PASS', '.']:
-                        continue
+                        break
 
-                # dont merge variants of different type
-                if variants[chrA][i][1] != variants[chrA][j][1] and not no_var:
+                if skip_variant(variants[chrA][i][0],variants[chrA][j][0],variants[chrA][i][1],variants[chrA][j][1],vcf_line_A,vcf_line_B,pass_only,j,analysed_variants,no_var):
                     continue
 
                 # if no_intra is chosen, variants may only be merged if they belong to different input files
@@ -192,21 +280,34 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
                 if match:
                     # add similar variants to the merge list and remove them
                     if args.priority:
-                        files[variants[chrA][j][-3]] = variants[chrA][j][-1]
-                        merge.append(
-                            variants[chrA][j][-1].split("\t")[2].replace(";", "_") + ":" + variants[chrA][j][-3])
+                        match_id=variants[chrA][j][-3]
                     else:
-                        files[variants[chrA][j]
-                              [-3].split(".vcf")[0].split("/")[-1]] = variants[chrA][j][-1]
-                        merge.append(variants[chrA][j][-1].split("\t")[2].replace(
-                            ";", "_") + ":" + variants[chrA][j][-3].split(".vcf")[0].split("/")[-1])
+                        match_id=variants[chrA][j][-3].split(".vcf")[0].split("/")[-1]
+
+                    files[match_id] = variants[chrA][j][-1]
+                    merge.append(vcf_line_B[2].replace(";", "_") + ":" + match_id)
+                    if not match_id in filters_tag:
+                        filters_tag[match_id]=[]
+                        samples_tag[match_id]=[]
+                        info_tag[match_id]=[]
+                        chrom_tag[match_id]=[]
+                        pos_tag[match_id]=[]
+                        qual_tag[match_id]=[]
+
+                    chrom_tag[match_id].append("{}|{}".format(vcf_line_B[2].replace(";","_").replace(":","_").replace("|","_"),variants[chrA][j][-1].split("\t")[0]) )
+                    pos_tag[match_id].append("{}|{}".format(vcf_line_B[2].replace(";","_").replace(":","_").replace("|","_"),variants[chrA][j][-1].split("\t")[1]) )
+                    qual_tag[match_id].append("{}|{}".format(vcf_line_B[2].replace(";","_").replace(":","_").replace("|","_"),variants[chrA][j][-1].split("\t")[5]) )
+                    filters_tag[match_id].append("{}|{}".format(vcf_line_B[2].replace(";","_").replace(":","_").replace("|","_"),variants[chrA][j][-1].split("\t")[6]) )
+
+                    samples_tag[match_id].append(collect_sample(vcf_line_B ,samples,sample_order,match_id))
+                    info_tag[match_id].append( collect_info(vcf_line_B) )
 
                     if variants[chrA][i][0] != chrA and "CSQ=" in variants[chrA][j][-1]:
-                        info = variants[chrA][j][-1].split("\t")[7]
+                        info = vcf_line_B[7]
                         csq.append(info.split("CSQ=")[-1].split(";")[0])
                     analysed_variants.add(j)
 
-            line = variants[chrA][i][-1].split("\t")
+            line = vcf_line_A
 
             if csq:
                 line[7] = merge_csq(line[7], csq)
@@ -230,6 +331,29 @@ def merge(variants, samples, sample_order, sample_print_order, priority_order, a
             if not args.notag:
                 set_tag = determine_set_tag(priority_order, files)
                 line[7] += ";set={}".format(set_tag)
+
+            #add chrom information of all merged variants
+            callers=[]
+            for tag in filters_tag:
+                line[7]+=";{}_CHROM={}".format(tag,",".join(chrom_tag[tag]))
+                callers.append(tag)
+            #add pos information of all merged variants
+            for tag in filters_tag:
+                line[7]+=";{}_POS={}".format(tag,",".join(pos_tag[tag]))
+            #add qual information of all merged variants
+            for tag in filters_tag:
+                line[7]+=";{}_QUAL={}".format(tag,",".join(qual_tag[tag]))
+            #add filter of all merged variants
+            for tag in filters_tag:
+                line[7]+=";{}_FILTERS={}".format(tag,",".join(filters_tag[tag]))
+            #add samples information for all merged variants
+            for tag in samples_tag:
+                line[7]+=";{}_SAMPLES={}".format(tag,",".join(samples_tag[tag]))
+            #add info column for all merged variants
+            for tag in samples_tag:
+                line[7]+=";{}_INFO={}".format(tag,",".join(info_tag[tag]))
+            line[7]+=";svdb_origin={}".format("|".join(callers))
+
             to_be_printed[line[0]].append(line)
 
             analysed_variants.add(i)
diff --git a/tests/test_dbscan.py b/tests/test_dbscan.py
@@ -4,7 +4,7 @@
 from svdb.DBSCAN import main
 
 
-class TestReadVCFLine(unittest.TestCase):
+class TestDBSCAN(unittest.TestCase):
 
     #test that distant points are not merged
     def test_distant_points(self):
diff --git a/tests/test_merge.py b/tests/test_merge.py
@@ -0,0 +1,72 @@
+import unittest
+import numpy
+
+from svdb.merge_vcf_module_cython import collect_info, skip_variant, retrieve_key, collect_sample
+
+
+class TestMerge(unittest.TestCase):
+
+    #check that we find and retrieve correct entry of info column
+    def test_retrieve_key(self):
+        line="Y\t13799001\tCNVnator_dup_810:concatenated_ACC5821A7_XXXXXX_R_CNVnator|CNVnator_dup_1313:concatenated_ACC5838A1_XXXXXX_R_CNVnator\tN\t<DUP>\t.\tPASS\tEND=13870000;SVTYPE=DUP;SVLEN=71000;IMPRECISE;natorRD=25.6613;natorP1=0.000938744;natorP2=1.33071e-34;natorP3=0.00215104;natorP4=2.21186e-33;natorQ0=1;VARID=CNVnator_dup_1313:concatenated_ACC5838A1_XXXXXX_R_CNVnator;set=Intersection;concatenated_ACC5821A7_XXXXXX_R_CNVnator_FILTERS=CNVnator_dup_810|PASS;concatenated_ACC5838A1_XXXXXX_R_CNVnator_FILTERS=CNVnator_dup_1313|PASS;concatenated_ACC5821A7_XXXXXX_R_CNVnator_SAMPLES=CNVnator_dup_810|concatenated_ACC5821A7_XXXXXX_R_CNVnator|GT:./1|CN:26;concatenated_ACC5838A1_XXXXXX_R_CNVnator_SAMPLES=CNVnator_dup_1313|concatenated_ACC5838A1_XXXXXX_R_CNVnator|GT:./1|CN:35;concatenated_ACC5821A7_XXXXXX_R_CNVnator_INFO=CNVnator_dup_810|END:13870000|SVTYPE:DUP|SVLEN:71000|IMPRECISE|natorRD:25.6613|natorP1:0.000938744|natorP2:1.33071e-34|natorP3:0.00215104|natorP4:2.21186e-33|natorQ0:1;concatenated_ACC5838A1_XXXXXX_R_CNVnator_INFO=CNVnator_dup_1313|END:13870000|SVTYPE:DUP|SVLEN:71000|IMPRECISE|natorRD:35.4121|natorP1:0.000300039|natorP2:0|natorP3:0.00099868|natorP4:0|natorQ0:1\tGT:CN"
+        key="SVLEN"
+        assert(retrieve_key(line, key) == "71000")
+
+    def test_retrieve_key2(self):
+        line="Y\t13799001\tCNVnator_dup_810:concatenated_ACC5821A7_XXXXXX_R_CNVnator|CNVnator_dup_1313:concatenated_ACC5838A1_XXXXXX_R_CNVnator\tN\t<DUP>\t.\tPASS\tEND=13870000;SVTYPE=DUP;SVLEN=71000;IMPRECISE;natorRD=25.6613;natorP1=0.000938744;natorP2=1.33071e-34;natorP3=0.00215104;natorP4=2.21186e-33;natorQ0=1;VARID=CNVnator_dup_1313:concatenated_ACC5838A1_XXXXXX_R_CNVnator;set=Intersection;concatenated_ACC5821A7_XXXXXX_R_CNVnator_FILTERS=CNVnator_dup_810|PASS;concatenated_ACC5838A1_XXXXXX_R_CNVnator_FILTERS=CNVnator_dup_1313|PASS;concatenated_ACC5821A7_XXXXXX_R_CNVnator_SAMPLES=CNVnator_dup_810|concatenated_ACC5821A7_XXXXXX_R_CNVnator|GT:./1|CN:26;concatenated_ACC5838A1_XXXXXX_R_CNVnator_SAMPLES=CNVnator_dup_1313|concatenated_ACC5838A1_XXXXXX_R_CNVnator|GT:./1|CN:35;concatenated_ACC5821A7_XXXXXX_R_CNVnator_INFO=CNVnator_dup_810|END:13870000|SVTYPE:DUP|SVLEN:71000|IMPRECISE|natorRD:25.6613|natorP1:0.000938744|natorP2:1.33071e-34|natorP3:0.00215104|natorP4:2.21186e-33|natorQ0:1;concatenated_ACC5838A1_XXXXXX_R_CNVnator_INFO=CNVnator_dup_1313|END:13870000|SVTYPE:DUP|SVLEN:71000|IMPRECISE|natorRD:35.4121|natorP1:0.000300039|natorP2:0|natorP3:0.00099868|natorP4:0|natorQ0:1\tGT:CN"
+        key="VLEN"
+        assert(retrieve_key(line, key) == False)
+ 
+    #check that the info field is summarised properly
+    def test_collect_info(self):
+        info = ["chr1", "1" , "hej" , "." ,"<DEL>", "." , "PASS" ,"END=5;SVTYPE=DEL;TEST=1,2,3,4,5"]
+        result=collect_info(info)
+        assert (result=="hej|END:5|SVTYPE:DEL|TEST:1:2:3:4:5")
+
+    #check that sample columns are retrieved properly
+    def test_collect_collect_sample(self):
+        vcf_line = ["chr1", "1" , "hej" , "." ,"<DEL>", "." , "PASS" ,"END=5;SVTYPE=DEL;TEST=1,2,3,4,5","GT:CN","1/1:0"]
+        samples=["bob"]
+        sample_order={"bob":{"cnvnator_bob":0}}
+        f="cnvnator_bob"
+        result=collect_sample(vcf_line,samples,sample_order,f)
+        assert (result=="hej|bob|GT:1/1|CN:0")
+
+
+    #test the skip_variant filter
+    def test_skip_variant_different_var(self):
+        no_var=False
+        analysed_variants=set([1,2])
+        current_variant=3
+        pass_only=False
+        vcf_line_A=["chr1", "1" , "hej" , "." ,"<DEL>", "." , "PASS" ,"END=5;SVTYPE=DEL;TEST=1,2,3,4,5"]
+        vcf_line_B=["chr1", "1" , "hej" , "." ,"<DEL>", "." , "FAIL" ,"END=5;SVTYPE=DEL;TEST=1,2,3,4,5"]
+        type_A="DEL"
+        type_B="DEL"
+        chrA="chr1"
+        chrB="chr1"
+
+        #Do not merge variants of different type
+        result=skip_variant(chrA,chrB,"DUP",type_B,vcf_line_A,vcf_line_B,pass_only,current_variant,analysed_variants,no_var)
+        assert (result)
+
+        #merge variants of different types if no_var is True
+        result=skip_variant(chrA,chrB,"DUP",type_B,vcf_line_A,vcf_line_B,pass_only,current_variant,analysed_variants,True)
+        assert (not result)
+
+        #Do not cluster already clustered variants
+        result=skip_variant(chrA,chrB,type_A,type_B,vcf_line_A,vcf_line_B,pass_only,2,analysed_variants,True)
+        assert (result)
+
+        #Do not cluster variants located on different chromosomes
+        result=skip_variant(chrA,"X",type_A,type_B,vcf_line_A,vcf_line_B,pass_only,current_variant,analysed_variants,True)
+        assert (result)
+
+        #Skip filtered variants (if pass_only =True); same chromosome, so only the FAIL filter of vcf_line_B can trigger the skip
+        result=skip_variant(chrA,chrB,type_A,type_B,vcf_line_A,vcf_line_B,True,current_variant,analysed_variants,True)
+        assert (result)
+
+
+
+
+ 
diff --git a/tests/test_overlap_module.py b/tests/test_overlap_module.py
diff --git a/tests/test_readvcf.py b/tests/test_readvcf.py