Skip to content

Commit 9e91957

Browse files
authored
Merge pull request #44 from maize-genetics/advancedImputation_metrics
Advanced imputation metrics
2 parents fc6c9b6 + 01b5779 commit 9e91957

File tree

7 files changed

+334
-3
lines changed

7 files changed

+334
-3
lines changed

data/test/merge_gvcfs.bed

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
chr3 611489 616181
2+
chr3 617969 619167
3+
chr7 1047602 1169685
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
##fileformat=VCFv4.2
2+
##FORMAT=<ID=AD,Number=3,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
3+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
4+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##FORMAT=<ID=PL,Number=3,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
7+
##INFO=<ID=AF,Number=3,Type=Integer,Description="Allele Frequency">
8+
##INFO=<ID=ASM_Chr,Number=1,Type=String,Description="Assembly chromosome">
9+
##INFO=<ID=ASM_End,Number=1,Type=Integer,Description="Assembly end position">
10+
##INFO=<ID=ASM_Start,Number=1,Type=Integer,Description="Assembly start position">
11+
##INFO=<ID=ASM_Strand,Number=1,Type=String,Description="Assembly strand">
12+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
13+
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
14+
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
15+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2 Sample3
16+
chr3 611512 . C G . . . GT 1 0 0
17+
chr3 611768 . G A . . . GT 1 0 0
18+
chr3 612208 . C T . . . GT 0 1 1
19+
chr3 612646 . A C . . . GT 1 0 0
20+
chr3 612705 . G A . . . GT 1 0 0
21+
chr3 612766 . C G . . . GT 1 1 1
22+
chr3 613041 . C G,<DEL> . . . GT 1 2 2
23+
chr3 613045 . G A . . . GT 1 0 0
24+
chr3 613046 . G A . . . GT 1 0 0
25+
chr3 613154 . C T . . . GT 0 1 1
26+
chr3 613697 . G T . . . GT 1 0 0
27+
chr3 614008 . C T . . . GT 1 1 1
28+
chr3 614010 . A G . . . GT 1 1 1
29+
chr3 614032 . T A . . . GT 1 0 0
30+
chr3 614060 . G C . . . GT 0 1 1
31+
chr3 614068 . A C . . . GT 0 1 1
32+
chr3 614098 . T C . . . GT 0 1 1
33+
chr3 614121 . A G . . . GT 0 1 1
34+
chr3 614172 . A G . . . GT 0 1 1
35+
chr3 614173 . G A . . . GT 0 1 1
36+
chr3 614189 . G A . . . GT 0 1 1
37+
chr3 614190 . G A . . . GT 1 1 1
38+
chr3 614214 . A G . . . GT 0 1 1
39+
chr3 614262 . T G . . . GT 0 1 1
40+
chr3 614287 . C T . . . GT 0 1 1
41+
chr3 614363 . T C . . . GT 1 1 1
42+
chr3 614376 . C T . . . GT 0 1 1
43+
chr3 614405 . A T . . . GT 0 1 1
44+
chr3 614409 . T A . . . GT 0 1 1
45+
chr3 614437 . A C . . . GT 0 1 1
46+
chr3 614444 . A C . . . GT 0 1 1
47+
chr3 614569 . T A . . . GT 1 1 1
48+
chr3 614730 . T G . . . GT 1 1 1
49+
chr3 614741 . A C . . . GT 0 1 1
50+
chr3 614793 . G T . . . GT 0 1 1
51+
chr3 614824 . A T . . . GT 0 1 1
52+
chr3 614834 . G T . . . GT 0 1 1
53+
chr3 615048 . C T . . . GT 0 0 1
54+
chr3 615051 . T A . . . GT 0 1 1
55+
chr3 615173 . A T . . . GT 1 1 1
56+
chr3 615286 . A G . . . GT 1 0 0
57+
chr3 615344 . A T . . . GT 1 1 1
58+
chr3 615366 . T C,<DEL> . . . GT 2 1 1
59+
chr3 615398 . G A . . . GT 0 1 1
60+
chr3 615448 . C T . . . GT 1 1 1
61+
chr3 615459 . G C . . . GT 1 1 1
62+
chr3 615461 . G C . . . GT 0 1 1
63+
chr3 615465 . G A . . . GT 0 1 1
64+
chr3 615481 . G A . . . GT 1 0 0
65+
chr3 615493 . T C . . . GT 1 0 0
66+
chr3 615494 . G T . . . GT 1 0 0
67+
chr3 615747 . G A . . . GT 1 1 1
68+
chr3 615823 . T C . . . GT 1 1 1
69+
chr3 615865 . A T . . . GT 1 1 1
70+
chr3 615968 . A G . . . GT 1 1 1
71+
chr3 616102 . T C . . . GT 1 0 0
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
##fileformat=VCFv4.2
2+
##FORMAT=<ID=AD,Number=3,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
3+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
4+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##FORMAT=<ID=PL,Number=3,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
7+
##INFO=<ID=AF,Number=3,Type=Integer,Description="Allele Frequency">
8+
##INFO=<ID=ASM_Chr,Number=1,Type=String,Description="Assembly chromosome">
9+
##INFO=<ID=ASM_End,Number=1,Type=Integer,Description="Assembly end position">
10+
##INFO=<ID=ASM_Start,Number=1,Type=Integer,Description="Assembly start position">
11+
##INFO=<ID=ASM_Strand,Number=1,Type=String,Description="Assembly strand">
12+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
13+
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
14+
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
15+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2 Sample3
16+
chr3 618018 . A G . . . GT 1 1 1
17+
chr3 618024 . A T . . . GT 1 1 1
18+
chr3 618156 . T G,<DEL> . . . GT 1 2 2
19+
chr3 618168 . G A,<DEL> . . . GT 1 2 2
20+
chr3 618170 . T C,<DEL> . . . GT 1 2 2
21+
chr3 618171 . A C,<DEL> . . . GT 1 2 2
22+
chr3 618178 . A T,<DEL> . . . GT 1 2 2
23+
chr3 618251 . T A . . . GT 0 1 1
24+
chr3 618272 . T C . . . GT 1 0 0
25+
chr3 618275 . A G . . . GT 0 1 1
26+
chr3 618295 . A G . . . GT 1 1 1
27+
chr3 618299 . T C . . . GT 1 1 1
28+
chr3 618306 . A G . . . GT 1 1 1
29+
chr3 618320 . A G . . . GT 0 1 1
30+
chr3 618324 . G A . . . GT 0 1 1
31+
chr3 618350 . G A . . . GT 0 1 1
32+
chr3 618364 . T G . . . GT 0 1 1
33+
chr3 618369 . C T . . . GT 1 0 0
34+
chr3 618386 . G A . . . GT 0 1 1
35+
chr3 618404 . G A . . . GT 1 0 0
36+
chr3 618407 . G A . . . GT 1 0 0
37+
chr3 618410 . G C . . . GT 1 0 0
38+
chr3 618418 . A T . . . GT 1 0 0
39+
chr3 618420 . G C . . . GT 0 1 1
40+
chr3 618454 . G C . . . GT 0 1 1
41+
chr3 618465 . G A . . . GT 1 0 0
42+
chr3 618501 . C T . . . GT 1 1 1
43+
chr3 618509 . T C . . . GT 1 0 0
44+
chr3 618524 . G A . . . GT 0 1 1
45+
chr3 618529 . C T . . . GT 0 1 1
46+
chr3 618548 . A T . . . GT 0 1 1
47+
chr3 618559 . C T . . . GT 1 1 1
48+
chr3 618564 . A G . . . GT 1 1 1
49+
chr3 618568 . C G . . . GT 1 1 1
50+
chr3 618581 . C T . . . GT 0 1 1
51+
chr3 618594 . G A . . . GT 1 1 1
52+
chr3 618606 . G T . . . GT 1 0 0
53+
chr3 618644 . A G . . . GT 1 1 1
54+
chr3 618672 . A G . . . GT 1 0 0
55+
chr3 618698 . G C . . . GT 0 1 1
56+
chr3 618706 . A G . . . GT 1 0 0
57+
chr3 618712 . A G . . . GT 1 0 0
58+
chr3 618727 . G A . . . GT 1 0 0
59+
chr3 618745 . A T . . . GT 0 1 1
60+
chr3 618874 . G A . . . GT 1 1 1
61+
chr3 618882 . T A . . . GT 0 1 1
62+
chr3 618889 . T A . . . GT 0 1 1
63+
chr3 618943 . G T . . . GT 0 1 1
64+
chr3 618963 . T C . . . GT 1 0 0
65+
chr3 618987 . C A,T . . . GT 1 2 2
66+
chr3 618994 . G A . . . GT 1 1 1
67+
chr3 619031 . C T . . . GT 1 1 1
68+
chr3 619044 . C T . . . GT 0 1 1
69+
chr3 619052 . C A . . . GT 0 1 1
70+
chr3 619053 . C T . . . GT 0 1 1
71+
chr3 619054 . G A . . . GT 0 1 1
72+
chr3 619102 . G T,<DEL> . . . GT 2 1 1
73+
chr3 619166 . A G . . . GT 1 1 1
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
##fileformat=VCFv4.2
2+
##FORMAT=<ID=AD,Number=3,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
3+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
4+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##FORMAT=<ID=PL,Number=3,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
7+
##INFO=<ID=AF,Number=3,Type=Integer,Description="Allele Frequency">
8+
##INFO=<ID=ASM_Chr,Number=1,Type=String,Description="Assembly chromosome">
9+
##INFO=<ID=ASM_End,Number=1,Type=Integer,Description="Assembly end position">
10+
##INFO=<ID=ASM_Start,Number=1,Type=Integer,Description="Assembly start position">
11+
##INFO=<ID=ASM_Strand,Number=1,Type=String,Description="Assembly strand">
12+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
13+
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
14+
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
15+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2 Sample3
16+
chr7 1047637 . C T,<INS> . . . GT 1 2 1
17+
chr7 1153180 . T G,<INS> . . . GT 2 1 0
18+
chr7 1153182 . T C,<INS> . . . GT 0 1 2
19+
chr7 1153183 . A C,<INS> . . . GT 0 2 1
20+
chr7 1153292 . A T,<INS> . . . GT 1 1 2
21+
chr7 1153343 . T G,<INS> . . . GT 2 2 1
22+
chr7 1153353 . T A,<INS> . . . GT 2 2 1
23+
chr7 1153393 . G T,<INS> . . . GT 2 1 2
24+
chr7 1154153 . G T,<INS> . . . GT 2 2 1
25+
chr7 1154345 . T G,<INS> . . . GT 1 1 2
26+
chr7 1157847 . T C,<INS> . . . GT 1 1 2
27+
chr7 1159061 . G A,<INS> . . . GT 2 1 1
28+
chr7 1159063 . G T,<INS> . . . GT 1 2 2
29+
chr7 1159274 . T A,<INS> . . . GT 0 1 2
30+
chr7 1159330 . C A,<INS> . . . GT 2 1 1
31+
chr7 1160237 . T G,<INS> . . . GT 1 2 2
32+
chr7 1169685 . G A,<INS> . . . GT 1 0 2
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package biokotlin.util
2+
3+
import biokotlin.genome.PositionRange
4+
import java.io.File
5+
import java.util.*
6+
7+
object BedUtils {
8+
9+
/**
10+
* Read a bed file and return a sorted set of PositionRange objects.
11+
* Bed files are zero-based, inclusive / exclusive.
12+
* PositionRange objects are one-based, inclusive / inclusive.
13+
*/
14+
fun readBedfile(bedFileName: String): SortedSet<PositionRange> {
15+
16+
require(File(bedFileName).exists()) { "File $bedFileName does not exist." }
17+
18+
return bufferedReader(bedFileName).readLines().map { line ->
19+
val lineSplit = line.split("\t")
20+
val chrom = lineSplit[0]
21+
val start = lineSplit[1].toInt() + 1
22+
val end = lineSplit[2].toInt()
23+
PositionRange(chrom, start, end)
24+
}.toSortedSet()
25+
26+
}
27+
28+
}

src/main/kotlin/biokotlin/util/MergeGVCFUtils.kt

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
package biokotlin.util
22

3+
import biokotlin.genome.PositionRange
4+
import biokotlin.util.BedUtils.readBedfile
35
import htsjdk.variant.variantcontext.Allele
46
import htsjdk.variant.variantcontext.GenotypeBuilder
57
import htsjdk.variant.variantcontext.VariantContext
68
import htsjdk.variant.variantcontext.VariantContextBuilder
79
import htsjdk.variant.variantcontext.writer.Options
10+
import htsjdk.variant.variantcontext.writer.VariantContextWriter
811
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder
912
import htsjdk.variant.vcf.VCFHeader
1013
import kotlinx.coroutines.*
@@ -14,9 +17,15 @@ import java.io.File
1417

1518
private val myLogger = LogManager.getLogger("biokotlin.util.MergeGVCFUtils")
1619

20+
/**
21+
* Merges GVCF files into a single VCF file.
22+
* The GVCF files should have only one sample each.
23+
* If a bedfile is provided, then the output VCF file is split into multiple files
24+
* based on the ranges in the bedfile.
25+
*/
1726
object MergeGVCFUtils {
1827

19-
fun mergeGVCFs(inputDir: String, outputFile: String) {
28+
fun mergeGVCFs(inputDir: String, outputFile: String, bedfile: String? = null) {
2029

2130
runBlocking {
2231

@@ -61,8 +70,14 @@ object MergeGVCFUtils {
6170

6271
}
6372

64-
myLogger.info("writing output: $outputFile")
65-
writeOutputVCF(outputFile, samples, variantContextChannel)
73+
val theBedfile = bedfile?.trim()
74+
if (theBedfile.isNullOrEmpty()) {
75+
myLogger.info("Writing output: $outputFile")
76+
writeOutputVCF(outputFile, samples, variantContextChannel)
77+
} else {
78+
myLogger.info("Writing output files with base name: $outputFile and bedfile: $bedfile")
79+
writeOutputVCF(outputFile, theBedfile, samples, variantContextChannel)
80+
}
6681

6782
}
6883

@@ -97,6 +112,9 @@ private fun createVariantContext(
97112

98113
}
99114

115+
/**
116+
* Writes the merged VCF file. This is used when a bedfile is not provided.
117+
*/
100118
private suspend fun writeOutputVCF(
101119
outputFile: String,
102120
samples: List<String>,
@@ -124,6 +142,63 @@ private suspend fun writeOutputVCF(
124142

125143
}
126144

145+
/**
146+
* Writes the merged VCF files. This is used when a bedfile is provided.
147+
* The bedfile is used to create one VCF file per range in the bedfile.
148+
*/
149+
private suspend fun writeOutputVCF(
150+
outputFile: String,
151+
bedfile: String,
152+
samples: List<String>,
153+
variantContextChannel: Channel<Deferred<List<VariantContext>>>
154+
) {
155+
156+
val ranges = readBedfile(bedfile)
157+
158+
val header = VCFHeader(createGenericVCFHeaders(samples))
159+
160+
var writePositionRange: PositionRange? = null
161+
var writer: VariantContextWriter? = null
162+
for (deferred in variantContextChannel) {
163+
164+
val variantContexts = deferred.await()
165+
variantContexts.forEach { variantContext ->
166+
167+
val currentPositionRange = PositionRange(variantContext.contig, variantContext.start, variantContext.end)
168+
169+
// If the current position range doesn't overlap with the current write position range
170+
// then close the current writer and open a new one
171+
if (writePositionRange == null || !writePositionRange!!.overlapping(currentPositionRange)) {
172+
173+
// Close the previous writer
174+
writer?.close()
175+
writer = null
176+
177+
// Find the range in the bedfile that overlaps with the current position range
178+
// and create a new writer for that range. If no range is found, then don't create
179+
// a writer.
180+
writePositionRange = ranges.find { it.overlapping(currentPositionRange) }
181+
writePositionRange?.let {
182+
val filename = "${outputFile}-${writePositionRange!!.toString().replace(':', '_')}.vcf"
183+
writer = VariantContextWriterBuilder()
184+
.unsetOption(Options.INDEX_ON_THE_FLY)
185+
.setOutputFile(File(filename))
186+
.setOutputFileType(VariantContextWriterBuilder.OutputType.VCF)
187+
.setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
188+
.build()
189+
writer?.writeHeader(header)
190+
}
191+
192+
}
193+
194+
writer?.add(variantContext)
195+
196+
}
197+
198+
}
199+
200+
}
201+
127202
/**
128203
* Creates a VariantContext for the current position, given the variants
129204
* at that position from the GVCF readers.

0 commit comments

Comments
 (0)