Skip to content

Commit 01a14ca

Browse files
authored
Merge pull request #46 from maize-genetics/fix-maf-to-gvcf-efficiency
Fixing RAM issues with MAFToGVCF. This should reduce RAM requirements
2 parents 7f5a36b + 333d8de commit 01a14ca

File tree

2 files changed

+36
-18
lines changed

2 files changed

+36
-18
lines changed

src/main/kotlin/biokotlin/genome/MAFToGVCF.kt

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -114,18 +114,19 @@ class MAFToGVCF {
114114
val sampleName = variantsMap.keys.first()
115115

116116
// sort the variants by contig and position.
117-
val variants = variantsMap.values.first().sortedWith(VariantContextComparator(contigList))
117+
val variants = variantsMap.values.first().sortedBy { Position(it.chr,it.startPos) }
118118

119-
exportVariantContext(sampleName, variants, gvcfOutput, refSeqs)
119+
exportVariantContext(sampleName, variants, gvcfOutput, refSeqs, outJustGT, delAsSymbolic, maxDeletionSize)
120120
if (compressAndIndex) {
121121
// compress and index the file with bgzip and tabix.
122122
compressAndIndexFile(gvcfOutput)
123123
}
124124
} else if (variantsMap.size == 2) {
125125
val outputNames = twoOutputFiles(gvcfOutput)
126126
variantsMap.entries.forEachIndexed { index, (name, variants) ->
127-
val sortedVariants = variants.sortedWith(VariantContextComparator(contigList))
128-
val outputFile = exportVariantContext(name, sortedVariants, outputNames[index], refSeqs)
127+
val sortedVariants = variants.sortedBy { Position(it.chr,it.startPos) }
128+
129+
val outputFile = exportVariantContext(name, sortedVariants, outputNames[index], refSeqs, outJustGT, delAsSymbolic, maxDeletionSize)
129130
if (compressAndIndex) {
130131
compressAndIndexFile(outputNames[index])
131132
}
@@ -149,28 +150,28 @@ class MAFToGVCF {
149150
delAsSymbolic: Boolean = false,
150151
maxDeletionSize: Int = 0,
151152
anchorwaveLegacy: Boolean = false
152-
): Map<String, List<VariantContext>> {
153+
): Map<String, List<AssemblyVariantInfo>> {
153154

154155
val mafRecords = loadMAFRecords(mafFile)
155156
return if (twoGvcfs) {
156157
val splitGenomes = splitMafRecordsIntoTwoGenomes(mafRecords)
157158

158159
if (splitGenomes.size == 1) {
159160
println("getVariantContextsfromMAF:twoGvcfs is true but only 1 genome in the MAF file. Processing the single genome")
160-
mapOf(sampleName to buildVariantsForAllAlignments(sampleName, mafRecords, refSeqs, fillGaps,outJustGT,outputType, delAsSymbolic, maxDeletionSize, anchorwaveLegacy))
161+
mapOf(sampleName to buildVariantsForAllAlignments(mafRecords, refSeqs, fillGaps, outputType, anchorwaveLegacy))
161162
} else {
162-
splitGenomes.mapIndexed { index, mafrecs ->
163+
splitGenomes.mapIndexed { index, splitMafRecords ->
163164
//append _1 and _2 to the sampleName for the split genomes
164165

165166
val genomeName = "${sampleName}_${index + 1}"
166167
println("MAFToGVCF:getVariantContextsfromMAF: Splitting ${sampleName} into ${genomeName}")
167-
Pair(genomeName, buildVariantsForAllAlignments(genomeName, mafrecs, refSeqs, fillGaps, outJustGT, outputType, delAsSymbolic, maxDeletionSize, anchorwaveLegacy))
168+
Pair(genomeName, buildVariantsForAllAlignments(splitMafRecords, refSeqs, fillGaps, outputType, anchorwaveLegacy))
168169
}.toMap()
169170
}
170171

171172
} else {
172173
println("getVariantContextsfromMAF: Processing a single genome")
173-
mapOf(sampleName to buildVariantsForAllAlignments(sampleName, mafRecords, refSeqs, fillGaps, outJustGT, outputType, delAsSymbolic, maxDeletionSize, anchorwaveLegacy))
174+
mapOf(sampleName to buildVariantsForAllAlignments(mafRecords, refSeqs, fillGaps, outputType, anchorwaveLegacy))
174175
}
175176

176177
}
@@ -237,16 +238,12 @@ class MAFToGVCF {
237238
* Function to build the variants for all the alignments.
238239
*/
239240
fun buildVariantsForAllAlignments(
240-
sampleName: String,
241241
mafRecords: List<MAFRecord>,
242242
refGenomeSequence: Map<String, NucSeq>,
243243
fillGaps: Boolean,
244-
outJustGT: Boolean,
245244
outputType: OUTPUT_TYPE,
246-
delAsSymbolic: Boolean,
247-
maxDeletionSize: Int,
248245
anchorwaveLegacy: Boolean = false
249-
): List<VariantContext> {
246+
): List<AssemblyVariantInfo> {
250247
var variantInfos = mutableListOf<AssemblyVariantInfo>()
251248

252249
for (record in mafRecords) {
@@ -263,7 +260,7 @@ class MAFToGVCF {
263260
variantInfos = fillInMissingVariantBlocks(variantInfos, refGenomeSequence, true)
264261
}
265262

266-
return createVariantContextsFromInfo(sampleName, variantInfos, outJustGT, delAsSymbolic, maxDeletionSize)
263+
return variantInfos
267264
}
268265

269266
fun removeRefBlocks(variantInfos: MutableList<AssemblyVariantInfo>) : MutableList<AssemblyVariantInfo> {
@@ -1117,9 +1114,12 @@ class MAFToGVCF {
11171114
*/
11181115
fun exportVariantContext(
11191116
sampleName: String,
1120-
variantContexts: List<VariantContext>,
1117+
variantContexts: List<AssemblyVariantInfo>,
11211118
outputFileName: String,
1122-
refGenomeSequence: Map<String, NucSeq>
1119+
refGenomeSequence: Map<String, NucSeq>,
1120+
outputJustGT: Boolean,
1121+
delAsSymbolic: Boolean,
1122+
maxDeletionSize: Int
11231123
) {
11241124
val writer = VariantContextWriterBuilder()
11251125
.unsetOption(Options.INDEX_ON_THE_FLY)
@@ -1132,7 +1132,7 @@ class MAFToGVCF {
11321132
addSequenceDictionary(header, refGenomeSequence)
11331133
writer.writeHeader(header)
11341134
for (variant in variantContexts) {
1135-
writer.add(variant)
1135+
writer.add(convertVariantInfoToContext(sampleName, variant, outputJustGT, delAsSymbolic, maxDeletionSize))
11361136
}
11371137

11381138
writer.close()

src/test/kotlin/biokotlin/genome/MAFToGVCFTest.kt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,24 @@ class MAFToGVCFTest : StringSpec({
619619

620620
}
621621
}
622+
623+
"TestSort" {
624+
val chr1 = "1"
625+
val chr2 = "2"
626+
val chr10 = "10"
627+
628+
val asmInfos = listOf(AssemblyVariantInfo(chr1,10,20,"0","A","T",true, intArrayOf(10,20), "chr1", 10, 20, "+"),
629+
AssemblyVariantInfo(chr2,10,20,"0","A","T",true, intArrayOf(10,20), "chr2", 10, 20, "+"),
630+
AssemblyVariantInfo(chr10,10,20,"0","A","T",true, intArrayOf(10,20), "chr10", 10, 20, "+"),
631+
AssemblyVariantInfo(chr2,30,40,"0","A","T",true, intArrayOf(10,20), "chr2", 10, 20, "+"),
632+
AssemblyVariantInfo(chr1,1,9,"0","A","T",true, intArrayOf(10,20), "chr1", 10, 20, "+"),
633+
)
634+
635+
val expectedInfos = listOf(Pair(chr1,1), Pair(chr1,10), Pair(chr2,10), Pair(chr2,30), Pair(chr10,10))
636+
637+
val sorted = asmInfos.sortedBy { Position(it.chr,it.startPos) }
638+
sorted.map { Pair(it.chr,it.startPos) } shouldBe expectedInfos
639+
}
622640

623641
})
624642

0 commit comments

Comments
 (0)