Skip to content

Commit c94e9cb

Browse files
committed
Added in legacy option and fixing how we interprets anchorwave's outputs.
1 parent dd77d4e commit c94e9cb

File tree

3 files changed

+136
-30
lines changed

3 files changed

+136
-30
lines changed

src/main/kotlin/biokotlin/cli/MafToGvcfConverter.kt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ class MafToGvcfConverter : CliktCommand(help = "Create a GVCF file from a MAF fi
8282
.int()
8383
.default(0)
8484

85+
val anchorwaveLegacy by option(help = "Defaults to false. Enable this option if the MAF was created prior to Anchorwave version 1.2.3. Otherwise the ")
86+
.flag(default = false)
87+
8588
override fun run() {
8689
println("LCJ begin run: fillGaps = $fillGaps, twoGvcfs = $twoGvcfs, outJustGT = $outJustGT, compressAndIndex = $compressAndIndex, delAsSymbolic = $delAsSymbolic, outputType = $outputType")
8790
val outputType = try {
@@ -104,7 +107,8 @@ class MafToGvcfConverter : CliktCommand(help = "Create a GVCF file from a MAF fi
104107
outputType,
105108
compressAndIndex,
106109
delAsSymbolic,
107-
maxDeletionSize
110+
maxDeletionSize,
111+
anchorwaveLegacy
108112
)
109113

110114
}

src/main/kotlin/biokotlin/genome/MAFToGVCF.kt

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,12 @@ class MAFToGVCF {
9797
outputType: OUTPUT_TYPE = OUTPUT_TYPE.gvcf,
9898
compressAndIndex: Boolean = true, // if bgzip and bcftools are not on the system path, this will fail
9999
delAsSymbolic: Boolean = false, // if true, replace deletion records with symbolic alleles
100-
maxDeletionSize: Int = 0 // if delAsSymbolic is true, replace deletions longer than this size with symbolic alleles
100+
maxDeletionSize: Int = 0, // if delAsSymbolic is true, replace deletions longer than this size with symbolic alleles
101+
anchorwaveLegacy : Boolean = false
101102
) {
102103

103104
val refSeqs = fastaToNucSeq(referenceFile)
104-
val variantsMap = getVariantContextsfromMAF(mafFile, refSeqs, sampleName, fillGaps, twoGvcfs,outJustGT,outputType, delAsSymbolic, maxDeletionSize)
105+
val variantsMap = getVariantContextsfromMAF(mafFile, refSeqs, sampleName, fillGaps, twoGvcfs,outJustGT,outputType, delAsSymbolic, maxDeletionSize, anchorwaveLegacy)
105106
check(variantsMap.size == 1 || variantsMap.size == 2) {
106107
"Expected either 1 or 2 variant maps but there are ${variantsMap.size}"
107108
}
@@ -146,7 +147,8 @@ class MAFToGVCF {
146147
outJustGT: Boolean = false,
147148
outputType: OUTPUT_TYPE = OUTPUT_TYPE.gvcf,
148149
delAsSymbolic: Boolean = false,
149-
maxDeletionSize: Int = 0
150+
maxDeletionSize: Int = 0,
151+
anchorwaveLegacy: Boolean = false
150152
): Map<String, List<VariantContext>> {
151153

152154
val mafRecords = loadMAFRecords(mafFile)
@@ -155,20 +157,20 @@ class MAFToGVCF {
155157

156158
if (splitGenomes.size == 1) {
157159
println("getVariantContextsfromMAF:twoGvcfs is true but only 1 genome in the MAF file. Processing the single genome")
158-
mapOf(sampleName to buildVariantsForAllAlignments(sampleName, mafRecords, refSeqs, fillGaps,outJustGT,outputType, delAsSymbolic, maxDeletionSize))
160+
mapOf(sampleName to buildVariantsForAllAlignments(sampleName, mafRecords, refSeqs, fillGaps,outJustGT,outputType, delAsSymbolic, maxDeletionSize, anchorwaveLegacy))
159161
} else {
160162
splitGenomes.mapIndexed { index, mafrecs ->
161163
//append _1 and _2 to the sampleName for the split genomes
162164

163165
val genomeName = "${sampleName}_${index + 1}"
164166
println("MAFToGVCF:getVariantContextsfromMAF: Splitting ${sampleName} into ${genomeName}")
165-
Pair(genomeName, buildVariantsForAllAlignments(genomeName, mafrecs, refSeqs, fillGaps, outJustGT, outputType, delAsSymbolic, maxDeletionSize))
167+
Pair(genomeName, buildVariantsForAllAlignments(genomeName, mafrecs, refSeqs, fillGaps, outJustGT, outputType, delAsSymbolic, maxDeletionSize, anchorwaveLegacy))
166168
}.toMap()
167169
}
168170

169171
} else {
170172
println("getVariantContextsfromMAF: Processing a single genome")
171-
mapOf(sampleName to buildVariantsForAllAlignments(sampleName, mafRecords, refSeqs, fillGaps, outJustGT, outputType, delAsSymbolic, maxDeletionSize))
173+
mapOf(sampleName to buildVariantsForAllAlignments(sampleName, mafRecords, refSeqs, fillGaps, outJustGT, outputType, delAsSymbolic, maxDeletionSize, anchorwaveLegacy))
172174
}
173175

174176
}
@@ -242,12 +244,13 @@ class MAFToGVCF {
242244
outJustGT: Boolean,
243245
outputType: OUTPUT_TYPE,
244246
delAsSymbolic: Boolean,
245-
maxDeletionSize: Int
247+
maxDeletionSize: Int,
248+
anchorwaveLegacy: Boolean = false
246249
): List<VariantContext> {
247250
var variantInfos = mutableListOf<AssemblyVariantInfo>()
248251

249252
for (record in mafRecords) {
250-
variantInfos.addAll(buildTempVariants(refGenomeSequence, record))
253+
variantInfos.addAll(buildTempVariants(refGenomeSequence, record, anchorwaveLegacy))
251254
}
252255

253256
if (fillGaps && outputType == OUTPUT_TYPE.vcf ){
@@ -410,7 +413,7 @@ class MAFToGVCF {
410413
/**
411414
* Function to build the AssemblyVariantInfos found in the given Maf record.
412415
*/
413-
fun buildTempVariants(refSequence: Map<String, NucSeq>, mafRecord: MAFRecord): List<AssemblyVariantInfo> {
416+
fun buildTempVariants(refSequence: Map<String, NucSeq>, mafRecord: MAFRecord, anchorwaveLegacy: Boolean = false): List<AssemblyVariantInfo> {
414417
//Build a list of VariantInfos for each alignment state
415418
val chrom = mafRecord.refRecord.chromName
416419

@@ -425,7 +428,14 @@ class MAFToGVCF {
425428
check(asmStrand =="-" || asmStrand == "+") {"ASM Strand is not - or +."}
426429
var currentRefBp =
427430
mafRecord.refRecord.start //position in ref sequence. That is, alignment bp minus dashes for REF line
428-
var currentASMBp = if(asmStrand == "-") {mafRecord.altRecord.start + mafRecord.altRecord.size -1 }
431+
432+
//Here we need to check to see if we are on the negative strand and if we are using a file from the old anchorwave version.
433+
//If negative and using the old anchorwave, the actual assembly start position is the start from the file plus the size of the alt seq minus one
434+
//If just negative, the actual assembly start needs to be resolved as MAF has the reported start coming from the end of the chrom.
435+
//To compute take the contig size, subtract the reported start and add 1.
436+
//If the same file was run between both versions of anchorwave these ifs will create the same correct Assembly start position
437+
var currentASMBp = if(asmStrand == "-" && anchorwaveLegacy) {mafRecord.altRecord.start + mafRecord.altRecord.size -1 }
438+
else if (asmStrand == "-") {mafRecord.altRecord.chrSize - mafRecord.altRecord.start + 1}
429439
else {mafRecord.altRecord.start}//position in the alt sequence. That is alignment bp minus dashes for ASM line
430440

431441
val asmChrom = mafRecord.altRecord.chromName

src/test/kotlin/biokotlin/genome/MAFToGVCFTest.kt

Lines changed: 111 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@ class MAFToGVCFTest : StringSpec({
2121
val diploidRefFile = "${testingDir}/CML103DiploidTest.fa"
2222
val mafFile = "${testingDir}/B97.maf"
2323
val mafFileInverted = "${testingDir}/B97_inverted.maf"
24+
val mafFileInvertedLegacy = "${testingDir}/B97_inverted_Legacy.maf"
2425
val diploidMafFile = "${testingDir}/B97diploid.maf"
2526
val mafFileNs = "${testingDir}/mafWithNs.maf"
27+
val mafFileNsLegacy = "${testingDir}/mafWithNs_Legacy.maf"
2628

2729
val truthGVCFFile = "${testingDir}/B97_truth.gvcf"
2830
val truthGVCFFileInverted = "${testingDir}/B97_truth_inverted.gvcf"
@@ -84,6 +86,7 @@ class MAFToGVCFTest : StringSpec({
8486
createTruthGVCFFile(truthGVCFFile)
8587

8688
createInvertedMAFFile(mafFileInverted)
89+
createInvertedMAFFileLegacy(mafFileInvertedLegacy)
8790
createTruthInversionGVCFFile(truthGVCFFileInverted)
8891

8992
// Create the known overlapping GVCF truth file for 1st diploid
@@ -99,6 +102,7 @@ class MAFToGVCFTest : StringSpec({
99102
// Create ref, maf, and truth GVCF for Ns testing
100103
createRefWithN(refNFile)
101104
createNsMAFFile(mafFileNs)
105+
createNsMAFFileLegacy(mafFileNsLegacy)
102106
createTruthNs(truthGVCFFileNs)
103107

104108

@@ -313,8 +317,8 @@ class MAFToGVCFTest : StringSpec({
313317
}
314318
}
315319

316-
"TestInversion" {
317-
MAFToGVCF().createGVCFfromMAF(mafFileInverted,refFile,outputFileInverted,sampleName,fillGaps=false,compressAndIndex=false)
320+
"TestInversionLegacy" {
321+
MAFToGVCF().createGVCFfromMAF(mafFileInvertedLegacy,refFile,outputFileInverted,sampleName,fillGaps=false,compressAndIndex=false, anchorwaveLegacy = true)
318322

319323
val truthVariantIterator = VCFFileReader(File(truthGVCFFileInverted),false).iterator()
320324
val truthVariants = mutableListOf<VariantContext>()
@@ -625,6 +629,51 @@ class MAFToGVCFTest : StringSpec({
625629
}
626630
}
627631

632+
"testMAFsWithNsLegacy" {
633+
MAFToGVCF().createGVCFfromMAF(mafFileNsLegacy,refNFile,outputFileNs,sampleName,fillGaps=false,compressAndIndex=false, anchorwaveLegacy = true)
634+
635+
val truthVariantIterator = VCFFileReader(File(truthGVCFFileNs),false).iterator()
636+
val truthVariants = mutableListOf<VariantContext>()
637+
while(truthVariantIterator.hasNext()) {
638+
truthVariants.add(truthVariantIterator.next())
639+
}
640+
val truthMap = truthVariants.associateBy { Position(it.contig, it.start) }
641+
642+
val outputVariantIterator = VCFFileReader(File(outputFileNs), false).iterator()
643+
val outputVariants = mutableListOf<VariantContext>()
644+
while(outputVariantIterator.hasNext()) {
645+
outputVariants.add(outputVariantIterator.next())
646+
}
647+
648+
//mafBlocks.size shouldBe 4
649+
outputVariants.size shouldBe truthVariants.size
650+
651+
for(variant in outputVariants) {
652+
if(!truthMap.containsKey(Position(variant.contig, variant.start))) {
653+
fail("No matching variant found: ${variant.contig}:${variant.start}")
654+
}
655+
val matchingTruth = truthMap[Position(variant.contig, variant.start)]!!
656+
657+
//Check END
658+
variant.end shouldBe matchingTruth.end
659+
//Check alleles
660+
variant.alleles.toTypedArray() contentEquals matchingTruth.alleles.toTypedArray() shouldBe true
661+
//Check GT
662+
(matchingTruth.getGenotype(0).genotypeString == variant.getGenotype(0).genotypeString) shouldBe true
663+
//Check AD
664+
(matchingTruth.getGenotype(0).ad contentEquals variant.getGenotype(0).ad) shouldBe true
665+
//Check ASM Contig
666+
(matchingTruth.getAttribute("ASM_Chr") == variant.getAttribute("ASM_Chr")) shouldBe true
667+
//Check ASM Start
668+
(matchingTruth.getAttribute("ASM_Start") == variant.getAttribute("ASM_Start")) shouldBe true
669+
//Check ASM END
670+
(matchingTruth.getAttribute("ASM_End") == variant.getAttribute("ASM_End")) shouldBe true
671+
//Check ASM Strand
672+
(matchingTruth.getAttribute("ASM_Strand") == variant.getAttribute("ASM_Strand")) shouldBe true
673+
674+
}
675+
}
676+
628677
"testMAFsWithNs" {
629678
MAFToGVCF().createGVCFfromMAF(mafFileNs,refNFile,outputFileNs,sampleName,fillGaps=false,compressAndIndex=false)
630679

@@ -661,6 +710,7 @@ class MAFToGVCFTest : StringSpec({
661710
//Check ASM Contig
662711
(matchingTruth.getAttribute("ASM_Chr") == variant.getAttribute("ASM_Chr")) shouldBe true
663712
//Check ASM Start
713+
println("ASM_Start: ${matchingTruth.getAttribute("ASM_Start")} == ${variant.getAttribute("ASM_Start")}")
664714
(matchingTruth.getAttribute("ASM_Start") == variant.getAttribute("ASM_Start")) shouldBe true
665715
//Check ASM END
666716
(matchingTruth.getAttribute("ASM_End") == variant.getAttribute("ASM_End")) shouldBe true
@@ -781,6 +831,30 @@ fun createMAFFileWithEIQlines(outputFile: String) {
781831
}
782832
}
783833
fun createInvertedMAFFile(outputFile: String) {
834+
File(outputFile).bufferedWriter().use {output ->
835+
output.write("##maf version=1 scoring=Tba.v8\n\n")
836+
837+
output.write("a\tscore=23262.0\n")
838+
output.write("s\tchr7\t12\t38\t+\t158545518\tAAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG\n")
839+
output.write("s\tchr4\t81344243\t40\t+\t187371129\t-AA-GGGGATGCTAAGCCAATGAGTTGTTGTCTCTCAATGTG\n\n")
840+
841+
output.write("a\tscore=5062.0\n")
842+
output.write("s\tchr7\t450\t6\t+\t158545518\tTAAAGAT---GGGT\n")
843+
output.write("s\tchr4\t81444246\t6\t+\t 187371129\tTAAGGATCCC---T\n\n")
844+
845+
output.write("a\tscore=6636.0\n")
846+
output.write("s\tchr1\t0\t40\t+\t 158545518\t-----GCAGCTGAAAACAGTCAATCTTACACACTTGGGGCCTACT\n")
847+
output.write("s\tchr6\t97794583\t45\t - 151104725\tAAAAAGACAGCTGAAAATATCAATCTTACACACTTGGGGCCTACT\n\n")
848+
849+
// we need a chr10 in here to test sorting the maf records
850+
output.write("a\tscore=6636.0\n")
851+
output.write("s\tchr10\t0\t40\t+\t 158545518\t-----GCAGCTGAAAACAGTCAATCTTACACACTTGGGGCCTACT\n")
852+
output.write("s\tchr6\t53310097\t45\t + 151104725\tAAAAAGACAGCTGAAAATATCAATCTTACACACTTGGGGCCTACT\n\n")
853+
854+
}
855+
}
856+
857+
fun createInvertedMAFFileLegacy(outputFile: String) {
784858
File(outputFile).bufferedWriter().use {output ->
785859
output.write("##maf version=1 scoring=Tba.v8\n\n")
786860

@@ -850,28 +924,46 @@ fun createMAFFileWithEIQlines(outputFile: String) {
850924
}
851925
}
852926

853-
/**
854-
* Function to create a MAF file used for testing. This covers alignments with lots of Ns
855-
*
856-
*/
857-
fun createNsMAFFile(outputFile: String) {
858-
File(outputFile).bufferedWriter().use {output ->
859-
output.write("##maf version 1\n\n")
927+
/**
928+
* Function to create a MAF file used for testing. This covers alignments with lots of Ns
929+
*
930+
*/
931+
fun createNsMAFFile(outputFile: String) {
932+
File(outputFile).bufferedWriter().use {output ->
933+
output.write("##maf version 1\n\n")
860934

861-
output.write("a\tscore=12\n")
862-
output.write("s\tChr01\t0\t129\t+\t129\tTGTCGACTCAGCTC---CACACTCG---ACTCCNCTACGCATCACNCNNNCCTACTCTACACACTCCACCACACA--CTCTCGT----CGTACGTGCG---CGTAGAG--CGA--GATCGACTACCC--ATCAG--GGCTCAGCTG------AGC------TCG\n")
863-
output.write("s\tChr01\t0\t156\t+\t184\tTGTCNNNTCACNNNGTACTCCACACGAANNNTCNCTNCGCATCACNCNNNNNNACTCTAC--NNNN----ACACAAANNNTCNNATAACGTACGTANNNNNGGTAGAGTTNNNNNGATCG--NNNNNNNATCAANNNNNNGAGCTGTCNNNNAGNNNNNATTCG\n\n")
935+
output.write("a\tscore=12\n")
936+
output.write("s\tChr01\t0\t129\t+\t129\tTGTCGACTCAGCTC---CACACTCG---ACTCCNCTACGCATCACNCNNNCCTACTCTACACACTCCACCACACA--CTCTCGT----CGTACGTGCG---CGTAGAG--CGA--GATCGACTACCC--ATCAG--GGCTCAGCTG------AGC------TCG\n")
937+
output.write("s\tChr01\t0\t156\t+\t184\tTGTCNNNTCACNNNGTACTCCACACGAANNNTCNCTNCGCATCACNCNNNNNNACTCTAC--NNNN----ACACAAANNNTCNNATAACGTACGTANNNNNGGTAGAGTTNNNNNGATCG--NNNNNNNATCAANNNNNNGAGCTGTCNNNNAGNNNNNATTCG\n\n")
864938

865-
output.write("a\tscore=12\n")
866-
output.write("s\tChr02\t2\t20\t+\t113\tNNNGCTAGCTAGCTCA-------GCGC\n")
867-
output.write("s\tChr02\t10\t25\t+\t160\tNNNNN--GCTAGCTNNNNNNTATGCGC\n\n")
939+
output.write("a\tscore=12\n")
940+
output.write("s\tChr02\t2\t20\t+\t113\tNNNGCTAGCTAGCTCA-------GCGC\n")
941+
output.write("s\tChr02\t10\t25\t+\t160\tNNNNN--GCTAGCTNNNNNNTATGCGC\n\n")
868942

869-
output.write("a\tscore=12\n")
870-
output.write("s\tChr02\t22\t66\t+\t113\tACACCTGTGTGCAGCTGCTTACGGGGCGCGCCCCATCTCGCGGGGCTCATGCGAACCNNNCGCATG\n")
871-
output.write("s\tChr02\t100\t58\t-\t160\tACA--NNNNTGCAAC--NNNNGGNNNNGNNNN--TTCTCGCGGNNNN--NNNNGNCCNNNCNNNNN\n\n")
943+
output.write("a\tscore=12\n")
944+
output.write("s\tChr02\t22\t66\t+\t113\tACACCTGTGTGCAGCTGCTTACGGGGCGCGCCCCATCTCGCGGGGCTCATGCGAACCNNNCGCATG\n")
945+
output.write("s\tChr02\t2\t58\t-\t160\tACA--NNNNTGCAAC--NNNNGGNNNNGNNNN--TTCTCGCGGNNNN--NNNNGNCCNNNCNNNNN\n\n")
872946

947+
}
948+
}
949+
fun createNsMAFFileLegacy(outputFile: String) {
950+
File(outputFile).bufferedWriter().use {output ->
951+
output.write("##maf version 1\n\n")
952+
953+
output.write("a\tscore=12\n")
954+
output.write("s\tChr01\t0\t129\t+\t129\tTGTCGACTCAGCTC---CACACTCG---ACTCCNCTACGCATCACNCNNNCCTACTCTACACACTCCACCACACA--CTCTCGT----CGTACGTGCG---CGTAGAG--CGA--GATCGACTACCC--ATCAG--GGCTCAGCTG------AGC------TCG\n")
955+
output.write("s\tChr01\t0\t156\t+\t184\tTGTCNNNTCACNNNGTACTCCACACGAANNNTCNCTNCGCATCACNCNNNNNNACTCTAC--NNNN----ACACAAANNNTCNNATAACGTACGTANNNNNGGTAGAGTTNNNNNGATCG--NNNNNNNATCAANNNNNNGAGCTGTCNNNNAGNNNNNATTCG\n\n")
956+
957+
output.write("a\tscore=12\n")
958+
output.write("s\tChr02\t2\t20\t+\t113\tNNNGCTAGCTAGCTCA-------GCGC\n")
959+
output.write("s\tChr02\t10\t25\t+\t160\tNNNNN--GCTAGCTNNNNNNTATGCGC\n\n")
960+
961+
output.write("a\tscore=12\n")
962+
output.write("s\tChr02\t22\t66\t+\t113\tACACCTGTGTGCAGCTGCTTACGGGGCGCGCCCCATCTCGCGGGGCTCATGCGAACCNNNCGCATG\n")
963+
output.write("s\tChr02\t100\t58\t-\t160\tACA--NNNNTGCAAC--NNNNGGNNNNGNNNN--TTCTCGCGGNNNN--NNNNGNCCNNNCNNNNN\n\n")
964+
965+
}
873966
}
874-
}
875967

876968
/**
877969
* Simple function to create a simple MAF file used for testing. This covers most of the edge cases we have run into.

0 commit comments

Comments
 (0)