Skip to content

Commit 7ef6fdc

Browse files
authored
Merge pull request #31 from maize-genetics/mutate-proteins
Mutate proteins
2 parents 364b278 + 4ee1389 commit 7ef6fdc

8 files changed

+43706
-1
lines changed

data/test/mutate-proteins/Zm-B73-REFERENCE-NAM-5.0_CorePeptide_random1K.fa

Lines changed: 8771 additions & 0 deletions
Large diffs are not rendered by default.

data/test/mutate-proteins/Zm-B73-REFERENCE-NAM-5.0_CorePeptide_random1K_delete-mutation.fa

Lines changed: 8726 additions & 0 deletions
Large diffs are not rendered by default.

data/test/mutate-proteins/Zm-B73-REFERENCE-NAM-5.0_CorePeptide_random1K_insert-mutation.fa

Lines changed: 8905 additions & 0 deletions
Large diffs are not rendered by default.

data/test/mutate-proteins/Zm-B73-REFERENCE-NAM-5.0_CorePeptide_random1K_point-mutation.fa

Lines changed: 8771 additions & 0 deletions
Large diffs are not rendered by default.

data/test/mutate-proteins/Zm-B73-V5_pfam_scan_coreRepTranscriptB73_random1K.bed

Lines changed: 7973 additions & 0 deletions
Large diffs are not rendered by default.

src/main/kotlin/biokotlin/cli/Biokotlin.kt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ fun main(args: Array<String>) = Biokotlin()
2323
MafToGvcfConverter(),
2424
ValidateGVCFs(),
2525
MergeGVCFs(),
26-
ValidateVCFs()
26+
ValidateVCFs(),
27+
MutateProteins()
2728
)
2829
.main(args)
Lines changed: 380 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,380 @@
1+
package biokotlin.cli
2+
3+
import biokotlin.seq.ProteinSeqRecord
4+
import biokotlin.seqIO.FastaIO
5+
import biokotlin.seqIO.SeqType
6+
import biokotlin.util.bufferedReader
7+
import biokotlin.util.bufferedWriter
8+
import com.github.ajalt.clikt.core.CliktCommand
9+
import com.github.ajalt.clikt.parameters.options.default
10+
import com.github.ajalt.clikt.parameters.options.option
11+
import com.github.ajalt.clikt.parameters.options.required
12+
import com.github.ajalt.clikt.parameters.types.boolean
13+
import com.github.ajalt.clikt.parameters.types.enum
14+
import com.github.ajalt.clikt.parameters.types.int
15+
import com.google.common.collect.HashMultimap
16+
import com.google.common.collect.Multimap
17+
import org.apache.logging.log4j.LogManager
18+
import java.io.BufferedWriter
19+
import kotlin.random.Random
20+
21+
/**
22+
* This command randomly mutates the protein sequences in a fasta file.
23+
* The mutations can be deletions, point mutations, or insertions.
24+
* The output is written to a new fasta file.
25+
* The mutations can be put in or out of ranges defined by a bedfile.
26+
*/
27+
class MutateProteins : CliktCommand(help = "Mutate Proteins") {
28+
29+
private val myLogger = LogManager.getLogger(MutateProteins::class.java)
30+
31+
enum class TypeMutation {
32+
DELETION, POINT_MUTATION, INSERTION
33+
}
34+
35+
private val proteinLetters = arrayOf(
36+
'R', 'H', 'K', 'D', 'E', 'S', 'T', 'N', 'Q', 'C', 'G', 'P', 'A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W'
37+
)
38+
39+
private val numProteinLetters = proteinLetters.size
40+
41+
val inputFasta by option(help = "Full path to input fasta file")
42+
.required()
43+
44+
val outputFasta by option(help = "Full path to output fasta file")
45+
.required()
46+
47+
val bedfile by option(help = "Full path to bedfile")
48+
.required()
49+
50+
val typeMutation by option(help = "Type of mutation")
51+
.enum<TypeMutation>()
52+
.default(TypeMutation.POINT_MUTATION)
53+
54+
val putMutationsInRanges by option(help = "Put mutations in ranges defined by bedfile")
55+
.boolean()
56+
.default(true)
57+
58+
val length by option(help = "Length of the deletion mutation")
59+
.int()
60+
.default(5)
61+
62+
val numMutations by option(help = "Number of point mutations")
63+
.int()
64+
.default(10)
65+
66+
val mutatedIndicesBedfile by option(help = "Full path to bedfile to output mutated indices")
67+
68+
val randomSeed by option(help = "Random seed")
69+
.int()
70+
.default(1234)
71+
72+
override fun run() {
73+
74+
val ranges = readBedfile()
75+
76+
val reader = FastaIO(inputFasta, SeqType.protein)
77+
78+
val mutatedIndicesWriter = if (mutatedIndicesBedfile != null) bufferedWriter(mutatedIndicesBedfile!!) else null
79+
80+
bufferedWriter(outputFasta).use { writer ->
81+
82+
val ids = mutableSetOf<String>()
83+
var count = 0
84+
while (reader.hasNext()) {
85+
86+
val record = reader.next() as ProteinSeqRecord
87+
val id = record.id
88+
ids.add(id)
89+
count++
90+
91+
val mutatedSeq = when (typeMutation) {
92+
TypeMutation.DELETION -> deletionMutation(record, ranges)
93+
TypeMutation.POINT_MUTATION -> pointMutations(record, ranges, mutatedIndicesWriter)
94+
TypeMutation.INSERTION -> insertionMutation(record, ranges)
95+
}
96+
97+
writeSeq(writer, id, mutatedSeq)
98+
99+
}
100+
101+
mutatedIndicesWriter?.close()
102+
103+
if (count != ids.size) {
104+
myLogger.warn("Duplicate IDs found")
105+
}
106+
107+
}
108+
109+
}
110+
111+
/**
112+
* Write sequence to writer. The sequence is written in fasta format.
113+
*/
114+
private fun writeSeq(writer: BufferedWriter, id: String, mutatedSeq: String) {
115+
writer.write(">${id}\n")
116+
mutatedSeq
117+
.chunked(60)
118+
.forEach { line ->
119+
writer.write(line)
120+
writer.newLine()
121+
}
122+
}
123+
124+
// zero based inclusive / inclusive
125+
private data class BedfileRecord(val contig: String, val start: Int, val end: Int) {
126+
127+
fun contains(index: Int): Boolean {
128+
return index in start..end
129+
}
130+
131+
fun length(): Int {
132+
return end - start + 1
133+
}
134+
135+
}
136+
137+
/**
138+
* Read bedfile into a multimap. The key is the contig
139+
* and the value is a list of BedfileRecord.
140+
*/
141+
private fun readBedfile(): Multimap<String, BedfileRecord> {
142+
143+
val result: Multimap<String, BedfileRecord> = HashMultimap.create()
144+
145+
bufferedReader(bedfile).useLines { lines ->
146+
lines.forEach { line ->
147+
val splitLine = line.split("\t")
148+
val contig = splitLine[0]
149+
// bedfile is zero based inclusive / exclusive
150+
// convert to zero based inclusive / inclusive
151+
// keeping on zero base since fasta file is zero based
152+
val record = BedfileRecord(contig, splitLine[1].toInt(), splitLine[2].toInt() - 1)
153+
result.put(contig, record)
154+
}
155+
}
156+
157+
return result
158+
159+
}
160+
161+
// Validate ranges and merges overlapping ranges
162+
private fun validateAndMergeRanges(seqLength: Int, ranges: Collection<BedfileRecord>): Collection<BedfileRecord> {
163+
164+
var contig: String? = null
165+
166+
ranges
167+
.forEach { range ->
168+
if (contig == null) {
169+
contig = range.contig
170+
} else {
171+
require(contig == range.contig) { "All ranges must have the same contig" }
172+
}
173+
require(range.start >= 0) { "Start position must be greater than or equal to 0" }
174+
require(range.end >= range.start) { "End position must be greater than or equal to start position" }
175+
require(range.end < seqLength) { "End position must be less than the length of the sequence" }
176+
}
177+
178+
val result = mutableListOf<BedfileRecord>()
179+
180+
ranges
181+
.sortedBy { it.start }
182+
.forEach { range ->
183+
if (result.isEmpty()) {
184+
result.add(range)
185+
} else {
186+
val last = result.last()
187+
if (last.end >= range.start) {
188+
result[result.size - 1] = BedfileRecord(last.contig, last.start, range.end)
189+
} else {
190+
result.add(range)
191+
}
192+
}
193+
}
194+
195+
return result
196+
197+
}
198+
199+
// determine if index is in any of the ranges
200+
private fun inRanges(index: Int, ranges: Collection<BedfileRecord>): Boolean {
201+
return ranges.any { it.contains(index) }
202+
}
203+
204+
// All ranges should have the same contig
205+
// Ranges should not overlap
206+
private fun inverseRanges(
207+
contig: String,
208+
seqLength: Int,
209+
ranges: Collection<BedfileRecord>
210+
): Collection<BedfileRecord> {
211+
212+
if (ranges.isEmpty()) {
213+
return listOf(BedfileRecord(contig, 0, seqLength - 1))
214+
}
215+
216+
val result = mutableListOf<BedfileRecord>()
217+
218+
var start = 0
219+
ranges
220+
.sortedBy { it.start }
221+
.forEach { range ->
222+
if (range.start > start) {
223+
result.add(BedfileRecord(contig, start, range.start - 1))
224+
}
225+
start = range.end + 1
226+
}
227+
if (start < seqLength) {
228+
result.add(BedfileRecord(contig, start, seqLength - 1))
229+
}
230+
231+
return result
232+
233+
}
234+
235+
/**
236+
* This deletes bases (number specified by parameter length) from each sequence.
237+
* The bases are continuous. It makes the deletion in or out of
238+
* the ranges defined by the bedfile depending on the setting of putMutationsInRanges.
239+
*/
240+
private fun deletionMutation(
241+
record: ProteinSeqRecord, ranges: Multimap<String, BedfileRecord>
242+
): String {
243+
244+
val origSeq = record.seq()
245+
val seqLength = origSeq.length
246+
247+
val contig = record.id
248+
val rangesForRecord = ranges.get(contig)
249+
250+
val mergedRanges = validateAndMergeRanges(seqLength, rangesForRecord)
251+
252+
val possibleMutationRanges =
253+
if (putMutationsInRanges) mergedRanges else inverseRanges(contig, seqLength, mergedRanges)
254+
255+
val random = Random(randomSeed)
256+
257+
// get possible ranges to mutate
258+
// that are long enough to delete specified
259+
// length. Then randomly select one of these ranges
260+
val rangeToMutate = possibleMutationRanges
261+
.filter { range -> range.length() >= length }
262+
.randomOrNull(random)
263+
264+
val result = StringBuilder(origSeq)
265+
266+
if (rangeToMutate != null) {
267+
// get random start index of place to delete
268+
// within rangeToMutate
269+
// The +1 adds one to the end to make it inclusive for the
270+
// mutation range. And another +1 because nextInt() has exclusive
271+
// end position
272+
val start = random.nextInt(rangeToMutate.start, rangeToMutate.end - length + 2)
273+
274+
// delete length bases
275+
result.delete(start, start + length)
276+
}
277+
278+
return result.toString()
279+
280+
}
281+
282+
/**
283+
* This inserts bases (number specified by parameter length) into each sequence.
284+
* The bases are continuous. It makes the insertion in or out of
285+
* the ranges defined by the bedfile depending on the setting of putMutationsInRanges.
286+
*/
287+
private fun insertionMutation(
288+
record: ProteinSeqRecord, ranges: Multimap<String, BedfileRecord>
289+
): String {
290+
291+
val origSeq = record.seq()
292+
val seqLength = origSeq.length
293+
294+
val contig = record.id
295+
val rangesForRecord = ranges.get(contig)
296+
297+
val mergedRanges = validateAndMergeRanges(seqLength, rangesForRecord)
298+
299+
val possibleMutationRanges =
300+
if (putMutationsInRanges) mergedRanges else inverseRanges(contig, seqLength, mergedRanges)
301+
302+
val random = Random(randomSeed)
303+
304+
// get possible ranges to mutate
305+
// Then randomly select one of these ranges
306+
val rangeToMutate = possibleMutationRanges
307+
.randomOrNull(random)
308+
309+
val result = StringBuilder(origSeq)
310+
311+
if (rangeToMutate != null) {
312+
// get random start index of place to insert
313+
// within rangeToMutate
314+
val start = random.nextInt(rangeToMutate.start, rangeToMutate.end + 1)
315+
316+
(0 until length).forEach { _ ->
317+
val base = proteinLetters.random(random)
318+
result.insert(start, base)
319+
}
320+
}
321+
322+
return result.toString()
323+
324+
}
325+
326+
/**
327+
* This changes random bases in the sequence to another random value.
328+
* It changes upto numMutations bases. It puts the mutations in or out of
329+
* the ranges defined by the bedfile depending on the setting of putMutationsInRanges.
330+
*/
331+
private fun pointMutations(
332+
record: ProteinSeqRecord,
333+
ranges: Multimap<String, BedfileRecord>,
334+
mutatedIndicesWriter: BufferedWriter? = null
335+
): String {
336+
337+
val origSeq = record.seq()
338+
val seqLength = origSeq.length
339+
340+
val contig = record.id
341+
val rangesForRecord = ranges.get(contig)
342+
343+
val mergedRanges = validateAndMergeRanges(seqLength, rangesForRecord)
344+
345+
val possibleMutateIndices = if (putMutationsInRanges) {
346+
origSeq.indices.filter { index -> inRanges(index, mergedRanges) }
347+
} else {
348+
origSeq.indices.filter { index -> !inRanges(index, mergedRanges) }
349+
}.toMutableList()
350+
351+
val random = Random(randomSeed)
352+
353+
val mutatedIndices = mutableSetOf<Int>()
354+
var numMutated = 0
355+
while (numMutated < numMutations && possibleMutateIndices.isNotEmpty()) {
356+
val currentIndex = possibleMutateIndices.random(random)
357+
possibleMutateIndices.remove(currentIndex)
358+
mutatedIndices.add(currentIndex)
359+
numMutated++
360+
}
361+
362+
val result = StringBuilder(origSeq)
363+
364+
mutatedIndices.sorted().forEach { index ->
365+
val origBase = result[index]
366+
var mutatedBase = proteinLetters[random.nextInt(numProteinLetters)]
367+
while (origBase == mutatedBase) {
368+
mutatedBase = proteinLetters[random.nextInt(numProteinLetters)]
369+
}
370+
result[index] = mutatedBase
371+
mutatedIndicesWriter?.write("$contig\t$index\t${index + 1}\n")
372+
}
373+
374+
myLogger.info("pointMutations: contig: $contig numMutationsRequested: $numMutations numMutations: $numMutated mutatedIndices: $mutatedIndices")
375+
376+
return result.toString()
377+
378+
}
379+
380+
}

0 commit comments

Comments
 (0)