1
+ package biokotlin.cli
2
+
3
+ import biokotlin.seq.ProteinSeqRecord
4
+ import biokotlin.seqIO.FastaIO
5
+ import biokotlin.seqIO.SeqType
6
+ import biokotlin.util.bufferedReader
7
+ import biokotlin.util.bufferedWriter
8
+ import com.github.ajalt.clikt.core.CliktCommand
9
+ import com.github.ajalt.clikt.parameters.options.default
10
+ import com.github.ajalt.clikt.parameters.options.option
11
+ import com.github.ajalt.clikt.parameters.options.required
12
+ import com.github.ajalt.clikt.parameters.types.boolean
13
+ import com.github.ajalt.clikt.parameters.types.enum
14
+ import com.github.ajalt.clikt.parameters.types.int
15
+ import com.google.common.collect.HashMultimap
16
+ import com.google.common.collect.Multimap
17
+ import org.apache.logging.log4j.LogManager
18
+ import java.io.BufferedWriter
19
+ import kotlin.random.Random
20
+
21
+ /* *
22
+ * This command randomly mutates the protein sequences in a fasta file.
23
+ * The mutations can be deletions, point mutations, or insertions.
24
+ * The output is written to a new fasta file.
25
+ * The mutations can be put in or out of ranges defined by a bedfile.
26
+ */
27
+ class MutateProteins : CliktCommand (help = " Mutate Proteins" ) {
28
+
29
+ private val myLogger = LogManager .getLogger(MutateProteins ::class .java)
30
+
31
+ enum class TypeMutation {
32
+ DELETION , POINT_MUTATION , INSERTION
33
+ }
34
+
35
+ private val proteinLetters = arrayOf(
36
+ ' R' , ' H' , ' K' , ' D' , ' E' , ' S' , ' T' , ' N' , ' Q' , ' C' , ' G' , ' P' , ' A' , ' V' , ' I' , ' L' , ' M' , ' F' , ' Y' , ' W'
37
+ )
38
+
39
+ private val numProteinLetters = proteinLetters.size
40
+
41
+ val inputFasta by option(help = " Full path to input fasta file" )
42
+ .required()
43
+
44
+ val outputFasta by option(help = " Full path to output fasta file" )
45
+ .required()
46
+
47
+ val bedfile by option(help = " Full path to bedfile" )
48
+ .required()
49
+
50
+ val typeMutation by option(help = " Type of mutation" )
51
+ .enum<TypeMutation >()
52
+ .default(TypeMutation .POINT_MUTATION )
53
+
54
+ val putMutationsInRanges by option(help = " Put mutations in ranges defined by bedfile" )
55
+ .boolean()
56
+ .default(true )
57
+
58
+ val length by option(help = " Length of the deletion mutation" )
59
+ .int()
60
+ .default(5 )
61
+
62
+ val numMutations by option(help = " Number of point mutations" )
63
+ .int()
64
+ .default(10 )
65
+
66
+ val mutatedIndicesBedfile by option(help = " Full path to bedfile to output mutated indices" )
67
+
68
+ val randomSeed by option(help = " Random seed" )
69
+ .int()
70
+ .default(1234 )
71
+
72
+ override fun run () {
73
+
74
+ val ranges = readBedfile()
75
+
76
+ val reader = FastaIO (inputFasta, SeqType .protein)
77
+
78
+ val mutatedIndicesWriter = if (mutatedIndicesBedfile != null ) bufferedWriter(mutatedIndicesBedfile!! ) else null
79
+
80
+ bufferedWriter(outputFasta).use { writer ->
81
+
82
+ val ids = mutableSetOf<String >()
83
+ var count = 0
84
+ while (reader.hasNext()) {
85
+
86
+ val record = reader.next() as ProteinSeqRecord
87
+ val id = record.id
88
+ ids.add(id)
89
+ count++
90
+
91
+ val mutatedSeq = when (typeMutation) {
92
+ TypeMutation .DELETION -> deletionMutation(record, ranges)
93
+ TypeMutation .POINT_MUTATION -> pointMutations(record, ranges, mutatedIndicesWriter)
94
+ TypeMutation .INSERTION -> insertionMutation(record, ranges)
95
+ }
96
+
97
+ writeSeq(writer, id, mutatedSeq)
98
+
99
+ }
100
+
101
+ mutatedIndicesWriter?.close()
102
+
103
+ if (count != ids.size) {
104
+ myLogger.warn(" Duplicate IDs found" )
105
+ }
106
+
107
+ }
108
+
109
+ }
110
+
111
+ /* *
112
+ * Write sequence to writer. The sequence is written in fasta format.
113
+ */
114
+ private fun writeSeq (writer : BufferedWriter , id : String , mutatedSeq : String ) {
115
+ writer.write(" >${id} \n " )
116
+ mutatedSeq
117
+ .chunked(60 )
118
+ .forEach { line ->
119
+ writer.write(line)
120
+ writer.newLine()
121
+ }
122
+ }
123
+
124
+ // zero based inclusive / inclusive
125
+ private data class BedfileRecord (val contig : String , val start : Int , val end : Int ) {
126
+
127
+ fun contains (index : Int ): Boolean {
128
+ return index in start.. end
129
+ }
130
+
131
+ fun length (): Int {
132
+ return end - start + 1
133
+ }
134
+
135
+ }
136
+
137
+ /* *
138
+ * Read bedfile into a multimap. The key is the contig
139
+ * and the value is a list of BedfileRecord.
140
+ */
141
+ private fun readBedfile (): Multimap <String , BedfileRecord > {
142
+
143
+ val result: Multimap <String , BedfileRecord > = HashMultimap .create()
144
+
145
+ bufferedReader(bedfile).useLines { lines ->
146
+ lines.forEach { line ->
147
+ val splitLine = line.split(" \t " )
148
+ val contig = splitLine[0 ]
149
+ // bedfile is zero based inclusive / exclusive
150
+ // convert to zero based inclusive / inclusive
151
+ // keeping on zero base since fasta file is zero based
152
+ val record = BedfileRecord (contig, splitLine[1 ].toInt(), splitLine[2 ].toInt() - 1 )
153
+ result.put(contig, record)
154
+ }
155
+ }
156
+
157
+ return result
158
+
159
+ }
160
+
161
+ // Validate ranges and merges overlapping ranges
162
+ private fun validateAndMergeRanges (seqLength : Int , ranges : Collection <BedfileRecord >): Collection <BedfileRecord > {
163
+
164
+ var contig: String? = null
165
+
166
+ ranges
167
+ .forEach { range ->
168
+ if (contig == null ) {
169
+ contig = range.contig
170
+ } else {
171
+ require(contig == range.contig) { " All ranges must have the same contig" }
172
+ }
173
+ require(range.start >= 0 ) { " Start position must be greater than or equal to 0" }
174
+ require(range.end >= range.start) { " End position must be greater than or equal to start position" }
175
+ require(range.end < seqLength) { " End position must be less than the length of the sequence" }
176
+ }
177
+
178
+ val result = mutableListOf<BedfileRecord >()
179
+
180
+ ranges
181
+ .sortedBy { it.start }
182
+ .forEach { range ->
183
+ if (result.isEmpty()) {
184
+ result.add(range)
185
+ } else {
186
+ val last = result.last()
187
+ if (last.end >= range.start) {
188
+ result[result.size - 1 ] = BedfileRecord (last.contig, last.start, range.end)
189
+ } else {
190
+ result.add(range)
191
+ }
192
+ }
193
+ }
194
+
195
+ return result
196
+
197
+ }
198
+
199
+ // determine if index is in any of the ranges
200
+ private fun inRanges (index : Int , ranges : Collection <BedfileRecord >): Boolean {
201
+ return ranges.any { it.contains(index) }
202
+ }
203
+
204
+ // All ranges should have the same contig
205
+ // Ranges should not overlap
206
+ private fun inverseRanges (
207
+ contig : String ,
208
+ seqLength : Int ,
209
+ ranges : Collection <BedfileRecord >
210
+ ): Collection <BedfileRecord > {
211
+
212
+ if (ranges.isEmpty()) {
213
+ return listOf (BedfileRecord (contig, 0 , seqLength - 1 ))
214
+ }
215
+
216
+ val result = mutableListOf<BedfileRecord >()
217
+
218
+ var start = 0
219
+ ranges
220
+ .sortedBy { it.start }
221
+ .forEach { range ->
222
+ if (range.start > start) {
223
+ result.add(BedfileRecord (contig, start, range.start - 1 ))
224
+ }
225
+ start = range.end + 1
226
+ }
227
+ if (start < seqLength) {
228
+ result.add(BedfileRecord (contig, start, seqLength - 1 ))
229
+ }
230
+
231
+ return result
232
+
233
+ }
234
+
235
+ /* *
236
+ * This deletes bases (number specified by parameter length) from each sequence.
237
+ * The bases are continuous. It makes the deletion in or out of
238
+ * the ranges defined by the bedfile depending on the setting of putMutationsInRanges.
239
+ */
240
+ private fun deletionMutation (
241
+ record : ProteinSeqRecord , ranges : Multimap <String , BedfileRecord >
242
+ ): String {
243
+
244
+ val origSeq = record.seq()
245
+ val seqLength = origSeq.length
246
+
247
+ val contig = record.id
248
+ val rangesForRecord = ranges.get(contig)
249
+
250
+ val mergedRanges = validateAndMergeRanges(seqLength, rangesForRecord)
251
+
252
+ val possibleMutationRanges =
253
+ if (putMutationsInRanges) mergedRanges else inverseRanges(contig, seqLength, mergedRanges)
254
+
255
+ val random = Random (randomSeed)
256
+
257
+ // get possible ranges to mutate
258
+ // that are long enough to delete specified
259
+ // length. Then randomly select one of these ranges
260
+ val rangeToMutate = possibleMutationRanges
261
+ .filter { range -> range.length() >= length }
262
+ .randomOrNull(random)
263
+
264
+ val result = StringBuilder (origSeq)
265
+
266
+ if (rangeToMutate != null ) {
267
+ // get random start index of place to delete
268
+ // within rangeToMutate
269
+ // The +1 adds one to the end to make it inclusive for the
270
+ // mutation range. And another +1 because nextInt() has exclusive
271
+ // end position
272
+ val start = random.nextInt(rangeToMutate.start, rangeToMutate.end - length + 2 )
273
+
274
+ // delete length bases
275
+ result.delete(start, start + length)
276
+ }
277
+
278
+ return result.toString()
279
+
280
+ }
281
+
282
+ /* *
283
+ * This inserts bases (number specified by parameter length) into each sequence.
284
+ * The bases are continuous. It makes the insertion in or out of
285
+ * the ranges defined by the bedfile depending on the setting of putMutationsInRanges.
286
+ */
287
+ private fun insertionMutation (
288
+ record : ProteinSeqRecord , ranges : Multimap <String , BedfileRecord >
289
+ ): String {
290
+
291
+ val origSeq = record.seq()
292
+ val seqLength = origSeq.length
293
+
294
+ val contig = record.id
295
+ val rangesForRecord = ranges.get(contig)
296
+
297
+ val mergedRanges = validateAndMergeRanges(seqLength, rangesForRecord)
298
+
299
+ val possibleMutationRanges =
300
+ if (putMutationsInRanges) mergedRanges else inverseRanges(contig, seqLength, mergedRanges)
301
+
302
+ val random = Random (randomSeed)
303
+
304
+ // get possible ranges to mutate
305
+ // Then randomly select one of these ranges
306
+ val rangeToMutate = possibleMutationRanges
307
+ .randomOrNull(random)
308
+
309
+ val result = StringBuilder (origSeq)
310
+
311
+ if (rangeToMutate != null ) {
312
+ // get random start index of place to insert
313
+ // within rangeToMutate
314
+ val start = random.nextInt(rangeToMutate.start, rangeToMutate.end + 1 )
315
+
316
+ (0 until length).forEach { _ ->
317
+ val base = proteinLetters.random(random)
318
+ result.insert(start, base)
319
+ }
320
+ }
321
+
322
+ return result.toString()
323
+
324
+ }
325
+
326
+ /* *
327
+ * This changes random bases in the sequence to another random value.
328
+ * It changes upto numMutations bases. It puts the mutations in or out of
329
+ * the ranges defined by the bedfile depending on the setting of putMutationsInRanges.
330
+ */
331
+ private fun pointMutations (
332
+ record : ProteinSeqRecord ,
333
+ ranges : Multimap <String , BedfileRecord >,
334
+ mutatedIndicesWriter : BufferedWriter ? = null
335
+ ): String {
336
+
337
+ val origSeq = record.seq()
338
+ val seqLength = origSeq.length
339
+
340
+ val contig = record.id
341
+ val rangesForRecord = ranges.get(contig)
342
+
343
+ val mergedRanges = validateAndMergeRanges(seqLength, rangesForRecord)
344
+
345
+ val possibleMutateIndices = if (putMutationsInRanges) {
346
+ origSeq.indices.filter { index -> inRanges(index, mergedRanges) }
347
+ } else {
348
+ origSeq.indices.filter { index -> ! inRanges(index, mergedRanges) }
349
+ }.toMutableList()
350
+
351
+ val random = Random (randomSeed)
352
+
353
+ val mutatedIndices = mutableSetOf<Int >()
354
+ var numMutated = 0
355
+ while (numMutated < numMutations && possibleMutateIndices.isNotEmpty()) {
356
+ val currentIndex = possibleMutateIndices.random(random)
357
+ possibleMutateIndices.remove(currentIndex)
358
+ mutatedIndices.add(currentIndex)
359
+ numMutated++
360
+ }
361
+
362
+ val result = StringBuilder (origSeq)
363
+
364
+ mutatedIndices.sorted().forEach { index ->
365
+ val origBase = result[index]
366
+ var mutatedBase = proteinLetters[random.nextInt(numProteinLetters)]
367
+ while (origBase == mutatedBase) {
368
+ mutatedBase = proteinLetters[random.nextInt(numProteinLetters)]
369
+ }
370
+ result[index] = mutatedBase
371
+ mutatedIndicesWriter?.write(" $contig \t $index \t ${index + 1 } \n " )
372
+ }
373
+
374
+ myLogger.info(" pointMutations: contig: $contig numMutationsRequested: $numMutations numMutations: $numMutated mutatedIndices: $mutatedIndices " )
375
+
376
+ return result.toString()
377
+
378
+ }
379
+
380
+ }
0 commit comments