@@ -108,8 +108,8 @@ def write_unique_counts(fasta_filename: Path,
108
108
segment_unique_counts , ambiguous_count = \
109
109
binary_search (index_filename ,
110
110
sequence_segment ,
111
- kmer_lengths ,
112
- num_kmers ,
111
+ min_kmer_length ,
112
+ max_kmer_length ,
113
113
num_threads ,
114
114
data_type , # type: ignore
115
115
verbose )
@@ -151,74 +151,16 @@ def write_unique_counts(fasta_filename: Path,
151
151
min_length_found )
152
152
153
153
154
- def print_summary_statisitcs (verbose : bool ,
155
- total_unique_lengths_count : int ,
156
- total_ambiguous_positions : int ,
157
- total_no_unique_lengths_count : int ,
158
- max_length_found : int ,
159
- min_length_found : int ):
160
- if (verbose and
161
- total_unique_lengths_count ):
162
- verbose_print (verbose ,
163
- f"{ total_unique_lengths_count } unique lengths found" )
164
- verbose_print (verbose ,
165
- f"{ total_ambiguous_positions } positions skipped due to "
166
- "ambiguity" )
167
- verbose_print (verbose ,
168
- f"{ total_no_unique_lengths_count } positions with no "
169
- "unique length found" )
170
- verbose_print (verbose ,
171
- f"{ max_length_found } -mer maximum unique length found" )
172
- verbose_print (verbose ,
173
- f"{ min_length_found } -mer minimum unique length found" )
174
-
175
-
176
- def get_kmer_counts (index_filename : Path ,
177
- kmers : list [bytes ],
178
- num_threads : int ) -> npt .NDArray [np .uint32 ]:
179
-
180
- # Count the occurances of kmers on the forward strand
181
- count_list = np .array (count_kmers (
182
- str (index_filename ),
183
- kmers ,
184
- num_threads ),
185
- dtype = np .uint32 )
186
-
187
- # TODO: Add no reverse complement option to skip this to
188
- # support bisulfite treated kmer counting
189
- # TODO: Add option for a complement table
190
- count_list += np .array (
191
- count_kmers (str (index_filename ),
192
- [kmer .translate (COMPLEMENT_TRANSLATE_TABLE )[::- 1 ]
193
- for kmer in kmers ],
194
- num_threads ), dtype = np .uint32 )
195
-
196
- # If any element in the count list is 0
197
- if np .any (count_list == 0 ):
198
- # There is very likely a mismatch between sequence and the index
199
- # There is also a chance there is a bug in the index
200
-
201
- # NB: Get first element of tuple, then first element in numpy array
202
- first_kmer_index = (count_list == 0 ).nonzero ()[0 ][0 ]
203
- first_problem_kmer = kmers [first_kmer_index ].decode ("utf-8" )
204
- raise RuntimeError (f"The following generated k-mer was not found in "
205
- f"the index:\n { first_problem_kmer } \n "
206
- f"Possibly a mismatch between the sequence and the "
207
- f"index." )
208
-
209
- return count_list
210
-
211
-
212
154
def binary_search (index_filename : Path ,
213
155
sequence_segment : SequenceSegment ,
214
- kmer_lengths : list [ int ] ,
215
- num_kmers : int ,
156
+ min_kmer_length : int ,
157
+ max_kmer_length : int ,
216
158
num_threads : int ,
217
159
data_type : Union [np .uint8 , np .uint16 , np .uint32 ],
218
160
verbose : bool ) -> tuple [npt .NDArray [np .uint ], int ]:
219
161
220
- max_kmer_length = max ( kmer_lengths )
221
- min_kmer_length = min ( kmer_lengths )
162
+ num_kmers = get_num_kmers ( sequence_segment , max_kmer_length )
163
+
222
164
# NB: Floor division for midpoint
223
165
# NB: Avoid an overflow error by dividing first before sum
224
166
starting_kmer_length = (max_kmer_length // 2 ) + (min_kmer_length // 2 )
@@ -466,6 +408,81 @@ def linear_search(index_filename: Path,
466
408
return unique_lengths .astype (data_type ), ambiguous_positions_skipped
467
409
468
410
411
+ def get_kmer_counts (index_filename : Path ,
412
+ kmers : list [bytes ],
413
+ num_threads : int ) -> npt .NDArray [np .uint32 ]:
414
+
415
+ # Count the occurances of kmers on the forward strand
416
+ count_list = np .array (count_kmers (
417
+ str (index_filename ),
418
+ kmers ,
419
+ num_threads ),
420
+ dtype = np .uint32 )
421
+
422
+ # TODO: Add no reverse complement option to skip this to
423
+ # support bisulfite treated kmer counting
424
+ # TODO: Add option for a complement table
425
+ count_list += np .array (
426
+ count_kmers (str (index_filename ),
427
+ [kmer .translate (COMPLEMENT_TRANSLATE_TABLE )[::- 1 ]
428
+ for kmer in kmers ],
429
+ num_threads ), dtype = np .uint32 )
430
+
431
+ # If any element in the count list is 0
432
+ if np .any (count_list == 0 ):
433
+ # There is very likely a mismatch between sequence and the index
434
+ # There is also a chance there is a bug in the index
435
+
436
+ # NB: Get first element of tuple, then first element in numpy array
437
+ first_kmer_index = (count_list == 0 ).nonzero ()[0 ][0 ]
438
+ first_problem_kmer = kmers [first_kmer_index ].decode ("utf-8" )
439
+ raise RuntimeError (f"The following generated k-mer was not found in "
440
+ f"the index:\n { first_problem_kmer } \n "
441
+ f"Possibly a mismatch between the sequence and the "
442
+ f"index." )
443
+
444
+ return count_list
445
+
446
+
447
+ def get_num_kmers (sequence_segment : SequenceSegment ,
448
+ max_kmer_length : int ):
449
+ """Returns the maximum number of kmers from a SequenceSegment"""
450
+ sequence_length = len (sequence_segment .data )
451
+ # If this is the last sequence segment
452
+ if sequence_segment .epilogue :
453
+ # The number of kmers is equal to the entire length of the
454
+ # sequence segment
455
+ return sequence_length
456
+ # Otherwise
457
+ else :
458
+ # The number of kmers is equal to kmer batch size
459
+ # (or the sequence segment length minus the lookahead)
460
+ lookahead_length = max_kmer_length - 1
461
+ return sequence_length - lookahead_length
462
+
463
+
464
+ def print_summary_statisitcs (verbose : bool ,
465
+ total_unique_lengths_count : int ,
466
+ total_ambiguous_positions : int ,
467
+ total_no_unique_lengths_count : int ,
468
+ max_length_found : int ,
469
+ min_length_found : int ):
470
+ if (verbose and
471
+ total_unique_lengths_count ):
472
+ verbose_print (verbose ,
473
+ f"{ total_unique_lengths_count } unique lengths found" )
474
+ verbose_print (verbose ,
475
+ f"{ total_ambiguous_positions } positions skipped due to "
476
+ "ambiguity" )
477
+ verbose_print (verbose ,
478
+ f"{ total_no_unique_lengths_count } positions with no "
479
+ "unique length found" )
480
+ verbose_print (verbose ,
481
+ f"{ max_length_found } -mer maximum unique length found" )
482
+ verbose_print (verbose ,
483
+ f"{ min_length_found } -mer minimum unique length found" )
484
+
485
+
469
486
def main (args ):
470
487
fasta_filename = args .fasta_file
471
488
index_filename = args .index_file
0 commit comments