@@ -176,34 +176,37 @@ def binary_search(index_filename: Path,
176
176
verbose_print (verbose , f"Skipping { ambiguous_positions_skipped } ambiguous "
177
177
"positions" )
178
178
179
- # Track current kmer query (for minimum) length
179
+ # Track current kmer query length
180
180
current_length_query = np .full (num_kmers , starting_kmer_length ,
181
181
dtype = data_type )
182
182
183
- # Inclusive bounds for remaining possible lengths
183
+ # Track minimum and maximum kmer lengths for each position
184
+ # NB: Inclusive bounds for search lengths
184
185
lower_length_bound = np .full (num_kmers , min_kmer_length , dtype = data_type )
185
186
186
187
# The upper search length is bounded by the minimum of:
187
188
# The maximum length in our search query range
188
- # Or the maximum length that does not overlap with an ambiguous base
189
189
upper_length_bound = np .full (num_kmers , max_kmer_length , dtype = data_type )
190
190
191
- upper_bound_change_count , short_kmers_discarded_count = update_upper_search_bound (upper_length_bound ,
192
- lower_length_bound ,
193
- current_length_query ,
194
- finished_search ,
195
- sequence_segment .data ,
196
- max_kmer_length ,
197
- min_kmer_length )
191
+ # Or the maximum length that does not overlap with an ambiguous base
192
+ update_upper_search_bound (upper_length_bound ,
193
+ lower_length_bound ,
194
+ current_length_query ,
195
+ finished_search ,
196
+ max_kmer_length ,
197
+ min_kmer_length )
198
+
199
+ if verbose :
200
+ upper_bound_change_count = np .count_nonzero (
201
+ upper_length_bound [(~ finished_search ).nonzero ()] < max_kmer_length )
198
202
199
- upper_bound_change_count = np . count_nonzero (
200
- upper_length_bound [( ~ finished_search ). nonzero ()] < max_kmer_length )
203
+ short_kmers_discarded_count = ( finished_search . sum () -
204
+ ambiguous_positions_skipped )
201
205
202
- if (upper_bound_change_count ):
203
206
verbose_print (verbose , f"{ upper_bound_change_count } k-mer search "
204
207
"ranges truncated due to ambiguity" )
205
208
206
- if ( short_kmers_discarded_count ):
209
+
207
210
verbose_print (verbose , f"{ short_kmers_discarded_count } k-mers shorter "
208
211
"than the minimum length discarded due to ambiguity" )
209
212
@@ -300,51 +303,41 @@ def update_upper_search_bound(upper_length_bound_array: npt.NDArray[np.uint],
300
303
lower_length_bound_array : npt .NDArray [np .uint ],
301
304
current_length_query_array : npt .NDArray [np .uint ],
302
305
finished_search_array : npt .NDArray [np .bool_ ],
303
- sequence_data : bytes ,
304
306
max_kmer_length ,
305
- min_kmer_length ) -> Tuple [int , int ]:
306
- """Return the count of changed positions"""
307
-
308
- # NB: The following is effecitvely O(n^2) where the max k-mer length and
309
- # sequence length are similar in size/magnitude
310
- # There might be a better way based on finding the ambiguous bases in the
311
- # sequence buffer, and then setting the max lengths of the previous
312
- # positions based on their location up to the max k away
313
-
314
- upper_bound_change_count = 0
315
- short_kmers_discarded_count = 0
307
+ min_kmer_length ):
308
+ """Modifies in the input arrays to update the upper search bound based on
309
+ ambiguous bases in the sequence data.
310
+ Updates the query lengths between the new maximum upper bound
311
+ Marks positions with less than the minimum kmer length as finished
312
+ """
316
313
317
314
data_type = current_length_query_array .dtype
318
315
319
- # TODO: Fix this
320
- # For every non-ambiguous starting position
321
- for i in np .nonzero (~ finished_search_array )[0 ]:
322
- # Get what would the maximum length kmer for this position
323
- max_length_kmer = sequence_data [i :i + max_kmer_length ]
324
- # Search for the first occurence of a base that is ambiguous
325
- # NB: The index is 0-based, so the index is equal to the length
326
- # that excludes its own position
327
- # NB: This is very slow for large sequences
328
- maximum_non_ambiguous_length = max_length_kmer .find (b'N' )
329
- # If we found an index where an ambiguous base is
330
- if maximum_non_ambiguous_length != - 1 :
331
- # If the found length is longer than the minimum kmer length
332
- if maximum_non_ambiguous_length >= min_kmer_length :
333
- # Set the maximum length (to the index of the ambiguous base)
334
- upper_length_bound_array [i ] = maximum_non_ambiguous_length
335
- # Recalculate the current query length (as a midpoint)
336
- current_length_query_array [i ] = np .floor (
337
- (upper_length_bound_array [i ] / 2 ) +
338
- (lower_length_bound_array [i ] / 2 )).astype (data_type )
339
- upper_bound_change_count += 1
340
- # Otherwise
341
- else :
342
- # Cannot find a length that would be smaller than the minimum
343
- # Mark this position as finished
344
- short_kmers_discarded_count += 1
345
- finished_search_array [i ] = True
346
-
347
- return upper_bound_change_count , short_kmers_discarded_count
316
+ # NB: Assume the finished search array is all the ambigiuous positions at
317
+ # this point
318
+ # NB: The prepend of np.nan is to ensure the first position is always not
319
+ # counted and it does an implicit conversion to bool to float conversion
320
+ ambiguous_starting_positions = \
321
+ np .where (np .diff (finished_search_array , prepend = np .nan ) > 0 )[0 ]
322
+ max_lengths_to_ambiguous_position = \
323
+ np .arange (max_kmer_length - 1 , 0 , - 1 , dtype = data_type )
324
+
325
+ # For every ambigious starting position in reverse order
326
+ for i in ambiguous_starting_positions [::- 1 ]:
327
+ # From (max kmer length - 1) positions before this position:
328
+ length_change_position = i - (max_lengths_to_ambiguous_position .size )
329
+ minimum_length_position = i - min_kmer_length
330
+ # Set the maximum length up to 1 next to the ambiguous base position
331
+ upper_length_bound_array [length_change_position :i ] = \
332
+ max_lengths_to_ambiguous_position
333
+ # Calculate the new query length as the midpoint between the updated
334
+ # upper and the current lower bounds
335
+ current_length_query_array [length_change_position :i ] = np .floor (
336
+ (upper_length_bound_array [length_change_position :i ] / 2 ) +
337
+ (lower_length_bound_array [length_change_position :i ] / 2 )).astype (
338
+ data_type )
339
+ # Mark positions with values of (min length - 1) to 1 as finished
340
+ finished_search_array [minimum_length_position + 1 :i ] = True
348
341
349
342
350
343
def linear_search (index_filename : Path ,
0 commit comments