Skip to content

Commit 16356dc

Browse files
committed
Fix bug in upper length bound calculation where an ambiguous position could be less than the maximum k-mer length from the start of a sequence buffer
1 parent 16715fe commit 16356dc

File tree

1 file changed

+25
-10
lines changed

1 file changed

+25
-10
lines changed

newmap/unique_counts.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -342,30 +342,45 @@ def update_upper_search_bound(upper_length_bound_array: npt.NDArray[np.uint],
342342
# For every ambigious starting position in reverse order
343343
for i in ambiguous_starting_positions[::-1]:
344344
# From (max kmer length - 1) positions before this position:
345-
length_change_position = i - (max_lengths_to_ambiguous_position.size)
346-
minimum_length_position = i - min_kmer_length
347-
# Set the maximum length up to 1 next to the ambiguous base position
348-
upper_length_bound_array[length_change_position:i] = \
349-
max_lengths_to_ambiguous_position
345+
346+
# Skip if the ambiguous position is at position 0
347+
if i == 0:
348+
continue
349+
350+
# Calculate the starting position where the upper length bounds will
351+
# change
352+
# NB: Account for the case where our ambigious position is less than
353+
# the max kmer length - 1
354+
upper_length_change_position = \
355+
max(0, i - (max_lengths_to_ambiguous_position.size))
356+
357+
# This value is different from the max kmer length (-1) if the
358+
# ambiguous position is less than the max kmer length - 1 (near the
359+
# start of the arrays)
360+
num_upper_length_changes = i - upper_length_change_position
361+
upper_length_bound_array[upper_length_change_position:i] = \
362+
max_lengths_to_ambiguous_position[-num_upper_length_changes:]
350363

351364
# Calculate the new query length as the midpoint between the updated
352365
# upper and the current lower bounds
353366
new_initial_search_array = np.floor(
354-
(upper_length_bound_array[length_change_position:i] / 2) +
355-
(lower_length_bound_array[length_change_position:i] / 2)).astype(
356-
data_type)
367+
(upper_length_bound_array[upper_length_change_position:i] / 2) +
368+
(lower_length_bound_array[upper_length_change_position:i] /
369+
2)).astype(data_type)
357370

358371
# If we have an initial search length
359372
if initial_search_length:
360373
# Use the initial search length if it is less than the new midpoint
361374
new_initial_search_array = np.fmin(new_initial_search_array,
362375
initial_search_length)
363376

364-
current_length_query_array[length_change_position:i] = \
377+
current_length_query_array[upper_length_change_position:i] = \
365378
new_initial_search_array
366379

367380
# Mark positions with values of (min length - 1) to 1 as finished
368-
finished_search_array[minimum_length_position+1:i] = True
381+
finished_search_array[upper_length_change_position:i] = \
382+
(upper_length_bound_array[upper_length_change_position:i] <
383+
min_kmer_length)
369384

370385

371386
def linear_search(index_filename: Path,

0 commit comments

Comments
 (0)