Skip to content

Commit b6eaa55

Browse files
committed
Change upper bound ambiguity truncation function to be much more efficient
1 parent ffa732a commit b6eaa55

File tree

1 file changed

+48
-55
lines changed

1 file changed

+48
-55
lines changed

newmap/unique_counts.py

Lines changed: 48 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -176,34 +176,37 @@ def binary_search(index_filename: Path,
176176
verbose_print(verbose, f"Skipping {ambiguous_positions_skipped} ambiguous "
177177
"positions")
178178

179-
# Track current kmer query (for minimum) length
179+
# Track current kmer query length
180180
current_length_query = np.full(num_kmers, starting_kmer_length,
181181
dtype=data_type)
182182

183-
# Inclusive bounds for remaining possible lengths
183+
# Track minimum and maximum kmer lengths for each position
184+
# NB: Inclusive bounds for search lengths
184185
lower_length_bound = np.full(num_kmers, min_kmer_length, dtype=data_type)
185186

186187
# The upper search length is bounded by the minimum of:
187188
# The maximum length in our search query range
188-
# Or the maximum length that does not overlap with an ambiguous base
189189
upper_length_bound = np.full(num_kmers, max_kmer_length, dtype=data_type)
190190

191-
upper_bound_change_count, short_kmers_discarded_count = update_upper_search_bound(upper_length_bound,
192-
lower_length_bound,
193-
current_length_query,
194-
finished_search,
195-
sequence_segment.data,
196-
max_kmer_length,
197-
min_kmer_length)
191+
# Or the maximum length that does not overlap with an ambiguous base
192+
update_upper_search_bound(upper_length_bound,
193+
lower_length_bound,
194+
current_length_query,
195+
finished_search,
196+
max_kmer_length,
197+
min_kmer_length)
198+
199+
if verbose:
200+
upper_bound_change_count = np.count_nonzero(
201+
upper_length_bound[(~finished_search).nonzero()] < max_kmer_length)
198202

199-
upper_bound_change_count = np.count_nonzero(
200-
upper_length_bound[(~finished_search).nonzero()] < max_kmer_length)
203+
short_kmers_discarded_count = (finished_search.sum() -
204+
ambiguous_positions_skipped)
201205

202-
if (upper_bound_change_count):
203206
verbose_print(verbose, f"{upper_bound_change_count} k-mer search "
204207
"ranges truncated due to ambiguity")
205208

206-
if (short_kmers_discarded_count):
209+
207210
verbose_print(verbose, f"{short_kmers_discarded_count} k-mers shorter "
208211
"than the minimum length discarded due to ambiguity")
209212

@@ -300,51 +303,41 @@ def update_upper_search_bound(upper_length_bound_array: npt.NDArray[np.uint],
300303
lower_length_bound_array: npt.NDArray[np.uint],
301304
current_length_query_array: npt.NDArray[np.uint],
302305
finished_search_array: npt.NDArray[np.bool_],
303-
sequence_data: bytes,
304306
max_kmer_length,
305-
min_kmer_length) -> Tuple[int, int]:
306-
"""Return the count of changed positions"""
307-
308-
# NB: The following is effecitvely O(n^2) where the max k-mer length and
309-
# sequence length are similar in size/magnitude
310-
# There might be a better way based on finding the ambiguous bases in the
311-
# sequence buffer, and then setting the max lengths of the previous
312-
# positions based on their location up to the max k away
313-
314-
upper_bound_change_count = 0
315-
short_kmers_discarded_count = 0
307+
min_kmer_length):
308+
"""Modifies in the input arrays to update the upper search bound based on
309+
ambiguous bases in the sequence data.
310+
Updates the query lengths between the new maximum upper bound
311+
Marks positions with less than the minimum kmer length as finished
312+
"""
316313

317314
data_type = current_length_query_array.dtype
318315

319-
# TODO: Fix this
320-
# For every non-ambiguous starting position
321-
for i in np.nonzero(~finished_search_array)[0]:
322-
# Get what would the maximum length kmer for this position
323-
max_length_kmer = sequence_data[i:i+max_kmer_length]
324-
# Search for the first occurence of a base that is ambiguous
325-
# NB: The index is 0-based, so the index is equal to the length
326-
# that excludes its own position
327-
# NB: This is very slow for large sequences
328-
maximum_non_ambiguous_length = max_length_kmer.find(b'N')
329-
# If we found an index where an ambiguous base is
330-
if maximum_non_ambiguous_length != -1:
331-
# If the found length is longer than the minimum kmer length
332-
if maximum_non_ambiguous_length >= min_kmer_length:
333-
# Set the maximum length (to the index of the ambiguous base)
334-
upper_length_bound_array[i] = maximum_non_ambiguous_length
335-
# Recalculate the current query length (as a midpoint)
336-
current_length_query_array[i] = np.floor(
337-
(upper_length_bound_array[i] / 2) +
338-
(lower_length_bound_array[i] / 2)).astype(data_type)
339-
upper_bound_change_count += 1
340-
# Otherwise
341-
else:
342-
# Cannot find a length that would be smaller than the minimum
343-
# Mark this position as finished
344-
short_kmers_discarded_count += 1
345-
finished_search_array[i] = True
346-
347-
return upper_bound_change_count, short_kmers_discarded_count
316+
# NB: Assume the finished search array is all the ambigiuous positions at
317+
# this point
318+
# NB: The prepend of np.nan is to ensure the first position is always not
319+
# counted and it does an implicit conversion to bool to float conversion
320+
ambiguous_starting_positions = \
321+
np.where(np.diff(finished_search_array, prepend=np.nan) > 0)[0]
322+
max_lengths_to_ambiguous_position = \
323+
np.arange(max_kmer_length-1, 0, -1, dtype=data_type)
324+
325+
# For every ambigious starting position in reverse order
326+
for i in ambiguous_starting_positions[::-1]:
327+
# From (max kmer length - 1) positions before this position:
328+
length_change_position = i - (max_lengths_to_ambiguous_position.size)
329+
minimum_length_position = i - min_kmer_length
330+
# Set the maximum length up to 1 next to the ambiguous base position
331+
upper_length_bound_array[length_change_position:i] = \
332+
max_lengths_to_ambiguous_position
333+
# Calculate the new query length as the midpoint between the updated
334+
# upper and the current lower bounds
335+
current_length_query_array[length_change_position:i] = np.floor(
336+
(upper_length_bound_array[length_change_position:i] / 2) +
337+
(lower_length_bound_array[length_change_position:i] / 2)).astype(
338+
data_type)
339+
# Mark positions with values of (min length - 1) to 1 as finished
340+
finished_search_array[minimum_length_position+1:i] = True
348341

349342

350343
def linear_search(index_filename: Path,

0 commit comments

Comments
 (0)