Skip to content

Commit ffa732a

Browse files
committed
Move upper search bound calculation to own function
1 parent 0f3e859 commit ffa732a

File tree

1 file changed

+62
-38
lines changed

1 file changed

+62
-38
lines changed

newmap/unique_counts.py

Lines changed: 62 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from math import ceil, log2
22
from pathlib import Path
3-
from typing import Union
3+
from typing import Tuple, Union
44

55
import numpy as np
66
import numpy.typing as npt
@@ -187,41 +187,14 @@ def binary_search(index_filename: Path,
187187
# The maximum length in our search query range
188188
# Or the maximum length that does not overlap with an ambiguous base
189189
upper_length_bound = np.full(num_kmers, max_kmer_length, dtype=data_type)
190-
upper_bound_change_count = 0
191-
short_kmers_discarded_count = 0
192-
193-
# NB: The following is effecitvely O(n^2) where the max k-mer length and
194-
# sequence length are similar in size/magnitude
195-
# There might be a better way based on finding the ambiguous bases in the
196-
# sequence buffer, and then setting the max lengths of the previous
197-
# positions based on their location up to the max k away
198190

199-
# For every non-ambiguous starting position
200-
for i in np.nonzero(~finished_search)[0]:
201-
# Get what would the maximum length kmer for this position
202-
max_length_kmer = sequence_segment.data[i:i+max_kmer_length]
203-
# Search for the first occurance of a base that is ambiguous
204-
# NB: The index is 0-based, so the index is equal to the length
205-
# that excludes its own position
206-
# NB: This is very slow for large sequences
207-
maximum_non_ambiguous_length = max_length_kmer.find(b'N')
208-
# If we found an index where an ambiguous base is
209-
if maximum_non_ambiguous_length != -1:
210-
# If the found length is longer than the minimum kmer length
211-
if maximum_non_ambiguous_length >= min_kmer_length:
212-
# Set the maximum length (to the index of the ambiguous base)
213-
upper_length_bound[i] = maximum_non_ambiguous_length
214-
# Recalculate the current query length (as a midpoint)
215-
current_length_query[i] = np.floor(
216-
(upper_length_bound[i] / 2) +
217-
(lower_length_bound[i] / 2)).astype(data_type)
218-
upper_bound_change_count += 1
219-
# Otherwise
220-
else:
221-
# Cannot find a length that would be smaller than the minimum
222-
# Mark this position as finished
223-
short_kmers_discarded_count += 1
224-
finished_search[i] = True
191+
upper_bound_change_count, short_kmers_discarded_count = update_upper_search_bound(upper_length_bound,
192+
lower_length_bound,
193+
current_length_query,
194+
finished_search,
195+
sequence_segment.data,
196+
max_kmer_length,
197+
min_kmer_length)
225198

226199
upper_bound_change_count = np.count_nonzero(
227200
upper_length_bound[(~finished_search).nonzero()] < max_kmer_length)
@@ -259,7 +232,7 @@ def binary_search(index_filename: Path,
259232
kmer = sequence_segment.data[i:i+current_kmer_length]
260233
working_kmers.append(kmer)
261234

262-
# Get the occurances of the kmers on both strands
235+
# Get the occurences of the kmers on both strands
263236
count_list = get_kmer_counts(index_filename,
264237
working_kmers,
265238
num_threads)
@@ -323,6 +296,57 @@ def binary_search(index_filename: Path,
323296
return unique_lengths, ambiguous_positions_skipped
324297

325298

299+
def update_upper_search_bound(upper_length_bound_array: npt.NDArray[np.uint],
300+
lower_length_bound_array: npt.NDArray[np.uint],
301+
current_length_query_array: npt.NDArray[np.uint],
302+
finished_search_array: npt.NDArray[np.bool_],
303+
sequence_data: bytes,
304+
max_kmer_length,
305+
min_kmer_length) -> Tuple[int, int]:
306+
"""Return the count of changed positions"""
307+
308+
# NB: The following is effecitvely O(n^2) where the max k-mer length and
309+
# sequence length are similar in size/magnitude
310+
# There might be a better way based on finding the ambiguous bases in the
311+
# sequence buffer, and then setting the max lengths of the previous
312+
# positions based on their location up to the max k away
313+
314+
upper_bound_change_count = 0
315+
short_kmers_discarded_count = 0
316+
317+
data_type = current_length_query_array.dtype
318+
319+
# TODO: Fix this
320+
# For every non-ambiguous starting position
321+
for i in np.nonzero(~finished_search_array)[0]:
322+
# Get what would the maximum length kmer for this position
323+
max_length_kmer = sequence_data[i:i+max_kmer_length]
324+
# Search for the first occurence of a base that is ambiguous
325+
# NB: The index is 0-based, so the index is equal to the length
326+
# that excludes its own position
327+
# NB: This is very slow for large sequences
328+
maximum_non_ambiguous_length = max_length_kmer.find(b'N')
329+
# If we found an index where an ambiguous base is
330+
if maximum_non_ambiguous_length != -1:
331+
# If the found length is longer than the minimum kmer length
332+
if maximum_non_ambiguous_length >= min_kmer_length:
333+
# Set the maximum length (to the index of the ambiguous base)
334+
upper_length_bound_array[i] = maximum_non_ambiguous_length
335+
# Recalculate the current query length (as a midpoint)
336+
current_length_query_array[i] = np.floor(
337+
(upper_length_bound_array[i] / 2) +
338+
(lower_length_bound_array[i] / 2)).astype(data_type)
339+
upper_bound_change_count += 1
340+
# Otherwise
341+
else:
342+
# Cannot find a length that would be smaller than the minimum
343+
# Mark this position as finished
344+
short_kmers_discarded_count += 1
345+
finished_search_array[i] = True
346+
347+
return upper_bound_change_count, short_kmers_discarded_count
348+
349+
326350
def linear_search(index_filename: Path,
327351
sequence_segment: SequenceSegment,
328352
kmer_lengths: list[int],
@@ -377,7 +401,7 @@ def linear_search(index_filename: Path,
377401
"{} {}-mers remaining to be counted"
378402
.format(len(working_kmers), kmer_length))
379403

380-
# Count the occurances of the kmers on both strands
404+
# Count the occurences of the kmers on both strands
381405
count_list = get_kmer_counts(index_filename,
382406
working_kmers,
383407
num_threads)
@@ -413,7 +437,7 @@ def get_kmer_counts(index_filename: Path,
413437
kmers: list[bytes],
414438
num_threads: int) -> npt.NDArray[np.uint32]:
415439

416-
# Count the occurances of kmers on the forward strand
440+
# Count the occurences of kmers on the forward strand
417441
count_list = np.array(count_kmers(
418442
str(index_filename),
419443
kmers,

0 commit comments

Comments
 (0)