|
1 | 1 | from math import ceil, log2
|
2 | 2 | from pathlib import Path
|
3 |
| -from typing import Union |
| 3 | +from typing import Tuple, Union |
4 | 4 |
|
5 | 5 | import numpy as np
|
6 | 6 | import numpy.typing as npt
|
@@ -187,41 +187,14 @@ def binary_search(index_filename: Path,
|
187 | 187 | # The maximum length in our search query range
|
188 | 188 | # Or the maximum length that does not overlap with an ambiguous base
|
189 | 189 | upper_length_bound = np.full(num_kmers, max_kmer_length, dtype=data_type)
|
190 |
| - upper_bound_change_count = 0 |
191 |
| - short_kmers_discarded_count = 0 |
192 |
| - |
193 |
| - # NB: The following is effecitvely O(n^2) where the max k-mer length and |
194 |
| - # sequence length are similar in size/magnitude |
195 |
| - # There might be a better way based on finding the ambiguous bases in the |
196 |
| - # sequence buffer, and then setting the max lengths of the previous |
197 |
| - # positions based on their location up to the max k away |
198 | 190 |
|
199 |
| - # For every non-ambiguous starting position |
200 |
| - for i in np.nonzero(~finished_search)[0]: |
201 |
| - # Get what would the maximum length kmer for this position |
202 |
| - max_length_kmer = sequence_segment.data[i:i+max_kmer_length] |
203 |
| - # Search for the first occurance of a base that is ambiguous |
204 |
| - # NB: The index is 0-based, so the index is equal to the length |
205 |
| - # that excludes its own position |
206 |
| - # NB: This is very slow for large sequences |
207 |
| - maximum_non_ambiguous_length = max_length_kmer.find(b'N') |
208 |
| - # If we found an index where an ambiguous base is |
209 |
| - if maximum_non_ambiguous_length != -1: |
210 |
| - # If the found length is longer than the minimum kmer length |
211 |
| - if maximum_non_ambiguous_length >= min_kmer_length: |
212 |
| - # Set the maximum length (to the index of the ambiguous base) |
213 |
| - upper_length_bound[i] = maximum_non_ambiguous_length |
214 |
| - # Recalculate the current query length (as a midpoint) |
215 |
| - current_length_query[i] = np.floor( |
216 |
| - (upper_length_bound[i] / 2) + |
217 |
| - (lower_length_bound[i] / 2)).astype(data_type) |
218 |
| - upper_bound_change_count += 1 |
219 |
| - # Otherwise |
220 |
| - else: |
221 |
| - # Cannot find a length that would be smaller than the minimum |
222 |
| - # Mark this position as finished |
223 |
| - short_kmers_discarded_count += 1 |
224 |
| - finished_search[i] = True |
| 191 | + upper_bound_change_count, short_kmers_discarded_count = update_upper_search_bound(upper_length_bound, |
| 192 | + lower_length_bound, |
| 193 | + current_length_query, |
| 194 | + finished_search, |
| 195 | + sequence_segment.data, |
| 196 | + max_kmer_length, |
| 197 | + min_kmer_length) |
225 | 198 |
|
226 | 199 | upper_bound_change_count = np.count_nonzero(
|
227 | 200 | upper_length_bound[(~finished_search).nonzero()] < max_kmer_length)
|
@@ -259,7 +232,7 @@ def binary_search(index_filename: Path,
|
259 | 232 | kmer = sequence_segment.data[i:i+current_kmer_length]
|
260 | 233 | working_kmers.append(kmer)
|
261 | 234 |
|
262 |
| - # Get the occurances of the kmers on both strands |
| 235 | + # Get the occurrences of the kmers on both strands
263 | 236 | count_list = get_kmer_counts(index_filename,
|
264 | 237 | working_kmers,
|
265 | 238 | num_threads)
|
@@ -323,6 +296,57 @@ def binary_search(index_filename: Path,
|
323 | 296 | return unique_lengths, ambiguous_positions_skipped
|
324 | 297 |
|
325 | 298 |
|
def update_upper_search_bound(upper_length_bound_array: npt.NDArray[np.uint],
                              lower_length_bound_array: npt.NDArray[np.uint],
                              current_length_query_array: npt.NDArray[np.uint],
                              finished_search_array: npt.NDArray[np.bool_],
                              sequence_data: bytes,
                              max_kmer_length: int,
                              min_kmer_length: int) -> Tuple[int, int]:
    """Clamp each position's upper search bound at the first ambiguous base.

    For every position still being searched, lower the upper length bound
    to the distance to the first ambiguous base (``b'N'``) found within
    ``max_kmer_length`` bytes of that position, and recompute the current
    query length as the midpoint of the new bounds.  Positions whose
    longest non-ambiguous k-mer would be shorter than ``min_kmer_length``
    are marked as finished instead.

    The bound, query, and finished arrays are modified in place.

    Returns:
        A tuple ``(upper_bound_change_count, short_kmers_discarded_count)``:
        the number of positions whose upper bound was lowered, and the
        number of positions discarded because their longest non-ambiguous
        k-mer is below the minimum length.
    """

    # NB: The following is effectively O(n^2) where the max k-mer length and
    # sequence length are similar in size/magnitude
    # There might be a better way based on finding the ambiguous bases in the
    # sequence buffer, and then setting the max lengths of the previous
    # positions based on their location up to the max k away

    upper_bound_change_count = 0
    short_kmers_discarded_count = 0

    data_type = current_length_query_array.dtype

    # For every position whose search has not finished yet
    for i in np.nonzero(~finished_search_array)[0]:
        # Search for the first occurrence of an ambiguous base within the
        # window of the longest possible k-mer at this position.
        # NB: bytes.find with start/end bounds avoids copying the window
        # slice; the returned index is absolute within sequence_data.
        ambiguous_index = sequence_data.find(b'N', i, i + max_kmer_length)
        # If no ambiguous base falls inside the window, leave this
        # position's bounds untouched
        if ambiguous_index == -1:
            continue
        # NB: Indices are 0-based, so the index relative to position i is
        # exactly the k-mer length that excludes the ambiguous base itself
        maximum_non_ambiguous_length = ambiguous_index - i
        # If the found length is at least the minimum kmer length
        if maximum_non_ambiguous_length >= min_kmer_length:
            # Set the maximum length (to the offset of the ambiguous base)
            upper_length_bound_array[i] = maximum_non_ambiguous_length
            # Recalculate the current query length (as a midpoint)
            # NB: Halving each bound before summing avoids overflowing the
            # unsigned integer type
            current_length_query_array[i] = np.floor(
                (upper_length_bound_array[i] / 2) +
                (lower_length_bound_array[i] / 2)).astype(data_type)
            upper_bound_change_count += 1
        else:
            # No k-mer at this position can avoid the ambiguous base while
            # meeting the minimum length; mark the position as finished
            short_kmers_discarded_count += 1
            finished_search_array[i] = True

    return upper_bound_change_count, short_kmers_discarded_count
| 348 | + |
| 349 | + |
326 | 350 | def linear_search(index_filename: Path,
|
327 | 351 | sequence_segment: SequenceSegment,
|
328 | 352 | kmer_lengths: list[int],
|
@@ -377,7 +401,7 @@ def linear_search(index_filename: Path,
|
377 | 401 | "{} {}-mers remaining to be counted"
|
378 | 402 | .format(len(working_kmers), kmer_length))
|
379 | 403 |
|
380 |
| - # Count the occurances of the kmers on both strands |
| 404 | + # Count the occurrences of the kmers on both strands
381 | 405 | count_list = get_kmer_counts(index_filename,
|
382 | 406 | working_kmers,
|
383 | 407 | num_threads)
|
@@ -413,7 +437,7 @@ def get_kmer_counts(index_filename: Path,
|
413 | 437 | kmers: list[bytes],
|
414 | 438 | num_threads: int) -> npt.NDArray[np.uint32]:
|
415 | 439 |
|
416 |
| - # Count the occurances of kmers on the forward strand |
| 440 | + # Count the occurrences of kmers on the forward strand
417 | 441 | count_list = np.array(count_kmers(
|
418 | 442 | str(index_filename),
|
419 | 443 | kmers,
|
|
0 commit comments