Skip to content

Commit 8954ff3

Browse files
committed
Change parameters to binary_search
1 parent 7fbf89f commit 8954ff3

File tree

1 file changed

+81
-64
lines changed

1 file changed

+81
-64
lines changed

newmap/unique_counts.py

Lines changed: 81 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ def write_unique_counts(fasta_filename: Path,
108108
segment_unique_counts, ambiguous_count = \
109109
binary_search(index_filename,
110110
sequence_segment,
111-
kmer_lengths,
112-
num_kmers,
111+
min_kmer_length,
112+
max_kmer_length,
113113
num_threads,
114114
data_type, # type: ignore
115115
verbose)
@@ -151,74 +151,16 @@ def write_unique_counts(fasta_filename: Path,
151151
min_length_found)
152152

153153

154-
def print_summary_statisitcs(verbose: bool,
155-
total_unique_lengths_count: int,
156-
total_ambiguous_positions: int,
157-
total_no_unique_lengths_count: int,
158-
max_length_found: int,
159-
min_length_found: int):
160-
if (verbose and
161-
total_unique_lengths_count):
162-
verbose_print(verbose,
163-
f"{total_unique_lengths_count} unique lengths found")
164-
verbose_print(verbose,
165-
f"{total_ambiguous_positions} positions skipped due to "
166-
"ambiguity")
167-
verbose_print(verbose,
168-
f"{total_no_unique_lengths_count} positions with no "
169-
"unique length found")
170-
verbose_print(verbose,
171-
f"{max_length_found}-mer maximum unique length found")
172-
verbose_print(verbose,
173-
f"{min_length_found}-mer minimum unique length found")
174-
175-
176-
def get_kmer_counts(index_filename: Path,
177-
kmers: list[bytes],
178-
num_threads: int) -> npt.NDArray[np.uint32]:
179-
180-
# Count the occurances of kmers on the forward strand
181-
count_list = np.array(count_kmers(
182-
str(index_filename),
183-
kmers,
184-
num_threads),
185-
dtype=np.uint32)
186-
187-
# TODO: Add no reverse complement option to skip this to
188-
# support bisulfite treated kmer counting
189-
# TODO: Add option for a complement table
190-
count_list += np.array(
191-
count_kmers(str(index_filename),
192-
[kmer.translate(COMPLEMENT_TRANSLATE_TABLE)[::-1]
193-
for kmer in kmers],
194-
num_threads), dtype=np.uint32)
195-
196-
# If any element in the count list is 0
197-
if np.any(count_list == 0):
198-
# There is very likely a mismatch between sequence and the index
199-
# There is also a chance there is a bug in the index
200-
201-
# NB: Get first element of tuple, then first element in numpy array
202-
first_kmer_index = (count_list == 0).nonzero()[0][0]
203-
first_problem_kmer = kmers[first_kmer_index].decode("utf-8")
204-
raise RuntimeError(f"The following generated k-mer was not found in "
205-
f"the index:\n{first_problem_kmer}\n"
206-
f"Possibly a mismatch between the sequence and the "
207-
f"index.")
208-
209-
return count_list
210-
211-
212154
def binary_search(index_filename: Path,
213155
sequence_segment: SequenceSegment,
214-
kmer_lengths: list[int],
215-
num_kmers: int,
156+
min_kmer_length: int,
157+
max_kmer_length: int,
216158
num_threads: int,
217159
data_type: Union[np.uint8, np.uint16, np.uint32],
218160
verbose: bool) -> tuple[npt.NDArray[np.uint], int]:
219161

220-
max_kmer_length = max(kmer_lengths)
221-
min_kmer_length = min(kmer_lengths)
162+
num_kmers = get_num_kmers(sequence_segment, max_kmer_length)
163+
222164
# NB: Floor division for midpoint
223165
# NB: Avoid an overflow error by dividing first before sum
224166
starting_kmer_length = (max_kmer_length // 2) + (min_kmer_length // 2)
@@ -466,6 +408,81 @@ def linear_search(index_filename: Path,
466408
return unique_lengths.astype(data_type), ambiguous_positions_skipped
467409

468410

411+
def get_kmer_counts(index_filename: Path,
412+
kmers: list[bytes],
413+
num_threads: int) -> npt.NDArray[np.uint32]:
414+
415+
# Count the occurances of kmers on the forward strand
416+
count_list = np.array(count_kmers(
417+
str(index_filename),
418+
kmers,
419+
num_threads),
420+
dtype=np.uint32)
421+
422+
# TODO: Add no reverse complement option to skip this to
423+
# support bisulfite treated kmer counting
424+
# TODO: Add option for a complement table
425+
count_list += np.array(
426+
count_kmers(str(index_filename),
427+
[kmer.translate(COMPLEMENT_TRANSLATE_TABLE)[::-1]
428+
for kmer in kmers],
429+
num_threads), dtype=np.uint32)
430+
431+
# If any element in the count list is 0
432+
if np.any(count_list == 0):
433+
# There is very likely a mismatch between sequence and the index
434+
# There is also a chance there is a bug in the index
435+
436+
# NB: Get first element of tuple, then first element in numpy array
437+
first_kmer_index = (count_list == 0).nonzero()[0][0]
438+
first_problem_kmer = kmers[first_kmer_index].decode("utf-8")
439+
raise RuntimeError(f"The following generated k-mer was not found in "
440+
f"the index:\n{first_problem_kmer}\n"
441+
f"Possibly a mismatch between the sequence and the "
442+
f"index.")
443+
444+
return count_list
445+
446+
447+
def get_num_kmers(sequence_segment: SequenceSegment,
448+
max_kmer_length: int):
449+
"""Returns the maximum number of kmers from a SequenceSegment"""
450+
sequence_length = len(sequence_segment.data)
451+
# If this is the last sequence segment
452+
if sequence_segment.epilogue:
453+
# The number of kmers is equal to the entire length of the
454+
# sequence segment
455+
return sequence_length
456+
# Otherwise
457+
else:
458+
# The number of kmers is equal to kmer batch size
459+
# (or the sequence segment length minus the lookahead)
460+
lookahead_length = max_kmer_length - 1
461+
return sequence_length - lookahead_length
462+
463+
464+
def print_summary_statisitcs(verbose: bool,
465+
total_unique_lengths_count: int,
466+
total_ambiguous_positions: int,
467+
total_no_unique_lengths_count: int,
468+
max_length_found: int,
469+
min_length_found: int):
470+
if (verbose and
471+
total_unique_lengths_count):
472+
verbose_print(verbose,
473+
f"{total_unique_lengths_count} unique lengths found")
474+
verbose_print(verbose,
475+
f"{total_ambiguous_positions} positions skipped due to "
476+
"ambiguity")
477+
verbose_print(verbose,
478+
f"{total_no_unique_lengths_count} positions with no "
479+
"unique length found")
480+
verbose_print(verbose,
481+
f"{max_length_found}-mer maximum unique length found")
482+
verbose_print(verbose,
483+
f"{min_length_found}-mer minimum unique length found")
484+
485+
469486
def main(args):
470487
fasta_filename = args.fasta_file
471488
index_filename = args.index_file

0 commit comments

Comments
 (0)