Skip to content

Commit e92bb99

Browse files
committed
Fix linear search regression on invalid query k-mer lengths
1 parent 0030f47 commit e92bb99

File tree

4 files changed

+46
-13
lines changed

4 files changed

+46
-13
lines changed

newmap/unique_counts.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,9 @@ def linear_search(index_filename: Path,
463463
(~finished_search).sum()))
464464
verbose_print(verbose, "Counting {}-mers".format(kmer_length))
465465

466-
# Skip any kmers that contain an ambiguous base
466+
max_kmer_query_lengths = []
467+
468+
# For every position that does not have an ambiguous base
467469
for i in np.nonzero(~finished_search)[0]:
468470
# Create the kmer from the sequence segment
469471
# NB: At the epilogue of the sequence, out of bounds
@@ -476,6 +478,10 @@ def linear_search(index_filename: Path,
476478
# Ignore it for all longer kmer lengths (i.e. all
477479
# future iterations)
478480
finished_search[i] = True
481+
# Otherwise:
482+
else:
483+
# Record the length of this k-mer
484+
max_kmer_query_lengths.append(len(kmer))
479485

480486
kmer_indices = np.nonzero(~finished_search)[0]
481487

@@ -495,7 +501,7 @@ def linear_search(index_filename: Path,
495501
count_list = get_kmer_counts(index_filename,
496502
sequence_segment.data,
497503
kmer_indices.tolist(),
498-
[kmer_length]*len(kmer_indices),
504+
max_kmer_query_lengths,
499505
num_threads)
500506

501507
# Assert that the number of indices to count and the number of counts

tests/test_count_kmers.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from tempfile import NamedTemporaryFile
44
from util import TEST_DATA_PATH
55

6-
from newmap._c_newmap_count_kmers import count_kmers
6+
from newmap._c_newmap_count_kmers import count_kmers, count_kmers_from_sequence
77
from newmap.main import (DEFAULT_SUFFIX_ARRAY_COMPRESSION_RATIO,
88
DEFAULT_KMER_LENGTH_IN_SEED_TABLE)
99
from newmap.generate_index import generate_fm_index
@@ -18,21 +18,27 @@ def setUp(self):
1818
DEFAULT_KMER_LENGTH_IN_SEED_TABLE)
1919
self.num_threads = 1
2020

21-
# @unittest.skip("Test relies on large data file not in respository")
2221
def test_count_kmers(self):
2322
counts = count_kmers(self.genome_index_filename,
2423
[b'AAAA', b'AT', b'TAT', b'CCC', b'NNN', b'TCGT'],
2524
self.num_threads)
2625
self.assertEqual(counts, [9, 3, 1, 8, 0, 0])
2726

28-
# @unittest.skip("Test relies on large data file not in respository")
2927
def test_count_wrong_type(self):
3028
with self.assertRaises(TypeError):
3129
count_kmers(self.genome_index_filename, ["AAAA"],
3230
self.num_threads)
3331

34-
# @unittest.skip("Test relies on large data file not in respository")
3532
def test_empty_byte_string(self):
3633
with self.assertRaises(ValueError):
3734
count_kmers(self.genome_index_filename, [b'AAAA', b'', b'TAT'],
3835
self.num_threads)
36+
37+
def test_sequence_counting(self):
38+
counts = count_kmers_from_sequence(
39+
self.genome_index_filename,
40+
b'AAAAATTTTTATCGAATCGA',
41+
[0, 4, 9],
42+
[4, 2, 3],
43+
self.num_threads)
44+
self.assertEqual(counts, [9, 3, 1])

tests/test_sequence_buffer_iter.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,3 +187,7 @@ def test_sizes_and_epilogues(self):
187187
self.assertTrue(sequence_buffer.epilogue)
188188

189189
self.assertRaises(StopIteration, next, buffer_iter)
190+
191+
192+
if __name__ == "__main__":
193+
unittest.main()

tests/test_unique_counts.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,29 @@
2222

2323

2424
class TestCountKmers(unittest.TestCase):
25-
def setUp(self):
26-
self.genome_index_filename = NamedTemporaryFile(mode="w").name
27-
self.fasta_filename = str(TEST_DATA_PATH / 'genome.fa')
28-
generate_fm_index(self.fasta_filename,
29-
self.genome_index_filename,
25+
genome_index_file = NamedTemporaryFile(mode="w")
26+
genome_index_filename = genome_index_file.name
27+
fasta_filename = str(TEST_DATA_PATH / 'genome.fa')
28+
num_threads = 1
29+
30+
@classmethod
31+
def setUpClass(cls):
32+
generate_fm_index(cls.fasta_filename,
33+
cls.genome_index_filename,
3034
DEFAULT_SUFFIX_ARRAY_COMPRESSION_RATIO,
3135
DEFAULT_KMER_LENGTH_IN_SEED_TABLE)
32-
self.num_threads = 1
36+
37+
@classmethod
38+
def tearDownClass(cls):
39+
cls.genome_index_file.close()
3340

3441
def test_binary_search(self):
42+
self.search(use_binary_search=True)
43+
44+
def test_linear_search(self):
45+
self.search(use_binary_search=False)
46+
47+
def search(self, use_binary_search):
3548
write_unique_counts(Path(self.fasta_filename),
3649
Path(self.genome_index_filename),
3750
15, # Batch size
@@ -40,7 +53,7 @@ def test_binary_search(self):
4053
[], # Include chr ids
4154
[], # Exclude chr ids
4255
self.num_threads,
43-
use_binary_search=True)
56+
use_binary_search)
4457

4558
# Check the results in chr1.unique.uint8 and chr2.unique.uint8
4659
chr1_results = np.fromfile('chr1.unique.uint8', dtype=np.uint8)
@@ -51,3 +64,7 @@ def test_binary_search(self):
5164

5265
self.assertTrue(np.array_equal(chr1_results, EXPECTED_CHR1_COUNTS))
5366
self.assertTrue(np.array_equal(chr2_results, EXPECTED_CHR2_COUNTS))
67+
68+
69+
if __name__ == '__main__':
70+
unittest.main()

0 commit comments

Comments
 (0)