Skip to content

Commit 18e1e07

Browse files
committed
Fix corner case bug for the sequence buffer iterator
It was possible that the the remaining sequence buffer exactly aligned with the end of the sequence. This cause some sequence to be double counted.
1 parent eba9066 commit 18e1e07

File tree

2 files changed

+177
-44
lines changed

2 files changed

+177
-44
lines changed

newmap/fasta.py

Lines changed: 150 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -38,51 +38,157 @@ def sequence_segments(
3838
# NB: Mutable sequence of bytes
3939
# NB: This is over a 1000x (not a typo) speed-up over a byte object
4040
working_sequence_buffer = bytearray()
41+
overlap_buffer = bytearray()
4142

42-
# NB: Line buffered reading is probably the best way to handle edge cases
43-
# around sequence ID parsing and newline characters
44-
# NB: fasta_file is assumed to be opened in binary mode (rb)
45-
for fasta_line in fasta_file:
46-
# If we are on a new sequence (sequence ID)
47-
# NB: Assume that either of the delimiters are indicators of a
48-
# new sequence, notably including comments
49-
if fasta_line.startswith(FASTA_FILE_IGNORE_DELIMITERS): # type: ignore
50-
# Yield the current sequence segment if there is remaining sequence
51-
# NB: We always keep the lookahead/overlap in the working sequence
52-
# buffer, therefore there can only be sequence remaining if it is
53-
# longer than the lookahead/overlap length
54-
if len(working_sequence_buffer) > sequence_overlap_length:
55-
yield SequenceSegment(current_sequence_id, # type: ignore
56-
bytes(working_sequence_buffer))
57-
58-
# Get the new reference sequence name
43+
sequences = [] # working list of sequences
44+
45+
# import debugpy
46+
# debugpy.listen(5678)
47+
# debugpy.wait_for_client()
48+
# debugpy.breakpoint()
49+
# For every line in the fasta file
50+
for line in fasta_file:
51+
line = line.rstrip() # Remove trailing newline
52+
53+
# While there is enough working sequence buffer to fill the requested
54+
# sequence length
55+
# Create sequences for each segment
56+
sequences.extend(get_sequences_from_buffer(
57+
working_sequence_buffer,
58+
overlap_buffer,
59+
sequence_length,
60+
sequence_overlap_length))
61+
62+
# If the current line is a sequence ID
63+
if line.startswith(FASTA_FILE_IGNORE_DELIMITERS): # type: ignore
64+
# Yield the remaining sequences
65+
for sequence_buffer in get_remaining_sequence_segments(
66+
current_sequence_id, # type: ignore
67+
sequences,
68+
working_sequence_buffer,
69+
overlap_buffer,
70+
sequence_length,
71+
sequence_overlap_length):
72+
yield sequence_buffer
73+
74+
# Empty working sequences
75+
sequences.clear()
76+
77+
# Update the working sequence ID
5978
# NB: remove leading '>'
60-
current_sequence_id = fasta_line.split()[0][1:] # type: ignore
61-
# Reset the working sequence buffer
62-
working_sequence_buffer = bytearray()
79+
current_sequence_id = line.split()[0][1:] # type: ignore
80+
# Clear the overlap buffer
81+
overlap_buffer.clear()
82+
# Clear the working buffer
83+
working_sequence_buffer.clear()
6384

64-
# Otherwise the line we are on is sequence data
85+
# Otherwise the line is not a sequence ID and is sequence data
6586
else:
66-
fasta_line = fasta_line.rstrip() # Remove trailing newline
67-
# Add to the working sequence buffer
68-
working_sequence_buffer += fasta_line # type: ignore
69-
# While we have enough sequence buffer to fill a sequence segment
70-
while len(working_sequence_buffer) >= sequence_length:
71-
yield SequenceSegment(
72-
current_sequence_id, # type: ignore
73-
bytes(working_sequence_buffer[:sequence_length]))
74-
# Truncate the working sequence buffer by the sequence length
75-
# minus the lookahead
76-
# XXX: Assert that the kmer/sequence length is always larger
77-
# than the lookahead length?
78-
truncate_length = sequence_length - sequence_overlap_length
79-
working_sequence_buffer = \
80-
working_sequence_buffer[truncate_length:]
81-
82-
# Yield the last sequence segment
83-
# NB: We always keep the lookahead in the working sequence buffer
84-
# So there needs to be check if it is longer the lookahead length
85-
if len(working_sequence_buffer) > sequence_overlap_length:
86-
yield SequenceSegment(current_sequence_id, # type: ignore
87-
bytes(working_sequence_buffer),
88-
epilogue=True)
87+
# If any sequences were created
88+
if sequences:
89+
# Create all sequence segments but for the last
90+
for sequence in sequences[:-1]:
91+
# Yield a sequence segment without the epilogue flag set
92+
yield SequenceSegment(current_sequence_id, bytes(sequence))
93+
# Carry over the last sequence to the next iteration
94+
# in case this the last line it the sequence filled the
95+
# remaining buffer exactly
96+
sequences = [sequences[-1]]
97+
98+
# Add the sequence line to the working buffer
99+
working_sequence_buffer += line # type: ignore
100+
101+
# Yield the remaining sequences
102+
for sequence_buffer in get_remaining_sequence_segments(
103+
current_sequence_id, # type: ignore
104+
sequences,
105+
working_sequence_buffer,
106+
overlap_buffer,
107+
sequence_length,
108+
sequence_overlap_length):
109+
110+
yield sequence_buffer
111+
112+
113+
def get_sequences_from_buffer(working_sequence_buffer: bytearray,
114+
overlap_sequence: bytearray,
115+
sequence_length: int,
116+
sequence_overlap_length: int):
117+
"""Returns a list of overlapping byte sequences from a sequence buffer.
118+
Modifies the working sequence buffer and overlap buffer in place.
119+
"""
120+
121+
# If there is no sequence buffer
122+
if not working_sequence_buffer:
123+
# Return nothing
124+
return []
125+
126+
sequences = []
127+
128+
non_overlap_length = sequence_length - sequence_overlap_length
129+
130+
while (len(working_sequence_buffer) + len(overlap_sequence) >=
131+
sequence_length):
132+
# If there is an overlap buffer
133+
if overlap_sequence:
134+
# Create the sequence with the overlap
135+
sequence = bytes(
136+
overlap_sequence +
137+
working_sequence_buffer[:non_overlap_length])
138+
bytes_used = non_overlap_length
139+
else:
140+
# Otherwise create the sequence without the overlap
141+
sequence = bytes(working_sequence_buffer[:sequence_length])
142+
bytes_used = sequence_length
143+
144+
# Add to our working list of sequences
145+
sequences.append(sequence)
146+
# Update the overlap buffer if it exists by taking the last
147+
# current calculated sequence
148+
if sequence_overlap_length:
149+
# NB: Avoid re-assignment to modifiy in place
150+
overlap_sequence[:] = sequence[-sequence_overlap_length:]
151+
# Truncate the start of working sequence buffer by bytes used
152+
working_sequence_buffer[:bytes_used] = b''
153+
154+
return sequences
155+
156+
157+
def get_remaining_sequence_segments(sequence_id: bytes,
158+
sequences: list[bytes],
159+
working_sequence_buffer: bytearray,
160+
overlap_buffer: bytearray,
161+
sequence_length: int,
162+
sequence_overlap_length: int):
163+
# Assumes last of any buffer is the the epilogue
164+
165+
sequence_segments = []
166+
167+
sequences.extend(get_sequences_from_buffer(
168+
working_sequence_buffer,
169+
overlap_buffer,
170+
sequence_length,
171+
sequence_overlap_length))
172+
173+
if working_sequence_buffer:
174+
sequences.append(bytes(overlap_buffer + working_sequence_buffer))
175+
176+
# If any sequences were created
177+
if sequences:
178+
# Create a sequence segment for all but the last element
179+
# NB: Empty on a single list
180+
for sequence in sequences[:-1]:
181+
sequence_segments.append(
182+
SequenceSegment(sequence_id, bytes(sequence))
183+
)
184+
185+
# Create a sequence segment for the last element with the
186+
# epilogue flag set
187+
188+
sequence_segments.append(
189+
SequenceSegment(sequence_id,
190+
bytes(sequences[-1]),
191+
epilogue=True)
192+
)
193+
194+
return sequence_segments

tests/test_sequence_buffer_iter.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def test_entire_sequence(self):
2020
sequence_buffer = next(buffer_iter)
2121
self.assertEqual(b'chr2', sequence_buffer.id)
2222
self.assertEqual(b'CGCANCAGAGCANCGNCG', sequence_buffer.data)
23+
self.assertTrue(sequence_buffer.epilogue)
2324
self.assertRaises(StopIteration, next, buffer_iter)
2425

2526
def test_sequence_overlap(self):
@@ -28,26 +29,32 @@ def test_sequence_overlap(self):
2829
sequence_buffer = next(buffer_iter)
2930
self.assertEqual(b'chr2', sequence_buffer.id)
3031
self.assertEqual(b'CGCAN', sequence_buffer.data)
32+
self.assertFalse(sequence_buffer.epilogue)
3133

3234
sequence_buffer = next(buffer_iter)
3335
self.assertEqual(b'chr2', sequence_buffer.id)
3436
self.assertEqual(b'ANCAG', sequence_buffer.data)
37+
self.assertFalse(sequence_buffer.epilogue)
3538

3639
sequence_buffer = next(buffer_iter)
3740
self.assertEqual(b'chr2', sequence_buffer.id)
3841
self.assertEqual(b'AGAGC', sequence_buffer.data)
42+
self.assertFalse(sequence_buffer.epilogue)
3943

4044
sequence_buffer = next(buffer_iter)
4145
self.assertEqual(b'chr2', sequence_buffer.id)
4246
self.assertEqual(b'GCANC', sequence_buffer.data)
47+
self.assertFalse(sequence_buffer.epilogue)
4348

4449
sequence_buffer = next(buffer_iter)
4550
self.assertEqual(b'chr2', sequence_buffer.id)
4651
self.assertEqual(b'NCGNC', sequence_buffer.data)
52+
self.assertFalse(sequence_buffer.epilogue)
4753

4854
sequence_buffer = next(buffer_iter)
4955
self.assertEqual(b'chr2', sequence_buffer.id)
5056
self.assertEqual(b'NCG', sequence_buffer.data)
57+
self.assertTrue(sequence_buffer.epilogue)
5158

5259
self.assertRaises(StopIteration, next, buffer_iter)
5360

@@ -160,3 +167,23 @@ def test_single_nucleotide_epilogue(self):
160167

161168
self.assertRaises(StopIteration, next, buffer_iter)
162169
fasta_file.close()
170+
171+
def test_sizes_and_epilogues(self):
172+
buffer_iter = sequence_segments(self.genome_fasta_file, 20, 0)
173+
174+
sequence_buffer = next(buffer_iter)
175+
self.assertEqual(b'chr1', sequence_buffer.id)
176+
self.assertEqual(len(sequence_buffer.data), 20)
177+
self.assertTrue(sequence_buffer.epilogue)
178+
179+
sequence_buffer = next(buffer_iter)
180+
self.assertEqual(b'chr2', sequence_buffer.id)
181+
self.assertEqual(len(sequence_buffer.data), 20)
182+
self.assertFalse(sequence_buffer.epilogue)
183+
184+
sequence_buffer = next(buffer_iter)
185+
self.assertEqual(b'chr2', sequence_buffer.id)
186+
self.assertEqual(len(sequence_buffer.data), 10)
187+
self.assertTrue(sequence_buffer.epilogue)
188+
189+
self.assertRaises(StopIteration, next, buffer_iter)

0 commit comments

Comments
 (0)