Skip to content

Commit 968b54a

Browse files
committed
Optimise searching (mostly for shortest data searches)
1 parent c81e803 commit 968b54a

File tree

5 files changed

+210
-116
lines changed

5 files changed

+210
-116
lines changed

Sources/StreamReader/Implementations/DataReader.swift

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -64,29 +64,30 @@ public final class DataReader : StreamReader {
6464

6565
public func readData<T>(upTo delimiters: [Data], matchingMode: DelimiterMatchingMode, failIfNotFound: Bool, includeDelimiter: Bool, updateReadPosition: Bool, _ handler: (UnsafeRawBufferPointer, Data) throws -> T) throws -> T {
6666
let sizeToEnd = sizeToAllowedEnd
67+
let delimiters = cleanupDelimiters(delimiters, forMatchingMode: matchingMode, includingDelimiter: includeDelimiter)
6768

68-
if delimiters.count == 0 || (!failIfNotFound && delimiters.count == 1 && delimiters[0] == Data()) {
69+
if delimiters.isEmpty || (!failIfNotFound && delimiters.count == 1 && delimiters[0] == Data()) {
6970
/* When there are no delimiters or if there is only one delimiter which is empty and we do not fail if we do not find the delimiter,
7071
* we simply read the stream to the end.
7172
* There may be more optimization possible, but we don’t care for now. */
7273
return try readData(size: sizeToEnd, allowReadingLess: false, updateReadPosition: updateReadPosition, { ret in try handler(ret, Data()) })
7374
}
7475

75-
var unmatchedDelimiters = Array(delimiters.enumerated())
76-
let minDelimiterLength = delimiters.map{ $0.count }.min() ?? 0
77-
var matchedDatas = [Match]()
76+
var bestMatch: Match?
77+
var unmatchedDelimiters = Array(delimiters.filter{ !$0.isEmpty }.enumerated())
78+
let minDelimiterLength = unmatchedDelimiters.map(\.element.count).min() ?? 0
7879

7980
return try sourceData.withUnsafeBytes{ bytes in
8081
assert(bytes.baseAddress != nil || currentReadPosition == 0)
8182
let searchedData = UnsafeRawBufferPointer(start: bytes.baseAddress.flatMap{ $0 + currentReadPosition }, count: sizeToEnd)
82-
if let match = matchDelimiters(inData: searchedData, dataStartOffset: 0, usingMatchingMode: matchingMode, includeDelimiter: includeDelimiter, minDelimiterLength: minDelimiterLength, withUnmatchedDelimiters: &unmatchedDelimiters, matchedDatas: &matchedDatas) {
83+
if let match = matchDelimiters(inData: searchedData, dataStartOffset: 0, usingMatchingMode: matchingMode, includeDelimiter: includeDelimiter, minDelimiterLength: minDelimiterLength, withUnmatchedDelimiters: &unmatchedDelimiters, bestMatch: &bestMatch) {
8384
return try readData(size: match.length, allowReadingLess: false, updateReadPosition: updateReadPosition, { ret in try handler(ret, delimiters[match.delimiterIdx]) })
8485
}
8586
/* matchDelimiters did not find an indisputable match.
8687
* However, we have fed all the data we have to it.
8788
* We cannot find more matches!
8889
* We simply return the best match we got. */
89-
if let match = findBestMatch(fromMatchedDatas: matchedDatas, usingMatchingMode: matchingMode) {
90+
if let match = bestMatch {
9091
return try readData(size: match.length, allowReadingLess: false, updateReadPosition: updateReadPosition, { ret in try handler(ret, delimiters[match.delimiterIdx]) })
9192
}
9293
if failIfNotFound {

Sources/StreamReader/Implementations/GenericStreamReader.swift

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -156,17 +156,18 @@ public final class GenericStreamReader : StreamReader {
156156
}
157157

158158
public func readData<T>(upTo delimiters: [Data], matchingMode: DelimiterMatchingMode, failIfNotFound: Bool, includeDelimiter: Bool, updateReadPosition: Bool, _ handler: (UnsafeRawBufferPointer, Data) throws -> T) throws -> T {
159-
let (minDelimiterLength, maxDelimiterLength) = delimiters.reduce((delimiters.first?.count ?? 0, 0), { (min($0.0, $1.count), max($0.1, $1.count)) })
159+
let delimiters = cleanupDelimiters(delimiters, forMatchingMode: matchingMode, includingDelimiter: includeDelimiter)
160160

161-
var unmatchedDelimiters = Array(delimiters.enumerated())
162-
var matchedDatas = [Match]()
161+
var bestMatch: Match?
162+
var unmatchedDelimiters = Array(delimiters.filter{ !$0.isEmpty }.enumerated())
163+
let (minDelimiterLength, maxDelimiterLength) = (unmatchedDelimiters.map(\.element.count).min() ?? 0, unmatchedDelimiters.map(\.element.count).max() ?? 0)
163164

164165
var searchOffset = 0
165166
repeat {
166167
assert(bufferValidLength - searchOffset >= 0, "INTERNAL LOGIC ERROR")
167168
let bufferStart = buffer + bufferStartPos
168169
let bufferSearchData = UnsafeRawBufferPointer(start: bufferStart + searchOffset, count: bufferValidLength - searchOffset)
169-
if let match = matchDelimiters(inData: bufferSearchData, dataStartOffset: searchOffset, usingMatchingMode: matchingMode, includeDelimiter: includeDelimiter, minDelimiterLength: minDelimiterLength, withUnmatchedDelimiters: &unmatchedDelimiters, matchedDatas: &matchedDatas) {
170+
if let match = matchDelimiters(inData: bufferSearchData, dataStartOffset: searchOffset, usingMatchingMode: matchingMode, includeDelimiter: includeDelimiter, minDelimiterLength: minDelimiterLength, withUnmatchedDelimiters: &unmatchedDelimiters, bestMatch: &bestMatch) {
170171
if updateReadPosition {
171172
bufferStartPos += match.length
172173
bufferValidLength -= match.length
@@ -186,7 +187,7 @@ public final class GenericStreamReader : StreamReader {
186187
assert(sizeRead >= 0)
187188
} while true
188189

189-
if let match = findBestMatch(fromMatchedDatas: matchedDatas, usingMatchingMode: matchingMode) {
190+
if let match = bestMatch {
190191
let ret = try handler(UnsafeRawBufferPointer(start: buffer + bufferStartPos, count: match.length), delimiters[match.delimiterIdx])
191192
if updateReadPosition {
192193
bufferStartPos += match.length

Sources/StreamReader/Matching.swift

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
/*
2+
* Matching.swift
3+
* StreamReader
4+
*
5+
* Created by François Lamboley on 20/08/2017.
6+
*/
7+
8+
import Foundation
9+
10+
11+
12+
/* Describes one delimiter match found while searching some data. */
internal struct Match {
	
	/* Total number of bytes covered by the match:
	 * the bytes preceding the delimiter, plus the delimiter itself when the delimiter is included in the result. */
	var length: Int
	
	/* The two properties below are kept for optimization purposes. */
	
	/* Identifies the delimiter that matched (the `offset` of the delimiter in the list of unmatched delimiters given to the search). */
	var delimiterIdx: Int
	/* Number of bytes preceding the matched delimiter (the delimiter length is never counted here). */
	var lengthNoDelimiter: Int
	
}
21+
22+
/* Returns the given delimiters without the ones that are useless for the given matching mode.
 *
 * Duplicates are always removed; the first occurrence is kept and the relative order is preserved (e.g. [1,2,3,2,1] -> [1,2,3]).
 * Then, depending on the matching mode and on whether the delimiter is included in the result,
 *  delimiters which are shadowed by another non-empty delimiter are dropped.
 * Empty delimiters are never removed by this function. */
internal func cleanupDelimiters(_ delimiters: [Data], forMatchingMode matchingMode: DelimiterMatchingMode, includingDelimiter: Bool) -> [Data] {
	/* Order-preserving deduplication. */
	var seen = Set<Data>()
	var uniqueDelimiters = [Data]()
	for candidate in delimiters {
		if seen.insert(candidate).inserted {
			uniqueDelimiters.append(candidate)
		}
	}
	
	switch (matchingMode, includingDelimiter) {
		case (.shortestDataWins, false):
			/* We keep a delimiter only if no other non-empty delimiter is a prefix of it. */
			return uniqueDelimiters.filter{ candidate in
				uniqueDelimiters.allSatisfy{ other in
					other.isEmpty || other == candidate || !candidate.starts(with: other)
				}
			}
			
		case (.longestDataWins, true):
			/* We keep a delimiter only if no other non-empty delimiter is a suffix of it. */
			return uniqueDelimiters.filter{ candidate in
				uniqueDelimiters.allSatisfy{ other in
					other.isEmpty || other == candidate || !candidate.reversed().starts(with: other.reversed())
				}
			}
			
		case (.firstMatchingDelimiterWins, _), (.anyMatchWins, _), (.shortestDataWins, true), (.longestDataWins, false):
			/* TODO: Find potential delimiter optimizations for these cases.
			 * There are probably things to do for the shortestDataWins and longestDataWins cases.
			 * For the anyMatchWins, there are none.
			 * For the firstMatchingDelimiterWins I’m not sure… */
			return uniqueDelimiters
	}
}
47+
48+
/* Searches the given data for the given delimiters.
 *
 * Returns nil if no confirmed match was found, the confirmed match otherwise.
 * A match is “confirmed” when no other match could be preferred to it for the given matching mode, whatever the data that follows.
 * A match that was found but is not confirmed (yet) is stored in `bestMatch`;
 *  it is up to the caller to use it when there are no more data to search.
 *
 * - Parameters:
 *   - data: The bytes to search.
 *   - dataStartOffset: Added to the position in `data` when computing the lengths stored in the matches.
 *   - matchingMode: The rule used to choose between several matches.
 *   - includeDelimiter: Whether the length of the delimiter is counted in `Match.length`.
 *   - minDelimiterLength: Expected to be the length of the shortest delimiter. Nothing is searched if it is 0 or bigger than `data.count`.
 *   - unmatchedDelimiters: The delimiters that still have to be searched.
 *     A delimiter is removed from the list when it matches, or when we know it cannot give a better match than the one we have.
 *     The `offset` of a delimiter is what is reported in `Match.delimiterIdx`.
 *   - bestMatch: The best match found so far; updated whenever a better match is found.
 *
 * The given unmatched delimiters must be 1/ cleaned up for the given matching mode and co and 2/ gotten rid of the empty delimiters. */
internal func matchDelimiters(inData data: UnsafeRawBufferPointer, dataStartOffset: Int, usingMatchingMode matchingMode: DelimiterMatchingMode, includeDelimiter: Bool, minDelimiterLength: Int, withUnmatchedDelimiters unmatchedDelimiters: inout [(offset: Int, element: Data)], bestMatch: inout Match?) -> Match? {
	assert(!unmatchedDelimiters.contains(where: { $0.element.isEmpty }))
	assert(cleanupDelimiters(unmatchedDelimiters.map{ $0.element }, forMatchingMode: matchingMode, includingDelimiter: includeDelimiter) == unmatchedDelimiters.map{ $0.element })
	
	guard minDelimiterLength > 0, data.count >= minDelimiterLength, let start = data.baseAddress else {
		/* No need to search if all the delimiters are empty or if there are less data than the minimum delimiter length: nothing matches.
		 * (The base address can only be nil for an empty buffer, which is already excluded by the previous conditions.) */
		return nil
	}
	
	/* We implement the search ourselves to be able to early bail when possible. */
	var hasDelimitersTooBigThatCouldMatch = false
	/* Last position at which the shortest delimiter still fits in the data. */
	let end = start.advanced(by: data.count - minDelimiterLength)
	for (curDataIdx, curPos) in (start...end).enumerated() {
		let curLength = dataStartOffset + curDataIdx
		let curRemainingSpace = data.count - curDataIdx
		assert(curRemainingSpace > 0) /* minDelimiterLength is >0, so there should always be at least 1 byte available. */
		/* Reversed enumeration in order to be able to remove an element from the unmatchedDelimiters array while still enumerating it and keeping valid indexes. */
		for (delimiterIdx, delimiter) in unmatchedDelimiters.enumerated().reversed() {
			let delimiterLength = delimiter.element.count
			/* If the delimiter is empty or bigger than the remaining space it cannot match. */
			guard delimiterLength > 0 else {continue}
			guard delimiterLength <= curRemainingSpace else {
				/* The delimiter is too big to compare to the whole data.
				 * If the delimiter is a potential match (the data available is a prefix of the delimiter),
				 *  we’ll keep a hint that we have a delimiter that could match.
				 * We use prefix(_:) instead of a [0..<n] subscript because a Data which is a slice of another Data keeps the indices of its parent:
				 *  its startIndex is not necessarily 0 and an absolute range could trap or address the wrong bytes. */
				hasDelimitersTooBigThatCouldMatch = (
					hasDelimitersTooBigThatCouldMatch ||
					Data(bytesNoCopy: UnsafeMutableRawPointer(mutating: curPos), count: curRemainingSpace, deallocator: .none) == delimiter.element.prefix(curRemainingSpace)
				)
				continue
			}
			if Data(bytesNoCopy: UnsafeMutableRawPointer(mutating: curPos), count: delimiterLength, deallocator: .none) == delimiter.element {
				/* We have a match! */
				let match = Match(
					length: curLength + (includeDelimiter ? delimiterLength : 0),
					delimiterIdx: delimiter.offset, lengthNoDelimiter: curLength
				)
				unmatchedDelimiters.remove(at: delimiterIdx) /* Probably triggers CoW. Should we do better? */
				
				/* Obvious use cases where we can return the match at once. */
				switch matchingMode {
					case .anyMatchWins,
						/* If the delimiter is *not* included AND we do not have a potential match by a delimiter too big,
						 *  whatever we could find next would be bigger than our current match, so we can return it. */
						.shortestDataWins where !includeDelimiter && !hasDelimitersTooBigThatCouldMatch:
						assert(bestMatch.flatMap{ $0.length >= match.length } ?? true)
						bestMatch = match
						return match
						
					case .shortestDataWins:
						if (bestMatch.flatMap{ $0.length > match.length } ?? true) {
							bestMatch = match
							/* Early bail if the match has the minimum length possible. */
							if match.length == (includeDelimiter ? minDelimiterLength : 0) {
								return match
							}
						}
						/* We process another early bail possibilities once all the delimiters have been seen. */
						
					case .firstMatchingDelimiterWins:
						if (bestMatch.flatMap{ $0.delimiterIdx > match.delimiterIdx } ?? true) {
							bestMatch = match
							/* Early bail if the first delimiter has matched. */
							if match.delimiterIdx == 0 {
								return match
							}
						}
						/* No need to keep the delimiters whose offset is >delimiter.offset; we know we won’t choose them.
						 * Note: The removal will be applied at the next byte check (the enumeration of unmatchedDelimiters enumerates on a copy). */
						unmatchedDelimiters.removeAll{ $0.offset > delimiter.offset }
						
					case .longestDataWins:
						if (bestMatch.flatMap{ $0.length < match.length } ?? true) {
							bestMatch = match
							/* No known early bails. I don’t think there are any. */
						}
				}
			}
		}
		/*
		 *       f
		 *      ef
		 *     def
		 *    cde
		 *    cdef
		 *    cdefg
		 *  abcdef[gh]
		 *
		 * Let’s see if we have enough info to bail early. */
		switch matchingMode {
			case .shortestDataWins:
				if let bestMatch, includeDelimiter {
					/* We have a match and we include the delimiters (the case where we do not include the delimiter is already taken care of).
					 * Let’s try to bail early. */
					
					/* First we remove the unmatched delimiters which would give a longer match than the best one we have now. */
					unmatchedDelimiters.removeAll(where: { delimiter in
						let potentialMatchLength = curLength + delimiter.element.count
						return potentialMatchLength >= bestMatch.length
					})
					if unmatchedDelimiters.isEmpty {
						return bestMatch
					}
				}
				
			case .anyMatchWins, .longestDataWins, .firstMatchingDelimiterWins:
				(/* No known early bail. */)
		}
	}
	
	/* Let's search for a confirmed match.
	 * We can only do that if all the delimiters have been matched.
	 * All other early bail cases have been taken care of above. */
	guard unmatchedDelimiters.isEmpty else {return nil}
	return bestMatch
}

Sources/StreamReader/Utils.swift

Lines changed: 0 additions & 105 deletions
This file was deleted.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*
2+
* MatchingTests.swift
3+
* StreamReader
4+
*
5+
* Created by François Lamboley on 2022/09/14.
6+
*/
7+
8+
import Foundation
9+
import XCTest
10+
11+
@testable import StreamReader
12+
13+
14+
15+
/* Tests for the delimiters cleanup done before searching for delimiters. */
class MatchingTests : XCTestCase {
	
	/* Shortest data wins, delimiter not included:
	 *  the duplicated "01 02" is removed, and so is its first occurrence because "01" is a prefix of it.
	 * The empty delimiter is kept. */
	func testDelimitersCleanupForShortestDataWithoutDelimiter() {
		let input = [Data(), Data(hexEncoded: "01")!, Data(hexEncoded: "01 02")!, Data(hexEncoded: "01 02")!, Data(hexEncoded: "02")!]
		let expected = [Data(), Data(hexEncoded: "01")!, Data(hexEncoded: "02")!]
		
		let cleaned = cleanupDelimiters(input, forMatchingMode: .shortestDataWins, includingDelimiter: false)
		
		XCTAssertEqual(cleaned, expected)
	}
	
	/* Longest data wins, delimiter included:
	 *  the duplicates of "01 02" and "01" are removed, and "01 02" is dropped altogether because "02" is a suffix of it.
	 * The empty delimiter is kept. */
	func testDelimitersCleanupForLongestDataWithDelimiter() {
		let input = [Data(), Data(hexEncoded: "01")!, Data(hexEncoded: "01 02")!, Data(hexEncoded: "01 02")!, Data(hexEncoded: "02")!, Data(hexEncoded: "01")!]
		let expected = [Data(), Data(hexEncoded: "01")!, Data(hexEncoded: "02")!]
		
		let cleaned = cleanupDelimiters(input, forMatchingMode: .longestDataWins, includingDelimiter: true)
		
		XCTAssertEqual(cleaned, expected)
	}
	
}

0 commit comments

Comments
 (0)