|
| 1 | +/* |
| 2 | + * Utils.swift |
| 3 | + * StreamReader |
| 4 | + * |
| 5 | + * Created by François Lamboley on 20/08/2017. |
| 6 | + */ |
| 7 | + |
| 8 | +import Foundation |
| 9 | + |
| 10 | + |
| 11 | + |
/* The description of a delimiter match, as built by matchDelimiters. */
internal struct Match {
	
	/* Length of the matched data.
	 * matchDelimiters adds the length of the matched delimiter to this value when the delimiter is included in the result. */
	var length: Int
	
	/* The two properties below are kept for optimization purposes. */
	
	/* The `offset` that was attached to the delimiter that matched. */
	var delimiterIdx: Int
	/* Length of the matched data, never counting the delimiter. */
	var lengthNoDelimiter: Int
	
}
| 21 | + |
/* Returns the given delimiters without duplicates and without the delimiters that are filtered out for the given matching mode.
 *
 * The relative order of the delimiters is preserved (e.g. [1,2,3,2,1] -> [1,2,3]).
 *
 * - shortestDataWins, delimiter not included: a delimiter that starts with another (non-empty, different) delimiter is dropped.
 * - longestDataWins, delimiter included: a delimiter that ends with another (non-empty, different) delimiter is dropped.
 * - All the other combinations: only the duplicates are removed. */
internal func cleanupDelimiters(_ delimiters: [Data], forMatchingMode matchingMode: DelimiterMatchingMode, includingDelimiter: Bool) -> [Data] {
	/* Order-preserving deduplication: only the first occurrence of each delimiter is kept. */
	var seen = Set<Data>()
	var uniqueDelimiters = [Data]()
	for candidate in delimiters where seen.insert(candidate).inserted {
		uniqueDelimiters.append(candidate)
	}
	
	/* Tells whether a non-empty delimiter different from the candidate satisfies the given relation with the candidate. */
	func hasOtherDelimiter(for candidate: Data, where relation: (_ candidate: Data, _ other: Data) -> Bool) -> Bool {
		for other in uniqueDelimiters {
			if other.isEmpty || other == candidate {continue}
			if relation(candidate, other) {return true}
		}
		return false
	}
	
	switch (matchingMode, includingDelimiter) {
		case (.shortestDataWins, false):
			/* We do not keep a delimiter which has another delimiter as a prefix. */
			return uniqueDelimiters.filter{ candidate in
				!hasOtherDelimiter(for: candidate, where: { $0.starts(with: $1) })
			}
			
		case (.longestDataWins, true):
			/* We do not keep a delimiter which has another delimiter as a suffix. */
			return uniqueDelimiters.filter{ candidate in
				!hasOtherDelimiter(for: candidate, where: { $0.reversed().starts(with: $1.reversed()) })
			}
			
		case (.firstMatchingDelimiterWins, _), (.anyMatchWins, _), (.shortestDataWins, true), (.longestDataWins, false):
			/* TODO: Find potential delimiter optimizations for these cases.
			 * There are probably things to do for the shortestDataWins and longestDataWins cases.
			 * For the anyMatchWins, there are none.
			 * For the firstMatchingDelimiterWins I’m not sure… */
			return uniqueDelimiters
	}
}
| 47 | + |
/* Searches the given data for the given delimiters.
 * Returns nil if no confirmed match was found, the confirmed match otherwise.
 *
 * The given unmatched delimiters must be
 *   1/ cleaned up for the given matching mode and delimiter inclusion (see cleanupDelimiters) and
 *   2/ rid of the empty delimiters.
 * Both requirements are asserted.
 *
 * Parameters:
 * - data: The bytes to search.
 * - dataStartOffset: Added to the index (in data) at which a delimiter is found to compute the lengths stored in the match.
 *   NOTE(review): presumably the number of bytes located before `data` that were already searched; confirm against the caller.
 * - matchingMode: How the winning match is chosen when more than one delimiter matches.
 * - includeDelimiter: Whether the length of the matched delimiter is counted in Match.length.
 * - minDelimiterLength: Used to bail out when the data is too short and to compute the last position searched.
 *   NOTE(review): expected to be the length of the shortest delimiter; this is not verified here.
 * - unmatchedDelimiters: The delimiters that have not matched yet, each with its `offset` (copied into Match.delimiterIdx on match).
 *   A delimiter is removed when it matches or when it is known it cannot win.
 *   The firstMatchingDelimiterWins pruning relies on the elements being sorted by increasing offset.
 * - bestMatch: The best match found so far; updated when a better match is found. */
internal func matchDelimiters(inData data: UnsafeRawBufferPointer, dataStartOffset: Int, usingMatchingMode matchingMode: DelimiterMatchingMode, includeDelimiter: Bool, minDelimiterLength: Int, withUnmatchedDelimiters unmatchedDelimiters: inout [(offset: Int, element: Data)], bestMatch: inout Match?) -> Match? {
	assert(!unmatchedDelimiters.contains(where: { $0.element.isEmpty }))
	assert(cleanupDelimiters(unmatchedDelimiters.map{ $0.element }, forMatchingMode: matchingMode, includingDelimiter: includeDelimiter) == unmatchedDelimiters.map{ $0.element })
	
	guard minDelimiterLength > 0, data.count >= minDelimiterLength, let start = data.baseAddress else {
		/* No need to search if all the delimiters are empty or if there are less data than the minimum delimiter length: nothing matches.
		 * (The base address can only be nil for an empty buffer, which the count check already excludes.) */
		return nil
	}
	
	/* We implement the search ourselves to be able to early bail when possible. */
	var hasDelimitersTooBigThatCouldMatch = false
	/* Last position at which the shortest delimiter still fits in the data. */
	let end = start.advanced(by: data.count - minDelimiterLength)
	for (curDataIdx, curPos) in (start...end).enumerated() {
		let curLength = dataStartOffset + curDataIdx
		let curRemainingSpace = data.count - curDataIdx
		assert(curRemainingSpace > 0) /* minDelimiterLength is >0, so there should always be at least 1 byte available. */
		/* Reversed enumeration in order to be able to remove an element from the unmatchedDelimiters array while still enumerating it and keeping valid indexes. */
		for (delimiterIdx, delimiter) in unmatchedDelimiters.enumerated().reversed() {
			let delimiterLength = delimiter.element.count
			/* If the delimiter is empty or bigger than the remaining space it cannot match. */
			guard delimiterLength > 0 else {continue}
			guard delimiterLength <= curRemainingSpace else {
				/* The delimiter is too big to compare to the whole data.
				 * If the delimiter is a potential match (the data available is a prefix of the delimiter),
				 * we’ll keep a hint that we have a delimiter that could match.
				 * We use prefix(_:) instead of an integer range subscript because the indices of a Data do not necessarily start at 0 (a slice keeps the indices of its parent). */
				hasDelimitersTooBigThatCouldMatch = (
					hasDelimitersTooBigThatCouldMatch ||
					Data(bytesNoCopy: UnsafeMutableRawPointer(mutating: curPos), count: curRemainingSpace, deallocator: .none) == delimiter.element.prefix(curRemainingSpace)
				)
				continue
			}
			if Data(bytesNoCopy: UnsafeMutableRawPointer(mutating: curPos), count: delimiterLength, deallocator: .none) == delimiter.element {
				/* We have a match! */
				let match = Match(
					length: curLength + (includeDelimiter ? delimiterLength : 0),
					delimiterIdx: delimiter.offset, lengthNoDelimiter: curLength
				)
				unmatchedDelimiters.remove(at: delimiterIdx) /* Probably triggers CoW. Should we do better? */
				
				/* Obvious use cases where we can return the match at once. */
				switch matchingMode {
					case .anyMatchWins,
						/* If the delimiter is *not* included AND we do not have a potential match by a delimiter too big,
						 * whatever we could find next would be bigger than our current match, so we can return it.
						 * (The where clause only applies to the shortestDataWins pattern.) */
						.shortestDataWins where !includeDelimiter && !hasDelimitersTooBigThatCouldMatch:
						assert(bestMatch.flatMap{ $0.length >= match.length } ?? true)
						bestMatch = match
						return match
						
					case .shortestDataWins:
						if (bestMatch.flatMap{ $0.length > match.length } ?? true) {
							bestMatch = match
							/* Early bail if the match has the minimum length possible. */
							if match.length == (includeDelimiter ? minDelimiterLength : 0) {
								return match
							}
						}
						/* We process another early bail possibilities once all the delimiters have been seen. */
						
					case .firstMatchingDelimiterWins:
						if (bestMatch.flatMap{ $0.delimiterIdx > match.delimiterIdx } ?? true) {
							bestMatch = match
							/* Early bail if the first delimiter has matched. */
							if match.delimiterIdx == 0 {
								return match
							}
						}
						/* No need to keep the delimiters whose offset is >delimiter.offset; we know we won’t choose them.
						 * Note: The removal will be applied at the next byte check (the enumeration of unmatchedDelimiters enumerates on a copy). */
						unmatchedDelimiters.removeAll{ $0.offset > delimiter.offset }
						
					case .longestDataWins:
						if (bestMatch.flatMap{ $0.length < match.length } ?? true) {
							bestMatch = match
							/* No known early bails. I don’t think there are any. */
						}
				}
			}
		}
		/*
		 *       f
		 *      ef
		 *     def
		 *    cde
		 *    cdef
		 *    cdefg
		 *  abcdef[gh]
		 *
		 * Let’s see if we have enough info to bail early. */
		switch matchingMode {
			case .shortestDataWins:
				if let bestMatch, includeDelimiter {
					/* We have a match and we include the delimiters (the case where we do not include the delimiter is already taken care of).
					 * Let’s try to bail early. */
					
					/* First we remove the unmatched delimiters which would give a longer match than the best one we have now. */
					unmatchedDelimiters.removeAll(where: { delimiter in
						let potentialMatchLength = curLength + delimiter.element.count
						return potentialMatchLength >= bestMatch.length
					})
					if unmatchedDelimiters.isEmpty {
						return bestMatch
					}
				}
				
			case .anyMatchWins, .longestDataWins, .firstMatchingDelimiterWins:
				/* No known early bail. */
				break
		}
	}
	
	/* Let's search for a confirmed match.
	 * We can only do that if all the delimiters have been matched.
	 * All other early bail cases have been taken care of above. */
	guard unmatchedDelimiters.isEmpty else {return nil}
	return bestMatch
}
0 commit comments