Skip to content

Commit 888855e

Browse files
authored
Revert "Revert "KmerCamel can compute lower bounds on superstring lengths.""
1 parent ea84fdd commit 888855e

File tree

6 files changed

+95
-9
lines changed

6 files changed

+95
-9
lines changed

src/global.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ void PartialPreSort(std::vector<kmer_t> &vals, int k) {
5757
/// If complements are provided, treat k-mer and its complement as identical.
5858
/// If this is the case, k-mers are expected to contain only one k-mer from a complement pair.
5959
/// Moreover, if so, the resulting Hamiltonian path contains two superstrings which are reverse complements of one another.
60-
overlapPath OverlapHamiltonianPath (std::vector<kmer_t> &kMers, int k, bool complements) {
60+
overlapPath OverlapHamiltonianPath (std::vector<kmer_t> &kMers, int k, bool complements, bool lower_bound = false) {
6161
size_t n = kMers.size();
6262
size_t kMersCount = n * (1 + complements);
6363
size_t batchSize = kMersCount / MEMORY_REDUCTION_FACTOR + 1;
@@ -106,10 +106,12 @@ overlapPath OverlapHamiltonianPath (std::vector<kmer_t> &kMers, int k, bool comp
106106
if (suffix_key == kh_end(prefixes)) continue;
107107
size_t previous, j;
108108
previous = j = kh_val(prefixes, suffix_key);
109-
// If the path forms a cycle, or is between k-mer and its reverse complement, or the k-mers complement was already selected skip this path.
110109
while (j != size_t(-1) && \
111-
(accessFirstLast(first, last, i, n) % n == j % n \
112-
|| accessFirstLast(first, last, i, n) % n == accessFirstLast(last, first, j, n) % n \
110+
// k-mers are complementary
111+
((i + n) % (2 * n) == j \
112+
// forms a cycle
113+
|| (!lower_bound && accessFirstLast(first, last, i, n) == j) \
114+
// k-mer is already used
113115
|| prefixForbidden[j])) {
114116
size_t new_j = next[j - from];
115117
// If the k-mer is forbidden, remove it to keep the complexity linear.

src/lower_bound.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#pragma once
2+
3+
#include "global.h"
4+
#include "kmers.h"
5+
6+
/// Return the length of the cycle cover which lower bounds the superstring length.
7+
size_t LowerBoundLength(std::vector<kmer_t> kMers, int k, bool complements) {
8+
auto cycle_cover = OverlapHamiltonianPath(kMers, k, complements, true);
9+
size_t res = 0;
10+
for (auto &overlap : cycle_cover.second) {
11+
res += size_t(k) - size_t(overlap);
12+
}
13+
return res / (1 + complements);
14+
}

src/main.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,13 @@
1111
#include "streaming.h"
1212
#include "output.h"
1313
#include "khash_utils.h"
14+
15+
#include <iostream>
16+
#include <string>
17+
#include "unistd.h"
18+
#include "version.h"
1419
#include "masks.h"
20+
#include "lower_bound.h"
1521

1622

1723
#ifdef LARGE_KMERS
@@ -33,6 +39,7 @@ int Help() {
3339
std::cerr << " -d d_value - integer value for d_max; default 5" << std::endl;
3440
std::cerr << " -c - treat k-mer and its reverse complement as equal" << std::endl;
3541
std::cerr << " -m - turn off the memory optimizations for global" << std::endl;
42+
std::cerr << " -l - compute the cycle cover lower bound instead of masked superstring" << std::endl;
3643
std::cerr << " -h - print help" << std::endl;
3744
std::cerr << " -v - print version" << std::endl;
3845
std::cerr << "Example usage: ./kmercamel -p path_to_fasta -k 13 -d 5 -a local" << std::endl;
@@ -71,9 +78,10 @@ int main(int argc, char **argv) {
7178
bool complements = false;
7279
bool optimize_memory = true;
7380
bool d_set = false;
81+
bool lower_bound = false;
7482
int opt;
7583
try {
76-
while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvm")) != -1) {
84+
while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvml")) != -1) {
7785
switch(opt) {
7886
case 'p':
7987
path = optarg;
@@ -103,6 +111,9 @@ int main(int argc, char **argv) {
103111
case 'm':
104112
optimize_memory = false;
105113
break;
114+
case 'l':
115+
lower_bound = true;
116+
break;
106117
case 'v':
107118
Version();
108119
return 0;
@@ -140,6 +151,9 @@ int main(int argc, char **argv) {
140151
} else if (masks && (d_set || !optimize_memory)) {
141152
std::cerr << "Not supported flags for optimize." << std::endl;
142153
return Help();
154+
} else if (lower_bound && algorithm != "global") {
155+
std::cerr << "Lower bound computation supported only for hash table global." << std::endl;
156+
return Help();
143157
}
144158

145159
if (masks) {
@@ -162,14 +176,15 @@ int main(int argc, char **argv) {
162176
return Help();
163177
}
164178
d_max = std::min(k - 1, d_max);
165-
WriteName(k, *of);
179+
if (!lower_bound) WriteName(k, *of);
166180
if (algorithm == "global") {
167181
auto kMerVec = kMersToVec(kMers);
168182
kh_destroy_S64(kMers);
169183
// Turn off the memory optimizations if optimize_memory is set to false.
170-
if(optimize_memory) PartialPreSort(kMerVec, k);
184+
if (optimize_memory) PartialPreSort(kMerVec, k);
171185
else MEMORY_REDUCTION_FACTOR = 1;
172-
Global(kMerVec, *of, k, complements);
186+
if (lower_bound) std::cout << LowerBoundLength(kMerVec, k, complements);
187+
else Global(kMerVec, *of, k, complements);
173188
}
174189
else Local(kMers, *of, k, d_max, complements);
175190
} else {

tests/global_unittest.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,30 +78,48 @@ namespace {
7878
overlapPath wantResult;
7979
int k;
8080
bool complements;
81+
bool lower_bound;
8182
};
8283
std::vector<TestCase> tests = {
8384
{
8485
{KMerToNumber({"AT"})},
8586
{{(size_t)-1, (size_t)-1}, {(byte)-1, (byte)-1}},
8687
2,
8788
true,
89+
false,
8890
},
8991
{
9092
{KMerToNumber({"ACG"}), KMerToNumber({"TAC"}), KMerToNumber({"GGC"})},
9193
{{2, 0, (size_t)-1}, {1, 2, (byte)-1}},
9294
3,
9395
false,
96+
false,
9497
},
9598
{
9699
{KMerToNumber({"ACAA"}), KMerToNumber({"ATTT"}), KMerToNumber({"AACA"})},
97100
{{4, 3, 0, 5, (size_t)-1, (size_t)-1}, {2, 2, 3, 3, (byte)-1, (byte)-1}},
98101
4,
99102
true,
103+
false,
104+
},
105+
{
106+
{KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
107+
{{1, 2, 0}, {2, 1, 1}},
108+
3,
109+
false,
110+
true,
111+
},
112+
{
113+
{KMerToNumber({"ACC"}), KMerToNumber({"CGG"})},
114+
{{3, 2, 1, 0}, {2, 2, 0, 0}},
115+
3,
116+
true,
117+
true,
100118
},
101119
};
102120

103121
for (auto t : tests) {
104-
overlapPath got = OverlapHamiltonianPath( t.kMers, t.k, t.complements);
122+
overlapPath got = OverlapHamiltonianPath( t.kMers, t.k, t.complements, t.lower_bound);
105123
EXPECT_EQ(t.wantResult.first, got.first);
106124
EXPECT_EQ(t.wantResult.second, got.second);
107125
}

tests/lower_bound_unittest.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#pragma once
2+
3+
#include "../src/lower_bound.h"
4+
5+
#include "gtest/gtest.h"
6+
7+
namespace {
8+
TEST(LowerBound, LowerBoundLength) {
9+
struct TestCase {
10+
std::vector<kmer_t> kMers;
11+
int k;
12+
bool complements;
13+
size_t wantResult;
14+
};
15+
std::vector<TestCase> tests = {
16+
{
17+
{KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
18+
3, false, 5
19+
},
20+
{
21+
{KMerToNumber({"AC"}), KMerToNumber({"GG"})},
22+
2, true, 3
23+
},
24+
{
25+
{KMerToNumber({"AAACCC"}), KMerToNumber({"CCCAAA"}), KMerToNumber({"TGGGGT"})},
26+
6, false, 11
27+
},
28+
};
29+
30+
for (auto &t : tests) {
31+
auto gotResult = LowerBoundLength(t.kMers, t.k, t.complements);
32+
33+
ASSERT_EQ(t.wantResult, gotResult);
34+
}
35+
}
36+
}

tests/unittest.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "local_ac_unittest.h"
77
#include "streaming_unittest.h"
88
#include "ac_automaton_unittest.h"
9+
#include "lower_bound_unittest.h"
910
#include "masks_unittest.h"
1011

1112
#include "gtest/gtest.h"

0 commit comments

Comments
 (0)