Skip to content

Commit 8738872

Browse files
authored
Merge pull request #68 from OndrejSladky/revert-67-revert-59-lowerbound
KmerCamel can compute lower bounds on superstring lengths.
2 parents 13c2127 + d8705d6 commit 8738872

File tree

10 files changed

+132
-26
lines changed

10 files changed

+132
-26
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ kmercamel
66
kmercameltest
77
kmercamel-large
88
kmercameltest-large
9+
kmercameltest-extra-large
910
src/version.h
1011
🐫
1112
bin/

Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ CXX= g++
44
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O2 -I/opt/homebrew/include
55
LDFLAGS= -lz -lglpk -L/opt/homebrew/lib
66
LARGEFLAGS= -DLARGE_KMERS
7+
ELARGEFLAGS= -DEXTRA_LARGE_KMERS
78
SRC= src
89
TESTS= tests
910
GTEST= $(TESTS)/googletest/googletest
@@ -22,9 +23,10 @@ quick-verify: verify.py kmercamel
2223
./verify.py --quick $(DATA)/spneumoniae.fa
2324
./verify.py --k 13 --superstring_path $(DATA)/global-k13c.fa $(DATA)/spneumoniae.fa
2425

25-
cpptest: kmercameltest kmercameltest-large
26+
cpptest: kmercameltest kmercameltest-large kmercameltest-extra-large
2627
./kmercameltest
2728
./kmercameltest-large
29+
./kmercameltest-extra-large
2830

2931
converttest: convert_superstring_unittest.py
3032
./convert_superstring_unittest.py
@@ -40,6 +42,9 @@ kmercameltest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.h
4042
kmercameltest-large: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
4143
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS) $(LARGEFLAGS)
4244

45+
kmercameltest-extra-large: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
46+
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS) $(ELARGEFLAGS)
47+
4348
gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp)
4449
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@
4550

src/global.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,10 @@ void PartialPreSort(std::vector<kmer_t> &vals, int k) {
5757
/// If complements are provided, treat k-mer and its complement as identical.
5858
/// If this is the case, k-mers are expected to contain only one k-mer from a complement pair.
5959
/// Moreover, if so, the resulting Hamiltonian path contains two superstrings which are reverse complements of one another.
60+
/// If lower_bound is set to true, return a shortest cycle cover instead.
6061
template <typename kmer_t, typename kh_wrapper_t>
61-
overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector<kmer_t> &kMers, int k, bool complements) {
62+
overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector<kmer_t> &kMers, int k, bool complements,
63+
bool lower_bound = false) {
6264
size_t n = kMers.size();
6365
size_t kMersCount = n * (1 + complements);
6466
size_t batchSize = kMersCount / MEMORY_REDUCTION_FACTOR + 1;
@@ -106,10 +108,12 @@ overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector<kmer_t> &k
106108
if (suffix_key == kh_end(prefixes)) continue;
107109
size_t previous, j;
108110
previous = j = kh_val(prefixes, suffix_key);
109-
// If the path forms a cycle, or is between k-mer and its reverse complement, or the k-mers complement was already selected skip this path.
110111
while (j != size_t(-1) && \
111-
(accessFirstLast(first, last, i, n) % n == j % n \
112-
|| accessFirstLast(first, last, i, n) % n == accessFirstLast(last, first, j, n) % n \
112+
// k-mers are complementary
113+
((i + n) % (2 * n) == j \
114+
// forms a cycle
115+
|| (!lower_bound && accessFirstLast(first, last, i, n) == j) \
116+
// k-mer is already used
113117
|| prefixForbidden[j])) {
114118
size_t new_j = next[j - from];
115119
// If the k-mer is forbidden, remove it to keep the complexity linear.

src/lower_bound.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#pragma once
2+
3+
#include "global.h"
4+
#include "kmers.h"
5+
6+
/// Return the length of the cycle cover which lower bounds the superstring length.
7+
template <typename kmer_t, typename kh_wrapper_t>
8+
size_t LowerBoundLength(kh_wrapper_t wrapper, std::vector<kmer_t> kMers, int k, bool complements) {
9+
auto cycle_cover = OverlapHamiltonianPath(wrapper, kMers, k, complements, true);
10+
size_t res = 0;
11+
for (auto &overlap : cycle_cover.second) {
12+
res += size_t(k) - size_t(overlap);
13+
}
14+
return res / (1 + complements);
15+
}

src/main.cpp

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,13 @@
1111
#include "ac/parser_ac.h"
1212
#include "ac/streaming.h"
1313
#include "khash_utils.h"
14+
15+
#include <iostream>
16+
#include <string>
17+
#include "unistd.h"
18+
#include "version.h"
1419
#include "masks.h"
20+
#include "lower_bound.h"
1521

1622
int Help() {
1723
std::cerr << "KmerCamel version " << VERSION << std::endl;
@@ -23,6 +29,7 @@ int Help() {
2329
std::cerr << " -d d_value - integer value for d_max; default 5" << std::endl;
2430
std::cerr << " -c - treat k-mer and its reverse complement as equal" << std::endl;
2531
std::cerr << " -m - turn off the memory optimizations for global" << std::endl;
32+
std::cerr << " -l - compute the cycle cover lower bound instead of masked superstring" << std::endl;
2633
std::cerr << " -h - print help" << std::endl;
2734
std::cerr << " -v - print version" << std::endl;
2835
std::cerr << "Example usage: ./kmercamel -p path_to_fasta -k 31 -d 5 -a local -c" << std::endl;
@@ -48,8 +55,9 @@ void Version() {
4855

4956
// This macro cannot be avoided without bloating the code too much.
5057
#define INIT_KMERCAMEL(type) \
51-
int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, std::string algorithm, bool optimize_memory) { \
52-
kmer_dict##type##_t wrapper; \
58+
int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, \
59+
std::string algorithm, bool optimize_memory, bool lower_bound) { \
60+
kmer_dict##type##_t wrapper; \
5361
kmer##type##_t kmer_type = 0; \
5462
if (masks) { \
5563
int ret = Optimize(wrapper, kmer_type, algorithm, path, *of, k, complements); \
@@ -71,14 +79,15 @@ int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool c
7179
return Help(); \
7280
} \
7381
d_max = std::min(k - 1, d_max); \
74-
WriteName(k, *of); \
82+
if (!lower_bound) WriteName(k, *of); \
7583
if (algorithm == "global") { \
7684
auto kMerVec = kMersToVec(kMers, kmer_type); \
7785
kh_destroy_S##type(kMers); \
7886
/* Turn off the memory optimizations if optimize_memory is set to false. */ \
7987
if(optimize_memory) PartialPreSort(kMerVec, k); \
8088
else MEMORY_REDUCTION_FACTOR = 1; \
81-
Global(wrapper, kMerVec, *of, k, complements); \
89+
if (lower_bound) std::cout << LowerBoundLength(wrapper, kMerVec, k, complements); \
90+
else Global(wrapper, kMerVec, *of, k, complements); \
8291
} \
8392
else Local(kMers, wrapper, kmer_type, *of, k, d_max, complements); \
8493
} else { \
@@ -127,9 +136,10 @@ int main(int argc, char **argv) {
127136
bool complements = false;
128137
bool optimize_memory = true;
129138
bool d_set = false;
139+
bool lower_bound = false;
130140
int opt;
131141
try {
132-
while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvm")) != -1) {
142+
while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvml")) != -1) {
133143
switch(opt) {
134144
case 'p':
135145
path = optarg;
@@ -159,6 +169,9 @@ int main(int argc, char **argv) {
159169
case 'm':
160170
optimize_memory = false;
161171
break;
172+
case 'l':
173+
lower_bound = true;
174+
break;
162175
case 'v':
163176
Version();
164177
return 0;
@@ -196,12 +209,15 @@ int main(int argc, char **argv) {
196209
} else if (masks && (d_set || !optimize_memory)) {
197210
std::cerr << "Not supported flags for optimize." << std::endl;
198211
return Help();
212+
} else if (lower_bound && algorithm != "global") {
213+
std::cerr << "Lower bound computation supported only for hash table global." << std::endl;
214+
return Help();
199215
}
200216
if (k < 32) {
201-
return kmercamel64(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
217+
return kmercamel64(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound);
202218
} else if (k < 64) {
203-
return kmercamel128(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
219+
return kmercamel128(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound);
204220
} else {
205-
return kmercamel256(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
221+
return kmercamel256(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound);
206222
}
207223
}

src/uint256_t/uint256_t.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#pragma once
12
#ifndef __LITTLE_ENDIAN__
23
#ifndef __BIG_ENDIAN__
34
#define __LITTLE_ENDIAN__ 1

tests/global_unittest.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,30 +80,48 @@ namespace {
8080
overlapPath wantResult;
8181
int k;
8282
bool complements;
83+
bool lower_bound;
8384
};
8485
std::vector<TestCase> tests = {
8586
{
8687
{KMerToNumber({"AT"})},
8788
{{(size_t)-1, (size_t)-1}, {(byte)-1, (byte)-1}},
8889
2,
8990
true,
91+
false,
9092
},
9193
{
9294
{KMerToNumber({"ACG"}), KMerToNumber({"TAC"}), KMerToNumber({"GGC"})},
9395
{{2, 0, (size_t)-1}, {1, 2, (byte)-1}},
9496
3,
9597
false,
98+
false,
9699
},
97100
{
98101
{KMerToNumber({"ACAA"}), KMerToNumber({"ATTT"}), KMerToNumber({"AACA"})},
99102
{{4, 3, 0, 5, (size_t)-1, (size_t)-1}, {2, 2, 3, 3, (byte)-1, (byte)-1}},
100103
4,
101104
true,
105+
false,
106+
},
107+
{
108+
{KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
109+
{{1, 2, 0}, {2, 1, 1}},
110+
3,
111+
false,
112+
true,
113+
},
114+
{
115+
{KMerToNumber({"ACC"}), KMerToNumber({"CGG"})},
116+
{{3, 2, 1, 0}, {2, 2, 0, 0}},
117+
3,
118+
true,
119+
true,
102120
},
103121
};
104122

105123
for (auto t : tests) {
106-
overlapPath got = OverlapHamiltonianPath(wrapper, t.kMers, t.k, t.complements);
124+
overlapPath got = OverlapHamiltonianPath(wrapper, t.kMers, t.k, t.complements, t.lower_bound);
107125
EXPECT_EQ(t.wantResult.first, got.first);
108126
EXPECT_EQ(t.wantResult.second, got.second);
109127
}

tests/kmer_types.h

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,26 @@
22

33
#include <cstdint>
44
#include "../src/khash_utils.h"
5+
#include "../src/uint256_t/uint256_t.h"
56

6-
7-
#ifdef LARGE_KMERS
8-
typedef __uint128_t kmer_t;
9-
typedef kmer_dict128_t kh_wrapper;
10-
typedef kh_S128_t kh_S_t;
11-
typedef kh_P128_t kh_P_t;
12-
#else // LARGE_KMERS
13-
typedef uint64_t kmer_t;
14-
typedef kmer_dict64_t kh_wrapper;
15-
typedef kh_S64_t kh_S_t;
16-
typedef kh_P64_t kh_P_t;
17-
#endif // LARGE_KMERS
7+
#ifdef EXTRA_LARGE_KMERS
8+
typedef uint256_t kmer_t;
9+
typedef kmer_dict256_t kh_wrapper;
10+
typedef kh_S256_t kh_S_t;
11+
typedef kh_P256_t kh_P_t;
12+
#else // EXTRA_LARGE_KMERS
13+
#ifdef LARGE_KMERS
14+
typedef __uint128_t kmer_t;
15+
typedef kmer_dict128_t kh_wrapper;
16+
typedef kh_S128_t kh_S_t;
17+
typedef kh_P128_t kh_P_t;
18+
#else // LARGE_KMERS
19+
typedef uint64_t kmer_t;
20+
typedef kmer_dict64_t kh_wrapper;
21+
typedef kh_S64_t kh_S_t;
22+
typedef kh_P64_t kh_P_t;
23+
#endif // LARGE_KMERS
24+
#endif // EXTRA_LARGE_KMERS
1825

1926
// To be passed to functions.
2027
kh_wrapper wrapper;

tests/lower_bound_unittest.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#pragma once
2+
3+
#include "kmer_types.h"
4+
5+
#include "../src/lower_bound.h"
6+
7+
#include "gtest/gtest.h"
8+
9+
namespace {
10+
TEST(LowerBound, LowerBoundLength) {
11+
struct TestCase {
12+
std::vector<kmer_t> kMers;
13+
int k;
14+
bool complements;
15+
size_t wantResult;
16+
};
17+
std::vector<TestCase> tests = {
18+
{
19+
{KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
20+
3, false, 5
21+
},
22+
{
23+
{KMerToNumber({"AC"}), KMerToNumber({"GG"})},
24+
2, true, 3
25+
},
26+
{
27+
{KMerToNumber({"AAACCC"}), KMerToNumber({"CCCAAA"}), KMerToNumber({"TGGGGT"})},
28+
6, false, 11
29+
},
30+
};
31+
32+
for (auto &t : tests) {
33+
auto gotResult = LowerBoundLength(wrapper, t.kMers, t.k, t.complements);
34+
35+
ASSERT_EQ(t.wantResult, gotResult);
36+
}
37+
}
38+
}

tests/unittest.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "local_ac_unittest.h"
77
#include "streaming_unittest.h"
88
#include "ac_automaton_unittest.h"
9+
#include "lower_bound_unittest.h"
910
#include "masks_unittest.h"
1011

1112
#include "gtest/gtest.h"

0 commit comments

Comments
 (0)