Skip to content

Commit b5612ba

Browse files
committed
Merge branch 'main' into revert-67-revert-59-lowerbound
2 parents 198df62 + 13c2127 commit b5612ba

37 files changed

+1860
-706
lines changed

Makefile

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@ CXX= g++
44
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O2 -I/opt/homebrew/include
55
LDFLAGS= -lz -lglpk -L/opt/homebrew/lib
66
LARGEFLAGS= -DLARGE_KMERS
7+
ELARGEFLAGS= -DEXTRA_LARGE_KMERS
78
SRC= src
89
TESTS= tests
910
GTEST= $(TESTS)/googletest/googletest
1011
DATA= data
1112

1213

13-
all: kmercamel kmercamel-large
14+
all: kmercamel
1415

1516
test: cpptest converttest verify
1617

@@ -22,9 +23,10 @@ quick-verify: verify.py kmercamel
2223
./verify.py --quick $(DATA)/spneumoniae.fa
2324
./verify.py --k 13 --superstring_path $(DATA)/global-k13c.fa $(DATA)/spneumoniae.fa
2425

25-
cpptest: kmercameltest kmercameltest-large
26+
cpptest: kmercameltest kmercameltest-large kmercameltest-extra-large
2627
./kmercameltest
2728
./kmercameltest-large
29+
./kmercameltest-extra-large
2830

2931
converttest: convert_superstring_unittest.py
3032
./convert_superstring_unittest.py
@@ -34,16 +36,15 @@ kmercamel: $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
3436
$(CXX) $(CXXFLAGS) $(SRC)/main.cpp -o $@ $(LDFLAGS)
3537
cp kmercamel 🐫 || true
3638

37-
kmercamel-large: $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
38-
./create-version.sh
39-
$(CXX) $(CXXFLAGS) $(SRC)/main.cpp -o $@ $(LDFLAGS) $(LARGEFLAGS)
40-
4139
kmercameltest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
4240
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)
4341

4442
kmercameltest-large: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
4543
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS) $(LARGEFLAGS)
4644

45+
kmercameltest-extra-large: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
46+
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS) $(ELARGEFLAGS)
47+
4748
gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp)
4849
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@
4950

@@ -55,6 +56,7 @@ clean:
5556
rm -f kmercamel
5657
rm -f 🐫 || true
5758
rm -f kmercameltest
59+
rm -f kmercameltest-large
5860
rm -r -f ./bin
5961
rm -f gtest-all.o
6062
rm -f src/version.h

README.md

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@ They come in two different implementations (their results may differ due to the
2020
Note that at this point only the implementations with hash table are optimized and that the Aho-Corasick automaton
2121
based versions of the algorithms are only experimental.
2222

23-
The hashing based implementations of the default KmerCamel🐫 (`./kmercamel`) support $k$-mer with $k$ at most 31,
24-
whereas the larger KmerCamel🐫 (`./kmercamel-large`) supports $k$-mers with $k$ at most 63 (at the cost of slight slowdown).
23+
The hashing based implementations of KmerCamel🐫 support $k$-mer with $k$ at most 127,
2524

2625
All algorithms can be used to either work in the unidirectional model or in the bidirectional model
2726
(i.e. treat $k$-mer and its reverse complement as the same; in this case either of them appears in the result).
@@ -62,8 +61,8 @@ on macOS.
6261

6362
The program has the following arguments:
6463

65-
- `-p path_to_fasta` - the path to fasta file. This is a required argument.
66-
- `-k value_of_k` - the size of one k-mer. This is a required argument.
64+
- `-p path_to_fasta` - the path to fasta file (can be `gzip`ed). This is a required argument.
65+
- `-k value_of_k` - the size of one k-mer (up to 127). This is a required argument.
6766
- `-a algorithm` - the algorithm which should be run. Either `global` or `globalAC` for Global Greedy, `local` or `localAC` for Local Greedy.
6867
The versions with AC use Aho-Corasick automaton. Default `global`.
6968
- `-o output_path` - the path to output file. If not specified, output is printed to stdout.
@@ -79,27 +78,18 @@ The output contains the resulting superstring - capital letters indicate that at
7978
For example:
8079

8180
```
82-
./kmercamel -p ./spneumoniae.fa -a local -k 12 -d 7 -c
81+
./kmercamel -p ./spneumoniae.fa -a local -k 31 -d 5 -c
8382
```
8483

85-
runs the Local Greedy in the bidirectional model on the streptococcus fasta file with `k=12` and `d=7`.
84+
runs the Local Greedy in the bidirectional model on the streptococcus fasta file with `k=31` and `d=5`.
8685

8786
Alternatively, if your operating system supports it, you can run `./🐫` instead of `./kmercamel`.
8887

89-
Currently, KmerCamel🐫 does not support gziped files as an input.
90-
A possible workaround is to use `gzcat` and process substitution.
91-
92-
```
93-
./kmercamel -k 13 -p <(gzcat fasta_file.fa.gz)
94-
```
95-
96-
Note: on some systems you might need to use the name `zcat` instead of `gzcat`.
97-
9888
### Mask optimization
9989

10090
For mask optimization, run the subcommand `optimize` with the following arguments:
10191

102-
- `p path_to_fasta` - the path to fasta file. This is a required argument.
92+
- `p path_to_fasta` - the path to fasta file (can be `gzip`ed). This is a required argument.
10393
- `k k_value` - the size of one k-mer. This is a required argument.
10494
- `a algorithm` - the algorithm for mask optimization. Either `ones` for maximizing the number of 1s, `runs` for minimizing the number of runs of 1s, `runsapprox` for approximately minimizing the number of runs of 1s, or `zeros` for maximizing the number of 0s. Default `ones`.
10595
- `o output_path` - the path to output file. If not specified, output is printed to stdout.
@@ -110,26 +100,11 @@ For mask optimization, run the subcommand `optimize` with the following argument
110100
For example:
111101

112102
```
113-
./kmercamel optimize -p ./global-spneumoniae.fa -k 12 -a runs -c
103+
./kmercamel optimize -p ./global-spneumoniae.fa -k 31 -a runs -c
114104
```
115105

116106
minimizes the number of runs of 1s in the mask of the superstring computed by Global Greedy in the bidirectional model on the streptococcus fasta file with `k=12`.
117107

118-
Note: currently mask optimization cannot read gziped files, nor can the proccess substititution be used.
119-
120-
### Large $k$-mers
121-
122-
The default version of KmerCamel🐫 does not support $k > 31$. For those values, use the large KmerCamel🐫,
123-
which supports $k < 64$.
124-
125-
For example:
126-
127-
```
128-
./kmercamel-large -p ./spneumoniae.fa -a global -k 63 -c
129-
```
130-
131-
Note: for smaller $k$ it is recommended to use default KmerCamel🐫 as it is faster.
132-
133108
### Turn off memory optimizations for Global
134109

135110
In order to reduce the memory footprint of hash-table based Global Greedy,

src/ac_automaton.h renamed to src/ac/ac_automaton.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#include <list>
44
#include <vector>
55

6-
#include "models.h"
6+
#include "kmers_ac.h"
77

88
constexpr int INVALID_STATE = -1;
99
struct ACState {

src/global_ac.h renamed to src/ac/global_ac.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
#include <fstream>
1111
#include <algorithm>
1212

13-
#include "models.h"
14-
#include "kmers.h"
13+
#include "kmers_ac.h"
1514
#include "ac_automaton.h"
1615

1716

src/ac/kmers_ac.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#pragma once
2+
#include <string>
3+
#include <vector>
4+
#include "../kmers.h"
5+
6+
struct KMer {
7+
std::string value;
8+
size_t length() const{ return value.size(); }
9+
};
10+
11+
/// Return the complementary nucleotide for the given one.
12+
char ComplementaryNucleotide(const char nucleotide) {
13+
if (nucleotide == 'A') return 'T';
14+
else if (nucleotide == 'T') return 'A';
15+
else if (nucleotide == 'G') return 'C';
16+
else if (nucleotide == 'C') return 'G';
17+
throw std::invalid_argument("cannot find complementary nucleotide for letter " + std::string(1, nucleotide));
18+
}
19+
20+
/// Compute the reverse complement of the given k-mer.
21+
KMer ReverseComplement(const KMer &kMer) {
22+
KMer ans;
23+
ans.value.reserve(kMer.length());
24+
for (int i = (int)kMer.length() - 1; i >= 0; --i) {
25+
ans.value += ComplementaryNucleotide(kMer.value[i]);
26+
}
27+
return ans;
28+
}
29+
30+
/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
31+
/// If non-existing nucleotide is given, return -1.
32+
int NucleotideToInt (char c) {
33+
switch (c) {
34+
case 'A': return 0;
35+
case 'C': return 1;
36+
case 'G': return 2;
37+
case 'T': return 3;
38+
case 'a': return 0;
39+
case 'c': return 1;
40+
case 'g': return 2;
41+
case 't': return 3;
42+
default: return -1;
43+
}
44+
}
45+
46+
/// Convert the given k-mer to its representation as a number.
47+
kmer64_t KMerToNumber(const KMer &kMer) {
48+
kmer64_t ret = 0;
49+
for (char c : kMer.value) {
50+
ret <<= 2;
51+
ret |= NucleotideToInt(c);
52+
}
53+
return ret;
54+
}
55+

src/local_ac.h renamed to src/ac/local_ac.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include <algorithm>
88
#include <fstream>
99

10-
#include "models.h"
10+
#include "kmers_ac.h"
1111
#include "ac_automaton.h"
1212

1313
/// Find the index of the first extending k-mer from the incidentKMers which is not forbidden.

src/ac/parser_ac.h

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#pragma once
2+
3+
// Implicitly initialize kseq.
4+
#include "../parser.h"
5+
6+
#include "kmers_ac.h"
7+
8+
/// Record of one fasta sequence.
9+
struct FastaRecord {
10+
// The name of the record. Starts with '>'.
11+
std::string name;
12+
// The genetic sequence.
13+
std::string sequence;
14+
};
15+
16+
/// Read fasta file with given path.
17+
std::vector<FastaRecord> ReadFasta(std::string &path) {
18+
gzFile fp;
19+
kseq_t *seq;
20+
std::vector<FastaRecord> records;
21+
22+
fp = gzopen(path.c_str(), "r");
23+
if (fp == nullptr) {
24+
throw std::invalid_argument("couldn't open file " + path);
25+
}
26+
seq = kseq_init(fp);
27+
while (kseq_read(seq) >= 0) {
28+
records.push_back(FastaRecord{
29+
seq->name.s,
30+
seq->seq.s,
31+
});
32+
}
33+
kseq_destroy(seq);
34+
gzclose(fp);
35+
return records;
36+
}
37+
38+
/// Create a list of unique k-mers in no particular order.
39+
/// This runs in O(k*data.size) expected time.
40+
void AddKMersFromSequence(std::unordered_set<std::string> &kMers, std::string &data, int k) {
41+
// Convert the sequence to uppercase letters.
42+
std::transform(data.begin(), data.end(), data.begin(), toupper);
43+
size_t possibleKMerEnd = k;
44+
for (size_t i = 1; i <= data.size(); ++i) {
45+
if (data[i-1] != 'A' && data[i-1] != 'C' && data[i-1] != 'G' && data[i-1] != 'T') {
46+
// Skip this and the next k-1 k-mers.
47+
possibleKMerEnd = i + k;
48+
}
49+
if (i >= possibleKMerEnd) {
50+
kMers.insert(data.substr(i - k, k));
51+
}
52+
}
53+
}
54+
55+
/// Return only the subset of k-mers where no two k-mers are complements of one another.
56+
/// The k-mers are chosen with no particular property.
57+
std::unordered_set<std::string> FilterKMersWithComplement(std::unordered_set<std::string> &kMers) {
58+
std::unordered_set<std::string> ret;
59+
for (auto &&x : kMers) {
60+
auto kMer = KMer{x};
61+
if (ret.count(ReverseComplement(kMer).value) == 0) ret.insert(x);
62+
}
63+
return ret;
64+
}
65+
66+
/// Create a list of unique k-mers in no particular order.
67+
/// This runs in O(k*data.size) expected time.
68+
std::vector<KMer> ConstructKMers(std::vector<FastaRecord> &data, int k, bool complements) {
69+
std::unordered_set<std::string> uniqueKMers;
70+
for (auto &&record : data) {
71+
AddKMersFromSequence(uniqueKMers, record.sequence, k);
72+
}
73+
if (complements) uniqueKMers = FilterKMersWithComplement(uniqueKMers);
74+
std::vector<KMer> result;
75+
for (auto &kMer : uniqueKMers) {
76+
result.push_back(KMer{kMer});
77+
}
78+
return result;
79+
}

src/streaming.h renamed to src/ac/streaming.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <cassert>
66
#include <algorithm>
77

8-
#include "kmers.h"
8+
#include "kmers_ac.h"
99

1010
void Push(std::ostream &of, const std::string &current, int k, int32_t used) {
1111
if (current.size() == static_cast<size_t>(k) && used != 0) {
@@ -40,7 +40,7 @@ void Streaming(std::string &path, std::ostream &of, int k, bool complements) {
4040
if (kMer.size() == size_t(k + 1)) kMer=kMer.substr(1);
4141
assert(kMer.size() <= size_t(k));;
4242
if (kMer.length() == size_t (k) ) {
43-
int64_t encoded = KMerToNumber(KMer{kMer});
43+
uint64_t encoded = KMerToNumber(KMer{kMer});
4444
auto rc = ReverseComplement(encoded, k);
4545
auto rc2 = NumberToKMer(rc, k);
4646
bool contained = (kMers.count(encoded) > 0) || ((complements) && kMers.count(ReverseComplement(encoded, k)) > 0);

0 commit comments

Comments
 (0)