Skip to content

Commit f8d9691

Browse files
authored
Merge pull request #94 from OndrejSladky/min-frequency
Minimum frequency filtering
2 parents 9e55086 + 1408cdf commit f8d9691

File tree

8 files changed

+244
-14
lines changed

8 files changed

+244
-14
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ fmsi index -p ms-opt.msfa # Create a k-
9999
Examples of computing masked superstrings (`ms` subcommand):
100100
```
101101
kmercamel ms -k 31 -o ms.msfa yourfile[.fa|.fa.gz] # From a (gziped) fasta file, use "-" for stdin
102+
kmercamel ms -k 31 -o ms.msfa -z 2 yourfile.fa # Represent only k-mers appearing at least z=2 times
102103
kmercamel ms -k 31 -o ms.msfa -u yourfile.fa # Treat k-mer and its reverse complement as distinct
103104
kmercamel ms -k 31 -o ms.msfa -M maxonemask.m yourfile.fa # Also store mask with maximum ones
104105
kmercamel ms -k 31 -o ms.msfa -a streaming yourfile.fa # Use streaming instead of global for lower memory footprint (likely worse result)

src/khash_utils.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,16 @@
2727
// Use 128-bit integers for extra large k-mers to allow for larger k.
2828
KHASH_SET_INIT_INT256(S256)
2929
KHASH_MAP_INIT_INT256(P256, size_t)
30+
KHASH_MAP_INIT_INT256(Q256, uint8_t)
3031
// Use 128-bit integers for large k-mers to allow for larger k.
3132
KHASH_SET_INIT_INT128(S128)
3233
KHASH_MAP_INIT_INT128(P128, size_t)
34+
KHASH_MAP_INIT_INT128(Q128, uint8_t)
3335
// Use 64-bits integers for small k-mers for faster operations and less memory usage.
3436
KHASH_SET_INIT_INT64(S64)
3537
KHASH_MAP_INIT_INT64(P64, size_t)
38+
KHASH_MAP_INIT_INT64(Q64, uint8_t)
39+
3640

3741
#define INIT_KHASH_WRAPPER(type) \
3842
struct kmer_dict##type##_t { \
@@ -72,6 +76,18 @@ KHASH_MAP_INIT_INT64(P64, size_t)
7276
inline void kh_resize_map(kh_P##type##_t *map, khint_t size) { \
7377
kh_resize_P##type(map, size); \
7478
} \
79+
inline void kh_destroy_freq_map(kh_Q##type##_t *map) { \
80+
kh_destroy_Q##type(map); \
81+
} \
82+
inline kh_Q##type##_t *kh_init_freq_map() { \
83+
return kh_init_Q##type(); \
84+
} \
85+
inline khint_t kh_get_from_freq_map(kh_Q##type##_t *map, kmer##type##_t key) { \
86+
return kh_get_Q##type(map, key); \
87+
} \
88+
inline khint_t kh_put_to_freq_map(kh_Q##type##_t *map, kmer##type##_t key, int *ret) { \
89+
return kh_put_Q##type(map, key, ret); \
90+
} \
7591
};
7692

7793
INIT_KHASH_WRAPPER(64)
@@ -123,6 +139,26 @@ std::vector<kmer_t> kMersToVec(kh_S_t *kMers, [[maybe_unused]] kmer_t _) {
123139
return res;
124140
}
125141

142+
/// Construct a vector of the k-mer set in an arbitrary order.
143+
template <typename kmer_t, typename kh_Q_t>
144+
std::vector<kmer_t> kMersToVecFiltered(kh_Q_t *kMers, [[maybe_unused]] kmer_t _, uint16_t min_frequency) {
145+
std::vector<kmer_t> res;
146+
for (auto i = kh_begin(kMers); i != kh_end(kMers); ++i) {
147+
if (!kh_exist(kMers, i) || ((uint16_t)kh_val(kMers, i)) + 1 < min_frequency) continue;
148+
res.push_back(kh_key(kMers, i));
149+
}
150+
return res;
151+
}
152+
153+
template <typename kmer_t, typename kh_S_t, typename kh_wrapper_t>
154+
void kMersFromVec(kh_S_t *kMers, kh_wrapper_t wrapper, std::vector<kmer_t> &kMerVec) {
155+
for (kmer_t kMer : kMerVec) {
156+
int ret;
157+
wrapper.kh_put_to_set(kMers, kMer, &ret);
158+
}
159+
}
160+
161+
126162
/// Add an interval with given index to the given k-mer.
127163
///
128164
/// [intervalsForKMer] store the intervals and [intervals] maps the k-mer to the index in [intervalsForKMer].

src/main.cpp

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ int usage_subcommand(std::string subcommand) {
7373

7474
if (subcommand == "compute" || subcommand == "maskopt" || subcommand == "lowerbound")
7575
std::cerr << " -u - treat k-mer and its reverse complement as distinct" << std::endl;
76+
77+
if (subcommand == "compute")
78+
std::cerr << " -z INT - minimum frequency to represent a k-mer; default 1" << std::endl;
7679

7780
if (subcommand == "mssep2ms") {
7881
std::cerr << " -m FILE - input file with mask" << std::endl;
@@ -97,7 +100,7 @@ void Version() {
97100
/// Run KmerCamel with the given parameters.
98101
template <typename kmer_t, typename kh_wrapper_t>
99102
int kmercamel(kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, int d_max, std::ostream *of, std::ostream *maskf, bool complements, bool masks,
100-
std::string algorithm, bool lower_bound, bool assume_simplitigs) {
103+
std::string algorithm, bool lower_bound, bool assume_simplitigs, uint16_t min_frequency) {
101104
if (masks) {
102105
WriteLog("Started optimization of a masked superstring from '" + path + "'.");
103106
int ret = Optimize(wrapper, kmer_type, algorithm, path, *of, k, complements);
@@ -112,7 +115,8 @@ int kmercamel(kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, i
112115
/* Handle streaming algorithm separately. */
113116
if (algorithm == "streaming") {
114117
WriteName(path, algorithm, k, false, !complements, *of);
115-
Streaming(wrapper, kmer_type, path, *of, k , complements);
118+
if (min_frequency == 1) Streaming(wrapper, kmer_type, path, *of, k , complements);
119+
else StreamingFiltered(wrapper, kmer_type, path, *of, k , complements, min_frequency);
116120
WriteLog("Finished masked superstring computation.");
117121
}
118122
/* Handle hash table based algorithms separately so that they consume less memory. */
@@ -121,7 +125,11 @@ int kmercamel(kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, i
121125
auto *kMers = wrapper.kh_init_set();
122126
size_t kmer_count;
123127
if (!assume_simplitigs) {
124-
ReadKMers(kMers, wrapper, kmer_type, path, k, complements);
128+
if (min_frequency == 1) {
129+
ReadKMers(kMers, wrapper, kmer_type, path, k, complements);
130+
} else {
131+
ReadKMersFiltered(kMers, wrapper, kmer_type, path, k, complements, min_frequency);
132+
}
125133

126134
if (!kh_size(kMers)) {
127135
std::cerr << "Path '" << path << "' contains no k-mers. Make sure that your file is a FASTA or gzipped FASTA." << std::endl;
@@ -198,8 +206,9 @@ int camel_compute(int argc, char **argv) {
198206
bool d_set = false;
199207
bool assume_simplitigs = false;
200208
int opt;
209+
uint16_t min_frequency = 1;
201210
try {
202-
while ((opt = getopt(argc, argv, "k:d:a:o:huxM:S")) != -1) {
211+
while ((opt = getopt(argc, argv, "k:d:a:o:huxM:Sz:")) != -1) {
203212
switch(opt) {
204213
case 'o':
205214
output.open(optarg);
@@ -231,6 +240,9 @@ int camel_compute(int argc, char **argv) {
231240
case 'h':
232241
usage_subcommand(subcommand);
233242
return 0;
243+
case 'z':
244+
min_frequency = std::stoi(optarg);
245+
break;
234246
default:
235247
return usage_subcommand(subcommand);
236248
}
@@ -263,13 +275,19 @@ int camel_compute(int argc, char **argv) {
263275
} else if (assume_simplitigs && algorithm != "global") {
264276
std::cerr << "Optimization for the input being simplitigs is possible only with global." << std::endl;
265277
return usage_subcommand(subcommand);
278+
} else if (min_frequency >= 256 || min_frequency < 1) {
279+
std::cerr << "Minimum frequency '-z' must be between 1 and 255." << std::endl;
280+
return usage_subcommand(subcommand);
281+
} else if (min_frequency != 1 && assume_simplitigs) {
282+
std::cerr << "Inputting simplitigs is not compatible with frequency filterring." << std::endl;
283+
return usage_subcommand(subcommand);
266284
}
267285
if (k < 32) {
268-
return kmercamel(kmer_dict64_t(), kmer64_t(0), path, k, d_max, of, maskf, complements, false, algorithm, false, assume_simplitigs);
286+
return kmercamel(kmer_dict64_t(), kmer64_t(0), path, k, d_max, of, maskf, complements, false, algorithm, false, assume_simplitigs, min_frequency);
269287
} else if (k < 64) {
270-
return kmercamel(kmer_dict128_t(), kmer128_t(0), path, k, d_max, of, maskf, complements, false, algorithm, false, assume_simplitigs);
288+
return kmercamel(kmer_dict128_t(), kmer128_t(0), path, k, d_max, of, maskf, complements, false, algorithm, false, assume_simplitigs, min_frequency);
271289
} else {
272-
return kmercamel(kmer_dict256_t(), kmer256_t(0), path, k, d_max, of, maskf, complements, false, algorithm, false, assume_simplitigs);
290+
return kmercamel(kmer_dict256_t(), kmer256_t(0), path, k, d_max, of, maskf, complements, false, algorithm, false, assume_simplitigs, min_frequency);
273291
}
274292
}
275293

@@ -327,11 +345,11 @@ int camel_optimize(int argc, char **argv) {
327345
return usage_subcommand(subcommand);
328346
}
329347
if (k < 32) {
330-
return kmercamel(kmer_dict64_t(), kmer64_t(0), path, k, 0, of, nullptr, complements, true, algorithm, false, false);
348+
return kmercamel(kmer_dict64_t(), kmer64_t(0), path, k, 0, of, nullptr, complements, true, algorithm, false, false, 1);
331349
} else if (k < 64) {
332-
return kmercamel(kmer_dict128_t(), kmer128_t(0), path, k, 0, of, nullptr, complements, true, algorithm, false, false);
350+
return kmercamel(kmer_dict128_t(), kmer128_t(0), path, k, 0, of, nullptr, complements, true, algorithm, false, false, 1);
333351
} else {
334-
return kmercamel(kmer_dict256_t(), kmer256_t(0), path, k, 0, of, nullptr, complements, true, algorithm, false, false);
352+
return kmercamel(kmer_dict256_t(), kmer256_t(0), path, k, 0, of, nullptr, complements, true, algorithm, false, false, 1);
335353
}
336354
}
337355

@@ -379,11 +397,11 @@ int camel_lowerbound(int argc, char **argv) {
379397
return usage_subcommand(subcommand);
380398
}
381399
if (k < 32) {
382-
return kmercamel(kmer_dict64_t(), kmer64_t(0), path, k, 0, of, nullptr, complements, false, "global", true, false);
400+
return kmercamel(kmer_dict64_t(), kmer64_t(0), path, k, 0, of, nullptr, complements, false, "global", true, false, 1);
383401
} else if (k < 64) {
384-
return kmercamel(kmer_dict128_t(), kmer128_t(0), path, k, 0, of, nullptr, complements, false, "global", true, false);
402+
return kmercamel(kmer_dict128_t(), kmer128_t(0), path, k, 0, of, nullptr, complements, false, "global", true, false, 1);
385403
} else {
386-
return kmercamel(kmer_dict256_t(), kmer256_t(0), path, k, 0, of, nullptr, complements, false, "global", true, false);
404+
return kmercamel(kmer_dict256_t(), kmer256_t(0), path, k, 0, of, nullptr, complements, false, "global", true, false, 1);
387405
}
388406
}
389407

src/parser.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,42 @@ void AddKMers(kh_S_t *kMers, kh_wrapper_t wrapper, [[maybe_unused]] kmer_t _, si
4848
}
4949
}
5050

51+
/// Fill the k-mer dictionary with k-mers from the given sequence and their frequencies.
52+
/// If complements is true, always add the canonical k-mers.
53+
template <typename kmer_t, typename kh_Q_t, typename kh_wrapper_t>
54+
void AddKMersWithFrequencies(kh_Q_t *kMers, kh_wrapper_t wrapper, [[maybe_unused]] kmer_t _, size_t sequence_length,
55+
const char* sequence, int64_t k, bool complements) {
56+
int64_t currentLength = 0;
57+
kmer_t currentKMer = 0, reverseComplement = 0;
58+
kmer_t mask = (((kmer_t) 1) << (2 * k) ) - 1;
59+
kmer_t shift = 2 * (k - 1);
60+
for (size_t i = 0; i < sequence_length; ++i) {
61+
auto data = nucleotideToInt[(uint8_t)sequence[i]];
62+
if (data >= 4) {
63+
// Restart if "N"-like nucleotide.
64+
currentKMer = reverseComplement = 0;
65+
currentLength = 0;
66+
continue;
67+
}
68+
currentKMer = ((currentKMer << 2) | data) & mask;
69+
reverseComplement = (reverseComplement >> 2) | ((kmer_t(3 ^ data)) << shift);
70+
if (++currentLength >= k) {
71+
// Add the canonical k-mer to the dictionary.
72+
kmer_t canonical = ((!complements) || currentKMer < reverseComplement) ? currentKMer : reverseComplement;
73+
auto ptr = wrapper.kh_get_from_freq_map(kMers, canonical);
74+
bool contained = ptr != kh_end(kMers);
75+
if (contained) {
76+
uint16_t count = kh_val(kMers, ptr);
77+
kh_value(kMers, ptr) = (uint8_t) std::min(255, count + 1);
78+
} else {
79+
int ret;
80+
ptr = wrapper.kh_put_to_freq_map(kMers, canonical, &ret);
81+
kh_value(kMers, ptr) = 0;
82+
}
83+
}
84+
}
85+
}
86+
5187
/// Return a file/stdin for reading.
5288
gzFile OpenFile(std::string &path) {
5389
FILE *in_stream;
@@ -81,6 +117,30 @@ void ReadKMers(kh_S_t *kMers, kh_wrapper_t wrapper, kmer_t _, std::string &path,
81117
gzclose(fp);
82118
}
83119

120+
/// Load a dictionary of k-mers from a fasta file.
121+
/// If complements is true, add the canonical k-mers.
122+
template <typename kmer_t, typename kh_S_t, typename kh_wrapper_t>
123+
void ReadKMersFiltered(kh_S_t *kMers, kh_wrapper_t wrapper, kmer_t _, std::string &path, int k, bool complements,
124+
uint16_t min_frequency) {
125+
gzFile fp = OpenFile(path);
126+
kseq_t *seq = kseq_init(fp);
127+
128+
auto kMersFrequencies = wrapper.kh_init_freq_map();
129+
130+
while (kseq_read(seq) >= 0) {
131+
AddKMersWithFrequencies(kMersFrequencies, wrapper, _, seq->seq.l, seq->seq.s, k, complements);
132+
}
133+
134+
135+
auto kMersVec = kMersToVecFiltered(kMersFrequencies, _, min_frequency);
136+
wrapper.kh_destroy_freq_map(kMersFrequencies);
137+
kMersFromVec(kMers, wrapper, kMersVec);
138+
139+
kseq_destroy(seq);
140+
gzclose(fp);
141+
}
142+
143+
84144
/// Read the masked superstring from the given path and return it wrapped as a kseq_t.
85145
kseq_t* ReadMaskedSuperstring(std::string &path) {
86146
gzFile fp = OpenFile(path);

src/streaming.h

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,63 @@ void Streaming(kh_wrapper_t wrapper, kmer_t kmer_type, std::string &path, std::o
4545
}
4646
}
4747

48+
kseq_destroy(seq);
49+
gzclose(fp);
50+
}
51+
52+
template <typename kmer_t, typename kh_wrapper_t>
53+
void StreamingFiltered(kh_wrapper_t wrapper, kmer_t kmer_type, std::string &path, std::ostream &of, int k, bool complements, uint16_t min_frequency) {
54+
gzFile fp = OpenFile(path);
55+
kseq_t *seq = kseq_init(fp);
56+
57+
auto kMers = wrapper.kh_init_freq_map();
58+
59+
kmer_t mask = (kmer_t(1)) << (2 * k - 1);
60+
mask |= mask - 1;
61+
62+
63+
while (kseq_read(seq) >= 0) {
64+
kmer_t kMer = 0;
65+
size_t firstIndex = k - 1;
66+
int64_t lastOne = -k;
67+
for (size_t i = 0; i < seq->seq.l + k - 1; ++i) {
68+
kmer_t c = 4;
69+
if (i < seq->seq.l) c = nucleotideToInt[(uint8_t)seq->seq.s[i]];
70+
if (c >= 4) {
71+
kMer = 0;
72+
firstIndex = i + k;
73+
}
74+
kMer <<= 2;
75+
kMer |= c;
76+
kMer &= mask;
77+
auto rc = ReverseComplement(kMer, k);
78+
auto canonical = complements ? std::min(kMer, rc) : kMer;
79+
80+
auto ptr = wrapper.kh_get_from_freq_map(kMers, canonical);
81+
bool contained = ptr != kh_end(kMers);
82+
uint16_t count = 0;
83+
if (i >= firstIndex) {
84+
if (contained) {
85+
count = kh_val(kMers, ptr);
86+
count++;
87+
kh_value(kMers, ptr) = (uint8_t)std::min((uint16_t)255, count);
88+
} else {
89+
int ret;
90+
ptr = wrapper.kh_put_to_freq_map(kMers, canonical, &ret);
91+
kh_value(kMers, ptr) = 0;
92+
}
93+
}
94+
95+
if (count + 1 == min_frequency) {
96+
of << Masked(seq->seq.s[i - k + 1], true);
97+
lastOne = i;
98+
} else if ((int64_t (i)) <= lastOne + k - 1) {
99+
of << Masked(seq->seq.s[i - k + 1], false);
100+
}
101+
}
102+
}
103+
104+
wrapper.kh_destroy_freq_map(kMers);
48105
kseq_destroy(seq);
49106
gzclose(fp);
50107
}

src/version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
v2.1.1
1+
v2.2.0

tests/parser_unittest.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,33 @@ namespace {
5858

5959
ReadKMers(kMers, wrapper, kmer_t (0), path, t.k, t.complements, t.case_sensitive);
6060

61+
EXPECT_EQ(t.wantResultSize, kh_size(kMers));
62+
}
63+
}
64+
TEST(Parser, ReadKMersFiltered) {
65+
std::string path = std::filesystem::current_path();
66+
path += "/tests/testdata/test.fa";
67+
struct TestCase {
68+
int k;
69+
bool complements;
70+
uint16_t min_frequency;
71+
size_t wantResultSize;
72+
};
73+
std::vector<TestCase> tests = {
74+
{3, true, 2, 6},
75+
{3, false, 2, 5},
76+
{3, true, 3, 2},
77+
{4, true, 2, 3},
78+
{1, false, 5, 4},
79+
{1, false, 6, 2},
80+
{1, false, 10, 1},
81+
};
82+
83+
for (auto &t: tests) {
84+
auto kMers = wrapper.kh_init_set();
85+
86+
ReadKMersFiltered(kMers, wrapper, kmer_t (0), path, t.k, t.complements, t.min_frequency);
87+
6188
EXPECT_EQ(t.wantResultSize, kh_size(kMers));
6289
}
6390

0 commit comments

Comments
 (0)