@@ -73,6 +73,9 @@ int usage_subcommand(std::string subcommand) {
73
73
74
74
if (subcommand == " compute" || subcommand == " maskopt" || subcommand == " lowerbound" )
75
75
std::cerr << " -u - treat k-mer and its reverse complement as distinct" << std::endl;
76
+
77
+ if (subcommand == " compute" )
78
+ std::cerr << " -z INT - minimum frequency to represent a k-mer; default 1" << std::endl;
76
79
77
80
if (subcommand == " mssep2ms" ) {
78
81
std::cerr << " -m FILE - input file with mask" << std::endl;
@@ -97,7 +100,7 @@ void Version() {
97
100
// / Run KmerCamel with the given parameters.
98
101
template <typename kmer_t , typename kh_wrapper_t >
99
102
int kmercamel (kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, int d_max, std::ostream *of, std::ostream *maskf, bool complements, bool masks,
100
- std::string algorithm, bool lower_bound, bool assume_simplitigs) {
103
+ std::string algorithm, bool lower_bound, bool assume_simplitigs, uint16_t min_frequency ) {
101
104
if (masks) {
102
105
WriteLog (" Started optimization of a masked superstring from '" + path + " '." );
103
106
int ret = Optimize (wrapper, kmer_type, algorithm, path, *of, k, complements);
@@ -112,7 +115,8 @@ int kmercamel(kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, i
112
115
/* Handle streaming algorithm separately. */
113
116
if (algorithm == " streaming" ) {
114
117
WriteName (path, algorithm, k, false , !complements, *of);
115
- Streaming (wrapper, kmer_type, path, *of, k , complements);
118
+ if (min_frequency == 1 ) Streaming (wrapper, kmer_type, path, *of, k , complements);
119
+ else StreamingFiltered (wrapper, kmer_type, path, *of, k , complements, min_frequency);
116
120
WriteLog (" Finished masked superstring computation." );
117
121
}
118
122
/* Handle hash table based algorithms separately so that they consume less memory. */
@@ -121,7 +125,11 @@ int kmercamel(kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, i
121
125
auto *kMers = wrapper.kh_init_set ();
122
126
size_t kmer_count;
123
127
if (!assume_simplitigs) {
124
- ReadKMers (kMers , wrapper, kmer_type, path, k, complements);
128
+ if (min_frequency == 1 ) {
129
+ ReadKMers (kMers , wrapper, kmer_type, path, k, complements);
130
+ } else {
131
+ ReadKMersFiltered (kMers , wrapper, kmer_type, path, k, complements, min_frequency);
132
+ }
125
133
126
134
if (!kh_size (kMers )) {
127
135
std::cerr << " Path '" << path << " ' contains no k-mers. Make sure that your file is a FASTA or gzipped FASTA." << std::endl;
@@ -198,8 +206,9 @@ int camel_compute(int argc, char **argv) {
198
206
bool d_set = false ;
199
207
bool assume_simplitigs = false ;
200
208
int opt;
209
+ uint16_t min_frequency = 1 ;
201
210
try {
202
- while ((opt = getopt (argc, argv, " k:d:a:o:huxM:S " )) != -1 ) {
211
+ while ((opt = getopt (argc, argv, " k:d:a:o:huxM:Sz: " )) != -1 ) {
203
212
switch (opt) {
204
213
case ' o' :
205
214
output.open (optarg);
@@ -231,6 +240,9 @@ int camel_compute(int argc, char **argv) {
231
240
case ' h' :
232
241
usage_subcommand (subcommand);
233
242
return 0 ;
243
+ case ' z' :
244
+ min_frequency = std::stoi (optarg);
245
+ break ;
234
246
default :
235
247
return usage_subcommand (subcommand);
236
248
}
@@ -263,13 +275,19 @@ int camel_compute(int argc, char **argv) {
263
275
} else if (assume_simplitigs && algorithm != " global" ) {
264
276
std::cerr << " Optimization for the input being simplitigs is possible only with global." << std::endl;
265
277
return usage_subcommand (subcommand);
278
+ } else if (min_frequency >= 256 || min_frequency < 1 ) {
279
+ std::cerr << " Minimum frequency '-z' must be between 1 and 255." << std::endl;
280
+ return usage_subcommand (subcommand);
281
} else if (min_frequency != 1 && assume_simplitigs) {
    // Frequency filtering (min_frequency != 1) cannot be combined with the
    // simplitigs input mode, so reject this combination up front.
    std::cerr << "Inputting simplitigs is not compatible with frequency filtering." << std::endl;
    return usage_subcommand(subcommand);
266
284
}
267
285
if (k < 32 ) {
268
- return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs);
286
+ return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs, min_frequency );
269
287
} else if (k < 64 ) {
270
- return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs);
288
+ return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs, min_frequency );
271
289
} else {
272
- return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs);
290
+ return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs, min_frequency );
273
291
}
274
292
}
275
293
@@ -327,11 +345,11 @@ int camel_optimize(int argc, char **argv) {
327
345
return usage_subcommand (subcommand);
328
346
}
329
347
if (k < 32 ) {
330
- return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false );
348
+ return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false , 1 );
331
349
} else if (k < 64 ) {
332
- return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false );
350
+ return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false , 1 );
333
351
} else {
334
- return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false );
352
+ return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false , 1 );
335
353
}
336
354
}
337
355
@@ -379,11 +397,11 @@ int camel_lowerbound(int argc, char **argv) {
379
397
return usage_subcommand (subcommand);
380
398
}
381
399
if (k < 32 ) {
382
- return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false );
400
+ return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false , 1 );
383
401
} else if (k < 64 ) {
384
- return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false );
402
+ return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false , 1 );
385
403
} else {
386
- return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false );
404
+ return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false , 1 );
387
405
}
388
406
}
389
407
0 commit comments