Skip to content

Commit 198df62

Browse files
committed
Merge branch 'main' into revert-67-revert-59-lowerbound
2 parents 888855e + 0bb95b9 commit 198df62

File tree

7 files changed

+43
-12
lines changed

7 files changed

+43
-12
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ For mask optimization, run the subcommand `optimize` with the following argument
101101

102102
- `p path_to_fasta` - the path to fasta file. This is a required argument.
103103
- `k k_value` - the size of one k-mer. This is a required argument.
104-
- `a algorithm` - the algorithm for mask optimization. Either `ones` for maximizing the number of 1s, `runs` for minimizing the number of runs of 1s, or `zeros` for maximizing the number of 0s. Default `ones`.
104+
- `a algorithm` - the algorithm for mask optimization. Either `ones` for maximizing the number of 1s, `runs` for minimizing the number of runs of 1s, `runsapprox` for approximately minimizing the number of runs of 1s, or `zeros` for maximizing the number of 0s. Default `ones`.
105105
- `o output_path` - the path to output file. If not specified, output is printed to stdout.
106106
- `c` - treat k-mer and its reverse complement as equal.
107107
- `h` - print help.

src/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ int Help() {
4949
std::cerr << "Accepted arguments:" << std::endl;
5050
std::cerr << " -p path_to_fasta - required; valid path to fasta file" << std::endl;
5151
std::cerr << " -k k_value - required; integer value for k" << std::endl;
52-
std::cerr << " -a algorithm - the algorithm to be run [ones (default), runs, zeros]" << std::endl;
52+
std::cerr << " -a algorithm - the algorithm to be run [ones (default), runs, runsapprox, zeros]" << std::endl;
5353
std::cerr << " -o output_path - if not specified, the output is printed to stdout" << std::endl;
5454
std::cerr << " -c - treat k-mer and its reverse complement as equal" << std::endl;
5555
std::cerr << " -h - print help" << std::endl;

src/masks.h

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -112,15 +112,15 @@ std::pair<std::vector<int>, std::vector<int>> HeuristicPreSolve(std::vector<std:
112112

113113

114114
/// For the given masked superstring output the same superstring with mask with minimal number of runs of ones.
115-
void OptimizeRuns(std::string path, kh_S64_t *kMers, std::ostream &of, int k, bool complements) {
115+
void OptimizeRuns(std::string path, kh_S64_t *kMers, std::ostream &of, int k, bool complements, bool approximate) {
116116
kh_O64_t *intervals = kh_init_O64();
117117
std::vector<std::list<size_t>> intervalsForKMer;
118118
auto [size, rows] = ReadIntervals(intervals, kMers, intervalsForKMer, path, k, complements, of, nullptr);
119119
int mappedSize, newIntervals; size_t totalIntervals;
120120
auto [mapping, intervalMapping] = HeuristicPreSolve(intervalsForKMer, rows, mappedSize, totalIntervals, newIntervals);
121121
glp_prob *lp;
122122
lp = glp_create_prob();
123-
if (mappedSize != 0) {
123+
if (mappedSize != 0 && !approximate) {
124124
auto *ia = new int[totalIntervals + 1];
125125
auto *ja = new int[totalIntervals + 1];
126126
auto *ar = new double[totalIntervals + 1];
@@ -160,10 +160,10 @@ void OptimizeRuns(std::string path, kh_S64_t *kMers, std::ostream &of, int k, bo
160160

161161
for (size_t i = 0; i < rows; ++i) {
162162
if (intervalMapping[i] == -1) intervalsSet[i] = true;
163+
else if (approximate) intervalsSet[i] = mappedSize != 0;
163164
else intervalsSet[i] = mappedSize == 0 ? false : (glp_get_col_prim(lp, intervalMapping[i] + 1) > 0.5);
164165
}
165166

166-
of << "> superstring" << std::endl;
167167
ReadIntervals(nullptr, kMers, intervalsForKMer, path, k, complements, of, intervalsSet);
168168
of << std::endl;
169169
}
@@ -179,7 +179,9 @@ int Optimize(std::string &algorithm, std::string path, std::ostream &of, int k,
179179
} else if (algorithm == "zeros") {
180180
OptimizeOnes(in, of, kMers, k, complements, true);
181181
} else if (algorithm == "runs") {
182-
OptimizeRuns(path, kMers, of, k, complements);
182+
OptimizeRuns(path, kMers, of, k, complements, false);
183+
} else if (algorithm == "runsapprox") {
184+
OptimizeRuns(path, kMers, of, k, complements, true);
183185
} else {
184186
std::cerr << "Algorithm '" + algorithm + "' not recognized." << std::endl;
185187
in.close();

src/parser.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -168,7 +168,9 @@ std::pair<size_t, size_t> ReadIntervals(kh_O64_t *intervals, kh_S64_t *kMers, st
168168
currentInterval += interval_used;
169169
interval_used = false;
170170
}
171-
else if (c == '\n') readingHeader = false;
171+
// Reprint the header.
172+
if (readingHeader && !reading) of << c;
173+
if (c == '\n') readingHeader = false;
172174
if (readingHeader) continue;
173175
auto data = NucleotideToInt(c);
174176
// Disregard white space.
@@ -186,7 +188,7 @@ std::pair<size_t, size_t> ReadIntervals(kh_O64_t *intervals, kh_S64_t *kMers, st
186188
currentKMer |= data;
187189
--beforeKMerEnd;
188190
if (beforeKMerEnd == 0) {
189-
bool represented = kh_get_S64(kMers, currentKMer) != kh_end(kMers);
191+
bool represented = containsKMer(kMers, currentKMer, k, complements);
190192
bool set = false;
191193
if (represented) {
192194
interval_used = true;

tests/masks_unittest.h

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -67,12 +67,13 @@ namespace {
6767

6868
TEST(Mask, OptimizeRuns) {
6969
std::string path = std::filesystem::current_path();
70-
path += "/tests/testdata/runstest.fa";
7170

7271
struct TestCase {
7372
std::vector<kmer_t> kMers;
7473
int k;
7574
bool complements;
75+
bool approximate;
76+
std::string relativePath;
7677
std::string wantResult;
7778
};
7879
std::vector<TestCase> tests = {
@@ -84,18 +85,42 @@ namespace {
8485
KMerToNumber({"TT"}),
8586
KMerToNumber({"AT"})
8687
},
87-
2, false,
88+
2, false, false,
89+
"/tests/testdata/runstest.fa",
8890
"> superstring\nacgcgttACGtATt\n"
8991
},
92+
{
93+
{
94+
KMerToNumber({"AC"}),
95+
KMerToNumber({"CG"}),
96+
KMerToNumber({"GT"}),
97+
KMerToNumber({"TT"}),
98+
KMerToNumber({"AT"})
99+
},
100+
2, false, true,
101+
"/tests/testdata/runstest.fa",
102+
"> superstring\nACgCGTtACGtATt\n"
103+
},
104+
{
105+
{
106+
KMerToNumber({"ACT"}),
107+
KMerToNumber({"CTA"}),
108+
KMerToNumber({"GTA"})
109+
},
110+
3, true, false,
111+
"/tests/testdata/runstest2.fa",
112+
"> superstring\nACTAGta\n"
113+
},
90114
};
91115

92116
for (auto &t : tests) {
93117
std::stringstream of;
118+
auto totalPath = path + t.relativePath;
94119
auto kMersDict = kh_init_S64();
95120
int ret;
96121
for (auto &kMer : t.kMers) kh_put_S64(kMersDict, kMer, &ret);
97122

98-
OptimizeRuns(path, kMersDict, of, t.k, t.complements);
123+
OptimizeRuns(totalPath, kMersDict, of, t.k, t.complements, t.approximate);
99124

100125
EXPECT_EQ(t.wantResult, of.str());
101126
}

tests/testdata/runstest2.fa

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
> superstring
2+
ACtaGta

verify.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,7 @@ def main():
8484
else:
8585
k = int(args.k)
8686
# Do the tests on mask optimization.
87-
for a in ["runs", "ones", "zeros"]:
87+
for a in ["runs", "ones", "zeros", "runsapprox"]:
8888
print(f"Testing {a}:")
8989
for large in [True, False]:
9090
if not large and k >= 32: continue

0 commit comments

Comments
 (0)