Skip to content

Commit 4a2505a

Browse files
committed
Add option to convert many unique files to a single mappability output
Now a blob or equivalent can be used on the shell to specify multiple unique filenames which will all have their output appended to a single mappability file output
1 parent e92bb99 commit 4a2505a

File tree

4 files changed

+153
-71
lines changed

4 files changed

+153
-71
lines changed

docs/source/commands.rst

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ with some diminishing returns afterwards.
148148
--------------------
149149
generate-mappability
150150
--------------------
151-
Generates mappability files from a given ``unique`` file (see
151+
Generates mappability files from one or more given ``unique`` files (see
152152
:ref:`unique-file-format`). There are two types of mappability files that can
153153
be generated:
154154

@@ -167,6 +167,13 @@ Options
167167

168168
Only ``single-read-bed-file`` or ``multi-read-wig-file`` can output to ``stdout`` when both are specified on the command line.
169169

170+
Positional Arguments
171+
--------------------
172+
- `unique_count_files`: One or more unique count files to generate mappability
173+
from. The resulting mappability from each unique file will be appended to
174+
files specified by the ``single-read-bed-file`` and ``multi-read-wig-file``
175+
options.
176+
170177

171178
Mappability datasets
172179
^^^^^^^^^^^^^^^^^^^^
@@ -202,4 +209,4 @@ Example:
202209

203210
.. code-block:: console
204211
205-
$ newmap generate-mappability -k 24 -m k24_multiread_mappability.wig -s k24_singleread_mappability.bed chr1.unique.uint8
212+
$ newmap generate-mappability -k 24 -m k24_multiread_mappability.wig -s k24_singleread_mappability.bed chr*.unique.uint8

newmap/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,9 @@ def parse_subcommands():
154154
func=unique_counts_conversion.main)
155155

156156
generate_mappability_parser.add_argument(
157-
"unique_count_file",
158-
help="Unique count file to convert to bed file")
157+
"unique_count_files",
158+
nargs="+", # NB: One or more unique files
159+
help="One or more unique count files to convert to mappability files")
159160

160161
generate_mappability_parser.add_argument(
161162
"--kmer-length", "-k",

newmap/unique_counts_conversion.py

Lines changed: 87 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,11 @@ def write_multi_read_wig(wig_file: TextIO,
9797
wig_file.write("{}\n".format(value))
9898

9999

100-
def main(args):
101-
unique_count_filename = Path(args.unique_count_file)
102-
kmer_length = args.kmer_length
103-
single_read_bed_filename = args.single_read_bed_file
104-
multi_read_wig_filename = args.multi_read_wig_file
105-
verbose = args.verbose
100+
def write_mappability_files(unique_count_filenames: list[Path],
101+
kmer_length: int,
102+
single_read_bed_filename: str, # Might be stdout
103+
multi_read_wig_filename: str, # Might be stdout
104+
verbose: bool):
106105

107106
# Error if both single-read and multi-read output files are standard output
108107
if (single_read_bed_filename == STDOUT_FILENAME and
@@ -114,70 +113,91 @@ def main(args):
114113
not multi_read_wig_filename):
115114
raise ValueError("Must specify at least one output file")
116115

117-
# Get the chromosome name from the unique length filename
118-
# NB: Assume the chromosome name is the the entire string preceding the
119-
# ".unique*" part of the unique_count_filename (may contain periods)
120-
file_basename = unique_count_filename.name
121-
chr_name = \
122-
file_basename[:file_basename.find(CHROMOSOME_FILENAME_DELIMITER)]
123-
124-
# Get the data type from the unique length filename suffix
125-
data_type_string = unique_count_filename.suffix
126-
127-
if data_type_string == ".uint8":
128-
data_type = np.uint8
129-
elif data_type_string == ".uint16":
130-
data_type = np.uint16
131-
elif data_type_string == ".uint32":
132-
data_type = np.uint32
133-
else:
134-
raise ValueError(f"Unknown extension on unique length file: "
135-
f"\"{data_type_string}\"")
136-
137-
# NB: The single-read mappability is defined for the entire sequence where
138-
# a uniquely mappable k-mer would cover. So if a k-mer is uniquely mappable
139-
# starting at position i, then the single read mappability would be 1 for
140-
# all positions i to i + kmer_length - 1
141-
# It follows that the multi-read mappability covers the same positions as
142-
# the single-read, so any non-zero value would be considered single-read
143-
# mappable
144-
verbose_print(verbose,
145-
f"Calculating mappability regions from minimum unique k-mer "
146-
f"lengths in file: {unique_count_filename}")
147-
148-
multi_read_mappability = create_multiread_mappability_from_unique_file(
149-
unique_count_filename,
150-
kmer_length,
151-
data_type) # type: ignore
152-
153-
verbose_print(verbose, "Chromosome size:")
154-
verbose_print(verbose,
155-
"{}\t{}".format(chr_name, multi_read_mappability.shape[0]))
156-
157-
if single_read_bed_filename:
158-
verbose_print(verbose, "Writing out single-read mappability regions")
159-
160-
if single_read_bed_filename == STDOUT_FILENAME:
161-
write_single_read_bed(sys.stdout,
162-
kmer_length,
163-
multi_read_mappability,
164-
chr_name)
116+
# For every unique length file specified
117+
for unique_count_filename in unique_count_filenames:
118+
# Get the chromosome name from the unique length filename
119+
# NB: Assume the chromosome name is the the entire string preceding the
120+
# ".unique*" part of the unique_count_filename (may contain periods)
121+
file_basename = unique_count_filename.name
122+
chr_name = \
123+
file_basename[:file_basename.find(CHROMOSOME_FILENAME_DELIMITER)]
124+
125+
# Get the data type from the unique length filename suffix
126+
data_type_string = unique_count_filename.suffix
127+
128+
if data_type_string == ".uint8":
129+
data_type = np.uint8
130+
elif data_type_string == ".uint16":
131+
data_type = np.uint16
132+
elif data_type_string == ".uint32":
133+
data_type = np.uint32
165134
else:
166-
with open(single_read_bed_filename, "w") as single_read_bed_file:
167-
write_single_read_bed(single_read_bed_file,
135+
raise ValueError(f"Unknown extension on unique length file: "
136+
f"\"{data_type_string}\"")
137+
138+
# NB: The single-read mappability is defined for the entire sequence
139+
# where a uniquely mappable k-mer would cover. So if a k-mer is
140+
# uniquely mappable starting at position i, then the single read
141+
# mappability would be 1 for all positions i to i + kmer_length - 1
142+
# It follows that the multi-read mappability covers the same positions
143+
# as the single-read, so any non-zero value would be considered
144+
# single-read mappable
145+
verbose_print(verbose, f"Calculating mappability regions from minimum "
146+
f"unique k-mer lengths in file: "
147+
f"{unique_count_filename}")
148+
149+
multi_read_mappability = create_multiread_mappability_from_unique_file(
150+
unique_count_filename,
151+
kmer_length,
152+
data_type) # type: ignore
153+
154+
verbose_print(verbose, "Chromosome size:")
155+
verbose_print(verbose,
156+
"{}\t{}".format(chr_name,
157+
multi_read_mappability.shape[0]))
158+
159+
if single_read_bed_filename:
160+
verbose_print(verbose, f"Appending single-read mappability "
161+
f"regions to {single_read_bed_filename}")
162+
163+
if single_read_bed_filename == STDOUT_FILENAME:
164+
write_single_read_bed(sys.stdout,
168165
kmer_length,
169166
multi_read_mappability,
170167
chr_name)
171-
172-
if multi_read_wig_filename:
173-
verbose_print(verbose, "Writing out multi-read mappability regions")
174-
175-
if multi_read_wig_filename == STDOUT_FILENAME:
176-
write_multi_read_wig(sys.stdout,
177-
multi_read_mappability,
178-
chr_name)
179-
else:
180-
with open(multi_read_wig_filename, "w") as multi_read_wig_file:
181-
write_multi_read_wig(multi_read_wig_file,
168+
else:
169+
with open(single_read_bed_filename, "a") as \
170+
single_read_bed_file:
171+
write_single_read_bed(single_read_bed_file,
172+
kmer_length,
173+
multi_read_mappability,
174+
chr_name)
175+
176+
if multi_read_wig_filename:
177+
verbose_print(verbose, f"Appending multi-read mappability regions"
178+
f" to {multi_read_wig_filename}")
179+
180+
if multi_read_wig_filename == STDOUT_FILENAME:
181+
write_multi_read_wig(sys.stdout,
182182
multi_read_mappability,
183183
chr_name)
184+
else:
185+
with open(multi_read_wig_filename, "a") as multi_read_wig_file:
186+
write_multi_read_wig(multi_read_wig_file,
187+
multi_read_mappability,
188+
chr_name)
189+
190+
191+
def main(args):
192+
unique_count_filenames = [Path(filename) for filename in
193+
args.unique_count_files]
194+
kmer_length = args.kmer_length
195+
single_read_bed_filename = args.single_read_bed_file
196+
multi_read_wig_filename = args.multi_read_wig_file
197+
verbose = args.verbose
198+
199+
write_mappability_files(unique_count_filenames,
200+
kmer_length,
201+
single_read_bed_filename,
202+
multi_read_wig_filename,
203+
verbose)

tests/test_unique_to_mappability.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import unittest
2+
3+
from pathlib import Path
4+
from tempfile import NamedTemporaryFile
5+
from util import TEST_DATA_PATH
6+
7+
from newmap.main import (DEFAULT_SUFFIX_ARRAY_COMPRESSION_RATIO,
8+
DEFAULT_KMER_LENGTH_IN_SEED_TABLE)
9+
from newmap.generate_index import generate_fm_index
10+
from newmap.unique_counts import write_unique_counts
11+
from newmap.unique_counts_conversion import write_mappability_files
12+
13+
14+
class TestCountKmers(unittest.TestCase):
15+
genome_index_file = NamedTemporaryFile(mode="w")
16+
genome_index_filename = genome_index_file.name
17+
fasta_filename = str(TEST_DATA_PATH / 'genome.fa')
18+
num_threads = 1
19+
20+
@classmethod
21+
def setUpClass(cls):
22+
generate_fm_index(cls.fasta_filename,
23+
cls.genome_index_filename,
24+
DEFAULT_SUFFIX_ARRAY_COMPRESSION_RATIO,
25+
DEFAULT_KMER_LENGTH_IN_SEED_TABLE)
26+
27+
write_unique_counts(Path(cls.fasta_filename),
28+
Path(cls.genome_index_filename),
29+
15, # Batch size
30+
list(range(4, 11)), # Kmer lengths 4 to 10
31+
0, # Initial search length
32+
[], # Include chr ids
33+
[], # Exclude chr ids
34+
cls.num_threads,
35+
use_binary_search=True)
36+
37+
@classmethod
38+
def tearDownClass(cls):
39+
cls.genome_index_file.close()
40+
41+
def test_mappability_output(self):
42+
unique_count_filenames = [Path('chr1.unique.uint8'),
43+
Path('chr2.unique.uint8')]
44+
kmer_mappability_length = 10
45+
46+
write_mappability_files(unique_count_filenames,
47+
kmer_mappability_length,
48+
"genome.10.bed",
49+
"genome.10.wig",
50+
verbose=False)
51+
52+
53+
if __name__ == '__main__':
54+
unittest.main()

0 commit comments

Comments
 (0)