Add option to convert many unique files to a single mappability output

EricR86 · EricR86 · commit 4a2505a2151a · 2024-12-03T12:12:32.000-05:00
Now a glob or equivalent can be used on the shell to specify multiple
unique filenames, all of which will have their output appended to a single
mappability output file
diff --git a/docs/source/commands.rst b/docs/source/commands.rst
@@ -148,7 +148,7 @@ with some diminishing returns afterwards.
 --------------------
 generate-mappability
 --------------------
-Generates mappability files from a given ``unique`` file (see
+Generates mappability files from one or more given ``unique`` files (see
 :ref:`unique-file-format`). There are two types of mappability files that can
 be generated:
 
@@ -167,6 +167,13 @@ Options
 
     Only ``single-read-bed-file`` or ``multi-read-wig-file`` can output to ``stdout`` when both are specified on the command line.
 
+Positional Arguments
+--------------------
+- ``unique_count_files``: One or more unique count files to generate mappability
+  from. The resulting mappability from each unique file will be appended to
+  the files specified by the ``single-read-bed-file`` and
+  ``multi-read-wig-file`` options.
+
 
 Mappability datasets
 ^^^^^^^^^^^^^^^^^^^^
@@ -202,4 +209,4 @@ Example:
 
 .. code-block:: console
 
-    $ newmap generate-mappability -k 24 -m k24_multiread_mappability.wig -s k24_singleread_mappability.bed chr1.unique.uint8
+    $ newmap generate-mappability -k 24 -m k24_multiread_mappability.wig -s k24_singleread_mappability.bed chr*.unique.uint8
diff --git a/newmap/main.py b/newmap/main.py
@@ -154,8 +154,9 @@ def parse_subcommands():
         func=unique_counts_conversion.main)
 
     generate_mappability_parser.add_argument(
-        "unique_count_file",
-        help="Unique count file to convert to bed file")
+        "unique_count_files",
+        nargs="+",  # NB: One or more unique files
+        help="One or more unique count files to convert to mappability files")
 
     generate_mappability_parser.add_argument(
         "--kmer-length", "-k",
diff --git a/newmap/unique_counts_conversion.py b/newmap/unique_counts_conversion.py
@@ -97,12 +97,11 @@ def write_multi_read_wig(wig_file: TextIO,
         wig_file.write("{}\n".format(value))
 
 
-def main(args):
-    unique_count_filename = Path(args.unique_count_file)
-    kmer_length = args.kmer_length
-    single_read_bed_filename = args.single_read_bed_file
-    multi_read_wig_filename = args.multi_read_wig_file
-    verbose = args.verbose
+def write_mappability_files(unique_count_filenames: list[Path],
+                            kmer_length: int,
+                            single_read_bed_filename: str,  # Might be stdout
+                            multi_read_wig_filename: str,  # Might be stdout
+                            verbose: bool):
 
     # Error if both single-read and multi-read output files are standard output
     if (single_read_bed_filename == STDOUT_FILENAME and
@@ -114,70 +113,91 @@ def main(args):
           not multi_read_wig_filename):
         raise ValueError("Must specify at least one output file")
 
-    # Get the chromosome name from the unique length filename
-    # NB: Assume the chromosome name is the the entire string preceding the
-    # ".unique*" part of the unique_count_filename (may contain periods)
-    file_basename = unique_count_filename.name
-    chr_name = \
-        file_basename[:file_basename.find(CHROMOSOME_FILENAME_DELIMITER)]
-
-    # Get the data type from the unique length filename suffix
-    data_type_string = unique_count_filename.suffix
-
-    if data_type_string == ".uint8":
-        data_type = np.uint8
-    elif data_type_string == ".uint16":
-        data_type = np.uint16
-    elif data_type_string == ".uint32":
-        data_type = np.uint32
-    else:
-        raise ValueError(f"Unknown extension on unique length file: "
-                         f"\"{data_type_string}\"")
-
-    # NB: The single-read mappability is defined for the entire sequence where
-    # a uniquely mappable k-mer would cover. So if a k-mer is uniquely mappable
-    # starting at position i, then the single read mappability would be 1 for
-    # all positions i to i + kmer_length - 1
-    # It follows that the multi-read mappability covers the same positions as
-    # the single-read, so any non-zero value would be considered single-read
-    # mappable
-    verbose_print(verbose,
-                  f"Calculating mappability regions from minimum unique k-mer "
-                  f"lengths in file: {unique_count_filename}")
-
-    multi_read_mappability = create_multiread_mappability_from_unique_file(
-                             unique_count_filename,
-                             kmer_length,
-                             data_type)  # type: ignore
-
-    verbose_print(verbose, "Chromosome size:")
-    verbose_print(verbose,
-                  "{}\t{}".format(chr_name, multi_read_mappability.shape[0]))
-
-    if single_read_bed_filename:
-        verbose_print(verbose, "Writing out single-read mappability regions")
-
-        if single_read_bed_filename == STDOUT_FILENAME:
-            write_single_read_bed(sys.stdout,
-                                  kmer_length,
-                                  multi_read_mappability,
-                                  chr_name)
+    # For every unique length file specified
+    for unique_count_filename in unique_count_filenames:
+        # Get the chromosome name from the unique length filename
+        # NB: Assume the chromosome name is the entire string preceding the
+        # ".unique*" part of the unique_count_filename (may contain periods)
+        file_basename = unique_count_filename.name
+        chr_name = \
+            file_basename[:file_basename.find(CHROMOSOME_FILENAME_DELIMITER)]
+
+        # Get the data type from the unique length filename suffix
+        data_type_string = unique_count_filename.suffix
+
+        if data_type_string == ".uint8":
+            data_type = np.uint8
+        elif data_type_string == ".uint16":
+            data_type = np.uint16
+        elif data_type_string == ".uint32":
+            data_type = np.uint32
         else:
-            with open(single_read_bed_filename, "w") as single_read_bed_file:
-                write_single_read_bed(single_read_bed_file,
+            raise ValueError(f"Unknown extension on unique length file: "
+                             f"\"{data_type_string}\"")
+
+        # NB: The single-read mappability is defined for the entire sequence
+        # where a uniquely mappable k-mer would cover. So if a k-mer is
+        # uniquely mappable starting at position i, then the single read
+        # mappability would be 1 for all positions i to i + kmer_length - 1
+        # It follows that the multi-read mappability covers the same positions
+        # as the single-read, so any non-zero value would be considered
+        # single-read mappable
+        verbose_print(verbose, f"Calculating mappability regions from minimum "
+                               f"unique k-mer lengths in file: "
+                               f"{unique_count_filename}")
+
+        multi_read_mappability = create_multiread_mappability_from_unique_file(
+                                 unique_count_filename,
+                                 kmer_length,
+                                 data_type)  # type: ignore
+
+        verbose_print(verbose, "Chromosome size:")
+        verbose_print(verbose,
+                      "{}\t{}".format(chr_name,
+                                      multi_read_mappability.shape[0]))
+
+        if single_read_bed_filename:
+            verbose_print(verbose, f"Appending single-read mappability "
+                                   f"regions to {single_read_bed_filename}")
+
+            if single_read_bed_filename == STDOUT_FILENAME:
+                write_single_read_bed(sys.stdout,
                                       kmer_length,
                                       multi_read_mappability,
                                       chr_name)
-
-    if multi_read_wig_filename:
-        verbose_print(verbose, "Writing out multi-read mappability regions")
-
-        if multi_read_wig_filename == STDOUT_FILENAME:
-            write_multi_read_wig(sys.stdout,
-                                 multi_read_mappability,
-                                 chr_name)
-        else:
-            with open(multi_read_wig_filename, "w") as multi_read_wig_file:
-                write_multi_read_wig(multi_read_wig_file,
+            else:
+                with open(single_read_bed_filename, "a") as \
+                          single_read_bed_file:
+                    write_single_read_bed(single_read_bed_file,
+                                          kmer_length,
+                                          multi_read_mappability,
+                                          chr_name)
+
+        if multi_read_wig_filename:
+            verbose_print(verbose, f"Appending multi-read mappability regions"
+                                   f" to {multi_read_wig_filename}")
+
+            if multi_read_wig_filename == STDOUT_FILENAME:
+                write_multi_read_wig(sys.stdout,
                                      multi_read_mappability,
                                      chr_name)
+            else:
+                with open(multi_read_wig_filename, "a") as multi_read_wig_file:
+                    write_multi_read_wig(multi_read_wig_file,
+                                         multi_read_mappability,
+                                         chr_name)
+
+
+def main(args):
+    unique_count_filenames = [Path(filename) for filename in
+                              args.unique_count_files]
+    kmer_length = args.kmer_length
+    single_read_bed_filename = args.single_read_bed_file
+    multi_read_wig_filename = args.multi_read_wig_file
+    verbose = args.verbose
+
+    write_mappability_files(unique_count_filenames,
+                            kmer_length,
+                            single_read_bed_filename,
+                            multi_read_wig_filename,
+                            verbose)
diff --git a/tests/test_unique_to_mappability.py b/tests/test_unique_to_mappability.py
@@ -0,0 +1,54 @@
+import unittest
+
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from util import TEST_DATA_PATH
+
+from newmap.main import (DEFAULT_SUFFIX_ARRAY_COMPRESSION_RATIO,
+                         DEFAULT_KMER_LENGTH_IN_SEED_TABLE)
+from newmap.generate_index import generate_fm_index
+from newmap.unique_counts import write_unique_counts
+from newmap.unique_counts_conversion import write_mappability_files
+
+
+class TestCountKmers(unittest.TestCase):
+    genome_index_file = NamedTemporaryFile(mode="w")
+    genome_index_filename = genome_index_file.name
+    fasta_filename = str(TEST_DATA_PATH / 'genome.fa')
+    num_threads = 1
+
+    @classmethod
+    def setUpClass(cls):
+        generate_fm_index(cls.fasta_filename,
+                          cls.genome_index_filename,
+                          DEFAULT_SUFFIX_ARRAY_COMPRESSION_RATIO,
+                          DEFAULT_KMER_LENGTH_IN_SEED_TABLE)
+
+        write_unique_counts(Path(cls.fasta_filename),
+                            Path(cls.genome_index_filename),
+                            15,  # Batch size
+                            list(range(4, 11)),  # Kmer lengths 4 to 10
+                            0,  # Initial search length
+                            [],  # Include chr ids
+                            [],  # Exclude chr ids
+                            cls.num_threads,
+                            use_binary_search=True)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.genome_index_file.close()
+
+    def test_mappability_output(self):
+        unique_count_filenames = [Path('chr1.unique.uint8'),
+                                  Path('chr2.unique.uint8')]
+        kmer_mappability_length = 10
+
+        write_mappability_files(unique_count_filenames,
+                                kmer_mappability_length,
+                                "genome.10.bed",
+                                "genome.10.wig",
+                                verbose=False)
+
+
+if __name__ == '__main__':
+    unittest.main()