|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +Created on Wed Nov 29 12:23:15 2023 |
| 4 | +
|
| 5 | +@author: ian.michael.bollinger@gmail.com/researchconsultants@critical.consulting |
| 6 | +
|
| 7 | +This is the Next-Generation Barcoding Species Identifier (NGSpeciesID) main wrapper developed for a modified |
| 8 | +
|
| 9 | +protocol by Stephen Douglas Russell and paid for by the Fungal Diversity Survey (FunDiS). |
| 10 | +
|
| 11 | +Protocol Link: https://www.protocols.io/view/primary-data-analysis-basecalling-demultiplexing-a-dm6gpbm88lzp/v3?step=3 |
| 12 | +""" |
| 13 | + |
| 14 | +# Base Python Imports |
| 15 | +import subprocess, os |
| 16 | + |
| 17 | +# Required Python Imports |
| 18 | +from termcolor import cprint |
| 19 | +from datetime import datetime |
| 20 | + |
| 21 | +# Function to color coded print to console and save to log_file information |
| 22 | +def log_print(input_message, log_file=None): |
| 23 | + """ |
| 24 | + Logs a message to a file and prints it to the console with appropriate coloring. |
| 25 | + |
| 26 | + This function takes a message and logs it to the specified file. Additionally, the message is printed to the |
| 27 | + console, potentially with specific coloring depending on the context. |
| 28 | + |
| 29 | + Parameters: |
| 30 | + input_message (str): Message to be logged and printed. |
| 31 | + log_file (str): Path to the log file. |
| 32 | +
|
| 33 | + Notes: |
| 34 | + - The function uses a global default log file if none is specified. |
| 35 | + - Timestamps each log entry for easy tracking. |
| 36 | + - Utilizes color coding in the console to distinguish between different types of messages (e.g., errors, warnings). |
| 37 | + - Supports color coding for specific message types: NOTE, CMD, ERROR, WARN, and PASS. |
| 38 | + - Falls back to default (white) color if the message type is unrecognized. |
| 39 | + """ |
| 40 | + # Access the global variable |
| 41 | + global DEFAULT_LOG_FILE |
| 42 | + |
| 43 | + # Use the default log file if none specified |
| 44 | + if log_file is None: |
| 45 | + log_file = DEFAULT_LOG_FILE |
| 46 | + |
| 47 | + # Establish current date-time |
| 48 | + now = datetime.now() |
| 49 | + message = f'[{now:%Y-%m-%d %H:%M:%S}]\t{input_message}' |
| 50 | + |
| 51 | + # Determine the print color based on the input_message content |
| 52 | + message_type_dict = {'NOTE': ['blue'], |
| 53 | + 'CMD': ['cyan'], |
| 54 | + 'ERROR': ['red'], |
| 55 | + 'WARN': ['yellow'], |
| 56 | + 'PASS': ['green'],} |
| 57 | + print_color = ['white'] # Default color |
| 58 | + for key, value in message_type_dict.items(): |
| 59 | + if key.lower() in input_message.lower(): |
| 60 | + print_color = value |
| 61 | + break |
| 62 | + |
| 63 | + # Writing the message to the log file |
| 64 | + with open(log_file, 'a') as file: |
| 65 | + print(message, file=file) |
| 66 | + |
| 67 | + # Handling different message types for colored printing |
| 68 | + try: |
| 69 | + cprint(message, print_color[0]) |
| 70 | + except (KeyError, IndexError): |
| 71 | + cprint(message, print_color[1] if len(print_color) > 1 else 'white') |
| 72 | + |
| 73 | + |
| 74 | +from FunDiS_hap_phase import phase_haplotypes |
| 75 | + |
| 76 | +# Wrapper function to run ngsid_prep in a separate thread |
| 77 | +def run_ngsid_prep(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp, hap_phase_bool, ngsid_output_dir): |
| 78 | + """ |
| 79 | + Wrapper function to run NGSpeciesID preparation in a separate thread. |
| 80 | +
|
| 81 | + This function handles the NGSpeciesID preparation process. It executes the preparation steps in a separate thread |
| 82 | + and logs any errors encountered during execution. |
| 83 | +
|
| 84 | + Parameters: |
| 85 | + ngsid_primers_path (str): Path to the NGSpeciesID primers file. |
| 86 | + input_fastq_file (str): Path to the input FASTQ file. |
| 87 | + sample_size (int): The sample size parameter for NGSpeciesID. |
| 88 | + min_length_bp (int): Minimum length in base pairs for NGSpeciesID. |
| 89 | + max_std_dev_bp (int): Maximum standard deviation in base pairs for NGSpeciesID. |
| 90 | + hap_phase_bool (bool): Boolean to indicate if haplotype phasing is to be performed. |
| 91 | + ngsid_output_dir (str): Directory path for storing NGSpeciesID output files. |
| 92 | +
|
| 93 | + Global Variables: |
| 94 | + output_area (str): A global variable used for logging output messages. |
| 95 | +
|
| 96 | + Notes: |
| 97 | + - Captures and logs exceptions during NGSpeciesID preparation. |
| 98 | + """ |
| 99 | + global DEFAULT_LOG_FILE |
| 100 | + try: |
| 101 | + ngsid_prep(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp, hap_phase_bool, ngsid_output_dir) |
| 102 | + except Exception as e: |
| 103 | + log_print( f"ERROR:\t{str(e)}") |
| 104 | + |
| 105 | +# Function to perform NGSpeciesID preparation |
| 106 | +def ngsid_prep(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp, hap_phase_bool, ngsid_output_dir): |
| 107 | + """ |
| 108 | + Performs NGSpeciesID preparation. |
| 109 | +
|
| 110 | + This function carries out the necessary steps for NGSpeciesID preparation. It processes FASTQ files, |
| 111 | + executes NGSpeciesID, and optionally performs haplotype phasing. |
| 112 | +
|
| 113 | + Parameters: |
| 114 | + ngsid_primers_path (str): Path to the NGSpeciesID primers file. |
| 115 | + input_fastq_file (str): Path to the input FASTQ file. |
| 116 | + sample_size (int): The sample size parameter for NGSpeciesID. |
| 117 | + min_length_bp (int): Minimum length in base pairs for NGSpeciesID. |
| 118 | + max_std_dev_bp (int): Maximum standard deviation in base pairs for NGSpeciesID. |
| 119 | + hap_phase_bool (bool): Boolean to indicate if haplotype phasing is to be performed. |
| 120 | + ngsid_output_dir (str): Directory path for storing NGSpeciesID output files. |
| 121 | +
|
| 122 | + Global Variables: |
| 123 | + output_area (str): A global variable used for logging output messages. |
| 124 | + cpu_threads (int): Number of CPU threads to be used in processing. |
| 125 | +
|
| 126 | + Notes: |
| 127 | + - Processes each .fastq file in the specified output directory. |
| 128 | + - Logs progress and errors during the preparation process. |
| 129 | + """ |
| 130 | + global cpu_threads, DEFAULT_LOG_FILE |
| 131 | + # For any .fastq file in the ngsid_output_dir run the NGSID function on it |
| 132 | + fastq_files_list = [] |
| 133 | + |
| 134 | + # Iterate over all files in the given folder |
| 135 | + for file in os.listdir(ngsid_output_dir): |
| 136 | + # Check if the file ends with .fastq |
| 137 | + if file.endswith('.fastq'): |
| 138 | + # Add the file to the list, with full path |
| 139 | + fastq_files_list.append(os.path.join(ngsid_output_dir, file)) |
| 140 | + |
| 141 | + for input_fastq_file in fastq_files_list: |
| 142 | + ngsid_output_dir = ngsid_fastq(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp) |
| 143 | + if hap_phase_bool == True: |
| 144 | + try: |
| 145 | + phased_fasta_file = phase_haplotypes(ngsid_output_dir, cpu_threads) |
| 146 | + except TypeError: |
| 147 | + log_print( f"Skipping Haplotype Phasing, unable to process: {input_fastq_file} with NGSpeciesID") |
| 148 | + continue |
| 149 | + log_print(f"PASS:\tSuccessfully processed {ngsid_output_dir}") |
| 150 | + |
| 151 | +def ngsid_fastq(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp): |
| 152 | + """ |
| 153 | + Processes a FASTQ file with NGSpeciesID. |
| 154 | +
|
| 155 | + This function handles the processing of a single FASTQ file using the NGSpeciesID tool. It constructs and executes |
| 156 | + the NGSpeciesID command with the provided parameters. |
| 157 | +
|
| 158 | + Parameters: |
| 159 | + ngsid_primers_path (str): Path to the NGSpeciesID primers file. |
| 160 | + input_fastq_file (str): Path to the input FASTQ file. |
| 161 | + sample_size (int): The sample size parameter for NGSpeciesID. |
| 162 | + min_length_bp (int): Minimum length in base pairs for NGSpeciesID. |
| 163 | + max_std_dev_bp (int): Maximum standard deviation in base pairs for NGSpeciesID. |
| 164 | +
|
| 165 | + Global Variables: |
| 166 | + output_area (str): A global variable used for logging output messages. |
| 167 | + cpu_threads (int): Number of CPU threads to be used in processing. |
| 168 | +
|
| 169 | + Returns: |
| 170 | + str: The directory path where the NGSpeciesID output is stored. |
| 171 | +
|
| 172 | + Notes: |
| 173 | + - Logs the constructed command and the outcome of the process. |
| 174 | + - Handles and logs medaka consensus calls. |
| 175 | + """ |
| 176 | + global cpu_threads, DEFAULT_LOG_FILE |
| 177 | + log_print( f"Processing {input_fastq_file} with NGSpeciesID...") |
| 178 | + ngsid_output_dir = input_fastq_file.replace(".fastq","") |
| 179 | + |
| 180 | + NGSID_cmd = ["NGSpeciesID", "--ont", "--consensus", |
| 181 | + "--sample_size", str(sample_size), |
| 182 | + "--m", str(min_length_bp), |
| 183 | + "--s", str(max_std_dev_bp), |
| 184 | + "--racon", "--racon_iter", str(3), |
| 185 | + "--primer_file", ngsid_primers_path, |
| 186 | + "--fastq", input_fastq_file, |
| 187 | + "--outfolder", ngsid_output_dir] |
| 188 | + |
| 189 | + log_print( f"CMD:\t{' '.join(NGSID_cmd)}") |
| 190 | + process = subprocess.run(NGSID_cmd, text=True) |
| 191 | + if process.returncode != 0: |
| 192 | + log_print( f"ERROR:\t{process.stderr}") |
| 193 | + return None |
| 194 | + else: |
| 195 | + log_print( f"PASS:\tSuccessfully processed {input_fastq_file} with NGSpeciesID") |
| 196 | + |
| 197 | + racon_folders = [folder for folder in os.listdir(ngsid_output_dir) |
| 198 | + if os.path.isdir(os.path.join(ngsid_output_dir, folder)) and 'racon' in folder] |
| 199 | + |
| 200 | + for folder in racon_folders: |
| 201 | + base_number = folder.split("_")[-1] |
| 202 | + medaka_output_dir = f"{ngsid_output_dir}/medaka_cl_id_{base_number}" |
| 203 | + |
| 204 | + if not os.path.exists(medaka_output_dir): |
| 205 | + os.makedirs(medaka_output_dir) |
| 206 | + |
| 207 | + reads_to_consensus_fastq = f"{ngsid_output_dir}/reads_to_consensus_{base_number}.fastq" |
| 208 | + consensus_ref_fasta = f"{ngsid_output_dir}/consensus_reference_{base_number}.fasta" |
| 209 | + call_medaka_script(reads_to_consensus_fastq, consensus_ref_fasta, medaka_output_dir) |
| 210 | + |
| 211 | + return ngsid_output_dir |
| 212 | + |
| 213 | +def call_medaka_script(reads_to_consensus_fastq, consensus_ref_fasta, medaka_output_dir): |
| 214 | + """ |
| 215 | + Calls a script to run Medaka on generated consensus reads. |
| 216 | +
|
| 217 | + This function handles the execution of a shell script to run Medaka, a tool for generating consensus sequences from |
| 218 | + sequencing data. It logs the process and captures the output and errors of the script execution. |
| 219 | +
|
| 220 | + Parameters: |
| 221 | + reads_to_consensus_fastq (str): Path to the FASTQ file containing reads for consensus. |
| 222 | + consensus_ref_fasta (str): Path to the FASTA file used as a reference for consensus calling. |
| 223 | + medaka_output_dir (str): Directory path for storing Medaka output files. |
| 224 | +
|
| 225 | + Global Variables: |
| 226 | + output_area (str): A global variable used for logging output messages. |
| 227 | + cpu_threads (int): Number of CPU threads to be used in processing. |
| 228 | +
|
| 229 | + Notes: |
| 230 | + - Executes a shell script for running Medaka and handles the subprocess communication. |
| 231 | + - Logs the output and errors from the Medaka script execution. |
| 232 | + """ |
| 233 | + global cpu_threads, DEFAULT_LOG_FILE |
| 234 | + log_print(f"Running medaka on generated consensus reads: {reads_to_consensus_fastq}...") |
| 235 | + medaka_shell_path = "/mnt/d/FunDiS/run_medaka.sh" |
| 236 | + script_command = [medaka_shell_path, reads_to_consensus_fastq, consensus_ref_fasta, medaka_output_dir, str(cpu_threads)] |
| 237 | + |
| 238 | + process = subprocess.Popen(script_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) |
| 239 | + stdout, stderr = process.communicate() |
| 240 | + |
| 241 | + if stdout: |
| 242 | + log_print("NOTE:\n" + stdout) |
| 243 | + if stderr: |
| 244 | + log_print("ERROR:\n" + stderr) |
| 245 | + |
| 246 | + if process.returncode != 0: |
| 247 | + log_print("ERROR:\tSubprocess returned with error") |
| 248 | + else: |
| 249 | + log_print("PASS:\tSuccessfully ran medaka on generated consensus reads") |
| 250 | + |
| 251 | +if __name__ == "__main__": |
| 252 | + print("UNLOGGED DEBUG:\tUNDEVELOPED FunDiS_NGSpeciesID.py DEBUG AREA") |
0 commit comments