Skip to content

Commit 8ce5852

Browse files
authored
Create FunDiS_NGSpeciesID.py
Uploaded modern version Signed-off-by: Ian M. B. <99409346+iPsychonaut@users.noreply.github.com>
1 parent c95c959 commit 8ce5852

File tree

1 file changed

+252
-0
lines changed

1 file changed

+252
-0
lines changed

FunDiS_NGSpeciesID.py

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Wed Nov 29 12:23:15 2023
4+
5+
@author: ian.michael.bollinger@gmail.com/researchconsultants@critical.consulting
6+
7+
This is the Next-Generation Barcoding Species Identifier (NGSpeciesID) main wrapper developed for a modified
8+
9+
protocol by Stephen Douglas Russell and paid for by the Fungal Diversity Survey (FunDiS).
10+
11+
Protocol Link: https://www.protocols.io/view/primary-data-analysis-basecalling-demultiplexing-a-dm6gpbm88lzp/v3?step=3
12+
"""
13+
14+
# Base Python Imports
15+
import subprocess, os
16+
17+
# Required Python Imports
18+
from termcolor import cprint
19+
from datetime import datetime
20+
21+
# Function to color coded print to console and save to log_file information
22+
def log_print(input_message, log_file=None):
23+
"""
24+
Logs a message to a file and prints it to the console with appropriate coloring.
25+
26+
This function takes a message and logs it to the specified file. Additionally, the message is printed to the
27+
console, potentially with specific coloring depending on the context.
28+
29+
Parameters:
30+
input_message (str): Message to be logged and printed.
31+
log_file (str): Path to the log file.
32+
33+
Notes:
34+
- The function uses a global default log file if none is specified.
35+
- Timestamps each log entry for easy tracking.
36+
- Utilizes color coding in the console to distinguish between different types of messages (e.g., errors, warnings).
37+
- Supports color coding for specific message types: NOTE, CMD, ERROR, WARN, and PASS.
38+
- Falls back to default (white) color if the message type is unrecognized.
39+
"""
40+
# Access the global variable
41+
global DEFAULT_LOG_FILE
42+
43+
# Use the default log file if none specified
44+
if log_file is None:
45+
log_file = DEFAULT_LOG_FILE
46+
47+
# Establish current date-time
48+
now = datetime.now()
49+
message = f'[{now:%Y-%m-%d %H:%M:%S}]\t{input_message}'
50+
51+
# Determine the print color based on the input_message content
52+
message_type_dict = {'NOTE': ['blue'],
53+
'CMD': ['cyan'],
54+
'ERROR': ['red'],
55+
'WARN': ['yellow'],
56+
'PASS': ['green'],}
57+
print_color = ['white'] # Default color
58+
for key, value in message_type_dict.items():
59+
if key.lower() in input_message.lower():
60+
print_color = value
61+
break
62+
63+
# Writing the message to the log file
64+
with open(log_file, 'a') as file:
65+
print(message, file=file)
66+
67+
# Handling different message types for colored printing
68+
try:
69+
cprint(message, print_color[0])
70+
except (KeyError, IndexError):
71+
cprint(message, print_color[1] if len(print_color) > 1 else 'white')
72+
73+
74+
from FunDiS_hap_phase import phase_haplotypes
75+
76+
# Wrapper function to run ngsid_prep in a separate thread
77+
def run_ngsid_prep(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp, hap_phase_bool, ngsid_output_dir):
78+
"""
79+
Wrapper function to run NGSpeciesID preparation in a separate thread.
80+
81+
This function handles the NGSpeciesID preparation process. It executes the preparation steps in a separate thread
82+
and logs any errors encountered during execution.
83+
84+
Parameters:
85+
ngsid_primers_path (str): Path to the NGSpeciesID primers file.
86+
input_fastq_file (str): Path to the input FASTQ file.
87+
sample_size (int): The sample size parameter for NGSpeciesID.
88+
min_length_bp (int): Minimum length in base pairs for NGSpeciesID.
89+
max_std_dev_bp (int): Maximum standard deviation in base pairs for NGSpeciesID.
90+
hap_phase_bool (bool): Boolean to indicate if haplotype phasing is to be performed.
91+
ngsid_output_dir (str): Directory path for storing NGSpeciesID output files.
92+
93+
Global Variables:
94+
output_area (str): A global variable used for logging output messages.
95+
96+
Notes:
97+
- Captures and logs exceptions during NGSpeciesID preparation.
98+
"""
99+
global DEFAULT_LOG_FILE
100+
try:
101+
ngsid_prep(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp, hap_phase_bool, ngsid_output_dir)
102+
except Exception as e:
103+
log_print( f"ERROR:\t{str(e)}")
104+
105+
# Function to perform NGSpeciesID preparation
106+
def ngsid_prep(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp, hap_phase_bool, ngsid_output_dir):
107+
"""
108+
Performs NGSpeciesID preparation.
109+
110+
This function carries out the necessary steps for NGSpeciesID preparation. It processes FASTQ files,
111+
executes NGSpeciesID, and optionally performs haplotype phasing.
112+
113+
Parameters:
114+
ngsid_primers_path (str): Path to the NGSpeciesID primers file.
115+
input_fastq_file (str): Path to the input FASTQ file.
116+
sample_size (int): The sample size parameter for NGSpeciesID.
117+
min_length_bp (int): Minimum length in base pairs for NGSpeciesID.
118+
max_std_dev_bp (int): Maximum standard deviation in base pairs for NGSpeciesID.
119+
hap_phase_bool (bool): Boolean to indicate if haplotype phasing is to be performed.
120+
ngsid_output_dir (str): Directory path for storing NGSpeciesID output files.
121+
122+
Global Variables:
123+
output_area (str): A global variable used for logging output messages.
124+
cpu_threads (int): Number of CPU threads to be used in processing.
125+
126+
Notes:
127+
- Processes each .fastq file in the specified output directory.
128+
- Logs progress and errors during the preparation process.
129+
"""
130+
global cpu_threads, DEFAULT_LOG_FILE
131+
# For any .fastq file in the ngsid_output_dir run the NGSID function on it
132+
fastq_files_list = []
133+
134+
# Iterate over all files in the given folder
135+
for file in os.listdir(ngsid_output_dir):
136+
# Check if the file ends with .fastq
137+
if file.endswith('.fastq'):
138+
# Add the file to the list, with full path
139+
fastq_files_list.append(os.path.join(ngsid_output_dir, file))
140+
141+
for input_fastq_file in fastq_files_list:
142+
ngsid_output_dir = ngsid_fastq(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp)
143+
if hap_phase_bool == True:
144+
try:
145+
phased_fasta_file = phase_haplotypes(ngsid_output_dir, cpu_threads)
146+
except TypeError:
147+
log_print( f"Skipping Haplotype Phasing, unable to process: {input_fastq_file} with NGSpeciesID")
148+
continue
149+
log_print(f"PASS:\tSuccessfully processed {ngsid_output_dir}")
150+
151+
def ngsid_fastq(ngsid_primers_path, input_fastq_file, sample_size, min_length_bp, max_std_dev_bp):
152+
"""
153+
Processes a FASTQ file with NGSpeciesID.
154+
155+
This function handles the processing of a single FASTQ file using the NGSpeciesID tool. It constructs and executes
156+
the NGSpeciesID command with the provided parameters.
157+
158+
Parameters:
159+
ngsid_primers_path (str): Path to the NGSpeciesID primers file.
160+
input_fastq_file (str): Path to the input FASTQ file.
161+
sample_size (int): The sample size parameter for NGSpeciesID.
162+
min_length_bp (int): Minimum length in base pairs for NGSpeciesID.
163+
max_std_dev_bp (int): Maximum standard deviation in base pairs for NGSpeciesID.
164+
165+
Global Variables:
166+
output_area (str): A global variable used for logging output messages.
167+
cpu_threads (int): Number of CPU threads to be used in processing.
168+
169+
Returns:
170+
str: The directory path where the NGSpeciesID output is stored.
171+
172+
Notes:
173+
- Logs the constructed command and the outcome of the process.
174+
- Handles and logs medaka consensus calls.
175+
"""
176+
global cpu_threads, DEFAULT_LOG_FILE
177+
log_print( f"Processing {input_fastq_file} with NGSpeciesID...")
178+
ngsid_output_dir = input_fastq_file.replace(".fastq","")
179+
180+
NGSID_cmd = ["NGSpeciesID", "--ont", "--consensus",
181+
"--sample_size", str(sample_size),
182+
"--m", str(min_length_bp),
183+
"--s", str(max_std_dev_bp),
184+
"--racon", "--racon_iter", str(3),
185+
"--primer_file", ngsid_primers_path,
186+
"--fastq", input_fastq_file,
187+
"--outfolder", ngsid_output_dir]
188+
189+
log_print( f"CMD:\t{' '.join(NGSID_cmd)}")
190+
process = subprocess.run(NGSID_cmd, text=True)
191+
if process.returncode != 0:
192+
log_print( f"ERROR:\t{process.stderr}")
193+
return None
194+
else:
195+
log_print( f"PASS:\tSuccessfully processed {input_fastq_file} with NGSpeciesID")
196+
197+
racon_folders = [folder for folder in os.listdir(ngsid_output_dir)
198+
if os.path.isdir(os.path.join(ngsid_output_dir, folder)) and 'racon' in folder]
199+
200+
for folder in racon_folders:
201+
base_number = folder.split("_")[-1]
202+
medaka_output_dir = f"{ngsid_output_dir}/medaka_cl_id_{base_number}"
203+
204+
if not os.path.exists(medaka_output_dir):
205+
os.makedirs(medaka_output_dir)
206+
207+
reads_to_consensus_fastq = f"{ngsid_output_dir}/reads_to_consensus_{base_number}.fastq"
208+
consensus_ref_fasta = f"{ngsid_output_dir}/consensus_reference_{base_number}.fasta"
209+
call_medaka_script(reads_to_consensus_fastq, consensus_ref_fasta, medaka_output_dir)
210+
211+
return ngsid_output_dir
212+
213+
def call_medaka_script(reads_to_consensus_fastq, consensus_ref_fasta, medaka_output_dir):
214+
"""
215+
Calls a script to run Medaka on generated consensus reads.
216+
217+
This function handles the execution of a shell script to run Medaka, a tool for generating consensus sequences from
218+
sequencing data. It logs the process and captures the output and errors of the script execution.
219+
220+
Parameters:
221+
reads_to_consensus_fastq (str): Path to the FASTQ file containing reads for consensus.
222+
consensus_ref_fasta (str): Path to the FASTA file used as a reference for consensus calling.
223+
medaka_output_dir (str): Directory path for storing Medaka output files.
224+
225+
Global Variables:
226+
output_area (str): A global variable used for logging output messages.
227+
cpu_threads (int): Number of CPU threads to be used in processing.
228+
229+
Notes:
230+
- Executes a shell script for running Medaka and handles the subprocess communication.
231+
- Logs the output and errors from the Medaka script execution.
232+
"""
233+
global cpu_threads, DEFAULT_LOG_FILE
234+
log_print(f"Running medaka on generated consensus reads: {reads_to_consensus_fastq}...")
235+
medaka_shell_path = "/mnt/d/FunDiS/run_medaka.sh"
236+
script_command = [medaka_shell_path, reads_to_consensus_fastq, consensus_ref_fasta, medaka_output_dir, str(cpu_threads)]
237+
238+
process = subprocess.Popen(script_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
239+
stdout, stderr = process.communicate()
240+
241+
if stdout:
242+
log_print("NOTE:\n" + stdout)
243+
if stderr:
244+
log_print("ERROR:\n" + stderr)
245+
246+
if process.returncode != 0:
247+
log_print("ERROR:\tSubprocess returned with error")
248+
else:
249+
log_print("PASS:\tSuccessfully ran medaka on generated consensus reads")
250+
251+
if __name__ == "__main__":
252+
print("UNLOGGED DEBUG:\tUNDEVELOPED FunDiS_NGSpeciesID.py DEBUG AREA")

0 commit comments

Comments
 (0)