|
7 | 7 | This is a module intended to be used as a part of a pipeline.
|
8 | 8 |
|
9 | 9 | This can be used individually by calling the command:
|
10 |
| - python /path/to/fundis_summarize.py -i /path/to/input_dir -p 80 |
11 |
| - python /path/to/fundis_summarize.py --input_dir /path/to/input_dir --percent_system_use 80 |
| 10 | + python /path/to/fundis_summarize.py -i /path/to/input_dir -p 50 |
| 11 | + python /path/to/fundis_summarize.py --input /path/to/input_dir --percent_system_use 50 |
12 | 12 | """
|
13 | 13 | from Bio import SeqIO
|
14 | 14 | import pandas as pd
|
|
17 | 17 | import shutil
|
18 | 18 | from tqdm import tqdm
|
19 | 19 | import argparse
|
| 20 | +import platform |
| 21 | + |
| 22 | +# Function to determine which Operating System the code is being executed in |
def check_os():
    """Detect the host operating system and publish the environment settings.

    Sets the module-level globals ``environment_dir`` (root working drive)
    and ``environment_cmd_prefix`` (prefix for shell commands), prints the
    detected platform and chosen directory, and returns the directory.

    Returns:
        str: ``"E:"`` on Windows, ``"/mnt/e"`` on POSIX systems.

    Raises:
        Exception: If the platform is neither Windows nor POSIX/Linux.
    """
    global environment_dir
    global environment_cmd_prefix

    # os.name will return 'posix', 'nt', or 'java'
    os_name = os.name
    # platform.system() will return 'Linux', 'Windows', 'Java', etc.
    platform_system = platform.system()

    if os_name == 'nt' or platform_system == 'Windows':
        # Drive letter is hard-coded; adjust to whatever is appropriate for
        # the Windows system in use.
        environment_dir = "E:"
        # Linux commands are run through Windows Subsystem for Linux (WSL).
        environment_cmd_prefix = "wsl "

    elif os_name == 'posix' or platform_system == 'Linux':
        # NOTE: os.name is 'posix' on every POSIX platform, so this branch
        # is also taken on e.g. macOS, not only on Linux.
        environment_dir = "/mnt/e"
        # Commands run natively here, so no prefix is needed. Assign it
        # explicitly so the global is always defined (and never stale)
        # after this function returns.
        environment_cmd_prefix = ""

    else:
        # If the operating system is neither Windows nor Linux, raise an Exception
        raise Exception("ERROR: OS NOT TESTED WITH THIS CODE.")

    # Print out the detected operating system and the determined environment directory
    print(f'Operating System: {platform_system}')
    print(f'Environment Directory: {environment_dir}')
    return environment_dir
20 | 52 |
|
21 | 53 | # Function to take in a folder containing processed NGSequenceID folders and generate a summary folder for MycoMap
|
22 | 54 | def mycomap_summarize_ngsid_dir(ngsid_dir):
|
23 |
| - print('\nGenerating MycoMap summary folder for {ngsid_dir}...') |
| 55 | + if '.fastq' in ngsid_dir: |
| 56 | + ngsid_dir = ngsid_dir.replace('.fastq','_minibar_NGSID') |
| 57 | + print(f'\nGenerating MycoMap summary folder for {ngsid_dir}...') |
24 | 58 | # Create the summary and FASTQ directories if they don't exist
|
25 | 59 | summary_dir = ngsid_dir.replace("_minibar_NGSID","_Summary")
|
26 | 60 |
|
27 | 61 | # Create pandas DataFrame for sequence information
|
28 | 62 | stats_df = pd.DataFrame(columns=['Filename', 'Length', 'Reads in Consensus', 'Multiple'])
|
29 | 63 |
|
30 | 64 | # Get all directories in the ngsid_dir
|
31 |
| - sample_dirs = [d for d in os.listdir(ngsid_dir) if os.path.isdir(os.path.join(ngsid_dir, d))] |
| 65 | + sample_dirs = [d for d in os.listdir(ngsid_dir) if os.path.isdir(os.path.join(ngsid_dir, d))] |
32 | 66 |
|
| 67 | + # Generate storage folders |
33 | 68 | fastq_dir = f"{summary_dir}/FASTQ Files"
|
34 | 69 | os.makedirs(summary_dir, exist_ok=True)
|
35 | 70 | os.makedirs(fastq_dir, exist_ok=True)
|
36 | 71 |
|
37 |
| - # MAKE LOOK THROUGH SAMPLE DIR IN SAMPLE DIRS |
| 72 | + # Look through current_sample_dir in sample_dirs |
38 | 73 | for current_sample_dir in tqdm(sample_dirs):
|
39 |
| - #current_sample_dir = "E:/Fundis/TEST/sample_HS_ONT03_03_35-HAY-F-001900-iNat155234876-Agaricomycetes_NGSequenceID" |
40 | 74 | current_sample_dir = f'{ngsid_dir}/{current_sample_dir}'
|
41 | 75 | base_name = current_sample_dir.split("sample_")[-1]
|
42 | 76 |
|
43 |
| - # get all directories in the path that match the pattern |
| 77 | + # Establish main variables and directories for processing |
44 | 78 | consensus_dirs = [entry.path for entry in os.scandir(current_sample_dir) if entry.is_dir() and entry.name.startswith('consensus_reference_') and any(os.scandir(entry.path))]
|
45 |
| - |
46 | 79 | consensus_dirs = [entry.replace("\\","/") for entry in consensus_dirs]
|
47 |
| - |
48 | 80 | consensus_fastq_list = []
|
49 |
| - |
50 | 81 | medaka_count = 1
|
51 |
| - |
52 | 82 | reads_in_consensus = 0
|
53 | 83 |
|
54 | 84 | for consensus_dir in consensus_dirs:
|
@@ -156,23 +186,29 @@ def mycomap_summarize_ngsid_dir(ngsid_dir):
|
156 | 186 | # Write the updated records to the combined fasta file
|
157 | 187 | SeqIO.write(records, combined_fasta_file, 'fasta')
|
158 | 188 |
|
159 |
| -def main(args): |
160 |
| - # Set the path to the folder containing the fastq files |
161 |
| - percent_system_use = float(args.percent_system_use)/100 if args.percent_system_use else 0.8 |
162 |
| - input_dir = args.input_dir if args.input_dir else os.path.dirname(os.path.realpath(__file__)) |
163 |
| - mycomap_summarize_ngsid_dir(input_dir) |
164 |
| - print('PASS: Successfully summarized NGSequenceID Folder for MycoMap upload') |
def summarize(args):
    """Run the MycoMap summary step for a set of parsed arguments.

    Args:
        args: Object exposing ``input`` (path to the folder to summarize;
            when empty, the directory containing this script is used) and
            ``percent_system_use`` (optional percent value as a string).

    Returns:
        bool: True when the summary completed, False when any exception
        was raised along the way (the error message is printed).
    """
    try:
        # check_os() sets the module-level environment globals itself and
        # prints the detected platform; its return value is not needed here.
        check_os()

        # The summary step does not use the percentage, but it is still
        # parsed so that a malformed value is reported as an error exactly
        # as before.
        if args.percent_system_use:
            float(args.percent_system_use)

        # Fall back to this script's own directory when no input is given
        input_dir = args.input if args.input else os.path.dirname(os.path.realpath(__file__))
        mycomap_summarize_ngsid_dir(input_dir)
        print('PASS: Successfully summarized NGSequenceID Folder for MycoMap upload')
        return True

    except Exception as e:
        # Best-effort boundary: report the problem and signal failure to
        # the caller instead of propagating the exception.
        print(f"ERROR: There was a problem in summarize: {str(e)}")
        return False
165 | 207 |
|
if __name__ == "__main__":
    # Parse user arguments
    parser = argparse.ArgumentParser(description="Process NGSpeciesID source folder.")
    parser.add_argument('-i', '--input', type=str, help='Path to the NGSpeciesID source folder')
    parser.add_argument('-p', '--percent_system_use', type=str, help='Percent system use written as integer.')
    args = parser.parse_args()

    # summarize() signals failure by returning False rather than raising;
    # turn that into the process exit status so a calling pipeline or shell
    # can detect a failed run (0 = success, 1 = failure).
    raise SystemExit(0 if summarize(args) else 1)
0 commit comments