@@ -78,6 +78,8 @@ rule pre_QC:
78
78
"""
79
79
80
80
# Run BBMap's Clumpify to remove optical duplicates pre-mapping
81
+ ## Please adjust the 'dupedist' value in scripts/clumpify_OpDup.sh according to your sequencing platform
82
+ ## (recommendations included within the script).
81
83
rule clumpify :
82
84
input :
83
85
trimmed_1 = rules .pre_QC .output .trimmed_1
@@ -177,6 +179,7 @@ rule addRG:
177
179
sh scripts/addRG.sh {input.pass2_bam} {params.meta} scripts/PICARD_addRG.sh
178
180
"""
179
181
182
+ # Filter BAM files based on quality with samtools
180
183
rule filterBAM :
181
184
input :
182
185
withRG = rules .addRG .output .withRG
@@ -194,6 +197,7 @@ rule filterBAM:
194
197
samtools view -bq 20 {input.withRG} > {output.filt}
195
198
"""
196
199
200
+ # Sort BAM files, then mark and remove duplicates
197
201
rule sortMarkDup :
198
202
input :
199
203
filt = rules .filterBAM .output .filt
@@ -211,6 +215,7 @@ rule sortMarkDup:
211
215
sh scripts/sortMarkDup.sh {input.filt}
212
216
"""
213
217
218
+ # Split reads that contain Ns in their CIGAR string
214
219
rule splitN :
215
220
input :
216
221
nodup = rules .sortMarkDup .output .nodup
@@ -231,6 +236,7 @@ rule splitN:
231
236
sh scripts/splitncigar.sh {input.nodup} {params.genome_fa}
232
237
"""
233
238
239
+ # Perform base recalibration (get the recalibration table)
234
240
rule baseRecalib :
235
241
input :
236
242
nodup = rules .splitN .output .ncigar
@@ -253,6 +259,7 @@ rule baseRecalib:
253
259
sh scripts/base_recalibrator.sh {input.nodup} {params.genome_fa} {params.indel1} {params.indel2}
254
260
"""
255
261
262
+ # Apply base recalibration
256
263
rule applybqsr :
257
264
input :
258
265
nodup = rules .splitN .output .ncigar ,
@@ -274,6 +281,7 @@ rule applybqsr:
274
281
sh scripts/apply_bqsr.sh {input.nodup} {params.genome_fa} {input.recaltab}
275
282
"""
276
283
284
+ # Call germline SNPs and indels via local re-assembly of haplotypes
277
285
rule haploCall :
278
286
input :
279
287
finalbam = rules .applybqsr .output .finalbam
@@ -294,6 +302,7 @@ rule haploCall:
294
302
sh scripts/haploCall.sh {input.finalbam} {params.genome_fa}
295
303
"""
296
304
305
+ # Get list of GVCF files for merging
297
306
rule getListGeno :
298
307
input :
299
308
expand ("vars/{name}first.g.vcf.gz" , name = SAMPLES ["name" ])
@@ -306,6 +315,7 @@ rule getListGeno:
306
315
echo "{input}" | sed 's/vars/-V vars/g' > {output.gvcf_list}
307
316
"""
308
317
318
+ # Merge GVCFs and call genotype
309
319
rule genotype :
310
320
input :
311
321
gvcf_list = rules .getListGeno .output .gvcf_list
@@ -328,6 +338,7 @@ rule genotype:
328
338
sh scripts/genotypegvcfs.sh {input.gvcf_list} {params.genome_fa} {params.interval}
329
339
"""
330
340
341
+ # Initial filtering of the called variants
331
342
rule filGen :
332
343
input :
333
344
genotyped = rules .genotype .output .genotyped
@@ -348,6 +359,7 @@ rule filGen:
348
359
sh scripts/variantFil.sh {input.genotyped} {params.genome_fa}
349
360
"""
350
361
362
+ # Select variants based on previous filtering
351
363
rule selGen :
352
364
input :
353
365
varFil = rules .filGen .output .varFil
@@ -368,6 +380,7 @@ rule selGen:
368
380
sh scripts/variantSel.sh {input.varFil} {params.genome_fa}
369
381
"""
370
382
383
+ # Clean up VCF file
371
384
rule plink_clean :
372
385
input :
373
386
varSel = rules .selGen .output .varSel
@@ -385,6 +398,7 @@ rule plink_clean:
385
398
Rscript scripts/clean_data.R VCF/Genotyped_filterOut VCF/Genotyped_filterOut_clean
386
399
"""
387
400
401
+ # Filter variants by minor allele frequency (MAF threshold 0.05)
388
402
rule qc_plink :
389
403
input :
390
404
bim_clean = rules .plink_clean .output .bim_clean
@@ -399,6 +413,7 @@ rule qc_plink:
399
413
sh scripts/qc_all_basic.sh VCF/Genotyped_filterOut_clean VCF/Genotyped_filterOut_clean_maf005 0.05 0 0.1 0.2
400
414
"""
401
415
416
+ # Compare with HapMap3 variants and keep only the overlapping ones
402
417
rule hapmap3 :
403
418
input :
404
419
bim_maf = rules .qc_plink .output .bim_maf
@@ -423,6 +438,7 @@ rule hapmap3:
423
438
plink --bfile ./VCF/Genotyped_filterOut_clean_maf005 --extract {output.hm3_list} --make-bed --out ./VCF/Genotyped_filterOut_clean_maf005_hm3
424
439
"""
425
440
441
+ # Filter variants by linkage disequilibrium (LD threshold 0.05)
426
442
rule ld :
427
443
input :
428
444
hm3_bim = rules .hapmap3 .output .hm3_bim
@@ -440,6 +456,7 @@ rule ld:
440
456
sh scripts/ld_thinning_pruning.sh VCF/Genotyped_filterOut_clean_maf005_hm3 VCF/Genotyped_filterOut_clean_maf005_hm3_ld005 1000 50 0.05 {params.ld_reg}
441
457
"""
442
458
459
+ # Generate genetic PCs
443
460
rule pca :
444
461
input :
445
462
ld_bim = rules .ld .output .ld_bim
0 commit comments