This document assumes you’ve already ran a sample an ONT Nanopore device and have generated fast5 files recording the squiggles of that run.
This document is intended to be used as a copy paste template for two sbatch scripts. The first script will use a GPU to take advantage of parallelization. While the second will run on a CPU as these tools lack GPU integration. Being sbatch scripts these need to be executed from the command line using:
sbatch filename.sh
#Step 1. We will perform base-calling on these fast5 files to convert them to FASTQ files. We will be using ONT’s Guppy for this, although other base callers exist.
#Step 2. Once we have FASTQ files we can align to a reference genome, with an optional structural variant call.
If your reference genome is small (< 4Gb): Step 2a. Aligning DNA to a small (< 4Gb) genome. Step 2b. Aligning cDNA to a small (< 4Gb) genome.
If your reference genome is large (> 4Gb):
Step 2c. Aligning DNA to a large (> 4Gb) genome. Step 2d. Aligning cDNA to a large (> 4Gb) genome.
#Step 3 Alternatively we can create de novo assemblies using miniasm.
#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=guppy
#SBATCH --output=guppy_%j.out
#SBATCH --error=guppy_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=20GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT
#SBATCH --gres=gpu:1
nanopore_experiment_path=<path>
flowcell=<flowcell_type>
kit=<kit_id>
module purge
module load guppy/4.5.2_gpu
guppy_basecaller \
--input_path ${nanopore_experiment_path}/fast5 \
--save_path ${nanopore_experiment_path}/FASTQ \
--flowcell ${flowcell} \
--kit ${kit} \
--device "auto"
Note that you will need to provied the path for ‘nanopore_experiment_path’ this should point to the where you uploaded the nanopore run on the server. Make sure the fast5 directory in contained in this directory.
The flowcell_type will likely be ‘FLO-MIN106’. You can find this specified in the experiment report generated during the run. Refer to the guppy documentation if you are not using a FLO-MIN106 device.
The kit_id will probably be either ‘SQK-LSK110’ (for DNA) or ‘SQK-DCS109’ (for cDNA) sequencing. Again you can find this specified in the experiment report generated during the run. Refer to the guppy documentation if you are not using one of these options.
#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=minimap_DNA_small
#SBATCH --output=minimap_%j.out
#SBATCH --error=minimap_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=20GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT
#module load
module load samtools/intel/1.12
module load minimap2/2.22
#(for optional structural variant calling)
module load sniffles/1.0.12
ref_fa=<refernce_fasta>
#sample:
sample_name=<sample_name>
in_dir=<location_of '/FASTQ/pass/' generated by guppy>
out_dir=<preferred output path>
#
mkdir -p ${out_dir}
#Align nanopore long reads to a reference genome
#Combine reads
echo 'combining fastq'
cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
#Align reads
echo 'running minimap'
minimap2 -ax map-ont ${ref_fa} ${out_dir}${sample_name}.fastq > ${out_dir}${sample_name}.sam
#Generate bams:
echo 'samtools post alignment'
samtools view -Sb ${out_dir}${sample_name}.sam -o ${out_dir}${sample_name}.bam
samtools sort ${out_dir}${sample_name}.bam -o ${out_dir}${sample_name}.sorted.bam
samtools index ${out_dir}${sample_name}.sorted.bam
#(for optional structural variant calling)
#Call structural variants
echo 'Call SVs'
samtools calmd -Eb ${out_dir}${sample_name}.sorted.bam --reference ${ref_fa} > ${out_dir}${sample_name}.md.bam
sniffles -m ${out_dir}${sample_name}.md.bam -v ${out_dir}${sample_name}_sniffles.vcf
*any spliced long reads
#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=minimap_cDNA_small
#SBATCH --output=minimap_%j.out
#SBATCH --error=minimap_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=20GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT
#module load
module load samtools/intel/1.12
module load minimap2/2.22
#(for optional structural variant calling)
module load sniffles/1.0.12
ref_fa=<refernce_fasta>
#annotated yeast splice sites bed
#generated using: paftools.js gff2bed anno.gff > anno.bed
anno_splice=/scratch/cgsb/gresham/pieter/genome/Saccharomyces_cerevisiae_ensembl/Saccharomyces_cerevisiae.R64-1-1.104.bed
#maximum intron length, 200 = 200K
max_intron=1000
#sample:
sample_name=<sample_name>
in_dir=<location_of '/FASTQ/pass/' generated by guppy>
out_dir=<preferred output path>
#
mkdir -p ${out_dir}
#Align nanopore long reads to a reference genome
#Combine reads
echo 'combining fastq'
cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
#Align reads
echo 'running minimap'
minimap2 -ax splice --junc-bed ${anno_splice} -G ${max_intron} ${ref_fa} ${out_dir}${sample_name}.fastq > ${out_dir}${sample_name}.sam
#Generate bams:
echo 'samtools post alignment'
samtools view -Sb ${out_dir}${sample_name}.sam -o ${out_dir}${sample_name}.bam
samtools sort ${out_dir}${sample_name}.bam -o ${out_dir}${sample_name}.sorted.bam
samtools index ${out_dir}${sample_name}.sorted.bam
#(for optional structural variant calling)
#Call structural variants
echo 'Call SVs'
samtools calmd -Eb ${out_dir}${sample_name}.sorted.bam --reference ${ref_fa} > ${out_dir}${sample_name}.md.bam
sniffles -m ${out_dir}${sample_name}.md.bam -v ${out_dir}${sample_name}_sniffles.vcf
##NB If you are using the human genome, double check that you are using the appropriate genome http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use #TL;DR Dont use ‘dna.toplevel.fa’
#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=minimap_DNA_large
#SBATCH --output=minimap_%j.out
#SBATCH --error=minimap_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=40GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT
#module load
module load samtools/intel/1.12
module load minimap2/2.22
#(for optional structural variant calling)
module load sniffles/1.0.12
#############################################
### Example of downloading a large genome ###
#mkdir -p genome
#cd genome
#wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
#gunzip *.gz
#ref_fa=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
#cd ..
###
#############################################
#create a minimizer index of the reference fasta
ref_fa=<refernce_fasta>
ref_mmi=${ref_fa}.mmi
minimap2 -d ${ref_mmi} ${ref_fa}
#because large genomes will loose their sam headers, we need to repair them
# before converting to bam files.
ref_fai=${ref_fa}.fai
samtools faidx ${ref_fa}
#sample:
sample_name=<sample_name>
input_path=<location_of '/FASTQ/pass/' generated by guppy>
output_path=<preferred output path>
#
in_dir=${input_path}
out_dir=${output_path}/ont_${sample_name}/
#
mkdir -p ${out_dir}
#Align nanopore long reads to a reference genome
#Combine reads
echo 'combining fastq'
cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
#Align reads
echo 'running minimap'
minimap2 -ax map-ont ${ref_mmi} ${out_dir}${sample_name}.fastq > ${out_dir}${sample_name}.sam
#Generate bams:
echo 'samtools post alignment'
samtools view -t ${ref_fai} -Sb ${out_dir}${sample_name}.sam -o ${out_dir}${sample_name}.bam
samtools sort ${out_dir}${sample_name}.bam -o ${out_dir}${sample_name}.sorted.bam
samtools index ${out_dir}${sample_name}.sorted.bam
#(for optional structural variant calling)
#Call structural variants
echo 'Call SVs'
samtools calmd -Eb ${out_dir}${sample_name}.sorted.bam --reference ${ref_fa} > ${out_dir}${sample_name}.md.bam
sniffles -m ${out_dir}${sample_name}.md.bam -v ${out_dir}${sample_name}_sniffles.vcf
##NB If you are using the human genome, double check that you are using the appropriate genome http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use #TL;DR Dont use ‘dna.toplevel.fa’
#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=minimap_cDNA_large
#SBATCH --output=minimap_%j.out
#SBATCH --error=minimap_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=40GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT
#module load
module load samtools/intel/1.12
module load minimap2/2.22
#(for optional structural variant calling)
module load sniffles/1.0.12
#############################################
### Example of downloading a large genome ###
#mkdir -p genome
#cd genome
#wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
#gunzip *.gz
#ref_fa=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
#cd ..
###
#############################################
#create a minimizer index of the reference fasta
ref_fa=<refernce_fasta>
ref_mmi=${reference_fasta}.mmi
minimap2 -d ${ref_mmi} ${ref_fa}
#because large genomes will loose their sam headers, we need to repair them
# before converting to bam files.
ref_fai=${reference_fasta}.fai
samtools faidx ${ref_fa}
#annotated human splice sites bed
#generated using: paftools.js gff2bed anno.gff > anno.bed
anno_splice=/scratch/cgsb/gresham/pieter/genome/hsapiens/Homo_sapiens.GRCh38.104.bed
#sample:
sample_name=<sample_name>
#
in_dir=<location_of '/FASTQ/pass/' generated by guppy>
out_dir=<preferred output path>
#
mkdir -p ${out_dir}
#Align nanopore long reads to a reference genome
#Combine reads
echo 'combining fastq'
cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
#Align reads
echo 'running minimap'
minimap2 -ax splice ${ref_mmi} --junc-bed ${anno_splice} -G ${max_intron} ${out_dir}${sample_name}.fastq > ${out_dir}${sample_name}.sam
#Generate bams:
echo 'samtools post alignment'
samtools view -t ${ref_fai} -Sb ${out_dir}${sample_name}.sam -o ${out_dir}${sample_name}.bam
samtools sort ${out_dir}${sample_name}.bam -o ${out_dir}${sample_name}.sorted.bam
samtools index ${out_dir}${sample_name}.sorted.bam
#(for optional structural variant calling)
#Call structural variants
echo 'Call SVs'
samtools calmd -Eb ${out_dir}${sample_name}.sorted.bam --reference ${ref_fa} > ${out_dir}${sample_name}.md.bam
sniffles -m ${out_dir}${sample_name}.md.bam -v ${out_dir}${sample_name}_sniffles.vcf
#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=miniasm
#SBATCH --output=miniasm_%j.out
#SBATCH --error=miniasm_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=20GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT
#module load
module load minimap2/2.22
module load miniasm/0.3
#sample:
sample_name=<sample_name>
input_path=<location_of '/FASTQ/pass/' generated by guppy>
output_path=<preferred output path>
#
in_dir=${input_path}
out_dir=${output_path}/asm_${sample_name}/
#
mkdir -p ${out_dir}
#Align nanopore long reads to a reference genome
#Combine reads
echo 'combining fastq'
cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
#Self-align reads
echo 'running minimap'
minimap2 -x ava-ont -t8 ${out_dir}${sample_name}.fastq | gzip -1 > ${out_dir}${sample_name}_reads.paf.gz
#Generate gfa:
echo 'running miniasm'
miniasm -f ${out_dir}${sample_name}.fastq ${out_dir}${sample_name}_reads.paf.gz > ${out_dir}${sample_name}_assembly.gfa