Nanopore Scripts

Step 1 - Base-calling

#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=guppy
#SBATCH --output=guppy_%j.out
#SBATCH --error=guppy_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=20GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT
#SBATCH --gres=gpu:1

nanopore_experiment_path=<path>
flowcell=<flowcell_type>
kit=<kit_id>

module purge
module load guppy/4.5.2_gpu

guppy_basecaller \
        --input_path ${nanopore_experiment_path}/fast5 \
        --save_path ${nanopore_experiment_path}/FASTQ \
        --flowcell ${flowcell} \
        --kit ${kit} \
        --device "auto"

Note that you will need to provied the path for ‘nanopore_experiment_path’ this should point to the where you uploaded the nanopore run on the server. Make sure the fast5 directory in contained in this directory.

The flowcell_type will likely be ‘FLO-MIN106’. You can find this specified in the experiment report generated during the run. Refer to the guppy documentation if you are not using a FLO-MIN106 device.

The kit_id will probably be either ‘SQK-LSK110’ (for DNA) or ‘SQK-DCS109’ (for cDNA) sequencing. Again you can find this specified in the experiment report generated during the run. Refer to the guppy documentation if you are not using one of these options.

Step 2a. Aligning DNA to a small (< 4Gb) genome

#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=minimap_DNA_small
#SBATCH --output=minimap_%j.out
#SBATCH --error=minimap_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=20GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT

#module load
module load samtools/intel/1.12
module load minimap2/2.22

#(for optional structural variant calling)
module load sniffles/1.0.12

ref_fa=<refernce_fasta>

#sample:
sample_name=<sample_name>
    in_dir=<location_of '/FASTQ/pass/' generated by guppy>
    out_dir=<preferred output path>
    #
    mkdir -p ${out_dir}

  #Align nanopore long reads to a reference genome
    #Combine reads
        echo 'combining fastq'
        cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
            
        #Align reads
        echo 'running minimap'
            minimap2 -ax map-ont ${ref_fa} ${out_dir}${sample_name}.fastq > ${out_dir}${sample_name}.sam
        
        #Generate bams:
        echo 'samtools post alignment'
            samtools view -Sb ${out_dir}${sample_name}.sam -o ${out_dir}${sample_name}.bam
            samtools sort ${out_dir}${sample_name}.bam -o ${out_dir}${sample_name}.sorted.bam
            samtools index ${out_dir}${sample_name}.sorted.bam
        
        #(for optional structural variant calling)
        #Call structural variants
        echo 'Call SVs'
            samtools calmd -Eb ${out_dir}${sample_name}.sorted.bam --reference ${ref_fa} > ${out_dir}${sample_name}.md.bam
            sniffles -m ${out_dir}${sample_name}.md.bam -v ${out_dir}${sample_name}_sniffles.vcf

Step 2b. Aligning cDNA* to a small (< 4Gb) genome

*any spliced long reads

#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=minimap_cDNA_small
#SBATCH --output=minimap_%j.out
#SBATCH --error=minimap_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=20GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT

#module load
module load samtools/intel/1.12
module load minimap2/2.22

#(for optional structural variant calling)
module load sniffles/1.0.12

ref_fa=<refernce_fasta>

#annotated yeast splice sites bed
#generated using: paftools.js gff2bed anno.gff > anno.bed
anno_splice=/scratch/cgsb/gresham/pieter/genome/Saccharomyces_cerevisiae_ensembl/Saccharomyces_cerevisiae.R64-1-1.104.bed
#maximum intron length, 200 = 200K
max_intron=1000

#sample:
sample_name=<sample_name>
  in_dir=<location_of '/FASTQ/pass/' generated by guppy>
  out_dir=<preferred output path>
    #
    mkdir -p ${out_dir}

  #Align nanopore long reads to a reference genome
    #Combine reads
        echo 'combining fastq'
        cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
            
        #Align reads
        echo 'running minimap'
          minimap2 -ax splice --junc-bed ${anno_splice} -G ${max_intron} ${ref_fa} ${out_dir}${sample_name}.fastq > ${out_dir}${sample_name}.sam
            
        #Generate bams:
        echo 'samtools post alignment'
            samtools view -Sb ${out_dir}${sample_name}.sam -o ${out_dir}${sample_name}.bam
            samtools sort ${out_dir}${sample_name}.bam -o ${out_dir}${sample_name}.sorted.bam
            samtools index ${out_dir}${sample_name}.sorted.bam
        
        #(for optional structural variant calling)
        #Call structural variants
        echo 'Call SVs'
            samtools calmd -Eb ${out_dir}${sample_name}.sorted.bam --reference ${ref_fa} > ${out_dir}${sample_name}.md.bam
            sniffles -m ${out_dir}${sample_name}.md.bam -v ${out_dir}${sample_name}_sniffles.vcf

Step 2c. Aligning DNA to a large (> 4Gb) genome

##NB If you are using the human genome, double check that you are using the appropriate genome http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use #TL;DR Dont use ‘dna.toplevel.fa’

#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=minimap_DNA_large
#SBATCH --output=minimap_%j.out
#SBATCH --error=minimap_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=40GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT

#module load
module load samtools/intel/1.12
module load minimap2/2.22

#(for optional structural variant calling)
module load sniffles/1.0.12

#############################################
### Example of downloading a large genome ###
#mkdir -p genome
#cd genome
#wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
#gunzip *.gz
#ref_fa=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
#cd ..
###
#############################################

#create a minimizer index of the reference fasta
ref_fa=<refernce_fasta>
ref_mmi=${ref_fa}.mmi
minimap2 -d ${ref_mmi} ${ref_fa}

#because large genomes will loose their sam headers, we need to repair them 
# before converting to bam files.
ref_fai=${ref_fa}.fai
samtools faidx ${ref_fa}

#sample:
sample_name=<sample_name>
  input_path=<location_of '/FASTQ/pass/' generated by guppy>
  output_path=<preferred output path>
  #
    in_dir=${input_path}
    out_dir=${output_path}/ont_${sample_name}/
    #
    mkdir -p ${out_dir}

  #Align nanopore long reads to a reference genome
    #Combine reads
        echo 'combining fastq'
        cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
            
        #Align reads
        echo 'running minimap'
            minimap2 -ax map-ont ${ref_mmi} ${out_dir}${sample_name}.fastq > ${out_dir}${sample_name}.sam
        
        #Generate bams:
        echo 'samtools post alignment'
            samtools view -t ${ref_fai} -Sb ${out_dir}${sample_name}.sam -o ${out_dir}${sample_name}.bam
            samtools sort ${out_dir}${sample_name}.bam -o ${out_dir}${sample_name}.sorted.bam
            samtools index ${out_dir}${sample_name}.sorted.bam
        
        #(for optional structural variant calling)
        #Call structural variants
        echo 'Call SVs'
            samtools calmd -Eb ${out_dir}${sample_name}.sorted.bam --reference ${ref_fa} > ${out_dir}${sample_name}.md.bam
            sniffles -m ${out_dir}${sample_name}.md.bam -v ${out_dir}${sample_name}_sniffles.vcf

Step 2d. Aligning cDNA to a large (> 4Gb) genome

#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=minimap_cDNA_large
#SBATCH --output=minimap_%j.out
#SBATCH --error=minimap_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=40GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT

#module load
module load samtools/intel/1.12
module load minimap2/2.22

#(for optional structural variant calling)
module load sniffles/1.0.12

#############################################
### Example of downloading a large genome ###
#mkdir -p genome
#cd genome
#wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
#gunzip *.gz
#ref_fa=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
#cd ..
###
#############################################

#create a minimizer index of the reference fasta
ref_fa=<refernce_fasta>
ref_mmi=${reference_fasta}.mmi
minimap2 -d ${ref_mmi} ${ref_fa}

#because large genomes will loose their sam headers, we need to repair them 
# before converting to bam files.
ref_fai=${reference_fasta}.fai
samtools faidx ${ref_fa}

#annotated human splice sites bed
#generated using: paftools.js gff2bed anno.gff > anno.bed
anno_splice=/scratch/cgsb/gresham/pieter/genome/hsapiens/Homo_sapiens.GRCh38.104.bed

#sample:
sample_name=<sample_name>
  #
    in_dir=<location_of '/FASTQ/pass/' generated by guppy>
    out_dir=<preferred output path>
    #
    mkdir -p ${out_dir}

  #Align nanopore long reads to a reference genome
    #Combine reads
        echo 'combining fastq'
        cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
            
        #Align reads
        echo 'running minimap'
            minimap2 -ax splice ${ref_mmi} --junc-bed ${anno_splice} -G ${max_intron} ${out_dir}${sample_name}.fastq > ${out_dir}${sample_name}.sam
        
        #Generate bams:
        echo 'samtools post alignment'
            samtools view -t ${ref_fai} -Sb ${out_dir}${sample_name}.sam -o ${out_dir}${sample_name}.bam
            samtools sort ${out_dir}${sample_name}.bam -o ${out_dir}${sample_name}.sorted.bam
            samtools index ${out_dir}${sample_name}.sorted.bam
        
        #(for optional structural variant calling)
        #Call structural variants
        echo 'Call SVs'
            samtools calmd -Eb ${out_dir}${sample_name}.sorted.bam --reference ${ref_fa} > ${out_dir}${sample_name}.md.bam
            sniffles -m ${out_dir}${sample_name}.md.bam -v ${out_dir}${sample_name}_sniffles.vcf

Step 3. De novo assembly

#!/bin/bash
#
#SBATCH --verbose
#SBATCH --job-name=miniasm
#SBATCH --output=miniasm_%j.out
#SBATCH --error=miniasm_%j.err
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --mem=20GB
#SBATCH --mail-type=BEGIN,END,FAIL,TIME_LIMIT

#module load
module load minimap2/2.22
module load miniasm/0.3

#sample:
sample_name=<sample_name>
  input_path=<location_of '/FASTQ/pass/' generated by guppy>
  output_path=<preferred output path>
  #
    in_dir=${input_path}
    out_dir=${output_path}/asm_${sample_name}/
    #
    mkdir -p ${out_dir}

  #Align nanopore long reads to a reference genome
    #Combine reads
        echo 'combining fastq'
        cat ${in_dir}*.fastq > ${out_dir}${sample_name}.fastq
            
        #Self-align reads
        echo 'running minimap'
        minimap2 -x ava-ont -t8 ${out_dir}${sample_name}.fastq | gzip -1 > ${out_dir}${sample_name}_reads.paf.gz
        
        #Generate gfa:
        echo 'running miniasm'
        miniasm -f ${out_dir}${sample_name}.fastq ${out_dir}${sample_name}_reads.paf.gz > ${out_dir}${sample_name}_assembly.gfa