This document was created for a workshop I ran at NYU in 2018. It contains explanations and examples of using the ggplot2 package to generate plots from tidy data in R.

ggplot2 is part of the tidyverse collection of packages and is loaded along with it

#install.packages("tidyverse")
library(tidyverse)

ggplot

the grammar of graphics

  • data the data you want to plot
  • aesthetics how the data is mapped
  • geometries visualization of the data
  • stats representations of data that aid understanding
  • coordinates space on which data is plotted
  • facets how plots are subsetted
  • themes non-data aspects of plots

ggplot

topics

  • basic plotting
  • scale adjustments
  • position adjustment
  • zooming
  • facetting
  • labels
  • themes

ggplot

the basic syntax

ggplot(data = <DATA>, mapping = aes(<Mapping>)) +
        <GEOM_FUNCTION>()

ggplot constraints

example dataset (modified yeast gff file)

feature and chromosome are factors

levels(yeast_features$feature)
 [1] "CDS"         "chromosome"  "exon"        "gene"        "mRNA"        "ncRNA_gene"  "pseudogene" 
 [8] "rRNA"        "rRNA_gene"   "snoRNA"      "snoRNA_gene" "snRNA"       "snRNA_gene"  "transcript" 
[15] "tRNA_gene"  
levels(yeast_features$chromosome)
 [1] "I"    "II"   "III"  "IV"   "IX"   "Mito" "V"    "VI"   "VII"  "VIII" "X"    "XI"   "XII"  "XIII" "XIV" 
[16] "XV"   "XVI" 

view structure using str()

str(yeast_features)
tibble [7,448 × 6] (S3: tbl_df/tbl/data.frame)
 $ chromosome: Factor w/ 17 levels "I","II","III",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ feature   : Factor w/ 15 levels "CDS","chromosome",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ start     : num [1:7448] 335 538 1807 2480 7235 ...
 $ stop      : num [1:7448] 649 792 2169 2707 9016 ...
 $ strand    : Factor w/ 3 levels "-",".","+": 3 3 1 3 1 3 1 3 1 3 ...
 $ length    : num [1:7448] 314 254 362 227 1781 ...

histogram of feature lengths

# Histogram of a single continuous variable: only the x aesthetic is mapped.
ggplot(
  data = yeast_features,
  mapping = aes(x = length)
) +
  geom_histogram()

plot as a continuous distribution

# Draw the length distribution as a line rather than as bars.
ggplot(
  data = yeast_features,
  mapping = aes(x = length)
) +
  geom_freqpoly()

color according to feature type

# Map color to feature inside the layer so each feature type gets its own line.
ggplot(
  data = yeast_features,
  mapping = aes(x = length)
) +
  geom_freqpoly(
    mapping = aes(color = feature)
  )

Plot as a probability density

# Put the computed density, not the raw count, on the y axis.
# after_stat(density) is the supported replacement for the deprecated
# `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density), color = feature)
) +
  geom_freqpoly()

Plot all data using geom_point

# One point per row: chromosome on x, length on y, colored by feature type.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_point(
    mapping = aes(color = feature)
  )

Dealing with overplotting using jitter

# Same points as before, spread out with the "jitter" position adjustment.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter"
  )

Dealing with overplotting using alpha

# Jittered points drawn mostly transparent (alpha is set, not mapped).
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_jitter(
    mapping = aes(color = feature),
    alpha = 1 / 10
  )

Boxplots using geom_boxplot()

# One box per chromosome summarizing the length values.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_boxplot()

Mapping aesthetics within layers

# color is mapped inside the boxplot layer; with a single layer this draws the
# same plot as mapping it in ggplot().
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_boxplot(
    mapping = aes(color = feature)
  )

Combining layers

# Two layers share the x/y mapping; only the point layer is colored by feature.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_boxplot() +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter",
    shape = "."
  )

Modifying scales

# Same layered plot with the y axis on a log10 scale.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_boxplot() +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter",
    shape = "."
  ) +
  scale_y_log10()

Changing the order using reorder

# Order the chromosomes on the x axis by their median feature length.
ggplot(
  data = yeast_features,
  mapping = aes(
    x = reorder(chromosome, length, FUN = median),
    y = length
  )
) +
  geom_boxplot() +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter",
    shape = "."
  ) +
  scale_y_log10()

Flipping coordinates

# Same reordered plot with the x and y axes swapped by coord_flip().
ggplot(
  data = yeast_features,
  mapping = aes(
    x = reorder(chromosome, length, FUN = median),
    y = length
  )
) +
  geom_boxplot() +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter",
    shape = "."
  ) +
  scale_y_log10() +
  coord_flip()

Barplots

With one categorical datatype (factor): chromosome

# Bar per chromosome; the same factor also drives the fill color.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, fill = chromosome)
) +
  geom_bar()

Barplots

two categorical datatypes/factors: chromosome & features

# Bar per chromosome, with fill mapped to a second factor (feature).
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, fill = feature)
) +
  geom_bar()

Barplot variant: fill

# Same two-factor barplot using the "fill" position adjustment.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, fill = feature)
) +
  geom_bar(
    position = "fill"
  )

Barplot variant: dodge

# Same two-factor barplot using the "dodge" position adjustment.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, fill = feature)
) +
  geom_bar(
    position = "dodge"
  )

Computing and plotting statistics

# Summarize length per feature type with the mean_sdl summary function.
ggplot(
  data = yeast_features,
  mapping = aes(x = feature, y = length)
) +
  stat_summary(
    fun.data = mean_sdl
  )

Facetting plots

# One histogram panel per chromosome.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, fill = chromosome)
) +
  geom_histogram() +
  facet_wrap(~chromosome)

Facetting by two variables

# One density histogram panel per feature/strand combination, wrapped into
# two rows. after_stat(density) is the supported replacement for the
# deprecated `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_histogram() +
  facet_wrap(feature ~ strand, nrow = 2)

Scatter plots

# start vs stop on log10 axes; size, color and shape all encode feature type.
ggplot(
  data = yeast_features,
  mapping = aes(x = start, y = stop)
) +
  geom_point(
    mapping = aes(size = feature, color = feature, shape = feature),
    alpha = 1 / 3
  ) +
  scale_x_log10() +
  scale_y_log10()

Statistics: adding trend line

# Scatter plot with a trend line added by geom_smooth() using its defaults.
ggplot(
  data = yeast_features,
  mapping = aes(x = start, y = stop)
) +
  geom_point(
    mapping = aes(size = feature, color = feature, shape = feature),
    alpha = 1 / 3
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth()

Statistics: controlling trend line

# Same scatter plot, with the trend line fitted by method = "lm".
ggplot(
  data = yeast_features,
  mapping = aes(x = start, y = stop)
) +
  geom_point(
    mapping = aes(size = feature, color = feature, shape = feature),
    alpha = 1 / 3
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(
    method = "lm"
  )

Colors (setting color)

# fill = "red" is given outside aes(), so it is a constant for the whole
# histogram layer rather than a mapping to a variable.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, fill = chromosome)
) +
  geom_histogram(
    fill = "red"
  ) +
  scale_x_log10() +
  facet_wrap(~chromosome)

Zooming in with coord_cartesian()

# Restrict the visible region to 10-5000 on both axes with coord_cartesian().
ggplot(
  data = yeast_features,
  mapping = aes(x = start, y = stop)
) +
  geom_point(
    mapping = aes(size = feature, color = feature, shape = feature),
    alpha = 1 / 3
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth() +
  coord_cartesian(
    xlim = c(10, 5000),
    ylim = c(10, 5000)
  )

Adding Labels to plots

# Add a title and axis labels with labs() and move the legend to the bottom.
# after_stat(density) is the supported replacement for the deprecated
# `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  theme(legend.position = "bottom")

Themes change the overall look

# Apply the complete theme first and the tweak second: theme_light() replaces
# every theme element, so adding it after theme() would discard the
# legend.position setting.
# after_stat(density) is the supported replacement for the deprecated
# `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  theme_light() +
  theme(legend.position = "bottom")

Themes

more themes available in add-on package ggthemes

#install.packages("ggthemes")
library(ggthemes)

Additional themes : theme_tufte()

# Density plot styled with theme_tufte() from ggthemes.
# after_stat(density) is the supported replacement for the deprecated
# `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  theme_tufte()

Additional themes : theme_excel()

Don’t do this!

# Density plot styled with theme_excel() from ggthemes.
# after_stat(density) is the supported replacement for the deprecated
# `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  theme_excel()

Generic plot function

ggplot(data = <DATA>, mapping = aes(<Mapping>)) +
        <GEOM_FUNCTION>() + 
        <STAT_FUNCTION>() +
        <FACET_FUNCTION>() + 
        <SCALE_FUNCTION>() +
        <THEME_FUNCTION>()
        

Concise code

Code can be made more concise by omitting argument names that ggplot2 matches by position (data, mapping, x, y)

# Concise form: data, mapping, x and y are matched by position.
# theme_light() comes before theme() because a complete theme replaces every
# theme element and would otherwise discard the legend.position setting.
# after_stat(density) is the supported replacement for the deprecated
# `..density..` notation.
ggplot(yeast_features, aes(length, after_stat(density))) +
  geom_freqpoly(aes(color = feature)) +
  scale_x_log10() +
  theme_light() +
  theme(legend.position = "bottom")

Saving plots as variables

# Store the plot in a variable so it can be printed or extended later.
# theme_light() comes before theme() because a complete theme replaces every
# theme element and would otherwise discard the legend.position setting.
# after_stat(density) is the supported replacement for the deprecated
# `..density..` notation.
my_plot <- ggplot(yeast_features, aes(length, after_stat(density))) +
  geom_freqpoly(aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  theme_light() +
  theme(legend.position = "bottom")

Adding to plot variables

# A stored plot can be extended with further layers using `+`.
my_plot +
  geom_hline(
    yintercept = 2,
    color = "red"
  )

Exercise 1

Add: 1. new axis labels, 2. a title, 3. a trend line, and 4. change the theme to theme_tufte()

# Starting plot for Exercise 1: points colored by Class.
ggplot(
  data = a,
  mapping = aes(x = x, y = y, color = Class)
) +
  geom_point(size = 3)

Read in and modify gff

# Read the tab-delimited GFF3 file without a header row, skipping the first
# 24 lines and treating "#" as the comment character.
gff <- read_delim(
  "Saccharomyces_cerevisiae.R64-1-1.34.gff3",
  delim = "\t",
  escape_double = FALSE,
  col_names = FALSE,
  comment = "#",
  trim_ws = TRUE,
  skip = 24
)

── Column specification ────────────────────────────────────────────────────────────────────────────────────────
cols(
  X1 = col_character(),
  X2 = col_character(),
  X3 = col_character(),
  X4 = col_double(),
  X5 = col_double(),
  X6 = col_character(),
  X7 = col_character(),
  X8 = col_character(),
  X9 = col_character()
)
# Give the nine unnamed columns descriptive names. "unknown1" and "unknown2"
# are placeholders for columns that are not selected below.
names(gff) <- c(
  "chromosome",
  "source",
  "feature",
  "start",
  "stop",
  "unknown1",
  "strand",
  "unknown2",
  "info"
)

# Correct data types: the categorical columns become factors.
gff$feature <- as.factor(gff$feature)
gff$chromosome <- as.factor(gff$chromosome)
gff$strand <- as.factor(gff$strand)

# Keep the columns used for plotting, add each feature's length, and retain
# only the five feature types of interest. Unused factor levels are not
# dropped.
yeast_features <- gff %>%
  select(chromosome, feature, start, stop, strand) %>%
  mutate(length = abs(start - stop)) %>%
  filter(feature %in% c("CDS", "rRNA", "snoRNA", "snRNA", "tRNA_gene"))

Exercise 2

Plot as barplots with error bars

# Starting plot for Exercise 2: length summarized per feature with mean_sdl.
ggplot(
  data = yeast_features,
  mapping = aes(x = feature, y = length)
) +
  stat_summary(
    fun.data = mean_sdl
  )

Exercise 3

Change bin size for histogram

# Starting plot for Exercise 3: histogram of length with default binning.
ggplot(
  data = yeast_features,
  mapping = aes(x = length)
) +
  geom_histogram()

