This document was created for a workshop I ran at NYU in 2018. It contains explanations and examples of using the ggplot2
package to generate plots from tidy data in R.
ggplot2 is part of the tidyverse collection of packages
#install.packages("tidyverse")
library(tidyverse)
ggplot
the grammar of graphics
- data the data you want to plot
- aesthetics how the data is mapped
- geometries visualization of the data
- stats representations of data that aid understanding
- coordinates space on which data is plotted
- facets how plots are subsetted
- themes non-data aspects of plots
ggplot
topics
- basic plotting
- scale adjustments
- position adjustment
- zooming
- facetting
- labels
- themes
ggplot
the basic syntax
ggplot(data = <DATA>, mapping = aes(<Mapping>)) +
<GEOM_FUNCTION>()
ggplot constraints
- data must be in a dataframe
- data should be in tidy format
- each variable must have its own column
- each observation must have its own row
- each value must have its own cell
- for gene expression data this means one row per gene per experiment
example dataset (modified yeast gff file)
feature and chromosome are factors
levels(yeast_features$feature)
[1] "CDS" "chromosome" "exon" "gene" "mRNA" "ncRNA_gene" "pseudogene"
[8] "rRNA" "rRNA_gene" "snoRNA" "snoRNA_gene" "snRNA" "snRNA_gene" "transcript"
[15] "tRNA_gene"
levels(yeast_features$chromosome)
[1] "I" "II" "III" "IV" "IX" "Mito" "V" "VI" "VII" "VIII" "X" "XI" "XII" "XIII" "XIV"
[16] "XV" "XVI"
view structure using str()
str(yeast_features)
tibble [7,448 × 6] (S3: tbl_df/tbl/data.frame)
$ chromosome: Factor w/ 17 levels "I","II","III",..: 1 1 1 1 1 1 1 1 1 1 ...
$ feature : Factor w/ 15 levels "CDS","chromosome",..: 1 1 1 1 1 1 1 1 1 1 ...
$ start : num [1:7448] 335 538 1807 2480 7235 ...
$ stop : num [1:7448] 649 792 2169 2707 9016 ...
$ strand : Factor w/ 3 levels "-",".","+": 3 3 1 3 1 3 1 3 1 3 ...
$ length : num [1:7448] 314 254 362 227 1781 ...
histogram of feature lengths
# geom_histogram() bins `length` and draws one bar per bin
# (30 bins unless told otherwise).
ggplot(
  data = yeast_features,
  mapping = aes(x = length)
) +
  geom_histogram()

plot as a continuous distribution
# geom_freqpoly() uses the same binning as a histogram but joins the
# bin counts with a line instead of drawing bars.
ggplot(
  data = yeast_features,
  mapping = aes(x = length)
) +
  geom_freqpoly()

color according to feature type
# Mapping color inside the geom draws one line per feature type.
ggplot(
  data = yeast_features,
  mapping = aes(x = length)
) +
  geom_freqpoly(
    mapping = aes(color = feature)
  )

Plot as a probability density
# after_stat(density) plots the density computed by the binning stat
# instead of the raw count, so feature types with very different numbers
# of rows can be compared. It replaces the deprecated `..density..` form.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density), color = feature)
) +
  geom_freqpoly()

Plot all data using geom_point
# One point per row; color is mapped only in the point layer.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_point(aes(color = feature))

Dealing with overplotting using jitter
# position = "jitter" adds a small random offset to every point so that
# points sharing the same coordinates are pulled apart.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_point(
    aes(color = feature),
    position = "jitter"
  )

Dealing with overplotting using alpha
# geom_jitter() is shorthand for geom_point(position = "jitter").
# alpha = 1/10 makes each point mostly transparent, so dense regions
# show up darker.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_jitter(
    aes(color = feature),
    alpha = 1/10
  )

Boxplots using geom_boxplot()
# One box per chromosome summarising its distribution of lengths.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_boxplot()

Mapping aesthetics within layers
# color is mapped in ggplot(), so every layer inherits it; here it splits
# each chromosome's box into one box per feature type.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length, color = feature)
) +
  geom_boxplot()

Combining layers
# Layers are drawn in the order they are added: boxes first, then the
# jittered points on top. shape = "." draws very small points.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_boxplot() +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter",
    shape = "."
  )

Modifying scales
# scale_y_log10() transforms the data before the statistics are computed,
# so the boxplot summarises log10(length).
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, y = length)
) +
  geom_boxplot() +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter",
    shape = "."
  ) +
  scale_y_log10()

Changing the order using reorder
# reorder() sorts the chromosome levels by the median length of their
# features, so the boxes appear in order of median.
ggplot(
  data = yeast_features,
  mapping = aes(
    x = reorder(chromosome, length, FUN = median),
    y = length
  )
) +
  geom_boxplot() +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter",
    shape = "."
  ) +
  scale_y_log10()

Flipping coordinates
# coord_flip() swaps the x and y axes, giving horizontal boxes with the
# chromosome names readable along the vertical axis.
ggplot(
  data = yeast_features,
  mapping = aes(
    x = reorder(chromosome, length, FUN = median),
    y = length
  )
) +
  geom_boxplot() +
  geom_point(
    mapping = aes(color = feature),
    position = "jitter",
    shape = "."
  ) +
  scale_y_log10() +
  coord_flip()

Barplots
With one categorical datatype (factor): chromosome
# geom_bar() counts the rows in each chromosome itself, so no y aesthetic
# is supplied.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, fill = chromosome)
) +
  geom_bar()

Barplots
two categorical datatypes/factors: chromosome & features
# Mapping fill to a second factor splits each bar by feature type; the
# default position stacks the pieces.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, fill = feature)
) +
  geom_bar()

Barplot variant: fill
# position = "fill" stacks every bar to the same height, so the plot
# shows proportions rather than counts.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, fill = feature)
) +
  geom_bar(
    position = "fill"
  )

Barplot variant: dodge
# position = "dodge" places the bars for each feature type side by side.
ggplot(
  data = yeast_features,
  mapping = aes(x = chromosome, fill = feature)
) +
  geom_bar(
    position = "dodge"
  )

Computing and plotting statistics
# stat_summary() computes one summary per feature type. mean_sdl gives
# the mean plus/minus a multiple of the standard deviation and needs the
# Hmisc package to be installed.
ggplot(
  data = yeast_features,
  mapping = aes(x = feature, y = length)
) +
  stat_summary(
    fun.data = mean_sdl
  )

Facetting plots
# facet_wrap(~ chromosome) draws one histogram panel per chromosome.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, fill = chromosome)
) +
  geom_histogram() +
  facet_wrap(~ chromosome)

Facetting (grid)
# Density histograms, one panel per feature/strand combination.
# after_stat(density) replaces the deprecated `..density..` notation.
# facet_wrap() wraps the panels into 2 rows; facet_grid(feature ~ strand)
# would instead lay them out as a strict rows-by-columns grid.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_histogram() +
  facet_wrap(feature ~ strand, nrow = 2)

Scatter plots
# size, color and shape all encode the feature type. alpha = 1/3 is set
# outside aes(), so it applies to every point equally.
ggplot(
  data = yeast_features,
  mapping = aes(x = start, y = stop)
) +
  geom_point(
    aes(size = feature, color = feature, shape = feature),
    alpha = 1/3
  ) +
  scale_x_log10() +
  scale_y_log10()

Statistics: adding trend line
# geom_smooth() inherits only x and y from ggplot(), so it draws a single
# trend line for all points; with no arguments the smoothing method is
# chosen automatically.
ggplot(
  data = yeast_features,
  mapping = aes(x = start, y = stop)
) +
  geom_point(
    aes(size = feature, color = feature, shape = feature),
    alpha = 1/3
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth()

Statistics: controlling trend line
# method = "lm" fits a straight line; because the log10 scales transform
# the data first, the line is fitted to the log-transformed values.
ggplot(
  data = yeast_features,
  mapping = aes(x = start, y = stop)
) +
  geom_point(
    aes(size = feature, color = feature, shape = feature),
    alpha = 1/3
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(
    method = "lm"
  )

Colors (setting color)
# fill = "red" outside aes() sets a constant color; it overrides the
# fill = chromosome mapping for this layer.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, fill = chromosome)
) +
  geom_histogram(
    fill = "red"
  ) +
  scale_x_log10() +
  facet_wrap(~ chromosome)

Zooming in with coord_cartesian()
# coord_cartesian() zooms the view without discarding any data, so the
# smooth is still fitted to all points.
ggplot(
  data = yeast_features,
  mapping = aes(x = start, y = stop)
) +
  geom_point(
    aes(size = feature, color = feature, shape = feature),
    alpha = 1/3
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth() +
  coord_cartesian(
    xlim = c(10,5000),
    ylim = c(10,5000)
  )

Adding Labels to plots
# Density curves per feature type with a title and axis labels.
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  theme(legend.position = "bottom")

Themes change the overall look
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  # A complete theme such as theme_light() replaces every theme setting,
  # so it must come first; tweaks made with theme() go after it or they
  # are discarded.
  theme_light() +
  theme(legend.position = "bottom")

Themes
more themes available in add-on package ggthemes
#install.packages("ggthemes")
library(ggthemes)
Additional themes : theme_tufte()
# theme_tufte() comes from the ggthemes package.
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  theme_tufte()

Additional themes : theme_excel()
Don’t do this!
# theme_excel() comes from the ggthemes package.
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  theme_excel()

Generic plot function
ggplot(data = <DATA>, mapping = aes(<Mapping>)) +
<GEOM_FUNCTION>() +
<STAT_FUNCTION>() +
<FACET_FUNCTION>() +
<SCALE_FUNCTION>() +
<THEME_FUNCTION>()
Concise code
Code can be made more concise by omitting argument names such as data =, mapping =, x = and y =
# Positional arguments: data first, then the aes() mapping (x, then y).
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(yeast_features, aes(length, after_stat(density))) +
  geom_freqpoly(aes(color = feature)) +
  scale_x_log10() +
  # theme_light() is a complete theme and resets all theme settings, so
  # the legend tweak has to be added after it to take effect.
  theme_light() +
  theme(legend.position = "bottom")

Saving plots as variables
# Store the plot object so further layers can be added to it later.
# after_stat(density) replaces the deprecated `..density..` notation.
my_plot <- ggplot(yeast_features, aes(length, after_stat(density))) +
  geom_freqpoly(aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  # theme_light() is a complete theme and resets all theme settings, so
  # the legend tweak has to be added after it to take effect.
  theme_light() +
  theme(legend.position = "bottom")
Adding to plot variables
# Layers can be added to a stored plot; this draws a red horizontal
# line at y = 2.
my_plot +
  geom_hline(
    yintercept = 2,
    color = "red"
  )

Exercise 1
Add: 1. new axis labels, 2. title, 3. trend line, and 4. change theme to theme_tufte()
# Starting point for the exercise.
# NOTE(review): the data frame `a` is not created anywhere in this
# document as shown; it must exist before this runs.
ggplot(
  data = a,
  aes(x = x, y = y, color = Class)
) +
  geom_point(size = 3)

Read in and modify gff
# Read the tab-separated GFF3 file without a header row: skip the first
# 24 lines and ignore anything after a "#" comment character.
gff <- read_delim(
  "Saccharomyces_cerevisiae.R64-1-1.34.gff3",
  delim = "\t",
  escape_double = FALSE,
  col_names = FALSE,
  comment = "#",
  trim_ws = TRUE,
  skip = 24
)
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
cols(
X1 = col_character(),
X2 = col_character(),
X3 = col_character(),
X4 = col_double(),
X5 = col_double(),
X6 = col_character(),
X7 = col_character(),
X8 = col_character(),
X9 = col_character()
)
# Replace the default X1..X9 column names with descriptive ones.
# NOTE(review): in the GFF3 layout columns 6 and 8 are score and phase;
# confirm before renaming unknown1/unknown2.
names(gff) <- c(
  "chromosome", "source", "feature",
  "start", "stop", "unknown1",
  "strand", "unknown2", "info"
)
# Correct data types: these columns were read as character but are
# categorical, so convert them to factors.
gff$feature <- as.factor(gff$feature)
gff$chromosome <- as.factor(gff$chromosome)
gff$strand <- as.factor(gff$strand)
# Build the plotting dataset: keep the positional columns, add a length
# column, and retain only the feature types used in the examples.
# NOTE(review): GFF3 coordinates are 1-based and inclusive, so the number
# of bases in a feature is stop - start + 1; `length` here is the
# coordinate span. Confirm which is intended.
yeast_features <- gff %>%
  select(chromosome, feature, start, stop, strand) %>%
  mutate(length = abs(start - stop)) %>%
  filter(feature %in% c("CDS", "rRNA", "snoRNA", "snRNA", "tRNA_gene"))
Exercise 2
Plot as barplots with error bars
# Starting point for the exercise: mean and spread of length per feature
# type, summarised with mean_sdl.
ggplot(
  data = yeast_features,
  mapping = aes(x = feature, y = length)
) +
  stat_summary(
    fun.data = mean_sdl
  )

Exercise 3
Change bin size for histogram
# Starting point for the exercise: histogram with the default binning.
ggplot(
  data = yeast_features,
  mapping = aes(x = length)
) +
  geom_histogram()

