This document was created for a workshop I ran at NYU in 2018 on using the tidyverse. It contains explanations and examples of creating tidy data and of using functions from the dplyr package
to manipulate data.
select()
mutate()
arrange()
summarise()
filter()
%>%
##Load packages
library(tidyverse)
library(stringr)
No.
table2
No.
table3
Yes!
table1
No.
gene_expression
No.
facs_data
Jeff Leek in his book The Elements of Data Analytic Style summarizes the characteristics of tidy data as the points:
gather() is used when column names are not names of variables, but values of a variable (e.g. time). It makes tables longer and skinnier (previously known as melting).
gene_expression
gather() is used when column names are not names of variables, but values of a variable (e.g. time). It makes tables longer and skinnier (previously known as melting).
# Collect the columns t0 through t2 into key/value pairs: the former column
# names land in `timepoint`, the cell values in `expression`.
gene_expression %>%
  gather(key = "timepoint", value = "expression", t0:t2)
Spreading is the opposite of gathering. It is used when an observation is scattered across multiple rows. spread()
makes tables shorter and wider.
facs_data
Spreading is the opposite of gathering. It is used when an observation is scattered across multiple rows. spread()
makes tables shorter and wider.
# Turn each distinct value of `Measure` into its own column, filled with the
# matching entries of `Value`.
facs_data %>%
  spread(key = Measure, value = Value)
put table2 in tidy format
# Show only the first six rows so the table fits on the slide.
table2 %>%
  head(n = 6)
# Spread the `type` column into one column per type, filled from `count`.
table2 %>%
  spread(key = type, value = count)
convert table1 to table2
table1
head(table2)
# Convert table1 into the layout of table2.
# Only the two measurement columns, `cases` and `population`, are gathered;
# `country` and `year` are identifier columns and must stay as they are.
# The key column is named `type` and the value column `count`, matching the
# column names used when table2 is spread back out with
# spread(table2, key = type, value = count).
table1 %>%
  gather(cases, population, key = "type", value = "count") %>%
  # gather() stacks all `cases` rows above all `population` rows; sort so the
  # two rows belonging to each country/year sit next to each other.
  arrange(country, year)
readr
has numerous functions for reading in files as tibbles:
read_csv()
for comma-delimited files; read_tsv()
for tab-delimited files; read_delim(),
in which you specify the delimiter. Read in the annotation file for the yeast genome.
# Read the tab-delimited yeast annotation file into a tibble.
# The first 24 lines are skipped, text after "#" is treated as a comment,
# and no header row is read, so the columns get default names (X1, X2, ...).
gff <- read_delim(
  file = "Saccharomyces_cerevisiae.R64-1-1.34.gff3",
  delim = "\t",
  skip = 24,
  comment = "#",
  col_names = FALSE,
  escape_double = FALSE,
  trim_ws = TRUE
)
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
cols(
X1 = col_character(),
X2 = col_character(),
X3 = col_character(),
X4 = col_double(),
X5 = col_double(),
X6 = col_character(),
X7 = col_character(),
X8 = col_character(),
X9 = col_character()
)
a tibble is a dataframe
head(gff)
str(gff)
spec_tbl_df [28,871 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ X1: chr [1:28871] "I" "I" "I" "I" ...
$ X2: chr [1:28871] "SGD" "SGD" "SGD" "SGD" ...
$ X3: chr [1:28871] "gene" "mRNA" "exon" "CDS" ...
$ X4: num [1:28871] 335 335 335 335 538 ...
$ X5: num [1:28871] 649 649 649 649 792 ...
$ X6: chr [1:28871] "." "." "." "." ...
$ X7: chr [1:28871] "+" "+" "+" "+" ...
$ X8: chr [1:28871] "." "." "." "0" ...
$ X9: chr [1:28871] "ID=gene:YAL069W;biotype=protein_coding;description=Dubious open reading frame%3B unlikely to encode a functiona"| __truncated__ "ID=transcript:YAL069W;Parent=gene:YAL069W;biotype=protein_coding;transcript_id=YAL069W" "Parent=transcript:YAL069W;Name=YAL069W.1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=YAL069W.1;rank=1" "ID=CDS:YAL069W;Parent=transcript:YAL069W;protein_id=YAL069W" ...
- attr(*, "spec")=
.. cols(
.. X1 = col_character(),
.. X2 = col_character(),
.. X3 = col_character(),
.. X4 = col_double(),
.. X5 = col_double(),
.. X6 = col_character(),
.. X7 = col_character(),
.. X8 = col_character(),
.. X9 = col_character()
.. )
glimpse(gff)
Rows: 28,871
Columns: 9
$ X1 <chr> "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I",…
$ X2 <chr> "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "…
$ X3 <chr> "gene", "mRNA", "exon", "CDS", "gene", "mRNA", "exon", "CDS", "gene", "mRNA", "exon", "CDS", "gene"…
$ X4 <dbl> 335, 335, 335, 335, 538, 538, 538, 538, 1807, 1807, 1807, 1807, 2480, 2480, 2480, 2480, 7235, 7235,…
$ X5 <dbl> 649, 649, 649, 649, 792, 792, 792, 792, 2169, 2169, 2169, 2169, 2707, 2707, 2707, 2707, 9016, 9016,…
$ X6 <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".",…
$ X7 <chr> "+", "+", "+", "+", "+", "+", "+", "+", "-", "-", "-", "-", "+", "+", "+", "+", "-", "-", "-", "-",…
$ X8 <chr> ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0",…
$ X9 <chr> "ID=gene:YAL069W;biotype=protein_coding;description=Dubious open reading frame%3B unlikely to encod…
same approach as naming dataframe columns in base R
# Replace the default column names (X1..X9) with descriptive ones.
# NOTE(review): in the GFF3 format columns 6 and 8 are normally "score" and
# "phase"; they are labelled unknown1/unknown2 here - confirm before renaming.
gff <- setNames(
  gff,
  c(
    "chromosome", "source", "feature",
    "start", "stop", "unknown1",
    "strand", "unknown2", "info"
  )
)
note that tidyverse tries to guess data type
glimpse(gff)
Rows: 28,871
Columns: 9
$ chromosome <chr> "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "…
$ source <chr> "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", …
$ feature <chr> "gene", "mRNA", "exon", "CDS", "gene", "mRNA", "exon", "CDS", "gene", "mRNA", "exon", "CDS"…
$ start <dbl> 335, 335, 335, 335, 538, 538, 538, 538, 1807, 1807, 1807, 1807, 2480, 2480, 2480, 2480, 723…
$ stop <dbl> 649, 649, 649, 649, 792, 792, 792, 792, 2169, 2169, 2169, 2169, 2707, 2707, 2707, 2707, 901…
$ unknown1 <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", "…
$ strand <chr> "+", "+", "+", "+", "+", "+", "+", "+", "-", "-", "-", "-", "+", "+", "+", "+", "-", "-", "…
$ unknown2 <chr> ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", "…
$ info <chr> "ID=gene:YAL069W;biotype=protein_coding;description=Dubious open reading frame%3B unlikely …
assigning correct data type is critical for analyses and plotting with ggplot()
# Store the categorical columns as factors instead of character vectors.
gff$chromosome <- as.factor(gff$chromosome)
gff$feature <- as.factor(gff$feature)
gff$strand <- as.factor(gff$strand)
glimpse(gff)
Rows: 28,871
Columns: 9
$ chromosome <fct> I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I…
$ source <chr> "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", …
$ feature <fct> gene, mRNA, exon, CDS, gene, mRNA, exon, CDS, gene, mRNA, exon, CDS, gene, mRNA, exon, CDS,…
$ start <dbl> 335, 335, 335, 335, 538, 538, 538, 538, 1807, 1807, 1807, 1807, 2480, 2480, 2480, 2480, 723…
$ stop <dbl> 649, 649, 649, 649, 792, 792, 792, 792, 2169, 2169, 2169, 2169, 2707, 2707, 2707, 2707, 901…
$ unknown1 <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", "…
$ strand <fct> +, +, +, +, +, +, +, +, -, -, -, -, +, +, +, +, -, -, -, -, +, +, +, +, -, -, -, -, +, +, +…
$ unknown2 <chr> ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", "…
$ info <chr> "ID=gene:YAL069W;biotype=protein_coding;description=Dubious open reading frame%3B unlikely …
select()
# Keep only the columns that are used from here on.
# `info` is retained as well: because the result is assigned back to `gff`,
# dropping it here would make the later select(gff, info) and
# separate(col = "info", ...) examples fail with a missing-column error.
gff <- gff %>%
  select(chromosome, feature, start, stop, strand, info)
head(gff)
mutate()
# Add the length of every feature in base pairs.
# GFF coordinates are 1-based and inclusive, so the length is
# stop - start + 1 (e.g. the first gene, 335..649, is 315 bp = 105 codons,
# not 314). abs() keeps the result positive should start and stop be swapped.
gff <- gff %>%
  mutate(length = abs(stop - start) + 1)
head(gff)
arrange()
Note that writing dplyr::arrange
explicitly specifies both the package and the function, which is useful when the function name clashes with a function in another package or in base R.
# Sort the rows by feature length, smallest first.
gff %>%
  dplyr::arrange(length)
sort largest to smallest using -
# Negating the column reverses the order: largest length first.
gff %>%
  dplyr::arrange(-length)
summarize()
this creates a new tibble/dataframe
# Collapse the whole table into a single row of summary statistics
# for the `length` column.
gff %>%
  summarise(
    mean = mean(length),
    sd = sd(length),
    min = min(length),
    max = max(length)
  )
summarize()
the function n()
counts how many observations there are
# Same summary statistics of `length` as before, plus the row count via n().
gff %>%
  summarise(
    mean = mean(length),
    sd = sd(length),
    min = min(length),
    max = max(length),
    n = n()
  )
%>%
comes from the magrittr
package and works like the pipe |
in unix. group_by()
# Summary statistics of feature length, computed separately per feature type.
gff %>%
  # GFF coordinates are 1-based and inclusive, hence the + 1
  # (335..649 spans 315 bp, not 314).
  mutate(length = abs(stop - start) + 1) %>%
  group_by(feature) %>%
  summarise(
    mean = mean(length),
    sd = sd(length),
    min = min(length),
    max = max(length),
    n = n()
  )
filter()
# Per-feature length statistics after dropping four feature types.
gff %>%
  # Conditions separated by commas are combined with AND, so a row is kept
  # only if its feature is none of the four listed types.
  filter(
    feature != "mRNA",
    feature != "rRNA_gene",
    feature != "snoRNA_gene",
    feature != "snRNA_gene"
  ) %>%
  # GFF coordinates are 1-based and inclusive, hence the + 1
  # (335..649 spans 315 bp, not 314).
  mutate(length = abs(stop - start) + 1) %>%
  group_by(feature) %>%
  summarise(
    mean = mean(length),
    sd = sd(length),
    min = min(length),
    max = max(length),
    n = n()
  )
NOTE: ggplot uses +
not the pipe %>%
# Histogram (100 bins) of the `length` column for CDS rows only.
# The data flows into ggplot() through the pipe; the layers after that are
# added with `+`.
gff %>%
  filter(feature == "CDS") %>%
  ggplot(mapping = aes(x = length)) +
  geom_histogram(bins = 100)
Plot the population of each country in 1999 using %>% and
ggplot()
table1
# Bar chart of each country's population in 1999.
table1 %>%
  select(-cases) %>%
  # `year` is a numeric column, so compare it with a number rather than
  # relying on implicit coercion to the string "1999".
  filter(year == 1999) %>%
  ggplot(mapping = aes(x = country, y = population, fill = country)) +
  # geom_col() draws bars whose height is the y value, i.e. the same as
  # geom_bar(stat = "identity").
  geom_col()
stringr()
How do we get the gene names?
# Look at the free-text `info` column on its own.
gff %>%
  select(info)
separate()
# Split the `info` field of the gene rows into its parts and show the two
# candidate description columns.
gff %>%
  # GFF coordinates are 1-based and inclusive, hence the + 1.
  mutate(length = abs(stop - start) + 1) %>%
  filter(feature == "gene") %>%
  # Split on ";" into at most five pieces; anything beyond the fourth ";"
  # stays together in info5.
  separate(
    col = "info",
    into = c("info1", "info2", "info3", "info4", "info5"),
    sep = ";",
    extra = "merge"
  ) %>%
  separate(col = "info1", into = c("junk", "Systematic_name"), sep = ":") %>%
  # Not every gene has a "Name=" entry, so the position of "description="
  # shifts between info3 and info4. fill = "right" states explicitly that a
  # missing piece becomes NA (same values as the default, without warnings).
  separate(
    col = "info2", into = c("junk2", "Gene"),
    sep = "Name=", fill = "right"
  ) %>%
  separate(
    col = "info3", into = c("junk3", "Description1"),
    sep = "description=", fill = "right"
  ) %>%
  separate(
    col = "info4", into = c("junk4", "Description2"),
    sep = "description=", fill = "right"
  ) %>%
  select(Description1, Description2)
unite()
# Same splitting of `info` as in the separate() example, then glue the two
# candidate description columns back together with unite().
gff %>%
  # GFF coordinates are 1-based and inclusive, hence the + 1.
  mutate(length = abs(stop - start) + 1) %>%
  filter(feature == "gene") %>%
  separate(
    col = "info",
    into = c("info1", "info2", "info3", "info4", "info5"),
    sep = ";",
    extra = "merge"
  ) %>%
  separate(col = "info1", into = c("junk", "Systematic_name"), sep = ":") %>%
  # fill = "right" states explicitly that a missing piece becomes NA
  # (same values as the default, without the warnings).
  separate(
    col = "info2", into = c("junk2", "Gene"),
    sep = "Name=", fill = "right"
  ) %>%
  separate(
    col = "info3", into = c("junk3", "Description1"),
    sep = "description=", fill = "right"
  ) %>%
  separate(
    col = "info4", into = c("junk4", "Description2"),
    sep = "description=", fill = "right"
  ) %>%
  # Paste Description1 and Description2 into one column, separated by ":".
  unite(Description, Description1, Description2, sep = ":") %>%
  select(Description)
A general rule: if you are piping more than 10 steps, save the result as a new variable.
# Build a three-column gene table (systematic name, gene name, description)
# from the `info` field of the gene rows.
gff_clean <- gff %>%
  # GFF coordinates are 1-based and inclusive, hence the + 1.
  mutate(length = abs(stop - start) + 1) %>%
  filter(feature == "gene") %>%
  separate(
    col = "info",
    into = c("info1", "info2", "info3", "info4", "info5"),
    sep = ";",
    extra = "merge"
  ) %>%
  separate(col = "info1", into = c("junk", "Systematic_name"), sep = ":") %>%
  # fill = "right" states explicitly that a missing piece becomes NA
  # (same values as the default, without the warnings).
  separate(
    col = "info2", into = c("junk2", "Gene"),
    sep = "Name=", fill = "right"
  ) %>%
  separate(
    col = "info3", into = c("junk3", "Description1"),
    sep = "description=", fill = "right"
  ) %>%
  separate(
    col = "info4", into = c("junk4", "Description2"),
    sep = "description=", fill = "right"
  ) %>%
  # For each gene only one of Description1/Description2 is filled. Without
  # na.rm the missing one is pasted in as the literal text "NA"; stripping
  # that afterwards with the pattern "^NA" leaves a trailing "NA" behind and
  # also eats the first two letters of real descriptions such as "NADPH-...".
  # Dropping the missing piece here avoids both problems.
  unite(Description, Description1, Description2, sep = "", na.rm = TRUE) %>%
  select(Systematic_name, Gene, Description)
# stringr()
# Remove the percent-encoded ";" (%3B) and "," (%2C) from the descriptions.
gff_clean$Description <- str_replace_all(gff_clean$Description, "%3B", "")
gff_clean$Description <- str_replace_all(gff_clean$Description, "%2C", "")
# Inspect the cleaned-up description column.
select(gff_clean, Description)
NA
# Save the gene table as a tab-separated file; missing values are written
# as the text "NA".
gff_clean %>%
  write_tsv("Yeast_genes.txt", na = "NA")
A mutating join allows you to combine variables from two tables by matching observations by their keys.
An inner join matches pairs of observations from two tables whenever their keys are equal.
An outer join keeps observations that appear in at least one of the tables.
A filtering join affects (filters) the observations, not the variables.
str(data)
spec_tbl_df [5,850 × 57] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Syst : chr [1:5850] "Q0045" "Q0050" "Q0055" "Q0060" ...
$ Gene : chr [1:5850] "COX1" "AI1" "AI2" "AI3" ...
$ CodingLength : num [1:5850] 3.21 3.4 3.41 3.1 3.22 ...
$ X3primeUTRLength : num [1:5850] 2.04 NA NA NA 2.33 ...
$ X5primeUTRLength : num [1:5850] NA NA NA NA NA ...
$ X5primeUTR_GC : num [1:5850] NA NA NA NA NA ...
$ X3primeUTR_GC : num [1:5850] 0.055 NA NA NA 0.234 ...
$ mRNA_abundance : num [1:5850] -1.65 -1.64 -2.22 -2.74 -2.74 ...
$ Protein_per_cell : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Ribosome_density : num [1:5850] NA NA NA NA NA ...
$ Transcription_Rate: num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ codingGCposition1 : num [1:5850] 0.381 0.309 0.311 0.305 0.309 0.303 0.268 0.204 0.285 0.36 ...
$ codingGCposition2 : num [1:5850] 0.372 0.329 0.329 0.284 0.312 0.309 0.2 0.204 0.315 0.332 ...
$ WobbleGC : num [1:5850] 0.144 0.11 0.156 0.06 0.083 0.074 0.028 0.163 0.112 0.14 ...
$ Protein_halflife : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ deltaG : num [1:5850] -480 -661 -756 -309 -449 ...
$ CAI : num [1:5850] -0.853 -0.911 -0.807 -0.915 -0.916 ...
$ nTE : num [1:5850] 0.152 0.134 0.144 0.128 0.133 ...
$ Protein_per_mRNA : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Munchel : num [1:5850] 0.0354 NA 0.0378 0.0334 0.0301 ...
$ Miller : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Wang : num [1:5850] 0.00815 0.01136 0.01386 0.00866 0.00856 ...
$ Grigul : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Garcia_Martinez : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Shalem : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Presnyak : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Neymotin : num [1:5850] NA 0.0561 0.0368 0.0842 NA NA 0.0857 NA NA 0.0205 ...
$ Bfr1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Cbc2 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Cbf5 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Gbp2 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Hrb1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Khd1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Msl5 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Nab2 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Nab3 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Nab6 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Npl3 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Nrd1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Nsr1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Pab1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Pin4 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Pub1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Puf1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Puf2 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Puf3 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Puf4 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Puf5 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Scp160 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Sik1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Ski2 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Ssd1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Tdh3 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Vts1 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Yll032c : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Ypl184c : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
$ Yra2 : num [1:5850] NA NA NA NA NA NA NA NA NA NA ...
- attr(*, "spec")=
.. cols(
.. Syst = col_character(),
.. Gene = col_character(),
.. CodingLength = col_double(),
.. X3primeUTRLength = col_double(),
.. X5primeUTRLength = col_double(),
.. X5primeUTR_GC = col_double(),
.. X3primeUTR_GC = col_double(),
.. mRNA_abundance = col_double(),
.. Protein_per_cell = col_double(),
.. Ribosome_density = col_double(),
.. Transcription_Rate = col_double(),
.. codingGCposition1 = col_double(),
.. codingGCposition2 = col_double(),
.. WobbleGC = col_double(),
.. Protein_halflife = col_double(),
.. deltaG = col_double(),
.. CAI = col_double(),
.. nTE = col_double(),
.. Protein_per_mRNA = col_double(),
.. Munchel = col_double(),
.. Miller = col_double(),
.. Wang = col_double(),
.. Grigul = col_double(),
.. Garcia_Martinez = col_double(),
.. Shalem = col_double(),
.. Presnyak = col_double(),
.. Neymotin = col_double(),
.. Bfr1 = col_double(),
.. Cbc2 = col_double(),
.. Cbf5 = col_double(),
.. Gbp2 = col_double(),
.. Hrb1 = col_double(),
.. Khd1 = col_double(),
.. Msl5 = col_double(),
.. Nab2 = col_double(),
.. Nab3 = col_double(),
.. Nab6 = col_double(),
.. Npl3 = col_double(),
.. Nrd1 = col_double(),
.. Nsr1 = col_double(),
.. Pab1 = col_double(),
.. Pin4 = col_double(),
.. Pub1 = col_double(),
.. Puf1 = col_double(),
.. Puf2 = col_double(),
.. Puf3 = col_double(),
.. Puf4 = col_double(),
.. Puf5 = col_double(),
.. Scp160 = col_double(),
.. Sik1 = col_double(),
.. Ski2 = col_double(),
.. Ssd1 = col_double(),
.. Tdh3 = col_double(),
.. Vts1 = col_double(),
.. Yll032c = col_double(),
.. Ypl184c = col_double(),
.. Yra2 = col_double()
.. )
str(gff_clean)
tibble [6,692 × 3] (S3: tbl_df/tbl/data.frame)
$ Systematic_name: chr [1:6692] "YAL069W" "YAL068W-A" "YAL068C" "YAL067W-A" ...
$ Gene : chr [1:6692] NA NA "PAU8" NA ...
$ Description : chr [1:6692] "Dubious open reading frame unlikely to encode a functional protein based on available experimental and comparat"| __truncated__ "Dubious open reading frame unlikely to encode a functional protein based on available experimental and comparat"| __truncated__ "Protein of unknown function member of the seripauperin multigene family encoded mainly in subtelomeric regions "| __truncated__ "Putative protein of unknown function identified by gene-trapping microarray-based expression analysis and genom"| __truncated__ ...
dplyr::left_join(a, b, by = "x1")
Join matching rows from b to a.
# Attach the columns of `data` to every row of gff_clean, matching
# gff_clean$Systematic_name against data$Syst, and show the structure of the
# result. Both tables have a `Gene` column, so the joined table carries
# Gene.x (from gff_clean) and Gene.y (from data).
gff_clean %>%
  left_join(data, by = c("Systematic_name" = "Syst")) %>%
  str()
tibble [6,695 × 59] (S3: tbl_df/tbl/data.frame)
$ Systematic_name : chr [1:6695] "YAL069W" "YAL068W-A" "YAL068C" "YAL067W-A" ...
$ Gene.x : chr [1:6695] NA NA "PAU8" NA ...
$ Description : chr [1:6695] "Dubious open reading frame unlikely to encode a functional protein based on available experimental and comparat"| __truncated__ "Dubious open reading frame unlikely to encode a functional protein based on available experimental and comparat"| __truncated__ "Protein of unknown function member of the seripauperin multigene family encoded mainly in subtelomeric regions "| __truncated__ "Putative protein of unknown function identified by gene-trapping microarray-based expression analysis and genom"| __truncated__ ...
$ Gene.y : chr [1:6695] NA NA "PAU8" "YAL067W-A" ...
$ CodingLength : num [1:6695] NA NA 2.56 2.36 3.25 ...
$ X3primeUTRLength : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ X5primeUTRLength : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ X5primeUTR_GC : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ X3primeUTR_GC : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ mRNA_abundance : num [1:6695] NA NA NA -2.92082 0.00423 ...
$ Protein_per_cell : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ Ribosome_density : num [1:6695] NA NA NA NA 2.23 ...
$ Transcription_Rate: num [1:6695] NA NA NA NA NA ...
$ codingGCposition1 : num [1:6695] NA NA 0.488 0.421 0.396 NA 0.38 0.63 0.48 0.432 ...
$ codingGCposition2 : num [1:6695] NA NA 0.471 0.368 0.357 NA 0.581 0.362 0.449 0.347 ...
$ WobbleGC : num [1:6695] NA NA 0.529 0.447 0.33 NA 0.364 0.37 0.386 0.326 ...
$ Protein_halflife : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ deltaG : num [1:6695] NA NA -110.9 -63.4 -569.7 ...
$ CAI : num [1:6695] NA NA -0.202 -1.178 -0.735 ...
$ nTE : num [1:6695] NA NA 0.241 0.142 0.161 ...
$ Protein_per_mRNA : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ Munchel : num [1:6695] NA NA 0.061 NA NA ...
$ Miller : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ Wang : num [1:6695] NA NA NA NA NA ...
$ Grigul : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ Garcia_Martinez : num [1:6695] NA NA NA NA NA ...
$ Shalem : num [1:6695] NA NA 0.00869 0.02155 0.02005 ...
$ Presnyak : num [1:6695] NA NA NA NA NA NA NA NA NA NA ...
$ Neymotin : num [1:6695] NA NA NA NA NA ...
$ Bfr1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Cbc2 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Cbf5 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Gbp2 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Hrb1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Khd1 : num [1:6695] NA NA 0 NA 0 NA 1 0 1 0 ...
$ Msl5 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Nab2 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Nab3 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Nab6 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Npl3 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Nrd1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Nsr1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Pab1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Pin4 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Pub1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Puf1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Puf2 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Puf3 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Puf4 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Puf5 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Scp160 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Sik1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Ski2 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Ssd1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Tdh3 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Vts1 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Yll032c : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Ypl184c : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
$ Yra2 : num [1:6695] NA NA 0 NA 0 NA 0 0 0 0 ...
Tidy data don’t allow easy correlation plots, so we need to rearrange the data.
# Long-format copy of gene_expression: the columns t0 through t2 become
# `timepoint` / `expression` pairs.
tidy_gene_expression <- gather(
  gene_expression,
  key = "timepoint",
  value = "expression",
  t0:t2
)
tidy_gene_expression
# Count the rows per chromosome and draw one bar per feature type,
# placed side by side.
yeast_features %>%
  ggplot(aes(x = chromosome, fill = feature)) +
  geom_bar(position = "dodge")