Data Visualization with ggplot2 in R

ggplot2 is a powerful data visualization package in R that allows you to create complex and aesthetically pleasing visualizations using a simple and consistent syntax. This course aims to provide a detailed guide to ggplot2, from basic concepts to advanced techniques, along with hands-on practice to help you master this versatile package.

Import libraries

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Key components

Every ggplot2 plot has three key components:

  1. Data,

  2. A set of aesthetic mappings between variables in the data and visual properties, and

  3. At least one layer which describes how to render each observation. Layers are usually created with a geom function.

Here's a simple example:

ggplot(mpg, aes(x = displ, y = hwy)) + 
  geom_point()

Color, size, shape and other aesthetic attributes

  • aes(displ, hwy, colour = class)

  • aes(displ, hwy, shape = drv)

  • aes(displ, hwy, size = cyl)

ggplot(mpg, aes(displ, hwy, colour = class)) + 
  geom_point()

# Mapping: inside aes(), "blue" is treated as a constant data value, so the
# points get the default discrete colour and a legend labelled "blue".
ggplot(mpg, aes(displ, hwy)) + 
  geom_point(aes(colour = "blue"))

# Setting: outside aes(), colour = "blue" is a fixed visual property, so the
# points are actually drawn in blue and no legend is added.
ggplot(mpg, aes(displ, hwy)) + 
  geom_point(colour = "blue")

Faceting

ggplot(mpg, aes(displ, hwy)) + 
  geom_point() + 
  facet_wrap(~class)

One variable (Discrete)

b <- ggplot(mpg, aes(fl))
b + geom_bar()

One variable (Cont.)

a <- ggplot(mpg, aes(hwy))

a + geom_area(stat = "bin")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_density(kernel = "gaussian")

a + geom_dotplot()
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

a + geom_freqpoly()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(binwidth = 5)

Two variables (Cont. & Cont.)

f <- ggplot(mpg, aes(cty, hwy))
f + geom_blank()

f + geom_jitter()

f + geom_point()

# install.packages("quantreg")
library(quantreg)
Loading required package: SparseM

Attaching package: 'SparseM'
The following object is masked from 'package:base':

    backsolve
f + geom_quantile() + 
  geom_jitter()
Smoothing formula not specified. Using: y ~ x

f + geom_rug(sides = "bl") + 
  geom_jitter()

f + geom_rug(sides = "bl") + 
  geom_point()

# Overlay a linear-model fit. The fitting function is selected with `method`;
# `model` is not a geom_smooth() parameter and would be ignored (falling back
# to the default loess smoother).
f + geom_smooth(method = "lm") +
  geom_point()
`geom_smooth()` using formula = 'y ~ x'

f + geom_text(aes(label = cty)) + 
  geom_jitter()

f + geom_text(aes(label = fl))

# install.packages("ggimage")
library(ggimage)

img <- list.files(system.file("extdata", 
                              package="ggimage"),
                  pattern="png", full.names=TRUE)

f + geom_image(aes(image=img[2]))

Two variables (Discrete & Cont.)

g <- ggplot(mpg, aes(class, hwy))

levels(as.factor(mpg$class))
[1] "2seater"    "compact"    "midsize"    "minivan"    "pickup"    
[6] "subcompact" "suv"       
str(mpg$class)
 chr [1:234] "compact" "compact" "compact" "compact" "compact" "compact" ...
levels(as.factor(mpg$class))
[1] "2seater"    "compact"    "midsize"    "minivan"    "pickup"    
[6] "subcompact" "suv"       
unique(mpg$class)
[1] "compact"    "midsize"    "suv"        "2seater"    "minivan"   
[6] "pickup"     "subcompact"
g + geom_bar(stat = "identity")

g + geom_boxplot() 

# Let's specify some cars: the ten rows with the highest highway mileage.
# No group_by() is needed: arrange() ignores grouping by default and head()
# takes the first rows overall, so grouping only left a grouped tibble behind.
text_in_graph <- mpg %>%
  select(manufacturer, class, hwy) %>%
  arrange(desc(hwy)) %>%
  head(10)
text_in_graph
# A tibble: 10 × 3
   manufacturer class        hwy
   <chr>        <chr>      <int>
 1 volkswagen   compact       44
 2 volkswagen   subcompact    44
 3 volkswagen   subcompact    41
 4 toyota       compact       37
 5 honda        subcompact    36
 6 honda        subcompact    36
 7 toyota       compact       35
 8 toyota       compact       35
 9 honda        subcompact    34
10 honda        subcompact    33
g + geom_boxplot() +
  geom_text(data=text_in_graph, 
            aes(label = manufacturer))

g + geom_dotplot(binaxis = "y",
                 stackdir = "center")
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

g + geom_violin(scale = "area")

Two variables (Discrete & Discrete)

head(diamonds)
# A tibble: 6 × 10
  carat cut       color clarity depth table price     x     y     z
  <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
h <- ggplot(diamonds, aes(cut, color))
h + geom_jitter()

Continuous Bivariate Distribution

# install.packages("ggplot2movies")
library(ggplot2movies)
i <- ggplot(movies, aes(year, rating))
i + geom_bin2d(binwidth = c(5, 0.5))

i + geom_density2d()

# install.packages("hexbin")
library(hexbin)
i + geom_hex()

Continuous functions (time-series)

j <- ggplot(economics, aes(date, unemploy))
j + geom_area()

j + geom_line()

j + geom_step(direction = "hv")

Visualizing bars with errors

# Visualizing error
df <- data.frame(grp = c("A", "B"), fit = 4:5, se = 1:2)
k <- ggplot(df, aes(grp, fit, ymin = fit-se, ymax = fit+se))

k + geom_crossbar(fatten = 2)

k + geom_errorbar()

k + geom_linerange()

k + geom_pointrange()

Three variables

# z is the Euclidean length of each (delta_long, delta_lat) displacement
seals$z <- sqrt(seals$delta_long^2 + seals$delta_lat^2)
m <- ggplot(seals, aes(long, lat))

# Rectangular tiles coloured by z
m + geom_tile(aes(fill = z))

# Contour lines of z
m + geom_contour(aes(z = z))

# Raster cells coloured by z
m + geom_raster(
  aes(fill = z),
  hjust = 0.5,
  vjust = 0.5,
  interpolate = FALSE
)

Scales

n <- b + geom_bar(aes(fill = fl))
n

n + scale_fill_manual(
  values = c("skyblue", "royalblue", "blue", "navy"),
  limits = c("d", "e", "p", "r"), breaks =c("d", "e", "p", "r"),
  name = "fuel", labels = c("D", "E", "P", "R"))

# Color and fill scales
n <- b + geom_bar(aes(fill = fl))
# after_stat(x) maps fill to the x value computed by the stat; it replaces the
# `..x..` dot-dot notation, which was deprecated in ggplot2 3.4.0.
o <- a + geom_dotplot(aes(fill = after_stat(x)))

# install.packages("RColorBrewer")
library(RColorBrewer)
n + scale_fill_brewer(palette = "Blues")

display.brewer.all()

n + scale_fill_grey(
  start = 0.2, end = 0.8,
  na.value = "red")

o + scale_fill_gradient(
  low = "red",
  high = "yellow")
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

o + scale_fill_gradientn(
  colours = terrain.colors(6))
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

# Also: rainbow(), heat.colors(),
# topo.colors(), cm.colors(),
# RColorBrewer::brewer.pal()

# Shape scales
f

p <- f + geom_point(aes(shape = fl))
p

p + scale_shape(solid = FALSE)

p + scale_shape_manual(values = c(3:7))

# Size scales
q <- f + geom_point(aes(size = cyl))

Coordinate systems

r <- b+geom_bar()
r + coord_cartesian(xlim = c(0, 5))

r + coord_fixed(ratio = 1/2)

r + coord_fixed(ratio = 1/10)

r + coord_fixed(ratio = 1/100)

r + coord_flip()

r + coord_polar(theta = "x", direction=1 )

Position adjustments

s <- ggplot(mpg, aes(fl, fill = drv))

# Arrange elements side by side
s + geom_bar(position = "dodge")

# Stack elements on top of one another, normalize height
s + geom_bar(position = "fill")

# Stack elements on top of one another
s + geom_bar(position = "stack")

# Add random noise to X and Y position of each element to avoid overplotting
f + geom_point(position = "jitter")

Themes

# Theme
r + theme_bw()

r + theme_classic()

r + theme_grey()

r + theme_minimal()

Faceting

# Faceting

t <- ggplot(mpg, aes(cty, hwy)) + geom_point()
# facet into columns based on fl
t + facet_grid(. ~ fl)

# facet into rows based on fl
t + facet_grid(fl ~ .)

# facet into rows based on year
t + facet_grid(year ~ .)

# facet into both rows and columns
t + facet_grid(year ~ fl)

# wrap facets into a rectangular layout
t + facet_wrap(~ fl)

Labels

# Labels
# Add a main title above the plot
t + ggtitle("New Plot Title ")

# Change the label on the X axis
t + xlab("New X label")

# Change the label on the Y axis
t + ylab("New Y label")

# All of the above
t + labs(title =" New title", x = "New x", y = "New y")

Quiz

Question: Using the mpg dataset in R, create a horizontal bar chart that displays the average highway miles per gallon (MPG) for each car manufacturer. Arrange the data in descending order based on the average highway MPG. Use a gradient color scale ranging from red (low values) to green (high values) for the bars. Remove the x-axis label.

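Hint: Use dplyr and ggplot2 functions such as group_by(), summarise(), arrange(), reorder(), aes(), geom_bar(stat = "identity") (or geom_col()), coord_flip(), and scale_fill_gradient(). Note that arrange() alone does not change the order in which ggplot2 draws the bars; use reorder() inside aes() to sort them by the average highway MPG.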