Data Exploration (2)

Data Wrangling with tidyverse

Weekly design


Pre-class video


Data wrangling



# 02 Data processing using Base R #

library(gapminder)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
glimpse(gapminder)
Rows: 1,704
Columns: 6
$ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
gapminder[, c("country", "lifeExp")]
# A tibble: 1,704 × 2
   country     lifeExp
   <fct>         <dbl>
 1 Afghanistan    28.8
 2 Afghanistan    30.3
 3 Afghanistan    32.0
 4 Afghanistan    34.0
 5 Afghanistan    36.1
 6 Afghanistan    38.4
 7 Afghanistan    39.9
 8 Afghanistan    40.8
 9 Afghanistan    41.7
10 Afghanistan    41.8
# ℹ 1,694 more rows
gapminder[, c("country", "lifeExp", "year")]
# A tibble: 1,704 × 3
   country     lifeExp  year
   <fct>         <dbl> <int>
 1 Afghanistan    28.8  1952
 2 Afghanistan    30.3  1957
 3 Afghanistan    32.0  1962
 4 Afghanistan    34.0  1967
 5 Afghanistan    36.1  1972
 6 Afghanistan    38.4  1977
 7 Afghanistan    39.9  1982
 8 Afghanistan    40.8  1987
 9 Afghanistan    41.7  1992
10 Afghanistan    41.8  1997
# ℹ 1,694 more rows
gapminder[1:15, ]
# A tibble: 15 × 6
   country     continent  year lifeExp      pop gdpPercap
   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
 1 Afghanistan Asia       1952    28.8  8425333      779.
 2 Afghanistan Asia       1957    30.3  9240934      821.
 3 Afghanistan Asia       1962    32.0 10267083      853.
 4 Afghanistan Asia       1967    34.0 11537966      836.
 5 Afghanistan Asia       1972    36.1 13079460      740.
 6 Afghanistan Asia       1977    38.4 14880372      786.
 7 Afghanistan Asia       1982    39.9 12881816      978.
 8 Afghanistan Asia       1987    40.8 13867957      852.
 9 Afghanistan Asia       1992    41.7 16317921      649.
10 Afghanistan Asia       1997    41.8 22227415      635.
11 Afghanistan Asia       2002    42.1 25268405      727.
12 Afghanistan Asia       2007    43.8 31889923      975.
13 Albania     Europe     1952    55.2  1282697     1601.
14 Albania     Europe     1957    59.3  1476505     1942.
15 Albania     Europe     1962    64.8  1728137     2313.
library(dplyr)
gapminder %>% filter(country=="Croatia") %>% select(year, gdpPercap) %>% plot

gapminder[gapminder$country == "Croatia", ]
# A tibble: 12 × 6
   country continent  year lifeExp     pop gdpPercap
   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
 1 Croatia Europe     1952    61.2 3882229     3119.
 2 Croatia Europe     1957    64.8 3991242     4338.
 3 Croatia Europe     1962    67.1 4076557     5478.
 4 Croatia Europe     1967    68.5 4174366     6960.
 5 Croatia Europe     1972    69.6 4225310     9164.
 6 Croatia Europe     1977    70.6 4318673    11305.
 7 Croatia Europe     1982    70.5 4413368    13222.
 8 Croatia Europe     1987    71.5 4484310    13823.
 9 Croatia Europe     1992    72.5 4494013     8448.
10 Croatia Europe     1997    73.7 4444595     9876.
11 Croatia Europe     2002    74.9 4481020    11628.
12 Croatia Europe     2007    75.7 4493312    14619.
gapminder[gapminder$country == "Korea, Rep.", ]
# A tibble: 12 × 6
   country     continent  year lifeExp      pop gdpPercap
   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
 1 Korea, Rep. Asia       1952    47.5 20947571     1031.
 2 Korea, Rep. Asia       1957    52.7 22611552     1488.
 3 Korea, Rep. Asia       1962    55.3 26420307     1536.
 4 Korea, Rep. Asia       1967    57.7 30131000     2029.
 5 Korea, Rep. Asia       1972    62.6 33505000     3031.
 6 Korea, Rep. Asia       1977    64.8 36436000     4657.
 7 Korea, Rep. Asia       1982    67.1 39326000     5623.
 8 Korea, Rep. Asia       1987    69.8 41622000     8533.
 9 Korea, Rep. Asia       1992    72.2 43805450    12104.
10 Korea, Rep. Asia       1997    74.6 46173816    15994.
11 Korea, Rep. Asia       2002    77.0 47969150    19234.
12 Korea, Rep. Asia       2007    78.6 49044790    23348.
levels(gapminder$country)
  [1] "Afghanistan"              "Albania"                 
  [3] "Algeria"                  "Angola"                  
  [5] "Argentina"                "Australia"               
  [7] "Austria"                  "Bahrain"                 
  [9] "Bangladesh"               "Belgium"                 
 [11] "Benin"                    "Bolivia"                 
 [13] "Bosnia and Herzegovina"   "Botswana"                
 [15] "Brazil"                   "Bulgaria"                
 [17] "Burkina Faso"             "Burundi"                 
 [19] "Cambodia"                 "Cameroon"                
 [21] "Canada"                   "Central African Republic"
 [23] "Chad"                     "Chile"                   
 [25] "China"                    "Colombia"                
 [27] "Comoros"                  "Congo, Dem. Rep."        
 [29] "Congo, Rep."              "Costa Rica"              
 [31] "Cote d'Ivoire"            "Croatia"                 
 [33] "Cuba"                     "Czech Republic"          
 [35] "Denmark"                  "Djibouti"                
 [37] "Dominican Republic"       "Ecuador"                 
 [39] "Egypt"                    "El Salvador"             
 [41] "Equatorial Guinea"        "Eritrea"                 
 [43] "Ethiopia"                 "Finland"                 
 [45] "France"                   "Gabon"                   
 [47] "Gambia"                   "Germany"                 
 [49] "Ghana"                    "Greece"                  
 [51] "Guatemala"                "Guinea"                  
 [53] "Guinea-Bissau"            "Haiti"                   
 [55] "Honduras"                 "Hong Kong, China"        
 [57] "Hungary"                  "Iceland"                 
 [59] "India"                    "Indonesia"               
 [61] "Iran"                     "Iraq"                    
 [63] "Ireland"                  "Israel"                  
 [65] "Italy"                    "Jamaica"                 
 [67] "Japan"                    "Jordan"                  
 [69] "Kenya"                    "Korea, Dem. Rep."        
 [71] "Korea, Rep."              "Kuwait"                  
 [73] "Lebanon"                  "Lesotho"                 
 [75] "Liberia"                  "Libya"                   
 [77] "Madagascar"               "Malawi"                  
 [79] "Malaysia"                 "Mali"                    
 [81] "Mauritania"               "Mauritius"               
 [83] "Mexico"                   "Mongolia"                
 [85] "Montenegro"               "Morocco"                 
 [87] "Mozambique"               "Myanmar"                 
 [89] "Namibia"                  "Nepal"                   
 [91] "Netherlands"              "New Zealand"             
 [93] "Nicaragua"                "Niger"                   
 [95] "Nigeria"                  "Norway"                  
 [97] "Oman"                     "Pakistan"                
 [99] "Panama"                   "Paraguay"                
[101] "Peru"                     "Philippines"             
[103] "Poland"                   "Portugal"                
[105] "Puerto Rico"              "Reunion"                 
[107] "Romania"                  "Rwanda"                  
[109] "Sao Tome and Principe"    "Saudi Arabia"            
[111] "Senegal"                  "Serbia"                  
[113] "Sierra Leone"             "Singapore"               
[115] "Slovak Republic"          "Slovenia"                
[117] "Somalia"                  "South Africa"            
[119] "Spain"                    "Sri Lanka"               
[121] "Sudan"                    "Swaziland"               
[123] "Sweden"                   "Switzerland"             
[125] "Syria"                    "Taiwan"                  
[127] "Tanzania"                 "Thailand"                
[129] "Togo"                     "Trinidad and Tobago"     
[131] "Tunisia"                  "Turkey"                  
[133] "Uganda"                   "United Kingdom"          
[135] "United States"            "Uruguay"                 
[137] "Venezuela"                "Vietnam"                 
[139] "West Bank and Gaza"       "Yemen, Rep."             
[141] "Zambia"                   "Zimbabwe"                
gapminder[gapminder$country == "Croatia", "pop"]
# A tibble: 12 × 1
       pop
     <int>
 1 3882229
 2 3991242
 3 4076557
 4 4174366
 5 4225310
 6 4318673
 7 4413368
 8 4484310
 9 4494013
10 4444595
11 4481020
12 4493312
gapminder[gapminder$country == "Croatia", c("lifeExp","pop")]
# A tibble: 12 × 2
   lifeExp     pop
     <dbl>   <int>
 1    61.2 3882229
 2    64.8 3991242
 3    67.1 4076557
 4    68.5 4174366
 5    69.6 4225310
 6    70.6 4318673
 7    70.5 4413368
 8    71.5 4484310
 9    72.5 4494013
10    73.7 4444595
11    74.9 4481020
12    75.7 4493312
gapminder[gapminder$country == "Croatia" & #Croatia extraction
             gapminder$year > 1990, #1990 after
           c("lifeExp","pop")] # those variables
# A tibble: 4 × 2
  lifeExp     pop
    <dbl>   <int>
1    72.5 4494013
2    73.7 4444595
3    74.9 4481020
4    75.7 4493312
apply(gapminder[gapminder$country == "Croatia",
                 c("lifeExp","pop")],
       2, mean)
     lifeExp          pop 
7.005592e+01 4.289916e+06 
apply(gapminder[gapminder$country == "Korea, Rep.",
                 c("lifeExp","pop")],
       2, mean)
     lifeExp          pop 
      65.001 36499386.333 
# 03 Data processing using the dplyr library #
select(gapminder, country, year, lifeExp)
# A tibble: 1,704 × 3
   country      year lifeExp
   <fct>       <int>   <dbl>
 1 Afghanistan  1952    28.8
 2 Afghanistan  1957    30.3
 3 Afghanistan  1962    32.0
 4 Afghanistan  1967    34.0
 5 Afghanistan  1972    36.1
 6 Afghanistan  1977    38.4
 7 Afghanistan  1982    39.9
 8 Afghanistan  1987    40.8
 9 Afghanistan  1992    41.7
10 Afghanistan  1997    41.8
# ℹ 1,694 more rows
filter(gapminder, country == "Croatia")
# A tibble: 12 × 6
   country continent  year lifeExp     pop gdpPercap
   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
 1 Croatia Europe     1952    61.2 3882229     3119.
 2 Croatia Europe     1957    64.8 3991242     4338.
 3 Croatia Europe     1962    67.1 4076557     5478.
 4 Croatia Europe     1967    68.5 4174366     6960.
 5 Croatia Europe     1972    69.6 4225310     9164.
 6 Croatia Europe     1977    70.6 4318673    11305.
 7 Croatia Europe     1982    70.5 4413368    13222.
 8 Croatia Europe     1987    71.5 4484310    13823.
 9 Croatia Europe     1992    72.5 4494013     8448.
10 Croatia Europe     1997    73.7 4444595     9876.
11 Croatia Europe     2002    74.9 4481020    11628.
12 Croatia Europe     2007    75.7 4493312    14619.
summarize(gapminder, pop_avg = mean(pop))
# A tibble: 1 × 1
    pop_avg
      <dbl>
1 29601212.
summarize(group_by(gapminder, continent), pop_avg = mean(pop))
# A tibble: 5 × 2
  continent   pop_avg
  <fct>         <dbl>
1 Africa     9916003.
2 Americas  24504795.
3 Asia      77038722.
4 Europe    17169765.
5 Oceania    8874672.
summarize(group_by(gapminder, continent, country), pop_avg = mean(pop))
`summarise()` has grouped output by 'continent'. You can override using the
`.groups` argument.
# A tibble: 142 × 3
# Groups:   continent [5]
   continent country                    pop_avg
   <fct>     <fct>                        <dbl>
 1 Africa    Algeria                  19875406.
 2 Africa    Angola                    7309390.
 3 Africa    Benin                     4017497.
 4 Africa    Botswana                   971186.
 5 Africa    Burkina Faso              7548677.
 6 Africa    Burundi                   4651608.
 7 Africa    Cameroon                  9816648.
 8 Africa    Central African Republic  2560963 
 9 Africa    Chad                      5329256.
10 Africa    Comoros                    361684.
# ℹ 132 more rows
gapminder %>%
   group_by(continent, country) %>%
   summarize(pop_avg = mean(pop))
`summarise()` has grouped output by 'continent'. You can override using the
`.groups` argument.
# A tibble: 142 × 3
# Groups:   continent [5]
   continent country                    pop_avg
   <fct>     <fct>                        <dbl>
 1 Africa    Algeria                  19875406.
 2 Africa    Angola                    7309390.
 3 Africa    Benin                     4017497.
 4 Africa    Botswana                   971186.
 5 Africa    Burkina Faso              7548677.
 6 Africa    Burundi                   4651608.
 7 Africa    Cameroon                  9816648.
 8 Africa    Central African Republic  2560963 
 9 Africa    Chad                      5329256.
10 Africa    Comoros                    361684.
# ℹ 132 more rows
temp1 = filter(gapminder, country == "Croatia")
temp2 = select(temp1, country, year, lifeExp)
temp3 = apply(temp2[ , c("lifeExp")], 2, mean)
temp3
 lifeExp 
70.05592 
gapminder %>%
   filter(country == "Croatia") %>%
   select(country, year, lifeExp) %>%
   summarize(lifeExp_avg = mean(lifeExp))
# A tibble: 1 × 1
  lifeExp_avg
        <dbl>
1        70.1

data in need: avocado.csv

# 04 The reality of data processing #
library(ggplot2)
avocado <- read.csv("data/avocado.csv", header=TRUE, sep = ",")

str(avocado)
'data.frame':   18249 obs. of  14 variables:
 $ X           : int  0 1 2 3 4 5 6 7 8 9 ...
 $ Date        : chr  "2015-12-27" "2015-12-20" "2015-12-13" "2015-12-06" ...
 $ AveragePrice: num  1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
 $ Total.Volume: num  64237 54877 118220 78992 51040 ...
 $ X4046       : num  1037 674 795 1132 941 ...
 $ X4225       : num  54455 44639 109150 71976 43838 ...
 $ X4770       : num  48.2 58.3 130.5 72.6 75.8 ...
 $ Total.Bags  : num  8697 9506 8145 5811 6184 ...
 $ Small.Bags  : num  8604 9408 8042 5677 5986 ...
 $ Large.Bags  : num  93.2 97.5 103.1 133.8 197.7 ...
 $ XLarge.Bags : num  0 0 0 0 0 0 0 0 0 0 ...
 $ type        : chr  "conventional" "conventional" "conventional" "conventional" ...
 $ year        : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
 $ region      : chr  "Albany" "Albany" "Albany" "Albany" ...
(x_avg = avocado %>% group_by(region) %>% summarize(V_avg = mean(Total.Volume), P_avg = mean(AveragePrice)))
# A tibble: 54 × 3
   region                 V_avg P_avg
   <chr>                  <dbl> <dbl>
 1 Albany                47538.  1.56
 2 Atlanta              262145.  1.34
 3 BaltimoreWashington  398562.  1.53
 4 Boise                 42643.  1.35
 5 Boston               287793.  1.53
 6 BuffaloRochester      67936.  1.52
 7 California          3044324.  1.40
 8 Charlotte            105194.  1.61
 9 Chicago              395569.  1.56
10 CincinnatiDayton     131722.  1.21
# ℹ 44 more rows
(x_avg = avocado %>% group_by(region, year) %>% summarize(V_avg = mean(Total.Volume), P_avg = mean(AveragePrice)))
`summarise()` has grouped output by 'region'. You can override using the
`.groups` argument.
# A tibble: 216 × 4
# Groups:   region [54]
   region               year   V_avg P_avg
   <chr>               <int>   <dbl> <dbl>
 1 Albany               2015  38749.  1.54
 2 Albany               2016  50619.  1.53
 3 Albany               2017  49355.  1.64
 4 Albany               2018  64249.  1.44
 5 Atlanta              2015 223382.  1.38
 6 Atlanta              2016 272374.  1.21
 7 Atlanta              2017 271841.  1.43
 8 Atlanta              2018 342976.  1.29
 9 BaltimoreWashington  2015 390823.  1.37
10 BaltimoreWashington  2016 393210.  1.59
# ℹ 206 more rows
x_avg = avocado %>% group_by(region, year, type) %>% summarize(V_avg = mean(Total.Volume), P_avg = mean(AveragePrice))
`summarise()` has grouped output by 'region', 'year'. You can override using
the `.groups` argument.
avocado %>%
   group_by(region, year, type) %>%
   summarize(V_avg = mean(Total.Volume),
             P_avg = mean(AveragePrice)) -> x_avg
`summarise()` has grouped output by 'region', 'year'. You can override using
the `.groups` argument.
x_avg %>% filter(region != "TotalUS") %>% 
  ggplot(aes(year, V_avg, col = type)) + geom_line() + facet_wrap(~region)

# install.packages("ggplot2")
library(ggplot2)

arrange(x_avg, desc(V_avg))
# A tibble: 432 × 5
# Groups:   region, year [216]
   region        year type             V_avg P_avg
   <chr>        <int> <chr>            <dbl> <dbl>
 1 TotalUS       2018 conventional 42125533. 1.06 
 2 TotalUS       2016 conventional 34043450. 1.05 
 3 TotalUS       2017 conventional 33995658. 1.22 
 4 TotalUS       2015 conventional 31224729. 1.01 
 5 SouthCentral  2018 conventional  7465557. 0.806
 6 West          2018 conventional  7451445. 0.981
 7 California    2018 conventional  6786962. 1.08 
 8 West          2016 conventional  6404892. 0.916
 9 West          2017 conventional  6279482. 1.10 
10 California    2016 conventional  6105539. 1.05 
# ℹ 422 more rows
x_avg1 = x_avg %>% filter(region != "TotalUS")
# After excluding TotalUS, you can process it using statistical functions directly.

x_avg1[x_avg1$V_avg == max(x_avg1$V_avg),]
# A tibble: 1 × 5
# Groups:   region, year [1]
  region        year type            V_avg P_avg
  <chr>        <int> <chr>           <dbl> <dbl>
1 SouthCentral  2018 conventional 7465557. 0.806
# install.packages("lubridate")
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
(x_avg = avocado %>% 
    group_by(region, year, month(Date), type) %>% 
    summarize(V_avg = mean(Total.Volume), 
              P_avg = mean(AveragePrice)))
`summarise()` has grouped output by 'region', 'year', 'month(Date)'. You can
override using the `.groups` argument.
# A tibble: 4,212 × 6
# Groups:   region, year, month(Date) [2,106]
   region  year `month(Date)` type          V_avg P_avg
   <chr>  <int>         <dbl> <chr>         <dbl> <dbl>
 1 Albany  2015             1 conventional 42932.  1.17
 2 Albany  2015             1 organic       1198.  1.84
 3 Albany  2015             2 conventional 52343.  1.03
 4 Albany  2015             2 organic       1334.  1.76
 5 Albany  2015             3 conventional 50659.  1.06
 6 Albany  2015             3 organic       1444.  1.83
 7 Albany  2015             4 conventional 48594.  1.17
 8 Albany  2015             4 organic       1402.  1.89
 9 Albany  2015             5 conventional 97216.  1.26
10 Albany  2015             5 organic       1836.  1.94
# ℹ 4,202 more rows

Class


Introduction to Tidyverse

The tidyverse is a powerful collection  of  R  packages  that are actually  data tools for transforming and visualizing data. All packages of the tidyverse share an underlying philosophy and common APls.

The core packages are: 

  • ggplot2, which implements the grammar of graphics. You can use it to visualize your data.

  • dplyr is a grammar of data You can use it to solve the most common data manipulation challenges.

  • tidyr helps you to create tidy data or data where each variable is in a column, each observation is a row end each value is a column, each observation is a row end each value is a cell. 

  • readr is a fast and friendly way to read rectangular

  • purrr enhances R’s functional programming (FP)toolkit by providing a complete and consistent set of tools for working with functions and vectors. 

  • tibble is a modern re-imaginging of the data

  • stringr provides a cohesive set of functions designed to make working with strings as easy as possible

  • forcats provide a suite of useful  tools that solve common problems with factors.  



The introduction of the tidyverse package in R has significantly influenced the way data science is performed using R, impacting coding practices, data analysis methodologies, and the overall approach to data manipulation and visualization.

Before Tidyverse

Before the tidyverse, R programming was largely centered around base R functions and packages. This included using base R functions for data manipulation (like subset, merge, and apply functions) and visualization (such as plotting with plot and hist). The syntax and methods varied widely across different packages, which often led to inconsistent coding practices and a steeper learning curve for beginners. Each task could be approached in multiple ways without a clear ‘best’ method, leading to fragmented and less readable code.

After Tidyverse

The tidyverse, developed by Hadley Wickham and others, brought a suite of packages designed to work harmoniously together using a consistent syntax and underlying philosophy. Key features and impacts include:

  1. Consistent Syntax: The tidyverse introduced a consistent and readable syntax that leverages chaining operations using the %>% operator from the magrittr package. This has made code more readable and easier to write and understand, especially for newcomers.

  2. Data Manipulation: With dplyr, data manipulation became more intuitive and less verbose. Functions like filter(), arrange(), select(), mutate(), and summarise() allow for straightforward data operations that are both faster and easier to code compared to base R functions.

  3. Data Importing and Tidying: readr for reading data and tidyr for tidying data introduced more efficient data reading and transforming capabilities, making it simpler to convert data into a tidy format. Tidy data, where each column is a variable and each row is an observation, has become a standard for data analysis, facilitating easier manipulation and analysis.

  4. Visualization: ggplot2 transformed data visualization in R by allowing for the layering of plots intuitively and flexibly, using a system based on the grammar of graphics. This has enabled users to create complex, publication-quality graphs more easily than was possible with base R plotting functions.

  5. Community and Accessibility: The tidyverse has fostered a strong community and has contributed significantly to teaching materials that are user-friendly and accessible to beginners. This has democratized data analysis in R, making it more accessible to non-programmers.

  6. Impact on Package Development: The tidyverse’s philosophy and popularity have influenced the development of other packages, even those not part of the tidyverse, to adopt tidy principles and interoperate smoothly with tidyverse packages.

The tidyverse has not only changed the syntax and functionality of R coding but also its philosophy towards data analysis. It promotes a workflow that is coherent, transparent, and efficient, which has been widely adopted in academia, industry, and teaching. While some veteran R users prefer the flexibility and control of base R, the tidyverse’s approachable syntax and powerful capabilities have made it a pivotal tool in modern R programming, particularly for data science.


Please check out the homepage of tidyverse: https://www.tidyverse.org/


You can install the complete tidyverse with:

> install.packages(“tidyverse”)

Then, load the core tidyverse and make it available in your current R session by running:

> library(tidyverse)

Please see this cheat sheet:

https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf


Pipe

Subset Observations

# filter
iris %>% filter(Sepal.Length > 7)
   Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
1           7.1         3.0          5.9         2.1 virginica
2           7.6         3.0          6.6         2.1 virginica
3           7.3         2.9          6.3         1.8 virginica
4           7.2         3.6          6.1         2.5 virginica
5           7.7         3.8          6.7         2.2 virginica
6           7.7         2.6          6.9         2.3 virginica
7           7.7         2.8          6.7         2.0 virginica
8           7.2         3.2          6.0         1.8 virginica
9           7.2         3.0          5.8         1.6 virginica
10          7.4         2.8          6.1         1.9 virginica
11          7.9         3.8          6.4         2.0 virginica
12          7.7         3.0          6.1         2.3 virginica
levels(iris$Species)
[1] "setosa"     "versicolor" "virginica" 
iris %>% filter(Species == "setosa")
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1           5.1         3.5          1.4         0.2  setosa
2           4.9         3.0          1.4         0.2  setosa
3           4.7         3.2          1.3         0.2  setosa
4           4.6         3.1          1.5         0.2  setosa
5           5.0         3.6          1.4         0.2  setosa
6           5.4         3.9          1.7         0.4  setosa
7           4.6         3.4          1.4         0.3  setosa
8           5.0         3.4          1.5         0.2  setosa
9           4.4         2.9          1.4         0.2  setosa
10          4.9         3.1          1.5         0.1  setosa
11          5.4         3.7          1.5         0.2  setosa
12          4.8         3.4          1.6         0.2  setosa
13          4.8         3.0          1.4         0.1  setosa
14          4.3         3.0          1.1         0.1  setosa
15          5.8         4.0          1.2         0.2  setosa
16          5.7         4.4          1.5         0.4  setosa
17          5.4         3.9          1.3         0.4  setosa
18          5.1         3.5          1.4         0.3  setosa
19          5.7         3.8          1.7         0.3  setosa
20          5.1         3.8          1.5         0.3  setosa
21          5.4         3.4          1.7         0.2  setosa
22          5.1         3.7          1.5         0.4  setosa
23          4.6         3.6          1.0         0.2  setosa
24          5.1         3.3          1.7         0.5  setosa
25          4.8         3.4          1.9         0.2  setosa
26          5.0         3.0          1.6         0.2  setosa
27          5.0         3.4          1.6         0.4  setosa
28          5.2         3.5          1.5         0.2  setosa
29          5.2         3.4          1.4         0.2  setosa
30          4.7         3.2          1.6         0.2  setosa
31          4.8         3.1          1.6         0.2  setosa
32          5.4         3.4          1.5         0.4  setosa
33          5.2         4.1          1.5         0.1  setosa
34          5.5         4.2          1.4         0.2  setosa
35          4.9         3.1          1.5         0.2  setosa
36          5.0         3.2          1.2         0.2  setosa
37          5.5         3.5          1.3         0.2  setosa
38          4.9         3.6          1.4         0.1  setosa
39          4.4         3.0          1.3         0.2  setosa
40          5.1         3.4          1.5         0.2  setosa
41          5.0         3.5          1.3         0.3  setosa
42          4.5         2.3          1.3         0.3  setosa
43          4.4         3.2          1.3         0.2  setosa
44          5.0         3.5          1.6         0.6  setosa
45          5.1         3.8          1.9         0.4  setosa
46          4.8         3.0          1.4         0.3  setosa
47          5.1         3.8          1.6         0.2  setosa
48          4.6         3.2          1.4         0.2  setosa
49          5.3         3.7          1.5         0.2  setosa
50          5.0         3.3          1.4         0.2  setosa
iris %>% filter(Species == "setosa") %>% plot

iris %>% filter(Species == "setosa") %>% summary
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
 Min.   :4.300   Min.   :2.300   Min.   :1.000   Min.   :0.100  
 1st Qu.:4.800   1st Qu.:3.200   1st Qu.:1.400   1st Qu.:0.200  
 Median :5.000   Median :3.400   Median :1.500   Median :0.200  
 Mean   :5.006   Mean   :3.428   Mean   :1.462   Mean   :0.246  
 3rd Qu.:5.200   3rd Qu.:3.675   3rd Qu.:1.575   3rd Qu.:0.300  
 Max.   :5.800   Max.   :4.400   Max.   :1.900   Max.   :0.600  
       Species  
 setosa    :50  
 versicolor: 0  
 virginica : 0  
                
                
                
# distinct: remove duplication (take only unique values)
iris %>% distinct(Species)
     Species
1     setosa
2 versicolor
3  virginica
# random sampling
iris %>% nrow
[1] 150
iris %>% sample_frac(0.5, replace=T)
   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1           6.0         2.7          5.1         1.6 versicolor
2           6.6         3.0          4.4         1.4 versicolor
3           6.3         2.3          4.4         1.3 versicolor
4           4.8         3.4          1.9         0.2     setosa
5           5.1         3.7          1.5         0.4     setosa
6           5.2         2.7          3.9         1.4 versicolor
7           6.2         3.4          5.4         2.3  virginica
8           5.4         3.9          1.3         0.4     setosa
9           7.3         2.9          6.3         1.8  virginica
10          6.2         3.4          5.4         2.3  virginica
11          5.9         3.0          5.1         1.8  virginica
12          6.9         3.1          5.4         2.1  virginica
13          4.5         2.3          1.3         0.3     setosa
14          7.7         2.6          6.9         2.3  virginica
15          5.6         2.8          4.9         2.0  virginica
16          7.7         3.0          6.1         2.3  virginica
17          5.0         3.5          1.6         0.6     setosa
18          5.1         3.8          1.5         0.3     setosa
19          5.6         2.9          3.6         1.3 versicolor
20          6.4         3.2          4.5         1.5 versicolor
21          6.5         3.0          5.5         1.8  virginica
22          4.9         2.5          4.5         1.7  virginica
23          6.3         2.5          5.0         1.9  virginica
24          4.4         2.9          1.4         0.2     setosa
25          6.3         3.3          4.7         1.6 versicolor
26          5.4         3.0          4.5         1.5 versicolor
27          6.9         3.1          4.9         1.5 versicolor
28          4.4         2.9          1.4         0.2     setosa
29          5.4         3.9          1.7         0.4     setosa
30          4.9         3.1          1.5         0.2     setosa
31          6.1         2.8          4.0         1.3 versicolor
32          5.4         3.4          1.7         0.2     setosa
33          5.8         2.7          4.1         1.0 versicolor
34          5.7         3.0          4.2         1.2 versicolor
35          6.0         2.7          5.1         1.6 versicolor
36          6.5         3.2          5.1         2.0  virginica
37          5.1         2.5          3.0         1.1 versicolor
38          5.5         2.3          4.0         1.3 versicolor
39          6.7         3.1          5.6         2.4  virginica
40          6.7         3.1          5.6         2.4  virginica
41          6.8         2.8          4.8         1.4 versicolor
42          5.0         2.0          3.5         1.0 versicolor
43          5.5         3.5          1.3         0.2     setosa
44          5.8         2.7          5.1         1.9  virginica
45          5.2         2.7          3.9         1.4 versicolor
46          7.0         3.2          4.7         1.4 versicolor
47          6.3         2.9          5.6         1.8  virginica
48          4.8         3.4          1.6         0.2     setosa
49          6.4         2.8          5.6         2.2  virginica
50          5.9         3.2          4.8         1.8 versicolor
51          4.9         2.4          3.3         1.0 versicolor
52          4.9         3.1          1.5         0.2     setosa
53          4.8         3.0          1.4         0.3     setosa
54          6.4         2.9          4.3         1.3 versicolor
55          4.4         3.2          1.3         0.2     setosa
56          5.7         3.0          4.2         1.2 versicolor
57          5.7         2.6          3.5         1.0 versicolor
58          5.5         2.6          4.4         1.2 versicolor
59          5.6         2.7          4.2         1.3 versicolor
60          6.5         2.8          4.6         1.5 versicolor
61          5.1         3.7          1.5         0.4     setosa
62          6.4         2.8          5.6         2.1  virginica
63          5.4         3.9          1.3         0.4     setosa
64          6.4         3.2          4.5         1.5 versicolor
65          5.5         3.5          1.3         0.2     setosa
66          5.7         2.9          4.2         1.3 versicolor
67          6.9         3.2          5.7         2.3  virginica
68          6.7         3.3          5.7         2.1  virginica
69          4.6         3.4          1.4         0.3     setosa
70          6.7         3.0          5.0         1.7 versicolor
71          5.6         2.5          3.9         1.1 versicolor
72          5.3         3.7          1.5         0.2     setosa
73          5.6         2.8          4.9         2.0  virginica
74          6.0         2.2          4.0         1.0 versicolor
75          7.2         3.6          6.1         2.5  virginica
iris %>% sample_frac(0.5, replace=T) %>% nrow
[1] 75
iris %>% sample_n(10, replace=T) 
   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1           5.1         3.8          1.6         0.2     setosa
2           4.5         2.3          1.3         0.3     setosa
3           7.2         3.6          6.1         2.5  virginica
4           5.8         2.7          5.1         1.9  virginica
5           7.2         3.6          6.1         2.5  virginica
6           6.4         3.2          5.3         2.3  virginica
7           6.8         3.0          5.5         2.1  virginica
8           6.3         2.9          5.6         1.8  virginica
9           6.0         2.7          5.1         1.6 versicolor
10          6.0         2.9          4.5         1.5 versicolor
iris %>% sample_n(10, replace=T) %>% nrow 
[1] 10
# slice 
iris %>% slice(10:15)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          4.9         3.1          1.5         0.1  setosa
2          5.4         3.7          1.5         0.2  setosa
3          4.8         3.4          1.6         0.2  setosa
4          4.8         3.0          1.4         0.1  setosa
5          4.3         3.0          1.1         0.1  setosa
6          5.8         4.0          1.2         0.2  setosa
# Top n in X
iris %>% top_n(5, Sepal.Length)
  Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
1          7.7         3.8          6.7         2.2 virginica
2          7.7         2.6          6.9         2.3 virginica
3          7.7         2.8          6.7         2.0 virginica
4          7.9         3.8          6.4         2.0 virginica
5          7.7         3.0          6.1         2.3 virginica
iris %>% top_n(5, Sepal.Width)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.4         3.9          1.7         0.4  setosa
2          5.8         4.0          1.2         0.2  setosa
3          5.7         4.4          1.5         0.4  setosa
4          5.4         3.9          1.3         0.4  setosa
5          5.2         4.1          1.5         0.1  setosa
6          5.5         4.2          1.4         0.2  setosa

Subset Variables

# pull & select
iris %>% pull(Petal.Width)
  [1] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 0.2 0.2 0.1 0.1 0.2 0.4 0.4 0.3
 [19] 0.3 0.3 0.2 0.4 0.2 0.5 0.2 0.2 0.4 0.2 0.2 0.2 0.2 0.4 0.1 0.2 0.2 0.2
 [37] 0.2 0.1 0.2 0.2 0.3 0.3 0.2 0.6 0.4 0.3 0.2 0.2 0.2 0.2 1.4 1.5 1.5 1.3
 [55] 1.5 1.3 1.6 1.0 1.3 1.4 1.0 1.5 1.0 1.4 1.3 1.4 1.5 1.0 1.5 1.1 1.8 1.3
 [73] 1.5 1.2 1.3 1.4 1.4 1.7 1.5 1.0 1.1 1.0 1.2 1.6 1.5 1.6 1.5 1.3 1.3 1.3
 [91] 1.2 1.4 1.2 1.0 1.3 1.2 1.3 1.3 1.1 1.3 2.5 1.9 2.1 1.8 2.2 2.1 1.7 1.8
[109] 1.8 2.5 2.0 1.9 2.1 2.0 2.4 2.3 1.8 2.2 2.3 1.5 2.3 2.0 2.0 1.8 2.1 1.8
[127] 1.8 1.8 2.1 1.6 1.9 2.0 2.2 1.5 1.4 2.3 2.4 1.8 1.8 2.1 2.4 2.3 1.9 2.3
[145] 2.5 2.3 1.9 2.0 2.3 1.8
iris %>% select(Petal.Width)
    Petal.Width
1           0.2
2           0.2
3           0.2
4           0.2
5           0.2
6           0.4
7           0.3
8           0.2
9           0.2
10          0.1
11          0.2
12          0.2
13          0.1
14          0.1
15          0.2
16          0.4
17          0.4
18          0.3
19          0.3
20          0.3
21          0.2
22          0.4
23          0.2
24          0.5
25          0.2
26          0.2
27          0.4
28          0.2
29          0.2
30          0.2
31          0.2
32          0.4
33          0.1
34          0.2
35          0.2
36          0.2
37          0.2
38          0.1
39          0.2
40          0.2
41          0.3
42          0.3
43          0.2
44          0.6
45          0.4
46          0.3
47          0.2
48          0.2
49          0.2
50          0.2
51          1.4
52          1.5
53          1.5
54          1.3
55          1.5
56          1.3
57          1.6
58          1.0
59          1.3
60          1.4
61          1.0
62          1.5
63          1.0
64          1.4
65          1.3
66          1.4
67          1.5
68          1.0
69          1.5
70          1.1
71          1.8
72          1.3
73          1.5
74          1.2
75          1.3
76          1.4
77          1.4
78          1.7
79          1.5
80          1.0
81          1.1
82          1.0
83          1.2
84          1.6
85          1.5
86          1.6
87          1.5
88          1.3
89          1.3
90          1.3
91          1.2
92          1.4
93          1.2
94          1.0
95          1.3
96          1.2
97          1.3
98          1.3
99          1.1
100         1.3
101         2.5
102         1.9
103         2.1
104         1.8
105         2.2
106         2.1
107         1.7
108         1.8
109         1.8
110         2.5
111         2.0
112         1.9
113         2.1
114         2.0
115         2.4
116         2.3
117         1.8
118         2.2
119         2.3
120         1.5
121         2.3
122         2.0
123         2.0
124         1.8
125         2.1
126         1.8
127         1.8
128         1.8
129         2.1
130         1.6
131         1.9
132         2.0
133         2.2
134         1.5
135         1.4
136         2.3
137         2.4
138         1.8
139         1.8
140         2.1
141         2.4
142         2.3
143         1.9
144         2.3
145         2.5
146         2.3
147         1.9
148         2.0
149         2.3
150         1.8
iris %>% pull(Petal.Width) %>% str
 num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
iris %>% select(Petal.Width) %>% str
'data.frame':   150 obs. of  1 variable:
 $ Petal.Width: num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
iris %>% select(Petal.Length, Petal.Width) %>% head
  Petal.Length Petal.Width
1          1.4         0.2
2          1.4         0.2
3          1.3         0.2
4          1.5         0.2
5          1.4         0.2
6          1.7         0.4
# useful helpers: starts_with(), contains()
iris %>% select(starts_with("Peta")) %>% head
  Petal.Length Petal.Width
1          1.4         0.2
2          1.4         0.2
3          1.3         0.2
4          1.5         0.2
5          1.4         0.2
6          1.7         0.4
iris %>% select(contains("tal")) %>% head
  Petal.Length Petal.Width
1          1.4         0.2
2          1.4         0.2
3          1.3         0.2
4          1.5         0.2
5          1.4         0.2
6          1.7         0.4

Reshaping & Arrange data

# let's use mtcars dataset
mtcars
                     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# About mtcars dataset
# help(mtcars)

# arrange
mtcars %>% arrange(mpg) %>% head
                     mpg cyl disp  hp drat    wt  qsec vs am gear carb
Cadillac Fleetwood  10.4   8  472 205 2.93 5.250 17.98  0  0    3    4
Lincoln Continental 10.4   8  460 215 3.00 5.424 17.82  0  0    3    4
Camaro Z28          13.3   8  350 245 3.73 3.840 15.41  0  0    3    4
Duster 360          14.3   8  360 245 3.21 3.570 15.84  0  0    3    4
Chrysler Imperial   14.7   8  440 230 3.23 5.345 17.42  0  0    3    4
Maserati Bora       15.0   8  301 335 3.54 3.570 14.60  0  1    5    8
mtcars %>% add_rownames %>% head
Warning: `add_rownames()` was deprecated in dplyr 1.0.0.
ℹ Please use `tibble::rownames_to_column()` instead.
# A tibble: 6 × 12
  rowname        mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
  <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Mazda RX4     21       6   160   110  3.9   2.62  16.5     0     1     4     4
2 Mazda RX4 W…  21       6   160   110  3.9   2.88  17.0     0     1     4     4
3 Datsun 710    22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
4 Hornet 4 Dr…  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
5 Hornet Spor…  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
6 Valiant       18.1     6   225   105  2.76  3.46  20.2     1     0     3     1
mtcars %>% add_rownames %>% arrange(mpg) %>% 
head
Warning: `add_rownames()` was deprecated in dplyr 1.0.0.
ℹ Please use `tibble::rownames_to_column()` instead.
# A tibble: 6 × 12
  rowname        mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
  <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Cadillac Fl…  10.4     8   472   205  2.93  5.25  18.0     0     0     3     4
2 Lincoln Con…  10.4     8   460   215  3     5.42  17.8     0     0     3     4
3 Camaro Z28    13.3     8   350   245  3.73  3.84  15.4     0     0     3     4
4 Duster 360    14.3     8   360   245  3.21  3.57  15.8     0     0     3     4
5 Chrysler Im…  14.7     8   440   230  3.23  5.34  17.4     0     0     3     4
6 Maserati Bo…  15       8   301   335  3.54  3.57  14.6     0     1     5     8
mtcars %>% add_rownames %>% arrange(desc(mpg)) %>% select(rowname)
Warning: `add_rownames()` was deprecated in dplyr 1.0.0.
ℹ Please use `tibble::rownames_to_column()` instead.
# A tibble: 32 × 1
   rowname       
   <chr>         
 1 Toyota Corolla
 2 Fiat 128      
 3 Honda Civic   
 4 Lotus Europa  
 5 Fiat X1-9     
 6 Porsche 914-2 
 7 Merc 240D     
 8 Datsun 710    
 9 Merc 230      
10 Toyota Corona 
# ℹ 22 more rows

Summarise data

# Summarise

iris %>% summarise(avg.PL=mean(Petal.Length))
  avg.PL
1  3.758
iris %>% summarise(sd.PL=sd(Petal.Length))
     sd.PL
1 1.765298
iris %>% summarise(avg.PL=mean(Petal.Length),
                   sd.PL=sd(Petal.Length),
                   min.PL=min(Petal.Length))
  avg.PL    sd.PL min.PL
1  3.758 1.765298      1
iris %>% count(Species)
     Species  n
1     setosa 50
2 versicolor 50
3  virginica 50
iris %>% sample_frac(0.3) %>% count(Species)
     Species  n
1     setosa 10
2 versicolor 17
3  virginica 18
iris %>% summarise_all(mean)
Warning: There was 1 warning in `summarise()`.
ℹ In argument: `Species = (function (x, ...) ...`.
Caused by warning in `mean.default()`:
! argument is not numeric or logical: returning NA
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1     5.843333    3.057333        3.758    1.199333      NA
iris %>% summarise_at("Petal.Length", sum)
  Petal.Length
1        563.7
iris %>% summarise_at(c("Petal.Length", "Petal.Width"), mean)
  Petal.Length Petal.Width
1        3.758    1.199333

Group and Summarise data

# Group
iris %>% group_by(Species)
# A tibble: 150 × 5
# Groups:   Species [3]
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
 1          5.1         3.5          1.4         0.2 setosa 
 2          4.9         3            1.4         0.2 setosa 
 3          4.7         3.2          1.3         0.2 setosa 
 4          4.6         3.1          1.5         0.2 setosa 
 5          5           3.6          1.4         0.2 setosa 
 6          5.4         3.9          1.7         0.4 setosa 
 7          4.6         3.4          1.4         0.3 setosa 
 8          5           3.4          1.5         0.2 setosa 
 9          4.4         2.9          1.4         0.2 setosa 
10          4.9         3.1          1.5         0.1 setosa 
# ℹ 140 more rows
iris %>% group_by(Species) %>% 
  summarise_all(mean)
# A tibble: 3 × 5
  Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
  <fct>             <dbl>       <dbl>        <dbl>       <dbl>
1 setosa             5.01        3.43         1.46       0.246
2 versicolor         5.94        2.77         4.26       1.33 
3 virginica          6.59        2.97         5.55       2.03 
iris %>% group_by(Species) %>% 
  select(starts_with("Sep")) %>% 
  summarise_all(mean)
Adding missing grouping variables: `Species`
# A tibble: 3 × 3
  Species    Sepal.Length Sepal.Width
  <fct>             <dbl>       <dbl>
1 setosa             5.01        3.43
2 versicolor         5.94        2.77
3 virginica          6.59        2.97
mtcars %>% group_by(am) %>% 
  summarise(hp.avg=mean(hp),
            hp.sd=sd(hp))
# A tibble: 2 × 3
     am hp.avg hp.sd
  <dbl>  <dbl> <dbl>
1     0   160.  53.9
2     1   127.  84.1

Make New Variables

# mutate
# mpg to kml
mtcars %>% mutate(kml=0.425144*mpg)
                     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
                          kml
Mazda RX4            8.928024
Mazda RX4 Wag        8.928024
Datsun 710           9.693283
Hornet 4 Drive       9.098082
Hornet Sportabout    7.950193
Valiant              7.695106
Duster 360           6.079559
Merc 240D           10.373514
Merc 230             9.693283
Merc 280             8.162765
Merc 280C            7.567563
Merc 450SE           6.972362
Merc 450SL           7.354991
Merc 450SLC          6.462189
Cadillac Fleetwood   4.421498
Lincoln Continental  4.421498
Chrysler Imperial    6.249617
Fiat 128            13.774666
Honda Civic         12.924378
Toyota Corolla      14.412382
Toyota Corona        9.140596
Dodge Challenger     6.589732
AMC Javelin          6.462189
Camaro Z28           5.654415
Pontiac Firebird     8.162765
Fiat X1-9           11.606431
Porsche 914-2       11.053744
Lotus Europa        12.924378
Ford Pantera L       6.717275
Ferrari Dino         8.375337
Maserati Bora        6.377160
Volvo 142E           9.098082
mtcars %>% mutate(kml=round(0.425144*mpg, 1))
                     mpg cyl  disp  hp drat    wt  qsec vs am gear carb  kml
Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4  8.9
Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4  8.9
Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1  9.7
Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1  9.1
Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2  8.0
Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1  7.7
Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4  6.1
Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2 10.4
Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2  9.7
Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4  8.2
Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4  7.6
Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3  7.0
Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3  7.4
Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3  6.5
Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4  4.4
Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4  4.4
Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4  6.2
Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1 13.8
Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2 12.9
Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1 14.4
Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1  9.1
Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2  6.6
AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2  6.5
Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4  5.7
Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2  8.2
Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1 11.6
Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2 11.1
Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2 12.9
Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4  6.7
Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6  8.4
Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8  6.4
Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2  9.1
mtcars %>% mutate(kml=round(0.425144*mpg, 1)) %>% 
  select(mpg, kml) %>% 
  top_n(5, mpg)
                mpg  kml
Fiat 128       32.4 13.8
Honda Civic    30.4 12.9
Toyota Corolla 33.9 14.4
Fiat X1-9      27.3 11.6
Lotus Europa   30.4 12.9
# transmute
mtcars %>% transmute(hp/wt)
                       hp/wt
Mazda RX4           41.98473
Mazda RX4 Wag       38.26087
Datsun 710          40.08621
Hornet 4 Drive      34.21462
Hornet Sportabout   50.87209
Valiant             30.34682
Duster 360          68.62745
Merc 240D           19.43574
Merc 230            30.15873
Merc 280            35.75581
Merc 280C           35.75581
Merc 450SE          44.22604
Merc 450SL          48.25737
Merc 450SLC         47.61905
Cadillac Fleetwood  39.04762
Lincoln Continental 39.63864
Chrysler Imperial   43.03087
Fiat 128            30.00000
Honda Civic         32.19814
Toyota Corolla      35.42234
Toyota Corona       39.35091
Dodge Challenger    42.61364
AMC Javelin         43.66812
Camaro Z28          63.80208
Pontiac Firebird    45.51365
Fiat X1-9           34.10853
Porsche 914-2       42.52336
Lotus Europa        74.68605
Ford Pantera L      83.28076
Ferrari Dino        63.17690
Maserati Bora       93.83754
Volvo 142E          39.20863
# rename
mtcars %>% names
 [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
[11] "carb"
mtcars %>% mutate(new=1) %>% head
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb new
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4   1
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4   1
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1   1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1   1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2   1
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1   1
mtcars %>% mutate(new=1) %>% 
rename(change.name=new) %>% head
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
                  change.name
Mazda RX4                   1
Mazda RX4 Wag               1
Datsun 710                  1
Hornet 4 Drive              1
Hornet Sportabout           1
Valiant                     1

Combine Data Sets




Joins!

Thanks to Garrick Aden-Buie (garrickadenbuie.com), we can teach joins very easily with his fantastic diagrams.


Let’s use some example datasets that come pre-loaded in R to demonstrate how joins work using dplyr. We will use the mtcars dataset, splitting it into two separate data frames to illustrate how the different types of joins operate.

First, let’s set up our example data frames from the mtcars dataset:

# Create two data frames from mtcars
df1 <- mtcars[1:10, c("mpg", "cyl", "disp")]  # first 10 rows, select specific columns
df2 <- mtcars[5:15, c("disp", "hp", "drat")]  # rows 5 to 15, select specific columns

# Make sure there's a common key for joining; here, we'll use 'disp'
# Print out the data frames to see what they contain
print(df1)
                   mpg cyl  disp
Mazda RX4         21.0   6 160.0
Mazda RX4 Wag     21.0   6 160.0
Datsun 710        22.8   4 108.0
Hornet 4 Drive    21.4   6 258.0
Hornet Sportabout 18.7   8 360.0
Valiant           18.1   6 225.0
Duster 360        14.3   8 360.0
Merc 240D         24.4   4 146.7
Merc 230          22.8   4 140.8
Merc 280          19.2   6 167.6
print(df2)
                    disp  hp drat
Hornet Sportabout  360.0 175 3.15
Valiant            225.0 105 2.76
Duster 360         360.0 245 3.21
Merc 240D          146.7  62 3.69
Merc 230           140.8  95 3.92
Merc 280           167.6 123 3.92
Merc 280C          167.6 123 3.92
Merc 450SE         275.8 180 3.07
Merc 450SL         275.8 180 3.07
Merc 450SLC        275.8 180 3.07
Cadillac Fleetwood 472.0 205 2.93

Inner Join

All rows from x where there are matching values in y, and all columns from x and y.

inner_join_result <- inner_join(df1, df2, by = "disp")
Warning in inner_join(df1, df2, by = "disp"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 5 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
print(inner_join_result)
   mpg cyl  disp  hp drat
1 18.7   8 360.0 175 3.15
2 18.7   8 360.0 245 3.21
3 18.1   6 225.0 105 2.76
4 14.3   8 360.0 175 3.15
5 14.3   8 360.0 245 3.21
6 24.4   4 146.7  62 3.69
7 22.8   4 140.8  95 3.92
8 19.2   6 167.6 123 3.92
9 19.2   6 167.6 123 3.92

Left Join

All rows from x, and all columns from x and y. Rows in x with no match in y will have NA values in the new columns.

left_join_result <- left_join(df1, df2, by = "disp")
Warning in left_join(df1, df2, by = "disp"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 5 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
print(left_join_result)
    mpg cyl  disp  hp drat
1  21.0   6 160.0  NA   NA
2  21.0   6 160.0  NA   NA
3  22.8   4 108.0  NA   NA
4  21.4   6 258.0  NA   NA
5  18.7   8 360.0 175 3.15
6  18.7   8 360.0 245 3.21
7  18.1   6 225.0 105 2.76
8  14.3   8 360.0 175 3.15
9  14.3   8 360.0 245 3.21
10 24.4   4 146.7  62 3.69
11 22.8   4 140.8  95 3.92
12 19.2   6 167.6 123 3.92
13 19.2   6 167.6 123 3.92

Left Join (Extra Rows in y)

… If there are multiple matches between x and y, all combinations of the matches are returned.

Right Join

All rows from y, and all columns from x and y. Rows in y with no match in x will have NA values in the new columns.

right_join_result <- right_join(df1, df2, by = "disp")
Warning in right_join(df1, df2, by = "disp"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 5 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
print(right_join_result)
    mpg cyl  disp  hp drat
1  18.7   8 360.0 175 3.15
2  18.7   8 360.0 245 3.21
3  18.1   6 225.0 105 2.76
4  14.3   8 360.0 175 3.15
5  14.3   8 360.0 245 3.21
6  24.4   4 146.7  62 3.69
7  22.8   4 140.8  95 3.92
8  19.2   6 167.6 123 3.92
9  19.2   6 167.6 123 3.92
10   NA  NA 275.8 180 3.07
11   NA  NA 275.8 180 3.07
12   NA  NA 275.8 180 3.07
13   NA  NA 472.0 205 2.93

Full Join

All rows and all columns from both x and y. Where there are not matching values, returns NA for the one missing.

full_join_result <- full_join(df1, df2, by = "disp")
Warning in full_join(df1, df2, by = "disp"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 5 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
print(full_join_result)
    mpg cyl  disp  hp drat
1  21.0   6 160.0  NA   NA
2  21.0   6 160.0  NA   NA
3  22.8   4 108.0  NA   NA
4  21.4   6 258.0  NA   NA
5  18.7   8 360.0 175 3.15
6  18.7   8 360.0 245 3.21
7  18.1   6 225.0 105 2.76
8  14.3   8 360.0 175 3.15
9  14.3   8 360.0 245 3.21
10 24.4   4 146.7  62 3.69
11 22.8   4 140.8  95 3.92
12 19.2   6 167.6 123 3.92
13 19.2   6 167.6 123 3.92
14   NA  NA 275.8 180 3.07
15   NA  NA 275.8 180 3.07
16   NA  NA 275.8 180 3.07
17   NA  NA 472.0 205 2.93

Filtering Joins

Filtering joins match observations in the same way as mutating joins, but affect the observations, not the variables. … Semi-joins are useful for matching filtered summary tables back to the original rows. … Anti-joins are useful for diagnosing join mismatches.
R for Data Science: Filtering Joins

Semi Join

All rows from x where there are matching values in y, keeping just columns from x.

semi_join_result <- semi_join(df1, df2, by = "disp")
print(semi_join_result)
                   mpg cyl  disp
Hornet Sportabout 18.7   8 360.0
Valiant           18.1   6 225.0
Duster 360        14.3   8 360.0
Merc 240D         24.4   4 146.7
Merc 230          22.8   4 140.8
Merc 280          19.2   6 167.6

Anti Join

All rows from x where there are not matching values in y, keeping just column

# Assuming df1 and df2 from the previous example
anti_join_result <- anti_join(df1, df2, by = "disp")
print(anti_join_result)
                mpg cyl disp
Mazda RX4      21.0   6  160
Mazda RX4 Wag  21.0   6  160
Datsun 710     22.8   4  108
Hornet 4 Drive 21.4   6  258