Introduction to R workshop notes

contributed package

##
##
## Exercise 0 solutions: using R as a calculator.
## 1. 2 plus 2
2 + 2
## [1] 4
## or, equivalently, with the sum() function
sum(2, 2)
## [1] 4
## 2. square root of 10:
sqrt(10)
## [1] 3.162278
## or, equivalently, raise 10 to the power 1/2
10^(1/2)
## [1] 3.162278
## 3. Find "An Introduction to R".
## Go to the main help page by running 'help.start()' or by using the GUI
## menu, then find and click on the link to "An Introduction to R".
## FunctionName(arg.1 = value.1, arg.2 = value.2, ..., arg.n = value.n)
sqrt(10) ## calculate square root of 10; result is not stored anywhere
## [1] 3.162278
x <- sqrt(10) ## assign result to a variable named x
## Asking R for help: each line below opens a documentation page.
help(help) ## documentation for the help() function itself
?help ## `?topic` is shorthand for help(topic)
?sqrt ## documentation for sqrt()
help(package = "stats") ## overview of the "stats" package
## readr provides read_csv(). Install it once by running the line below;
## it is left commented out so it is not re-run with the rest of the script.
## install.packages("readr")
library(readr)
## read ?read_csv
## Read the baby names .csv file from the workshop URL and store the
## result as `baby_names` (requires an internet connection).
baby_names <- read_csv("http://tutorials.iq.harvard.edu/data/babyNames.csv")
## dplyr provides filter() and arrange(). Install it once if needed.
## install.packages("dplyr")
library(dplyr)
## Keep the 1992 rows whose Name is either "Alex" or "Mark".
## Conditions separated by commas in filter() are combined with AND.
baby_names_alexmark <- filter(
  baby_names,
  Year == 1992,
  Name %in% c("Alex", "Mark")
)
baby_names_alexmark
## # A tibble: 4 x 4
##   Name  Sex   Count  Year
##   <chr> <chr> <dbl> <dbl>
## 1 Alex  Girls   366  1992
## 2 Mark  Girls    20  1992
## 3 Mark  Boys   8743  1992
## 4 Alex  Boys   7348  1992
## Sort by Count, smallest first.
arrange(baby_names_alexmark, Count)
## # A tibble: 4 x 4
##   Name  Sex   Count  Year
##   <chr> <chr> <dbl> <dbl>
## 1 Mark  Girls    20  1992
## 2 Alex  Girls   366  1992
## 3 Alex  Boys   7348  1992
## 4 Mark  Boys   8743  1992
## Sort by Count, largest first.
arrange(baby_names_alexmark, desc(Count))
## # A tibble: 4 x 4
##   Name  Sex   Count  Year
##   <chr> <chr> <dbl> <dbl>
## 1 Mark  Boys   8743  1992
## 2 Alex  Boys   7348  1992
## 3 Alex  Girls   366  1992
## 4 Mark  Girls    20  1992
##
##
##
# 1.  Use `filter()` to extract data for your name (or another name of your choice).
#     "George" is used as the example name here.
baby_names_george <- filter(
  baby_names,
  Name == "George"
)
# 2.  Arrange the data you produced in step 1 above by `Count`.
#     In which year was the name most popular?
#     Sorting in descending order puts the peak year in the first row.
arrange(
  baby_names_george,
  desc(Count)
)
## # A tibble: 111 x 4
##    Name   Sex   Count  Year
##    <chr>  <chr> <dbl> <dbl>
##  1 George Boys  14063  1960
##  2 George Boys  13638  1961
##  3 George Boys  12553  1962
##  4 George Boys  12084  1963
##  5 George Boys  11793  1964
##  6 George Boys  10683  1965
##  7 George Boys   9942  1966
##  8 George Boys   9702  1967
##  9 George Boys   9388  1968
## 10 George Boys   9203  1969
## # ... with 101 more rows
# 3.  BONUS (optional): Filter the data to extract _only_ the
#     row containing the most popular boys name in 1999.
#     First narrow to boys in 1999, then keep the row(s) whose Count
#     equals the largest Count in that subset.
baby_names_boys1999 <- filter(
  baby_names,
  Year == 1999,
  Sex == "Boys"
)
filter(
  baby_names_boys1999,
  Count == max(Count)
)
## # A tibble: 1 x 4
##   Name  Sex   Count  Year
##   <chr> <chr> <dbl> <dbl>
## 1 Jacob Boys  35361  1999
## The filter-then-arrange steps written as one pipeline: `%>%` passes
## the result on its left as the first argument of the call on its right.
baby_names %>%
  filter(
    Year == 1992,
    Name %in% c("Alex", "Mark")
  ) %>%
  arrange(desc(Count))
## # A tibble: 4 x 4
##   Name  Sex   Count  Year
##   <chr> <chr> <dbl> <dbl>
## 1 Mark  Boys   8743  1992
## 2 Alex  Boys   7348  1992
## 3 Alex  Girls   366  1992
## 4 Mark  Girls    20  1992
## Peak popularity of one name ("George"), again as a pipeline.
baby_names %>%
  filter(Name == "George") %>%
  arrange(desc(Count))
## # A tibble: 111 x 4
##    Name   Sex   Count  Year
##    <chr>  <chr> <dbl> <dbl>
##  1 George Boys  14063  1960
##  2 George Boys  13638  1961
##  3 George Boys  12553  1962
##  4 George Boys  12084  1963
##  5 George Boys  11793  1964
##  6 George Boys  10683  1965
##  7 George Boys   9942  1966
##  8 George Boys   9702  1967
##  9 George Boys   9388  1968
## 10 George Boys   9203  1969
## # ... with 101 more rows
## ggplot2 provides the plotting functions. Install it once if needed.
## install.packages("ggplot2")
library(ggplot2)
## Extract every row for the name "Diana".
baby_names_diana <- filter(baby_names, Name == "Diana")
## qplot() is deprecated as of ggplot2 3.4.0, so the plots are built with
## ggplot() plus an explicit geom; geom_point() draws the same scatter plot.
## Count by Year:
ggplot(baby_names_diana, aes(x = Year, y = Count)) +
  geom_point()
## Same plot, with the points colored by Sex:
ggplot(baby_names_diana, aes(x = Year, y = Count, color = Sex)) +
  geom_point()
##
##
##
# 1. Use `filter()` to extract data for your name (same as previous exercise)
baby_names_george <- filter(baby_names, Name == "George")
# 2.  Plot the data you produced in step 1 above, with `Year` on the x-axis
#     and `Count` on the y-axis.
#     (ggplot() + geom_point() replaces qplot(), which is deprecated as of
#     ggplot2 3.4.0.)
ggplot(baby_names_george, aes(x = Year, y = Count)) +
  geom_point()
# 3. Adjust the plot so that it shows boys and girls in different colors.
ggplot(baby_names_george, aes(x = Year, y = Count, color = Sex)) +
  geom_point()
# 4.  BONUS (Optional): Adjust the plot to use lines instead of points.
ggplot(baby_names_george, aes(x = Year, y = Count, color = Sex)) +
  geom_line()
## Add a column holding the count expressed in thousands.
baby_names <- mutate(
  baby_names,
  Count_1K = Count / 1000
)
baby_names ## same as print(baby_names)
## # A tibble: 1,352,203 x 5
##    Name     Sex   Count  Year Count_1K
##    <chr>    <chr> <dbl> <dbl>    <dbl>
##  1 Mary     Girls 51474  1960     51.5
##  2 Susan    Girls 39200  1960     39.2
##  3 Linda    Girls 37314  1960     37.3
##  4 Karen    Girls 36376  1960     36.4
##  5 Donna    Girls 34133  1960     34.1
##  6 Lisa     Girls 33702  1960     33.7
##  7 Patricia Girls 32102  1960     32.1
##  8 Debra    Girls 26737  1960     26.7
##  9 Cynthia  Girls 26725  1960     26.7
## 10 Deborah  Girls 25264  1960     25.3
## # ... with 1,352,193 more rows
## head() shows only the first six rows.
head(baby_names)
## # A tibble: 6 x 5
##   Name  Sex   Count  Year Count_1K
##   <chr> <chr> <dbl> <dbl>    <dbl>
## 1 Mary  Girls 51474  1960     51.5
## 2 Susan Girls 39200  1960     39.2
## 3 Linda Girls 37314  1960     37.3
## 4 Karen Girls 36376  1960     36.4
## 5 Donna Girls 34133  1960     34.1
## 6 Lisa  Girls 33702  1960     33.7
## Keep the scaled count and drop the raw Count column.
baby_names_scaled <- select(
  baby_names,
  Name, Sex, Year, Count_1K
)
head(baby_names_scaled)
## # A tibble: 6 x 4
##   Name  Sex    Year Count_1K
##   <chr> <chr> <dbl>    <dbl>
## 1 Mary  Girls  1960     51.5
## 2 Susan Girls  1960     39.2
## 3 Linda Girls  1960     37.3
## 4 Karen Girls  1960     36.4
## 5 Donna Girls  1960     34.1
## 6 Lisa  Girls  1960     33.7
## The same selection as a pipeline, without storing an intermediate object.
baby_names %>%
  select(Name, Sex, Year, Count_1K) %>%
  head()
## # A tibble: 6 x 4
##   Name  Sex    Year Count_1K
##   <chr> <chr> <dbl>    <dbl>
## 1 Mary  Girls  1960     51.5
## 2 Susan Girls  1960     39.2
## 3 Linda Girls  1960     37.3
## 4 Karen Girls  1960     36.4
## 5 Donna Girls  1960     34.1
## 6 Lisa  Girls  1960     33.7
## Rank names within each Year x Sex group (the highest Count gets rank 1),
## sort so the top-ranked names come first, then drop the grouping so
## later steps operate on the whole table again.
baby_names <- baby_names %>%
  group_by(Year, Sex) %>%
  mutate(Rank = rank(-Count)) %>%
  arrange(Rank, Year, Sex) %>%
  ungroup()

baby_names
## # A tibble: 1,352,203 x 6
##    Name    Sex   Count  Year Count_1K  Rank
##    <chr>   <chr> <dbl> <dbl>    <dbl> <dbl>
##  1 David   Boys  85928  1960     85.9     1
##  2 Mary    Girls 51474  1960     51.5     1
##  3 Michael Boys  86922  1961     86.9     1
##  4 Mary    Girls 47676  1961     47.7     1
##  5 Michael Boys  85037  1962     85.0     1
##  6 Lisa    Girls 46080  1962     46.1     1
##  7 Michael Boys  83789  1963     83.8     1
##  8 Lisa    Girls 56037  1963     56.0     1
##  9 Michael Boys  82653  1964     82.7     1
## 10 Lisa    Girls 54276  1964     54.3     1
## # ... with 1,352,193 more rows
## baby_names <- baby_names %>% ungroup
##
##
##
## 1.  Use `mutate()` and `group_by()` to create a column named "Proportion"
##     where `Proportion = Count/sum(Count)` for each `Year X Sex` group.
##     Inside a grouped mutate(), sum(Count) is the total for the group.
baby_names <- baby_names %>%
  group_by(Year, Sex) %>%
  mutate(Proportion = Count / sum(Count)) %>%
  ungroup()
## 2.  Use `mutate()` and `group_by()` to create a column named "Rank" where
##     `Rank = rank(-Count)` for each `Year X Sex` group.
##     Negating Count gives rank 1 to the largest count.
baby_names <- baby_names %>%
  group_by(Year, Sex) %>%
  mutate(Rank = rank(-Count)) %>%
  ungroup()
## 3.  Filter the baby names data to display only the most popular name
##     for each `Year X Sex` group. Output columns Name, Sex, and Proportion.
##     top1 keeps every column (step 4 needs Year); only the printed view
##     is narrowed to three columns.
top1 <- baby_names %>%
  filter(Rank == 1)
select(top1, Name, Sex, Proportion)
## # A tibble: 116 x 3
##    Name    Sex   Proportion
##    <chr>   <chr>      <dbl>
##  1 David   Boys      0.0403
##  2 Mary    Girls     0.0255
##  3 Michael Boys      0.0409
##  4 Mary    Girls     0.0236
##  5 Michael Boys      0.0411
##  6 Lisa    Girls     0.0234
##  7 Michael Boys      0.0412
##  8 Lisa    Girls     0.0291
##  9 Michael Boys      0.0415
## 10 Lisa    Girls     0.0286
## # ... with 106 more rows
## 4. Plot the data produced in step 3, putting `Year` on the x-axis
##    and `Proportion` on the y-axis. How has the proportion of babies
##    given the most popular name changed over time?
##    (ggplot() + geom_line() replaces qplot(..., geom = "line"), because
##    qplot() is deprecated as of ggplot2 3.4.0. Mapping color to Sex
##    draws one line per sex.)
ggplot(top1, aes(x = Year, y = Proportion, color = Sex)) +
  geom_line()
## 5. BONUS (optional): Which names are the most popular for both boys 
##    and girls?
## Split the data by sex, keeping only the columns needed for the join.
## Each object name matches the Sex value it is filtered on.
bn_boys <- baby_names %>% 
  filter(Sex == "Boys") %>%
  select(Name, Year, Count)

bn_girls <- baby_names %>% 
  filter(Sex == "Girls") %>%
  select(Name, Year, Count)

## Keep only the Name/Year combinations that occur for both sexes.
## Boys are the left-hand table, so in the result Count.x is the boys'
## count and Count.y is the girls' count.
girls_and_boys <- inner_join(bn_boys, 
                             bn_girls,
                             by = c("Year", "Name"))
head(girls_and_boys)
## # A tibble: 6 x 4
##   Name     Year Count.x Count.y
##   <chr>   <dbl>   <dbl>   <dbl>
## 1 David    1960   85928     223
## 2 Michael  1961   86922     325
## 3 Michael  1962   85037     354
## 4 Michael  1963   83789     377
## 5 Michael  1964   82653     302
## 6 Michael  1965   81019     355
## Score each Name/Year row by the product of its two per-sex counts
## (Count.x and Count.y come from the join), then rank the rows so that
## the largest product gets rank 1. A row only gets a large product when
## both counts are large.
girls_and_boys <- mutate(girls_and_boys,
                         Product = Count.x * Count.y,
                         Rank = rank(-Product))
head(girls_and_boys)
## # A tibble: 6 x 6
##   Name     Year Count.x Count.y  Product  Rank
##   <chr>   <dbl>   <dbl>   <dbl>    <dbl> <dbl>
## 1 David    1960   85928     223 19161944   200
## 2 Michael  1961   86922     325 28249650   109
## 3 Michael  1962   85037     354 30103098    98
## 4 Michael  1963   83789     377 31588453    90
## 5 Michael  1964   82653     302 24961206   130
## 6 Michael  1965   81019     355 28761745   106
## Show the top-ranked Name/Year row.
filter(girls_and_boys, Rank == 1)
## # A tibble: 1 x 6
##   Name    Year Count.x Count.y   Product  Rank
##   <chr>  <dbl>   <dbl>   <dbl>     <dbl> <dbl>
## 1 Taylor  1993    7688   21266 163493008     1
## summarize() collapses its input to one row per group; with no grouping
## the result is a single row. Here: the total Count over all "Girls" rows.
baby_names %>%
  filter(Sex == "Girls") %>%
  summarize(Girls_n = sum(Count))
## # A tibble: 1 x 1
##     Girls_n
##       <dbl>
## 1 101422255
## Grouping by Year first gives one total per year instead.
bn_by_year <- baby_names %>%
  group_by(Year) %>%
  summarize(Total = sum(Count))

head(bn_by_year)
## # A tibble: 6 x 2
##    Year   Total
##   <dbl>   <dbl>
## 1  1960 4154377
## 2  1961 4140244
## 3  1962 4035234
## 4  1963 3958791
## 5  1964 3887800
## 6  1965 3626029
##
##
##
## 1.  Filter the baby_names data, retaining only the 10 most 
##     popular girl and boy names for each year.
##     Rank is compared row by row, so group_by() does not change which
##     rows are kept; it leaves the result grouped by Year and Sex (see
##     the "# Groups:" line in the output below), and the following
##     summarize() step relies on that grouping.
most_popular <- 
  baby_names %>% 
  group_by(Year, Sex) %>%
  filter(Rank <= 10)

most_popular
## # A tibble: 1,160 x 7
## # Groups:   Year, Sex [116]
##    Name    Sex   Count  Year Count_1K  Rank Proportion
##    <chr>   <chr> <dbl> <dbl>    <dbl> <dbl>      <dbl>
##  1 David   Boys  85928  1960     85.9     1     0.0403
##  2 Mary    Girls 51474  1960     51.5     1     0.0255
##  3 Michael Boys  86922  1961     86.9     1     0.0409
##  4 Mary    Girls 47676  1961     47.7     1     0.0236
##  5 Michael Boys  85037  1962     85.0     1     0.0411
##  6 Lisa    Girls 46080  1962     46.1     1     0.0234
##  7 Michael Boys  83789  1963     83.8     1     0.0412
##  8 Lisa    Girls 56037  1963     56.0     1     0.0291
##  9 Michael Boys  82653  1964     82.7     1     0.0415
## 10 Lisa    Girls 54276  1964     54.3     1     0.0286
## # ... with 1,150 more rows
## 2.  Summarize the data produced in step one to calculate the total
##     Proportion of boys and girls given one of the top 10 names
##     each year.
##     The most_popular data frame is already grouped by Year and Sex,
##     so summarize() returns one row per Year x Sex group.
top10 <- 
  most_popular %>% 
  summarize(TotalProportion = sum(Proportion))
## 3.  Plot the data produced in step 2, with year on the x-axis
##     and total proportion on the y axis. Color by sex.
##     (ggplot() + geom_line() replaces qplot(..., geom = "line"), because
##     qplot() is deprecated as of ggplot2 3.4.0.)
ggplot(top10, aes(x = Year, y = TotalProportion, color = Sex)) +
  geom_line()
# Saving our work.
ls() # list objects in our workspace
# rm(list=ls()) # remove all objects from our workspace 
# (left commented out on purpose: running it would delete everything
# created above, including baby_names, before it is saved)
# write data to a .csv file in the current working directory
write_csv(baby_names, "babyNames.csv")
# write data to an .rds file (R's own format for a single R object)
# in the current working directory
write_rds(baby_names, "babyNames.rds")

Data Type	Function
comma separated	`read_csv()`
tab separated	`read_tsv()`
other delimited formats	`read_delim()`
whitespace separated	`read_table()`
fixed width	`read_fwf()`

Operator	Meaning
`==`	equal to
`!=`	not equal to
`>`	greater than
`>=`	greater than or equal to
`<`	less than
`<=`	less than or equal to
`%in%`	contained in

Introduction to R workshop notes

Welcome

Materials and setup

Workshop goals and approach

Graphical User Interfaces (GUIs)

Launch RStudio (skip if not using RStudio)

Exercise 0

R basics

Function calls

Assignment

Asking R for help

Getting data into R

Installing and using R packages

Readers for common file types

Baby names data

Exercise 1: Reading the baby names data

Popularity of your name

Filtering and arranging data

Other logical operators

Exercise 2.1: Peak popularity of your name

Pipe operator in R

Exercise 2.2: Peak popularity of your name

Plotting baby name trends over time

Exercise 3: Plotting peak popularity of your name

Finding the most popular names

Computing better measures of popularity

Operating by group

Exercise 4: Most popular names

Percent choosing one of the top 10 names

Exercise 5: Popularity of the most popular names

Saving our Work

Best Practices for Writing R Code

Wrap-up

Help us make this workshop better!

Additional resources