Foreword

  • Output options: the ‘tango’ syntax and the ‘readable’ theme.
  • Snippets and results.
  • Source: ‘Introduction to Machine Learning’ and ‘Data Exploration with Kaggle Scripts’ from DataCamp.


Analyzing the 2013 American Community Survey

Introducing the American Community Survey

Analyze data from the 2013 American Community Survey to find out whether it makes sense to pursue a PhD. The end result is a Kaggle script that can be shared on Kaggle.

# Investigate first 10 observations
# (AC_Survey_Subset is supplied by the hosting environment — it is not
# created anywhere in this script)
head(AC_Survey_Subset, 10)
##    SCHL ESR PINCP
## 1    19   6     0
## 2    20   1 52000
## 3    16   1 99000
## 4    19   6     0
## 5    19   6     0
## 6    21   1 39930
## 7    14   6 10300
## 8    16   3  1100
## 9     9  NA    NA
## 10    1   6  3900
# Structure: a data.table/data.frame with ~3.1 million rows and 3 integer columns
str(AC_Survey_Subset)
## Classes 'data.table' and 'data.frame':   3132795 obs. of  3 variables:
##  $ SCHL : int  19 20 16 19 19 21 14 16 9 1 ...
##  $ ESR  : int  6 1 1 6 6 1 6 3 NA 6 ...
##  $ PINCP: int  0 52000 99000 0 0 39930 10300 1100 NA 3900 ...
##  - attr(*, ".internal.selfref")=<externalptr>

The dataset contains over 3 million observations (3,132,795) and 3 variables: SCHL (School Level), PINCP (Income) and ESR (Work Status).

Preparing your data set for further analysis

The data are messy. Let's clean them up.

# load in the dplyr package
library(dplyr)

# convert AC_Survey_Subset to tbl_df
# NOTE(review): tbl_df() is deprecated in current dplyr; as_tibble() is the
# modern equivalent — left as-is to match the rendered output below
AC_Survey_Subset <- tbl_df(AC_Survey_Subset)
head(AC_Survey_Subset, 20)
## # A tibble: 20 x 3
##     SCHL   ESR  PINCP
##    <int> <int>  <int>
##  1    19     6      0
##  2    20     1  52000
##  3    16     1  99000
##  4    19     6      0
##  5    19     6      0
##  6    21     1  39930
##  7    14     6  10300
##  8    16     3   1100
##  9     9    NA     NA
## 10     1     6   3900
## 11    10     6   5400
## 12    16     1  90000
## 13    18     1  46000
## 14    17     6  39500
## 15    17     6  13100
## 16    19     6 103000
## 17    21     1  53600
## 18    19     1  28000
## 19    16     1  18700
## 20    16     6   6000
# remove NA, only keep school levels 21, 22 and 24 (Bachelor, Masters, PhD),
# group the set by school level
# use the pipe operator and chaining 
AC_Survey_Subset_Cleaned <- AC_Survey_Subset %>%
  na.omit() %>%
  filter(SCHL %in% c(21,22,24)) %>%
  group_by(SCHL)

# Check it out
head(AC_Survey_Subset_Cleaned, 5)
## # A tibble: 5 x 3
## # Groups:   SCHL [1]
##    SCHL   ESR  PINCP
##   <int> <int>  <int>
## 1    21     1  39930
## 2    21     1  53600
## 3    21     3   2000
## 4    21     3  16000
## 5    21     1 100000

How many are there? - Part one

Look at the numbers of BSc, MSc & PhD holders in the US. Print tables.

# count the number of Bachelor (21), Masters (22) and PhD (24) holders per school level
degree_holders <- summarize(AC_Survey_Subset_Cleaned, count = n())
degree_holders
## # A tibble: 3 x 2
##    SCHL  count
##   <int>  <int>
## 1    21 423943
## 2    22 183182
## 3    24  30877
# compute the average income per school level
# (AC_Survey_Subset_Cleaned is already grouped by SCHL, so this extra
# group_by() is redundant but harmless)
avg_inc <- AC_Survey_Subset_Cleaned %>%
  group_by(SCHL) %>%
  summarize(avg_inc = mean(PINCP))
avg_inc
## # A tibble: 3 x 2
##    SCHL   avg_inc
##   <int>     <dbl>
## 1    21  56427.38
## 2    22  72421.54
## 3    24 100421.51
# join avg_inc with degree_holders (implicitly by SCHL), assign to degree_holders_2
degree_holders_2 <- inner_join(degree_holders, avg_inc)
degree_holders_2
## # A tibble: 3 x 3
##    SCHL  count   avg_inc
##   <int>  <int>     <dbl>
## 1    21 423943  56427.38
## 2    22 183182  72421.54
## 3    24  30877 100421.51
# attach human-readable labels; order must match the sorted SCHL codes above
degree_names <- c('Bachelor', 'Master', 'PhD')
cbind(degree_names, degree_holders_2)
##   degree_names SCHL  count   avg_inc
## 1     Bachelor   21 423943  56427.38
## 2       Master   22 183182  72421.54
## 3          PhD   24  30877 100421.51

How many are there? - Part two

And how much do they earn on average?

# load the ggplot2 and gridExtra packages
library(ggplot2)
library(gridExtra)

# Bar chart of how many people hold each degree, in thousands.
# NOTE(review): `degree_names` is a free-standing vector from the previous
# chunk, mapped alongside `degree_holders_2`; this only works because both
# have three elements in matching order — confirm before reusing.
count_plot <- ggplot(degree_holders_2,
                     aes(x = degree_names, y = count/1000,
                         fill = degree_names)) +
  geom_bar(stat = 'identity') +
  labs(x = 'Degree',
       y = 'No of People (000)',
       title = 'Number of Degree Holders in the US')

# Bar chart of the average income per degree, in thousands of dollars
income_plot <- ggplot(degree_holders_2,
                      aes(x = degree_names, y = avg_inc/1000,
                          fill = degree_names)) +
  geom_bar(stat = 'identity') +
  labs(x = 'Degree',
       y = 'Income (000)',
       title = 'Income of Degree Holders in the US')

# Stack the two charts vertically
grid.arrange(count_plot, income_plot, nrow = 2)



Pigeons, Chopsticks, Spanish Silver, and Weed

The American Racing Pigeon Union has a great database of race results.

Looking at pigeon data

# Check it out
head(pigeon)
##   Color Pos   Speed
## 1  BCWF   1 172.155
## 2  SIWF   2 163.569
## 3    BB   3 163.442
## 4  BBSP   4 163.392
## 5    BC   5 163.366
## 6    BC   6 163.190
str(pigeon)
## 'data.frame':    400 obs. of  3 variables:
##  $ Color: Factor w/ 29 levels "BB","BBPD","BBPI",..: 9 26 1 4 6 6 5 6 1 6 ...
##  $ Pos  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Speed: num  172 164 163 163 163 ...
# select those colors that appear over 10 times
over_ten_colors <- pigeon %>% 
  group_by(Color) %>% 
  summarize(count = n()) %>% 
  filter(count > 10)
over_ten_colors
## # A tibble: 4 x 2
##    Color count
##   <fctr> <int>
## 1     BB   177
## 2   BBWF    36
## 3     BC    92
## 4     RC    16
# calculate average speed for these colors
# (summarize() and summarise() are the same dplyr function — the two
# spellings are mixed in this script)
average_speed_per_color <- pigeon %>%
  filter(Color %in% over_ten_colors$Color) %>%
  group_by(Color) %>% 
  summarise(Average_Speed = mean(Speed))
average_speed_per_color
## # A tibble: 4 x 2
##    Color Average_Speed
##   <fctr>         <dbl>
## 1     BB      124.0063
## 2   BBWF      130.9791
## 3     BC      135.3931
## 4     RC      128.6933
# plot Speed against finishing position (Pos)
ggplot(pigeon, aes(x = Pos, y = Speed)) +
  geom_point() + 
  xlab('Rank') + 
  ylab('Pigeon Speed') + 
  ggtitle('The correlation between speed and rank')

Assessing chopsticks effectiveness

In a study named An Investigation for Determining the Optimum Length of Chopsticks researchers set out to determine the optimal length for chopsticks. One of the criteria to measure chopstick performance is food pinching efficiency (the number of peanuts picked up and placed in a cup).

# Check it out
head(chopstick)
##   Food_Pinching_Efficiency Individual Chopstick_Length
## 1                    19.55          1              180
## 2                    27.24          2              180
## 3                    28.76          3              180
## 4                    31.19          4              180
## 5                    21.91          5              180
## 6                    27.62          6              180
str(chopstick)
## 'data.frame':    186 obs. of  3 variables:
##  $ Food_Pinching_Efficiency: num  19.6 27.2 28.8 31.2 21.9 ...
##  $ Individual              : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Chopstick_Length        : Factor w/ 6 levels "180","210","240",..: 1 1 1 1 1 1 1 1 1 1 ...
# Overlaid density curves of pinching efficiency, one fill colour per
# chopstick length (Chopstick_Length is a 6-level factor); alpha keeps the
# overlapping densities readable
ggplot(chopstick, aes(x = Food_Pinching_Efficiency, fill = Chopstick_Length)) + 
  geom_density(alpha = 0.3) + 
  xlab('Food Pinching Efficiency') + 
  ylab('Relative Frequency')

The Spanish silver production

The Spaniards kept very detailed records on how much silver they were producing during the colonial era (1720 - 1800). Let’s explore their efficiency over the years using some visualizations.

# Check it out
head(silver)
##   year situados_paid silver_minted was_american_revolution
## 1 1720       1226215       7874315                       0
## 2 1721       1046440       9460730                       0
## 3 1722        376311       8823927                       0
## 4 1723        921332       8107343                       0
## 5 1724        928764       7872819                       0
## 6 1725        698335       7369815                       0
str(silver)
## 'data.frame':    81 obs. of  4 variables:
##  $ year                   : num  1720 1721 1722 1723 1724 ...
##  $ situados_paid          : num  1226215 1046440 376311 921332 928764 ...
##  $ silver_minted          : num  7874315 9460730 8823927 8107343 7872819 ...
##  $ was_american_revolution: num  0 0 0 0 0 0 0 0 0 0 ...
# yearly production of silver (divided by 1e6 so the axis reads in millions)
plot_a <- ggplot(silver, aes(x = year, y = silver_minted/1000000)) + 
  geom_area(fill = 'blue', alpha = 0.3) +
  xlab('Year') + 
  ylab('Silver Minted (000,000)') + 
  ggtitle('Yearly Production (1720 - 1800)')

# yearly production Of silver & cumulative production of silver between 1720 - 1800 
# (the new column is named `cumsum`, same as the base function that fills it —
# legal, but an unfortunate name)
silver_cs <- silver %>% 
  mutate(cumsum = cumsum(silver_minted))

# the grey cumulative area is added after — and so drawn on top of — the
# blue yearly area
plot_b <- ggplot(silver_cs, aes(x = year, y = silver_minted/1000000)) + 
  geom_area(fill = 'blue', alpha = 0.3) +
  geom_area(aes(y = cumsum/1000000), fill = '#c0c0c0', alpha = 0.5) +
  xlab('Year') + 
  ylab('Silver Minted (000,000)') + 
  ggtitle('Yearly & Cumulative Production (1720 - 1800)')

grid.arrange(plot_a, plot_b, nrow = 2)

Price differences in high quality marijuana per state

Investigate data from this repository of historical marijuana prices.

# Check it out
head(weed)
##        state  price
## 1    alabama 339.06
## 2     alaska 288.75
## 3    arizona 303.31
## 4   arkansas 361.85
## 5 california 248.78
## 6   colorado 236.31
str(weed)
## 'data.frame':    22899 obs. of  2 variables:
##  $ state: chr  "alabama" "alaska" "arizona" "arkansas" ...
##  $ price: num  339 289 303 362 249 ...
# load the acs, plyr and dplyr packages
library(acs)
library(plyr)
library(dplyr)

# get the average price per state
# NOTE: when plyr is attached after dplyr, plyr::summarise masks
# dplyr::summarise and the grouped summary silently collapses to a single
# overall mean — exactly what the one-row output below shows (and the
# two-name colnames<- further down cannot work on a one-column result).
# Namespacing the verbs makes the chunk correct regardless of load order.
average_weed_price <- weed %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(mean_price = mean(price))

average_weed_price
##   mean_price
## 1   329.7599
# load a mapping package
library(choroplethrMaps)
library(choroplethr)

# which states are most expensive?
# rename to the (region, value) pair that state_choropleth() expects
colnames(average_weed_price) <- c('region','value')

state_choropleth(average_weed_price, title = 'Average Weed Price Per State', legend = 'Price in $')


Irises and Diamonds

Irises

Different points of view of the R dataset.

library(ggplot2)
library(gridExtra)

# Petal width vs. petal length, one colour per species
petal_plot <- ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
  theme(legend.position = 'bottom') +
  geom_point()

# Sepal width vs. sepal length, one colour per species
sepal_plot <- ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
  theme(legend.position = 'bottom') +
  geom_point()

# Side by side
grid.arrange(petal_plot, sepal_plot, nrow = 1)

# Stacked
grid.arrange(petal_plot, sepal_plot, ncol = 1)

# Side by side, right panel twice as wide as the left
grid.arrange(petal_plot, sepal_plot, nrow = 1, widths = c(2, 4))

# Stacked, bottom panel twice as tall as the top
grid.arrange(petal_plot, sepal_plot, ncol = 1, heights = c(2, 4))

Diamonds

Different points of view of the R dataset.

library(tidyr)
library(dplyr)

# Stack every numeric diamond measurement into long form:
# dX holds the variable name, dV the measured value
diamonds_long <- diamonds %>%
  select(carat, depth:z) %>%
  gather(dX, dV, carat:z)

# One boxplot per measurement, each in its own panel with a free y scale.
# The constant string 'dX' on the x axis yields a single box per facet;
# suppressing the x breaks hides that dummy label.
ggplot(diamonds_long, aes('dX', dV)) +
  geom_boxplot() +
  facet_wrap(~dX, scales = 'free_y', nrow = 1) +
  labs(x = '', y = '') +
  scale_x_discrete(breaks = NULL)

# Width vs. depth for the raw data ...
raw_plot <- ggplot(diamonds, aes(y, z)) +
  geom_point() +
  labs(x = 'width', y = 'depth')

# ... and again with the extreme outliers trimmed away
trimmed <- filter(diamonds, y > 2, y < 11, z > 1, z < 7)
trimmed_plot <- ggplot(trimmed, aes(y, z)) +
  geom_point() +
  labs(x = 'width', y = 'depth')

grid.arrange(raw_plot, trimmed_plot, ncol = 2)

# The same carat histogram at three progressively finer bin widths
base_hist <- ggplot(diamonds, aes(x = carat)) + ylab('')
coarse <- base_hist + geom_histogram(binwidth = 1)
medium <- base_hist + geom_histogram(binwidth = 0.1)
fine <- base_hist + geom_histogram(binwidth = 0.01)

grid.arrange(arrangeGrob(coarse, medium, ncol = 2), fine, nrow = 2)