Foreword
Introducing the American Community Survey
Analyze data of the 2013 American Community Survey to find out whether it makes sense to pursue a PhD. The end result is a Kaggle script that can be shared on Kaggle.
# Investigate first 10 observations
head(AC_Survey_Subset, 10)## SCHL ESR PINCP
## 1 19 6 0
## 2 20 1 52000
## 3 16 1 99000
## 4 19 6 0
## 5 19 6 0
## 6 21 1 39930
## 7 14 6 10300
## 8 16 3 1100
## 9 9 NA NA
## 10 1 6 3900
str(AC_Survey_Subset)## Classes 'data.table' and 'data.frame': 3132795 obs. of 3 variables:
## $ SCHL : int 19 20 16 19 19 21 14 16 9 1 ...
## $ ESR : int 6 1 1 6 6 1 6 3 NA 6 ...
## $ PINCP: int 0 52000 99000 0 0 39930 10300 1100 NA 3900 ...
## - attr(*, ".internal.selfref")=<externalptr>
The dataset contains over 300000 observations and 3 variables: SCHL (School Level), PINCP (Income) and ESR (Work Status).
Preparing your data set for further analysis
The data are messy. Clean it up.
# load in the dplyr package
library(dplyr)
# convert AC_Survey_Subset to tbl_df
AC_Survey_Subset <- tbl_df(AC_Survey_Subset)
head(AC_Survey_Subset, 20)## # A tibble: 20 x 3
## SCHL ESR PINCP
## <int> <int> <int>
## 1 19 6 0
## 2 20 1 52000
## 3 16 1 99000
## 4 19 6 0
## 5 19 6 0
## 6 21 1 39930
## 7 14 6 10300
## 8 16 3 1100
## 9 9 NA NA
## 10 1 6 3900
## 11 10 6 5400
## 12 16 1 90000
## 13 18 1 46000
## 14 17 6 39500
## 15 17 6 13100
## 16 19 6 103000
## 17 21 1 53600
## 18 19 1 28000
## 19 16 1 18700
## 20 16 6 6000
# remove NA, only keep school level 21, 22 and 4, group the set by school level
# use the pipe operator and chaining
AC_Survey_Subset_Cleaned <- AC_Survey_Subset %>%
na.omit() %>%
filter(SCHL %in% c(21,22,24)) %>%
group_by(SCHL)
# Check it out
head(AC_Survey_Subset_Cleaned, 5)## # A tibble: 5 x 3
## # Groups: SCHL [1]
## SCHL ESR PINCP
## <int> <int> <int>
## 1 21 1 39930
## 2 21 1 53600
## 3 21 3 2000
## 4 21 3 16000
## 5 21 1 100000
How many are there? - Part one
Look at the numbers of BSc, MSc & PhD holders in the US. Print tables.
# count the number of Bachelor (21), Masters (22) and PhD holders (24) or per school level
degree_holders <- summarize(AC_Survey_Subset_Cleaned, count = n())
degree_holders## # A tibble: 3 x 2
## SCHL count
## <int> <int>
## 1 21 423943
## 2 22 183182
## 3 24 30877
# compute the average income per school level
avg_inc <- AC_Survey_Subset_Cleaned %>%
group_by(SCHL) %>%
summarize(avg_inc = mean(PINCP))
avg_inc## # A tibble: 3 x 2
## SCHL avg_inc
## <int> <dbl>
## 1 21 56427.38
## 2 22 72421.54
## 3 24 100421.51
# join degree_codes with degree_holders, assign to degree_holders_2
degree_holders_2 <- inner_join(degree_holders, avg_inc)
degree_holders_2## # A tibble: 3 x 3
## SCHL count avg_inc
## <int> <int> <dbl>
## 1 21 423943 56427.38
## 2 22 183182 72421.54
## 3 24 30877 100421.51
degree_names <- c('Bachelor', 'Master', 'PhD')
cbind(degree_names, degree_holders_2)## degree_names SCHL count avg_inc
## 1 Bachelor 21 423943 56427.38
## 2 Master 22 183182 72421.54
## 3 PhD 24 30877 100421.51
How many are there? - Part two
And how much do they earn in average?
# load the ggplot2 and gridExtra packages
library(ggplot2)
library(gridExtra)
# visualize the number of Bachelor (21), Masters (22) and PhD (24) holders
plot1 <- ggplot(degree_holders_2, aes(x = degree_names, y = count/1000, fill = degree_names)) +
geom_bar(stat = 'identity') +
xlab('Degree') +
ylab('No of People (000)') +
ggtitle('Number of Degree Holders in the US')
plot2 <- ggplot(degree_holders_2, aes(x = degree_names, y = avg_inc/1000, fill = degree_names)) +
geom_bar(stat = 'identity') +
xlab('Degree') +
ylab('Income (000)') +
ggtitle('Income of Degree Holders in the US')
grid.arrange(plot1, plot2, nrow = 2)The American Racing Pigeon Union has a great database of race results.
# Check it out
head(pigeon)## Color Pos Speed
## 1 BCWF 1 172.155
## 2 SIWF 2 163.569
## 3 BB 3 163.442
## 4 BBSP 4 163.392
## 5 BC 5 163.366
## 6 BC 6 163.190
str(pigeon)## 'data.frame': 400 obs. of 3 variables:
## $ Color: Factor w/ 29 levels "BB","BBPD","BBPI",..: 9 26 1 4 6 6 5 6 1 6 ...
## $ Pos : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Speed: num 172 164 163 163 163 ...
# select those colors that appear over 10 times
over_ten_colors <- pigeon %>%
group_by(Color) %>%
summarize(count = n()) %>%
filter(count > 10)
over_ten_colors## # A tibble: 4 x 2
## Color count
## <fctr> <int>
## 1 BB 177
## 2 BBWF 36
## 3 BC 92
## 4 RC 16
# calculate average speed for these colors
average_speed_per_color <- pigeon %>%
filter(Color %in% over_ten_colors$Color) %>%
group_by(Color) %>%
summarise(Average_Speed = mean(Speed))
average_speed_per_color## # A tibble: 4 x 2
## Color Average_Speed
## <fctr> <dbl>
## 1 BB 124.0063
## 2 BBWF 130.9791
## 3 BC 135.3931
## 4 RC 128.6933
# plot Speed
ggplot(pigeon, aes(x = Pos, y = Speed)) +
geom_point() +
xlab('Rank') +
ylab('Pigeon Speed') +
ggtitle('The correlation between speed and rank')In a study named An Investigation for Determining the Optimum Length of Chopsticks researchers set out to determine the optimal length for chopsticks. One of the criteria to measure chopstick performance is food pinching efficiency (the number of peanuts picked up and placed in a cup).
# Check it out
head(chopstick)## Food_Pinching_Efficiency Individual Chopstick_Length
## 1 19.55 1 180
## 2 27.24 2 180
## 3 28.76 3 180
## 4 31.19 4 180
## 5 21.91 5 180
## 6 27.62 6 180
str(chopstick)## 'data.frame': 186 obs. of 3 variables:
## $ Food_Pinching_Efficiency: num 19.6 27.2 28.8 31.2 21.9 ...
## $ Individual : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Chopstick_Length : Factor w/ 6 levels "180","210","240",..: 1 1 1 1 1 1 1 1 1 1 ...
ggplot(chopstick, aes(x = Food_Pinching_Efficiency, fill = Chopstick_Length)) +
geom_density(alpha = 0.3) +
xlab('Food Pinching Efficiency') +
ylab('Relative Frequency')The Spaniards kept very detailed records on how much silver they were producing during the colonial era (1720 - 1800). Let’s explore their efficiency over the years using some visualizations.
# Check it out
head(silver)## year situados_paid silver_minted was_american_revolution
## 1 1720 1226215 7874315 0
## 2 1721 1046440 9460730 0
## 3 1722 376311 8823927 0
## 4 1723 921332 8107343 0
## 5 1724 928764 7872819 0
## 6 1725 698335 7369815 0
str(silver)## 'data.frame': 81 obs. of 4 variables:
## $ year : num 1720 1721 1722 1723 1724 ...
## $ situados_paid : num 1226215 1046440 376311 921332 928764 ...
## $ silver_minted : num 7874315 9460730 8823927 8107343 7872819 ...
## $ was_american_revolution: num 0 0 0 0 0 0 0 0 0 0 ...
# yearly production of silver
plot_a <- ggplot(silver, aes(x = year, y = silver_minted/1000000)) +
geom_area(fill = 'blue', alpha = 0.3) +
xlab('Year') +
ylab('Silver Minted (000,000)') +
ggtitle('Yearly Production (1720 - 1800)')
# yearly production Of silver & cumulative production of silver between 1720 - 1800
silver_cs <- silver %>%
mutate(cumsum = cumsum(silver_minted))
plot_b <- ggplot(silver_cs, aes(x = year, y = silver_minted/1000000)) +
geom_area(fill = 'blue', alpha = 0.3) +
geom_area(aes(y = cumsum/1000000), fill = '#c0c0c0', alpha = 0.5) +
xlab('Year') +
ylab('Silver Minted (000,000)') +
ggtitle('Yearly & Cumulative Production (1720 - 1800)')
grid.arrange(plot_a, plot_b, nrow = 2)Investigate data from this repository of historical marijuana prices.
# Check it out
head(weed)## state price
## 1 alabama 339.06
## 2 alaska 288.75
## 3 arizona 303.31
## 4 arkansas 361.85
## 5 california 248.78
## 6 colorado 236.31
str(weed)## 'data.frame': 22899 obs. of 2 variables:
## $ state: chr "alabama" "alaska" "arizona" "arkansas" ...
## $ price: num 339 289 303 362 249 ...
# load in the dplyr package
library(acs)
library(plyr)
library(dplyr)
# get the average price per state
average_weed_price <- weed %>% group_by(state) %>% summarise(mean_price = mean(price))
average_weed_price## mean_price
## 1 329.7599
# load a mapping package
library(choroplethrMaps)
library(choroplethr)
# which states are most expensive?
colnames(average_weed_price) <- c('region','value')
state_choropleth(average_weed_price, title = 'Average Weed Price Per State', legend = 'Price in $')Different points of view of the R dataset.
library(ggplot2)
library(gridExtra)
c1 <- ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
geom_point() +
theme(legend.position = 'bottom')
c2 <- ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
geom_point() +
theme(legend.position = 'bottom')
grid.arrange(c1, c2, nrow = 1)grid.arrange(c1, c2, ncol = 1)grid.arrange(c1, c2, nrow = 1, widths = c(2, 4))grid.arrange(c1, c2, ncol = 1, heights = c(2, 4))Different points of view of the R dataset.
library(tidyr)
library(dplyr)
diam1 <- diamonds %>% select(carat, depth:z) %>% gather(dX, dV, carat:z)
ggplot(diam1, aes('dX', dV)) +
geom_boxplot() +
facet_wrap(~dX, scales = 'free_y', nrow = 1) +
xlab('') +
ylab('') +
scale_x_discrete(breaks = NULL)a2 <- ggplot(diamonds, aes(y, z)) +
geom_point() +
xlab('width') +
ylab('depth')
d2 <- filter(diamonds, y > 2 & y < 11 & z > 1 & z < 7)
b2 <- ggplot(d2, aes(y, z)) +
geom_point() +
xlab('width') +
ylab('depth')
grid.arrange(a2, b2, ncol = 2)p0 <- ggplot(diamonds, aes(x = carat)) + ylab('')
p1 <- p0 + geom_histogram(binwidth = 1)
p2 <- p0 + geom_histogram(binwidth = 0.1)
p3 <- p0 + geom_histogram(binwidth = 0.01)
grid.arrange(arrangeGrob(p1, p2, ncol = 2), p3, nrow = 2)