Today in class we will begin exploring the campaign finance reports of Gov. Roy Cooper. For homework — in preparation for your final report — you will do the same analysis for your candidate.
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Receipts export for the "Cooper for North Carolina - 2019 Year End
# Semi-Annual" report (ReportID and Title are encoded in the query string).
url <- "https://cf.ncsbe.gov/CFOrgLkup/ExportDetailResults/?ReportID=172102&Type=REC&Title=Cooper%20for%20North%20Carolina%20-%202019%20Year%20End%20Semi-Annual"

# read_csv() does not do a great job guessing the column types for this
# export, so the text and date columns are declared explicitly. Columns not
# listed in cols() are still guessed by readr. `skip = 1` makes readr ignore
# the first line of the download before it looks for the header row.
cooper_rcpts_2019A <- read_csv(
  url,
  col_types = cols(
    `Account Abbr` = col_character(),
    City = col_character(),
    `Country Name` = col_character(),
    Date = col_date(format = "%m/%d/%Y"),
    Description = col_character(),
    `Employers Name` = col_character(),
    `Full Zip` = col_character(),
    `Outside US Postal Code` = col_character(),
    Profession = col_character(),
    Purpose = col_character(),
    State = col_character(),
    `Street 1` = col_character(),
    `Street 2` = col_character()
  ),
  skip = 1
)

# The rename below is purely positional (19 names for 19 columns). Fail loudly
# if the export ever arrives with a different number of columns, rather than
# silently attaching the short names to the wrong fields.
stopifnot(ncol(cooper_rcpts_2019A) == 19)

# Rename the columns to remove spaces and generally promote brevity.
names(cooper_rcpts_2019A) <- c(
  "date", "prior", "donor", "street1", "street2", "city", "state", "zip",
  "country", "postal", "profession", "employer", "purpose", "type", "account",
  "payment_form", "description", "amount", "sum_to_date"
)
#One Numeric Variable
# Distribution of receipt amounts. No bins/binwidth is supplied, so
# geom_histogram() falls back to its default of 30 bins.
ggplot(data = cooper_rcpts_2019A, mapping = aes(x = amount)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# The same histogram collapsed into just two bins.
ggplot(data = cooper_rcpts_2019A, mapping = aes(x = amount)) +
  geom_histogram(bins = 2)

# ... and again with eight bins.
ggplot(data = cooper_rcpts_2019A, mapping = aes(x = amount)) +
  geom_histogram(bins = 8)

# Print the receipts whose amount is greater than 5400.
filter(cooper_rcpts_2019A, amount > 5400)
## # A tibble: 4 x 19
## date prior donor street1 street2 city state zip country postal
## <date> <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2019-12-31 NA NC D… 434 FA… <NA> RALE… NC 27601 United… <NA>
## 2 2019-12-31 NA NC D… 434 FA… <NA> RALE… NC 27601 United… <NA>
## 3 2019-12-31 NA NC D… 434 FA… <NA> RALE… NC 27601 United… <NA>
## 4 2019-12-31 NA NC D… 434 FA… <NA> RALE… NC 27601 United… <NA>
## # … with 9 more variables: profession <chr>, employer <chr>, purpose <chr>,
## # type <chr>, account <chr>, payment_form <chr>, description <chr>,
## # amount <dbl>, sum_to_date <dbl>
# Histogram restricted to receipts of 5400 or less, default binning.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Receipts of 5400 or less, bins 500 wide.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(binwidth = 500)

# Same data, much finer bins (10 wide).
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(binwidth = 10)

# Fine bins again, with an x-axis break every 500 from 0 to 5500.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(breaks = seq(from = 0, to = 5500, by = 500))

# Same data squeezed into only three bins.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(bins = 3)

# All receipts (no amount filter), one semi-transparent histogram per `type`.
# position = "identity" draws the histograms on top of each other, not stacked.
ggplot(data = cooper_rcpts_2019A, mapping = aes(x = amount, fill = type)) +
  geom_histogram(alpha = 0.6, position = "identity", binwidth = 100)
This is what you should do next, but it will cause the plot to choke. Why? Because there are too many distinct professions to give each one a readable fill color and legend entry. But notice that even the “choked” plot reveals an insight.
# Overlaid histograms of receipts of 5400 or less, one fill color per profession.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount, fill = profession)
) +
  geom_histogram(alpha = 0.6, position = "identity", binwidth = 100)

# The same plot, filled by the donor's state instead.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount, fill = state)
) +
  geom_histogram(alpha = 0.6, position = "identity", binwidth = 100)
A density plot shows you the proportion of receipts at each amount (the area under each curve adds up to 1), rather than the raw count.
# Density curve of receipt amounts (5400 or less), one filled curve per `type`.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount, fill = type)
) +
  geom_density()
## Warning: Groups with fewer than two data points have been dropped.
# Same density plot, zoomed to amounts 0-1000 and densities 0-0.025.
# coord_cartesian() changes only the visible window; the densities are still
# computed from every row that passed the filter.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount, fill = type)
) +
  geom_density() +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 0.025))
## Warning: Groups with fewer than two data points have been dropped.
# Histogram of receipts of 5400 or less in 500-wide bins, with an x-axis break
# every 500 from 0 to 5500.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(binwidth = 500) +
  scale_x_continuous(breaks = seq(from = 0, to = 5500, by = 500))
We don’t really have this here. An example of this would be voter registration by precinct. So you would have the number of voters, etc.
This really only works with grouped data. Check out the difference between these two bar charts, which at first blush look the same but really are different.
# Out-of-state receipts, un-summarised: every row becomes its own outlined
# segment, and the segments for a state are stacked on top of one another.
cooper_rcpts_2019A %>%
  filter(state != "NC") %>%
  ggplot(mapping = aes(x = state, y = amount)) +
  geom_col(color = "black", fill = "yellow") +
  coord_flip()

# Out-of-state receipts, summed per state first: exactly one bar per state.
cooper_rcpts_2019A %>%
  filter(state != "NC") %>%
  group_by(state) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = state, y = total)) +
  geom_col(color = "black", fill = "yellow") +
  coord_flip()
An example with months …
# Total amount received in each calendar month (month number taken from `date`).
cooper_rcpts_2019A %>%
  group_by(month = month(date)) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = month, y = total)) +
  geom_col()
These stacked bars let us look at both month and type.
# Monthly totals split by receipt `type`; the types are stacked within each
# month's bar (the default position for bars).
cooper_rcpts_2019A %>%
  group_by(month = month(date), type) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = month, y = total, fill = type)) +
  geom_col()
This is the same, but with side-by-side bars.
# Monthly totals split by receipt `type`, with the types drawn next to each
# other (position = "dodge") instead of stacked.
cooper_rcpts_2019A %>%
  group_by(month = month(date), type) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = month, y = total, fill = type)) +
  geom_col(position = "dodge")
### Stacked area graph
# Stacked area chart of monthly receipt totals, one band per state.
cooper_rcpts_2019A %>%
  group_by(state, month = month(date)) %>%
  summarise(total = sum(amount)) %>%
  # Drop the leftover grouping so complete() below works across the whole table.
  ungroup() %>%
  # A state with no receipts in a given month has no row in the summary. Left
  # that way, geom_area() would draw straight across the gap (implying money
  # that was never received) and stack series of different lengths against each
  # other. Give every state/month pair an explicit row, with total = 0 where
  # nothing was received.
  complete(state, month, fill = list(total = 0)) %>%
  ggplot(mapping = aes(x = month, y = total, fill = state)) +
  geom_area()
What does this show? A ton of huge outliers every month.
# Box plot of receipt amounts (5400 or less) for each calendar month.
# month(date) is a number, so it is wrapped in as.factor() to turn the
# continuous variable into a categorical one and get one box per month.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = as.factor(month(date)), y = amount)
) +
  geom_boxplot()
# Heat map: one tile per (month, type) pair, shaded by the summed amount.
# The legend is titled "Donations" and its labels are formatted as dollars.
cooper_rcpts_2019A %>%
  group_by(type, month = month(date)) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = month, y = type, fill = total)) +
  geom_tile() +
  scale_fill_continuous(name = "Donations", labels = dollar_format())