Instructions

Today in class we will begin exploring the campaign finance reports of Gov. Roy Cooper. For homework — in preparation for your final report — you will do the same analysis for your candidate.

Load the tidyverse package(s)

#install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor

Load the data

# Export URL on cf.ncsbe.gov for the report titled
# "Cooper for North Carolina - 2019 Year End Semi-Annual".
url <- "https://cf.ncsbe.gov/CFOrgLkup/ExportDetailResults/?ReportID=172102&Type=REC&Title=Cooper%20for%20North%20Carolina%20-%202019%20Year%20End%20Semi-Annual"

# read_csv() does not guess the right type for several columns of this export,
# so the date and text columns are declared explicitly; columns not listed
# here are still guessed. skip = 1 drops the first line of the file, so the
# following line is read as the header row.
cooper_rcpts_2019A <- read_csv(
  file = url,
  skip = 1,
  col_types = cols(
    Date = col_date(format = "%m/%d/%Y"),
    `Street 1` = col_character(),
    `Street 2` = col_character(),
    City = col_character(),
    State = col_character(),
    `Full Zip` = col_character(),
    `Country Name` = col_character(),
    `Outside US Postal Code` = col_character(),
    Profession = col_character(),
    `Employers Name` = col_character(),
    Purpose = col_character(),
    `Account Abbr` = col_character(),
    Description = col_character()
  )
)

# Swap the export's headers for short, space-free names. The assignment is
# positional, so it relies on the export keeping these 19 columns in this order.
names(cooper_rcpts_2019A) <- c(
  "date", "prior", "donor",
  "street1", "street2", "city", "state", "zip", "country", "postal",
  "profession", "employer", "purpose",
  "type", "account", "payment_form", "description",
  "amount", "sum_to_date"
)

# One Numeric Variable

# Histogram of every donation amount, using ggplot2's default binning.
ggplot(data = cooper_rcpts_2019A, mapping = aes(x = amount)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram collapsed to just two bins.
ggplot(data = cooper_rcpts_2019A, mapping = aes(x = amount)) +
  geom_histogram(bins = 2)

# Same histogram with eight bins.
ggplot(data = cooper_rcpts_2019A, mapping = aes(x = amount)) +
  geom_histogram(bins = 8)

# List the receipts whose amount exceeds 5400.
filter(cooper_rcpts_2019A, amount > 5400)
## # A tibble: 4 x 19
##   date       prior donor street1 street2 city  state zip   country postal
##   <date>     <lgl> <chr> <chr>   <chr>   <chr> <chr> <chr> <chr>   <chr> 
## 1 2019-12-31 NA    NC D… 434 FA… <NA>    RALE… NC    27601 United… <NA>  
## 2 2019-12-31 NA    NC D… 434 FA… <NA>    RALE… NC    27601 United… <NA>  
## 3 2019-12-31 NA    NC D… 434 FA… <NA>    RALE… NC    27601 United… <NA>  
## 4 2019-12-31 NA    NC D… 434 FA… <NA>    RALE… NC    27601 United… <NA>  
## # … with 9 more variables: profession <chr>, employer <chr>, purpose <chr>,
## #   type <chr>, account <chr>, payment_form <chr>, description <chr>,
## #   amount <dbl>, sum_to_date <dbl>
# Default-binned histogram restricted to receipts of 5400 or less.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Receipts of 5400 or less, with bins 500 wide.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(binwidth = 500)

# Receipts of 5400 or less, with very fine bins (10 wide).
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(binwidth = 10)

Extra credit: Add a new layer called scale_x_continuous to make it a little bit easier to interpret these granular results

# Fine-grained histogram again, now with an x-axis tick every 500 so the
# narrow bars are easier to place.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(breaks = seq(from = 0, to = 5500, by = 500))

From Cooper’s perspective, what are “small”, “medium”, and “large” donations?

# Three bins over receipts of 5400 or less: a rough small/medium/large split.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount)
) +
  geom_histogram(bins = 3)

Comparing number of donations by type.

# One histogram per receipt type, overlaid rather than stacked
# (position = "identity") and made translucent so overlaps stay visible.
ggplot(data = cooper_rcpts_2019A, mapping = aes(x = amount, fill = type)) +
  geom_histogram(binwidth = 100, position = "identity", alpha = 0.6)

By profession.

This is what you should do, but it will cause choking. Why? Because there are too many professions. But notice that even the “choking” reveals an insight.

# Overlaid translucent histograms, one per profession, for receipts of 5400
# or less.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount, fill = profession)
) +
  geom_histogram(binwidth = 100, position = "identity", alpha = 0.6)

By state. Also not super useful.

# Overlaid translucent histograms, one per state, for receipts of 5400 or less.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = amount, fill = state)
) +
  geom_histogram(binwidth = 100, position = "identity", alpha = 0.6)

Also with one numeric variable, density plot

This shows you proportion, rather than raw count.

# Density of donation amounts (5400 or less), one filled curve per receipt type.
# The fills are made translucent (alpha = 0.6, the same value used for the
# overlaid histograms) because opaque fills hide whichever curves are drawn
# underneath.
cooper_rcpts_2019A %>%
  filter(amount <= 5400) %>%
  ggplot(mapping = aes(x = amount, fill = type)) +
  geom_density(alpha = 0.6)
## Warning: Groups with fewer than two data points have been dropped.

Zoom with coord_cartesian()

# Same per-type density, zoomed in on amounts 0-1000 and densities 0-0.025.
# coord_cartesian() only changes the visible window; the densities are still
# estimated from all rows that pass the filter.
# alpha = 0.6 keeps overlapping fills from hiding one another.
cooper_rcpts_2019A %>%
  filter(amount <= 5400) %>%
  ggplot(mapping = aes(x = amount, fill = type)) +
  geom_density(alpha = 0.6) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 0.025))
## Warning: Groups with fewer than two data points have been dropped.

Extra Credit

# Histogram of receipts of 5400 or less in 500-wide bins, with a tick every 500.
# boundary = 0 puts the bin edges on multiples of 500 so each bar sits exactly
# between two ticks. Without it the bins are centred on the ticks (the first
# bar would span -250 to 250), which makes the bars hard to read against the
# axis.
cooper_rcpts_2019A %>%
  filter(amount <= 5400) %>%
  ggplot(mapping = aes(x = amount)) +
  geom_histogram(binwidth = 500, boundary = 0) +
  scale_x_continuous(breaks = seq(0, 5500, 500))

Two numeric values

We don’t really have this here. An example of this would be voter registration by precinct. So you would have the number of voters, etc.

One categorical variable.

This really only works with grouped data. Check out the difference between these two bars that at first blush look the same, but really are different.

# Out-of-state receipts plotted row by row: nothing is aggregated first, so
# each receipt is drawn as its own outlined rectangle, stacked within its state.
cooper_rcpts_2019A %>%
  filter(state != "NC") %>%
  ggplot(mapping = aes(x = state, y = amount)) +
  geom_col(colour = "black", fill = "yellow") +
  coord_flip()

# Out-of-state receipts summed per state before plotting, so each state gets
# a single bar whose length is its total.
cooper_rcpts_2019A %>%
  filter(state != "NC") %>%
  group_by(state) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = state, y = total)) +
  geom_col(colour = "black", fill = "yellow") +
  coord_flip()

An example with months …

# Total amount received in each calendar month (month number taken from date).
cooper_rcpts_2019A %>%
  mutate(month = month(date)) %>%
  group_by(month) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = month, y = total)) +
  geom_col()

Comparing one numeric value across multiple categorical variables

These stacked bars let us look at both month and type.

# Monthly totals split by receipt type; the types are stacked within each
# month's bar.
cooper_rcpts_2019A %>%
  mutate(month = month(date)) %>%
  group_by(month, type) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = month, y = total, fill = type)) +
  geom_col()

This is the same, but with side-by-side bars

# Monthly totals split by receipt type, with the types placed side by side
# instead of stacked.
cooper_rcpts_2019A %>%
  mutate(month = month(date)) %>%
  group_by(month, type) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = month, y = total, fill = type)) +
  geom_col(position = "dodge")

Stacked area graph

# Stacked area of monthly totals, one band per state.
# A state with no receipts in some month has no row for that month after
# summarise(), and geom_area() cannot stack series whose x values differ.
# complete() adds the missing state/month combinations with a total of 0 so
# every band is defined at every month. ungroup() comes first because the
# summary is still grouped by state and complete() would otherwise only expand
# within each state.
cooper_rcpts_2019A %>%
  group_by(state, month = month(date)) %>%
  summarise(total = sum(amount)) %>%
  ungroup() %>%
  complete(state, month, fill = list(total = 0)) %>%
  ggplot(aes(x = month, y = total, fill = state)) +
  geom_area()

Boxplot by month

What does this show? A ton of huge outliers every month.

# One boxplot of amounts (5400 or less) per month. The month number is
# wrapped in as.factor() so ggplot treats it as a category and draws a
# separate box for each month instead of a single box over a continuous axis.
ggplot(
  data = filter(cooper_rcpts_2019A, amount <= 5400),
  mapping = aes(x = as.factor(month(date)), y = amount)
) +
  geom_boxplot()

Kind of like those crosstabs we built.

# Crosstab-style heat map: one tile per receipt type and month, shaded by the
# total amount, with the legend formatted as dollars.
cooper_rcpts_2019A %>%
  group_by(type, month = month(date)) %>%
  summarise(total = sum(amount)) %>%
  ggplot(mapping = aes(x = month, y = type, fill = total)) +
  geom_tile() +
  scale_fill_continuous(name = "Donations", labels = dollar_format())