library(tidyverse)
# Read the raw plant/bee observations into a tibble.
# NOTE(review): this is an absolute, machine-specific path; the CSV must
# exist at exactly this location for the rest of the script to run.
bees <- read_csv(file = "C:/Users/Z/Downloads/plants_and_bees.csv")

Since we are focusing on which plants bees prefer, I removed all samples taken from the air.

# Keep only the observations that are tied to a plant: rows whose
# plant_species is the literal string "None" are removed. Rows where
# plant_species is NA are removed too, because filter() drops rows whose
# condition evaluates to NA.
bees_with_plant <- bees %>%
  filter(plant_species != "None")

# View() opens a data viewer and is only useful in an interactive session;
# guard it so non-interactive runs (Rscript, knitting) skip it.
if (interactive()) {
  View(bees_with_plant)
}

I then created various data frames to help focus on certain information.

First I created tables that focused on either native or non-native bee species and graphed the results.

# Total bees (weighted by bees_num) per plant species for rows where
# nonnative_bee == 0, sorted from the largest total down.
# plant_species is added as the innermost grouping level so that tally()
# peels it off again, leaving the result grouped by native_or_non.
plant_summary_native <- bees_with_plant %>%
  filter(nonnative_bee == 0) %>%
  group_by(native_or_non, plant_species) %>%
  tally(wt = bees_num, sort = TRUE)

# Total bees (weighted by bees_num) per plant species for rows where
# nonnative_bee == 1, sorted from the largest total down.
# plant_species is added as the innermost grouping level so that tally()
# peels it off again, leaving the result grouped by native_or_non.
plant_summary_non <- bees_with_plant %>%
  filter(nonnative_bee == 1) %>%
  group_by(native_or_non, plant_species) %>%
  tally(wt = bees_num, sort = TRUE)

# Fill colours keyed by the values of native_or_non (shown in the legend
# as "Plant Origin").
cols <- c("non-native" = "palevioletred2", "native" = "lightskyblue")

# Bar chart of the native-bee totals per plant species. Species are ordered
# from the highest total to the lowest and coloured by native_or_non.
ggplot(
  data = plant_summary_native,
  mapping = aes(
    x = reorder(plant_species, -n),
    y = n,
    fill = native_or_non
  )
) +
  geom_col() +
  scale_fill_manual(values = cols) +
  labs(
    title = "Native Bee Preferences",
    x = "Plant Species",
    y = "Total Bees Counted",
    fill = "Plant Origin"
  ) +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90))

# Bar chart of the non-native-bee totals per plant species. Species are
# ordered from the highest total to the lowest and coloured by
# native_or_non, using the shared `cols` palette.
ggplot(
  data = plant_summary_non,
  mapping = aes(
    x = reorder(plant_species, -n),
    y = n,
    fill = native_or_non
  )
) +
  geom_col() +
  scale_fill_manual(values = cols) +
  labs(
    title = "Non-native Bee Preferences",
    x = "Plant Species",
    y = "Total Bees Counted",
    fill = "Plant Origin"
  ) +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90))

Next I created tables that provided information about the timeframe of samples, focusing on the native bees. I then zoomed in on the 6 most popular species and graphed the results.

# Total bees (weighted by bees_num) per plant species for rows where
# nonnative_bee == 0, broken down by season and native_or_non and sorted
# from the largest total down. The result stays grouped by season and
# native_or_non.
native_season <- bees_with_plant %>%
  filter(nonnative_bee == 0) %>%
  group_by(season, native_or_non) %>%
  count(plant_species, wt = bees_num, sort = TRUE)
# Hand-picked plant species used to zoom in on the seasonal counts.
pop_species <- c(
  "Leucanthemum vulgare",
  "Rudbeckia hirta",
  "Cichorium intybus",
  "Daucus carota",
  "Chamaecrista fasciculata",
  "Asclepias tuberosa"
)

# Season labels that include the sampling date ranges. The second label
# contains four spaces followed by a line break between the season name and
# its dates; the break is written as an explicit "\n" escape here.
# NOTE(review): season_lab is not referenced by the code visible in this
# section - confirm whether it is used further down or was meant to label
# the facets.
season_lab <- c(
  "Early Season 04/18/2017 - 07/03/2017",
  "Late Season    \n07/18/2017 - 08/02/2017"
)
  
# Restrict the native-bee seasonal counts to the species listed in
# pop_species and replace the raw season codes with display labels.
#
# The relabelling is done with mutate() on an ungrouped tibble rather than
# by matrix-style `[<-` assignment into a grouping column of a grouped
# tibble. The result is therefore ungrouped; all columns and rows are
# otherwise the same.
# Assumes season is a character column (as read_csv() produces for text);
# any value other than the two codes below is kept unchanged.
native_season_zoom <- native_season %>%
  ungroup() %>%
  filter(plant_species %in% pop_species) %>%
  mutate(
    season = case_when(
      season == "early.season" ~ "Early Season",
      season == "late.season" ~ "Late Season",
      TRUE ~ season
    )
  )

# Bar chart of the native-bee totals for the selected species, one panel
# per season. Species are ordered from the highest total to the lowest and
# each species gets its own fill colour.
ggplot(
  data = native_season_zoom,
  mapping = aes(
    x = reorder(plant_species, -n),
    y = n,
    fill = plant_species
  )
) +
  geom_col() +
  facet_wrap(vars(season)) +
  labs(
    title = "Native Bee Preferences",
    x = "Plant Species",
    y = "Total Bees Counted",
    fill = "Plant Species"
  ) +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90))

When examining a single sample, I opted to focus on the sample with the most entries. I limited it to samples that had plants. I then graphed the distribution of that sample.

# Number of rows recorded for each sample_id among the plant-associated
# observations, with the sample that has the most rows listed first.
diverse_sample <- bees_with_plant %>%
  count(sample_id, sort = TRUE)

# All rows belonging to sample 17473, grouped by plant species.
# NOTE(review): the sample ID is hard-coded rather than taken from
# diverse_sample, and the rows come from `bees`, not `bees_with_plant`, so
# any plant_species == "None" rows of this sample are included - confirm
# that both are intended.
sample_dist <- group_by(
  filter(bees, sample_id == 17473),
  plant_species
)
  

# Stacked bar chart for the single sample: one bar per plant species, bar
# height is bees_num, and the segments are filled by bee species (fill
# levels ordered from the highest bees_num to the lowest).
ggplot(
  data = sample_dist,
  mapping = aes(
    x = plant_species,
    y = bees_num,
    fill = reorder(bee_species, -bees_num)
  )
) +
  geom_col() +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Distribution of a Sample",
    subtitle = "Sample ID 17473, 07/18/2017",
    x = "Plant Species",
    y = "Total Bees Counted",
    fill = "Bee Species"
  )