library(lubridate) # for data management
library(rtweet)    # for the lat_lng function


Read in the raw data. The file name is deliberately not dated, so that I can keep updating the raw data without breaking this link.

# Load the full set of collected tweets from the (undated) raw data file.
raw <- readRDS(file = "data/raw-campfire-tweet-data.Rds")

We start with 93962 collected tweets.

Data cleaning

Filter out some tweets.


English only tweets

# Keep only the tweets whose language code is "en".
en.only <- raw %>%
  filter(lang == "en")

Down to 84773 tweets.

Non-relevant keywords

Create a list of keywords to exclude.

# Keywords marking a tweet as off-topic; any tweet containing one is dropped.
drop.list <- c(
  "deepawali", "rap", "sauna", "Travel", "hiphop", "occult", "island",
  "domagick", "peace", "diwali", "dewali", "nature", "beach", "KPOP",
  "lit", "dragon", "caribbean", "paradiseisland2", "gameinsight",
  "bts_twt", "idyllic garden", "CelticFC", "Mingyu", "porn",
  "woodburning", "woodburner", "vacation", "gunman", "shooter"
)

@ratboygenius ?


# Flag the elements of `text` that do NOT contain `fixed_string`.
#
# fixed_string: a single literal (non-regex) keyword to look for.
# text:         character vector to search.
# Returns a logical vector the same length as `text`: TRUE = keep.
#
# Matching is case-insensitive and substring-based, so short keywords such as
# "rap" or "lit" also match inside longer words ("wrap", "little").
to_keep <- function(fixed_string, text) {
  !stringr::str_detect(text, stringr::fixed(fixed_string, ignore_case = TRUE))
}

# Keep only the tweets whose text contains none of the keywords in drop.list.
out <- en.only %>%
  filter(
    drop.list %>%
      # apply the filter of all the text rows for each pattern
      # you'll get one list of logical by pattern ignored_string
      purrr::map(~ to_keep(.x, text = text)) %>%
      # get a logical vector of rows to keep:
      # a row survives only if every keyword check returned TRUE
      purrr::pmap_lgl(all)
  )

Down to 73467 tweets.

Non-relevant users

Remove tweets from users such as pop stars that trigger keywords like paradise but have nothing to do with the fire.

Geo coding

Use the lat_lng function to extract geo codes from the list variable and put them into their own variables.

# Accounts whose tweets are removed regardless of content.
drop.users <- c(
  "BTS_twt", "theAleppoCatman", "porn",
  "vita_valery", "RaveFootage", "campfirejp"
)

# Drop those accounts, let lat_lng() add coordinate columns, then discard
# the columns that are no longer needed.
out2 <- out %>%
  filter(!(screen_name %in% drop.users)) %>%
  lat_lng() %>%
  select(-geo_coords, -lang)

Down to 73465 tweets.

Dates and Times

Convert to PST and create different time blocks.

# Re-express the tweet timestamp in the US/Pacific time zone and derive
# calendar components plus minute- and hour-level time blocks from it.
# NOTE(review): round_date() rounds to the NEAREST unit, so a tweet at 10:45
# lands in the 11:00 hour block -- confirm this is intended (vs. floor_date()).
out3 <- out2 %>%
  mutate(
    created_at_pst = with_tz(created_at, tzone = "US/Pacific"),
    year       = year(created_at_pst),
    month      = month(created_at_pst),
    day        = day(created_at_pst),
    hour       = hour(created_at_pst),
    minute     = minute(created_at_pst),
    tweet_min  = round_date(created_at_pst, "minutes"),
    tweet_hour = round_date(created_at_pst, "hours")
  )

Exclude tweets that are only about the Woolsey fire, which was burning at the same time. Tweets that mention both fires are kept.

# Flag tweets mentioning each fire (case-insensitive match on the tweet text).
woolsey <- grepl("woolsey", out3$text, ignore.case = TRUE)
campfire <- grepl("camp", out3$text, ignore.case = TRUE)

# Drop only the tweets that mention Woolsey without also mentioning "camp";
# tweets that mention both fires are kept.
drop.woolsey.only <- (woolsey & !campfire)
out3 <- out3[!drop.woolsey.only, ]

Export cleaned data

Exporting 72748 tweets that are, hopefully, about the Camp Fire.

# Write the cleaned tweets to a file stamped with today's date.
saveRDS(
  object = out3,
  file = paste0("data/campfire-tweets-", today(), ".Rds")
)

This code last run on 2020-04-27 20:55:29.

