library(tidyverse) 
library(lubridate) # for data management
library(rtweet)    # for the lat_lng function

Import

Read in the raw data. The raw file name is deliberately not dated, so I can keep updating the file without breaking the path used below.

# Read the raw tweet collection from the data folder that sits one level above
# the current working directory. The path is built with file.path() instead of
# calling setwd(), so the working directory is left untouched.
raw <- readRDS(file.path("..", "data", "raw-campfire-tweet-data.Rds"))

We start with 93962 collected tweets.

Data cleaning

Filter out some tweets.

Language

English only tweets

# Keep only the tweets whose `lang` value is "en".
en.only <- dplyr::filter(raw, lang == "en")

Down to 84773 tweets.

Non-relevant keywords

Create a list of keywords to exclude.

# Keywords that mark a tweet as off-topic; used by the keyword filter that
# builds `out`.
drop.list <- c(
  "deepawali", "rap", "sauna", "Travel", "hiphop", "occult", "island",
  "domagick", "peace", "diwali", "dewali", "nature", "beach", "KPOP", "lit",
  "dragon", "caribbean", "paradiseisland2", "gameinsight", "bts_twt",
  "idyllic garden", "CelticFC", "Mingyu", "porn", "woodburning", "woodburner",
  "vacation", "gunman", "shooter"
)

@ratboygenius ?

_Ref:_ <https://community.rstudio.com/t/multiple-str-detect-or-loop/25413/3>

# Flag which texts do NOT contain a given string.
#
# fixed_string: string matched literally (not as a regular expression),
#               ignoring case.
# text:         character vector to search.
# Returns a logical vector with one value per element of `text`: TRUE where
# the string is absent (keep the row), FALSE where it is present (drop it).
to_keep <- function(fixed_string, text) {
  literal <- stringr::fixed(fixed_string, ignore_case = TRUE)
  found <- stringr::str_detect(text, literal)
  !found
}

# Drop every English tweet whose text contains any keyword in drop.list.
# NOTE(review): to_keep() matches case-insensitive substrings, so short
# keywords such as "rap" or "lit" also hit words like "trapped" or "little" --
# confirm this is intended.
out <- dplyr::filter(
  en.only,
  drop.list %>%
    # one logical keep-vector per keyword, each covering all rows of `text`
    purrr::map(function(keyword) to_keep(keyword, text = text)) %>%
    # a row is kept only when every keyword's vector says keep
    purrr::pmap_lgl(all)
)

Down to 73467 tweets.

Non-relevant users

Remove tweets from accounts, such as pop stars, that match keywords like "paradise" but have nothing to do with the fire.

Geo coding

Use rtweet's `lat_lng()` function to extract the geo codes from the list variable and store them in their own columns.

# Accounts whose tweets are removed regardless of content.
drop.users <- c(
  "BTS_twt", "theAleppoCatman", "porn",
  "vita_valery", "RaveFootage", "campfirejp"
)

# Remove tweets from those accounts, run rtweet's lat_lng() on the result,
# then drop the geo_coords and lang columns.
out2 <- out %>%
  filter(!(screen_name %in% drop.users)) %>%
  lat_lng() %>%
  select(-geo_coords, -lang)

Down to 73465 tweets.

Dates and Times

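Convert the timestamps to US Pacific time and create time blocks at different resolutions (year, month, day, hour, minute, plus timestamps rounded to the minute and to the hour).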

# Add a US/Pacific copy of the tweet timestamp, then derive calendar/clock
# components and rounded timestamps from it.
# NOTE(review): round_date() rounds to the NEAREST unit, so a tweet at 10:45
# gets tweet_hour 11:00 while `hour` is 10 -- confirm that floor_date() was
# not intended for the time blocks.
out3 <- out2 %>%
  mutate(created_at_pst = with_tz(created_at, "US/Pacific")) %>%
  mutate(
    year = year(created_at_pst),
    month = month(created_at_pst),
    day = day(created_at_pst),
    hour = hour(created_at_pst),
    minute = minute(created_at_pst),
    tweet_min = round_date(created_at_pst, unit = "minutes"),
    tweet_hour = round_date(created_at_pst, unit = "hours")
  )

Exclude tweets about the Woolsey fire, which burned at the same time: drop tweets that mention "woolsey" but not "camp".

# Case-insensitive flags: does the tweet text mention "woolsey" / "camp"?
woolsey <- grepl("woolsey", out3[["text"]], ignore.case = TRUE)
campfire <- grepl("camp", out3[["text"]], ignore.case = TRUE)

# Drop only tweets that mention Woolsey without also mentioning "camp";
# tweets that mention both are kept.
drop.woolsey.only <- woolsey & !campfire
out3 <- out3[!drop.woolsey.only, ]

Export cleaned data

Exporting 72748 tweets that are, hopefully, about the Camp Fire.

# Save the cleaned tweets to the data folder one level above the current
# working directory, with today's date (from today()) in the file name.
# The path is built with file.path() instead of calling setwd(), so the
# working directory is left untouched.
saveRDS(
  out3,
  file.path("..", "data", paste0("campfire-tweets-", today(), ".Rds"))
)

This code last run on 2020-04-27 20:55:29.

# Print the R version, platform, locale, and package versions used for this
# run, for reproducibility.
sessionInfo()

## R version 3.6.2 (2019-12-12)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rtweet_0.7.0      lubridate_1.7.4   forcats_0.4.0    
##  [4] stringr_1.4.0     dplyr_0.8.99.9002 purrr_0.3.3      
##  [7] readr_1.3.1       tidyr_1.0.2       tibble_3.0.0     
## [10] ggplot2_3.2.1     tidyverse_1.2.1  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.4        cellranger_1.1.0  pillar_1.4.3     
##  [4] compiler_3.6.2    tools_3.6.2       digest_0.6.25    
##  [7] jsonlite_1.6.1    evaluate_0.14     lifecycle_0.2.0  
## [10] nlme_3.1-142      gtable_0.3.0      lattice_0.20-38  
## [13] pkgconfig_2.0.3   rlang_0.4.5.9000  rstudioapi_0.11  
## [16] cli_2.0.2         yaml_2.2.1        haven_2.1.1      
## [19] xfun_0.9          withr_2.1.2       xml2_1.2.2       
## [22] httr_1.4.1        knitr_1.24        hms_0.5.3        
## [25] generics_0.0.2    vctrs_0.2.99.9011 grid_3.6.2       
## [28] tidyselect_1.0.0  glue_1.4.0        R6_2.4.1         
## [31] fansi_0.4.1       readxl_1.3.1      rmarkdown_1.15   
## [34] modelr_0.1.5      magrittr_1.5      backports_1.1.5  
## [37] scales_1.0.0      ellipsis_0.3.0    htmltools_0.3.6  
## [40] rvest_0.3.4       assertthat_0.2.1  colorspace_1.4-1 
## [43] stringi_1.4.6     lazyeval_0.2.2    munsell_0.5.0    
## [46] broom_0.5.2       crayon_1.3.4

Historical Twitter Data Cleaning