library(tidyverse)
library(lubridate) # for data management
library(rtweet) # for the lat_lng function
Read in the raw data. This is not dated so that I can keep updating it without breaking the link.
# Load the raw collected tweets. The data folder sits one level above this
# document, so build the path explicitly instead of changing the working
# directory with setwd() (which leaks state into anything that runs afterwards).
raw <- readRDS(file.path("..", "data", "raw-campfire-tweet-data.Rds"))
We start with 93962 collected tweets.
Keep English-only tweets.
# Keep only tweets tagged as English. Rows with a missing `lang` are dropped
# as well, because filter() discards rows where the condition is NA.
en.only <- dplyr::filter(raw, lang == "en")
Down to 84773 tweets.
Create a list of keywords to exclude.
# Keywords that mark a tweet as off-topic (festivals, music, travel, adult
# content, the concurrent shooting, ...). Order is irrelevant to the filter;
# it is kept as originally written.
drop.list <- c(
  "deepawali", "rap", "sauna", "Travel", "hiphop", "occult", "island",
  "domagick", "peace", "diwali", "dewali", "nature", "beach", "KPOP", "lit",
  "dragon", "caribbean", "paradiseisland2", "gameinsight", "bts_twt",
  "idyllic garden", "CelticFC", "Mingyu", "porn", "woodburning", "woodburner",
  "vacation", "gunman", "shooter"
)
@ratboygenius ?
_Ref:https://community.rstudio.com/t/multiple-str-detect-or-loop/25413/3_
# Flag which elements of `text` do NOT contain `fixed_string`.
#
# fixed_string: a single literal string (not a regular expression).
# text:         character vector to search.
#
# Returns a logical vector the same length as `text`: TRUE where the string is
# absent, FALSE where it occurs anywhere in the element (case-insensitive
# substring match). Missing text yields NA.
to_keep <- function(fixed_string, text) {
  pattern <- stringr::fixed(fixed_string, ignore_case = TRUE)
  has_match <- stringr::str_detect(text, pattern)
  !has_match
}
# Drop every tweet whose text contains at least one keyword from drop.list.
# For each keyword, to_keep() yields one logical vector over all rows; the
# vectors are combined elementwise with `&`, so a row survives only if no
# keyword matched it.
# NOTE(review): matching is by substring, so short keywords such as "rap" or
# "lit" also hit words like "trapped" or "literally" -- confirm this is
# acceptable for the analysis.
out <- dplyr::filter(
  en.only,
  purrr::reduce(
    purrr::map(drop.list, function(keyword) to_keep(keyword, text = text)),
    `&`
  )
)
Down to 73467 tweets.
Remove tweets from users such as pop stars that trigger keywords like paradise but have nothing to do with the fire.
Use the `lat_lng()` function to extract geo codes from the list variable and put them into their own variables.
# Accounts whose tweets are unrelated to the fire. Screen names are compared
# exactly (case-sensitive) by %in%.
drop.users <- c(
  "BTS_twt", "theAleppoCatman", "porn", "vita_valery", "RaveFootage",
  "campfirejp"
)

# Remove those accounts, let lat_lng() add the coordinate columns, then drop
# the columns that are no longer needed.
out2 <- out %>%
  filter(!(screen_name %in% drop.users)) %>%
  lat_lng() %>%
  select(-geo_coords, -lang)
Down to 73465 tweets.
Convert to PST and create different time blocks.
# Express the tweet timestamp in US/Pacific time and derive calendar parts
# plus minute- and hour-level time blocks from it.
# NOTE(review): round_date() rounds to the NEAREST unit, whereas `hour` and
# `minute` truncate; a tweet at 10:45 gets hour = 10 but tweet_hour = 11:00.
# Confirm that floor_date() was not intended.
out3 <- mutate(
  out2,
  created_at_pst = with_tz(created_at, tzone = "US/Pacific"),
  year = year(created_at_pst),
  month = month(created_at_pst),
  day = day(created_at_pst),
  hour = hour(created_at_pst),
  minute = minute(created_at_pst),
  tweet_min = round_date(created_at_pst, unit = "minutes"),
  tweet_hour = round_date(created_at_pst, unit = "hours")
)
# Remove tweets that are only about the Woolsey fire, which burned at the same
# time. A tweet is dropped when it mentions "woolsey" and contains no "camp"
# anywhere in its text (case-insensitive substring match, so "campfire",
# "Camp Fire" and "#CampFire" all count as a Camp Fire mention).
woolsey <- grepl("woolsey", out3[["text"]], ignore.case = TRUE)
campfire <- grepl("camp", out3[["text"]], ignore.case = TRUE)

drop.woolsey.only <- woolsey & !campfire
out3 <- out3[!drop.woolsey.only, ]
Exporting 72748 tweets that are hopefully about the Camp Fire.
# Write the cleaned tweets to a date-stamped file in the data folder one level
# above this document. The path is built explicitly rather than via setwd(),
# so the working directory is left untouched.
saveRDS(
  out3,
  file.path("..", "data", paste0("campfire-tweets-", today(), ".Rds"))
)
This code last run on 2020-04-27 20:55:29.
# Print the R version, platform and package versions used for this run.
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rtweet_0.7.0 lubridate_1.7.4 forcats_0.4.0
## [4] stringr_1.4.0 dplyr_0.8.99.9002 purrr_0.3.3
## [7] readr_1.3.1 tidyr_1.0.2 tibble_3.0.0
## [10] ggplot2_3.2.1 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.4 cellranger_1.1.0 pillar_1.4.3
## [4] compiler_3.6.2 tools_3.6.2 digest_0.6.25
## [7] jsonlite_1.6.1 evaluate_0.14 lifecycle_0.2.0
## [10] nlme_3.1-142 gtable_0.3.0 lattice_0.20-38
## [13] pkgconfig_2.0.3 rlang_0.4.5.9000 rstudioapi_0.11
## [16] cli_2.0.2 yaml_2.2.1 haven_2.1.1
## [19] xfun_0.9 withr_2.1.2 xml2_1.2.2
## [22] httr_1.4.1 knitr_1.24 hms_0.5.3
## [25] generics_0.0.2 vctrs_0.2.99.9011 grid_3.6.2
## [28] tidyselect_1.0.0 glue_1.4.0 R6_2.4.1
## [31] fansi_0.4.1 readxl_1.3.1 rmarkdown_1.15
## [34] modelr_0.1.5 magrittr_1.5 backports_1.1.5
## [37] scales_1.0.0 ellipsis_0.3.0 htmltools_0.3.6
## [40] rvest_0.3.4 assertthat_0.2.1 colorspace_1.4-1
## [43] stringi_1.4.6 lazyeval_0.2.2 munsell_0.5.0
## [46] broom_0.5.2 crayon_1.3.4