library(tidyverse); library(foreach)
library(lubridate);library(rtweet)
Read in the raw data.
# Read the raw hydrated tweets from the data/ directory one level above this
# document. The path is spelled out with file.path() instead of calling
# setwd("../"), so the session's working directory is left untouched.
raw <- readRDS(file.path("..", "data", "raw-covid-tweet-data.Rds"))
Hydrating tweets returns the Twitter data in a very different format than using rtweet.
# Print a compact overview of the raw data: one line per column with its type
# and first few values.
raw %>% glimpse()
## Rows: 410,564
## Columns: 93
## $ user_id <chr> "146569971", "146569971", "146569971",...
## $ tweet_id <dbl> 1.250552e+18, 1.250542e+18, 1.250534e+...
## $ created_at_UTC <dttm> 2020-04-15 22:31:01, 2020-04-15 21:52...
## $ screen_name <chr> "CDCgov", "CDCgov", "CDCgov", "CDCgov"...
## $ text <chr> "#HCPs: COCA Call on Thurs, April 16, ...
## $ source <chr> "Sprout Social", "Sprout Social", "Spr...
## $ display_text_width <dbl> 140, 140, 140, 140, NA, NA, NA, 140, 1...
## $ reply_to_status_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ reply_to_user_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ reply_to_screen_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ is_quote <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALS...
## $ is_retweet <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, TRUE...
## $ favorite_count <int> 127, 197, 249, 484, 0, 0, 0, 910, 304,...
## $ retweet_count <int> 64, 118, 178, 194, 0, 0, 0, 770, 189, ...
## $ quote_count <int> 6, 13, 11, 34, 0, 0, 0, 62, 11, 5, 5, ...
## $ reply_count <int> 17, 19, 36, 43, 0, 0, 0, 61, 38, 7, 29...
## $ hashtags <chr> "HCPs", "COVID19", "", "", "StayHome C...
## $ symbols <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ urls_url <list> ["twitter.com/i/web/status/1…", "twit...
## $ urls_t.co <list> ["https://t.co/doUlaKfV0M", "https://...
## $ urls_expanded_url <list> ["https://twitter.com/i/web/status/12...
## $ media_t.co <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ media_expanded_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ media_type <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_t.co <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_expanded_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ mentions_user_id <list> [NA, NA, NA, NA, "455024343", "447838...
## $ mentions_screen_name <list> [NA, NA, NA, NA, "Surgeon_General", "...
## $ lang <chr> "en", "en", "en", "en", "en", "en", "e...
## $ quoted_status_id <chr> NA, NA, NA, NA, "1250151847559671808",...
## $ quoted_text <chr> NA, NA, NA, NA, "Being #ActiveandHealt...
## $ quoted_created_at <dttm> NA, NA, NA, NA, 2020-04-14 20:00:01, ...
## $ quoted_source <chr> NA, NA, NA, NA, "Sprinklr Publishing",...
## $ quoted_favorite_count <int> NA, NA, NA, NA, 37, NA, NA, NA, NA, NA...
## $ quoted_retweet_count <int> NA, NA, NA, NA, 29, NA, NA, NA, NA, NA...
## $ quoted_user_id <chr> NA, NA, NA, NA, "44783853", NA, NA, NA...
## $ quoted_screen_name <chr> NA, NA, NA, NA, "HHSGov", NA, NA, NA, ...
## $ quoted_name <chr> NA, NA, NA, NA, "HHS.gov", NA, NA, NA,...
## $ quoted_followers_count <int> NA, NA, NA, NA, 929804, NA, NA, NA, NA...
## $ quoted_friends_count <int> NA, NA, NA, NA, 395, NA, NA, NA, NA, N...
## $ quoted_statuses_count <int> NA, NA, NA, NA, 21169, NA, NA, NA, NA,...
## $ quoted_location <chr> NA, NA, NA, NA, "Washington, D.C.", NA...
## $ quoted_description <chr> NA, NA, NA, NA, "News and information ...
## $ quoted_verified <lgl> NA, NA, NA, NA, TRUE, NA, NA, NA, NA, ...
## $ retweet_status_id <dbl> NA, NA, NA, NA, 1.250414e+18, 1.250458...
## $ retweet_text <chr> NA, NA, NA, NA, "This is particularly ...
## $ retweet_created_at <dttm> NA, NA, NA, NA, 2020-04-15 13:19:45, ...
## $ retweet_source <chr> NA, NA, NA, NA, "Twitter Web App", "Sp...
## $ retweet_favorite_count <int> NA, NA, NA, NA, 261, 167, 212, NA, NA,...
## $ retweet_retweet_count <int> NA, NA, NA, NA, 107, 89, 75, NA, NA, N...
## $ retweet_user_id <chr> NA, NA, NA, NA, "455024343", "44783853...
## $ retweet_screen_name <chr> NA, NA, NA, NA, "Surgeon_General", "HH...
## $ retweet_name <chr> NA, NA, NA, NA, "U.S. Surgeon General"...
## $ retweet_followers_count <int> NA, NA, NA, NA, 772909, 929804, 65073,...
## $ retweet_friends_count <int> NA, NA, NA, NA, 223, 395, 159, NA, NA,...
## $ retweet_statuses_count <int> NA, NA, NA, NA, 9501, 21169, 3219, NA,...
## $ retweet_location <chr> NA, NA, NA, NA, "Washington, DC", "Was...
## $ retweet_description <chr> NA, NA, NA, NA, "U.S. Surgeon General ...
## $ retweet_verified <lgl> NA, NA, NA, NA, TRUE, TRUE, TRUE, NA, ...
## $ place_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_full_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ country <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ country_code <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ coords_coords <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, N...
## $ bbox_coords <list> [<NA, NA, NA, NA, NA, NA, NA, NA>, <N...
## $ tweet_url <chr> "https://twitter.com/CDCgov/status/125...
## $ name <chr> "CDC", "CDC", "CDC", "CDC", "CDC", "CD...
## $ user_location <chr> "Atlanta, GA", "Atlanta, GA", "Atlanta...
## $ user_description <chr> "CDC's official Twitter source for dai...
## $ url <chr> "http://www.cdc.gov", "http://www.cdc....
## $ protected <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
## $ user_followers_count <int> 2549787, 2549787, 2549787, 2549787, 25...
## $ user_friends_count <int> 266, 266, 266, 266, 266, 266, 266, 266...
## $ user_listed_count <int> 17160, 17160, 17160, 17160, 17160, 171...
## $ user_statuses_count <int> 26566, 26566, 26566, 26566, 26566, 265...
## $ user_favourites_count <int> 522, 522, 522, 522, 522, 522, 522, 522...
## $ account_created_at <dttm> 2010-05-21 19:40:40, 2010-05-21 19:40...
## $ verified <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR...
## $ profile_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ profile_expanded_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ account_lang <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ profile_banner_url <chr> "https://pbs.twimg.com/profile_banners...
## $ profile_background_url <chr> "http://abs.twimg.com/images/themes/th...
## $ profile_image_url <chr> "http://pbs.twimg.com/profile_images/8...
## $ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ lng <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ urls <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ possibly_sensitive <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ user_time_zone <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
It’s also interesting to note that not all tweet IDs that were submitted were hydrated. If a tweet was deleted, it can’t be recovered. We’re currently at a very low (50%) hydration rate; this is likely due to the large number of tweets that were misinformation and so have been removed.
# Derive calendar components and time-bin columns from the UTC timestamp, then
# sort the tweets chronologically.
#
# The tweet_min / tweet_hour / tweet_day bins use floor_date() so that each
# tweet lands in the minute, hour and day it was posted in. round_date() rounds
# to the nearest unit, which pushed e.g. a 22:31 tweet into the next day's bin
# and disagreed with the `day` and `hour` columns computed alongside.
out <- raw %>%
  mutate(
    year = year(created_at_UTC),
    month = month(created_at_UTC),
    day = day(created_at_UTC),
    hour = hour(created_at_UTC),
    minute = minute(created_at_UTC),
    tweet_min = floor_date(created_at_UTC, unit = "minutes"),
    tweet_hour = floor_date(created_at_UTC, unit = "hours"),
    tweet_day = floor_date(created_at_UTC, unit = "days")
  ) %>%
  arrange(created_at_UTC)
In rtweet, if the flag is_quote==TRUE, then information on the original tweet is contained in the quoted_ variables. If is_quote is missing (NA in the is_quote variable, and no quoted_ variables), is_quote is then set to FALSE.
# Treat a missing is_quote flag as "not a quote": replace NA with FALSE.
out$is_quote[is.na(out$is_quote)] <- FALSE
Not run. Just for example.
# Engagement stats for original tweets, i.e. rows that are neither quotes nor
# retweets.
stats.on.orig.tweets <- out %>%
  filter(!is_quote, !is_retweet) %>%
  select(screen_name, tweet_id, favorite_count:reply_count, user_followers_count)

# Engagement stats for the quoted tweets, with the quoted_* columns renamed to
# match the column names used for original tweets so the two can be stacked.
# NOTE(review): tweet_id here is the id of the quoting tweet, not
# quoted_status_id -- confirm this is the intended key.
stats.on.qt <- out %>%
  filter(is_quote) %>%
  select(
    screen_name = quoted_screen_name,
    tweet_id,
    favorite_count = quoted_favorite_count,
    retweet_count = quoted_retweet_count,
    user_followers_count = quoted_followers_count
  )

# Stack both sets and keep only tweet_ids that occur exactly once. The result
# is ungrouped so later steps do not silently operate per tweet_id.
stats.on.comb.tweets <- bind_rows(stats.on.orig.tweets, stats.on.qt) %>%
  group_by(tweet_id) %>%
  filter(n() == 1) %>%
  ungroup()
# Save the cleaned data to the data/ directory one level up, with today's date
# in the file name. The path is spelled out with file.path() instead of calling
# setwd("../"), so the session's working directory is left untouched.
saveRDS(
  out,
  file.path("..", "data", paste0("covid-tweets-", today(), ".Rds"))
)
Then copy this cleaned data to Google Drive.
Last run on 2020-04-30 10:17:49
# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rtweet_0.7.0 lubridate_1.7.4 foreach_1.4.7 forcats_0.4.0
## [5] stringr_1.4.0 dplyr_0.8.4 purrr_0.3.3 readr_1.3.1
## [9] tidyr_1.0.2 tibble_3.0.0 ggplot2_3.2.1 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.10 haven_2.1.1 lattice_0.20-38
## [5] colorspace_1.4-1 vctrs_0.2.4 generics_0.0.2 htmltools_0.4.0
## [9] yaml_2.2.1 utf8_1.1.4 rlang_0.4.5 pillar_1.4.3
## [13] glue_1.3.1 withr_2.1.2 modelr_0.1.5 readxl_1.3.1
## [17] lifecycle_0.2.0 munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
## [21] rvest_0.3.4 codetools_0.2-16 evaluate_0.14 knitr_1.25
## [25] fansi_0.4.0 broom_0.5.2 Rcpp_1.0.3 scales_1.0.0
## [29] backports_1.1.5 jsonlite_1.6.1 hms_0.5.1 digest_0.6.21
## [33] stringi_1.4.3 grid_3.6.1 cli_2.0.1 tools_3.6.1
## [37] magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.3
## [41] ellipsis_0.3.0 xml2_1.2.2 assertthat_0.2.1 rmarkdown_1.18
## [45] httr_1.4.1 rstudioapi_0.10 iterators_1.0.12 R6_2.4.0
## [49] nlme_3.1-140 compiler_3.6.1