library(tidyverse); library(foreach)
library(lubridate);library(rtweet)

Import

Read in the raw data.

# Load the raw tweet data from ../data. The path is spelled out relative to
# the current working directory instead of calling setwd(), so the session's
# working directory is left untouched and later relative paths keep resolving
# from the same place.
raw <- readRDS(file.path("..", "data", "raw-covid-tweet-data.Rds"))

Hydrating tweets returns the Twitter data in a very different format from the one rtweet returns.

# Print a compact, one-line-per-column overview of the raw data.
raw %>% glimpse()
## Rows: 410,564
## Columns: 93
## $ user_id                 <chr> "146569971", "146569971", "146569971",...
## $ tweet_id                <dbl> 1.250552e+18, 1.250542e+18, 1.250534e+...
## $ created_at_UTC          <dttm> 2020-04-15 22:31:01, 2020-04-15 21:52...
## $ screen_name             <chr> "CDCgov", "CDCgov", "CDCgov", "CDCgov"...
## $ text                    <chr> "#HCPs: COCA Call on Thurs, April 16, ...
## $ source                  <chr> "Sprout Social", "Sprout Social", "Spr...
## $ display_text_width      <dbl> 140, 140, 140, 140, NA, NA, NA, 140, 1...
## $ reply_to_status_id      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ reply_to_user_id        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ reply_to_screen_name    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ is_quote                <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALS...
## $ is_retweet              <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, TRUE...
## $ favorite_count          <int> 127, 197, 249, 484, 0, 0, 0, 910, 304,...
## $ retweet_count           <int> 64, 118, 178, 194, 0, 0, 0, 770, 189, ...
## $ quote_count             <int> 6, 13, 11, 34, 0, 0, 0, 62, 11, 5, 5, ...
## $ reply_count             <int> 17, 19, 36, 43, 0, 0, 0, 61, 38, 7, 29...
## $ hashtags                <chr> "HCPs", "COVID19", "", "", "StayHome C...
## $ symbols                 <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ urls_url                <list> ["twitter.com/i/web/status/1…", "twit...
## $ urls_t.co               <list> ["https://t.co/doUlaKfV0M", "https://...
## $ urls_expanded_url       <list> ["https://twitter.com/i/web/status/12...
## $ media_t.co              <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ media_expanded_url      <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ media_type              <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_url           <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_t.co          <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_expanded_url  <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_type          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ mentions_user_id        <list> [NA, NA, NA, NA, "455024343", "447838...
## $ mentions_screen_name    <list> [NA, NA, NA, NA, "Surgeon_General", "...
## $ lang                    <chr> "en", "en", "en", "en", "en", "en", "e...
## $ quoted_status_id        <chr> NA, NA, NA, NA, "1250151847559671808",...
## $ quoted_text             <chr> NA, NA, NA, NA, "Being #ActiveandHealt...
## $ quoted_created_at       <dttm> NA, NA, NA, NA, 2020-04-14 20:00:01, ...
## $ quoted_source           <chr> NA, NA, NA, NA, "Sprinklr Publishing",...
## $ quoted_favorite_count   <int> NA, NA, NA, NA, 37, NA, NA, NA, NA, NA...
## $ quoted_retweet_count    <int> NA, NA, NA, NA, 29, NA, NA, NA, NA, NA...
## $ quoted_user_id          <chr> NA, NA, NA, NA, "44783853", NA, NA, NA...
## $ quoted_screen_name      <chr> NA, NA, NA, NA, "HHSGov", NA, NA, NA, ...
## $ quoted_name             <chr> NA, NA, NA, NA, "HHS.gov", NA, NA, NA,...
## $ quoted_followers_count  <int> NA, NA, NA, NA, 929804, NA, NA, NA, NA...
## $ quoted_friends_count    <int> NA, NA, NA, NA, 395, NA, NA, NA, NA, N...
## $ quoted_statuses_count   <int> NA, NA, NA, NA, 21169, NA, NA, NA, NA,...
## $ quoted_location         <chr> NA, NA, NA, NA, "Washington, D.C.", NA...
## $ quoted_description      <chr> NA, NA, NA, NA, "News and information ...
## $ quoted_verified         <lgl> NA, NA, NA, NA, TRUE, NA, NA, NA, NA, ...
## $ retweet_status_id       <dbl> NA, NA, NA, NA, 1.250414e+18, 1.250458...
## $ retweet_text            <chr> NA, NA, NA, NA, "This is particularly ...
## $ retweet_created_at      <dttm> NA, NA, NA, NA, 2020-04-15 13:19:45, ...
## $ retweet_source          <chr> NA, NA, NA, NA, "Twitter Web App", "Sp...
## $ retweet_favorite_count  <int> NA, NA, NA, NA, 261, 167, 212, NA, NA,...
## $ retweet_retweet_count   <int> NA, NA, NA, NA, 107, 89, 75, NA, NA, N...
## $ retweet_user_id         <chr> NA, NA, NA, NA, "455024343", "44783853...
## $ retweet_screen_name     <chr> NA, NA, NA, NA, "Surgeon_General", "HH...
## $ retweet_name            <chr> NA, NA, NA, NA, "U.S. Surgeon General"...
## $ retweet_followers_count <int> NA, NA, NA, NA, 772909, 929804, 65073,...
## $ retweet_friends_count   <int> NA, NA, NA, NA, 223, 395, 159, NA, NA,...
## $ retweet_statuses_count  <int> NA, NA, NA, NA, 9501, 21169, 3219, NA,...
## $ retweet_location        <chr> NA, NA, NA, NA, "Washington, DC", "Was...
## $ retweet_description     <chr> NA, NA, NA, NA, "U.S. Surgeon General ...
## $ retweet_verified        <lgl> NA, NA, NA, NA, TRUE, TRUE, TRUE, NA, ...
## $ place_url               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_name              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_full_name         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_type              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ country                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ country_code            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ coords_coords           <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, N...
## $ bbox_coords             <list> [<NA, NA, NA, NA, NA, NA, NA, NA>, <N...
## $ tweet_url               <chr> "https://twitter.com/CDCgov/status/125...
## $ name                    <chr> "CDC", "CDC", "CDC", "CDC", "CDC", "CD...
## $ user_location           <chr> "Atlanta, GA", "Atlanta, GA", "Atlanta...
## $ user_description        <chr> "CDC's official Twitter source for dai...
## $ url                     <chr> "http://www.cdc.gov", "http://www.cdc....
## $ protected               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
## $ user_followers_count    <int> 2549787, 2549787, 2549787, 2549787, 25...
## $ user_friends_count      <int> 266, 266, 266, 266, 266, 266, 266, 266...
## $ user_listed_count       <int> 17160, 17160, 17160, 17160, 17160, 171...
## $ user_statuses_count     <int> 26566, 26566, 26566, 26566, 26566, 265...
## $ user_favourites_count   <int> 522, 522, 522, 522, 522, 522, 522, 522...
## $ account_created_at      <dttm> 2010-05-21 19:40:40, 2010-05-21 19:40...
## $ verified                <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR...
## $ profile_url             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ profile_expanded_url    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ account_lang            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ profile_banner_url      <chr> "https://pbs.twimg.com/profile_banners...
## $ profile_background_url  <chr> "http://abs.twimg.com/images/themes/th...
## $ profile_image_url       <chr> "http://pbs.twimg.com/profile_images/8...
## $ lat                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ lng                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ urls                    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ possibly_sensitive      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ user_time_zone          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

It’s also worth noting that not all of the submitted tweet IDs were hydrated: if a tweet has been deleted, it can’t be recovered. We’re currently at a very low (50%) hydration rate, likely because a large number of the tweets were misinformation and have since been removed.

Create some new variables.

# Derive calendar components and time-bin columns from the tweet timestamp
# (created_at_UTC), then sort the tweets chronologically.
#
# The tweet_min / tweet_hour / tweet_day bins use floor_date() so that every
# tweet lands in the period that contains it. round_date() rounds to the
# NEAREST boundary, which pushes e.g. a 22:31 tweet into the next day's bin and
# makes tweet_day disagree with the `day` column computed just above it.
out <- raw %>%
  mutate(
    year = year(created_at_UTC),
    month = month(created_at_UTC),
    day = day(created_at_UTC),
    hour = hour(created_at_UTC),
    minute = minute(created_at_UTC),
    tweet_min = floor_date(created_at_UTC, unit = "minutes"),
    tweet_hour = floor_date(created_at_UTC, unit = "hours"),
    tweet_day = floor_date(created_at_UTC, unit = "days")
  ) %>%
  arrange(created_at_UTC)

Extract original tweet text and stats out of RT+Quotes

Decision: if is_quote is missing, set it to FALSE.

# Treat a missing quote flag as "not a quote".
out <- out %>%
  mutate(is_quote = replace_na(is_quote, FALSE))

Get original tweet data out of quote

Not run. Just for example.

# Engagement stats for tweets that are neither quotes nor retweets.
stats.on.orig.tweets <- out %>%
  filter(!is_quote, !is_retweet) %>%
  select(screen_name, tweet_id, favorite_count:reply_count, user_followers_count)

# Stats of the quoted tweet, taken from the quoted_* columns and renamed so
# the columns line up with stats.on.orig.tweets when the two are stacked.
# NOTE(review): tweet_id here is still the ID of the quoting tweet, not
# quoted_status_id -- confirm that this is the intended key.
stats.on.qt <- out %>%
  filter(is_quote) %>%
  select(
    screen_name = quoted_screen_name,
    tweet_id,
    favorite_count = quoted_favorite_count,
    retweet_count = quoted_retweet_count,
    user_followers_count = quoted_followers_count
  )

# Stack both sets and keep only tweet IDs that occur exactly once. The result
# is ungrouped so that later verbs do not silently operate per tweet_id.
stats.on.comb.tweets <- bind_rows(stats.on.orig.tweets, stats.on.qt) %>%
  group_by(tweet_id) %>%
  filter(n() == 1) %>%
  ungroup()

Export cleaned data

# Write the cleaned data to a date-stamped file in ../data. The path is given
# relative to the current working directory rather than changing it with
# setwd(), so this step has no side effect on the session.
saveRDS(out, file.path("..", "data", paste0("covid-tweets-", today(), ".Rds")))

Then copy this cleaned data to Google Drive.


Last run on 2020-04-30 10:17:49

# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rtweet_0.7.0    lubridate_1.7.4 foreach_1.4.7   forcats_0.4.0  
##  [5] stringr_1.4.0   dplyr_0.8.4     purrr_0.3.3     readr_1.3.1    
##  [9] tidyr_1.0.2     tibble_3.0.0    ggplot2_3.2.1   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5 xfun_0.10        haven_2.1.1      lattice_0.20-38 
##  [5] colorspace_1.4-1 vctrs_0.2.4      generics_0.0.2   htmltools_0.4.0 
##  [9] yaml_2.2.1       utf8_1.1.4       rlang_0.4.5      pillar_1.4.3    
## [13] glue_1.3.1       withr_2.1.2      modelr_0.1.5     readxl_1.3.1    
## [17] lifecycle_0.2.0  munsell_0.5.0    gtable_0.3.0     cellranger_1.1.0
## [21] rvest_0.3.4      codetools_0.2-16 evaluate_0.14    knitr_1.25      
## [25] fansi_0.4.0      broom_0.5.2      Rcpp_1.0.3       scales_1.0.0    
## [29] backports_1.1.5  jsonlite_1.6.1   hms_0.5.1        digest_0.6.21   
## [33] stringi_1.4.3    grid_3.6.1       cli_2.0.1        tools_3.6.1     
## [37] magrittr_1.5     lazyeval_0.2.2   crayon_1.3.4     pkgconfig_2.0.3 
## [41] ellipsis_0.3.0   xml2_1.2.2       assertthat_0.2.1 rmarkdown_1.18  
## [45] httr_1.4.1       rstudioapi_0.10  iterators_1.0.12 R6_2.4.0        
## [49] nlme_3.1-140     compiler_3.6.1