library(tidyverse); library(foreach)


Read in the raw data.

# Load the raw (hydrated) tweet data set from its serialized .Rds file.
raw <- readRDS(file = "data/raw-covid-tweet-data.Rds")

Hydrating tweets returns the Twitter data in a very different format from the one you get when using rtweet.

## Rows: 410,564
## Columns: 93
## $ user_id                 <chr> "146569971", "146569971", "146569971",...
## $ tweet_id                <dbl> 1.250552e+18, 1.250542e+18, 1.250534e+...
## $ created_at_UTC          <dttm> 2020-04-15 22:31:01, 2020-04-15 21:52...
## $ screen_name             <chr> "CDCgov", "CDCgov", "CDCgov", "CDCgov"...
## $ text                    <chr> "#HCPs: COCA Call on Thurs, April 16, ...
## $ source                  <chr> "Sprout Social", "Sprout Social", "Spr...
## $ display_text_width      <dbl> 140, 140, 140, 140, NA, NA, NA, 140, 1...
## $ reply_to_status_id      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ reply_to_user_id        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ reply_to_screen_name    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ is_quote                <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALS...
## $ is_retweet              <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, TRUE...
## $ favorite_count          <int> 127, 197, 249, 484, 0, 0, 0, 910, 304,...
## $ retweet_count           <int> 64, 118, 178, 194, 0, 0, 0, 770, 189, ...
## $ quote_count             <int> 6, 13, 11, 34, 0, 0, 0, 62, 11, 5, 5, ...
## $ reply_count             <int> 17, 19, 36, 43, 0, 0, 0, 61, 38, 7, 29...
## $ hashtags                <chr> "HCPs", "COVID19", "", "", "StayHome C...
## $ symbols                 <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ urls_url                <list> ["…", "twit...
## $ urls_t.co               <list> ["", "https://...
## $ urls_expanded_url       <list> ["
## $ media_t.co              <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ media_expanded_url      <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ media_type              <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_url           <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_t.co          <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_expanded_url  <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ext_media_type          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ mentions_user_id        <list> [NA, NA, NA, NA, "455024343", "447838...
## $ mentions_screen_name    <list> [NA, NA, NA, NA, "Surgeon_General", "...
## $ lang                    <chr> "en", "en", "en", "en", "en", "en", "e...
## $ quoted_status_id        <chr> NA, NA, NA, NA, "1250151847559671808",...
## $ quoted_text             <chr> NA, NA, NA, NA, "Being #ActiveandHealt...
## $ quoted_created_at       <dttm> NA, NA, NA, NA, 2020-04-14 20:00:01, ...
## $ quoted_source           <chr> NA, NA, NA, NA, "Sprinklr Publishing",...
## $ quoted_favorite_count   <int> NA, NA, NA, NA, 37, NA, NA, NA, NA, NA...
## $ quoted_retweet_count    <int> NA, NA, NA, NA, 29, NA, NA, NA, NA, NA...
## $ quoted_user_id          <chr> NA, NA, NA, NA, "44783853", NA, NA, NA...
## $ quoted_screen_name      <chr> NA, NA, NA, NA, "HHSGov", NA, NA, NA, ...
## $ quoted_name             <chr> NA, NA, NA, NA, "", NA, NA, NA,...
## $ quoted_followers_count  <int> NA, NA, NA, NA, 929804, NA, NA, NA, NA...
## $ quoted_friends_count    <int> NA, NA, NA, NA, 395, NA, NA, NA, NA, N...
## $ quoted_statuses_count   <int> NA, NA, NA, NA, 21169, NA, NA, NA, NA,...
## $ quoted_location         <chr> NA, NA, NA, NA, "Washington, D.C.", NA...
## $ quoted_description      <chr> NA, NA, NA, NA, "News and information ...
## $ quoted_verified         <lgl> NA, NA, NA, NA, TRUE, NA, NA, NA, NA, ...
## $ retweet_status_id       <dbl> NA, NA, NA, NA, 1.250414e+18, 1.250458...
## $ retweet_text            <chr> NA, NA, NA, NA, "This is particularly ...
## $ retweet_created_at      <dttm> NA, NA, NA, NA, 2020-04-15 13:19:45, ...
## $ retweet_source          <chr> NA, NA, NA, NA, "Twitter Web App", "Sp...
## $ retweet_favorite_count  <int> NA, NA, NA, NA, 261, 167, 212, NA, NA,...
## $ retweet_retweet_count   <int> NA, NA, NA, NA, 107, 89, 75, NA, NA, N...
## $ retweet_user_id         <chr> NA, NA, NA, NA, "455024343", "44783853...
## $ retweet_screen_name     <chr> NA, NA, NA, NA, "Surgeon_General", "HH...
## $ retweet_name            <chr> NA, NA, NA, NA, "U.S. Surgeon General"...
## $ retweet_followers_count <int> NA, NA, NA, NA, 772909, 929804, 65073,...
## $ retweet_friends_count   <int> NA, NA, NA, NA, 223, 395, 159, NA, NA,...
## $ retweet_statuses_count  <int> NA, NA, NA, NA, 9501, 21169, 3219, NA,...
## $ retweet_location        <chr> NA, NA, NA, NA, "Washington, DC", "Was...
## $ retweet_description     <chr> NA, NA, NA, NA, "U.S. Surgeon General ...
## $ retweet_verified        <lgl> NA, NA, NA, NA, TRUE, TRUE, TRUE, NA, ...
## $ place_url               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_name              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_full_name         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ place_type              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ country                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ country_code            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ coords_coords           <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, N...
## $ bbox_coords             <list> [<NA, NA, NA, NA, NA, NA, NA, NA>, <N...
## $ tweet_url               <chr> "
## $ name                    <chr> "CDC", "CDC", "CDC", "CDC", "CDC", "CD...
## $ user_location           <chr> "Atlanta, GA", "Atlanta, GA", "Atlanta...
## $ user_description        <chr> "CDC's official Twitter source for dai...
## $ url                     <chr> "", "http://www.cdc....
## $ protected               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
## $ user_followers_count    <int> 2549787, 2549787, 2549787, 2549787, 25...
## $ user_friends_count      <int> 266, 266, 266, 266, 266, 266, 266, 266...
## $ user_listed_count       <int> 17160, 17160, 17160, 17160, 17160, 171...
## $ user_statuses_count     <int> 26566, 26566, 26566, 26566, 26566, 265...
## $ user_favourites_count   <int> 522, 522, 522, 522, 522, 522, 522, 522...
## $ account_created_at      <dttm> 2010-05-21 19:40:40, 2010-05-21 19:40...
## $ verified                <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR...
## $ profile_url             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ profile_expanded_url    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ account_lang            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ profile_banner_url      <chr> "
## $ profile_background_url  <chr> "
## $ profile_image_url       <chr> "
## $ lat                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ lng                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ urls                    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ possibly_sensitive      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ user_time_zone          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

It’s also interesting to note that not all of the tweet IDs that were submitted were hydrated. If a tweet has been deleted, it can’t be recovered. We’re currently at a very low (50%) hydration rate; this is likely because a large number of the tweets were misinformation and have since been removed.

Create some new variables.

# Derive date/time helper columns from the UTC creation timestamp:
# individual calendar components plus timestamps rounded to the minute,
# hour and day. The pipeline must end after mutate(); a trailing pipe with
# no right-hand side is a syntax error.
out <- raw %>%
  mutate(
    year = year(created_at_UTC),
    month = month(created_at_UTC),
    day = day(created_at_UTC),
    hour = hour(created_at_UTC),
    minute = minute(created_at_UTC),
    # NOTE(review): round_date() rounds to the NEAREST unit, so a tweet
    # posted after 12:00 lands on the following day in tweet_day. If the
    # containing minute/hour/day is intended, floor_date() is the right
    # call -- confirm before changing, since downstream code may rely on
    # the current values.
    tweet_min = round_date(created_at_UTC, unit = "minutes"),
    tweet_hour = round_date(created_at_UTC, unit = "hours"),
    tweet_day = round_date(created_at_UTC, unit = "days")
  )

Extract original tweet text and stats out of RT+Quotes

Decision: If missing is_quote then set to FALSE.

# Treat a missing quote flag as "not a quote", so that logical filters on
# is_quote do not have to deal with NA.
out$is_quote[is.na(out$is_quote)] <- FALSE

Get original tweet data out of quote

Not run. Just for example.

# Illustration only (not run): build one engagement-stats table covering
# both original tweets and the tweets that were quoted.

# Stats for tweets that are neither quotes nor retweets.
stats.on.orig.tweets <- out %>%
  filter(!is_quote & !is_retweet) %>%
  select(screen_name, tweet_id, favorite_count:reply_count, user_followers_count)

# Stats carried on quote tweets about the quoted tweet; the quoted_* columns
# are renamed during selection so they line up with the table above.
stats.on.qt <- out %>%
  filter(is_quote) %>%
  select(
    screen_name = quoted_screen_name,
    tweet_id,
    favorite_count = quoted_favorite_count,
    retweet_count = quoted_retweet_count,
    user_followers_count = quoted_followers_count
  )

# Stack both tables and keep only the tweet_ids that occur exactly once.
# The result is still grouped by tweet_id.
stats.on.comb.tweets <- stats.on.orig.tweets %>%
  bind_rows(stats.on.qt) %>%
  group_by(tweet_id) %>%
  filter(n() == 1)

Export cleaned data

# Write the cleaned data to an .Rds file whose name carries today's date.
saveRDS(object = out, file = paste0("data/covid-tweets-", today(), ".Rds"))

Then copy this cleaned data to Google Drive.

Last run on 2020-04-30 10:17:49

