We wrangle and analyze the WeRateDogs data on Twitter to find interesting insights. We get some of the data from the WeRateDogs Twitter archive and download additional data through the Twitter API for the tweets that are already available in the archive.
For the analysis, we only work with tweets that have images attached to them and that are not retweets.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import tweepy
import json
import re
pd.set_option('display.max_colwidth', None)
# WeRateDogs Twitter Archive
# Load the WeRateDogs Twitter archive that was provided as a CSV file.
archive_df = pd.read_csv('twitter-archive-enhanced.csv')

# Predicted Images From Neural Network
# Download the image predictions TSV.
images_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(images_url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the TSV.
response.raise_for_status()

# Write the raw response bytes to disk.
with open('image_predictions.tsv', mode='wb') as file:
    file.write(response.content)

# Load the predicted images into a dataframe.
image_predictions_df = pd.read_csv('image_predictions.tsv', sep='\t')
# Download additional data from Twitter using the Twitter API.
# Tweet ids to look up, taken from the archive.
tweet_ids = archive_df['tweet_id'].values

# Initialize the Twitter API client.
# NOTE(review): the credentials are placeholders and must be replaced with
# real values before this step can run.
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_token_secret = 'HIDDEN'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# NOTE(review): `wait_on_rate_limit_notify` and `tweepy.TweepError` look like
# the tweepy 3.x API; confirm the installed tweepy version before running.
api = tweepy.API(auth_handler=auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Download each tweet and write it to file as one JSON document per line.
# Ids whose lookup fails are printed and collected in `failed_ids`.
failed_ids = []
with open('tweet_json.txt', 'w') as file:
    for tweet_id in tweet_ids:
        # Only the API call is guarded, so errors while writing are not
        # mistaken for failed lookups.
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
        except tweepy.TweepError as e:
            print(e)
            failed_ids.append(tweet_id)
        else:
            json.dump(tweet._json, file)
            file.write('\n')
# Parse the downloaded tweets (one JSON document per line) and turn the
# resulting list of dicts into a dataframe.
with open('tweet_json.txt') as tweet_file:
    tweet_data = [json.loads(raw_line) for raw_line in tweet_file]
tweet_df = pd.DataFrame(tweet_data)
# Visual and programmatic assessment of the Twitter archive. Every statement
# below only inspects archive_df; nothing is modified.
archive_df.shape
archive_df.head()
archive_df.tail()
archive_df.sample(5)
# The source column holds two variables in one value, i.e. the source_name
# and the source_link.
archive_df.source.head(1)
# Look at a random sample of expanded_urls to spot duplicated URLs
archive_df['expanded_urls'].sample(15)
# Example row (label 1578) with duplicates in expanded_urls
archive_df.loc[1578].expanded_urls
# Tweets without images. All tweets with an NaN value for expanded_urls were found to be without images.
archive_df[archive_df['expanded_urls'].isnull()]
# Non-original tweets, retweets: text containing 'RT @'.
# NOTE: the match is a case-insensitive substring search, so it also hits
# text such as 'art @'.
archive_df[archive_df['text'].str.contains('RT @', flags=re.IGNORECASE, case=False)].head()
# Non-original tweets, PUPDATE(s): text containing 'PUPDATE' in any case
archive_df[archive_df['text'].str.contains('PUPDATE', flags=re.IGNORECASE, case=False)]
# Frequency of each captured dog name
archive_df.name.value_counts()
# Dogs whose names were not captured but assigned a value of 'None'
archive_df.query('name == "None"')[['tweet_id','text','name']].sample(15)
# Dog names that were captured as 'a'
archive_df.query('name == "a" ')[['tweet_id','text','name']]
archive_df.info()
archive_df.describe()
# Number of tweets whose text mentions each stage word. Each count is a
# case-insensitive substring match on the tweet text.
# doggo stage
archive_df[archive_df['text'].str.contains('doggo',flags=re.IGNORECASE, case=False)].shape[0]
# pupper stage
archive_df[archive_df['text'].str.contains('pupper',flags=re.IGNORECASE, case=False)].shape[0]
# puppo stage
archive_df[archive_df['text'].str.contains('puppo',flags=re.IGNORECASE, case=False)].shape[0]
# floofer stage
archive_df[archive_df['text'].str.contains('floofer',flags=re.IGNORECASE, case=False)].shape[0]
# floof stage (as a substring match this count also includes every 'floofer' tweet)
archive_df[archive_df['text'].str.contains('floof',flags=re.IGNORECASE, case=False)].shape[0]
# blep stage
archive_df[archive_df['text'].str.contains('blep',flags=re.IGNORECASE, case=False)].shape[0]
# snoot stage
archive_df[archive_df['text'].str.contains('snoot',flags=re.IGNORECASE, case=False)].shape[0]
# Assessment of the name column: look for tweets whose name was recorded as
# 'a' or 'None' although the text appears to contain a name. All searches are
# case-insensitive substring matches on the tweet text.
archive_df.name.value_counts()
# Tweets with a name of 'a'
a_names = archive_df.query('name == "a"')[['tweet_id','text','name']]
a_names.shape
# Names recorded as 'a' whose text contains 'named'
a_names[a_names['text'].str.contains('named',flags=re.IGNORECASE, case=False)]
# Names recorded as 'a' whose text contains 'name is'
a_names[a_names['text'].str.contains('name is', case=False, flags=re.IGNORECASE)]
# Names recorded as 'a' whose text contains 'meet'
a_names[a_names['text'].str.contains('meet', case=False, flags=re.IGNORECASE)]
# Names recorded as 'a' whose text contains 'hello to' (as in 'Say hello to')
a_names[a_names['text'].str.contains('hello to', case=False, flags=re.IGNORECASE)]
# Tweets with a name of 'None'
none_names = archive_df.query('name == "None"')[['tweet_id','text','name']]
none_names.shape
# Names recorded as 'None' whose text contains 'named'
none_names[none_names['text'].str.contains('named', case=False, flags=re.IGNORECASE)]
# Names recorded as 'None' whose text contains 'name is'
none_names[none_names['text'].str.contains('name is', case=False, flags=re.IGNORECASE)]
# Names recorded as 'None' whose text contains 'meet'
none_names[none_names['text'].str.contains('meet', case=False, flags=re.IGNORECASE)]
# Names recorded as 'None' whose text contains 'Say hello to'
none_names[none_names['text'].str.contains('Say hello to', case=False, flags=re.IGNORECASE)]
# Names recorded as 'None' whose text contains 'This is'
none_names[none_names['text'].str.contains('This is', case=False, flags=re.IGNORECASE)]
# Assessment of the image predictions table (inspection only).
image_predictions_df.shape
image_predictions_df.head()
image_predictions_df.sample(5)
image_predictions_df.info()
image_predictions_df.describe()
# Assessment of the tweet data downloaded through the API (inspection only).
tweet_df.shape
tweet_df.head()
tweet_df.info()
tweet_df.describe()
# Column names that occur more than once across the three tables.
# list(df) yields the column labels of a dataframe.
all_columns = pd.Series(list(tweet_df) + list(archive_df) + list(image_predictions_df))
all_columns[all_columns.duplicated()]
twitter archive table
- Duplicates in expanded_urls
- Tweets without images (expanded_urls == NaN)

image prediction table

twitter archive table
- source column has the full link HTML tag with source_name and source_link

Downloaded tweet data tweet_df table
- favorite_count, retweet_count from tweet_df to be added to archive_df

Image predictions dataset
# Work on copies so that the raw dataframes stay untouched during cleaning.
archive_clean_df, images_clean_df, tweets_clean_df = (
    raw_frame.copy() for raw_frame in (archive_df, image_predictions_df, tweet_df)
)

# Remove the reply / retweet bookkeeping columns that are not needed.
unneeded_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
]
archive_clean_df = archive_clean_df.drop(columns=unneeded_columns)

# Split the HTML anchor stored in `source` into its link target and its
# visible label, then discard the original column.
source_html = archive_clean_df['source']
archive_clean_df['source_url'] = source_html.str.extract(r'href=[\'"]?([^\'" >]+)', expand=True, flags=re.IGNORECASE)
archive_clean_df['source_name'] = source_html.str.extract(r'>(.*?)<\/a>', expand=True, flags=re.IGNORECASE)
archive_clean_df = archive_clean_df.drop(columns=['source'])

# Check the two new columns and the remaining column names.
archive_clean_df['source_url'].head()
archive_clean_df['source_name'].head()
list(archive_clean_df)
# Derive a single dog_stage column from the tweet text. The first stage word
# found in the text (any letter case) becomes the stage; rows without a match
# get NaN.
stage_words = archive_clean_df['text'].str.extract(
    r'(doggo|pupper|puppo|blep|floofer|floof)',
    expand=True,
    flags=re.IGNORECASE,
)
archive_clean_df['dog_stage'] = stage_words

# The four per-stage columns are replaced by dog_stage.
archive_clean_df = archive_clean_df.drop(columns=['doggo','floofer', 'pupper', 'puppo'])

# Check the remaining columns and the extracted stages.
list(archive_clean_df)
archive_clean_df['dog_stage'].unique()
archive_clean_df['dog_stage'].value_counts()
# The downloaded tweets call the identifier `id`; align it with the archive.
tweets_clean_df = tweets_clean_df.rename(columns={'id':'tweet_id'})

# Bring favorite_count and retweet_count into the archive. A left join keeps
# every archive row, including tweets that have no downloaded counterpart.
engagement_counts = tweets_clean_df[['tweet_id','favorite_count','retweet_count']]
archive_clean_df = archive_clean_df.merge(engagement_counts, on=['tweet_id'], how='left')

# Check the merged columns.
list(archive_clean_df)
archive_clean_df.head()
# Remove retweets: rows whose text contains 'RT @' (case-insensitive).
# NOTE(review): this is a substring match, so text such as 'art @' is also
# treated as a retweet. It is kept as-is because later steps address rows by
# hard-coded labels that depend on exactly which rows survive this filter.
is_retweet = archive_clean_df['text'].str.contains('RT @', case=False, flags=re.IGNORECASE)
archive_clean_df = archive_clean_df[~is_retweet]
archive_clean_df.shape
# Test: no row should match any more.
archive_clean_df[archive_clean_df['text'].str.contains('RT @', case=False, flags=re.IGNORECASE)]

# Remove PUPDATE tweets (follow-ups rather than original ratings).
is_pupdate = archive_clean_df['text'].str.contains('PUPDATE', case=False, flags=re.IGNORECASE)
archive_clean_df = archive_clean_df[~is_pupdate]
archive_clean_df.shape
# Test: no row should match any more.
archive_clean_df[archive_clean_df['text'].str.contains('PUPDATE', case=False, flags=re.IGNORECASE)]

# Remove tweets without images (expanded_urls is NaN). The explicit copy makes
# the filtered result an independent dataframe, so the column assignments that
# follow modify it reliably instead of triggering SettingWithCopyWarning.
archive_clean_df = archive_clean_df[archive_clean_df['expanded_urls'].notnull()].copy()
# Test: tweets without images should be gone.
archive_clean_df[archive_clean_df['expanded_urls'].isnull()]
archive_clean_df.shape

# Remove duplicate URLs within each expanded_urls value. dict.fromkeys keeps
# the first occurrence of every URL in its original position, so the result
# is deterministic (a set would emit the URLs in arbitrary order).
archive_clean_df['expanded_urls'] = archive_clean_df['expanded_urls'].apply(
    lambda urls: ','.join(dict.fromkeys(urls.split(',')))
)
archive_clean_df['expanded_urls'].sample(15)
archive_clean_df.loc[1578].expanded_urls
# Extract names for the rows whose name was recorded as 'a'.
# Step 1: capture the text between 'name is ' and the next '.'; rows without
# a match are dropped from the result.
a_name_is = archive_clean_df.query('name == "a"')['text'].str.extract(r'[Nn]ame is (.*?)\.', expand=True, flags=re.IGNORECASE).dropna()
a_name_is
# Write the captured names back, aligned on the row labels.
archive_clean_df.loc[a_name_is.index, 'name'] = a_name_is[0]
# Step 2: for the rows still named 'a', capture the text between 'named '
# and the next '.'. This must run after step 1 because it re-queries name == "a".
a_named = archive_clean_df.query('name == "a"')['text'].str.extract(r'[Nn]amed (.*?)\.', expand=True, flags=re.IGNORECASE).dropna()
a_named
a_named
archive_clean_df.loc[a_named.index, 'name'] = a_named[0]
# Set the name of Jacob (Yacōb) to Jacob, i.e. remove the attached pronunciation.
# NOTE: 2034 is a hard-coded row label and is only valid for this exact dataset.
archive_clean_df.loc[2034, 'name'] = 'Jacob'
# Test: show the names that were just assigned.
archive_clean_df.loc[a_name_is.index]['name']
archive_clean_df.loc[a_named.index, 'name']
None
Extract names from the text of the tweets whose name has a value of None, using a regex pattern.
The names are preceded by "name is", "named" or "This is".
For the dogs that still have a name of a, set their names to None once all the names that could be found in the text have been recovered for the dogs with a name of None.
# For the rows whose name is 'None', capture the text that follows
# 'name is ', 'named ' or 'This is ' up to the next '.'; rows without a match
# are dropped from each result. All three are computed before any assignment.
none_name_is = archive_clean_df.query('name == "None"')['text'].str.extract(r'[Nn]ame is (.*?)\.', expand=True, flags=re.IGNORECASE).dropna()
none_named = archive_clean_df.query('name == "None"')['text'].str.extract(r'[Nn]amed (.*?)\.', expand=True, flags=re.IGNORECASE).dropna()
none_this_is = archive_clean_df.query('name == "None"')['text'].str.extract(r'This is (.*?)\.', expand=True, flags=re.IGNORECASE).dropna()
none_name_is
archive_clean_df.loc[none_name_is.index]
# Names are assigned by hand rather than from the capture.
# NOTE: the list must have exactly one entry per row of none_name_is, in the
# same order, so it is only valid for this exact dataset.
archive_clean_df.loc[none_name_is.index, 'name'] = ['Howard','Zoey','Thea','Sabertooth', 'Big Jumpy Rat']
none_named
# Here the captured text is used directly as the name.
archive_clean_df.loc[none_named.index, 'name'] = none_named[0]
# The 'This is' captures are only inspected; they are not written back.
none_this_is
archive_clean_df.loc[none_this_is.index][['text','name']]
# Assign dog names confirmed from the text (hard-coded row labels).
archive_clean_df.loc[[184,349,1068,1842], 'name'] = ['Charlie','Blue', 'Bretagne','Yoshi']
archive_clean_df.query('name == "None"').shape
# For the dogs who have a name of 'a' that we couldn't get their names, set it to 'None'
dog_a = archive_clean_df.query('name == "a"')
archive_clean_df.loc[dog_a.index,'name'] = 'None'
# Test: show the names that were just assigned.
archive_clean_df.loc[none_name_is.index, 'name']
archive_clean_df.loc[none_named.index, 'name']
archive_clean_df.loc[[184,349,1068,1842], 'name']
archive_clean_df['name'].value_counts()
# Normalise the letter case of all names.
# NOTE(review): str.capitalize() upper-cases the first character and
# lower-cases the rest, so 'Big Jumpy Rat' becomes 'Big jumpy rat' - confirm
# this is intended.
archive_clean_df.name = archive_clean_df.name.str.capitalize()
archive_clean_df['name'].value_counts()
# Combine numerator and denominator into one 'numerator/denominator' string.
# Vectorised string concatenation replaces the row-by-row apply/join.
archive_clean_df['rating'] = (
    archive_clean_df['rating_numerator'].astype(str)
    + '/'
    + archive_clean_df['rating_denominator'].astype(str)
)
archive_clean_df.head()
# Check the type of the first rating. Positional access is used because row
# label 0 is not guaranteed to exist after rows have been filtered out.
type(archive_clean_df['rating'].iloc[0])
archive_clean_df.info()

# Lower-case the stage so that e.g. 'Doggo' and 'doggo' count as one stage.
archive_clean_df['dog_stage'] = archive_clean_df['dog_stage'].str.lower()
archive_clean_df['dog_stage'].value_counts()

# Get all dogs belonging to the floofer stage
clean_floofer_stage = archive_clean_df.query('dog_stage == "floofer"')
# Set the stage of the floofer dogs to floof
archive_clean_df.loc[clean_floofer_stage.index, 'dog_stage'] = 'floof'
# Test: the former floofer rows should now read 'floof'.
archive_clean_df.loc[clean_floofer_stage.index, 'dog_stage']
archive_clean_df.dog_stage.value_counts()
archive_clean_df.info()
archive_clean_df.head()
# Fix column dtypes. Every result is assigned back to the dataframe
# explicitly: calling fillna/replace with inplace=True on a column obtained
# through attribute access may operate on a temporary object and leave the
# dataframe unchanged, depending on the pandas version.

# To datetime
archive_clean_df['timestamp'] = pd.to_datetime(archive_clean_df['timestamp'])

# To category. Missing stages stay NaN; a categorical column represents
# missing values natively, so no placeholder value is needed.
archive_clean_df['dog_stage'] = archive_clean_df['dog_stage'].astype('category')

# To int. Tweets without downloaded data have NaN counts after the left
# merge; they are set to -1 so that the columns can be stored as int64.
# NOTE: -1 is a "missing" marker, not a real count; exclude it when computing
# statistics such as means.
archive_clean_df['favorite_count'] = archive_clean_df['favorite_count'].fillna(-1).astype('int64')
archive_clean_df['retweet_count'] = archive_clean_df['retweet_count'].fillna(-1).astype('int64')

# Test: check the resulting dtypes and values.
archive_clean_df.dog_stage.value_counts()
archive_clean_df.info()
archive_clean_df.head()
archive_clean_df.describe()
# Lower-case the three predictions so that the same breed is always spelled
# the same way.
for prediction_column in ('p1', 'p2', 'p3'):
    images_clean_df[prediction_column] = images_clean_df[prediction_column].str.lower()
images_clean_df.head()

# Choose one predicted breed per image: the first of p1, p2, p3 whose
# matching p*_dog flag is true, or NaN when none of the three is a dog.
# Series.where keeps a value where the condition holds and otherwise takes
# the fallback, so the chain is built from the last choice backwards. Unlike
# a positional loop with label lookups, this does not depend on the dataframe
# having a 0..n-1 index.
third_choice = images_clean_df['p3'].where(images_clean_df['p3_dog'].astype(bool))
second_choice = images_clean_df['p2'].where(images_clean_df['p2_dog'].astype(bool), third_choice)
breed_predictions = images_clean_df['p1'].where(images_clean_df['p1_dog'].astype(bool), second_choice)
images_clean_df['predicted_breed'] = breed_predictions
# Attach the image URL and the predicted breed to the cleaned archive. The
# left join keeps every archive row even when no prediction exists for it.
prediction_columns = images_clean_df[['tweet_id','jpg_url','predicted_breed']]
combined_clean = archive_clean_df.merge(prediction_columns, on=['tweet_id'], how='left')

# Check the combined table.
combined_clean.head()
combined_clean.info()
combined_clean['dog_stage'].value_counts()
def _plot_counts(counts, xlabel, ylabel, title,
                 xtick_fontsize=25, ytick_fontsize=20, xtick_rotation=None):
    """Draw a bar chart of a value-count Series on a new 25x15 figure.

    counts         -- Series whose index supplies the bar labels and whose
                      values supply the bar heights
    xlabel, ylabel -- axis labels (font size 30)
    title          -- chart title (font size 30)
    xtick_fontsize, ytick_fontsize -- font sizes of the tick labels
    xtick_rotation -- rotation of the x tick labels in degrees; None leaves
                      the default orientation untouched
    """
    plt.subplots(figsize=(25, 15))
    plt.bar(counts.index, counts)
    plt.xlabel(xlabel, fontsize=30)
    plt.ylabel(ylabel, fontsize=30)
    if xtick_rotation is None:
        plt.xticks(fontsize=xtick_fontsize)
    else:
        plt.xticks(fontsize=xtick_fontsize, rotation=xtick_rotation)
    plt.yticks(fontsize=ytick_fontsize)
    plt.title(title, fontsize=30)


# The Stage Most of the Dogs Belong to
stage_count = combined_clean['dog_stage'].value_counts()
_plot_counts(stage_count, 'Dog Stage', 'Number of Tweets/Dogs',
             'Number of Tweets/Dogs Per The Stage of the Dogs',
             xtick_fontsize=30, ytick_fontsize=30)

# The Most popular ratings
combined_clean['rating'].value_counts()
# value_counts() sorts by frequency, so head(14) is the 14 most common ratings.
top_ratings = combined_clean['rating'].value_counts().head(14)
_plot_counts(top_ratings, 'Dog Rating', 'Number of Tweets or Dogs',
             'Number of Tweets or Dogs Per Rating')

# The Most Popular breed from the Predictions
combined_clean['predicted_breed'].value_counts()
top_breeds = combined_clean['predicted_breed'].value_counts().head(10)
# Breed names are long, so the x tick labels are rotated to avoid overlap.
_plot_counts(top_breeds, 'Dog Breed', 'Number of Tweets or Dogs',
             'Number of Tweets or Dogs Per Breed', xtick_rotation=45)

# The most favorited tweet
combined_clean.loc[combined_clean['favorite_count'].idxmax()]
# The most retweeted tweet
combined_clean.loc[combined_clean['retweet_count'].idxmax()]

# source of tweet
combined_clean['source_name'].value_counts()
tweet_source = combined_clean['source_name'].value_counts().head(10)
_plot_counts(tweet_source, 'Tweet Source', 'Number of Tweets or Dogs',
             'Number of Tweets or Dogs By Source Name')
# Persist the cleaned, merged dataset; the row index is not written out.
master_csv_path = 'twitter_archive_master.csv'
combined_clean.to_csv(master_csv_path, index=False)
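- Most of the dogs belong to the pupper stage, with blep being the least. None of the tweets indicated that a dog was at the snoot stage.
- The most popular rating is 12/10.
- The most popular predicted breed is Golden Retriever. The next four in descending order are Labrador Retriever, Pembroke, Chihuahua and Pug.
- Most of the tweets come from Twitter for iPhone and the least come from TweetDeck.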