The purpose of this project is to practise data wrangling techniques on three separate data files.
All wrangled files relate to the Twitter account @dog_rates (WeRateDogs).
The project follows the wrangling workflow of gather, assess, and clean.
There are three sources of data to gather:
1) the WeRateDogs Twitter archive, provided as a file on hand (twitter-archive-enhanced.csv)
2) the image predictions file, downloaded programmatically from a URL (image_predictions.tsv)
3) additional tweet data (retweet and like counts), queried from the Twitter API via tweepy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import requests
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer
pd.set_option('display.max_colwidth', None) # to display the whole strings and not collapse them
pd.set_option('display.max_columns', None) # to not collapse columns when viewing data
pd.set_option('display.max_rows', None) # to display all rows in the output
# load csv file
df_archive = pd.read_csv('twitter-archive-enhanced.csv')
df_archive.head(2)
# request tsv file
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# save the file
with open('image_predictions.tsv', mode='wb') as file:
    file.write(response.content)
# load the tab separated file
df_preds = pd.read_csv('image_predictions.tsv', sep='\t')
df_preds.head(2)
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = df_archive.tweet_id.values
print('Tweets to search for:', len(tweet_ids))
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
success = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop takes 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        if count % 100 == 0:
            print('#', count, 'processed tweet IDs')
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            success += 1
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
end = timer()
print('Time to compose the file:', end - start)
print('Tweets found:', success)
print('Tweet IDs not found:', fails_dict)
# read the just created tweet_json.txt file line by line to create a pandas DataFrame
# get tweet ID, retweet count, and like count
# a list to be converted to a pandas DataFrame
json_list = []
with open('tweet_json.txt', 'r') as json_file:
    json_row = json_file.readline()
    while json_row:
        json_dict = json.loads(json_row)
        # each tweet is written as its own dictionary because that's how the file was created
        # we know the key names from visual inspection of the txt file or by reading
        # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
        tweet_id = json_dict['id']
        retweet_count = json_dict['retweet_count']
        like_count = json_dict['favorite_count']
        # let's also get info about retweets to compare it with df_archive
        # retweeted_status is only included when the tweet is a retweet:
        retweeted = json_dict.get('retweeted_status', 0)
        if retweeted != 0:
            retweeted = 1
        json_list.append({'tweet_id': tweet_id, 'retweets': retweet_count, 'likes': like_count, 'retweeted': retweeted})
        # next line, i.e. next tweet
        json_row = json_file.readline()
# convert the list of dictionaries to a DataFrame
df_likes = pd.DataFrame(json_list, columns=['tweet_id', 'retweets', 'likes', 'retweeted'])
df_likes.head(2)
We only want to work with:
1) original tweets
2) tweets with an image of a dog
The end result of assessing and cleaning should be a single master file (provided that makes sense under the tidiness rules) that contains only the relevant tweets, as defined above, with all required columns holding corrected data; a condensed sketch of this relevance filter follows below.
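As a rough preview of where the cleaning is headed, the relevance filter boils down to conditions like the ones below. This is only a minimal sketch: it assumes the merged frame and the column names (retweeted_status_id, jpg_url, p1_dog) that appear later in this notebook; the actual cleaning is done step by step with intermediate checks.
# hypothetical, condensed version of the relevance filter (for illustration only)
def filter_relevant(df_merged):
    # keep original tweets (not retweets) whose top image prediction is a dog
    is_original = df_merged['retweeted_status_id'].isnull()
    has_image = df_merged['jpg_url'].notnull()
    is_dog = df_merged['p1_dog'] == True
    return df_merged[is_original & has_image & is_dog]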
Let's visually and programmatically analyze all three data frames separately:
df_archive.sample(10)
df_archive.info()
# in_reply and retweeted columns have nulls for all tweets that were not a reply or a retweet
df_archive.describe()
# there are some extreme values, some of which are wrong based on the text
df_archive.rating_denominator.value_counts()
df_archive.query('rating_denominator == 2').text
# there are some extreme values, some of which are wrong based on the text
df_archive.rating_numerator.value_counts()
df_archive.query('rating_numerator == 960').text
# decimal rating numerator: the actual rating is 13.5/10, but 5/10 is extracted
# df_archive.iloc[45] ... the code below has a more legible output
df_archive[df_archive.index==45]
# some names are clearly not correctly extracted
df_archive.name.value_counts().nlargest(20)
# stages check
df_archive.query('doggo == "doggo"').sample(5)
# expanded_urls indicates tweets with a photo; there are some missing values, i.e. not all tweets have pictures
df_archive.expanded_urls.sample(10)
df_archive.expanded_urls.nunique()
df_archive.tweet_id.nunique()
df_archive
df_archive issues
Validity:
Accuracy:
Consistency:
df_preds.sample(15)
# no null values in any column
df_preds.info()
# probability values are valid
df_preds.describe()
# 26% are not predictions of dogs (in p1)
1 - df_preds.p1_dog.mean()
# some dog names start with a lower case, some with an uppercase letter
df_preds.p1.value_counts().nlargest(20)
df_preds.tweet_id.nunique()
df_preds
df_preds issues
df_likes.info()
Visual and programmatic inspection revealed the following issues:
Completeness:
Validity:
Accuracy:
Consistency:
The cleaning part will be done in define-code-test chunks, in an order I deem logical given the data structure and the goal.
# safe copies of all files
df_archive_clean = df_archive.copy()
df_preds_clean = df_preds.copy()
df_likes_clean = df_likes.copy()
# merging into one file
df_clean = pd.merge(df_archive_clean, df_preds_clean, on='tweet_id', how='left')
df_clean = pd.merge(df_clean, df_likes_clean, on='tweet_id', how='left')
df_clean.sample(2)
# data types will need to be adjusted, but let's leave it for later when we eliminate some of the columns
df_clean.info()
df_clean.retweeted.value_counts()
# all retweeted == 1 are also retweeted_status_id non null
df_clean[(df_clean['retweeted'] == 1) & (df_clean['retweeted_status_id'].isnull()==False)].shape
# delete 179 retweeted tweet rows and test
df_clean.drop(df_clean[df_clean['retweeted'] == 1].index, inplace=True)
df_clean.retweeted.value_counts()
df_clean.retweeted_status_id.isnull().value_counts()
# 2 remaining rows are retweets, but these were not included in df_likes (i.e. these tweet_ids were not processed with API)
df_clean[df_clean['retweeted_status_id'].isnull()==False]
df_clean.retweeted_status_id.isnull().value_counts()
# these should be deleted, too
df_clean.drop(df_clean[df_clean['retweeted_status_id'].isnull()==False].index, inplace=True)
# we eliminated all retweets and will not need related columns anymore
df_clean.drop(['retweeted', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'], axis=1, inplace=True)
list(df_clean)
# 58 rows have no picture information, let's delete them
df_clean[(df_clean['expanded_urls'].isnull()==True) & (df_clean['jpg_url'].isnull()==True)].shape
df_clean.drop(df_clean[(df_clean['expanded_urls'].isnull()==True) & (df_clean['jpg_url'].isnull()==True)].index, inplace=True)
df_clean.shape
# test for no remaining missing images
df_clean.expanded_urls.isnull().value_counts()
# there are still 123 missing jpg_urls, which is because these tweets were not included in the predictions file
df_clean.jpg_url.isnull().value_counts()
# upon visual inspection, these tweets are not tweets with an image after all because there is no 'photo' string
# included in the expanded_urls
df_clean[df_clean['jpg_url'].isnull()==True].sample(10)
# these should be deleted, too
df_clean.drop(df_clean[df_clean['jpg_url'].isnull()==True].index, inplace=True)
df_clean.jpg_url.isnull().value_counts()
# drop the expanded_urls column because it is no longer needed
df_clean.drop(['expanded_urls'], axis=1, inplace=True)
list(df_clean)
# so I will keep only rows where the first prediction is a dog (p1_dog == True), e.g. the True-False-False and True-True-False combinations
df_clean.groupby(['p1_dog', 'p2_dog', 'p3_dog']).count()['tweet_id']
df_clean.drop(df_clean[df_clean['p1_dog']==False].index, inplace=True)
df_clean.groupby(['p1_dog', 'p2_dog', 'p3_dog']).count()['tweet_id']
# we can delete some more information from df_clean coming initially from df_preds:
# img_num, p1_dog, p2_dog, p3_dog, p2_conf, p3_conf
df_clean.drop(['img_num', 'p1_dog', 'p2_dog', 'p3_dog', 'p2_conf', 'p3_conf'], axis=1, inplace=True)
# the df now has fewer rows and columns: we identified retweets and tweets without images, dropped those rows,
# and removed columns that will not be interesting for further analysis
df_clean.shape
list(df_clean)
df_clean['dog_stage'] = ''
def find_dog_stage(row):
    dog_stage = []
    if row['doggo'] == 'doggo':
        dog_stage.append('doggo')
    if row['floofer'] == 'floofer':
        dog_stage.append('floofer')
    if row['pupper'] == 'pupper':
        dog_stage.append('pupper')
    if row['puppo'] == 'puppo':
        dog_stage.append('puppo')
    if len(dog_stage) < 1:
        row['dog_stage'] = 'None'
    else:
        # update the row (multiple stages per row are allowed by the if construction)
        row['dog_stage'] = ','.join(dog_stage)
    # return the updated row
    return row
# apply to all rows
df_clean = df_clean.apply(find_dog_stage, axis=1)
# drop redundant columns
df_clean = df_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1)
# there are 10 dogs that have more than one stage defined, let's clean these manually
df_clean.groupby('dog_stage').count()['tweet_id']
# should be floofer
df_clean[df_clean['dog_stage']=='doggo,floofer'][['tweet_id', 'text', 'dog_stage']]
# puppo
df_clean[df_clean['dog_stage']=='doggo,puppo'][['tweet_id', 'text', 'dog_stage']]
# there actually seem to be two dogs in these pictures
# but for simplicity, let's label them all doggo
df_clean[df_clean['dog_stage']=='doggo,pupper'][['tweet_id', 'text', 'dog_stage']]
df_clean['dog_stage'].replace({'doggo,floofer': 'floofer', 'doggo,puppo': 'puppo', 'doggo,pupper': 'doggo'}, inplace=True)
df_clean.groupby('dog_stage').count()['tweet_id']
# change data types first to allow storing all correct numbers including decimals
df_clean['rating_numerator'] = df_clean['rating_numerator'].astype(float)
df_clean['rating_denominator'] = df_clean['rating_denominator'].astype(float)
# suppress warnings when looking at regex results without extracting them
import warnings
warnings.simplefilter('ignore')
# find decimal numerator (where denominator could be decimal, too) - there are 4 cases
df_clean[df_clean.text.str.contains(r"(\d+\.\d+\/\d+\.?\d*)")][['tweet_id', 'text', 'rating_numerator']]
# these can be cleaned manually
df_clean.loc[df_clean['tweet_id'] == 883482846933004288, ['rating_numerator']] = 13.5
df_clean.loc[df_clean['tweet_id'] == 786709082849828864, ['rating_numerator']] = 9.75
df_clean.loc[df_clean['tweet_id'] == 778027034220126208, ['rating_numerator']] = 11.27
df_clean.loc[df_clean['tweet_id'] == 680494726643068929, ['rating_numerator']] = 11.26
# find decimal denominators - there are not any
# (we already know that these will not have a decimal numerator, because we found all those cases above)
df_clean[df_clean.text.str.contains(r"(\d+\/\d+\.\d+)")][['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]
# check if any tweet has more than one "numeric/numeric" pattern in its text
df_clean[df_clean.text.str.contains( r"(\d+\.?\d*\/\d+\.?\d*\D+\d+\.?\d*\/\d+\.?\d*)")][['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]
There are quite a few cases with more than one "rating" in the text:
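Before fixing these by hand, it helps to list every "x/y" fraction each text contains. Below is a minimal sketch using str.extractall, for review only; the manual corrections that follow remain the fix actually applied:
# review helper: list all "x/y" fractions appearing in each tweet's text
all_fractions = df_clean.set_index('tweet_id')['text'].str.extractall(r'(?P<numerator>\d+\.?\d*)/(?P<denominator>\d+\.?\d*)')
# tweets with more than one fraction are the ambiguous cases to check by hand
all_fractions.groupby(level=0).filter(lambda g: len(g) > 1)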
df_clean.loc[df_clean['tweet_id'] == 740373189193256964, ['rating_numerator']] = 14
df_clean.loc[df_clean['tweet_id'] == 740373189193256964, ['rating_denominator']] = 10
df_clean.loc[df_clean['tweet_id'] == 722974582966214656, ['rating_numerator']] = 13
df_clean.loc[df_clean['tweet_id'] == 722974582966214656, ['rating_denominator']] = 10
df_clean.loc[df_clean['tweet_id'] == 716439118184652801, ['rating_numerator']] = 11
df_clean.loc[df_clean['tweet_id'] == 716439118184652801, ['rating_denominator']] = 10
df_clean.loc[df_clean['tweet_id'] == 666287406224695296, ['rating_numerator']] = 9
df_clean.loc[df_clean['tweet_id'] == 666287406224695296, ['rating_denominator']] = 10
# no wrongly extracted decimal numerators remain
df_clean[df_clean.text.str.contains(r"(\d+\.\d+\/\d+\.?\d*)")][['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]
# no wrongly extracted double fractions remain
df_clean.loc[(df_clean['tweet_id'] == 740373189193256964) | (df_clean['tweet_id'] == 722974582966214656) |\
(df_clean['tweet_id'] == 716439118184652801) | (df_clean['tweet_id'] == 666287406224695296)]\
[['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]
df_clean['rating'] = ''
def merge_rating(row):
    rating = []
    rating.append(str(row['rating_numerator']))
    rating.append(str(row['rating_denominator']))
    row['rating'] = '/'.join(rating)
    # return the updated row
    return row
# apply to all rows
df_clean = df_clean.apply(merge_rating, axis=1)
# drop the separate rating columns
df_clean = df_clean.drop(['rating_numerator', 'rating_denominator'], axis=1)
# rating is now stored in one column, so we solved the tidiness issue
# most of the ratings which occur only once in the data seem plausible
# however, let's check some of the values for accuracy
df_clean.groupby('rating').count()['tweet_id'].nlargest(40)
Note: the rating column is intentionally kept as a string to keep WeRateDogs' unique rating system (e.g. 13/10) readable and comparable.
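If a numeric score were ever needed (e.g. to correlate ratings with likes), it could still be derived from the string on demand. A small sketch, not used in this notebook:
# hypothetical helper: turn a "numerator/denominator" string into a numeric score
def rating_score(rating):
    numerator, denominator = rating.split('/')
    return float(numerator) / float(denominator)

# example (not applied): df_clean['rating_score'] = df_clean['rating'].dropna().apply(rating_score)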
# let's check the couple of ratings that still look suspicious even after the cleaning we did in #7
# this is wrongly extracted and there is actually no real rating included in the text
df_clean.query('rating == "24.0/7.0"').text
df_clean['rating'].replace({'24.0/7.0': None}, inplace=True)
# this one is correct
df_clean.query('rating == "2.0/10.0"').text
# this one is correct
df_clean.query('rating == "165.0/150.0"').text
# this one is correct
df_clean.query('rating == "143.0/130.0"').text
# this one is correct
df_clean.query('rating == "121.0/110.0"').text
df_clean.groupby('rating').count()['tweet_id'].nlargest(30)
df_clean.info()
# let's make tweet_id the index of the df
df_clean.set_index('tweet_id', inplace=True)
# in_reply_to_status_id and in_reply_to_user_id have only 14 non-null values, which will not be interesting for the analysis
df_clean = df_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id'], axis=1)
df_clean.info()
# timestamp as datetime
# retweets and likes as integers
df_clean['timestamp'] = pd.to_datetime(df_clean['timestamp'])
df_clean['retweets'] = df_clean['retweets'].astype(int)
df_clean['likes'] = df_clean['likes'].astype(int)
df_clean.info()
# this one is now easier to do on the final cleaned data than if we tried it in the beginning
df_clean.name.value_counts().nlargest(50)
# first, let's change the placeholder string "None" to a real missing value
df_clean['name'] = df_clean['name'].replace('None', np.nan)
# it looks like names starting with a lowercase letter are not real names
df_clean.loc[df_clean['name'].str.islower().fillna(False)][['text','name']]
Displaying all lowercase names (which are actually not names) reveals several things:
1) in some texts the real name does appear (e.g. after "named ..."), it just was not extracted correctly (a regex alternative to the manual fixes is sketched right after this list)
2) other lowercase words ("a", "the", "an", ...) are not names at all and should be set to null
3) some of these tweets are not about dogs at all and should be dropped
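As an aside, point 1) could also be automated with a regex on the "named ..." phrasing, assuming the texts follow that pattern. This is only a sketch and is not applied; the names are corrected by hand below:
# sketch only (not applied): recover real names from the "... named <Name> ..." phrasing
lowercase_mask = df_clean['name'].str.islower().fillna(False)
recovered = df_clean.loc[lowercase_mask, 'text'].str.extract(r'named (?P<real_name>[A-Z][a-z]+)')['real_name']
recovered.dropna()
# if applied: df_clean.loc[recovered.dropna().index, 'name'] = recovered.dropna()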
# let's start with 3. and drop non-dog tweets
df_clean.drop(855459453768019968, inplace=True)
df_clean.drop(806219024703037440, inplace=True)
df_clean.drop(772581559778025472, inplace=True)
df_clean.drop(748977405889503236, inplace=True)
df_clean.drop(746872823977771008, inplace=True)
df_clean.drop(746369468511756288, inplace=True)
df_clean.drop(745422732645535745, inplace=True)
df_clean.drop(740214038584557568, inplace=True)
df_clean.drop(736225175608430592, inplace=True)
df_clean.drop(717537687239008257, inplace=True)
df_clean.drop(715733265223708672, inplace=True)
df_clean.drop(710272297844797440, inplace=True)
df_clean.drop(700747788515020802, inplace=True)
df_clean.drop(697259378236399616, inplace=True)
df_clean.drop(690360449368465409, inplace=True)
df_clean.drop(681297372102656000, inplace=True)
df_clean.drop(679530280114372609, inplace=True)
df_clean.drop(675534494439489536, inplace=True)
df_clean.drop(675109292475830276, inplace=True)
df_clean.drop(880872448815771648, inplace=True)
df_clean.drop(702217446468493312, inplace=True)
df_clean.drop(667793409583771648, inplace=True)
df_clean.drop(666373753744588802, inplace=True)
# next, let's check for real names in the remaining tweets
df_clean.loc[df_clean['name'].str.islower().fillna(False)][['text','name']]
df_clean.loc[828650029636317184, ['name']] = 'Grace'
df_clean.loc[765395769549590528, ['name']] = 'Zoey'
df_clean.loc[675706639471788032, ['name']] = 'Wylie'
df_clean.loc[671743150407421952, ['name']] = 'Jacob'
df_clean.loc[671147085991960577, ['name']] = 'Rufus'
df_clean.loc[670303360680108032, ['name']] = 'Hemry'
df_clean.loc[669564461267722241, ['name']] = 'Alfredo'
df_clean.loc[668955713004314625, ['name']] = 'Leroi'
df_clean.loc[668636665813057536, ['name']] = 'Berta'
df_clean.loc[668507509523615744, ['name']] = 'Chuk'
df_clean.loc[668171859951755264, ['name']] = 'Alfonso'
df_clean.loc[667861340749471744, ['name']] = 'Cheryl'
df_clean.loc[667773195014021121, ['name']] = 'Jessiga'
df_clean.loc[667538891197542400, ['name']] = 'Klint'
df_clean.loc[667470559035432960, ['name']] = 'Kohl'
df_clean.loc[667177989038297088, ['name']] = 'Daryl'
df_clean.loc[666781792255496192, ['name']] = 'Octaviath'
df_clean.loc[666701168228331520, ['name']] = 'Johm'
# finally, let's look at the remaining lowercase "names" and change them to null
df_clean.loc[df_clean['name'].str.islower().fillna(False)]['name'].value_counts()
# these are not names, set them to null
df_clean['name'] = df_clean['name'].replace(
    ['a', 'the', 'an', 'one', 'just', 'officially', 'space', 'light', 'very', 'infuriating'], np.nan)
# no lowercase names remain
df_clean.loc[df_clean['name'].str.islower().fillna(False)]['name']
df_clean.p1 = df_clean.p1.str.lower()
df_clean.p2 = df_clean.p2.str.lower()
df_clean.p3 = df_clean.p3.str.lower()
df_clean.sample(7)
# we end up with 1454 cleaned rows
df_clean.info()
Let's store the final cleaned data:
df_clean.to_csv('twitter_archive_master.csv', index=False)
The stored twitter_archive_master file contains the cleaned data, which we will now use for the analysis.
df = pd.read_csv('twitter_archive_master.csv')
# we can see the expected positive correlation of number of retweets and likes
sns.pairplot(df);
df.describe()
# likes and retweets scatter
plt.figure(figsize=(20,8))
plt.subplot(1,3,1)
g1 = sns.regplot(x = df.likes, y = df.retweets, color='blue', scatter_kws={'alpha':0.3})
g1.set(title = 'Correlation of likes and retweets')
plt.subplot(1,3,2)
g2 = sns.regplot(x = df.likes, y = df.retweets, fit_reg=False, color='blue', scatter_kws={'alpha':0.3})
g2.set(title = 'Correlation of likes and retweets', xlim=(0,15000), ylim=(0,4000))
plt.subplot(1,3,3)
g3 = sns.regplot(x = df.likes, y = df.retweets, fit_reg=False, color='blue', scatter_kws={'alpha':0.3})
g3.set(title = 'Correlation of likes and retweets', xlim=(0,6000), ylim=(0,2000))
plt.show()
df['likes'].corr(df['retweets'])
df_pred_breeds = pd.DataFrame(df.groupby('p1')['p1_conf'].mean().nlargest(15))
df_pred_breeds
breed_count = df.groupby('p1')['p1_conf'].count()
breed_count
# some of the breeds predicted with the highest probability have low counts among the tweets
df_pred_breeds['breed_count'] = breed_count
df_pred_breeds
df_pred_breeds.breed_count.sum()
# the top 15 best-predicted breeds cover 25% of all tweets
df_pred_breeds.breed_count.sum() / len(df)
# find what komondor, the breed predicted with the highest probability, looks like
df.query('p1 == "komondor"')['jpg_url']
# no wonder it is easy to recognize
from IPython.display import Image
Image(url='https://pbs.twimg.com/media/DBg_HT9WAAEeIMM.jpg')
# now let's look at the most frequently predicted breeds
df_frequent_breeds = df.groupby('p1').count()['likes'].nlargest(15)
df_frequent_breeds
# the top 15 most frequently predicted breeds cover 51% of all tweets
df.groupby('p1').count()['likes'].nlargest(15).sum() / len(df)
# 5 of these breeds are also predicted with high probability
df_breeds_intersect = pd.merge(df_frequent_breeds, df_pred_breeds, on='p1', how='inner')
df_breeds_intersect.index
# likes and retweets scatter by breeds which are both most frequently predicted and with the highest probability
# separate dfs for each breed
df_golden_retriever = df[df['p1'] == 'golden_retriever']
df_pug = df[df['p1'] == 'pug']
df_samoyed = df[df['p1'] == 'samoyed']
df_pomeranian = df[df['p1'] == 'pomeranian']
df_french_bulldog = df[df['p1'] == 'french_bulldog']
# legend
import matplotlib.patches as mpatches
ret = mpatches.Patch(label='Golden retriever', color='blue')
pug = mpatches.Patch(label='Pug', color='green')
sam = mpatches.Patch(label='Samoyed', color='red')
pom = mpatches.Patch(label='Pomeranian', color='yellow')
bull = mpatches.Patch(label='French bulldog', color='black')
plt.figure(figsize=(20,12))
plt.subplot(1,3,1)
def reg_plot(x_lim, y_lim):
    g = sns.regplot(x = df_golden_retriever.likes, y = df_golden_retriever.retweets, color='blue', scatter_kws={'alpha':0.5, 's':80}, fit_reg=False)
    sns.regplot(x = df_pug.likes, y = df_pug.retweets, color='green', scatter_kws={'alpha':0.5, 's':80}, ax=g, fit_reg=False)
    sns.regplot(x = df_samoyed.likes, y = df_samoyed.retweets, color='red', scatter_kws={'alpha':0.5, 's':80}, ax=g, fit_reg=False)
    sns.regplot(x = df_pomeranian.likes, y = df_pomeranian.retweets, color='yellow', scatter_kws={'alpha':0.5, 's':80}, ax=g, fit_reg=False)
    sns.regplot(x = df_french_bulldog.likes, y = df_french_bulldog.retweets, color='black', scatter_kws={'alpha':0.5, 's':80}, ax=g, fit_reg=False)
    g.set(title = 'Correlation of likes and retweets', xlim=(0,x_lim), ylim=(0,y_lim))
    plt.legend(handles=[ret, pug, sam, pom, bull])
    return g
reg_plot(120000, 35000)
plt.subplot(1,3,2)
reg_plot(15000, 4000)
plt.subplot(1,3,3)
reg_plot(6000, 2000)
plt.show()
df.likes.mean()
# all ways below get the same result
# df['likes'].groupby(df['dog_stage']).mean()
# df.groupby('dog_stage').mean()['likes']
# df[['likes', 'dog_stage']].groupby(df['dog_stage']).mean() ... this one creates a dataframe instead of a series
df.groupby('dog_stage')['likes'].mean()
sns.barplot(x = df.dog_stage, y = df.likes, order = ['puppo', 'doggo', 'floofer', 'pupper'], color = 'blue', errwidth = 0)
plt.axhline(8729, color='black')
non = mpatches.Patch(label='No stage mentioned', color='black')
cover = mpatches.Patch(label='Covers 16% of all tweets', color='blue')
plt.legend(handles=[non, cover])
plt.xlabel('Dog stage')
plt.ylabel('Average likes per tweet')
plt.title('Tweets popularity by dog stage')
plt.ylim((0,25000))
plt.show()
# there are not many tweets with identified dog stage
df.groupby('dog_stage')['likes'].count()
# shares
shares = df.groupby('dog_stage').agg({'dog_stage': 'count'})
shares.apply(lambda x: x / len(df))
# only 16% of tweets have a dog stage mentioned in them
1 - shares.loc['None'].apply(lambda x: x / len(df))
# check the high likes numbers
df.query('dog_stage == "puppo"').head(5)
# ratings by popularity
df.groupby('rating').count()['likes'].nlargest(10)
df.groupby('rating').count()['likes'].nlargest(4).sum() / len(df)
df.groupby('rating').count()['likes'].nlargest(10).sum() / len(df)
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
g = sns.barplot(x = ['12/10', '10/10', '11/10', '13/10'], \
y = df.groupby('rating').count()['likes'].nlargest(4), color = 'blue');
g.set(title = 'Top 4 most popular ratings', xlabel = 'Rating', ylabel = 'Number of tweets')
cover = mpatches.Patch(label='Covers 83% of all tweets', color='blue')
plt.legend(handles=[cover])
plt.subplot(1,2,2)
g = sns.barplot(x = ['12/10', '10/10', '11/10', '13/10', '9/10', '8/10', '7/10', '14/10', '6/10', '5/10'], \
y = df.groupby('rating').count()['likes'].nlargest(10), color = 'blue');
g.set(title = 'Top 10 most popular ratings', xlabel = 'Rating', ylabel = 'Number of tweets')
cover = mpatches.Patch(label='Covers 98% of all tweets', color='blue')
plt.legend(handles=[cover])
plt.show()
# names by popularity
df.groupby('name').count()['likes'].nlargest(10)
# top 10 names cover only 9% of tweets
df.groupby('name').count()['likes'].nlargest(10).sum() / len(df)
# because dog names are very diverse
df.name.nunique()
plt.figure(figsize=(8,4))
g = sns.barplot(x = ['Walter', 'Daisy', 'Cooper', 'Sadie', 'Charlie', 'Lucy', 'Oliver', 'Koda', 'Penny', 'Bella'], \
y = df.groupby('name').count()['likes'].nlargest(10), color = 'blue')
g.set(title = 'Top 10 most popular dog names', xlabel = 'Dog name', ylabel = 'Number of tweets')
cover = mpatches.Patch(label='Covers 9% of all tweets', color='blue')
plt.legend(handles=[cover])
plt.show()