Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to extract data from a Tweepy object into a pandas dataframe?

I am attempting to create a Pandas dataframe that looks like:

| user_name | followers | following | retweets | likes |  tweet date |     tweet    |
|:---------:|:---------:|:---------:|:--------:|:-----:|:-----------:|:------------:|
|   user1   |     50    |    100    |    25    |   10  |  Oct-1-2019 | lorem ipsum… |
|   user1   |     50    |    100    |    25    |   10  |  Oct-6-2019 | lorem ipsum… |
|   user1   |     50    |    100    |    25    |   10  | Oct-19-2019 | lorem ipsum… |
|   user1   |     50    |    100    |    25    |   10  |  Oct-4-2019 | lorem ipsum… |
|   user1   |     50    |    100    |    25    |   10  | Oct-16-2019 | lorem ipsum… |
|   user2   |    321    |   12151   |   2017   |   0   | Sep-12-2018 | lorem ipsum… |
|   user2   |    321    |   12151   |   2017   |   0   | Sep-15-2018 | lorem ipsum… |
|   user2   |    321    |   12151   |   2017   |   0   | Sep-17-2018 | lorem ipsum… |
|   user2   |    321    |   12151   |   2017   |   0   | Sep-17-2018 | lorem ipsum… |
|   user2   |    321    |   12151   |   2017   |   0   | Sep-17-2019 | lorem ipsum… |
|   user3   |    122    |    124    |    11    | 38337 |  Nov-1-2019 |    foobar    |

(The values here are arbitrary)

What I am trying to do is start with a Twitter profile, then scrape through the followers of that profile and extract the following features about each follower's profile: {username (@), follower count, following count, # of retweets, # of likes}

I am using Tweepy to try and accomplish this.

So far, my current code can grab followers, but it prints out the _json for each follower, and not the specific details I am looking for.

import tweepy
import time

#insert your Twitter keys here
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
#twitter_handle='TimBarbalace'

auth = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# verify_credentials has to be *called*: the bare bound method is always
# truthy, so the failure branch below could never run.
# NOTE(review): relies on Tweepy 3.x returning False for rejected
# credentials -- confirm against the installed version.
if api.verify_credentials():
    print("Logged In Successfully")
else:
    print("Error -- Could not log in with your credentials")

# Maximum number of followers to print.
i = 99

# Let the cursor enforce the limit instead of counting by hand and calling
# exit(); iteration simply stops after `i` followers.
followers = tweepy.Cursor(api.followers).items(i)

for follower in followers:
    print(follower)

And here is an example of the JSON

User(_api=<tweepy.api.API object at 0x0000028E4D3C8F60>, _json={'id': 1898321922, 'id_str': '1898321922', 'name': 'Creator Support', 'screen_name': 'GamerGrowthHQ', 'location': 'Global', 'description': 'Supporting Creators through advice, shout-outs, and daily support. Managed by @adron_foe', 'url': 'https://www.twitch.tv/adron_foe', 'entities': {'url': {'urls': [{'url': 'https://www.twitch.tv/adron_foe', 'expanded_url': 'https://twitch.tv/adron_foe', 'display_url': 'twitch.tv/adron_foe', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 130539, 'friends_count': 73691, 'listed_count': 157, 'created_at': 'Mon Sep 23 20:37:10 +0000 2013', 'favourites_count': 2001, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 1540, 'lang': None, 'status': {'created_at': 'Sun Sep 29
23:49:54 +0000 2019', 'id': 1178456902491131909, 'id_str': '1178456902491131909', 'text': 'RT @zFakes_: Looking for an editor to make My first twitch emote', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'retweeted_status': {'created_at': 'Sun Sep 29 10:36:55 +0000 2019', 'id': 1178257339499110401, 'id_str': '1178257339499110401', 'text': 'Looking for an editor to make My first twitch emote', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 23, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme12/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme12/bg.gif', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1120067816118521856/PxOWQ_Qe_normal.png', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1120067816118521856/PxOWQ_Qe_normal.png', 
'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1898321922/1554732991', 'profile_link_color': '1B95E0', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 'has_extended_profile': False, 'default_profile': False, 'default_profile_image': False, 'can_media_tag': True, 'followed_by': True, 'following': False, 'live_following': False, 'follow_request_sent': False, 'notifications': False, 'muting': False, 'blocking': False, 'blocked_by': False, 'translator_type': 'none'}, id=1898321922, id_str='1898321922', name='Creator Support', screen_name='GamerGrowthHQ', location='Global', description='Supporting Creators through advice, shout-outs, and daily support. Managed by @adron_foe', url='https://www.twitch.tv/adron_foe', entities={'url':
{'urls': [{'url': 'https://www.twitch.tv/adron_foe', 'expanded_url': 'https://twitch.tv/adron_foe', 'display_url': 'twitch.tv/adron_foe', 'indices': [0, 23]}]}, 'description': {'urls': []}}, protected=False, followers_count=130539, friends_count=73691, listed_count=157, created_at=datetime.datetime(2013, 9, 23, 20, 37, 10), favourites_count=2001, utc_offset=None, time_zone=None, geo_enabled=False, verified=False, statuses_count=1540, lang=None, status=Status(_api=<tweepy.api.API object at 0x0000028E4D3C8F60>, _json={'created_at': 'Sun Sep 29 23:49:54 +0000 2019', 'id':
1178456902491131909, 'id_str': '1178456902491131909', 'text': 'RT @zFakes_: Looking for an editor to make My first twitch emote', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, 'source': '<a href="http://twitter.com/download/iphone"
rel="nofollow">Twitter for iPhone</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'retweeted_status': {'created_at': 'Sun Sep 29 10:36:55 +0000 2019', 'id': 1178257339499110401, 'id_str': '1178257339499110401', 'text': 'Looking for an editor to make My first twitch emote', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 23, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}, created_at=datetime.datetime(2019, 9, 29, 23, 49, 54), id=1178456902491131909, id_str='1178456902491131909', text='RT @zFakes_: Looking for an editor to make My first twitch emote', truncated=False, entities={'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, source='Twitter for iPhone', source_url='http://twitter.com/download/iphone', in_reply_to_status_id=None, in_reply_to_status_id_str=None, in_reply_to_user_id=None, in_reply_to_user_id_str=None, in_reply_to_screen_name=None, geo=None, coordinates=None, place=None, contributors=None, retweeted_status=Status(_api=<tweepy.api.API object at 0x0000028E4D3C8F60>, _json={'created_at': 'Sun Sep 29 10:36:55 +0000 2019', 'id': 1178257339499110401, 'id_str': '1178257339499110401', 'text': 'Looking for an editor to make My first twitch emote', 'truncated': False, 'entities': {'hashtags': [], 
'symbols': [], 'user_mentions': [], 'urls': []}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for
Android</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None,
'in_reply_to_screen_name': None, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1, 'favorite_count': 23, 'favorited': False, 'retweeted': False, 'lang': 'en'}, created_at=datetime.datetime(2019, 9, 29, 10, 36, 55), id=1178257339499110401, id_str='1178257339499110401', text='Looking for an editor to make My first twitch emote', truncated=False, entities={'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, source='Twitter for Android', source_url='http://twitter.com/download/android',
in_reply_to_status_id=None, in_reply_to_status_id_str=None, in_reply_to_user_id=None, in_reply_to_user_id_str=None, in_reply_to_screen_name=None, geo=None, coordinates=None, place=None, contributors=None, is_quote_status=False, retweet_count=1, favorite_count=23, favorited=False, retweeted=False, lang='en'), is_quote_status=False, retweet_count=1, favorite_count=0, favorited=False, retweeted=False, lang='en'), contributors_enabled=False, is_translator=False, is_translation_enabled=False, profile_background_color='000000', profile_background_image_url='http://abs.twimg.com/images/themes/theme12/bg.gif', profile_background_image_url_https='https://abs.twimg.com/images/themes/theme12/bg.gif', profile_background_tile=False, profile_image_url='http://pbs.twimg.com/profile_images/1120067816118521856/PxOWQ_Qe_normal.png', profile_image_url_https='https://pbs.twimg.com/profile_images/1120067816118521856/PxOWQ_Qe_normal.png', profile_banner_url='https://pbs.twimg.com/profile_banners/1898321922/1554732991', profile_link_color='1B95E0', profile_sidebar_border_color='000000', profile_sidebar_fill_color='000000', profile_text_color='000000', profile_use_background_image=False, has_extended_profile=False, default_profile=False, default_profile_image=False, can_media_tag=True, followed_by=True, following=False, live_following=False, follow_request_sent=False, notifications=False, muting=False, blocking=False, blocked_by=False, translator_type='none')

I am trying to find a repeatable method that allows me to:

Take 200 followers from the signed in Twitter account, parse their account details (including tweets), and create a (large) Python Pandas dataframe object containing the mentioned details.

I have tried this link and this link, but I have not understood how to properly implement them to accomplish what I am looking for.

Another example is me being able to access the location of a user account, with the following:

import tweepy
import time

#insert your Twitter keys here
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
#twitter_handle='TimBarbalace'

auth = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# verify_credentials has to be *called*: the bare bound method is always
# truthy, so the failure branch below could never run.
# NOTE(review): relies on Tweepy 3.x returning False for rejected
# credentials -- confirm against the installed version.
if api.verify_credentials():
    print("Logged In Successfully")
else:
    print("Error -- Could not log in with your credentials")

# Maximum number of followers to print.
i = 99

# Let the cursor enforce the limit instead of counting by hand and calling
# exit(); iteration simply stops after `i` followers.
followers = tweepy.Cursor(api.followers).items(i)

for follower in followers:
    # Only the handle and the free-text profile location are shown.
    print(follower.screen_name, follower.location)

Results in:

crzyazn888 Washington, DC
narutouz16
GamerGrowthHQ Global
pleasantemma Hell, Pennsylvania
karadise_art in a galaxy far, far away
webdivaloper
Maurer_Ranger The Internet
megliebsch Philadelphia, Pennyslvania
hoang_le_96 Philadelphia, PA
lasallephilo Philadelphia, PA
brianmaxwell33
BobbyJPolitics Philadelphia, PA
_nadcas
JPower96IsTaken
crypticsmystic
ZacharyFlair Washington, DC
thegierczaks1
KFlahertyRN
cbars68
kaitlyndmcd Philadelphia, PA
illMELt_withyou
jesskidding07
BetaRayJohn
tew_dedicatesd Baltimore, MD
hbthen3rd Redmond, WA
g_laubenstein Philadelphia, PA
tewsaucey
leahgarloff Philadelphia, PA
TheCage52
softballkenz13
zyocard
josephsilvestr5 Mays Chapel, MD
jerry_ooooo
karadevanney Point Place, Wisconsin
omgitsfranipher New Jersey, USA
PaigeBuckworth
LSU_studyabroad
jcaskerr
Process_Pete Towson, MD
lexyandiknowiit Maryland, USA
lawoqTr
sucreidesc83 Казань
LaSalleSGA Philadelphia, PA
N_Pilny1
Kaileyminkk
allyssapingul HOBY MD
cgarvss
ubertev
beckwoodworth
lmgeee22
nosayslion Philadelphia, PA
CoreyRayEid Los Angeles
s0_krispy
aimeemarierose3 La Salle University
where_is_harry_ La Salle University
OfficialDriscoe Baltimore, MD
THEchubby_messi
Sera_Numquam Philadelphia, PA
3dBeddingsets
CelanoScott
alixleto1
dzhuzham4 Missouri, USA
tayyheath D(M)V
50ShadesOfGlaze
Deidre_Mc
nicole_wickizer
Thomasmedia2019 California, USA
water2142
DurkinSays Philadelphia, PA
tavia_overton Baltimore, MD
NotKTLeu
CornHub35 West Palm Beach, FL
The0kayJosh cincinnati zoo
sherree_wale
XavierRivera_ Baltimore, MD
phinguyen_163
dannywess83
okweightlossdna
cd_somers Baltimore, MD
OscarOr85985212
LawAbidingHuman London Town
LorenzoTanoueAK Durham, NC
cdvsmith
StephanieeLynn0
MrAlphonsoJones Virginia
baltiMAURA
keondra281
yagirlmels
HBroughaha
mi_erna
mike_wieczorek
chase_brennan13
Maryjs93 Phoenixville, PA
Brady_McKinney Baltimore... UMD Alumni
akbashor Philadelphia, PA
LinzJustin
cabarca_14
013MG
B_kroner82

NOTE - After reading some Stack Overflow posts, I think only the newest 200 tweets per user can suffice.

I also found this Github link for extracting just tweets?

I have added a Bounty to this question.

like image 934
artemis Avatar asked Nov 01 '19 21:11

artemis


People also ask

How does Python fetch data from twitter?

To be able to interact with the Twitter API using Python, we'll use a library called Tweepy. To be able to hide the credentials from the source code and load them as environment variables, we'll use python-dotenv. First, create a . env file to hold your credentials.

How do you get data from a column in a DataFrame?

You can use the loc and iloc functions to access columns in a Pandas DataFrame. Let's see how. If we wanted to access a certain column in our DataFrame, for example the Grades column, we could simply use the loc function and specify the name of the column in order to retrieve it.


1 Answer

Convert the tweepy object to JSON:

  • Attribution to Tweepy for beginners
  • followers is a generator containing User(...), which is a tweepy.models.User type
    • Wrap followers in list() to unpack the generator, or just iterate through the followers without unpacking it.
    • I unpacked it into a list in case there's some need to inspect the content
  • Extract _json for each user, with def jsonify_tweepy
  • Call the function to create a list containing _json for each follower, in a JSON format
  • Load it into a dataframe with json_normalize.

To get followers:

import tweepy
import json
import pandas as pd
from pandas.io.json import json_normalize

# Insert your Twitter keys here
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# Authenticate and build the API client.
auth = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

# Drain the cursor into a list so the followers can be inspected and
# iterated more than once.
follower_cursor = tweepy.Cursor(api.followers)
followers = list(follower_cursor.items())

def jsonify_tweepy(tweepy_object):
    """Return a JSON round-tripped copy of ``tweepy_object._json``.

    The payload is serialised to a JSON string and parsed back, so the
    result is a fresh structure built only from JSON-compatible types
    and does not share objects with the original ``_json`` attribute.
    """
    return json.loads(json.dumps(tweepy_object._json))

# Extract the raw JSON payload of every follower into followers_list
followers_list = list(map(jsonify_tweepy, followers))

# Flatten the nested payloads into a pandas dataframe
# NOTE(review): newer pandas releases expose this as pd.json_normalize --
# confirm the `pandas.io.json` import still resolves on the installed version.
df = json_normalize(followers_list)

To get follower tweets:

  • Use class TweetMiner, as shown in the link at the top
  • As already noted, I did not write this class, but I did use it and it performs as specified, to extract tweets.
  • That said, bare except clauses are a no-no.
from datetime import datetime

class TweetMiner(object):
    """Download a user's timeline through Tweepy and flatten it to dicts.

    Each mined tweet becomes one plain ``dict`` (see ``mine_user_tweets``),
    so a list of them can be handed straight to ``pandas.DataFrame``.
    """

    # Class-level defaults. ``__init__`` overrides ``result_limit``, ``api``
    # and ``twitter_keys`` per instance; ``data`` is not used by any method
    # here and is kept only so existing attribute access keeps working.
    result_limit = 20
    data = list()
    api = False

    twitter_keys = {'consumer_key': 'your consumer_key',
                    'consumer_secret': 'your consumer_secret',
                    'access_token_key': 'your access_token',
                    'access_token_secret': 'your access_token_secret'}

    def __init__(self, keys_dict=twitter_keys, api=api, result_limit=20):
        """Store the settings and set up the API client.

        Args:
            keys_dict: Mapping with the keys ``consumer_key``,
                ``consumer_secret``, ``access_token_key`` and
                ``access_token_secret``.
            api: Optional ready-made API client. When truthy it is used
                as-is and no client is built from ``keys_dict``
                (previously this argument was silently ignored).
            result_limit: Value passed as ``count`` on every timeline
                request.
        """
        self.twitter_keys = keys_dict
        self.result_limit = result_limit

        if api:
            self.api = api
            return

        auth = tweepy.OAuthHandler(keys_dict['consumer_key'],
                                   keys_dict['consumer_secret'])
        auth.set_access_token(keys_dict['access_token_key'],
                              keys_dict['access_token_secret'])

        self.api = tweepy.API(auth, wait_on_rate_limit=True,
                              wait_on_rate_limit_notify=True)

    def mine_user_tweets(self, user, mine_rewteets=False, max_pages=5):
        """Fetch up to ``max_pages`` pages of ``user``'s timeline.

        Args:
            user: Screen name passed to ``api.user_timeline``.
            mine_rewteets: Currently unused; the (misspelt) name is kept
                so existing keyword callers do not break.
            max_pages: Maximum number of timeline requests to issue.

        Returns:
            A list with one dict per tweet, in the order received, with
            the keys ``tweet_id``, ``name``, ``screen_name``,
            ``retweet_count``, ``text``, ``mined_at``, ``created_at``,
            ``favourite_count``, ``hashtags``, ``status_count``,
            ``location``, ``source_device``, ``retweet_text``,
            ``quote_text`` and ``quote_screen_name``. The last three hold
            the string ``'None'`` when the tweet has no retweeted/quoted
            status.

        Exceptions raised by ``api.user_timeline`` are not caught here.
        """
        data = []
        last_tweet_id = None

        for _ in range(max_pages):
            # NOTE(review): the keyword Tweepy documents for this call is
            # `include_rts`; `include_retweets` is kept unchanged here --
            # confirm it has the intended effect.
            request = {'screen_name': user,
                       'count': self.result_limit,
                       'tweet_mode': 'extended',
                       'include_retweets': True}
            if last_tweet_id is not None:
                # Page backwards: only ask for tweets older than the last
                # one already collected.
                request['max_id'] = last_tweet_id - 1

            statuses = self.api.user_timeline(**request)
            if not statuses:
                # An empty page means the next request would be identical,
                # so stop instead of spending rate limit on it.
                break

            for item in statuses:
                mined = {'tweet_id': item.id,
                         'name': item.user.name,
                         'screen_name': item.user.screen_name,
                         'retweet_count': item.retweet_count,
                         'text': item.full_text,
                         'mined_at': datetime.now(),
                         'created_at': item.created_at,
                         'favourite_count': item.favorite_count,
                         'hashtags': item.entities['hashtags'],
                         'status_count': item.user.statuses_count,
                         'location': item.place,
                         'source_device': item.source}

                # A missing retweeted_status/quoted_status attribute
                # surfaces as AttributeError; only that is treated as
                # "not present".
                try:
                    mined['retweet_text'] = item.retweeted_status.full_text
                except AttributeError:
                    mined['retweet_text'] = 'None'

                try:
                    quoted = item.quoted_status
                    quote_text = quoted.full_text
                    # Was `status.quoted_status...`: `status` is undefined,
                    # so every quote used to be discarded.
                    quote_screen_name = quoted.user.screen_name
                except AttributeError:
                    quote_text = 'None'
                    quote_screen_name = 'None'
                mined['quote_text'] = quote_text
                mined['quote_screen_name'] = quote_screen_name

                last_tweet_id = item.id
                data.append(mined)

        return data

Call the class

  • The original object does not contain tweets
  • Using df from above, get all the followers and use class TweetMiner to download the tweets for each user.
  • The following code will create a dict of dataframes, mined_tweets_dict, where each key is a user.
# One miner for all followers; 200 tweets are requested per page.
miner = TweetMiner(result_limit=200)
mined_tweets_dict = {}

for name in df['screen_name'].unique():
    try:
        mined_tweets = miner.mine_user_tweets(user=name, max_pages=17)
    except tweepy.TweepError as e:
        # Report the failure and move on to the next follower.
        print(f'{name} could not be processed because {e}')
        continue
    # Key each follower's tweets (as a dataframe) by screen name.
    mined_tweets_dict[name] = pd.DataFrame(mined_tweets)

Save with .to_csv:

# newline='' leaves line endings to the CSV writer, as pandas asks for when
# it is handed an already-open text file.
with open('follower_tweets.csv', mode='a', encoding='utf-8', newline='') as f:
    # In append mode the stream starts at the end of the file, so position 0
    # means the file is new or empty and still needs a header row.
    write_header = f.tell() == 0
    # `tweets_df` rather than `df`, so the followers dataframe is not clobbered.
    for tweets_df in mined_tweets_dict.values():
        if tweets_df.empty:
            # A follower with no mined tweets has no columns; skipping it
            # keeps the header from being taken from an empty frame.
            continue
        tweets_df.to_csv(f, header=write_header, index=False)
        write_header = False
like image 154
Trenton McKinney Avatar answered Oct 16 '22 13:10

Trenton McKinney