
Parse This Custom Twitter Capture Data With Python and Create Statistics

I'm trying to gather Twitter statistics from a specific dataset that was provided to me. I have no control over how the data is formatted before it is given to me, so I'm locked into this messy format.

I would like some suggestions on how I can build a Python program to parse this sort of input and output something more along the lines of a CSV file, with the field titles as a header row and the values below.

I want to use Python because eventually I would like to use some statistical tools that I've already put together.

Additionally, CSV-style output is preferred because I may feed it into something like SPSS for statistical verification.

Here's a sample of what a single post looks like from the dataset:

{"text":"A gente todos os dias arruma os cabelos: por que não o coração?","contributors":null,"geo":null,"retweeted":false,"in_reply_to_screen_name":null,"truncated":false,"entities":{"urls":[],"hashtags":[],"user_mentions":[]},"in_reply_to_status_id_str":null,"id":50270714498002945,"source":"web","in_reply_to_user_id_str":null,"favorited":false,"in_reply_to_status_id":null,"created_at":"Tue Mar 22 19:00:46 +0000 2011","in_reply_to_user_id":null,"retweet_count":0,"id_str":"50270714498002945","place":null,"user":{"location":"Brasil, Recife-PE","statuses_count":16,"profile_background_tile":true,"lang":"en","profile_link_color":"867c5f","id":59154474,"following":null,"favourites_count":0,"protected":false,"profile_text_color":"91957f","verified":false,"contributors_enabled":false,"description":"","profile_sidebar_border_color":"eae2bc","name":"Natalia Aráujo","profile_background_color":"eae2bc","created_at":"Wed Jul 22 15:27:15 +0000 2009","followers_count":10,"geo_enabled":false,"profile_background_image_url":"http://a3.twimg.com/profile_background_images/220796682/music-2.png","follow_request_sent":null,"url":null,"utc_offset":-10800,"time_zone":"Brasilia","notifications":null,"profile_use_background_image":true,"friends_count":18,"profile_sidebar_fill_color":"eae2bc","screen_name":"nat_araujo","id_str":"59154474","show_all_inline_media":false,"profile_image_url":"http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG","listed_count":1,"is_translator":false},"coordinates":null}

The dataset is one continuous line with NO line returns between posts. The only delimiters between the actual posts are the post boundaries themselves:

All posts start with

{"text": 

and end with

null}

Any suggestions would be appreciated, and I of course would be glad to share my results with everyone.
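
A minimal sketch of one possible approach, assuming the literal sequence }{"text": never occurs inside a tweet's own text (dump.txt is a placeholder filename):

import json

# Sketch only: re-insert line breaks at each post boundary and parse the
# pieces one by one. Assumes the sequence }{"text": never appears inside a
# tweet's text; dump.txt is a placeholder filename.
infile = open('dump.txt')
blob = infile.read().strip()
infile.close()

posts = [json.loads(chunk)
         for chunk in blob.replace('}{"text":', '}\n{"text":').split('\n')]

for post in posts:
    print post['id_str'], post['user']['screen_name']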


edit

Based on what everyone is saying, I've started off with the following:

import sys
import json
from pprint import pprint

if len(sys.argv) != 2:
    print 'To Use: twitterjson2cvs.py (path/filename)'
    sys.exit()

inputfile = open(sys.argv[1])

jsondatain = json.load(inputfile)
pprint(jsondatain)

inputfile.close()

which outputs something a bit cleaner in the form of:

{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Tue Mar 22 19:00:46 +0000 2011',
 u'entities': {u'hashtags': [], u'urls': [], u'user_mentions': []},
 u'favorited': False,
 u'geo': None,
 u'id': 50270714498002945L,
 u'id_str': u'50270714498002945',
 u'in_reply_to_screen_name': None,
 u'in_reply_to_status_id': None,
 u'in_reply_to_status_id_str': None,
 u'in_reply_to_user_id': None,
 u'in_reply_to_user_id_str': None,
 u'place': None,
 u'retweet_count': 0,
 u'retweeted': False,
 u'source': u'web',
 u'text': u'A gente todos os dias arruma os cabelos: por que n\xe3o o cora\xe7\xe3o?',
 u'truncated': False,
 u'user': {u'contributors_enabled': False,
           u'created_at': u'Wed Jul 22 15:27:15 +0000 2009',
           u'description': u'',
           u'favourites_count': 0,
           u'follow_request_sent': None,
           u'followers_count': 10,
           u'following': None,
           u'friends_count': 18,
           u'geo_enabled': False,
           u'id': 59154474,
           u'id_str': u'59154474',
           u'is_translator': False,
           u'lang': u'en',
           u'listed_count': 1,
           u'location': u'Brasil, Recife-PE',
           u'name': u'Natalia Ar\xe1ujo',
           u'notifications': None,
           u'profile_background_color': u'eae2bc',
           u'profile_background_image_url': u'http://a3.twimg.com/profile_background_images/220796682/music-2.png',
           u'profile_background_tile': True,
           u'profile_image_url': u'http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG',
           u'profile_link_color': u'867c5f',
           u'profile_sidebar_border_color': u'eae2bc',
           u'profile_sidebar_fill_color': u'eae2bc',
           u'profile_text_color': u'91957f',
           u'profile_use_background_image': True,
           u'protected': False,
           u'screen_name': u'nat_araujo',
           u'show_all_inline_media': False,
           u'statuses_count': 16,
           u'time_zone': u'Brasilia',
           u'url': None,
           u'utc_offset': -10800,
           u'verified': False}}

edit

I've added to the previous code in an attempt to output to a CSV file:

import sys
import json
#from pprint import pprint
import csv

if len(sys.argv) != 2:
    print 'To Use: twitterjson2cvs.py (path/filename)'
    sys.exit()

inputfile = open(sys.argv[1])

jsondatain = json.load(inputfile)

f=csv.writer(open("test.csv","wb+"))

f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])
for x in jsondatain:
    f.writerow([x["contributors"],x["fields"]["coordinates"],x["fields"]["created_at"],x["fields"]["entities"],x["fields"]["hashtags"],x["fields"]["urls"],x["fields"]["user_mentions"],x["fields"]["favorited"],x["fields"]["geo"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["in_reply_to_screen_name"],x["fields"]["in_reply_to_status_id"],x["fields"]["in_reply_to_status_id_str"],x["fields"]["in_reply_to_user_id"],x["fields"]["in_reply_to_user_id_str"],x["fields"]["place"],x["fields"]["retweet_count"],x["fields"]["retweeted"],x["fields"]["source"],x["fields"]["text"],x["fields"]["truncated"],x["fields"]["user"],x["fields"]["contributors_enabled"],x["fields"]["created_at"],x["fields"]["description"],x["fields"]["favourites_count"],x["fields"]["follow_request_sent"],x["fields"]["followers_count"],x["fields"]["following"],x["fields"]["friends_count"],x["fields"]["geo_enabled"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["is_translator"],x["fields"]["lang"],x["fields"]["listed_count"],x["fields"]["location"],x["fields"]["name"],x["fields"]["notifications"],x["fields"]["profile_background_color"],x["fields"]["profile_background_image_url"],x["fields"]["profile_background_tile"],x["fields"]["profile_image_url"],x["fields"]["profile_link_color"],x["fields"]["profile_sidebar_border_color"],x["fields"]["profile_sidebar_fill_color"],x["fields"]["profile_text_color"],x["fields"]["profile_use_background_image"],x["fields"]["protected"],x["fields"]["screen_name"],x["fields"]["show_all_inline_media"],x["fields"]["statuses_count"],x["fields"]["time_zone"],x["fields"]["url"],x["fields"]["utc_offset"],x["fields"]["verified"]])

#pprint(jsondatain)

inputfile.close()

However when I run it I get:

  File "twitterjson2cvs.py", line 28, in 
    f.writerow([x["contributors"],x["fields"]["coordinates"],x["fields"]["created_at"],x["fields"]["entities"],x["fields"]["hashtags"],x["fields"]["urls"],x["fields"]["user_mentions"],x["fields"]["favorited"],x["fields"]["geo"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["in_reply_to_screen_name"],x["fields"]["in_reply_to_status_id"],x["fields"]["in_reply_to_status_id_str"],x["fields"]["in_reply_to_user_id"],x["fields"]["in_reply_to_user_id_str"],x["fields"]["place"],x["fields"]["retweet_count"],x["fields"]["retweeted"],x["fields"]["source"],x["fields"]["text"],x["fields"]["truncated"],x["fields"]["user"],x["fields"]["contributors_enabled"],x["fields"]["created_at"],x["fields"]["description"],x["fields"]["favourites_count"],x["fields"]["follow_request_sent"],x["fields"]["followers_count"],x["fields"]["following"],x["fields"]["friends_count"],x["fields"]["geo_enabled"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["is_translator"],x["fields"]["lang"],x["fields"]["listed_count"],x["fields"]["location"],x["fields"]["name"],x["fields"]["notifications"],x["fields"]["profile_background_color"],x["fields"]["profile_background_image_url"],x["fields"]["profile_background_tile"],x["fields"]["profile_image_url"],x["fields"]["profile_link_color"],x["fields"]["profile_sidebar_border_color"],x["fields"]["profile_sidebar_fill_color"],x["fields"]["profile_text_color"],x["fields"]["profile_use_background_image"],x["fields"]["protected"],x["fields"]["screen_name"],x["fields"]["show_all_inline_media"],x["fields"]["statuses_count"],x["fields"]["time_zone"],x["fields"]["url"],x["fields"]["utc_offset"],x["fields"]["verified"]])
TypeError: string indices must be integers

The error has something to do with how the fields are formatted, but I'm not seeing it.
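
This particular error usually means the loop variable is a string rather than a dict: json.load on a file containing a single object returns one dict, and iterating over a dict yields its keys, so x["contributors"] ends up indexing a string with a string. A tiny illustration:

# Tiny illustration of the failure mode: looping over a dict yields its
# string keys, and indexing a string with another string raises this error.
d = {"contributors": None, "text": "hello"}
for x in d:
    print type(x)          # <type 'str'> -- x is a key, not a nested dict
    # x["contributors"]    # would raise TypeError: string indices must be integers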


edit

I updated the code to reflect your format suggestion as follows:


import sys
import json
import csv

if len(sys.argv) != 2:
    print 'To Use: twitterjson2cvs.py (path/filename)'
    sys.exit()

inputfile = open(sys.argv[1])

jsondatain = json.load(inputfile)

f=csv.writer(open("test.csv","wb+"))

f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])
for x in jsondatain:
    f.writerow( 
        (   
            x['contributors'],
            x['coordinates'],
            x['created_at'],
            x['entities']['hashtags'],
            x['entities']['urls'],
            x['entities']['user_mentions'],
            x['favorited'],
            x['geo'],
            x['id'],
            x['id_str'],
            x['in_reply_to_screen_name'],
            x['in_reply_to_status_id'],
            x['in_reply_to_status_id_str'],
            x['in_reply_to_user_id'],
            x['in_reply_to_user_id_str'],
            x['place'],
            x['retweet_count'],
            x['retweeted'],
            x['source'],
            x['text'].encode('utf8'),
            x['truncated'],
            x['user']['contributors_enabled'],
            x['user']['created_at'],
            x['user']['description'],
            x['user']['favourites_count'],
            x['user']['follow_request_sent'],
            x['user']['followers_count'],
            x['user']['following'],
            x['user']['friends_count'],
            x['user']['geo_enabled'],
            x['user']['id'],
            x['user']['id_str'],
            x['user']['is_translator'],
            x['user']['lang'],
            x['user']['listed_count'],
            x['user']['location'],
            x['user']['name'].encode('utf8'),
            x['user']['notifications'],
            x['user']['profile_background_color'],
            x['user']['profile_background_image_url'],
            x['user']['profile_background_tile'],
            x['user']['profile_image_url'],
            x['user']['profile_link_color'],
            x['user']['profile_sidebar_border_color'],
            x['user']['profile_sidebar_fill_color'],
            x['user']['profile_text_color'],
            x['user']['profile_use_background_image'],
            x['user']['protected'],
            x['user']['screen_name'],
            x['user']['show_all_inline_media'],
            x['user']['statuses_count'],
            x['user']['time_zone'],
            x['user']['url'],
            x['user']['utc_offset'],
            x['user']['verified']
        )
    )


inputfile.close()

I still get the following error:


twitterjson2cvs.py TweetFile1300820340639.tcm.online 
Traceback (most recent call last):
  File "workspace/coalmine-datafilter/src/twitterjson2csv.py", line 30, in 
    x['contributors'],
TypeError: string indices must be integers


edit

Everything is working great up to this point for a single JSON-formatted input file. Feeding the previous example JSON string into this program:

import sys
import json
import csv

if len(sys.argv) != 2:
    print 'To Use: twitterjson2cvs.py (path/filename)'
    sys.exit()

inputfile = open(sys.argv[1])

jsonindata = json.load(inputfile)

f=csv.writer(open("test.csv","wb+"))

f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])

f.writerow( 
        (
            jsonindata['contributors'],
            jsonindata['coordinates'],
            jsonindata['created_at'],
            jsonindata['entities']['hashtags'],
            jsonindata['entities']['urls'],
            jsonindata['entities']['user_mentions'],
            jsonindata['favorited'],
            jsonindata['geo'],
            jsonindata['id'],
            jsonindata['id_str'],
            jsonindata['in_reply_to_screen_name'],
            jsonindata['in_reply_to_status_id'],
            jsonindata['in_reply_to_status_id_str'],
            jsonindata['in_reply_to_user_id'],
            jsonindata['in_reply_to_user_id_str'],
            jsonindata['place'],
            jsonindata['retweet_count'],
            jsonindata['retweeted'],
            jsonindata['source'],
            jsonindata['text'].encode('utf8'),
            jsonindata['truncated'],
            jsonindata['user']['contributors_enabled'],
            jsonindata['user']['created_at'],
            jsonindata['user']['description'],
            jsonindata['user']['favourites_count'],
            jsonindata['user']['follow_request_sent'],
            jsonindata['user']['followers_count'],
            jsonindata['user']['following'],
            jsonindata['user']['friends_count'],
            jsonindata['user']['geo_enabled'],
            jsonindata['user']['id'],
            jsonindata['user']['id_str'],
            jsonindata['user']['is_translator'],
            jsonindata['user']['lang'],
            jsonindata['user']['listed_count'],
            jsonindata['user']['location'],
            jsonindata['user']['name'].encode('utf8'),
            jsonindata['user']['notifications'],
            jsonindata['user']['profile_background_color'],
            jsonindata['user']['profile_background_image_url'],
            jsonindata['user']['profile_background_tile'],
            jsonindata['user']['profile_image_url'],
            jsonindata['user']['profile_link_color'],
            jsonindata['user']['profile_sidebar_border_color'],
            jsonindata['user']['profile_sidebar_fill_color'],
            jsonindata['user']['profile_text_color'],
            jsonindata['user']['profile_use_background_image'],
            jsonindata['user']['protected'],
            jsonindata['user']['screen_name'],
            jsonindata['user']['show_all_inline_media'],
            jsonindata['user']['statuses_count'],
            jsonindata['user']['time_zone'],
            jsonindata['user']['url'],
            jsonindata['user']['utc_offset'],
            jsonindata['user']['verified']
        )
    )

inputfile.close()

results in nicely formatted output, ready for tools like SPSS, as follows:

contributors,coordinates,created_at,entities,hashtags,urls,user_mentions,favorited,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,place,retweet_count,retweeted,source,text,truncated,user,contributors_enabled,created_at,description,favourites_count,follow_request_sent,followers_count,following,friends_count,geo_enabled,id,id_str,is_translator,lang,listed_count,location,name,notifications,profile_background_color,profile_background_image_url,profile_background_tile,profile_image_url,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,protected,screen_name,show_all_inline_media,statuses_count,time_zone,url,utc_offset,verified
,,Tue Mar 22 19:00:46 +0000 2011,[],[],[],False,,50270714498002945,50270714498002945,,,,,,,0,False,web,A gente todos os dias arruma os cabelos: por que não o coração?,False,False,Wed Jul 22 15:27:15 +0000 2009,,0,,10,,18,False,59154474,59154474,False,en,1,"Brasil, Recife-PE",Natalia Aráujo,,eae2bc,http://a3.twimg.com/profile_background_images/220796682/music-2.png,True,http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG,867c5f,eae2bc,eae2bc,91957f,True,False,nat_araujo,False,16,Brasilia,,-10800,False

Now the only problem is that I have input files containing multiple JSON strings back to back, all on one continuous line. When I attempt to run the same program on those files I get the following error:

Traceback (most recent call last):
  File "workspace/coalmine-datafilter/src/twitterjson2cvs.py", line 22, in 
    jsonindata = json.load(inputfile)
  File "/usr/lib/python2.6/json/__init__.py", line 267, in load
    parse_constant=parse_constant, **kw)
  File "/usr/lib/python2.6/json/__init__.py", line 307, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python2.6/json/decoder.py", line 322, in decode
    raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 1 column 1514 - line 2 column 1 (char 1514 - 2427042)

The input file is very large (i.e. many thousands of Twitter posts). I don't know whether the error is due to the number of posts or because the file has multiple {"...."}{"...."} objects all on the same line. Any ideas? Do I perhaps need to add a line return after each post somehow?
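
One way to handle several JSON objects glued together on one line, sketched here rather than guaranteed to match the real capture files, is json.JSONDecoder.raw_decode, which parses a single object and reports the index where it stopped, so the blob can be walked post by post (the filename below is a placeholder):

import json

# Sketch: walk a single long line of back-to-back JSON objects.
# raw_decode returns (parsed_object, end_index), so we can resume parsing
# right where the previous object ended.
def iter_posts(blob):
    decoder = json.JSONDecoder()
    idx = 0
    while idx < len(blob):
        post, end = decoder.raw_decode(blob, idx)
        yield post
        idx = end
        while idx < len(blob) and blob[idx].isspace():  # skip stray whitespace
            idx += 1

infile = open('yourcapture.tcm.online')  # placeholder filename
for post in iter_posts(infile.read()):
    print post['id_str'], post['created_at']
infile.close()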

asked Mar 23 '11 by secumind



2 Answers

This should get you started... you will need to take care of the nested objects (one way to flatten them is sketched below the code).

import json
import csv

infile = file('test.json', 'r')
data = json.load(infile)
infile.close()

result = []
for k, v in data.iteritems():
    print k, v            # show each top-level field and its value
    result.append(v)

outfile = file('output.csv', 'w')
writer = csv.writer(outfile)
writer.writerow(result)   # one row of top-level values
outfile.close()
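
As for the nested objects (entities and user), one possible approach, sketched with a made-up helper rather than anything from the csv or json modules, is to flatten them into dotted column names before writing rows:

# Sketch of one way to "take care of the nested objects": flatten nested
# dicts into dotted keys such as "user.screen_name" before writing a row.
# flatten() is a made-up helper, not part of any library.
def flatten(d, prefix=''):
    flat = {}
    for k, v in d.items():
        key = prefix + k if not prefix else prefix + '.' + k
        if isinstance(v, dict):
            flat.update(flatten(v, key))
        else:
            flat[key] = v
    return flat

row = flatten({'id': 1, 'user': {'screen_name': 'nat_araujo', 'followers_count': 10}})
# row == {'id': 1, 'user.screen_name': 'nat_araujo', 'user.followers_count': 10}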
answered by eat_a_lemon


The input here is JSON. Python has a JSON module. Happily it has a CSV module too. So that's your input and output taken care of!

Update

You are getting there!

Your call to writerow() needs to look more like this (not in a for loop):

f.writerow( 
        (
            jsonindata['contributors'],
            jsonindata['coordinates'],
            jsonindata['created_at'],
            jsonindata['entities']['hashtags'],
            jsonindata['entities']['urls'],
            jsonindata['entities']['user_mentions'],
            jsonindata['favorited'],
            jsonindata['geo'],
            jsonindata['id'],
            jsonindata['id_str'],
            jsonindata['in_reply_to_screen_name'],
            jsonindata['in_reply_to_status_id'],
            jsonindata['in_reply_to_status_id_str'],
            jsonindata['in_reply_to_user_id'],
            jsonindata['in_reply_to_user_id_str'],
            jsonindata['place'],
            jsonindata['retweet_count'],
            jsonindata['retweeted'],
            jsonindata['source'],
            jsonindata['text'].encode('utf8'),
            jsonindata['truncated'],
            jsonindata['user']['contributors_enabled'],
            jsonindata['user']['created_at'],
            jsonindata['user']['description'],
            jsonindata['user']['favourites_count'],
            jsonindata['user']['follow_request_sent'],
            jsonindata['user']['followers_count'],
            jsonindata['user']['following'],
            jsonindata['user']['friends_count'],
            jsonindata['user']['geo_enabled'],
            jsonindata['user']['id'],
            jsonindata['user']['id_str'],
            jsonindata['user']['is_translator'],
            jsonindata['user']['lang'],
            jsonindata['user']['listed_count'],
            jsonindata['user']['location'],
            jsonindata['user']['name'].encode('utf8'),
            jsonindata['user']['notifications'],
            jsonindata['user']['profile_background_color'],
            jsonindata['user']['profile_background_image_url'],
            jsonindata['user']['profile_background_tile'],
            jsonindata['user']['profile_image_url'],
            jsonindata['user']['profile_link_color'],
            jsonindata['user']['profile_sidebar_border_color'],
            jsonindata['user']['profile_sidebar_fill_color'],
            jsonindata['user']['profile_text_color'],
            jsonindata['user']['profile_use_background_image'],
            jsonindata['user']['protected'],
            jsonindata['user']['screen_name'],
            jsonindata['user']['show_all_inline_media'],
            jsonindata['user']['statuses_count'],
            jsonindata['user']['time_zone'],
            jsonindata['user']['url'],
            jsonindata['user']['utc_offset'],
            jsonindata['user']['verified']
        )
    )

Also consider using DictWriter, but remember that Python's CSV module deals badly with Unicode, hence .encode('utf8') on a couple of elements of the tuple.
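
A rough sketch of the DictWriter idea, keeping the same .encode('utf8') workaround; only a handful of columns are shown and the helper name is made up:

import csv

# Sketch of the DictWriter suggestion. Only a few columns are listed for
# brevity; tweet is one parsed tweet dict (like jsonindata above), and
# write_tweets is a made-up helper name.
FIELDNAMES = ['id_str', 'created_at', 'screen_name', 'text']

def write_tweets(tweets, path='test.csv'):
    out = open(path, 'wb')
    writer = csv.DictWriter(out, fieldnames=FIELDNAMES)
    writer.writerow(dict(zip(FIELDNAMES, FIELDNAMES)))  # header row (writeheader() is 2.7+)
    for tweet in tweets:
        writer.writerow({
            'id_str': tweet['id_str'],
            'created_at': tweet['created_at'],
            'screen_name': tweet['user']['screen_name'],
            'text': tweet['text'].encode('utf8'),        # csv + unicode workaround
        })
    out.close()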

answered by Johnsyweb