I'm trying to gather Twitter statistics from a specific dataset that was provided to me. I have no control over how the data is formatted before it is given to me, so I'm locked into this messy form.
I would like some suggestions on how I can build a Python program to parse this sort of input and output something more along the lines of a CSV file, with the field titles as a header and the values below.
I want to use Python because eventually I would like to use some statistical tools that I've already put together.
Additionally, CSV-style output is preferred because I may feed it into something like SPSS for statistical verification.
Here's a sample of what a single post looks like from the dataset:
{"text":"A gente todos os dias arruma os cabelos: por que não o coração?","contributors":null,"geo":null,"retweeted":false,"in_reply_to_screen_name":null,"truncated":false,"entities":{"urls":[],"hashtags":[],"user_mentions":[]},"in_reply_to_status_id_str":null,"id":50270714498002945,"source":"web","in_reply_to_user_id_str":null,"favorited":false,"in_reply_to_status_id":null,"created_at":"Tue Mar 22 19:00:46 +0000 2011","in_reply_to_user_id":null,"retweet_count":0,"id_str":"50270714498002945","place":null,"user":{"location":"Brasil, Recife-PE","statuses_count":16,"profile_background_tile":true,"lang":"en","profile_link_color":"867c5f","id":59154474,"following":null,"favourites_count":0,"protected":false,"profile_text_color":"91957f","verified":false,"contributors_enabled":false,"description":"","profile_sidebar_border_color":"eae2bc","name":"Natalia Aráujo","profile_background_color":"eae2bc","created_at":"Wed Jul 22 15:27:15 +0000 2009","followers_count":10,"geo_enabled":false,"profile_background_image_url":"http://a3.twimg.com/profile_background_images/220796682/music-2.png","follow_request_sent":null,"url":null,"utc_offset":-10800,"time_zone":"Brasilia","notifications":null,"profile_use_background_image":true,"friends_count":18,"profile_sidebar_fill_color":"eae2bc","screen_name":"nat_araujo","id_str":"59154474","show_all_inline_media":false,"profile_image_url":"http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG","listed_count":1,"is_translator":false},"coordinates":null}
The dataset is one continuous line with NO line returns between posts. The only delimiters between the actual posts are that all posts start with
{"text":
and end with
null}
Any suggestions would be appreciated, and I of course would be glad to share my results with everyone.
edit
Based on what everyone is saying, I've started off with the following:
import sys
import json
from pprint import pprint

if len(sys.argv) != 2:
    print 'To Use: twitterjson2cvs.py (path/filename)'
    sys.exit()

inputfile = open(sys.argv[1])
jsondatain = json.load(inputfile)
pprint(jsondatain)
inputfile.close()
which outputs something a bit cleaner in the form of:
{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Tue Mar 22 19:00:46 +0000 2011',
 u'entities': {u'hashtags': [], u'urls': [], u'user_mentions': []},
 u'favorited': False,
 u'geo': None,
 u'id': 50270714498002945L,
 u'id_str': u'50270714498002945',
 u'in_reply_to_screen_name': None,
 u'in_reply_to_status_id': None,
 u'in_reply_to_status_id_str': None,
 u'in_reply_to_user_id': None,
 u'in_reply_to_user_id_str': None,
 u'place': None,
 u'retweet_count': 0,
 u'retweeted': False,
 u'source': u'web',
 u'text': u'A gente todos os dias arruma os cabelos: por que n\xe3o o cora\xe7\xe3o?',
 u'truncated': False,
 u'user': {u'contributors_enabled': False,
           u'created_at': u'Wed Jul 22 15:27:15 +0000 2009',
           u'description': u'',
           u'favourites_count': 0,
           u'follow_request_sent': None,
           u'followers_count': 10,
           u'following': None,
           u'friends_count': 18,
           u'geo_enabled': False,
           u'id': 59154474,
           u'id_str': u'59154474',
           u'is_translator': False,
           u'lang': u'en',
           u'listed_count': 1,
           u'location': u'Brasil, Recife-PE',
           u'name': u'Natalia Ar\xe1ujo',
           u'notifications': None,
           u'profile_background_color': u'eae2bc',
           u'profile_background_image_url': u'http://a3.twimg.com/profile_background_images/220796682/music-2.png',
           u'profile_background_tile': True,
           u'profile_image_url': u'http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG',
           u'profile_link_color': u'867c5f',
           u'profile_sidebar_border_color': u'eae2bc',
           u'profile_sidebar_fill_color': u'eae2bc',
           u'profile_text_color': u'91957f',
           u'profile_use_background_image': True,
           u'protected': False,
           u'screen_name': u'nat_araujo',
           u'show_all_inline_media': False,
           u'statuses_count': 16,
           u'time_zone': u'Brasilia',
           u'url': None,
           u'utc_offset': -10800,
           u'verified': False}}
edit
I've added to the previous code in an attempt to output to a CSV file:
import sys
import json
#from pprint import pprint
import csv

if len(sys.argv) != 2:
    print 'To Use: twitterjson2cvs.py (path/filename)'
    sys.exit()

inputfile = open(sys.argv[1])
jsondatain = json.load(inputfile)

f = csv.writer(open("test.csv","wb+"))
f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])
for x in jsondatain:
    f.writerow([x["contributors"],x["fields"]["coordinates"],x["fields"]["created_at"],x["fields"]["entities"],x["fields"]["hashtags"],x["fields"]["urls"],x["fields"]["user_mentions"],x["fields"]["favorited"],x["fields"]["geo"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["in_reply_to_screen_name"],x["fields"]["in_reply_to_status_id"],x["fields"]["in_reply_to_status_id_str"],x["fields"]["in_reply_to_user_id"],x["fields"]["in_reply_to_user_id_str"],x["fields"]["place"],x["fields"]["retweet_count"],x["fields"]["retweeted"],x["fields"]["source"],x["fields"]["text"],x["fields"]["truncated"],x["fields"]["user"],x["fields"]["contributors_enabled"],x["fields"]["created_at"],x["fields"]["description"],x["fields"]["favourites_count"],x["fields"]["follow_request_sent"],x["fields"]["followers_count"],x["fields"]["following"],x["fields"]["friends_count"],x["fields"]["geo_enabled"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["is_translator"],x["fields"]["lang"],x["fields"]["listed_count"],x["fields"]["location"],x["fields"]["name"],x["fields"]["notifications"],x["fields"]["profile_background_color"],x["fields"]["profile_background_image_url"],x["fields"]["profile_background_tile"],x["fields"]["profile_image_url"],x["fields"]["profile_link_color"],x["fields"]["profile_sidebar_border_color"],x["fields"]["profile_sidebar_fill_color"],x["fields"]["profile_text_color"],x["fields"]["profile_use_background_image"],x["fields"]["protected"],x["fields"]["screen_name"],x["fields"]["show_all_inline_media"],x["fields"]["statuses_count"],x["fields"]["time_zone"],x["fields"]["url"],x["fields"]["utc_offset"],x["fields"]["verified"]])

#pprint(jsondatain)
inputfile.close()
However when I run it I get:
File "twitterjson2cvs.py", line 28, in f.writerow([x["contributors"],x["fields"]["coordinates"],x["fields"]["created_at"],x["fields"]["entities"],x["fields"]["hashtags"],x["fields"]["urls"],x["fields"]["user_mentions"],x["fields"]["favorited"],x["fields"]["geo"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["in_reply_to_screen_name"],x["fields"]["in_reply_to_status_id"],x["fields"]["in_reply_to_status_id_str"],x["fields"]["in_reply_to_user_id"],x["fields"]["in_reply_to_user_id_str"],x["fields"]["place"],x["fields"]["retweet_count"],x["fields"]["retweeted"],x["fields"]["source"],x["fields"]["text"],x["fields"]["truncated"],x["fields"]["user"],x["fields"]["contributors_enabled"],x["fields"]["created_at"],x["fields"]["description"],x["fields"]["favourites_count"],x["fields"]["follow_request_sent"],x["fields"]["followers_count"],x["fields"]["following"],x["fields"]["friends_count"],x["fields"]["geo_enabled"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["is_translator"],x["fields"]["lang"],x["fields"]["listed_count"],x["fields"]["location"],x["fields"]["name"],x["fields"]["notifications"],x["fields"]["profile_background_color"],x["fields"]["profile_background_image_url"],x["fields"]["profile_background_tile"],x["fields"]["profile_image_url"],x["fields"]["profile_link_color"],x["fields"]["profile_sidebar_border_color"],x["fields"]["profile_sidebar_fill_color"],x["fields"]["profile_text_color"],x["fields"]["profile_use_background_image"],x["fields"]["protected"],x["fields"]["screen_name"],x["fields"]["show_all_inline_media"],x["fields"]["statuses_count"],x["fields"]["time_zone"],x["fields"]["url"],x["fields"]["utc_offset"],x["fields"]["verified"]]) TypeError: string indices must be integers
The error has something to do with how the fields are formatted, but I'm not seeing it.
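A minimal sketch of what might be going on (this is a guess, using a made-up two-key dict rather than my real data): if jsondatain comes back as a single dict instead of a list of tweets, the for loop walks over its string keys, and indexing a string with a string produces exactly this TypeError:

# Hypothetical reduction of the problem, not the real dataset:
# json.load() on a file holding ONE tweet returns a dict, so the loop
# variable is each key (a string), and string["fields"] blows up.
jsondatain = {"contributors": None, "text": "hello"}
for x in jsondatain:
    print type(x)        # <type 'str'> -- x is a key like "contributors"
    # x["fields"]        # would raise: TypeError: string indices must be integers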
edit
I updated the code to reflect your format suggestion as follows:
import sys
import json
import csv

if len(sys.argv) != 2:
    print 'To Use: twitterjson2cvs.py (path/filename)'
    sys.exit()

inputfile = open(sys.argv[1])
jsondatain = json.load(inputfile)

f = csv.writer(open("test.csv","wb+"))
f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])
for x in jsondatain:
    f.writerow((x['contributors'], x['coordinates'], x['created_at'], x['entities']['hashtags'], x['entities']['urls'], x['entities']['user_mentions'], x['favorited'], x['geo'], x['id'], x['id_str'], x['in_reply_to_screen_name'], x['in_reply_to_status_id'], x['in_reply_to_status_id_str'], x['in_reply_to_user_id'], x['in_reply_to_user_id_str'], x['place'], x['retweet_count'], x['retweeted'], x['source'], x['text'].encode('utf8'), x['truncated'], x['user']['contributors_enabled'], x['user']['created_at'], x['user']['description'], x['user']['favourites_count'], x['user']['follow_request_sent'], x['user']['followers_count'], x['user']['following'], x['user']['friends_count'], x['user']['geo_enabled'], x['user']['id'], x['user']['id_str'], x['user']['is_translator'], x['user']['lang'], x['user']['listed_count'], x['user']['location'], x['user']['name'].encode('utf8'), x['user']['notifications'], x['user']['profile_background_color'], x['user']['profile_background_image_url'], x['user']['profile_background_tile'], x['user']['profile_image_url'], x['user']['profile_link_color'], x['user']['profile_sidebar_border_color'], x['user']['profile_sidebar_fill_color'], x['user']['profile_text_color'], x['user']['profile_use_background_image'], x['user']['protected'], x['user']['screen_name'], x['user']['show_all_inline_media'], x['user']['statuses_count'], x['user']['time_zone'], x['user']['url'], x['user']['utc_offset'], x['user']['verified']))

inputfile.close()
I still get the following error:
twitterjson2cvs.py TweetFile1300820340639.tcm.online
Traceback (most recent call last):
  File "workspace/coalmine-datafilter/src/twitterjson2csv.py", line 30, in <module>
    x['contributors'],
TypeError: string indices must be integers
edit
Everything is working great up to this point for a single JSON-formatted input file. Feeding the previous example JSON string into this program:
import sys
import json
import csv

if len(sys.argv) != 2:
    print 'To Use: twitterjson2cvs.py (path/filename)'
    sys.exit()

inputfile = open(sys.argv[1])
jsonindata = json.load(inputfile)

f = csv.writer(open("test.csv","wb+"))
f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"])
f.writerow((jsonindata['contributors'], jsonindata['coordinates'], jsonindata['created_at'], jsonindata['entities']['hashtags'], jsonindata['entities']['urls'], jsonindata['entities']['user_mentions'], jsonindata['favorited'], jsonindata['geo'], jsonindata['id'], jsonindata['id_str'], jsonindata['in_reply_to_screen_name'], jsonindata['in_reply_to_status_id'], jsonindata['in_reply_to_status_id_str'], jsonindata['in_reply_to_user_id'], jsonindata['in_reply_to_user_id_str'], jsonindata['place'], jsonindata['retweet_count'], jsonindata['retweeted'], jsonindata['source'], jsonindata['text'].encode('utf8'), jsonindata['truncated'], jsonindata['user']['contributors_enabled'], jsonindata['user']['created_at'], jsonindata['user']['description'], jsonindata['user']['favourites_count'], jsonindata['user']['follow_request_sent'], jsonindata['user']['followers_count'], jsonindata['user']['following'], jsonindata['user']['friends_count'], jsonindata['user']['geo_enabled'], jsonindata['user']['id'], jsonindata['user']['id_str'], jsonindata['user']['is_translator'], jsonindata['user']['lang'], jsonindata['user']['listed_count'], jsonindata['user']['location'], jsonindata['user']['name'].encode('utf8'), jsonindata['user']['notifications'], jsonindata['user']['profile_background_color'], jsonindata['user']['profile_background_image_url'], jsonindata['user']['profile_background_tile'], jsonindata['user']['profile_image_url'], jsonindata['user']['profile_link_color'], jsonindata['user']['profile_sidebar_border_color'], jsonindata['user']['profile_sidebar_fill_color'], jsonindata['user']['profile_text_color'], jsonindata['user']['profile_use_background_image'], jsonindata['user']['protected'], jsonindata['user']['screen_name'], jsonindata['user']['show_all_inline_media'], jsonindata['user']['statuses_count'], jsonindata['user']['time_zone'], jsonindata['user']['url'], jsonindata['user']['utc_offset'], jsonindata['user']['verified']))

inputfile.close()
results in a nicely formatted output ready for tools like SPSS to use as follows:
contributors,coordinates,created_at,entities,hashtags,urls,user_mentions,favorited,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,place,retweet_count,retweeted,source,text,truncated,user,contributors_enabled,created_at,description,favourites_count,follow_request_sent,followers_count,following,friends_count,geo_enabled,id,id_str,is_translator,lang,listed_count,location,name,notifications,profile_background_color,profile_background_image_url,profile_background_tile,profile_image_url,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,protected,screen_name,show_all_inline_media,statuses_count,time_zone,url,utc_offset,verified
,,Tue Mar 22 19:00:46 +0000 2011,[],[],[],False,,50270714498002945,50270714498002945,,,,,,,0,False,web,A gente todos os dias arruma os cabelos: por que não o coração?,False,False,Wed Jul 22 15:27:15 +0000 2009,,0,,10,,18,False,59154474,59154474,False,en,1,"Brasil, Recife-PE",Natalia Aráujo,,eae2bc,http://a3.twimg.com/profile_background_images/220796682/music-2.png,True,http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG,867c5f,eae2bc,eae2bc,91957f,True,False,nat_araujo,False,16,Brasilia,,-10800,False
Now the only problem is that I have input files that contain multiple JSON strings inline with each other, all on one continuous line. When I attempt to run the same program on those files I get the following error:
Traceback (most recent call last):
  File "workspace/coalmine-datafilter/src/twitterjson2cvs.py", line 22, in <module>
    jsonindata = json.load(inputfile)
  File "/usr/lib/python2.6/json/__init__.py", line 267, in load
    parse_constant=parse_constant, **kw)
  File "/usr/lib/python2.6/json/__init__.py", line 307, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python2.6/json/decoder.py", line 322, in decode
    raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 1 column 1514 - line 2 column 1 (char 1514 - 2427042)
The input file is very large (i.e., many thousands of Twitter posts). I don't know if the error is due to the number of posts or if it's because the file has multiple {"...."}{"...."} objects all on the same line. Any ideas? Do I perhaps need to add a line return somehow after each post?
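One idea (just a sketch, untested on the real files; iter_tweets is a name made up for illustration): json.JSONDecoder.raw_decode parses a single object starting at a given index and reports where it stopped, so the concatenated objects can be walked one at a time without adding line returns:

import sys
import json

# Sketch: walk one long line of back-to-back JSON objects.
# raw_decode returns (parsed_object, index_where_it_stopped), so the next
# call can pick up right where the previous object ended.
def iter_tweets(bigstring):
    decoder = json.JSONDecoder()
    idx = 0
    while idx < len(bigstring):
        # skip any stray whitespace between objects; raw_decode will not
        while idx < len(bigstring) and bigstring[idx].isspace():
            idx += 1
        if idx >= len(bigstring):
            break
        tweet, end = decoder.raw_decode(bigstring, idx)
        yield tweet
        idx = end

raw = open(sys.argv[1]).read()
for tweet in iter_tweets(raw):
    print tweet['id_str'], tweet['text'].encode('utf8')

Each tweet dict produced this way could then be fed through the same writerow() call used above.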
This should get you started... you will need to take care of the nested objects yourself:
import json
import csv

# Load one JSON object from the input file
f = file('test.json', 'r')
data = json.load(f)
f.close()

# Collect each top-level key/value pair as a row
result = []
for k, v in data.iteritems():
    print k, v
    result.append([k, v])

# Write the rows out as CSV
f = file('output.csv', 'w')
writer = csv.writer(f)
writer.writerows(result)
f.close()
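One possible way to handle those nested objects (a sketch; flatten is a made-up helper, not anything from the standard library): recursively prefix nested keys so everything ends up in one flat dict whose values can go straight into a CSV row:

def flatten(d, prefix=''):
    # Turn nested dicts like {"user": {"id": 1}} into {"user.id": 1}
    flat = {}
    for k, v in d.iteritems():
        key = prefix + k
        if isinstance(v, dict):
            flat.update(flatten(v, key + '.'))
        else:
            flat[key] = v
    return flat

row = flatten(data)                      # 'data' is the dict loaded above
print row.get('user.screen_name'), row.get('text')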
The input here is JSON. Python has a JSON module. Happily it has a CSV module too. So that's your input and output taken care of!
You are getting there!
Your call to writerow() needs to look more like this (not in a for-loop):
f.writerow(
(
jsonindata['contributors'],
jsonindata['coordinates'],
jsonindata['created_at'],
jsonindata['entities']['hashtags'],
jsonindata['entities']['urls'],
jsonindata['entities']['user_mentions'],
jsonindata['favorited'],
jsonindata['geo'],
jsonindata['id'],
jsonindata['id_str'],
jsonindata['in_reply_to_screen_name'],
jsonindata['in_reply_to_status_id'],
jsonindata['in_reply_to_status_id_str'],
jsonindata['in_reply_to_user_id'],
jsonindata['in_reply_to_user_id_str'],
jsonindata['place'],
jsonindata['retweet_count'],
jsonindata['retweeted'],
jsonindata['source'],
jsonindata['text'].encode('utf8'),
jsonindata['truncated'],
jsonindata['user']['contributors_enabled'],
jsonindata['user']['created_at'],
jsonindata['user']['description'],
jsonindata['user']['favourites_count'],
jsonindata['user']['follow_request_sent'],
jsonindata['user']['followers_count'],
jsonindata['user']['following'],
jsonindata['user']['friends_count'],
jsonindata['user']['geo_enabled'],
jsonindata['user']['id'],
jsonindata['user']['id_str'],
jsonindata['user']['is_translator'],
jsonindata['user']['lang'],
jsonindata['user']['listed_count'],
jsonindata['user']['location'],
jsonindata['user']['name'].encode('utf8'),
jsonindata['user']['notifications'],
jsonindata['user']['profile_background_color'],
jsonindata['user']['profile_background_image_url'],
jsonindata['user']['profile_background_tile'],
jsonindata['user']['profile_image_url'],
jsonindata['user']['profile_link_color'],
jsonindata['user']['profile_sidebar_border_color'],
jsonindata['user']['profile_sidebar_fill_color'],
jsonindata['user']['profile_text_color'],
jsonindata['user']['profile_use_background_image'],
jsonindata['user']['protected'],
jsonindata['user']['screen_name'],
jsonindata['user']['show_all_inline_media'],
jsonindata['user']['statuses_count'],
jsonindata['user']['time_zone'],
jsonindata['user']['url'],
jsonindata['user']['utc_offset'],
jsonindata['user']['verified']
)
)
Also consider using DictWriter, but remember that Python's CSV module deals badly with Unicode, hence the .encode('utf8') on a couple of elements of the tuple.
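For example, a rough DictWriter sketch, assuming only a handful of columns is wanted (field names taken from the sample tweet above; note that DictWriter.writeheader() needs Python 2.7, so on 2.6 the header row is written by hand):

import csv

fields = ['created_at', 'id_str', 'screen_name', 'followers_count', 'text']
out = csv.DictWriter(open('test.csv', 'wb'), fieldnames=fields)
out.writerow(dict(zip(fields, fields)))   # header row; writeheader() is 2.7+

out.writerow({
    'created_at':      jsonindata['created_at'],
    'id_str':          jsonindata['id_str'],
    'screen_name':     jsonindata['user']['screen_name'],
    'followers_count': jsonindata['user']['followers_count'],
    'text':            jsonindata['text'].encode('utf8'),   # csv + unicode: encode first
})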