Hello I have a jsonLines files where I am trying to get all the Hashtags (and Mentions which should be same process) from the jsonline file here: https://github.com/THsTestingGround/JsonL_Quest_SO/blob/master/output-2020-01-21.jsonl (SO not allowing me to put urls, and there are a lot of them)
Here is a reproducible example that gets the single key-object. How would I proceed to get more than one hashtags (mentions will be same)? Currently I have to manually specify. Anyways to get them all in one go or something? I was able to get csv using this code here:
import json
import csv
import io
# creates a .csv file using a Twitter .json file
# the fields have to be set manually
def extract_json(fileobj):
# Iterates over an open JSONL file and yields
# decoded lines. Closes the file once it has been
# read completely.
with fileobj:
for line in fileobj:
yield json.loads(line)
#path to the jsonl file
data_json = io.open('output-2020-01-21.json', mode='r', encoding='utf-8') # Opens in the JSONL file
data_python = extract_json(data_json)
csv_out = io.open('tweets_out_utf8.csv', mode='w', encoding='utf-8') #opens csv file
#if you're adding additional columns please don't forget to add them here
fields = u'created_at,text,full_text, screen_name,followers,friends,rt,fav' #field names
csv_out.write(fields)
csv_out.write(u'\n')
for line in data_python:
#because retweet is not common, sometimes jsonl won't have the key, so this is safer
try:
retweeted_status_full_text = '"' +line.get('retweeted_status').get('full_text').replace('"','""') + '"'
except:
retweeted_status_full_text = 'NA'
#gets me only one hastags even when there are more than one
try:
entities= '"' + line.get('entities').get('hashtags')[0].get('text').replace('"', '""') + '"'
except:
entities = 'NA'
#writes a row and gets the fields from the json object
#screen_name and followers/friends are found on the second level hence two get methods
row = [line.get('created_at'),
'"' + line.get('full_text').replace('"','""') + '"', #creates double quotes
retweeted_status_full_text,
line.get('user').get('screen_name'),
str(line.get('user').get('followers_count')),
str(line.get('user').get('friends_count')),
str(line.get('retweet_count')),
str(line.get('favorite_count'))]
row_joined = u','.join(row)
csv_out.write(row_joined)
csv_out.write(u'\n')
csv_out.close()
I did made an attempt but it gave me an error. I can't seem to find a solution in SO either. Little weaker in json at the moment, so would appreciate any help I can get. Thanks.
import json
import csv
import io
def extract_json(fileobj):
with fileobj:
for line in fileobj:
yield json.loads(line)
data_json = io.open('a.json', mode='r', encoding='utf-8')
data_python = extract_json(data_json)
csv_out = io.open('tweets_out_utf8.csv', mode='w', encoding='utf-8')
fields = u'created_at,text,full_text, screen_name,followers,friends,rt,fav'
csv_out.write(fields)
csv_out.write(u'\n')
for line in data_python:
try:
retweeted_status_full_text = '"' +line.get('retweeted_status').get('full_text').replace('"','""') + '"'
except:
retweeted_status_full_text = 'NA'
try:
temp = line.get('entities').get('hashtags')
entities = ""
for val in temp:
entities += '"' + val.get('text').replace('"', '""') + '"' + ' '
except:
entities = ""
row = [line.get('created_at'),
'"' + line.get('full_text').replace('"','""') + '"',
retweeted_status_full_text,
line.get('user').get('screen_name'),
str(line.get('user').get('followers_count')),
str(line.get('user').get('friends_count')),
str(line.get('retweet_count')),
str(line.get('favorite_count'))]
print('entities' + ' ' + str(entities))
row_joined = u','.join(row)
csv_out.write(row_joined)
csv_out.write(u'\n')
csv_out.close()
I tried something like this. I replaced empty entities with entities = ''
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With