I'm trying to scrape all departures and arrivals for one day, from every airport in every country, using Python and Scrapy.
The JSON API used by this well-known site (Flightradar24) has to be queried page by page when an airport has more than 100 departures or arrivals. I also compute a timestamp for the query, based on a given UTC day.
I am trying to build a database with this hierarchy:
country 1
- airport 1
- departures
- page 1
- page ...
- arrivals
- page 1
- page ...
- airport 2
- departures
- page 1
- page ...
- arrivals
- page 1
- page ...
...
I use two methods, one to compute the timestamp and one to build the query URL for a given page:
def compute_timestamp(self, day=None):
    """Return the Unix timestamp of midnight UTC for the day to query.

    Args:
        day: A ``datetime.date`` to convert. Defaults to 27 April 2017,
            the date that used to be hard-coded here.

    Returns:
        int: Seconds since the epoch for 00:00:00 UTC on ``day``.
    """
    import calendar
    from datetime import date

    # Original note: +/- 24 hours
    if day is None:
        day = date(2017, 4, 27)
    # timegm() reads the struct_time as UTC (time.mktime() would use the
    # local timezone instead).
    return calendar.timegm(day.timetuple())
def build_api_call(self, code, page, timestamp):
    """Build the airport-schedule API URL for one result page.

    The previous literal contained ``\\[`` / ``\\]`` sequences. Those are not
    valid Python escapes, so the backslashes stayed in the string and became
    part of the query parameter names (e.g. ``plugin-setting\\[schedule\\]``).
    The brackets are now written plainly.

    Args:
        code: Airport code inserted as the ``code`` parameter.
        page: Page number inserted as the ``page`` parameter.
        timestamp: Value inserted as
            ``plugin-setting[schedule][timestamp]``.

    Returns:
        str: The formatted URL (``limit`` is fixed at 100, ``token`` empty).
    """
    template = (
        'https://api.flightradar24.com/common/v1/airport.json'
        '?code={code}'
        '&plugin[]='
        '&plugin-setting[schedule][mode]='
        '&plugin-setting[schedule][timestamp]={timestamp}'
        '&page={page}&limit=100&token='
    )
    return template.format(code=code, page=page, timestamp=timestamp)
I store the result in a CountryItem, which holds many AirportItem objects in its airports field. My item.py is:
class CountryItem(scrapy.Item):
    """Scrapy item describing one country and the airports collected for it."""

    name = scrapy.Field()
    link = scrapy.Field()
    num_airports = scrapy.Field()
    # Filled by the spider with one AirportItem per airport of the country.
    airports = scrapy.Field()
    other_url = scrapy.Field()
    last_updated = scrapy.Field(serializer=str)
class AirportItem(scrapy.Item):
    """Scrapy item describing one airport and its scraped schedule data."""

    name = scrapy.Field()
    # Filled from the anchor's data-iata attribute by the spider.
    code_little = scrapy.Field()
    # Filled from the text of the anchor's <small> child by the spider.
    code_total = scrapy.Field()
    lat = scrapy.Field()
    lon = scrapy.Field()
    link = scrapy.Field()
    # Schedule data stored by the schedule callbacks.
    departures = scrapy.Field()
    arrivals = scrapy.Field()
My main parse method builds a CountryItem for every country (limited here to Israel as an example). Then, for each country, I yield a scrapy.Request to scrape its airports.
###################################
# MAIN PARSE
####################################
def parse(self, response):
    """Yield one airport-listing request per selected country.

    Every ``<a data-country>`` anchor of the response is inspected; only
    the anchor titled "Israel" is followed. The request carries a fresh
    CountryItem in ``meta['my_country_item']`` and is handled by
    ``parse_airports``.
    """
    for country in response.xpath('//a[@data-country]'):
        url = country.xpath('./@href').extract()
        name = country.xpath('./@title').extract()
        # Skip anchors lacking an href or title instead of raising IndexError.
        if not url or not name:
            continue
        if name[0] != "Israel":
            continue

        item = CountryItem()
        item['link'] = url[0]
        item['name'] = name[0]
        item['airports'] = []
        self.logger.info("Country name : %s with link %s", item['name'], item['link'])
        yield scrapy.Request(url[0], meta={'my_country_item': item}, callback=self.parse_airports)
This method scrapes the information for each airport and then issues a scrapy.Request with the airport's API URL to scrape its departures and arrivals:
###################################
# PARSE EACH AIRPORT
####################################
def parse_airports(self, response):
    """Collect every airport of a country and start the schedule chain.

    Reads the CountryItem from ``meta['my_country_item']``, fills its
    ``airports`` list from the ``<a data-iata>`` anchors, then returns the
    request for the first airport's schedule (page 1). If the country has
    no airports, the item itself is returned.
    """
    item = response.meta['my_country_item']
    item['airports'] = []

    for airport in response.xpath('//a[@data-iata]'):
        url = airport.xpath('./@href').extract()
        iata = airport.xpath('./@data-iata').extract()
        iatabis = airport.xpath('./small/text()').extract()
        name = ''.join(airport.xpath('./text()').extract()).strip()
        lat = airport.xpath("./@data-lat").extract()
        lon = airport.xpath("./@data-lon").extract()

        iAirport = AirportItem()
        iAirport['name'] = self.clean_html(name)
        iAirport['link'] = url[0]
        iAirport['lat'] = lat[0]
        iAirport['lon'] = lon[0]
        iAirport['code_little'] = iata[0]
        iAirport['code_total'] = iatabis[0]
        item['airports'].append(iAirport)

    if not item['airports']:
        return item

    # The timestamp is the same for every airport: compute it once.
    timestamp = self.compute_timestamp()
    urls = [
        self.build_api_call(airport['code_little'], 1, timestamp)
        for airport in item['airports']
    ]
    # The URLs are consumed with pop(), which takes from the END of the
    # list, while the airport index 'i' counts up from 0. Reverse the list
    # so the n-th popped URL belongs to item['airports'][n]; previously the
    # first response (last airport) was paired with airport 0.
    urls.reverse()

    # start with first url
    next_url = urls.pop()
    return scrapy.Request(next_url, self.parse_schedule, meta={'airport_item': item, 'airport_urls': urls, 'i': 0})
With the recursive method parse_schedule, I add each airport to the country item. SO members already helped me on this point.
###################################
# PARSE EACH AIRPORT OF COUNTRY
###################################
def parse_schedule(self, response):
    """Handle one airport's schedule response and chain to the next airport.

    For airport ``i`` (taken from ``meta['i']``) this computes the
    per-page departure/arrival URLs, starts the departures page chain, and
    then either requests the next airport's schedule or, when no airport
    URL is left, yields the country item.
    """
    item = response.meta['airport_item']
    i = response.meta['i']
    urls = response.meta['airport_urls']
    airport = item['airports'][i]

    urls_departures, urls_arrivals = self.compute_urls_by_page(response, airport['name'], airport['code_little'])
    print("urls_departures = ", len(urls_departures))
    print("urls_arrivals = ", len(urls_arrivals))

    # Start the departures chain for THIS airport. The index used to be
    # hard-coded to 0, so every airport's departures were written onto the
    # first airport. dont_filter=True because response.url was already
    # fetched once and would otherwise be dropped as a duplicate.
    # NOTE(review): urls_arrivals is computed but not requested here.
    yield scrapy.Request(response.url, self.parse_departures_page, meta={'airport_item': item, 'page_urls': urls_departures, 'i': i, 'p': 0}, dont_filter=True)

    # now do next schedule items
    if not urls:
        # NOTE(review): parse_departures_page also yields this same item, so
        # it can be emitted more than once - confirm that is intended.
        yield item
        return
    url = urls.pop()
    yield scrapy.Request(url, self.parse_schedule, meta={'airport_item': item, 'airport_urls': urls, 'i': i + 1})
The self.compute_urls_by_page method computes the correct URLs to retrieve all departures and arrivals for one airport.
###################################
# PARSE EACH DEPARTURES / ARRIVALS
###################################
def parse_departures_page(self, response):
    """Store the departures of one result page, then request the next page.

    Reads ``airport_item``, ``i`` (airport index), ``p`` (page counter) and
    ``page_urls`` (remaining page URLs) from ``response.meta``.

    The response is parsed BEFORE checking whether pages remain. The
    previous order returned early on an empty ``page_urls`` and so dropped
    the data of the last (or only) page. Pages are also accumulated now
    instead of each page overwriting the previous one.
    """
    item = response.meta['airport_item']
    p = response.meta['p']
    i = response.meta['i']
    page_urls = response.meta['page_urls']
    airport = item['airports'][i]

    print("PAGE URL = ", page_urls)
    print("GET PAGE FOR ", airport['name'], ">> ", p)

    jsonload = json.loads(response.body_as_unicode())
    json_expression = jmespath.compile("result.response.airport.pluginData.schedule.departures.data")
    # search() returns None when the path is absent from the payload.
    flights = json_expression.search(jsonload) or []

    # NOTE(review): assumes page_urls does not repeat the page that was
    # already fetched via response.url - confirm against
    # compute_urls_by_page, otherwise flights would be stored twice.
    departures = airport.get('departures') or []
    departures.extend(flights)
    airport['departures'] = departures

    if not page_urls:
        yield item
        return

    page_url = page_urls.pop()
    yield scrapy.Request(page_url, self.parse_departures_page, meta={'airport_item': item, 'page_urls': page_urls, 'i': i, 'p': p + 1})
Next, the first yield in parse_schedule, which should call the recursive method self.parse_departures_page, produces weird results. Scrapy does call this method, but I only collect the departures pages for one airport and I don't understand why. I probably have an ordering error in my requests or in my yield statements, so perhaps you could help me find it.
The complete code is on GitHub: https://github.com/IDEES-Rouen/Flight-Scrapping/tree/master/flight/flight_project
You can run it using the command scrapy crawl airports.
Update 1:
I tried to answer the question myself using yield from, without success, as you can see in the answer below... so if you have an idea?
Yes, I finally found the answer here on SO...
When you yield recursively from a generator, you need to use yield from. Here is a simplified example:
airport_list = ["airport1", "airport2", "airport3", "airport4"]


def parse_page_departure(airport, next_url, page_urls):
    """Print every remaining departure page of one airport.

    A generator that yields nothing itself: it prints the current page,
    then delegates to itself with ``yield from`` for the next page until
    ``page_urls`` is empty. ``page_urls`` is consumed in place (``pop()``
    takes from the end).
    """
    print(airport, " / ", next_url)
    if not page_urls:
        return
    next_url = page_urls.pop()
    yield from parse_page_departure(airport, next_url, page_urls)


###################################
# PARSE EACH AIRPORT OF COUNTRY
###################################
def parse_schedule(next_airport, airport_list):
    """Yield one departure-page generator per airport.

    Yields the (not yet started) ``parse_page_departure`` generator for
    ``next_airport``, then delegates with ``yield from`` to the remaining
    airports. ``airport_list`` is consumed in place.
    """
    ## GET EACH DEPARTURE PAGE
    departures_list = ["p1", "p2", "p3", "p4"]
    next_departure_url = departures_list.pop()
    # Plain `yield`: the caller receives the generator object itself and
    # decides when to iterate it.
    yield parse_page_departure(next_airport, next_departure_url, departures_list)

    if not airport_list:
        print("no new airport")
        return
    next_airport_url = airport_list.pop()
    # `yield from`: re-yield everything the recursive call produces.
    yield from parse_schedule(next_airport_url, airport_list)


if __name__ == "__main__":
    next_airport_url = airport_list.pop()
    result = parse_schedule(next_airport_url, airport_list)
    for i in result:
        print(i)
        for d in i:
            print(d)
UPDATE: this doesn't work with the real program.
I tried to reproduce the same yield from pattern in the real program, but I get an error when using it on scrapy.Request, and I don't understand why...
Here the python traceback :
Traceback (most recent call last):
File "/home/reyman/.pyenv/versions/venv352/lib/python3.5/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/home/reyman/.pyenv/versions/venv352/lib/python3.5/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/reyman/.pyenv/versions/venv352/lib/python3.5/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/reyman/.pyenv/versions/venv352/lib/python3.5/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/reyman/.pyenv/versions/venv352/lib/python3.5/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/reyman/Projets/Flight-Scrapping/flight/flight_project/spiders/AirportsSpider.py", line 209, in parse_schedule
yield from scrapy.Request(url, self.parse_schedule, meta={'airport_item': item, 'airport_urls': urls, 'i': i + 1})
TypeError: 'Request' object is not iterable
2017-06-27 17:40:50 [scrapy.core.engine] INFO: Closing spider (finished)
2017-06-27 17:40:50 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
Comment: ... not totally clear ... you call AirportData(response, 1) ... also a little typo here : self.pprint(schedule)
I used class AirportData for the implementation (limited to 2 pages and 2 flights).
I updated my code, removed class AirportData and added class Page.
It should now fulfill all dependencies.
This is not a typo: self.pprint(...) is a method of class AirportsSpider used for pretty-printing the object, like the output shown at the end. I have enhanced class Schedule to show the basic usage.
Comment: What is AirportData in your answer ?
EDIT: class AirportData removed.
As noted at # ENDPOINT, the flight data is held as Page objects, split into page.arrivals and page.departures.
(Limited to 2 pages and 2 flights)
Page = [Flight 1, Flight 1, ... Flight n] schedule.airport['arrivals'] == [Page 1, Page 2, ..., Page n] schedule.airport['departures'] == [Page 1, Page 2, ..., Page n]
Comment: ... we have multiples pages which contains multiples departures/arrivals.
Yes, at the time of my first answer I didn't have any JSON response from the API to go further.
Now I get a response from the JSON API, but it does not reflect the given timestamp; it returns data for the current date.
The API parameters look unusual; do you have a link to their description?
Nevertheless, consider this simplified approach:
# Page object holding one Page of Arrivals/Departures Flight Data
class Page(object):
    """One page of arrivals or departures flight data.

    Attributes:
        current: ``schedule['page']['current']``.
        total: ``schedule['page']['total']``.
        header: Summary line built from the title and the page/item info.
        flight: List holding the ``'flight'`` entry of every data row.
    """

    def __init__(self, title, schedule):
        """Build the page from one schedule section.

        Args:
            title: Label used as prefix of ``header``.
            schedule: The ``['arrivals']`` or ``['departures']`` section; it
                must provide the keys ``'page'`` and ``'item'``. ``'data'``
                may be missing or ``None``, giving an empty flight list.
        """
        # schedule includes ['arrivals'] or ['departures']
        page_info = schedule['page']
        self.current = page_info['current']
        self.total = page_info['total']
        self.header = '{}:page:{} item:{}'.format(title, page_info, schedule['item'])
        self.flight = [row['flight'] for row in (schedule.get('data') or [])]

    def __iter__(self):
        """Iterate over the flights of this page."""
        yield from self.flight
# Schedule object holding one Airport all Pages
class Schedule(object):
    """All schedule pages collected for one airport of one country.

    Attributes:
        country: Mapping with at least a ``'name'`` key (set by the caller).
        airport: Mapping with ``'name'``, ``'arrivals'`` and ``'departures'``;
            the latter two are lists of page objects exposing ``header``
            and ``flight``.
    """

    _MISSING = 'n/a'

    def __init__(self):
        self.country = None
        self.airport = None

    @classmethod
    def _summarize(cls, pages):
        """Return ``(header, airline name)`` of the first flight of the first page.

        Falls back to ``'n/a'`` when there is no page, no flight or no
        airline, so that printing a partially filled schedule cannot raise.
        """
        if not pages:
            return cls._MISSING, cls._MISSING
        first_page = pages[0]
        first_flight = first_page.flight[0] if first_page.flight else {}
        airline = first_flight.get('airline') or {}
        return first_page.header, airline.get('name', cls._MISSING)

    def __str__(self):
        """Return a tab-indented country / airport / first-page overview."""
        arrivals_header, arrivals_airline = self._summarize(self.airport.get('arrivals'))
        departures_header, departures_airline = self._summarize(self.airport.get('departures'))
        return '{}\n\t{}\n\t\t{}\n\t\t\t{}\n\t\t{}\n\t\t\t{}'.format(
            self.country['name'],
            self.airport['name'],
            arrivals_header,
            arrivals_airline,
            departures_header,
            departures_airline,
        )
# PARSE EACH AIRPORT OF COUNTRY
def parse_schedule(self, response):
    """Collect one schedule page of an airport and request the next page.

    On the first call (``meta`` contains ``'airport'``) a new Schedule is
    created from ``meta['country']`` and ``meta['airport']``; on follow-up
    calls the Schedule travels in ``meta['schedule']``. Each response adds
    one Page to the airport's ``'arrivals'`` and ``'departures'`` lists.
    """
    meta = response.meta
    if 'airport' in meta:
        # First call from parse_airports
        schedule = Schedule()
        schedule.country = meta['country']
        schedule.airport = meta['airport']
    else:
        schedule = meta['schedule']

    data = json.loads(response.body_as_unicode())
    plugin_schedule = data['result']['response']['airport']['pluginData']['schedule']

    # setdefault(): the 'arrivals' / 'departures' fields are not initialised
    # by the caller, so a plain [...].append() raised KeyError on page 1.
    schedule.airport.setdefault('arrivals', []).append(Page('Arrivals', plugin_schedule['arrivals']))
    schedule.airport.setdefault('departures', []).append(Page('Departures', plugin_schedule['departures']))

    # NOTE(review): paging is driven by the departures page count only; if
    # arrivals span more pages than departures, the extra arrival pages are
    # never requested - confirm whether that matters.
    page = schedule.airport['departures'][-1]
    if page.current < page.total:
        json_url = self.build_api_call(schedule.airport['code_little'], page.current + 1, self.compute_timestamp())
        yield scrapy.Request(json_url, meta={'schedule': schedule}, callback=self.parse_schedule)
    else:
        # ENDPOINT Schedule object holding one Airport.
        # schedule.airport['arrivals'] and schedule.airport['departures'] ==
        # List of Page with List of Flight Data
        print(schedule)
# PARSE EACH AIRPORT
def parse_airports(self, response):
    """Yield the first schedule request for every selected airport.

    Only airports whose link text contains 'Charles' are followed. Each
    request carries the country (from ``response.meta['country']``) and a
    fresh AirportItem in its own ``meta`` dict, and is handled by
    ``parse_schedule``.
    """
    country = response.meta['country']
    # Same value for every airport: compute it once.
    timestamp = self.compute_timestamp()

    for airport in response.xpath('//a[@data-iata]'):
        name = airport.xpath('./text()').extract()[0].strip()
        if 'Charles' not in name:
            continue

        item = AirportItem()
        item['name'] = name
        item['link'] = airport.xpath('./@href').extract()[0]
        item['lat'] = airport.xpath("./@data-lat").extract()[0]
        item['lon'] = airport.xpath("./@data-lon").extract()[0]
        item['code_little'] = airport.xpath('./@data-iata').extract()[0]
        item['code_total'] = airport.xpath('./small/text()').extract()[0]
        # parse_schedule appends Page objects to these lists.
        item['arrivals'] = []
        item['departures'] = []

        json_url = self.build_api_call(item['code_little'], 1, timestamp)
        # A new meta dict per request: the shared response.meta is no longer
        # mutated while looping over the airports.
        yield scrapy.Request(json_url,
                             meta={'country': country, 'airport': item},
                             callback=self.parse_schedule)
# MAIN PARSE
Note: response.xpath('//a[@data-country]') returns every country twice!
def parse(self, response):
    """Yield the airport-listing request for the selected country.

    Scans every ``<a data-country>`` anchor and follows those titled
    "France"; the CountryItem is passed on in ``meta['country']`` and the
    response is handled by ``parse_airports``.
    """
    for a_country in response.xpath('//a[@data-country]'):
        name = a_country.xpath('./@title').extract()[0]
        if name != "France":
            continue

        country = CountryItem()
        country['name'] = name
        country['link'] = a_country.xpath('./@href').extract()[0]
        yield scrapy.Request(country['link'],
                             meta={'country': country},
                             callback=self.parse_airports)
Output: shortened to 2 pages and 2 flights per page
France Paris Charles de Gaulle Airport Departures:(page=(1, 1, 7)) 2017-07-02 21:28:00 page:{'current': 1, 'total': 7} item:{'current': 100, 'limit': 100, 'total': 696} 21:30 PM AF1558 Newcastle Airport (NCL) Air France ARJ Estimated dep 21:30 21:30 PM VY8833 Seville San Pablo Airport (SVQ) Vueling 320 Estimated dep 21:30 ... (omitted for brevity) Departures:(page=(2, 2, 7)) 2017-07-02 21:28:00 page:{'current': 2, 'total': 7} item:{'current': 100, 'limit': 100, 'total': 696} 07:30 AM AF1680 London Heathrow Airport (LHR) Air France 789 Scheduled 07:30 AM SN3628 Brussels Airport (BRU) Brussels Airlines 733 Scheduled ... (omitted for brevity) Arrivals:(page=(1, 1, 7)) 2017-07-02 21:28:00 page:{'current': 1, 'total': 7} item:{'current': 100, 'limit': 100, 'total': 693} 16:30 PM LY325 Tel Aviv Ben Gurion International Airport (TLV) El Al Israel Airlines B739 Estimated 21:29 18:30 PM AY877 Helsinki Vantaa Airport (HEL) Finnair E190 Landed 21:21 ... (omitted for brevity) Arrivals:(page=(2, 2, 7)) 2017-07-02 21:28:00 page:{'current': 2, 'total': 7} item:{'current': 100, 'limit': 100, 'total': 693} 00:15 AM AF982 Douala International Airport (DLA) Air France 772 Scheduled 23:15 PM AA44 New York John F. Kennedy International Airport (JFK) American Airlines B763 Scheduled ... (omitted for brevity)
Tested with Python: 3.4.2 - Scrapy 1.4.0
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With