I'm trying to parse an XML string that I get from YouTube video feeds, using Python 3.3.1. Here is the code:
import re
import sys
import urllib.request
import urllib.parse
import xml.etree.ElementTree as element_tree
def get_video_id(video_url):
    """Return the video id from a YouTube ``watch?v=<id>`` URL.

    Only the value of the ``v`` parameter is returned; anything that
    follows it (``&feature=...``, ``#t=...``) is dropped.

    Raises:
        ValueError: if *video_url* contains no ``watch?v=`` part.
    """
    # Capture up to the next parameter/fragment separator so extra query
    # parameters do not end up inside the id.
    match = re.search(r'watch\?v=([^&#]*)', video_url)
    if match is None:
        raise ValueError("no 'watch?v=' video id in URL: %r" % (video_url,))
    return match.group(1)
def get_video_feed(video_url):
    """Download the GData feed entry for the video at *video_url*.

    Returns:
        The raw response body, as returned by ``read()`` on the response.
    """
    video_feed = "http://gdata.youtube.com/feeds/api/videos/" + get_video_id(video_url)
    # Close the HTTP response deterministically instead of leaving it to
    # the garbage collector.
    with urllib.request.urlopen(video_feed) as response:
        return response.read()
def get_media_info(video_url):
    """Fetch the feed for *video_url*, print it, and look up its title.

    Prints the decoded feed text followed by each direct child element of
    the parsed root, then returns ``{'title': <findall result>}``.

    The 'title' lookup is intentionally left un-namespaced: it is the call
    whose result the surrounding question asks about.
    """
    content = get_video_feed(video_url)
    # The feed declares encoding='UTF-8'; decoding it as ASCII raises
    # UnicodeDecodeError as soon as a title contains a non-ASCII character.
    content = str(content, 'utf-8')
    media = {}
    root = element_tree.XML(content)
    print("CONTENT: \n" + content)
    print("\n\nELEMENTS : \n")
    for child in root:
        print(child)
    media['title'] = root.findall('title')  # NOTE THIS!
    return media
def main():
    """Print the media info dict for a sample video URL."""
    video_url = 'http://youtube.com/watch?v=q5sOLzEerwA'
    print(get_media_info(video_url))


if __name__ == '__main__':
    main()
I've no idea why the for loop in get_media_info() prints elements as
<Element '{http://www.w3.org/2005/Atom}title' at 0x0000000002BF7D18>
instead of this:
<Element 'title' at 0x0000000002BF7D18>
Frankly, I don't care what it prints. All I care about is that I want to pass 'title' to findall() and expect a list of element(s) as the returned value. But it returns an empty list, even though there is one element named title in the XML.
So I tried this:
media['title'] = e.findall('{http://www.w3.org/2005/Atom}title')
And it did return a list of one element. I'm sure this is not the way to do it, and I feel I'm missing something.
How to fix this?
This is the output of the above code:
CONTENT:
<?xml version='1.0' encoding='UTF-8'?>
<entry xmlns='http://www.w3.org/2005/Atom' xmlns:media='http://search.yahoo.com/mrss/' xmlns:gd='http://schemas.google.com/g/2005' xmlns:yt='http://gdata.youtube.com/schemas/2007'>
<id>http://gdata.youtube.com/feeds/api/videos/q5sOLzEerwA</id>
<published>2011-12-01T18:18:36.000Z</published>
<updated>2013-05-07T03:20:04.000Z</updated>
<category scheme='http://schemas.google.com/g/2005#kind' term='http://gdata.youtube.com/schemas/2007#video'/>
<category scheme='http://gdata.youtube.com/schemas/2007/categories.cat' term='Music' label='Music'/>
<title type='text'>Kala Bazaar - Khoya Khoya Chand Khula Aasman - Mohd Rafi.flv</title>
<content type='text'>tanhayi me akele me khoya khoya chand.........</content>
<link rel='alternate' type='text/html' href='http://www.youtube.com/watch?v=q5sOLzEerwA&feature=youtube_gdata'/>
<link rel='http://gdata.youtube.com/schemas/2007#video.responses' type='application/atom+xml' href='http://gdata.youtube.com/feeds/api/videos/q5sOLzEerwA/responses'/>
<link rel='http://gdata.youtube.com/schemas/2007#video.related' type='application/atom+xml' href='http://gdata.youtube.com/feeds/api/videos/q5sOLzEerwA/related'/>
<link rel='http://gdata.youtube.com/schemas/2007#mobile' type='text/html' href='http://m.youtube.com/details?v=q5sOLzEerwA'/>
<link rel='self' type='application/atom+xml' href='http://gdata.youtube.com/feeds/api/videos/q5sOLzEerwA'/>
<author>
<name>a1a2a3a4a786</name>
<uri>http://gdata.youtube.com/feeds/api/users/a1a2a3a4a786</uri>
</author>
<gd:comments>
<gd:feedLink rel='http://gdata.youtube.com/schemas/2007#comments' href='http://gdata.youtube.com/feeds/api/videos/q5sOLzEerwA/comments' countHint='6'/>
</gd:comments>
<media:group>
<media:category label='Music' scheme='http://gdata.youtube.com/schemas/2007/categories.cat'>Music</media:category>
<media:content url='http://www.youtube.com/v/q5sOLzEerwA?version=3&f=videos&app=youtube_gdata' type='application/x-shockwave-flash' medium='video' isDefault='true' expression='full' duration='293' yt:format='5'/>
<media:content url='rtsp://v6.cache3.c.youtube.com/CiILENy73wIaGQkArx4xLw6bqxMYDSANFEgGUgZ2aWRlb3MM/0/0/0/video.3gp' type='video/3gpp' medium='video' expression='full' duration='293' yt:format='1'/>
<media:content url='rtsp://v6.cache3.c.youtube.com/CiILENy73wIaGQkArx4xLw6bqxMYESARFEgGUgZ2aWRlb3MM/0/0/0/video.3gp' type='video/3gpp' medium='video' expression='full' duration='293' yt:format='6'/>
<media:description type='plain'>tanhayi me akele me khoya khoya chand.........</media:description>
<media:keywords/>
<media:player url='http://www.youtube.com/watch?v=q5sOLzEerwA&feature=youtube_gdata_player'/>
<media:thumbnail url='http://i.ytimg.com/vi/q5sOLzEerwA/0.jpg' height='360' width='480' time='00:02:26.500'/>
<media:thumbnail url='http://i.ytimg.com/vi/q5sOLzEerwA/1.jpg' height='90' width='120' time='00:01:13.250'/>
<media:thumbnail url='http://i.ytimg.com/vi/q5sOLzEerwA/2.jpg' height='90' width='120' time='00:02:26.500'/>
<media:thumbnail url='http://i.ytimg.com/vi/q5sOLzEerwA/3.jpg' height='90' width='120' time='00:03:39.750'/>
<media:title type='plain'>Kala Bazaar - Khoya Khoya Chand Khula Aasman - Mohd Rafi.flv</media:title>
<yt:duration seconds='293'/>
</media:group>
<gd:rating average='4.733333' max='5' min='1' numRaters='30' rel='http://schemas.google.com/g/2005#overall'/>
<yt:statistics favoriteCount='0' viewCount='8140'/>
</entry>
ELEMENTS :
<Element '{http://www.w3.org/2005/Atom}id' at 0x0000000002BF79F8>
<Element '{http://www.w3.org/2005/Atom}published' at 0x0000000002BF7B88>
<Element '{http://www.w3.org/2005/Atom}updated' at 0x0000000002BF7A48>
<Element '{http://www.w3.org/2005/Atom}category' at 0x0000000002BF7C78>
<Element '{http://www.w3.org/2005/Atom}category' at 0x0000000002BF7CC8>
<Element '{http://www.w3.org/2005/Atom}title' at 0x0000000002BF7D18>
<Element '{http://www.w3.org/2005/Atom}content' at 0x0000000002BF7D68>
<Element '{http://www.w3.org/2005/Atom}link' at 0x0000000002BF7DB8>
<Element '{http://www.w3.org/2005/Atom}link' at 0x0000000002BF7E08>
<Element '{http://www.w3.org/2005/Atom}link' at 0x0000000002BF7E58>
<Element '{http://www.w3.org/2005/Atom}link' at 0x0000000002BF7EA8>
<Element '{http://www.w3.org/2005/Atom}link' at 0x0000000002BF7EF8>
<Element '{http://www.w3.org/2005/Atom}author' at 0x0000000002BF7F48>
<Element '{http://schemas.google.com/g/2005}comments' at 0x0000000002C0B0E8>
<Element '{http://search.yahoo.com/mrss/}group' at 0x0000000002C0B1D8>
<Element '{http://schemas.google.com/g/2005}rating' at 0x0000000002C0B778>
<Element '{http://gdata.youtube.com/schemas/2007}statistics' at 0x0000000002C0B7C8>
{'title': []}
The namespace of an XML document is significant. ElementTree requires tags to be fully qualified to find the right element. Here's an example of three elements with the same tag in different namespaces:
data = '''\
<root xmlns="xyz" xmlns:name="abc">
<object name="one" />
<name:object name="two" />
<object xmlns="def" name="three" />
</root>
'''
Here are the elements that ElementTree sees:
>>> from xml.etree import ElementTree as et
>>> tree = et.fromstring(data)
>>> print(tree.findall('.//*'))
>>> et.dump(tree)
[<Element '{xyz}object' at 0x0000000003B07BD8>,
<Element '{abc}object' at 0x0000000003B07C28>,
<Element '{def}object' at 0x0000000003B07C78>]
So you have it right. Given the default namespace definition of:
<entry xmlns='http://www.w3.org/2005/Atom' ...
To access the 'title' tag, which uses the default namespace:
media['title'] = e.findall('{http://www.w3.org/2005/Atom}title')
to access the 'media:group' tag, refer to the media namespace definition:
<entry ... xmlns:media='http://search.yahoo.com/mrss/' ...
And use:
e.findall('{http://search.yahoo.com/mrss/}group')
Note the different ways a namespace can be specified:
<root xmlns="xyz" xmlns:name="abc"> # default namespace and
# 'abc' namespace with id 'name'.
<object name="one" /> # Uses default namespace 'xyz'.
<name:object name="two" /> # uses 'abc' namespace (specified by id).
<object xmlns="def" name="three" /> # change the default namespace to 'def'.
</root>
To read a specific tag from a specific namespace:
>>> print(tree.find('{abc}object').attrib['name'])
two
Note the namespace IDs are just shortcuts. Here's what happens when you dump the parsed XML tree. ElementTree doesn't bother to save the original namespace IDs and generates its own in the format ns#:
>>> et.dump(tree)
<ns0:root xmlns:ns0="xyz" xmlns:ns1="abc" xmlns:ns2="def">
<ns0:object name="one" />
<ns1:object name="two" />
<ns2:object name="three" />
</ns0:root>
If you want specific shortcuts defined, use `register_namespace`:
>>> et.register_namespace('','xyz') # default namespace
>>> et.register_namespace('name','abc')
>>> et.register_namespace('custom','def')
>>> et.dump(tree)
<root xmlns="xyz" xmlns:custom="def" xmlns:name="abc">
<object name="one" />
<name:object name="two" />
<custom:object name="three" />
</root>
Actually, I have tried the following approach using xml.dom.minidom, just in case it helps you.
#!/usr/bin/python
from xml.dom.minidom import parseString
import re
import urllib
def get_video_id(video_url):
    """Return the video id from a YouTube ``watch?v=<id>`` URL.

    Only the value of the ``v`` parameter is returned; anything that
    follows it (``&feature=...``, ``#t=...``) is dropped.

    Raises:
        ValueError: if *video_url* contains no ``watch?v=`` part.
    """
    # Capture up to the next parameter/fragment separator so extra query
    # parameters do not end up inside the id.
    match = re.search(r'watch\?v=([^&#]*)', video_url)
    if match is None:
        raise ValueError("no 'watch?v=' video id in URL: %r" % (video_url,))
    return match.group(1)
def get_video_feed(video_url):
    """Download the GData feed entry for the video at *video_url*.

    Prints the feed URL before fetching it and returns the raw response
    body. Works on both Python 2 and Python 3.
    """
    # Imported locally so the function does not depend on which urllib
    # layout the interpreter provides.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    video_feed = "http://gdata.youtube.com/feeds/api/videos/" + get_video_id(video_url)
    print(video_feed)
    # closing() releases the connection even on interpreters where the
    # response object is not itself a context manager.
    with closing(urlopen(video_feed)) as response:
        return response.read()
def get_media_info(video_url):
    """Return ``{'title': <text>}`` for the video at *video_url*.

    The feed is parsed with minidom and the text of the first ``title``
    element is used. An empty title element yields ``''``.

    Raises:
        IndexError: if the feed contains no ``title`` element.
    """
    content = get_video_feed(video_url)
    dom = parseString(content)
    media = {}
    # getElementsByTagName compares against the qualified tag name, so
    # 'title' selects <title> but not <media:title>.
    text_node = dom.getElementsByTagName('title')[0].firstChild
    # An empty <title/> has no child node to read a value from.
    media['title'] = text_node.nodeValue if text_node is not None else ''
    return media
def main():
    """Print the media info dict for a sample video URL."""
    video_url = 'http://youtube.com/watch?v=q5sOLzEerwA'
    print(get_media_info(video_url))


if __name__ == '__main__':
    main()
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With