I am working on retrieving metadata from YouTube channels and their videos.
Everything is going fine, but currently I am struggling to put all the information into a dataframe, which is what I need. Here is the code I am using, taken from this gist: https://gist.github.com/andkamau/0d4e312c97f41a975440a05fd76b1d29
import urllib.request
import json
from bs4 import BeautifulSoup
from collections import namedtuple
import pafy
import pandas as pd

df = pd.DataFrame()

Video = namedtuple("Video", "video_id title duration views thumbnail Description")

def parse_video_div(div):
    video_id = div.get("data-context-item-id", "")
    title = div.find("a", "yt-uix-tile-link").text
    duration = div.find("span", "video-time").contents[0].text
    views = str(div.find("ul", "yt-lockup-meta-info").contents[0].text.rstrip(" views").replace(",", ""))
    img = div.find("img")
    videoDescription = pafy.new("https://www.youtube.com/watch?v=" + video_id)
    thumbnail = "http:" + img.get("src", "") if img else ""
    Description = videoDescription.description
    l = Video(video_id, title, duration, views, thumbnail, Description)
    # storing in the dataframe
    df = pd.DataFrame(list(Video(video_id, title, duration, views, thumbnail, Description)))
    return Video(video_id, title, duration, views, thumbnail, Description)

def parse_videos_page(page):
    video_divs = page.find_all("div", "yt-lockup-video")
    return [parse_video_div(div) for div in video_divs]

def find_load_more_url(page):
    for button in page.find_all("button"):
        url = button.get("data-uix-load-more-href")
        if url:
            return "http://www.youtube.com" + url

def download_page(url):
    print("Downloading {0}".format(url))
    return urllib.request.urlopen(url).read()

def get_videos(username):
    page_url = "http://www.youtube.com/channel/{0}/videos".format(username)
    page = BeautifulSoup(download_page(page_url))
    videos = parse_videos_page(page)
    page_url = find_load_more_url(page)
    while page_url:
        json_data = json.loads(str(download_page(page_url).decode("utf-8")))
        page = BeautifulSoup(json_data.get("content_html", ""))
        videos.extend(parse_videos_page(page))
        page_url = find_load_more_url(BeautifulSoup(json_data.get("load_more_widget_html", "")))
    return videos

if __name__ == "__main__":
    videos = get_videos("UC-M9eLhclbe16sDaxLzc0ng")
    for video in videos:
        print(video)
    print("{0} videos".format(len(videos)))
The function parse_video_div(div) gathers all the information and is where I build my dataframe. But unfortunately the dataframe ends up empty. Maybe I need to loop over the namedtuples somehow.
Any lead on how I can get my data into the dataframe?
pd.DataFrame works nicely with namedtuples and automatically uses the field names as the columns.
Sample data:
In [21]: Video = namedtuple("Video", "video_id title duration views thumbnail Description")

In [22]: pd.DataFrame(data=[Video(1, 'Vid Title', 5, 10, 'Thumb', 'Des')])
Out[22]:
   video_id      title  duration  views thumbnail Description
0         1  Vid Title         5     10     Thumb         Des
Since your function never returns df and you don't use it anywhere else in the code, how are you sure it is empty? Note that the df = ... assignment inside parse_video_div creates a new local variable that shadows the module-level df, so the df you defined at the top is never filled.
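Here is a minimal, standalone sketch of that scoping behaviour; the names and dummy data are just for illustration and are not part of your scraper:
import pandas as pd

df = pd.DataFrame()  # module-level frame, stays empty

def build_row():
    # This assignment creates a *local* df inside the function;
    # the module-level df above is untouched.
    df = pd.DataFrame([{"video_id": "abc", "views": 100}])
    return df

build_row()
print(df.empty)  # True -- the outer df was never filled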
Update
You just need to edit the return of parse_video_div to return a pd.DataFrame, and then concatenate the list into a single pd.DataFrame in the get_videos function.
Here are the edits highlighted.
def parse_video_div(div):
    #####
    return pd.DataFrame(data=[Video(video_id, title, duration, views, thumbnail, Description)])
    # shorter version
    # return pd.DataFrame(data=[l])

def get_videos(username):
    ####
    videos_df = pd.concat(videos, ignore_index=True)
    return videos_df  # return the DataFrame
You need a concatenation step at the end. In parse_video_div you can return anything that pd.DataFrame accepts as input, whether a dict, a pd.Series, a namedtuple, or even a list. In this example I chose a pd.DataFrame to keep things simple, although in terms of performance building one DataFrame per row can add a few milliseconds to your processing.
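For completeness, here is a short self-contained sketch of that list-then-concat pattern, using hard-coded dummy rows in place of the scraped data (the Video fields match the ones above):
import pandas as pd
from collections import namedtuple

Video = namedtuple("Video", "video_id title duration views thumbnail Description")

# Pretend these one-row frames came from parse_video_div, one per video div.
rows = [
    pd.DataFrame(data=[Video("id1", "First video", "3:01", "120", "thumb1", "desc1")]),
    pd.DataFrame(data=[Video("id2", "Second video", "7:45", "980", "thumb2", "desc2")]),
]

# Concatenate them into a single frame, as done in get_videos above.
videos_df = pd.concat(rows, ignore_index=True)
print(videos_df)

# Alternative: keep returning plain namedtuples from parse_video_div and
# build the frame once at the end -- pandas reads the field names as columns:
# videos_df = pd.DataFrame(data=list_of_namedtuples)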