I have a small program that opens a vocabulary page, prints all the words on that page, then clicks the button to go to the next page and prints all the vocabulary on that page as well.
I use a loop to repeat this process so that it covers all the words spread across multiple pages.
# NOTE(review): indentation appears to have been lost from this snippet; the
# statements after `for x in range(3):` are presumably the loop body -- confirm
# against the original post.
#
# Open Vocab.csv and wrap it in a csv writer. Nothing in this snippet ever calls
# the writer or closes the file, so Vocab.csv is created but stays empty.
outfile = open("Vocab.csv","w",newline='')
writer = csv.writer(outfile)
# Start with an empty one-column DataFrame that the loop below appends to.
df = pd.DataFrame(columns=['rating'])
# NOTE(review): the backslashes in this path are not escaped and the string is
# not raw; a raw string (r"...") would be the safer spelling.
PATH="C:\Program Files (x86)\chromedriver.exe"
driver= webdriver.Chrome(PATH)
# Load the first category page of the word list.
driver.get("https://sq.m.wiktionary.org/w/index.php?title=Kategoria:Shqip&pagefrom=agall%C3%ABk#mw-pages")
# Visit three pages: read the word list, print it, then follow the next-page link.
for x in range(3):
# Wait up to 10 seconds for the word-list <ul> to be present in the DOM.
rating_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#mw-pages > div > div > div > ul"))
)
# `.text` is the whole list as a single string.
rating=rating_element.text
print(rating)
# Wait for the next-page link (link text "faqja pasardhëse") and click it.
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.LINK_TEXT, "faqja pasardhëse"))
)
element.click()
# The page's entire text is stored as ONE row, not one row per word.
# NOTE(review): DataFrame.append is not available in pandas >= 2.0 -- confirm
# the installed version. `df` is also never written to a file in this snippet.
df2 = pd.DataFrame([rating],columns=['rating'])
df = df.append(df2,ignore_index=True)
The scraping itself works fine; however, when I tried to collect all the data into a DataFrame and save it, I only got an empty CSV file. I'm trying to end up with a single column containing the thousands of words.
You can iterate over each word to append to the column:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import selenium.common.exceptions
import os
import pandas as pd
# Scrape the Albanian Wiktionary category listing page by page and save every
# word as one row of a single-column CSV file (word_list.csv).
chrome_options = webdriver.ChromeOptions()
# Chrome expects "width,height" for --window-size; "1920x1080" is not parsed.
chrome_options.add_argument("--window-size=1920,1080")
# chrome_options.add_argument("--headless")
chrome_driver = os.path.join(os.getcwd(), "chromedriver.exe")
# NOTE(review): `executable_path` is only accepted by older Selenium releases;
# newer ones expect a Service object instead -- confirm the installed version.
driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)

# Collect the words in a plain list and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies it on every call, and
# DataFrame.append no longer exists in pandas 2.0+.
words = []
try:
    driver.get("https://sq.m.wiktionary.org/w/index.php?title=Kategoria:Shqip&pagefrom=agall%C3%ABk#mw-pages")
    # Upper bound on the number of pages; the loop stops earlier when there is
    # no next-page link.
    for _ in range(200):
        rating_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mw-pages > div > div > div > ul"))
        )
        # One list entry per line of text; skip blank lines so they do not
        # become empty rows.
        words.extend(
            word for word in rating_element.text.splitlines() if word.strip()
        )
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.LINK_TEXT, "faqja pasardhëse"))
            )
        except selenium.common.exceptions.TimeoutException:
            # No "next page" link appeared within the timeout: last page reached.
            break
        element.click()
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()

# Define the dataframe: a single 'rating' column with one word per row.
df = pd.DataFrame({'rating': words})
print(df)
df.to_csv('word_list.csv', encoding='utf-8', index=False)
rating
0 agallëk
1 agar
2 agave
3 agde
4 ageshë
.. ...
595 ankim
596 ankimor
597 ankohem
598 ankoj
599 ankojë
[600 rows x 1 columns]
Added the option to write to a file.
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With