- Published on
A selenium crawler for xiaobot.net
- Authors
- Name
- Gene Zhang
Crawl articles from xiaobot.net (小报童) and generate a Markdown file.
A context manager is used to manage the Selenium session.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class SeleniumContextManager:
    """Context manager that owns a Chrome WebDriver for the `with` body.

    Entering the context starts a new ``webdriver.Chrome()`` instance and
    hands it to the caller; leaving the context (normally or through an
    exception) calls ``quit()`` on that same instance.
    """

    def __enter__(self):
        """Start Chrome, remember the driver on the instance, and return it."""
        browser = webdriver.Chrome()
        self.driver = browser
        return browser

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the driver; exceptions from the `with` body are not suppressed."""
        browser = self.driver
        browser.quit()
# Crawl the articles listed on the column page and append each one
# (title, date, image tags, text) to articles.md.
with SeleniumContextManager() as driver:
    # Login: the user authenticates by scanning a QR code (see the prompt).
    driver.get("https://xiaobot.net/wechat_auth?redirect=/subscribed/")
    print("Please scan the QR code using your mobile device to log in.")
    # A person needs far longer than a page load to scan a code, so allow a
    # generous timeout; the wait still returns as soon as the elements appear.
    WebDriverWait(driver, 120).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "title"))
    )

    # Season 7 page
    driver.get("https://xiaobot.net/p/pmdogs7")
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "title"))
    )
    print("Please scroll down the page manually to load all contents.")
    time.sleep(10)

    i = 1  # Skip the first one, which is an Ad
    # The articles contain Chinese text, so pin the encoding instead of
    # relying on the platform default.
    with open('articles.md', 'a+', encoding='utf-8') as f:
        while True:
            # Re-query on every pass: the element references from the previous
            # pass belong to a page we navigated away from, and scrolling may
            # have loaded more entries since then.
            titles = driver.find_elements(By.CLASS_NAME, "title")
            # Check bounds before indexing so a short (or empty) list ends the
            # crawl cleanly instead of raising IndexError.
            if i >= len(titles):
                break

            title = titles[i]
            heading = title.text
            print(f"{i} {heading}")
            f.write(f"## {heading}\n\n")

            # Open the article and wait for its body to be present.
            title.click()
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "post-content"))
            )

            date = driver.find_element(By.CLASS_NAME, "date")
            f.write(f"{date.text}\n\n")

            # Images are kept as raw <img> HTML, written ahead of the text.
            images = driver.find_elements(By.CSS_SELECTOR, "div.post-content img")
            for image in images:
                image_html = image.get_attribute("outerHTML")
                f.write(f"{image_html}\n\n")

            content = driver.find_element(By.CLASS_NAME, "post-content")
            f.write(f"{content.text}\n\n")

            time.sleep(1)
            driver.back()
            time.sleep(1)

            # Scroll down to load all contents (twice, pausing after each).
            for _ in range(2):
                driver.execute_script(
                    "window.scrollTo(0, document.documentElement.scrollHeight);"
                )
                time.sleep(1)

            i += 1

    print("done")
    # Pause before the context manager quits the browser.
    time.sleep(30)