Published on

A selenium crawler for xiaobot.net

Authors
  • avatar
    Name
    Gene Zhang
    Twitter

Crawl articles from xiaobot.net (小报童) and generate a Markdown file.

A context manager is used to manage the Selenium session.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class SeleniumContextManager:
    """Own a Chrome WebDriver for the duration of a ``with`` block.

    Entering the block starts a new Chrome session and hands the driver to
    the caller; leaving the block quits that driver, whether or not the
    block raised.
    """

    def __enter__(self):
        """Start a Chrome session, remember it on the instance, and return it."""
        browser = webdriver.Chrome()
        self.driver = browser
        return browser

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the driver started by ``__enter__``.

        Nothing truthy is returned, so an exception raised inside the
        ``with`` block keeps propagating after the browser is closed.
        """
        browser = self.driver
        browser.quit()


# Crawl script: log in to xiaobot.net, open the "pmdogs7" column, then visit
# every listed article in turn and append its title, date, images and text to
# articles.md.

# The login wait has to cover a person scanning a QR code with their phone,
# so it gets a much longer timeout than an ordinary page load. The wait still
# returns as soon as the condition is met.
LOGIN_TIMEOUT_SECONDS = 120
PAGE_TIMEOUT_SECONDS = 10

with SeleniumContextManager() as driver:
    # Login
    driver.get("https://xiaobot.net/wechat_auth?redirect=/subscribed/")
    print("Please scan the QR code using your mobile device to log in.")

    # Elements with class "title" are used as the signal that the page shown
    # after login has rendered.
    # NOTE(review): assumes the QR-code page itself has no "title" element.
    WebDriverWait(driver, LOGIN_TIMEOUT_SECONDS).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "title"))
    )

    # Season 7 page
    driver.get("https://xiaobot.net/p/pmdogs7")

    WebDriverWait(driver, PAGE_TIMEOUT_SECONDS).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "title"))
    )

    print("Please scroll down the page manually to load all contents.")
    time.sleep(10)
    i = 1  # Skip the first one, which is an Ad

    # Explicit UTF-8: the scraped text is Chinese, and the platform default
    # encoding is not guaranteed to be able to represent it.
    with open('articles.md', 'a+', encoding='utf-8') as f:
        while True:
            # Look the titles up again on every pass: the previous pass
            # navigated away and back, and its scrolling may have loaded
            # more entries.
            titles = driver.find_elements(By.CLASS_NAME, "title")

            # Check bounds against the freshly fetched list before indexing,
            # so a short list ends the crawl instead of raising IndexError
            # and entries loaded by the last scroll are still visited.
            if i >= len(titles):
                break

            title = titles[i]
            print(f"{i} {title.text}")
            f.write(f"## {title.text}\n\n")
            title.click()
            WebDriverWait(driver, PAGE_TIMEOUT_SECONDS).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "post-content"))
            )

            date = driver.find_element(By.CLASS_NAME, "date")
            f.write(f"{date.text}\n\n")

            # Images are written as raw HTML tags ahead of the article text.
            images = driver.find_elements(By.CSS_SELECTOR, "div.post-content img")
            for image in images:
                image_html = image.get_attribute("outerHTML")
                f.write(f"{image_html}\n\n")

            content = driver.find_element(By.CLASS_NAME, "post-content")
            f.write(f"{content.text}\n\n")

            time.sleep(1)
            driver.back()
            time.sleep(1)
            # Scroll down to load all contents
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(1)
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(1)
            i += 1

    print("done")
    # Leave the browser window open for a while before the context manager
    # quits the driver.
    time.sleep(30)