From 46f7d3a27df23a9f6f4ac3e40a8656e30203373c Mon Sep 17 00:00:00 2001
From: Tomuxs
Date: Sat, 26 Jul 2025 22:52:02 +0200
Subject: [PATCH] Initial commit

---
 .gitignore          | 14 ++++
 config.toml         |  6 ++
 requirements.txt    | 16 +++++
 scraper/__main__.py | 54 ++++++++++++++++
 scraper/scraper.py  | 97 +++++++++++++++++++++++++
 5 files changed, 187 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 config.toml
 create mode 100644 requirements.txt
 create mode 100644 scraper/__main__.py
 create mode 100644 scraper/scraper.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e417e70
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+# Python virtual environment
+/env/
+/venv/
+
+# Python cache
+__pycache__/
+*.pyc
+
+# Jupyter stuff
+.ipynb_checkpoints/
+
+# Output files
+/last_pid.txt
+/data.db
diff --git a/config.toml b/config.toml
new file mode 100644
index 0000000..0a9a48e
--- /dev/null
+++ b/config.toml
@@ -0,0 +1,6 @@
+[scraper]
+start-pid = -1 # 86604
+end-pid = 10_000_000
+
+retry-delay = [8, 12]
+next-delay = [5, 8]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..58c4edd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+beautifulsoup4==4.13.4
+bs4==0.0.2
+certifi==2025.7.14
+charset-normalizer==3.4.2
+cloudscraper==1.2.71
+decorator==5.2.1
+idna==3.10
+pillow==11.3.0
+py==1.11.0
+pyparsing==3.2.3
+requests==2.32.4
+requests-toolbelt==1.0.0
+retry==0.9.2
+soupsieve==2.7
+typing_extensions==4.14.1
+urllib3==2.5.0
diff --git a/scraper/__main__.py b/scraper/__main__.py
new file mode 100644
index 0000000..480c3a4
--- /dev/null
+++ b/scraper/__main__.py
@@ -0,0 +1,54 @@
+import os
+import sqlite3
+import time
+import random
+import tomllib
+import cloudscraper
+
+from . import scraper as scraper_module
+from .scraper import get_posts
+
+with open("config.toml", "rb") as file:
+    config = tomllib.load(file)
+
+retry_delay = tuple(config["scraper"]["retry-delay"])
+next_delay = tuple(config["scraper"]["next-delay"])
+
+start_pid = config["scraper"]["start-pid"]
+end_pid = config["scraper"]["end-pid"]
+
+db = sqlite3.connect("data.db")
+db.execute("CREATE TABLE IF NOT EXISTS post(id INT UNIQUE NOT NULL, image_dir INT NOT NULL, image_id TEXT NOT NULL, tags TEXT NOT NULL, thumbnail BLOB NOT NULL);")
+db.commit()
+
+last_exception: Exception | None = None
+
+# Resume from the last saved page offset unless one is set in config.toml
+if start_pid == -1:
+    if os.path.exists("last_pid.txt"):
+        with open("last_pid.txt", "r") as file:
+            start_pid = int(file.read().strip())
+    else:
+        start_pid = 0
+
+for pid in range(start_pid, end_pid, 42):
+    print(pid)
+    for _ in range(3):
+        try:
+            last_exception = None
+            posts = get_posts(f"https://rule34.xxx/index.php?page=post&s=list&pid={pid}")
+            break
+        except Exception as e:
+            last_exception = e
+            print("Retrying")
+            # Recreate the shared Cloudflare session used by the scraper before retrying
+            scraper_module.scraper = cloudscraper.CloudScraper()
+            time.sleep(random.randint(*retry_delay))
+    if last_exception:
+        raise last_exception
+    post_values = list(map(lambda p: (p.id, p.image_dir, p.image_id, " ".join(p.tags), p.thumbnail_data), posts))
+    db.executemany("INSERT OR REPLACE INTO post(id, image_dir, image_id, tags, thumbnail) VALUES(?, ?, ?, ?, ?)", post_values)
+    db.commit()
+    with open("last_pid.txt", "w") as file:
+        file.write(str(pid))
+    time.sleep(random.randint(*next_delay))
diff --git a/scraper/scraper.py b/scraper/scraper.py
new file mode 100644
index 0000000..b6a7ff6
--- /dev/null
+++ b/scraper/scraper.py
@@ -0,0 +1,97 @@
+from bs4 import BeautifulSoup
+import os
+import base64
+from threading import Thread
+from retry import retry
+from PIL import Image
+import cloudscraper
+import io
+
+
+enable_cache = False
+
+
+scraper = cloudscraper.CloudScraper()
+
+
+def bs4(*argv, **kwargs) -> BeautifulSoup:
+    return BeautifulSoup(*argv, **kwargs, features="html.parser")
+
+@retry(Exception, tries=5, delay=3)
+def get(url: str) -> bytes:
+    if enable_cache:
+        if not os.path.exists("cache"):
+            os.mkdir("cache")
+        path = "cache/" + base64.b32encode(url.encode()).decode()
+        if os.path.exists(path):
+            with open(path, "rb") as file:
+                return file.read()
+    res = scraper.get(url)
+    if res.status_code != 200:
+        raise Exception(f"Failed to get {url}")
+    data = res.content
+    if enable_cache:
+        with open(path, "wb") as file:
+            file.write(data)
+    return data
+
+
+class Post:
+    def __init__(self, id: int, image_dir: int, image_id: str, tags: list[str]):
+        self.id = id
+        self.image_dir = image_dir
+        self.image_id = image_id
+        self.tags = tags.copy()
+        self.thumbnail_data: bytes | None = None
+        self.thumbnail: Image.ImageFile.ImageFile | None = None
+
+        # Download the thumbnail in a background thread so a page of posts loads concurrently
+        def _thread():
+            self.thumbnail_data = get(f"https://wimg.rule34.xxx/thumbnails/{self.image_dir}/thumbnail_{self.image_id}.jpg")
+            self.thumbnail = Image.open(io.BytesIO(self.thumbnail_data))
+        self._thread = Thread(target=_thread)
+        self._thread.start()
+
+    def _join(self):
+        self._thread.join()
+
+    def __str__(self):
+        return f"<Post id={self.id} image_id={self.image_id}>"
+
+    def __repr__(self):
+        return self.__str__()
+
+def get_posts(url: str) -> list[Post]:
+    posts: list[Post] = []
+    document = get(url)
+    try:
+        for entry in bs4(document).find_all("div", {"class": "image-list"})[0].children:
+            # Skip garbage
+            if str(entry).strip() == "": continue
+            if entry.name != "span": continue
+
+            # Extract image
+            img = entry.find_all("img")[0]
+            if "src" in img.attrs:
+                img_src = img["src"].split("?")[0].split("/")[-2:]
+            else:
+                img_src = img["data-cfsrc"].split("?")[0].split("/")[-2:]
+
+            # Append post
+            posts.append(Post(
+                int(entry["id"][1:]),
+                int(img_src[0]),
+                img_src[1].split("_")[1].split(".")[0],  # thumbnail_[id].jpg
+                img["alt"].split(" "),
+            ))
+
+        # Process posts
+        for post in posts:
+            post._join()
+
+        return posts
+
+    except Exception as e:
+        with open("errored-document.html", "wb") as file:
+            file.write(document)
+        raise e
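
Note for reviewers (not part of the patch): the scraper is started with `python -m scraper` from the
repository root so that the relative imports and the config.toml path resolve. A minimal sketch of
reading the scraped rows back out of data.db, assuming the schema created in __main__.py above (the
snippet itself and its LIMIT/column choices are illustrative, not part of this commit):

import io
import sqlite3

from PIL import Image

db = sqlite3.connect("data.db")
# Each row is (id, image_dir, image_id, tags, thumbnail): tags is a space-separated string
# and thumbnail holds the raw JPEG bytes fetched by scraper.Post.
for post_id, image_dir, image_id, tags, thumbnail in db.execute(
        "SELECT id, image_dir, image_id, tags, thumbnail FROM post LIMIT 5"):
    image = Image.open(io.BytesIO(thumbnail))
    print(post_id, image.size, tags.split(" ")[:5])
db.close()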