Initial commit

2025-07-26 22:52:02 +02:00
commit 46f7d3a27d
5 changed files with 182 additions and 0 deletions
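
A first pass at a rule34.xxx listing scraper. scraper/__main__.py walks the paginated post index 42 posts at a time, scraper/scraper.py parses each post's id, image location and tags out of the thumbnail grid and downloads the thumbnails on background threads, and everything lands in a local SQLite database (data.db). Progress is checkpointed to last_pid.txt so an interrupted run can resume; request pacing lives in config.toml.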

.gitignore

@@ -0,0 +1,14 @@
# Python virtual environment
/env/
/venv/
# Python cache
__pycache__/
*.pyc
# Jupyter stuff
.ipynb_checkpoints/
# Output files
/last_pid.txt
/data.db

config.toml

@@ -0,0 +1,6 @@
[scraper]
start-pid = -1 # 86604; -1 resumes from last_pid.txt
end-pid = 10_000_000
retry-delay = [8, 12]
next-delay = [5, 8]
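
Both delay keys are [min, max] ranges in whole seconds; __main__ draws a uniform random integer from each range before sleeping. A minimal sketch of how the file is consumed (this mirrors scraper/__main__.py; tomllib requires Python 3.11+):

    import random
    import tomllib

    with open("config.toml", "rb") as file:
        cfg = tomllib.load(file)["scraper"]
    # retry-delay = [8, 12] -> wait 8-12 whole seconds before retrying a page
    print(random.randint(*cfg["retry-delay"]))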

requirements.txt

@@ -0,0 +1,16 @@
beautifulsoup4==4.13.4
bs4==0.0.2
certifi==2025.7.14
charset-normalizer==3.4.2
cloudscraper==1.2.71
decorator==5.2.1
idna==3.10
pillow==11.3.0
py==1.11.0
pyparsing==3.2.3
requests==2.32.4
requests-toolbelt==1.0.0
retry==0.9.2
soupsieve==2.7
typing_extensions==4.14.1
urllib3==2.5.0

scraper/__main__.py

@@ -0,0 +1,50 @@
import os
import random
import sqlite3
import time
import tomllib

import cloudscraper

from . import scraper as scraper_module
from .scraper import get_posts

# Delays are [min, max] second ranges; the pid bounds come from config.toml.
with open("config.toml", "rb") as file:
    config = tomllib.load(file)
retry_delay = tuple(config["scraper"]["retry-delay"])
next_delay = tuple(config["scraper"]["next-delay"])
start_pid = config["scraper"]["start-pid"]
end_pid = config["scraper"]["end-pid"]

db = sqlite3.connect("data.db")
db.execute(
    "CREATE TABLE IF NOT EXISTS post("
    "id INT UNIQUE NOT NULL, image_dir INT NOT NULL, image_id TEXT NOT NULL, "
    "tags TEXT NOT NULL, thumbnail BLOB NOT NULL);"
)
db.commit()

last_exception: Exception | None = None

# start-pid == -1 means "resume from where the previous run stopped".
if start_pid == -1:
    if os.path.exists("last_pid.txt"):
        with open("last_pid.txt", "r") as file:
            start_pid = int(file.read().strip())
    else:
        start_pid = 0

# pid is a post offset; each listing page holds 42 posts, hence the step.
for pid in range(start_pid, end_pid, 42):
    print(pid)
    for _ in range(3):
        try:
            last_exception = None
            posts = get_posts(f"https://rule34.xxx/index.php?page=post&s=list&pid={pid}")
            break
        except Exception as e:
            last_exception = e
            print("Retrying")
            # Reset the module-level session used by scraper.get() so the
            # retry starts with a fresh Cloudflare clearance.
            scraper_module.scraper = cloudscraper.CloudScraper()
            time.sleep(random.randint(*retry_delay))
    if last_exception:
        raise last_exception
    post_values = [
        (p.id, p.image_dir, p.image_id, " ".join(p.tags), p.thumbnail_data)
        for p in posts
    ]
    db.executemany(
        "INSERT OR REPLACE INTO post(id, image_dir, image_id, tags, thumbnail) VALUES(?, ?, ?, ?, ?)",
        post_values,
    )
    db.commit()
    with open("last_pid.txt", "w") as file:
        file.write(str(pid))
    time.sleep(random.randint(*next_delay))
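
Because the package has a __main__.py, it is run as a module from the repository root (python -m scraper), which also makes config.toml, data.db and last_pid.txt resolve against the working directory. Once rows exist, the database can be inspected with the standard library plus Pillow (already pinned in requirements.txt); a small sketch, with column names taken from the CREATE TABLE above:

    import io
    import sqlite3

    from PIL import Image

    db = sqlite3.connect("data.db")
    rows = db.execute("SELECT id, image_dir, image_id, tags, thumbnail FROM post LIMIT 5")
    for id_, image_dir, image_id, tags, thumb in rows:
        print(id_, f"{image_dir}/{image_id}", tags.split(" ")[:3])
        print(Image.open(io.BytesIO(thumb)).size)  # thumbnails are stored as raw JPEG bytes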

scraper/scraper.py

@@ -0,0 +1,96 @@
import base64
import io
import os
from threading import Thread

import cloudscraper
from bs4 import BeautifulSoup
from PIL import Image
from retry import retry

enable_cache = False
scraper = cloudscraper.CloudScraper()

def bs4(*argv, **kwargs) -> BeautifulSoup:
    return BeautifulSoup(*argv, **kwargs, features="html.parser")

@retry(Exception, tries=5, delay=3)
def get(url: str) -> bytes:
    # Optional on-disk cache keyed by the base32-encoded URL.
    if enable_cache:
        if not os.path.exists("cache"):
            os.mkdir("cache")
        path = "cache/" + base64.b32encode(url.encode()).decode()
        if os.path.exists(path):
            with open(path, "rb") as file:
                return file.read()
    res = scraper.get(url)
    if res.status_code != 200:
        raise Exception(f"Failed to get {url}")
    data = res.content
    if enable_cache:
        with open(path, "wb") as file:
            file.write(data)
    return data

class Post:
    def __init__(self, id: int, image_dir: int, image_id: str, tags: list[str]):
        self.id = id
        self.image_dir = image_dir
        self.image_id = image_id
        self.tags = tags.copy()
        self.thumbnail_data: bytes | None = None
        self.thumbnail: Image.Image | None = None

        # Fetch the thumbnail in the background; get_posts() joins later.
        def _thread():
            self.thumbnail_data = get(
                f"https://wimg.rule34.xxx/thumbnails/{self.image_dir}/thumbnail_{self.image_id}.jpg"
            )
            self.thumbnail = Image.open(io.BytesIO(self.thumbnail_data))

        self._thread = Thread(target=_thread)
        self._thread.start()

    def _join(self):
        self._thread.join()

    def __str__(self):
        return f"<Post {self.id}:{self.image_dir}/{self.image_id}>"

    def __repr__(self):
        return self.__str__()

def get_posts(url: str) -> list[Post]:
    posts: list[Post] = []
    document = get(url)
    try:
        for entry in bs4(document).find_all("div", {"class": "image-list"})[0].children:
            # Skip whitespace-only nodes and anything that is not a post span
            if str(entry).strip() == "":
                continue
            if entry.name != "span":
                continue
            # Extract the thumbnail <img>; Cloudflare may move src to data-cfsrc
            img = entry.find_all("img")[0]
            if "src" in img.attrs:
                img_src = img["src"].split("?")[0].split("/")[-2:]
            else:
                img_src = img["data-cfsrc"].split("?")[0].split("/")[-2:]
            # Entry id looks like "s12345"; thumbnail file is thumbnail_[id].jpg
            posts.append(Post(
                int(entry["id"][1:]),
                int(img_src[0]),
                img_src[1].split("_")[1].split(".")[0],
                img["alt"].split(" "),
            ))
        # Wait for every thumbnail download to finish
        for post in posts:
            post._join()
        return posts
    except Exception:
        # Keep the page that broke parsing for offline inspection
        with open("errored-document.html", "wb") as file:
            file.write(document)
        raise
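
get_posts() also works standalone, outside the __main__ loop. A minimal sketch (the pid here is arbitrary; each Post fetches its thumbnail on a background thread and get_posts() joins them all before returning):

    from scraper.scraper import get_posts

    posts = get_posts("https://rule34.xxx/index.php?page=post&s=list&pid=0")
    for post in posts:
        print(post)                    # <Post id:image_dir/image_id>
        print(len(post.tags), "tags")
        if post.thumbnail:             # None if the background fetch failed
            print(post.thumbnail.size)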