Initial commit

2025-07-26 22:52:02 +02:00
commit 46f7d3a27d
5 changed files with 182 additions and 0 deletions
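
A first pass at a rule34.xxx listing scraper. scraper/__main__.py walks the paginated post index 42 posts at a time, scraper/scraper.py parses each post's id, image location and tags out of the thumbnail grid and downloads the thumbnails on background threads, and everything lands in a local SQLite database (data.db). Progress is checkpointed to last_pid.txt so an interrupted run can resume; request pacing lives in config.toml.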

.gitignore

@@ -0,0 +1,14 @@
# Python virtual environment
/env/
/venv/
# Python cache
__pycache__/
*.pyc
# Jupyter stuff
.ipynb_checkpoints/
# Output files
/last_pid.txt
/data.db

config.toml

@@ -0,0 +1,6 @@
[scraper]
start-pid = -1 # 86604; -1 resumes from last_pid.txt
end-pid = 10_000_000
retry-delay = [8, 12]
next-delay = [5, 8]
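
Both delay keys are [min, max] ranges in whole seconds; __main__ draws a uniform random integer from each range before sleeping. A minimal sketch of how the file is consumed (this mirrors scraper/__main__.py; tomllib requires Python 3.11+):

    import random
    import tomllib

    with open("config.toml", "rb") as file:
        cfg = tomllib.load(file)["scraper"]
    # retry-delay = [8, 12] -> wait 8-12 whole seconds before retrying a page
    print(random.randint(*cfg["retry-delay"]))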

requirements.txt

@@ -0,0 +1,16 @@
beautifulsoup4==4.13.4
bs4==0.0.2
certifi==2025.7.14
charset-normalizer==3.4.2
cloudscraper==1.2.71
decorator==5.2.1
idna==3.10
pillow==11.3.0
py==1.11.0
pyparsing==3.2.3
requests==2.32.4
requests-toolbelt==1.0.0
retry==0.9.2
soupsieve==2.7
typing_extensions==4.14.1
urllib3==2.5.0

scraper/__main__.py

@@ -0,0 +1,50 @@
import os
import random
import sqlite3
import time
import tomllib

import cloudscraper

from . import scraper as scraper_module
from .scraper import get_posts

# Delays are [min, max] second ranges; the pid bounds come from config.toml.
with open("config.toml", "rb") as file:
    config = tomllib.load(file)
retry_delay = tuple(config["scraper"]["retry-delay"])
next_delay = tuple(config["scraper"]["next-delay"])
start_pid = config["scraper"]["start-pid"]
end_pid = config["scraper"]["end-pid"]

db = sqlite3.connect("data.db")
db.execute(
    "CREATE TABLE IF NOT EXISTS post("
    "id INT UNIQUE NOT NULL, image_dir INT NOT NULL, image_id TEXT NOT NULL, "
    "tags TEXT NOT NULL, thumbnail BLOB NOT NULL);"
)
db.commit()

last_exception: Exception | None = None

# start-pid == -1 means "resume from where the previous run stopped".
if start_pid == -1:
    if os.path.exists("last_pid.txt"):
        with open("last_pid.txt", "r") as file:
            start_pid = int(file.read().strip())
    else:
        start_pid = 0

# pid is a post offset; each listing page holds 42 posts, hence the step.
for pid in range(start_pid, end_pid, 42):
    print(pid)
    for _ in range(3):
        try:
            last_exception = None
            posts = get_posts(f"https://rule34.xxx/index.php?page=post&s=list&pid={pid}")
            break
        except Exception as e:
            last_exception = e
            print("Retrying")
            # Reset the module-level session used by scraper.get() so the
            # retry starts with a fresh Cloudflare clearance.
            scraper_module.scraper = cloudscraper.CloudScraper()
            time.sleep(random.randint(*retry_delay))
    if last_exception:
        raise last_exception
    post_values = [
        (p.id, p.image_dir, p.image_id, " ".join(p.tags), p.thumbnail_data)
        for p in posts
    ]
    db.executemany(
        "INSERT OR REPLACE INTO post(id, image_dir, image_id, tags, thumbnail) VALUES(?, ?, ?, ?, ?)",
        post_values,
    )
    db.commit()
    with open("last_pid.txt", "w") as file:
        file.write(str(pid))
    time.sleep(random.randint(*next_delay))
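
Because the package has a __main__.py, it is run as a module from the repository root (python -m scraper), which also makes config.toml, data.db and last_pid.txt resolve against the working directory. Once rows exist, the database can be inspected with the standard library plus Pillow (already pinned in requirements.txt); a small sketch, with column names taken from the CREATE TABLE above:

    import io
    import sqlite3

    from PIL import Image

    db = sqlite3.connect("data.db")
    rows = db.execute("SELECT id, image_dir, image_id, tags, thumbnail FROM post LIMIT 5")
    for id_, image_dir, image_id, tags, thumb in rows:
        print(id_, f"{image_dir}/{image_id}", tags.split(" ")[:3])
        print(Image.open(io.BytesIO(thumb)).size)  # thumbnails are stored as raw JPEG bytes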

scraper/scraper.py

@@ -0,0 +1,96 @@
import base64
import io
import os
from threading import Thread

import cloudscraper
from bs4 import BeautifulSoup
from PIL import Image
from retry import retry

enable_cache = False
scraper = cloudscraper.CloudScraper()

def bs4(*argv, **kwargs) -> BeautifulSoup:
    return BeautifulSoup(*argv, **kwargs, features="html.parser")

@retry(Exception, tries=5, delay=3)
def get(url: str) -> bytes:
    # Optional on-disk cache keyed by the base32-encoded URL.
    if enable_cache:
        if not os.path.exists("cache"):
            os.mkdir("cache")
        path = "cache/" + base64.b32encode(url.encode()).decode()
        if os.path.exists(path):
            with open(path, "rb") as file:
                return file.read()
    res = scraper.get(url)
    if res.status_code != 200:
        raise Exception(f"Failed to get {url}")
    data = res.content
    if enable_cache:
        with open(path, "wb") as file:
            file.write(data)
    return data

class Post:
    def __init__(self, id: int, image_dir: int, image_id: str, tags: list[str]):
        self.id = id
        self.image_dir = image_dir
        self.image_id = image_id
        self.tags = tags.copy()
        self.thumbnail_data: bytes | None = None
        self.thumbnail: Image.Image | None = None

        # Fetch the thumbnail in the background; get_posts() joins later.
        def _thread():
            self.thumbnail_data = get(
                f"https://wimg.rule34.xxx/thumbnails/{self.image_dir}/thumbnail_{self.image_id}.jpg"
            )
            self.thumbnail = Image.open(io.BytesIO(self.thumbnail_data))

        self._thread = Thread(target=_thread)
        self._thread.start()

    def _join(self):
        self._thread.join()

    def __str__(self):
        return f"<Post {self.id}:{self.image_dir}/{self.image_id}>"

    def __repr__(self):
        return self.__str__()

def get_posts(url: str) -> list[Post]:
    posts: list[Post] = []
    document = get(url)
    try:
        for entry in bs4(document).find_all("div", {"class": "image-list"})[0].children:
            # Skip whitespace-only nodes and anything that is not a post span
            if str(entry).strip() == "":
                continue
            if entry.name != "span":
                continue
            # Extract the thumbnail <img>; Cloudflare may move src to data-cfsrc
            img = entry.find_all("img")[0]
            if "src" in img.attrs:
                img_src = img["src"].split("?")[0].split("/")[-2:]
            else:
                img_src = img["data-cfsrc"].split("?")[0].split("/")[-2:]
            # Entry id looks like "s12345"; thumbnail file is thumbnail_[id].jpg
            posts.append(Post(
                int(entry["id"][1:]),
                int(img_src[0]),
                img_src[1].split("_")[1].split(".")[0],
                img["alt"].split(" "),
            ))
        # Wait for every thumbnail download to finish
        for post in posts:
            post._join()
        return posts
    except Exception:
        # Keep the page that broke parsing for offline inspection
        with open("errored-document.html", "wb") as file:
            file.write(document)
        raise
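
get_posts() also works standalone, outside the __main__ loop. A minimal sketch (the pid here is arbitrary; each Post fetches its thumbnail on a background thread and get_posts() joins them all before returning):

    from scraper.scraper import get_posts

    posts = get_posts("https://rule34.xxx/index.php?page=post&s=list&pid=0")
    for post in posts:
        print(post)                    # <Post id:image_dir/image_id>
        print(len(post.tags), "tags")
        if post.thumbnail:             # None if the background fetch failed
            print(post.thumbnail.size)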