Files
py34/scraper/__main__.py
2025-07-26 22:52:02 +02:00

51 lines
1.6 KiB
Python

import os
import sqlite3
import time
import random
import tomllib
from .scraper import get_posts
# Read the scraper settings from config.toml; tomllib needs a binary handle.
with open("config.toml", "rb") as config_file:
    config = tomllib.load(config_file)

scraper_settings = config["scraper"]

# Delay settings are stored as tuples; later in this script they are unpacked
# into random.randint(...) and the result is passed to time.sleep(), so each
# one acts as a (low, high) range in seconds.
retry_delay = tuple(scraper_settings["retry-delay"])
next_delay = tuple(scraper_settings["next-delay"])

# Bounds of the pid range to walk. A start-pid of -1 is treated further down
# as "resume from last_pid.txt".
start_pid = scraper_settings["start-pid"]
end_pid = scraper_settings["end-pid"]
db = sqlite3.connect("data.db")
db.execute("CREATE TABLE IF NOT EXISTS post(id INT UNIQUE NOT NULL, image_dir INT NOT NULL, image_id TEXT NOT NULL, tags TEXT NOT NULL, thumbnail BLOB NOT NULL);")
db.commit()
last_exception: Exception | None = None
# A configured start of -1 means "resume": continue from the pid recorded in
# last_pid.txt, or begin at 0 when no such file exists yet.
if start_pid == -1:
    if not os.path.exists("last_pid.txt"):
        start_pid = 0
    else:
        with open("last_pid.txt", "r") as checkpoint:
            saved_pid = checkpoint.read().strip()
            start_pid = int(saved_pid)
# Walk the listing from start_pid up to (but excluding) end_pid in steps of
# 42, storing every fetched batch of posts and checkpointing after each one.
# NOTE(review): 42 is presumably the number of posts per listing page; confirm.
for pid in range(start_pid, end_pid, 42):
    print(pid)
    page_url = f"https://rule34.xxx/index.php?page=post&s=list&pid={pid}"

    # Fetch the page, retrying on any error with a random pause (bounds taken
    # from retry_delay) between attempts. If every attempt fails, the last
    # error is re-raised and the script stops.
    # NOTE: no scraper session is rebuilt here. Only get_posts is imported
    # from .scraper, so any session reset has to happen inside that module.
    max_attempts = 3
    last_exception = None
    for attempt in range(1, max_attempts + 1):
        try:
            posts = get_posts(page_url)
        except Exception as e:
            last_exception = e
            # Only announce and wait when another attempt will follow.
            if attempt < max_attempts:
                print("Retrying")
                time.sleep(random.randint(*retry_delay))
        else:
            last_exception = None
            break
    if last_exception is not None:
        raise last_exception

    # One row per post; the tag list is flattened into a single
    # space-separated string for the TEXT column.
    post_values = [
        (p.id, p.image_dir, p.image_id, " ".join(p.tags), p.thumbnail_data)
        for p in posts
    ]
    # INSERT OR REPLACE lets a re-scraped post overwrite its earlier row.
    db.executemany("INSERT OR REPLACE INTO post(id, image_dir, image_id, tags, thumbnail) VALUES(?, ?, ?, ?, ?)", post_values)
    db.commit()

    # Record progress only after the commit, so a restart never skips a page
    # whose rows were not saved.
    with open("last_pid.txt", "w") as file:
        file.write(str(pid))

    # Pause a random number of seconds (bounds from next_delay) before
    # requesting the next page.
    time.sleep(random.randint(*next_delay))