# File listing header: Python, 51 lines, 1.6 KiB
import os
|
|
import sqlite3
|
|
import time
|
|
import random
|
|
import tomllib
|
|
|
|
from .scraper import get_posts
|
|
|
|
# Load the scraper settings from config.toml in the current working directory.
# tomllib only accepts a binary file object, hence the "rb" mode.
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

_scraper_settings = config["scraper"]

# Both delays are later unpacked as the arguments of random.randint(), so each
# is expected to be a (low, high) pair — NOTE(review): not validated here.
retry_delay = tuple(_scraper_settings["retry-delay"])
next_delay = tuple(_scraper_settings["next-delay"])

# Bounds of the pid range the script walks. A start-pid of -1 is handled
# further down as "resume from last_pid.txt".
start_pid = _scraper_settings["start-pid"]
end_pid = _scraper_settings["end-pid"]
|
|
|
|
# Schema of the single table this script writes to. `id` is UNIQUE, which is
# what lets the INSERT OR REPLACE in the scrape loop overwrite a post that was
# already stored instead of duplicating it.
_CREATE_POST_TABLE = (
    "CREATE TABLE IF NOT EXISTS post("
    "id INT UNIQUE NOT NULL, "
    "image_dir INT NOT NULL, "
    "image_id TEXT NOT NULL, "
    "tags TEXT NOT NULL, "
    "thumbnail BLOB NOT NULL);"
)

# Open (or create) data.db in the current working directory and make sure the
# table exists before any rows are written.
db = sqlite3.connect("data.db")
db.execute(_CREATE_POST_TABLE)
db.commit()
|
|
|
|
# Holds the most recent fetch error, if any; the scrape loop below sets it.
last_exception: Exception | None = None

# A configured start-pid of -1 means "resume": continue from the pid recorded
# in last_pid.txt, or start from 0 when no such file exists yet.
if start_pid == -1:
    if not os.path.exists("last_pid.txt"):
        start_pid = 0
    else:
        with open("last_pid.txt", "r") as file:
            start_pid = int(file.read().strip())
|
|
|
|
# Step between consecutive pid values requested from the post listing.
# NOTE(review): presumably the number of posts per listing page — confirm.
PID_STEP = 42
# Total number of times one page fetch is tried before the run is aborted.
MAX_FETCH_ATTEMPTS = 3

# Walk the listing from start_pid up to (but excluding) end_pid, storing every
# page's posts in the database and recording progress after each page.
for pid in range(start_pid, end_pid, PID_STEP):
    print(pid)

    # Fetch the page; on any failure wait a random interval and try again.
    last_exception = None
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        try:
            posts = get_posts(f"https://rule34.xxx/index.php?page=post&s=list&pid={pid}")
        except Exception as e:
            last_exception = e
            # Only announce and wait when another attempt will actually follow.
            if attempt < MAX_FETCH_ATTEMPTS:
                print("Retrying")
                time.sleep(random.randint(*retry_delay))
        else:
            last_exception = None
            break

    # Every attempt failed: surface the most recent error and stop the run.
    if last_exception is not None:
        raise last_exception

    # Flatten each post into a row; the tag list is stored space-separated.
    post_values = [
        (p.id, p.image_dir, p.image_id, " ".join(p.tags), p.thumbnail_data)
        for p in posts
    ]
    db.executemany(
        "INSERT OR REPLACE INTO post(id, image_dir, image_id, tags, thumbnail) VALUES(?, ?, ?, ?, ?)",
        post_values,
    )
    db.commit()

    # Record progress only after the commit, so a resumed run never skips a
    # page whose rows were not persisted.
    with open("last_pid.txt", "w") as file:
        file.write(str(pid))

    # Pause a random interval before requesting the next page.
    time.sleep(random.randint(*next_delay))