Initial commit
.gitignore (vendored, Normal file)
@@ -0,0 +1,14 @@
# Python virtual environment
/env/
/venv/

# Python cache
__pycache__/
*.pyc

# Jupyter stuff
.ipynb_checkpoints/

# Output files
/last_pid.txt
/data.db
config.toml (Normal file)
@@ -0,0 +1,6 @@
[scraper]
start-pid = -1 # 86604
end-pid = 10_000_000

retry-delay = [8, 12]
next-delay = [5, 8]
requirements.txt (Normal file)
@@ -0,0 +1,16 @@
beautifulsoup4==4.13.4
bs4==0.0.2
certifi==2025.7.14
charset-normalizer==3.4.2
cloudscraper==1.2.71
decorator==5.2.1
idna==3.10
pillow==11.3.0
py==1.11.0
pyparsing==3.2.3
requests==2.32.4
requests-toolbelt==1.0.0
retry==0.9.2
soupsieve==2.7
typing_extensions==4.14.1
urllib3==2.5.0
scraper/__main__.py (Normal file)
@@ -0,0 +1,50 @@
import os
import sqlite3
import time
import random
import tomllib

import cloudscraper

from . import scraper as scraper_module
from .scraper import get_posts

with open("config.toml", "rb") as file:
    config = tomllib.load(file)

retry_delay = tuple(config["scraper"]["retry-delay"])
next_delay = tuple(config["scraper"]["next-delay"])

start_pid = config["scraper"]["start-pid"]
end_pid = config["scraper"]["end-pid"]

db = sqlite3.connect("data.db")
db.execute("CREATE TABLE IF NOT EXISTS post(id INT UNIQUE NOT NULL, image_dir INT NOT NULL, image_id TEXT NOT NULL, tags TEXT NOT NULL, thumbnail BLOB NOT NULL);")
db.commit()

last_exception: Exception | None = None

# start-pid = -1 means: resume from the offset saved in last_pid.txt
if start_pid == -1:
    if os.path.exists("last_pid.txt"):
        with open("last_pid.txt", "r") as file:
            start_pid = int(file.read().strip())
    else:
        start_pid = 0

# Each listing page holds 42 posts, so the pid offset advances in steps of 42
for pid in range(start_pid, end_pid, 42):
    print(pid)
    for _ in range(3):
        try:
            last_exception = None
            posts = get_posts(f"https://rule34.xxx/index.php?page=post&s=list&pid={pid}")
            break
        except Exception as e:
            last_exception = e
            print("Retrying")
            # Reset the Cloudflare session used by the scraper module before retrying
            scraper_module.scraper = cloudscraper.CloudScraper()
            time.sleep(random.randint(*retry_delay))
    if last_exception:
        raise last_exception
    post_values = list(map(lambda p: (p.id, p.image_dir, p.image_id, " ".join(p.tags), p.thumbnail_data), posts))
    db.executemany("INSERT OR REPLACE INTO post(id, image_dir, image_id, tags, thumbnail) VALUES(?, ?, ?, ?, ?)", post_values)
    db.commit()
    with open("last_pid.txt", "w") as file:
        file.write(str(pid))
    time.sleep(random.randint(*next_delay))
scraper/scraper.py (Normal file)
@@ -0,0 +1,96 @@
from bs4 import BeautifulSoup
import os
import base64
from threading import Thread
from retry import retry
from PIL import Image
import cloudscraper
import io


enable_cache = False


scraper = cloudscraper.CloudScraper()


def bs4(*argv, **kwargs) -> BeautifulSoup:
    return BeautifulSoup(*argv, **kwargs, features="html.parser")


@retry(Exception, tries=5, delay=3)
def get(url: str) -> bytes:
    # Optionally serve and store responses in a local cache keyed by the base32-encoded URL
    if enable_cache:
        if not os.path.exists("cache"):
            os.mkdir("cache")
        path = "cache/" + base64.b32encode(url.encode()).decode()
        if os.path.exists(path):
            with open(path, "rb") as file:
                return file.read()
    res = scraper.get(url)
    if res.status_code != 200:
        raise Exception(f"Failed to get {url}")
    data = res.content
    if enable_cache:
        with open(path, "wb") as file:
            file.write(data)
    return data


class Post:
    def __init__(self, id: int, image_dir: int, image_id: str, tags: list[str]):
        self.id = id
        self.image_dir = image_dir
        self.image_id = image_id
        self.tags = tags.copy()
        self.thumbnail_data: bytes | None = None
        self.thumbnail: Image.ImageFile.ImageFile | None = None

        # Download the thumbnail in the background; get_posts() joins this thread later
        def _thread():
            self.thumbnail_data = get(f"https://wimg.rule34.xxx/thumbnails/{self.image_dir}/thumbnail_{self.image_id}.jpg")
            self.thumbnail = Image.open(io.BytesIO(self.thumbnail_data))
        self._thread = Thread(target=_thread)
        self._thread.start()

    def _join(self):
        self._thread.join()

    def __str__(self):
        return f"<Post {self.id}:{self.image_dir}/{self.image_id}>"

    def __repr__(self):
        return self.__str__()


def get_posts(url: str) -> list[Post]:
    posts: list[Post] = []
    document = get(url)
    try:
        for entry in bs4(document).find_all("div", {"class": "image-list"})[0].children:
            # Skip garbage
            if str(entry).strip() == "": continue
            if entry.name != "span": continue

            # Extract image
            img = entry.find_all("img")[0]
            if "src" in img.attrs:
                img_src = img["src"].split("?")[0].split("/")[-2:]
            else:
                img_src = img["data-cfsrc"].split("?")[0].split("/")[-2:]

            # Append post
            posts.append(Post(
                int(entry["id"][1:]),
                int(img_src[0]),
                img_src[1].split("_")[1].split(".")[0],  # thumbnail_[id].jpg
                img["alt"].split(" "),
            ))

        # Wait for every thumbnail download thread to finish
        for post in posts:
            post._join()

        return posts

    except Exception as e:
        # Dump the page that failed to parse for later inspection
        with open("errored-document.html", "wb") as file:
            file.write(document)
        raise e