diff --git a/config.toml b/config.toml
deleted file mode 100644
index 0a9a48e..0000000
--- a/config.toml
+++ /dev/null
@@ -1,6 +0,0 @@
-[scraper]
-start-pid = -1 # 86604
-end-pid = 10_000_000
-
-retry-delay = [8, 12]
-next-delay = [5, 8]
\ No newline at end of file
diff --git a/py34/__init__.py b/py34/__init__.py
new file mode 100644
index 0000000..342fd78
--- /dev/null
+++ b/py34/__init__.py
@@ -0,0 +1,3 @@
+from .view import View, ViewTags
+from .post import Post
+from .list import List, ListException
diff --git a/py34/__main__.py b/py34/__main__.py
deleted file mode 100644
index 39b98f0..0000000
--- a/py34/__main__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .view import View
-
-
-view = View(11171862)
-view.get_image().show()
diff --git a/py34/dockid.py b/py34/dockid.py
new file mode 100644
index 0000000..29de63d
--- /dev/null
+++ b/py34/dockid.py
@@ -0,0 +1,34 @@
+from bs4 import BeautifulSoup
+
+
+def bs4(data: BeautifulSoup | str | bytes) -> BeautifulSoup:
+    if isinstance(data, BeautifulSoup):
+        return data
+    return BeautifulSoup(data, features="html.parser")
+
+
+def _is_header(doc: BeautifulSoup, label: str) -> bool:
+    try:
+        content = doc.find_all("div", attrs={"class": "content"})[0]
+        h1 = content.find_all("h1")[0]
+        return h1.text.strip().lower() == label.lower()
+    except Exception:
+        return False
+
+
+def is_captcha(data: BeautifulSoup | str | bytes) -> bool:
+    # TODO: Check if this entire function actually works
+    try:
+        h2 = bs4(data).find_all("h2")[0]
+        return h2.text.strip().lower() == "please enter the captcha to continue to rule34.xxx."
+    except IndexError:
+        # Pages without an <h2> are not the captcha interstitial
+        return False
+
+
+def is_nochick(data: BeautifulSoup | str | bytes) -> bool:
+    return _is_header(bs4(data), "nobody here but us chickens!")
+
+
+def is_toodeep(data: BeautifulSoup | str | bytes) -> bool:
+    return _is_header(bs4(data), "unable to search this deep in temporarily.")
diff --git a/py34/list.py b/py34/list.py
index 460d956..3171ac0 100644
--- a/py34/list.py
+++ b/py34/list.py
@@ -1,5 +1,6 @@
 from .post import Post
 from .scraper import scraper
+from .dockid import is_nochick, is_toodeep
 import urllib.parse
 
 from threading import Thread
@@ -18,6 +19,12 @@ class List:
         document = scraper.get_html(f"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={offset}")
         threads: list[Thread] = []
 
+        if is_nochick(document):
+            return []
+
+        if is_toodeep(document):
+            raise ListException(document, "Search too deep")
+
         try:
             for entry in document.find_all("div", {"class": "image-list"})[0].children:
                 # Skip garbage
diff --git a/py34/scraper.py b/py34/scraper.py
index e13d8f1..d7c92b5 100644
--- a/py34/scraper.py
+++ b/py34/scraper.py
@@ -1,3 +1,4 @@
+from .dockid import bs4
 from cloudscraper import CloudScraper
 from requests import Response
 from retry import retry
@@ -31,7 +32,7 @@ class Scraper:
         return self._get(url)
 
     def get_html(self, url: str, retry: bool = True) -> BeautifulSoup:
-        return BeautifulSoup(self.get(url=url, retry=retry), features="html.parser")
+        return bs4(self.get(url=url, retry=retry))
 
 
 # Default scraper
diff --git a/py34/view.py b/py34/view.py
index 6713f9d..c294fa8 100644
--- a/py34/view.py
+++ b/py34/view.py
@@ -13,6 +13,10 @@ class ViewTags:
 
         self.meta = met.copy()
 
+    def to_list(self) -> list[str]:
+        return [tag for tag in self]
+
+
     def __iter__(self):
         for tag in self.copyright:
             yield tag
diff --git a/scraper/__main__.py b/scraper/__main__.py
deleted file mode 100644
index f819333..0000000
--- a/scraper/__main__.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import os
-import sqlite3
-import time
-import random
-import tomllib
-
-from .scraper import get_posts, scraper
-import cloudscraper
-
-with open("config.toml", "rb") as file:
-    config = tomllib.load(file)
-
-retry_delay = tuple(config["scraper"]["retry-delay"])
-next_delay = tuple(config["scraper"]["next-delay"])
-
-start_pid = config["scraper"]["start-pid"]
-end_pid = config["scraper"]["end-pid"]
-
-db = sqlite3.connect("data.db")
-db.execute("CREATE TABLE IF NOT EXISTS post(id INT UNIQUE NOT NULL, image_dir INT NOT NULL, image_id TEXT NOT NULL, tags TEXT NOT NULL, thumbnail BLOB NOT NULL);")
-db.commit()
-
-last_exception: Exception | None = None
-
-if start_pid == -1:
-    if os.path.exists("last_pid.txt"):
-        with open("last_pid.txt", "r") as file:
-            start_pid = int(file.read().strip())
-    else:
-        start_pid = 0
-
-for pid in range(start_pid, end_pid, 42):
-    print(pid)
-    for _ in range(3):
-        try:
-            last_exception = None
-            posts = get_posts(f"https://rule34.xxx/index.php?page=post&s=list&tags=all&pid={pid}")
-            break
-        except Exception as e:
-            last_exception = e
-            print("Retrying")
-            scraper = cloudscraper.CloudScraper()
-            time.sleep(random.randint(*retry_delay))
-    if last_exception:
-        raise last_exception
-    post_values = list(map(lambda p: (p.id, p.image_dir, p.image_id, " ".join(p.tags), p.thumbnail_data), posts))
-    db.executemany("INSERT OR REPLACE INTO post(id, image_dir, image_id, tags, thumbnail) VALUES(?, ?, ?, ?, ?)", post_values)
-    db.commit()
-    with open("last_pid.txt", "w") as file:
-        file.write(str(pid))
-    time.sleep(random.randint(*next_delay))
diff --git a/scraper/scraper.py b/scraper/scraper.py
deleted file mode 100644
index b6a7ff6..0000000
--- a/scraper/scraper.py
+++ /dev/null
@@ -1,96 +0,0 @@
-from bs4 import BeautifulSoup
-import os
-import base64
-from threading import Thread
-from retry import retry
-from PIL import Image
-import cloudscraper
-import io
-
-
-enable_cache = False
-
-
-scraper = cloudscraper.CloudScraper()
-
-
-def bs4(*argv, **kwargs) -> BeautifulSoup:
-    return BeautifulSoup(*argv, **kwargs, features="html.parser")
-
-@retry(Exception, tries=5, delay=3)
-def get(url: str) -> bs4:
-    if enable_cache:
-        if not os.path.exists("cache"):
-            os.mkdir("cache")
-        path = "cache/" + base64.b32encode(url.encode()).decode()
-        if os.path.exists(path):
-            with open(path, "rb") as file:
-                return file.read()
-    res = scraper.get(url)
-    if res.status_code != 200:
-        raise Exception(f"Failed to get {url}")
-    data = res.content
-    if enable_cache:
-        with open(path, "wb") as file:
-            file.write(data)
-    return data
-
-
-class Post:
-    def __init__(self, id: int, image_dir: int, image_id: str, tags: list[str]):
-        self.id = id
-        self.image_dir = image_dir
-        self.image_id = image_id
-        self.tags = tags.copy()
-        self.thumbnail_data: bytes | None = None
-        self.thumbnail: Image.ImageFile.ImageFile | None = None
-
-        def _thread():
-            self.thumbnail_data = get(f"https://wimg.rule34.xxx/thumbnails/{self.image_dir}/thumbnail_{self.image_id}.jpg")
-            self.thumbnail = Image.open(io.BytesIO(self.thumbnail_data))
-        self._thread = Thread(target=_thread)
-        self._thread.start()
-
-    def _join(self):
-        self._thread.join()
-
-    def __str__(self):
-        return f""
-
-    def __repr__(self):
-        return self.__str__()
-
-def get_posts(url: str) -> list[Post]:
-    posts: list[Post] = []
-    document = get(url)
-    try:
-        for entry in bs4(document).find_all("div", {"class": "image-list"})[0].children:
-            # Skip garbage
-            if str(entry).strip() == "": continue
-            if entry.name != "span": continue
-
-            # Extract image
-            img = entry.find_all("img")[0]
-            if "src" in img.attrs:
-                img_src = img["src"].split("?")[0].split("/")[-2:]
-            else:
-                img_src = img["data-cfsrc"].split("?")[0].split("/")[-2:]
-
-            # Append post
-            posts.append(Post(
-                int(entry["id"][1:]),
-                int(img_src[0]),
-                img_src[1].split("_")[1].split(".")[0], # Thumbnail_[id].jpg
-                img["alt"].split(" "),
-            ))
-
-        # Process posts
-        for post in posts:
-            post._join()
-
-        return posts
-
-    except Exception as e:
-        with open("errored-document.html", "wb") as file:
-            file.write(document)
-        raise e
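
Usage sketch (not part of the patch): a minimal example of how the new dockid helpers added above might be exercised against canned HTML. The two snippets are made-up stand-ins for the real rule34.xxx error pages, and the import assumes the py34 package is importable.

    from py34.dockid import bs4, is_nochick, is_toodeep

    # Hypothetical stand-ins for the "no results" and "search too deep" pages
    nochick_html = '<div class="content"><h1>Nobody here but us chickens!</h1></div>'
    toodeep_html = '<div class="content"><h1>Unable to search this deep in temporarily.</h1></div>'

    doc = bs4(nochick_html)               # str/bytes are coerced to BeautifulSoup
    print(is_nochick(doc))                # True -> the new List check returns an empty page
    print(is_toodeep(bs4(toodeep_html)))  # True -> the new List check raises ListException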