From d112ea4ce930026f3627fca1b2d49ab3f00a4ab4 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Mon, 4 Aug 2025 14:34:04 +0200 Subject: [PATCH 01/23] Added threadpool --- py34/list.py | 13 +++++-------- py34/thread.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 py34/thread.py diff --git a/py34/list.py b/py34/list.py index 8bc44b7..e8491c2 100644 --- a/py34/list.py +++ b/py34/list.py @@ -2,8 +2,8 @@ from .post import Post from .scraper import scraper, ScraperException from .url import parse_thumbnail_url from .dockid import is_nochick, is_toodeep +from .thread import Pool import urllib.parse -from threading import Thread class ListException(Exception): @@ -55,13 +55,10 @@ class List: # Download thumbnails if fetch_thumbnails: - threads = [Thread(target=Post.get_thumbnail, args=(post,)) for post in self.posts] - - for thread in threads: - thread.start() - - for thread in threads: - thread.join() + pool = Pool() + for post in self.posts: + pool.submit(Post.get_thumbnail_data, post) + pool.join() except ScraperException as ex: raise ex diff --git a/py34/thread.py b/py34/thread.py new file mode 100644 index 0000000..a15b37b --- /dev/null +++ b/py34/thread.py @@ -0,0 +1,37 @@ +from threading import Thread +from typing import Callable + + +class Pool: + def __init__(self, max_workers: int = 5): + self.max_workers = max_workers + self.jobs: list[Thread] = [] + self.workers: list[Thread] = [] + + def submit(self, func: Callable, *vargs, **kwargs): + def proc(self, func: Callable, *vargs, **kwargs): + func(*vargs, **kwargs) + self._pool_proc() + + self.jobs.append(Thread( + target = proc, + args = (self, func, *vargs, ), + kwargs = kwargs + )) + + self._pool_proc() + + def join(self): + while len(self.workers) != 0: + self.workers[-1].join() + self._pool_proc() + + def _pool_proc(self): + # Remove any dead workers + self.workers = list(filter(Thread.is_alive, self.workers)) + + # Process jobs if any + while 
len(self.workers) < self.max_workers and len(self.jobs) != 0: + job = self.jobs.pop() + job.start() + self.workers.append(job) From 8dd0b576a2706d9dc845fc4b603d79a0fe7fe503 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Mon, 4 Aug 2025 14:40:38 +0200 Subject: [PATCH 02/23] Removed empty "tags" --- py34/list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py34/list.py b/py34/list.py index e8491c2..c17c120 100644 --- a/py34/list.py +++ b/py34/list.py @@ -50,7 +50,7 @@ class List: post_id, img_src.dir, img_src.id, - img["alt"].split(" "), + sorted(list(filter(bool, map(str.strip, img["alt"].split(" "))))), )) # Download thumbnails From 6834a2d0567c67b6584833b9f88a7375ecab2607 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Mon, 4 Aug 2025 14:58:07 +0200 Subject: [PATCH 03/23] Retry if received captcha --- py34/list.py | 9 +++++++-- py34/scraper.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/py34/list.py b/py34/list.py index c17c120..2d46627 100644 --- a/py34/list.py +++ b/py34/list.py @@ -1,9 +1,10 @@ from .post import Post -from .scraper import scraper, ScraperException +from .scraper import scraper, ScraperException, CaptchaException from .url import parse_thumbnail_url -from .dockid import is_nochick, is_toodeep +from .dockid import is_nochick, is_toodeep, is_captcha from .thread import Pool import urllib.parse +from retry import retry class ListException(Exception): @@ -12,6 +13,7 @@ class ListException(Exception): self.document = document +@retry(CaptchaException, tries=5, delay=3, jitter=2) class List: def __init__(self, tags: list[str], offset: int = 0, fetch_thumbnails: bool = True): self.posts: list[Post] = [] @@ -25,6 +27,9 @@ class List: if is_toodeep(document): raise ListException(document, "Search to deep") + if is_captcha(document): + raise CaptchaException("Received captcha") + try: for entry in document.find_all("div", {"class": "image-list"})[0].children: # Skip garbage diff --git a/py34/scraper.py 
b/py34/scraper.py index 01bcf06..ae10bd0 100644 --- a/py34/scraper.py +++ b/py34/scraper.py @@ -11,6 +11,16 @@ class ScraperException(Exception): self.response = res +class CaptchaException(Exception): + def __init__(self, *argv: any): + # Reset scraper + global scraper + scraper = Scraper() + + # Construct the exception + super(Exception, *argv) + + class Scraper: def __init__(self): self._scraper: CloudScraper = CloudScraper() From 3831b951610f658bedc17484ce4a2f9b324e949e Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Wed, 6 Aug 2025 04:35:18 +0200 Subject: [PATCH 04/23] Changed to ThreadPoolExecutor --- py34/list.py | 9 ++++----- py34/thread.py | 37 ------------------------------------- 2 files changed, 4 insertions(+), 42 deletions(-) delete mode 100644 py34/thread.py diff --git a/py34/list.py b/py34/list.py index 2d46627..b02cc16 100644 --- a/py34/list.py +++ b/py34/list.py @@ -2,7 +2,7 @@ from .post import Post from .scraper import scraper, ScraperException, CaptchaException from .url import parse_thumbnail_url from .dockid import is_nochick, is_toodeep, is_captcha -from .thread import Pool +from concurrent.futures import ThreadPoolExecutor import urllib.parse from retry import retry @@ -60,10 +60,9 @@ class List: # Download thumbnails if fetch_thumbnails: - pool = Pool() - for post in self.posts: - pool.submit(Post.get_thumbnail_data, post) - pool.join() + with ThreadPoolExecutor(max_workers=5) as pool: + for post in self.posts: + pool.submit(Post.get_thumbnail_data, post) except ScraperException as ex: raise ex diff --git a/py34/thread.py b/py34/thread.py deleted file mode 100644 index a15b37b..0000000 --- a/py34/thread.py +++ /dev/null @@ -1,37 +0,0 @@ -from threading import Thread -from typing import Callable - - -class Pool: - def __init__(self, max_workers: int = 5): - self.max_workers = max_workers - self.jobs: list[Thread] = [] - self.workers: list[Thread] = [] - - def submit(self, func: Callable, *vargs, **kwargs): - def proc(self, func: Callable, 
*vargs, **kwargs): - func(*vargs, **kwargs) - self._pool_proc() - - self.jobs.append(Thread( - target = proc, - args = (self, func, *vargs, ), - kwargs = kwargs - )) - - self._pool_proc() - - def join(self): - while len(self.workers) != 0: - self.workers[-1].join() - self._pool_proc() - - def _pool_proc(self): - # Remove any dead workers - self.workers = list(filter(Thread.is_alive, self.workers)) - - # Process jobs if any - while len(self.workers) < self.max_workers and len(self.jobs) != 0: - job = self.jobs.pop() - job.start() - self.workers.append(job) From 435fcd2fc3a6e8e3139423f896e1acd1675d355a Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Fri, 8 Aug 2025 06:01:20 +0200 Subject: [PATCH 05/23] Add "is None" checks, instead of inferring --- py34/post.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py34/post.py b/py34/post.py index 93a344a..cd8550a 100644 --- a/py34/post.py +++ b/py34/post.py @@ -13,7 +13,7 @@ class Post: self.tags: list[str] = tags.copy() self._thumbnail_data: bytes | None = thumbnail - self._thumbnail: ImageFile | None = Image.open(BytesIO(thumbnail)) if thumbnail else None + self._thumbnail: ImageFile | None = None self._image_format: str | None = None self._image_data: bytes | None = None @@ -21,14 +21,14 @@ class Post: def get_thumbnail(self) -> ImageFile: - if self._thumbnail: + if self._thumbnail is not None: return self._thumbnail self._thumbnail = Image.open(BytesIO(self.get_thumbnail_data())) return self._thumbnail def get_thumbnail_data(self) -> bytes: - if self._thumbnail_data: + if self._thumbnail_data is not None: return self._thumbnail_data self._thumbnail_data = scraper.get(ThumbnailURL(self.image_dir, self.image_id)) return self._thumbnail_data From 41ecfaec9019b0c79bd7006a445c3688bd981d3a Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Fri, 8 Aug 2025 23:40:23 +0200 Subject: [PATCH 06/23] Added SampleURL --- py34/url.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/py34/url.py b/py34/url.py 
index 125f21f..e5568bf 100644 --- a/py34/url.py +++ b/py34/url.py @@ -15,6 +15,16 @@ class ImageURL: return f"https://wimg.rule34.xxx//images/{self.dir}/{self.id}.{self.format}" +class SampleURL: + def __init__(self, image_dir: int, image_id: str, image_format: str): + self.dir: int = image_dir + self.id: str = image_id + self.format: str = image_format.lstrip(".") + + def __str__(self) -> str: + return f"https://rule34.xxx//samples/{self.dir}/sample_{self.id}.{self.format}" + + class ThumbnailURL: def __init__(self, image_dir: int, image_id: str): self.dir: int = image_dir From f879535b25e93307790459b89c25d1288a000fee Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 01:07:29 +0200 Subject: [PATCH 07/23] Samples are always JPG --- py34/url.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/py34/url.py b/py34/url.py index e5568bf..a2f0e5b 100644 --- a/py34/url.py +++ b/py34/url.py @@ -16,13 +16,12 @@ class ImageURL: class SampleURL: - def __init__(self, image_dir: int, image_id: str, image_format: str): + def __init__(self, image_dir: int, image_id: str): self.dir: int = image_dir self.id: str = image_id - self.format: str = image_format.lstrip(".") def __str__(self) -> str: - return f"https://rule34.xxx//samples/{self.dir}/sample_{self.id}.{self.format}" + return f"https://rule34.xxx//samples/{self.dir}/sample_{self.id}.jpg" class ThumbnailURL: From e4814f8a37852648bf9592c5f1016b16a74be775 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 02:04:45 +0200 Subject: [PATCH 08/23] Added is_view check, fixed exception constructors, added sample image --- py34/dockid.py | 8 ++++++++ py34/list.py | 2 +- py34/scraper.py | 4 ++-- py34/view.py | 52 ++++++++++++++++++++++++++++++++++++------------- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/py34/dockid.py b/py34/dockid.py index 29de63d..d4e599c 100644 --- a/py34/dockid.py +++ b/py34/dockid.py @@ -32,3 +32,11 @@ def is_nochick(data: BeautifulSoup | str | bytes) -> 
bool: def is_toodeep(data: BeautifulSoup | str | bytes) -> bool: return _is_header(bs4(data), "unable to search this deep in temporarily.") + + +def is_view(data: BeautifulSoup | str | bytes) -> bool: + doc = bs4(data) + els = doc.find_all("img", attrs = {"id": "image"}) + if len(els) == 1: + return True + return False diff --git a/py34/list.py b/py34/list.py index b02cc16..7648931 100644 --- a/py34/list.py +++ b/py34/list.py @@ -9,7 +9,7 @@ from retry import retry class ListException(Exception): def __init__(self, documnet: bytes, *argv): - super(self, *argv) + super().__init__(self, *argv) self.document = document diff --git a/py34/scraper.py b/py34/scraper.py index ae10bd0..7f0eba8 100644 --- a/py34/scraper.py +++ b/py34/scraper.py @@ -7,7 +7,7 @@ from bs4 import BeautifulSoup class ScraperException(Exception): def __init__(self, res: Response, *argv: any): - super(Exception, *argv) + super().__init__(self, *argv) self.response = res @@ -18,7 +18,7 @@ class CaptchaException(Exception): scraper = Scraper() # Construct the exception - super(Exception, *argv) + super().__init__(self, *argv) class Scraper: diff --git a/py34/view.py b/py34/view.py index d37ebf7..7b19f89 100644 --- a/py34/view.py +++ b/py34/view.py @@ -1,11 +1,17 @@ from .post import Post -from .url import ImageURL, ThumbnailURL, parse_image_url +from .url import ImageURL, SampleURL, ThumbnailURL, parse_image_url from .scraper import scraper +from .dockid import is_view from io import BytesIO from PIL import Image from PIL.ImageFile import ImageFile +class ViewMissingException(Exception): + def __init__(self, *args, **kwargs): + super().__init__(self, *args, **kwargs) + + class ViewTags: def __init__(self, cpr: list[str], chr: list[str], art: list[str], gen: list[str], met: list[str]): self.copyright = cpr.copy() @@ -35,14 +41,20 @@ class ViewTags: class View: def __init__(self, id: int): self.id = int(id) - self._image_data: bytes | None = None - self._image: ImageFile | None = None - 
self._image_url: ImageURL | None = None - self._thumb_data: bytes | None = None - self._thumb: ImageFile | None = None - self._thumb_url: ThumbnailURL | None = None + self._image_data: bytes | None = None + self._image: ImageFile | None = None + self.image_url: ImageURL | None = None + self._sample_data: bytes | None = None + self._sample: ImageFile | None = None + self.sample_url: ThumbnailURL | None = None + self._thumb_data: bytes | None = None + self._thumb: ImageFile | None = None + self.thumb_url: ThumbnailURL | None = None document = scraper.get_html(f"https://rule34.xxx/index.php?page=post&s=view&id={id}") + if not is_view(document): + raise ViewMissingException("View does not exist") + tag_bar = document.find_all("ul", attrs={"id": "tag-sidebar"})[0] cpr = [] chr = [] @@ -71,9 +83,10 @@ class View: label = ent.text.lower().strip() match label: case "original image": - self._image_url = parse_image_url(ent.find_all("a")[0]["href"]) + self.image_url = parse_image_url(ent.find_all("a")[0]["href"]) - self._thumb_url = ThumbnailURL(self._image_url.dir, self._image_url.id) + self.sample_url = SampleURL(self.image_url.dir, self.image_url.id) + self.thumb_url = ThumbnailURL(self.image_url.dir, self.image_url.id) def get_image(self) -> ImageFile: @@ -86,10 +99,23 @@ class View: def get_image_data(self) -> bytes: if self._image_data is not None: return self._image_data - self._image_data = scraper.get(self._image_url) + self._image_data = scraper.get(self.image_url) return self._image_data + def get_sample(self) -> ImageFile: + if self._sample is not None: + return self._sample + self._sample = Image.open(BytesIO(self.get_sample_data())) + return self._sample + + + def get_sample_data(self) -> bytes: + if self._sample_data is not None: + return self._sample_data + self._sample_data = scraper.get(self.sample_url) + + def get_thumbnail(self) -> ImageFile: if self._thumb is not None: return self._thumb @@ -100,14 +126,14 @@ class View: def get_thumbnail_data(self) -> 
bytes: if self._thumb_data is not None: return self._thumb_data - self._thumb_data = scraper.get(self._thumb_url) + self._thumb_data = scraper.get(self.thumb_url) def to_post(self) -> Post: return Post( self.id, - self._image_url.dir, - self._image_url.id, + self.image_url.dir, + self.image_url.id, self.tags.to_list(), ) From 8fa6e2cce42bf7d39c9c5a63fd24d67615695435 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 02:06:04 +0200 Subject: [PATCH 09/23] Made urls consistent --- py34/url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py34/url.py b/py34/url.py index a2f0e5b..62da613 100644 --- a/py34/url.py +++ b/py34/url.py @@ -21,7 +21,7 @@ class SampleURL: self.id: str = image_id def __str__(self) -> str: - return f"https://rule34.xxx//samples/{self.dir}/sample_{self.id}.jpg" + return f"https://wimg.rule34.xxx//samples/{self.dir}/sample_{self.id}.jpg" class ThumbnailURL: From 7ca4c4bfacba23178fde375c65fdcc7941e68685 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 02:25:50 +0200 Subject: [PATCH 10/23] Added scraper.reset --- py34/scraper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/py34/scraper.py b/py34/scraper.py index 7f0eba8..78df2fd 100644 --- a/py34/scraper.py +++ b/py34/scraper.py @@ -37,6 +37,9 @@ class Scraper: def _retry_get(self, url: str, body: bool) -> bytes | Response: return self._get(url, body=body) + def reset(self): + self._scraper = CloudScraper() + def get(self, url: str, retry: bool = True, body: bool = True): if retry: return self._retry_get(url, body=body) From 8f228bde36f5119260619d2e7d6107850bbe1b5a Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 02:27:23 +0200 Subject: [PATCH 11/23] Close response handles --- py34/scraper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/py34/scraper.py b/py34/scraper.py index 78df2fd..272f67c 100644 --- a/py34/scraper.py +++ b/py34/scraper.py @@ -31,7 +31,9 @@ class Scraper: return res if res.status_code != 200: raise 
ScraperException(res, "Request did not succeed") - return res.content + content = res.content + res.close() + return content @retry(Exception, tries=5, delay=3) def _retry_get(self, url: str, body: bool) -> bytes | Response: From 14ed66aad75b53951cf23d05bb51eedb0d54ffd9 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 04:24:16 +0200 Subject: [PATCH 12/23] Added empty Post constructor --- py34/post.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/py34/post.py b/py34/post.py index cd8550a..2e34f69 100644 --- a/py34/post.py +++ b/py34/post.py @@ -20,6 +20,10 @@ class Post: self._image: ImageFile | None = None + def empty(id: int) -> "Post": + return Post(id, 0, "00", [], b"") + + def get_thumbnail(self) -> ImageFile: if self._thumbnail is not None: return self._thumbnail From c9928342c2ad150d329c8e9adff8e08fdcee0e25 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 04:24:37 +0200 Subject: [PATCH 13/23] Added ViewURL --- py34/url.py | 8 ++++++++ py34/view.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/py34/url.py b/py34/url.py index 62da613..e784d9b 100644 --- a/py34/url.py +++ b/py34/url.py @@ -5,6 +5,14 @@ from os.path import splitext IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif", "mp4", "webm"] +class ViewURL: + def __init__(self, post_id: int): + self.id = post_id + + def __str__(self) -> str: + return f"https://rule34.xxx/index.php?page=post&s=view&id={self.id}" + + class ImageURL: def __init__(self, image_dir: int, image_id: str, image_format: str): self.dir: int = image_dir diff --git a/py34/view.py b/py34/view.py index 7b19f89..313d74f 100644 --- a/py34/view.py +++ b/py34/view.py @@ -1,5 +1,5 @@ from .post import Post -from .url import ImageURL, SampleURL, ThumbnailURL, parse_image_url +from .url import ViewURL, ImageURL, SampleURL, ThumbnailURL, parse_image_url from .scraper import scraper from .dockid import is_view from io import BytesIO @@ -50,7 +50,7 @@ class View: self._thumb_data: bytes | None = 
None self._thumb: ImageFile | None = None self.thumb_url: ThumbnailURL | None = None - document = scraper.get_html(f"https://rule34.xxx/index.php?page=post&s=view&id={id}") + document = scraper.get_html(ViewURL(self.id)) if not is_view(document): raise ViewMissingException("View does not exist") From dddb0e95837adbbbcef745e1e4f5e65c9b3a75f2 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 04:25:24 +0200 Subject: [PATCH 14/23] Fixed handle leak --- py34/scraper.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/py34/scraper.py b/py34/scraper.py index 272f67c..303c0d0 100644 --- a/py34/scraper.py +++ b/py34/scraper.py @@ -27,13 +27,12 @@ class Scraper: def _get(self, url: str, body: bool) -> bytes | Response: res: Response = self._scraper.get(url) + res.close() if not body: return res if res.status_code != 200: raise ScraperException(res, "Request did not succeed") - content = res.content - res.close() - return content + return res.content @retry(Exception, tries=5, delay=3) def _retry_get(self, url: str, body: bool) -> bytes | Response: From 12038f9477d7fc33dfbeda260f929670c3e9174c Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 04:31:37 +0200 Subject: [PATCH 15/23] Fixed view identification --- py34/dockid.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py34/dockid.py b/py34/dockid.py index d4e599c..5bbd54e 100644 --- a/py34/dockid.py +++ b/py34/dockid.py @@ -37,6 +37,6 @@ def is_toodeep(data: BeautifulSoup | str | bytes) -> bool: def is_view(data: BeautifulSoup | str | bytes) -> bool: doc = bs4(data) els = doc.find_all("img", attrs = {"id": "image"}) - if len(els) == 1: - return True - return False + if len(els) == 0: + return False + return True From a0e38071d88bc746fdfb00bfe3a133c979933849 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 05:12:15 +0200 Subject: [PATCH 16/23] When bot detection kicks in, timeout and reload CloudScraper --- py34/scraper.py | 22 
++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/py34/scraper.py b/py34/scraper.py index 303c0d0..a4f654a 100644 --- a/py34/scraper.py +++ b/py34/scraper.py @@ -3,6 +3,7 @@ from cloudscraper import CloudScraper from requests import Response from retry import retry from bs4 import BeautifulSoup +import time class ScraperException(Exception): @@ -26,15 +27,20 @@ class Scraper: self._scraper: CloudScraper = CloudScraper() def _get(self, url: str, body: bool) -> bytes | Response: - res: Response = self._scraper.get(url) - res.close() - if not body: - return res - if res.status_code != 200: - raise ScraperException(res, "Request did not succeed") - return res.content + while True: + res: Response = self._scraper.get(url) + res.close() + if res.status_code == 429: + self.reset() + time.sleep(10) + continue + if not body: + return res + if res.status_code != 200: + raise ScraperException(res, "Request did not succeed") + return res.content - @retry(Exception, tries=5, delay=3) + @retry(Exception, tries=5, delay=5) def _retry_get(self, url: str, body: bool) -> bytes | Response: return self._get(url, body=body) From abdcef0a80086f41208a9a46660d8d8f8ef70239 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 16:13:25 +0200 Subject: [PATCH 17/23] Fixed handle leak --- py34/scraper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/py34/scraper.py b/py34/scraper.py index a4f654a..4a7054a 100644 --- a/py34/scraper.py +++ b/py34/scraper.py @@ -16,7 +16,7 @@ class CaptchaException(Exception): def __init__(self, *argv: any): # Reset scraper global scraper - scraper = Scraper() + scraper.reset() # Construct the exception super().__init__(self, *argv) @@ -44,7 +44,11 @@ class Scraper: def _retry_get(self, url: str, body: bool) -> bytes | Response: return self._get(url, body=body) + def close(self): + self._scraper.close() + def reset(self): + self._scraper.close() self._scraper = CloudScraper() def get(self, url: 
str, retry: bool = True, body: bool = True): From 462d24ab5e4e87dd1413fac1b8d00af50ea5c869 Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 17:00:38 +0200 Subject: [PATCH 18/23] Ignore removed posts --- py34/list.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/py34/list.py b/py34/list.py index 7648931..4cdb5cb 100644 --- a/py34/list.py +++ b/py34/list.py @@ -46,9 +46,17 @@ class List: # Extract image img = entry.find_all("img")[0] if "src" in img.attrs: - img_src = parse_thumbnail_url(img["src"]) + img_src = img["src"] else: - img_src = parse_thumbnail_url(img["data-cfsrc"]) + img_src = img["data-cfsrc"] + + # Is it a deleted post? + if img_src.split('?')[0].endswith("thumbnail_.jpg"): + # Post has been deleted, continue + continue + + # Parse thumbnail url + img_src = parse_thumbnail_url(img_src) # Append post self.posts.append(Post( From 68b6a505c115546b5b121cc107832dcf8c20d53e Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 17:00:56 +0200 Subject: [PATCH 19/23] Close handle when deleted --- py34/scraper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/py34/scraper.py b/py34/scraper.py index 4a7054a..52a75a8 100644 --- a/py34/scraper.py +++ b/py34/scraper.py @@ -26,6 +26,9 @@ class Scraper: def __init__(self): self._scraper: CloudScraper = CloudScraper() + def __del__(self): + self.close() + def _get(self, url: str, body: bool) -> bytes | Response: while True: res: Response = self._scraper.get(url) From 6fc820b3970014b9c545b70549747d8aba9a0fcf Mon Sep 17 00:00:00 2001 From: Tomuxs Date: Sat, 9 Aug 2025 17:12:56 +0200 Subject: [PATCH 20/23] Added ListURL --- py34/list.py | 5 ++--- py34/url.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/py34/list.py b/py34/list.py index 4cdb5cb..18880f6 100644 --- a/py34/list.py +++ b/py34/list.py @@ -1,6 +1,6 @@ from .post import Post from .scraper import scraper, ScraperException, CaptchaException -from .url import 
parse_thumbnail_url +from .url import parse_thumbnail_url, ListURL from .dockid import is_nochick, is_toodeep, is_captcha from concurrent.futures import ThreadPoolExecutor import urllib.parse @@ -18,8 +18,7 @@ class List: def __init__(self, tags: list[str], offset: int = 0, fetch_thumbnails: bool = True): self.posts: list[Post] = [] - tags = "+".join(map(urllib.parse.quote_plus, tags)) - document = scraper.get_html(f"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={offset}") + document = scraper.get_html(ListURL(tags, offset)) if is_nochick(document): return [] diff --git a/py34/url.py b/py34/url.py index e784d9b..f76914c 100644 --- a/py34/url.py +++ b/py34/url.py @@ -1,4 +1,4 @@ -from urllib.parse import urlparse +from urllib.parse import urlparse, quote_plus from os.path import splitext @@ -13,6 +13,16 @@ class ViewURL: return f"https://rule34.xxx/index.php?page=post&s=view&id={self.id}" +class ListURL: + def __init__(self, tags: list[str], offset: int): + self.tags = tags + self.offset = offset + + def __str__(self): + tags = "+".join(map(quote_plus, self.tags)) + return f"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={self.offset}" + + class ImageURL: def __init__(self, image_dir: int, image_id: str, image_format: str): self.dir: int = image_dir From fb06339cc773f334aee6a5789a5f79234a8fbc6e Mon Sep 17 00:00:00 2001 From: Tomas Date: Sun, 10 Aug 2025 15:26:14 +0200 Subject: [PATCH 21/23] Added flask --- requirements.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/requirements.txt b/requirements.txt index 58c4edd..0424155 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,16 @@ beautifulsoup4==4.13.4 +blinker==1.9.0 bs4==0.0.2 certifi==2025.7.14 charset-normalizer==3.4.2 +click==8.2.1 cloudscraper==1.2.71 decorator==5.2.1 +Flask==3.1.1 idna==3.10 +itsdangerous==2.2.0 +Jinja2==3.1.6 +MarkupSafe==3.0.2 pillow==11.3.0 py==1.11.0 pyparsing==3.2.3 @@ -14,3 +20,4 @@ retry==0.9.2 soupsieve==2.7 
typing_extensions==4.14.1 urllib3==2.5.0 +Werkzeug==3.1.3 From d0dec584a81f4df2098659c4ca7091c4d8bcad59 Mon Sep 17 00:00:00 2001 From: Tomas Date: Sun, 10 Aug 2025 15:49:03 +0200 Subject: [PATCH 22/23] Removed debug print --- scraper/client/__main__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scraper/client/__main__.py b/scraper/client/__main__.py index 4de36fa..c1710cd 100644 --- a/scraper/client/__main__.py +++ b/scraper/client/__main__.py @@ -9,9 +9,6 @@ import gc import py34 import scraper.job -import os -print(f"/proc/{os.getpid()}/fd/") - _spinner = 0 def spinner() -> str: global _spinner From 8e3a7b105a253e9c1dc5e896c5978b0385f2003f Mon Sep 17 00:00:00 2001 From: Tomas Date: Sun, 10 Aug 2025 15:50:40 +0200 Subject: [PATCH 23/23] Fixed invalid extension check --- scraper/server/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper/server/__main__.py b/scraper/server/__main__.py index b34f18f..1a4c756 100644 --- a/scraper/server/__main__.py +++ b/scraper/server/__main__.py @@ -2,7 +2,7 @@ from scraper.config import config from flask import Flask, Response, request, render_template, url_for from pathlib import Path import py34 -from .block import BLOCK_SIZE, list_blocks, load_blocks, load_block, load_block_stats, save_block, loads as parse_block +from .block import BLOCK_SIZE, list_blocks, load_blocks, load_block, load_block_stats, save_block, loads as parse_block, enttype2ext from .job import assign_job, working_on, any_job, jobs @@ -107,7 +107,7 @@ def get_image(post_id: int = None): return Response( status = 307, headers = { - "Location": str(py34.url.ImageURL(entry.dir, entry.image, entry.ext)) + "Location": str(py34.url.ImageURL(entry.dir, entry.image, enttype2ext(entry.type))) } )