diff --git a/py34/dockid.py b/py34/dockid.py
index 29de63d..5bbd54e 100644
--- a/py34/dockid.py
+++ b/py34/dockid.py
@@ -32,3 +32,9 @@ def is_nochick(data: BeautifulSoup | str | bytes) -> bool:
 
 def is_toodeep(data: BeautifulSoup | str | bytes) -> bool:
     return _is_header(bs4(data), "unable to search this deep in temporarily.")
+
+
+def is_view(data: BeautifulSoup | str | bytes) -> bool:
+    doc = bs4(data)
+    els = doc.find_all("img", attrs={"id": "image"})
+    return len(els) > 0
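Reviewer note on `is_view`: the detector keys off the `<img id="image">` element that hosts the full-size picture on a view page. A minimal standalone sketch of the same check, with the repo's `bs4()` normalizing helper swapped for a plain `BeautifulSoup` call (an assumption made for illustration only):

```python
from bs4 import BeautifulSoup

def is_view(data) -> bool:
    # A view page renders the full-size picture as <img id="image">;
    # anything else (404 page, captcha page) lacks that element.
    doc = data if isinstance(data, BeautifulSoup) else BeautifulSoup(data, "html.parser")
    return len(doc.find_all("img", attrs={"id": "image"})) > 0

assert is_view('<html><body><img id="image" src="a.jpg"></body></html>')
assert not is_view('<html><body><p>nothing here</p></body></html>')
```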
diff --git a/py34/list.py b/py34/list.py
index 8bc44b7..18880f6 100644
--- a/py34/list.py
+++ b/py34/list.py
@@ -1,23 +1,23 @@
 from .post import Post
-from .scraper import scraper, ScraperException
-from .url import parse_thumbnail_url
-from .dockid import is_nochick, is_toodeep
-import urllib.parse
-from threading import Thread
+from .scraper import scraper, ScraperException, CaptchaException
+from .url import parse_thumbnail_url, ListURL
+from .dockid import is_nochick, is_toodeep, is_captcha
+from concurrent.futures import ThreadPoolExecutor
+from retry import retry
 
 
 class ListException(Exception):
-    def __init__(self, documnet: bytes, *argv):
-        super(self, *argv)
+    def __init__(self, document: bytes, *argv):
+        super().__init__(*argv)
         self.document = document
 
 
 class List:
+    @retry(CaptchaException, tries=5, delay=3, jitter=2)
     def __init__(self, tags: list[str], offset: int = 0, fetch_thumbnails: bool = True):
         self.posts: list[Post] = []
 
-        tags = "+".join(map(urllib.parse.quote_plus, tags))
-        document = scraper.get_html(f"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={offset}")
+        document = scraper.get_html(ListURL(tags, offset))
 
         if is_nochick(document):
-            return []
+            return
@@ -25,6 +25,9 @@ class List:
         if is_toodeep(document):
-            raise ListException(document, "Search to deep")
+            raise ListException(document, "Search too deep")
 
+        if is_captcha(document):
+            raise CaptchaException("Received captcha")
+
         try:
             for entry in document.find_all("div", {"class": "image-list"})[0].children:
                 # Skip garbage
@@ -41,27 +44,31 @@
 
             # Extract image
             img = entry.find_all("img")[0]
             if "src" in img.attrs:
-                img_src = parse_thumbnail_url(img["src"])
+                img_src = img["src"]
             else:
-                img_src = parse_thumbnail_url(img["data-cfsrc"])
+                img_src = img["data-cfsrc"]
+
+            # Is it a deleted post?
+            if img_src.split("?")[0].endswith("thumbnail_.jpg"):
+                # Post has been deleted, skip it
+                continue
+
+            # Parse thumbnail url
+            img_src = parse_thumbnail_url(img_src)
 
             # Append post
             self.posts.append(Post(
                 post_id,
                 img_src.dir,
                 img_src.id,
-                img["alt"].split(" "),
+                sorted(filter(bool, map(str.strip, img["alt"].split(" ")))),
             ))
 
         # Download thumbnails
         if fetch_thumbnails:
-            threads = [Thread(target=Post.get_thumbnail, args=(post,)) for post in self.posts]
-
-            for thread in threads:
-                thread.start()
-
-            for thread in threads:
-                thread.join()
+            with ThreadPoolExecutor(max_workers=5) as pool:
+                for post in self.posts:
+                    pool.submit(Post.get_thumbnail_data, post)
         except ScraperException as ex:
             raise ex
diff --git a/py34/post.py b/py34/post.py
index 93a344a..2e34f69 100644
--- a/py34/post.py
+++ b/py34/post.py
@@ -13,22 +13,27 @@ class Post:
         self.tags: list[str] = tags.copy()
 
         self._thumbnail_data: bytes | None = thumbnail
-        self._thumbnail: ImageFile | None = Image.open(BytesIO(thumbnail)) if thumbnail else None
+        self._thumbnail: ImageFile | None = None
 
         self._image_format: str | None = None
         self._image_data: bytes | None = None
         self._image: ImageFile | None = None
 
+    @staticmethod
+    def empty(id: int) -> "Post":
+        return Post(id, 0, "00", [], b"")
+
+
     def get_thumbnail(self) -> ImageFile:
-        if self._thumbnail:
+        if self._thumbnail is not None:
             return self._thumbnail
         self._thumbnail = Image.open(BytesIO(self.get_thumbnail_data()))
         return self._thumbnail
 
     def get_thumbnail_data(self) -> bytes:
-        if self._thumbnail_data:
+        if self._thumbnail_data is not None:
             return self._thumbnail_data
         self._thumbnail_data = scraper.get(ThumbnailURL(self.image_dir, self.image_id))
         return self._thumbnail_data
diff --git a/py34/scraper.py b/py34/scraper.py
index 01bcf06..52a75a8 100644
--- a/py34/scraper.py
+++ b/py34/scraper.py
@@ -3,30 +3,58 @@ from cloudscraper import CloudScraper
 from requests import Response
 from retry import retry
 from bs4 import BeautifulSoup
+import time
 
 
 class ScraperException(Exception):
-    def __init__(self, res: Response, *argv: any):
-        super(Exception, *argv)
+    def __init__(self, res: Response, *argv):
+        super().__init__(*argv)
         self.response = res
 
 
+class CaptchaException(Exception):
+    def __init__(self, *argv):
+        # Reset the shared scraper so the next attempt starts a fresh session
+        global scraper
+        scraper.reset()
+
+        # Construct the exception
+        super().__init__(*argv)
+
+
 class Scraper:
     def __init__(self):
         self._scraper: CloudScraper = CloudScraper()
 
+    def __del__(self):
+        self.close()
+
-    def _get(self, url: str, body: bool) -> bytes | Response:
-        res: Response = self._scraper.get(url)
-        if not body:
-            return res
-        if res.status_code != 200:
-            raise ScraperException(res, "Request did not succeed")
-        return res.content
+    def _get(self, url: str, body: bool) -> bytes | Response:
+        while True:
+            res: Response = self._scraper.get(url)
+            res.close()
+            if res.status_code == 429:
+                # Rate limited: rebuild the session and back off before retrying
+                self.reset()
+                time.sleep(10)
+                continue
+            if not body:
+                return res
+            if res.status_code != 200:
+                raise ScraperException(res, "Request did not succeed")
+            return res.content
 
-    @retry(Exception, tries=5, delay=3)
+    @retry(Exception, tries=5, delay=5)
     def _retry_get(self, url: str, body: bool) -> bytes | Response:
         return self._get(url, body=body)
 
+    def close(self):
+        self._scraper.close()
+
+    def reset(self):
+        self._scraper.close()
+        self._scraper = CloudScraper()
+
     def get(self, url: str, retry: bool = True, body: bool = True):
         if retry:
             return self._retry_get(url, body=body)
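Reviewer note on the retry layering: `_retry_get` keeps absorbing transient failures inside the scraper, while `CaptchaException` is handled one level up by the `@retry(CaptchaException, tries=5, delay=3, jitter=2)` on `List.__init__`, after the exception's own constructor has reset the shared session. A self-contained sketch of that outer loop, using the same `retry` package with a stubbed fetch in place of the real network call:

```python
from retry import retry

class CaptchaException(Exception):
    pass

attempts = {"count": 0}

@retry(CaptchaException, tries=5, delay=0)
def fetch_listing() -> str:
    # Stub: pretend the first two responses are captcha pages.
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise CaptchaException("Received captcha")
    return "<listing html>"

print(fetch_listing())    # "<listing html>" on the third attempt
print(attempts["count"])  # 3
```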
diff --git a/py34/url.py b/py34/url.py
index 125f21f..f76914c 100644
--- a/py34/url.py
+++ b/py34/url.py
@@ -1,10 +1,28 @@
-from urllib.parse import urlparse
+from urllib.parse import urlparse, quote_plus
 from os.path import splitext
 
 
 IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif", "mp4", "webm"]
 
 
+class ViewURL:
+    def __init__(self, post_id: int):
+        self.id = post_id
+
+    def __str__(self) -> str:
+        return f"https://rule34.xxx/index.php?page=post&s=view&id={self.id}"
+
+
+class ListURL:
+    def __init__(self, tags: list[str], offset: int):
+        self.tags = tags
+        self.offset = offset
+
+    def __str__(self) -> str:
+        tags = "+".join(map(quote_plus, self.tags))
+        return f"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={self.offset}"
+
+
 class ImageURL:
     def __init__(self, image_dir: int, image_id: str, image_format: str):
         self.dir: int = image_dir
@@ -15,6 +33,15 @@ class ImageURL:
         return f"https://wimg.rule34.xxx//images/{self.dir}/{self.id}.{self.format}"
 
 
+class SampleURL:
+    def __init__(self, image_dir: int, image_id: str):
+        self.dir: int = image_dir
+        self.id: str = image_id
+
+    def __str__(self) -> str:
+        return f"https://wimg.rule34.xxx//samples/{self.dir}/sample_{self.id}.jpg"
+
+
 class ThumbnailURL:
     def __init__(self, image_dir: int, image_id: str):
         self.dir: int = image_dir
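Reviewer note on the URL builders: they only need `__str__` because `requests` coerces a non-string URL with `str()` while preparing the request, which is why passing these objects straight to `scraper.get`/`get_html` works. Expected renderings, with made-up IDs for illustration:

```python
from py34.url import ViewURL, ListURL, SampleURL

print(ViewURL(123))
# https://rule34.xxx/index.php?page=post&s=view&id=123

print(ListURL(["long hair", "1girl"], 42))
# https://rule34.xxx/index.php?page=post&s=list&tags=long+hair+1girl&pid=42

print(SampleURL(4242, "deadbeef"))
# https://wimg.rule34.xxx//samples/4242/sample_deadbeef.jpg
```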
diff --git a/py34/view.py b/py34/view.py
index d37ebf7..313d74f 100644
--- a/py34/view.py
+++ b/py34/view.py
@@ -1,11 +1,16 @@
 from .post import Post
-from .url import ImageURL, ThumbnailURL, parse_image_url
+from .url import ViewURL, ImageURL, SampleURL, ThumbnailURL, parse_image_url
 from .scraper import scraper
+from .dockid import is_view
 from io import BytesIO
 from PIL import Image
 from PIL.ImageFile import ImageFile
 
 
+class ViewMissingException(Exception):
+    pass
+
+
 class ViewTags:
     def __init__(self, cpr: list[str], chr: list[str], art: list[str], gen: list[str], met: list[str]):
         self.copyright = cpr.copy()
@@ -35,13 +40,19 @@
 class View:
     def __init__(self, id: int):
         self.id = int(id)
-        self._image_data: bytes | None = None
-        self._image: ImageFile | None = None
-        self._image_url: ImageURL | None = None
-        self._thumb_data: bytes | None = None
-        self._thumb: ImageFile | None = None
-        self._thumb_url: ThumbnailURL | None = None
-        document = scraper.get_html(f"https://rule34.xxx/index.php?page=post&s=view&id={id}")
+        self._image_data: bytes | None = None
+        self._image: ImageFile | None = None
+        self.image_url: ImageURL | None = None
+        self._sample_data: bytes | None = None
+        self._sample: ImageFile | None = None
+        self.sample_url: SampleURL | None = None
+        self._thumb_data: bytes | None = None
+        self._thumb: ImageFile | None = None
+        self.thumb_url: ThumbnailURL | None = None
+        document = scraper.get_html(ViewURL(self.id))
+
+        if not is_view(document):
+            raise ViewMissingException("View does not exist")
 
         tag_bar = document.find_all("ul", attrs={"id": "tag-sidebar"})[0]
         cpr = []
@@ -71,9 +82,10 @@ class View:
                 label = ent.text.lower().strip()
                 match label:
                     case "original image":
-                        self._image_url = parse_image_url(ent.find_all("a")[0]["href"])
+                        self.image_url = parse_image_url(ent.find_all("a")[0]["href"])
 
-        self._thumb_url = ThumbnailURL(self._image_url.dir, self._image_url.id)
+        self.sample_url = SampleURL(self.image_url.dir, self.image_url.id)
+        self.thumb_url = ThumbnailURL(self.image_url.dir, self.image_url.id)
 
 
     def get_image(self) -> ImageFile:
@@ -86,10 +98,24 @@ class View:
     def get_image_data(self) -> bytes:
         if self._image_data is not None:
             return self._image_data
-        self._image_data = scraper.get(self._image_url)
+        self._image_data = scraper.get(self.image_url)
         return self._image_data
 
 
+    def get_sample(self) -> ImageFile:
+        if self._sample is not None:
+            return self._sample
+        self._sample = Image.open(BytesIO(self.get_sample_data()))
+        return self._sample
+
+
+    def get_sample_data(self) -> bytes:
+        if self._sample_data is not None:
+            return self._sample_data
+        self._sample_data = scraper.get(self.sample_url)
+        return self._sample_data
+
+
     def get_thumbnail(self) -> ImageFile:
         if self._thumb is not None:
             return self._thumb
@@ -100,14 +126,15 @@ class View:
     def get_thumbnail_data(self) -> bytes:
         if self._thumb_data is not None:
             return self._thumb_data
-        self._thumb_data = scraper.get(self._thumb_url)
+        self._thumb_data = scraper.get(self.thumb_url)
+        return self._thumb_data
 
 
     def to_post(self) -> Post:
         return Post(
             self.id,
-            self._image_url.dir,
-            self._image_url.id,
+            self.image_url.dir,
+            self.image_url.id,
             self.tags.to_list(),
         )
 
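Taken together, a consumer might drive the new pieces like this (hypothetical usage, not part of the diff; it assumes `Post` exposes its numeric id as `post.id`):

```python
from py34.list import List
from py34.view import View, ViewMissingException
from py34.scraper import CaptchaException

try:
    listing = List(["tag_a"], offset=0, fetch_thumbnails=False)
    for post in listing.posts:
        try:
            view = View(post.id)  # assumed attribute name, see note above
        except ViewMissingException:
            continue  # post page no longer exists
        view.get_sample()         # lazily fetched, cached PIL ImageFile
        print(view.to_post().tags)
except CaptchaException:
    print("still behind a captcha after five attempts")
```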