Compare commits

...

17 Commits

Author SHA1 Message Date
ac8aa3418a Merge branch 'scraper' into scraper-dev 2025-10-09 20:38:07 +02:00
3ead328a0f Merge branch 'master' into scraper 2025-10-09 20:37:55 +02:00
fb90935d42 Remove non image formats 2025-10-09 20:37:39 +02:00
889e9b69a8 Remove non image formats 2025-10-09 20:36:06 +02:00
66bef9279f Merge branch 'scraper' into scraper-dev 2025-10-09 20:07:14 +02:00
f55f118271 Merge branch 'master' into scraper 2025-10-09 20:07:02 +02:00
3279e8e3c0 Scan video formats aswell 2025-10-09 20:06:51 +02:00
97a63937df Merge branch 'scraper' into scraper-dev 2025-10-09 19:25:28 +02:00
9247ab28ce Merge branch 'master' into scraper-dev 2025-10-09 19:25:17 +02:00
5cedbb2526 Merge branch 'scraper' into scraper-dev 2025-10-09 19:18:37 +02:00
bf9237750b Merge branch 'scraper' into scraper-dev 2025-10-03 00:09:36 +02:00
a3dca99a5d Merge branch 'scraper' into scraper-dev 2025-09-23 12:44:52 +02:00
bb7d010694 Merge branch 'scraper' into scraper-dev 2025-09-23 02:27:13 +02:00
ee236445c2 Made single threaded, added back delay 2025-08-28 21:38:32 +02:00
f0efc80fd8 Removed sleeping between service overload attempts.
https://stackoverflow.com/questions/47147328/thread-wait-for-tstate-lock-never-returns#55643196
2025-08-27 19:06:31 +02:00
54fc8fc213 Merge branch 'scraper' into scraper-dev 2025-08-27 17:47:59 +02:00
64908f298e Added verbosity to requests 2025-08-27 17:10:17 +02:00
5 changed files with 26 additions and 23 deletions

View File

@@ -2,7 +2,6 @@ from .post import Post
from .scraper import scraper, ScraperException, CaptchaException
from .url import parse_thumbnail_url, ListURL
from .dockid import is_nochick, is_toodeep, is_captcha
from concurrent.futures import ThreadPoolExecutor
import urllib.parse
from retry import retry
@@ -67,9 +66,8 @@ class List:
# Download thumbnails
if fetch_thumbnails:
with ThreadPoolExecutor(max_workers=5) as pool:
for post in self.posts:
pool.submit(Post.get_thumbnail_data, post)
for post in self.posts:
post.get_thumbnail_data()
except ScraperException as ex:
raise ex

View File

@@ -1,5 +1,5 @@
from .scraper import scraper
from .url import ImageURL, ThumbnailURL, IMAGE_FORMATS
from .url import ImageURL, ThumbnailURL, IMAGE_FORMATS, VIDEO_FORMATS
from io import BytesIO
from PIL import Image
from PIL.ImageFile import ImageFile
@@ -57,7 +57,7 @@ class Post:
def get_image_data(self) -> bytes:
if self._image_data is not None:
return self._image_data
for ext in IMAGE_FORMATS:
for ext in VIDEO_FORMATS + IMAGE_FORMATS:
try:
self._image_data = scraper.get(ImageURL(self.image_dir, self.image_id, ext), retry=False)
self._image_format = ext

View File

@@ -2,6 +2,7 @@ from .dockid import bs4
from cloudscraper import CloudScraper
from requests import Response
from retry import retry
from time import sleep
from bs4 import BeautifulSoup
@@ -30,16 +31,21 @@ class Scraper:
def _request(self, method: str, url: str, body: bool) -> bytes | Response:
while True:
res: Response = self._scraper.request(method, url)
res.close()
if res.status_code == 429:
self.reset()
continue
if not body:
return res
if res.status_code != 200:
raise ScraperException(res, f"Request did not succeed: {method} {url}")
return res.content
print(f"{method} {url}")
with self._scraper.request(method, url) as res:
if res.status_code == 429:
print(f"\x1B[33mOverloaded\x1B[0m {method} {url}")
self.reset()
sleep(5)
continue
if not body:
print(f"\x1B[32mOk\x1B[0m {method} {url}")
return res
if res.status_code != 200:
print(f"\x1B[31mFailed\x1B[0m {method} {url}")
raise ScraperException(res, f"Request did not succeed: {method} {url}")
print(f"\x1B[32mOk\x1B[0m {method} {url}")
return res.content
@retry(Exception, tries=5, delay=5)
def _retry_request(self, method: str, url: str, body: bool) -> bytes | Response:
@@ -49,8 +55,9 @@ class Scraper:
self._scraper.close()
def reset(self):
self._scraper.close()
self._scraper = CloudScraper()
pass
# self._scraper.close()
# self._scraper = CloudScraper()
def request(self, method: str, url: str, retry: bool = True, body: bool = True):
if retry:

View File

@@ -2,7 +2,7 @@ from urllib.parse import urlparse, quote_plus
from os.path import splitext
IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif", "mp4", "webm"]
IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif"]
VIDEO_FORMATS = ["mp4", "webm"]

View File

@@ -1,6 +1,5 @@
from datetime import datetime
from enum import StrEnum
from concurrent.futures import ThreadPoolExecutor
import requests
import tomllib
import traceback
@@ -125,9 +124,8 @@ if True:
print(f"{FG.r}{ex}{FG._}")
print(f"{FG.y}{traceback.format_exc()}{FG._}")
raise ex
with ThreadPoolExecutor(max_workers=netthd) as pool:
for post in lst.posts:
pool.submit(_add_post, block, post)
for post in lst.posts:
_add_post(block, post)
# Increase pid for next iteration
pid += len(lst.posts)