Merge branch 'scraper' into scraper-dev

Merge branch 'master' into scraper
Remove non image formats
2025-10-09 20:38:07 +02:00 · 2025-10-09 20:37:55 +02:00 · 2025-10-09 20:37:39 +02:00 · 2025-10-09 20:36:06 +02:00 · 2025-10-09 20:07:14 +02:00 · 2025-10-09 19:25:28 +02:00
4 changed files with 24 additions and 21 deletions
--- a/py34/list.py
+++ b/py34/list.py
@@ -2,7 +2,6 @@ from .post import Post
 from .scraper import scraper, ScraperException, CaptchaException
 from .url import parse_thumbnail_url, ListURL
 from .dockid import is_nochick, is_toodeep, is_captcha
-from concurrent.futures import ThreadPoolExecutor
 import urllib.parse
 from retry import retry

@@ -67,9 +66,8 @@ class List:

            # Download thumbnails
            if fetch_thumbnails:
-                with ThreadPoolExecutor(max_workers=5) as pool:
-                    for post in self.posts:
-                        pool.submit(Post.get_thumbnail_data, post)
+                for post in self.posts:
+                    post.get_thumbnail_data()

        except ScraperException as ex:
            raise ex
--- a/py34/scraper.py
+++ b/py34/scraper.py
@@ -2,6 +2,7 @@ from .dockid import bs4
 from cloudscraper import CloudScraper
 from requests import Response
 from retry import retry
+from time import sleep
 from bs4 import BeautifulSoup


@@ -30,16 +31,21 @@ class Scraper:

    def _request(self, method: str, url: str, body: bool) -> bytes | Response:
        while True:
-            res: Response = self._scraper.request(method, url)
-            res.close()
-            if res.status_code == 429:
-                self.reset()
-                continue
-            if not body:
-                return res
-            if res.status_code != 200:
-                raise ScraperException(res, f"Request did not succeed: {method} {url}")
-            return res.content
+            print(f"{method} {url}")
+            with self._scraper.request(method, url) as res:
+                if res.status_code == 429:
+                    print(f"\x1B[33mOverloaded\x1B[0m {method} {url}")
+                    self.reset()
+                    sleep(5)
+                    continue
+                if not body:
+                    print(f"\x1B[32mOk\x1B[0m {method} {url}")
+                    return res
+                if res.status_code != 200:
+                    print(f"\x1B[31mFailed\x1B[0m {method} {url}")
+                    raise ScraperException(res, f"Request did not succeed: {method} {url}")
+                print(f"\x1B[32mOk\x1B[0m {method} {url}")
+                return res.content

    @retry(Exception, tries=5, delay=5)
    def _retry_request(self, method: str, url: str, body: bool) -> bytes | Response:
@@ -49,8 +55,9 @@ class Scraper:
        self._scraper.close()

    def reset(self):
-        self._scraper.close()
-        self._scraper = CloudScraper()
+        pass
+        # self._scraper.close()
+        # self._scraper = CloudScraper()

    def request(self, method: str, url: str, retry: bool = True, body: bool = True):
        if retry:
--- a/py34/url.py
+++ b/py34/url.py
@@ -2,7 +2,7 @@ from urllib.parse import urlparse, quote_plus
 from os.path import splitext


-IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif", "mp4", "webm"]
+IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif"]
 VIDEO_FORMATS = ["mp4", "webm"]


--- a/scraper/client/main.py
+++ b/scraper/client/main.py
@@ -1,6 +1,5 @@
 from datetime import datetime
 from enum import StrEnum
-from concurrent.futures import ThreadPoolExecutor
 import requests
 import tomllib
 import traceback
@@ -125,9 +124,8 @@ if True:
                print(f"{FG.r}{ex}{FG._}")
                print(f"{FG.y}{traceback.format_exc()}{FG._}")
                raise ex
-        with ThreadPoolExecutor(max_workers=netthd) as pool:
-            for post in lst.posts:
-                pool.submit(_add_post, block, post)
+        for post in lst.posts:
+            _add_post(block, post)

        # Increase pid for next iteration
        pid += len(lst.posts)
Author	SHA1	Message	Date
Tomas	ac8aa3418a	Merge branch 'scraper' into scraper-dev	2025-10-09 20:38:07 +02:00
Tomas	3ead328a0f	Merge branch 'master' into scraper	2025-10-09 20:37:55 +02:00
Tomas	fb90935d42	Remove non image formats	2025-10-09 20:37:39 +02:00
Tomas	889e9b69a8	Remove non image formats	2025-10-09 20:36:06 +02:00
Tomas	66bef9279f	Merge branch 'scraper' into scraper-dev	2025-10-09 20:07:14 +02:00
Tomas	97a63937df	Merge branch 'scraper' into scraper-dev	2025-10-09 19:25:28 +02:00
Tomas	9247ab28ce	Merge branch 'master' into scraper-dev	2025-10-09 19:25:17 +02:00
Tomas	5cedbb2526	Merge branch 'scraper' into scraper-dev	2025-10-09 19:18:37 +02:00
Tomas	bf9237750b	Merge branch 'scraper' into scraper-dev	2025-10-03 00:09:36 +02:00
Tomas	a3dca99a5d	Merge branch 'scraper' into scraper-dev	2025-09-23 12:44:52 +02:00
Tomas	bb7d010694	Merge branch 'scraper' into scraper-dev	2025-09-23 02:27:13 +02:00
Tomas	ee236445c2	Made single threaded, added back delay	2025-08-28 21:38:32 +02:00
Tomuxs	f0efc80fd8	Removed sleeping between service overload attempts. https://stackoverflow.com/questions/47147328/thread-wait-for-tstate-lock-never-returns#55643196	2025-08-27 19:06:31 +02:00
Tomuxs	54fc8fc213	Merge branch 'scraper' into scraper-dev	2025-08-27 17:47:59 +02:00
Tomuxs	64908f298e	Added verbosity to requests	2025-08-27 17:10:17 +02:00