13 Commits

3 changed files with 23 additions and 20 deletions

View File

@@ -2,7 +2,6 @@ from .post import Post
 from .scraper import scraper, ScraperException, CaptchaException
 from .url import parse_thumbnail_url, ListURL
 from .dockid import is_nochick, is_toodeep, is_captcha
-from concurrent.futures import ThreadPoolExecutor
 import urllib.parse
 from retry import retry
@@ -67,9 +66,8 @@ class List:
             # Download thumbnails
             if fetch_thumbnails:
-                with ThreadPoolExecutor(max_workers=5) as pool:
-                    for post in self.posts:
-                        pool.submit(Post.get_thumbnail_data, post)
+                for post in self.posts:
+                    post.get_thumbnail_data()
         except ScraperException as ex:
             raise ex
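
Note on this change: the removed version passed `Post.get_thumbnail_data` to `pool.submit()` without keeping the returned futures, so an exception raised in a worker was stored in a discarded `Future` and never reached the enclosing `try`/`except`; the sequential loop lets `ScraperException` propagate, which is one plausible reason for serializing. A minimal, self-contained sketch of that pitfall (the `fetch` function is a hypothetical stand-in for `Post.get_thumbnail_data`):

    from concurrent.futures import ThreadPoolExecutor

    def fetch(i: int) -> None:
        # Hypothetical stand-in for Post.get_thumbnail_data; one download fails.
        if i == 3:
            raise RuntimeError(f"download {i} failed")

    # Old pattern: the Future returned by submit() is discarded, so the
    # exception it holds is silently dropped and this block "succeeds".
    with ThreadPoolExecutor(max_workers=5) as pool:
        for i in range(5):
            pool.submit(fetch, i)
    print("no error surfaced")

    # Keeping the futures and calling result() re-raises worker exceptions,
    # which the new sequential loop gets for free.
    with ThreadPoolExecutor(max_workers=5) as pool:
        futures = [pool.submit(fetch, i) for i in range(5)]
    try:
        for f in futures:
            f.result()
    except RuntimeError as ex:
        print(f"surfaced: {ex}")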

View File

@@ -2,6 +2,7 @@ from .dockid import bs4
 from cloudscraper import CloudScraper
 from requests import Response
 from retry import retry
+from time import sleep
 from bs4 import BeautifulSoup
@@ -30,16 +31,21 @@ class Scraper:
     def _request(self, method: str, url: str, body: bool) -> bytes | Response:
         while True:
-            res: Response = self._scraper.request(method, url)
-            res.close()
-            if res.status_code == 429:
-                self.reset()
-                continue
-            if not body:
-                return res
-            if res.status_code != 200:
-                raise ScraperException(res, f"Request did not succeed: {method} {url}")
-            return res.content
+            print(f"{method} {url}")
+            with self._scraper.request(method, url) as res:
+                if res.status_code == 429:
+                    print(f"\x1B[33mOverloaded\x1B[0m {method} {url}")
+                    self.reset()
+                    sleep(5)
+                    continue
+                if not body:
+                    print(f"\x1B[32mOk\x1B[0m {method} {url}")
+                    return res
+                if res.status_code != 200:
+                    print(f"\x1B[31mFailed\x1B[0m {method} {url}")
+                    raise ScraperException(res, f"Request did not succeed: {method} {url}")
+                print(f"\x1B[32mOk\x1B[0m {method} {url}")
+                return res.content
 
     @retry(Exception, tries=5, delay=5)
     def _retry_request(self, method: str, url: str, body: bool) -> bytes | Response:
@@ -49,8 +55,9 @@ class Scraper:
         self._scraper.close()
 
     def reset(self):
-        self._scraper.close()
-        self._scraper = CloudScraper()
+        pass
+        # self._scraper.close()
+        # self._scraper = CloudScraper()
 
     def request(self, method: str, url: str, retry: bool = True, body: bool = True):
         if retry:

View File

@@ -1,6 +1,5 @@
 from datetime import datetime
 from enum import StrEnum
-from concurrent.futures import ThreadPoolExecutor
 import requests
 import tomllib
 import traceback
@@ -125,9 +124,8 @@ if True:
             print(f"{FG.r}{ex}{FG._}")
             print(f"{FG.y}{traceback.format_exc()}{FG._}")
             raise ex
-        with ThreadPoolExecutor(max_workers=netthd) as pool:
-            for post in lst.posts:
-                pool.submit(_add_post, block, post)
+        for post in lst.posts:
+            _add_post(block, post)
         # Increase pid for next iteration
         pid += len(lst.posts)
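
Note: this mirrors the serialization in the first file: posts are added one at a time instead of on up to `netthd` worker threads, so a failure now stops the run at the exact post that caused it. A minimal sketch of the paging pattern around this hunk (`PostList`, the post values, and the `_add_post` body are hypothetical stand-ins):

    from dataclasses import dataclass, field

    @dataclass
    class PostList:
        posts: list[int] = field(default_factory=list)  # stand-in for scraped posts

    def _add_post(block: list[int], post: int) -> None:
        # Hypothetical body: the real script downloads the post and stores it.
        block.append(post)

    block: list[int] = []
    pid = 0
    for lst in [PostList([1, 2, 3]), PostList([4, 5])]:  # one PostList per page
        for post in lst.posts:       # sequential: exceptions propagate immediately
            _add_post(block, post)
        pid += len(lst.posts)        # advance the post id for the next page
    print(block, pid)                # [1, 2, 3, 4, 5] 5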