13 Commits

3 changed files with 23 additions and 20 deletions

View File

@@ -2,7 +2,6 @@ from .post import Post
 from .scraper import scraper, ScraperException, CaptchaException
 from .url import parse_thumbnail_url, ListURL
 from .dockid import is_nochick, is_toodeep, is_captcha
-from concurrent.futures import ThreadPoolExecutor
 import urllib.parse
 from retry import retry
@@ -67,9 +66,8 @@ class List:
             # Download thumbnails
             if fetch_thumbnails:
-                with ThreadPoolExecutor(max_workers=5) as pool:
-                    for post in self.posts:
-                        pool.submit(Post.get_thumbnail_data, post)
+                for post in self.posts:
+                    post.get_thumbnail_data()
         except ScraperException as ex:
             raise ex
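
Note on this change: the removed version passed `Post.get_thumbnail_data` to `pool.submit()` without keeping the returned futures, so an exception raised in a worker was stored in a discarded `Future` and never reached the enclosing `try`/`except`; the sequential loop lets `ScraperException` propagate, which is one plausible reason for serializing. A minimal, self-contained sketch of that pitfall (the `fetch` function is a hypothetical stand-in for `Post.get_thumbnail_data`):

    from concurrent.futures import ThreadPoolExecutor

    def fetch(i: int) -> None:
        # Hypothetical stand-in for Post.get_thumbnail_data; one download fails.
        if i == 3:
            raise RuntimeError(f"download {i} failed")

    # Old pattern: the Future returned by submit() is discarded, so the
    # exception it holds is silently dropped and this block "succeeds".
    with ThreadPoolExecutor(max_workers=5) as pool:
        for i in range(5):
            pool.submit(fetch, i)
    print("no error surfaced")

    # Keeping the futures and calling result() re-raises worker exceptions,
    # which the new sequential loop gets for free.
    with ThreadPoolExecutor(max_workers=5) as pool:
        futures = [pool.submit(fetch, i) for i in range(5)]
    try:
        for f in futures:
            f.result()
    except RuntimeError as ex:
        print(f"surfaced: {ex}")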

View File

@@ -2,6 +2,7 @@ from .dockid import bs4
 from cloudscraper import CloudScraper
 from requests import Response
 from retry import retry
+from time import sleep
 from bs4 import BeautifulSoup
@@ -30,16 +31,21 @@ class Scraper:
     def _request(self, method: str, url: str, body: bool) -> bytes | Response:
         while True:
-            res: Response = self._scraper.request(method, url)
-            res.close()
-            if res.status_code == 429:
-                self.reset()
-                continue
-            if not body:
-                return res
-            if res.status_code != 200:
-                raise ScraperException(res, f"Request did not succeed: {method} {url}")
-            return res.content
+            print(f"{method} {url}")
+            with self._scraper.request(method, url) as res:
+                if res.status_code == 429:
+                    print(f"\x1B[33mOverloaded\x1B[0m {method} {url}")
+                    self.reset()
+                    sleep(5)
+                    continue
+                if not body:
+                    print(f"\x1B[32mOk\x1B[0m {method} {url}")
+                    return res
+                if res.status_code != 200:
+                    print(f"\x1B[31mFailed\x1B[0m {method} {url}")
+                    raise ScraperException(res, f"Request did not succeed: {method} {url}")
+                print(f"\x1B[32mOk\x1B[0m {method} {url}")
+                return res.content
 
     @retry(Exception, tries=5, delay=5)
     def _retry_request(self, method: str, url: str, body: bool) -> bytes | Response:
@@ -49,8 +55,9 @@ class Scraper:
         self._scraper.close()
 
     def reset(self):
-        self._scraper.close()
-        self._scraper = CloudScraper()
+        pass
+        # self._scraper.close()
+        # self._scraper = CloudScraper()
 
     def request(self, method: str, url: str, retry: bool = True, body: bool = True):
         if retry:

View File

@@ -1,6 +1,5 @@
 from datetime import datetime
 from enum import StrEnum
-from concurrent.futures import ThreadPoolExecutor
 import requests
 import tomllib
 import traceback
@@ -125,9 +124,8 @@ if True:
             print(f"{FG.r}{ex}{FG._}")
             print(f"{FG.y}{traceback.format_exc()}{FG._}")
             raise ex
-        with ThreadPoolExecutor(max_workers=netthd) as pool:
-            for post in lst.posts:
-                pool.submit(_add_post, block, post)
+        for post in lst.posts:
+            _add_post(block, post)
         # Increase pid for next iteration
         pid += len(lst.posts)
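
Note: this mirrors the serialization in the first file: posts are added one at a time instead of on up to `netthd` worker threads, so a failure now stops the run at the exact post that caused it. A minimal sketch of the paging pattern around this hunk (`PostList`, the post values, and the `_add_post` body are hypothetical stand-ins):

    from dataclasses import dataclass, field

    @dataclass
    class PostList:
        posts: list[int] = field(default_factory=list)  # stand-in for scraped posts

    def _add_post(block: list[int], post: int) -> None:
        # Hypothetical body: the real script downloads the post and stores it.
        block.append(post)

    block: list[int] = []
    pid = 0
    for lst in [PostList([1, 2, 3]), PostList([4, 5])]:  # one PostList per page
        for post in lst.posts:       # sequential: exceptions propagate immediately
            _add_post(block, post)
        pid += len(lst.posts)        # advance the post id for the next page
    print(block, pid)                # [1, 2, 3, 4, 5] 5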