Compare commits
15 Commits
f55f118271
...
scraper-de
| Author | SHA1 | Date | |
|---|---|---|---|
| | ac8aa3418a | | |
| | 3ead328a0f | | |
| | fb90935d42 | | |
| | 889e9b69a8 | | |
| | 66bef9279f | | |
| | 97a63937df | | |
| | 9247ab28ce | | |
| | 5cedbb2526 | | |
| | bf9237750b | | |
| | a3dca99a5d | | |
| | bb7d010694 | | |
| | ee236445c2 | | |
| | f0efc80fd8 | | |
| | 54fc8fc213 | | |
| | 64908f298e | | |
@@ -2,7 +2,6 @@ from .post import Post
|
||||
from .scraper import scraper, ScraperException, CaptchaException
|
||||
from .url import parse_thumbnail_url, ListURL
|
||||
from .dockid import is_nochick, is_toodeep, is_captcha
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import urllib.parse
|
||||
from retry import retry
|
||||
|
||||
@@ -67,9 +66,8 @@ class List:
|
||||
|
||||
# Download thumbnails
|
||||
if fetch_thumbnails:
|
||||
with ThreadPoolExecutor(max_workers=5) as pool:
|
||||
for post in self.posts:
|
||||
pool.submit(Post.get_thumbnail_data, post)
|
||||
for post in self.posts:
|
||||
post.get_thumbnail_data()
|
||||
|
||||
except ScraperException as ex:
|
||||
raise ex
|
||||
|
||||
@@ -2,6 +2,7 @@ from .dockid import bs4
|
||||
from cloudscraper import CloudScraper
|
||||
from requests import Response
|
||||
from retry import retry
|
||||
from time import sleep
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
@@ -30,16 +31,21 @@ class Scraper:
|
||||
|
||||
def _request(self, method: str, url: str, body: bool) -> bytes | Response:
|
||||
while True:
|
||||
res: Response = self._scraper.request(method, url)
|
||||
res.close()
|
||||
if res.status_code == 429:
|
||||
self.reset()
|
||||
continue
|
||||
if not body:
|
||||
return res
|
||||
if res.status_code != 200:
|
||||
raise ScraperException(res, f"Request did not succeed: {method} {url}")
|
||||
return res.content
|
||||
print(f"{method} {url}")
|
||||
with self._scraper.request(method, url) as res:
|
||||
if res.status_code == 429:
|
||||
print(f"\x1B[33mOverloaded\x1B[0m {method} {url}")
|
||||
self.reset()
|
||||
sleep(5)
|
||||
continue
|
||||
if not body:
|
||||
print(f"\x1B[32mOk\x1B[0m {method} {url}")
|
||||
return res
|
||||
if res.status_code != 200:
|
||||
print(f"\x1B[31mFailed\x1B[0m {method} {url}")
|
||||
raise ScraperException(res, f"Request did not succeed: {method} {url}")
|
||||
print(f"\x1B[32mOk\x1B[0m {method} {url}")
|
||||
return res.content
|
||||
|
||||
@retry(Exception, tries=5, delay=5)
|
||||
def _retry_request(self, method: str, url: str, body: bool) -> bytes | Response:
|
||||
@@ -49,8 +55,9 @@ class Scraper:
|
||||
self._scraper.close()
|
||||
|
||||
def reset(self):
|
||||
self._scraper.close()
|
||||
self._scraper = CloudScraper()
|
||||
pass
|
||||
# self._scraper.close()
|
||||
# self._scraper = CloudScraper()
|
||||
|
||||
def request(self, method: str, url: str, retry: bool = True, body: bool = True):
|
||||
if retry:
|
||||
|
||||
@@ -2,7 +2,7 @@ from urllib.parse import urlparse, quote_plus
|
||||
from os.path import splitext
|
||||
|
||||
|
||||
IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif", "mp4", "webm"]
|
||||
IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif"]
|
||||
VIDEO_FORMATS = ["mp4", "webm"]
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import requests
|
||||
import tomllib
|
||||
import traceback
|
||||
@@ -125,9 +124,8 @@ if True:
|
||||
print(f"{FG.r}{ex}{FG._}")
|
||||
print(f"{FG.y}{traceback.format_exc()}{FG._}")
|
||||
raise ex
|
||||
with ThreadPoolExecutor(max_workers=netthd) as pool:
|
||||
for post in lst.posts:
|
||||
pool.submit(_add_post, block, post)
|
||||
for post in lst.posts:
|
||||
_add_post(block, post)
|
||||
|
||||
# Increase pid for next iteration
|
||||
pid += len(lst.posts)
|
||||
|
||||
Reference in New Issue
Block a user