Compare commits
13 commits: `scraper...scraper-de`

| Author | SHA1 | Date |
|---|---|---|
|  | ac8aa3418a |  |
|  | 889e9b69a8 |  |
|  | 66bef9279f |  |
|  | 97a63937df |  |
|  | 9247ab28ce |  |
|  | 5cedbb2526 |  |
|  | bf9237750b |  |
|  | a3dca99a5d |  |
|  | bb7d010694 |  |
|  | ee236445c2 |  |
|  | f0efc80fd8 |  |
|  | 54fc8fc213 |  |
|  | 64908f298e |  |

```diff
@@ -2,7 +2,6 @@ from .post import Post
 from .scraper import scraper, ScraperException, CaptchaException
 from .url import parse_thumbnail_url, ListURL
 from .dockid import is_nochick, is_toodeep, is_captcha
-from concurrent.futures import ThreadPoolExecutor
 import urllib.parse
 from retry import retry
 
@@ -67,9 +66,8 @@ class List:
 
             # Download thumbnails
             if fetch_thumbnails:
-                with ThreadPoolExecutor(max_workers=5) as pool:
-                    for post in self.posts:
-                        pool.submit(Post.get_thumbnail_data, post)
+                for post in self.posts:
+                    post.get_thumbnail_data()
 
         except ScraperException as ex:
             raise ex
```
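
A note on the second hunk above: `ThreadPoolExecutor.submit()` stores any exception raised by the submitted callable in the returned `Future` instead of raising it, so a failure inside `Post.get_thumbnail_data` never reached the surrounding `except ScraperException as ex:` block, while the direct `post.get_thumbnail_data()` call raises straight into it. A minimal sketch of that difference, using a hypothetical `Job` class rather than the project's `Post`:

```python
from concurrent.futures import ThreadPoolExecutor


class Job:
    """Hypothetical stand-in for Post, used only to illustrate Future behaviour."""

    def run(self):
        raise RuntimeError("boom")


job = Job()

# Old pattern: the exception is captured in the Future, not raised here.
with ThreadPoolExecutor(max_workers=5) as pool:
    future = pool.submit(Job.run, job)  # unbound method + instance, as in the removed line
print(future.exception())  # the RuntimeError sits in the Future unless result() is called

# New pattern: the exception propagates straight into the caller's try/except.
try:
    job.run()
except RuntimeError as ex:
    print(f"caught: {ex}")
```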

```diff
@@ -2,6 +2,7 @@ from .dockid import bs4
 from cloudscraper import CloudScraper
 from requests import Response
 from retry import retry
+from time import sleep
 from bs4 import BeautifulSoup
 
 
```

```diff
@@ -30,15 +31,20 @@ class Scraper:
 
     def _request(self, method: str, url: str, body: bool) -> bytes | Response:
         while True:
-            res: Response = self._scraper.request(method, url)
-            res.close()
-            if res.status_code == 429:
-                self.reset()
-                continue
-            if not body:
-                return res
-            if res.status_code != 200:
-                raise ScraperException(res, f"Request did not succeed: {method} {url}")
-            return res.content
+            print(f"{method} {url}")
+            with self._scraper.request(method, url) as res:
+                if res.status_code == 429:
+                    print(f"\x1B[33mOverloaded\x1B[0m {method} {url}")
+                    self.reset()
+                    sleep(5)
+                    continue
+                if not body:
+                    print(f"\x1B[32mOk\x1B[0m {method} {url}")
+                    return res
+                if res.status_code != 200:
+                    print(f"\x1B[31mFailed\x1B[0m {method} {url}")
+                    raise ScraperException(res, f"Request did not succeed: {method} {url}")
+                print(f"\x1B[32mOk\x1B[0m {method} {url}")
+                return res.content
 
     @retry(Exception, tries=5, delay=5)
```
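
The rewritten `_request` above combines three things: every request is logged, the `requests.Response` is handled as a context manager so the connection is released when the block exits, and an HTTP 429 now triggers `reset()` plus a five second `sleep()` before the loop retries; `\x1B[33m`, `\x1B[31m` and `\x1B[32m` are the ANSI yellow, red and green foreground escapes. A self-contained sketch of the same retry-on-429 shape using plain `requests` (the function name, URL and exception type are placeholders, not the project's):

```python
import time

import requests


def fetch(method: str, url: str) -> bytes:
    """Keep requesting until the server stops answering 429, then return the body."""
    while True:
        print(f"{method} {url}")
        # Response is a context manager: the underlying connection is released on exit.
        with requests.request(method, url) as res:
            if res.status_code == 429:
                print(f"\x1B[33mOverloaded\x1B[0m {method} {url}")  # yellow
                time.sleep(5)  # back off before retrying
                continue
            if res.status_code != 200:
                print(f"\x1B[31mFailed\x1B[0m {method} {url}")  # red
                raise RuntimeError(f"Request did not succeed: {method} {url}")
            print(f"\x1B[32mOk\x1B[0m {method} {url}")  # green
            return res.content


# fetch("GET", "https://example.com/")  # placeholder target
```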

```diff
@@ -49,8 +55,9 @@ class Scraper:
         self._scraper.close()
 
     def reset(self):
-        self._scraper.close()
-        self._scraper = CloudScraper()
+        pass
+        # self._scraper.close()
+        # self._scraper = CloudScraper()
 
     def request(self, method: str, url: str, retry: bool = True, body: bool = True):
         if retry:
```
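
`reset()` is stubbed out here: the old body closed the `CloudScraper` session and created a fresh one, the new body keeps the session and both statements survive only as comments. The surrounding context (`from retry import retry` in the imports and the `@retry(Exception, tries=5, delay=5)` line above) uses the `retry` package from PyPI, which re-runs the wrapped callable whenever the listed exception escapes, up to `tries` attempts with `delay` seconds between them. A small sketch with a hypothetical flaky function:

```python
import random

from retry import retry


# Same decorator as in the diff: re-run on any Exception, up to 5 attempts, 5 s apart.
@retry(Exception, tries=5, delay=5)
def flaky_fetch() -> str:
    """Hypothetical operation used only to demonstrate the decorator."""
    if random.random() < 0.7:
        raise ConnectionError("transient failure")
    return "ok"


# print(flaky_fetch())  # retried transparently; still raises if all 5 attempts fail
```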

```diff
@@ -1,6 +1,5 @@
 from datetime import datetime
 from enum import StrEnum
-from concurrent.futures import ThreadPoolExecutor
 import requests
 import tomllib
 import traceback
@@ -125,9 +124,8 @@ if True:
             print(f"{FG.r}{ex}{FG._}")
             print(f"{FG.y}{traceback.format_exc()}{FG._}")
             raise ex
-        with ThreadPoolExecutor(max_workers=netthd) as pool:
-            for post in lst.posts:
-                pool.submit(_add_post, block, post)
+        for post in lst.posts:
+            _add_post(block, post)
 
         # Increase pid for next iteration
         pid += len(lst.posts)
```
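
The context lines in the last hunk color their error output with `FG.r`, `FG.y` and `FG._`, whereas the scraper hunk writes the ANSI escapes literally. The definition of `FG` is not part of this compare; given the `from enum import StrEnum` import kept at the top of that file, it is presumably a small `StrEnum` of ANSI foreground codes, roughly like the guess below (not the repository's actual definition):

```python
from enum import StrEnum  # Python 3.11+


class FG(StrEnum):
    """Guessed shape of FG: ANSI foreground colors plus a reset code."""
    r = "\x1B[31m"  # red
    y = "\x1B[33m"  # yellow
    _ = "\x1B[0m"   # reset


# StrEnum members format as their string value, so this prints "error" in red:
# print(f"{FG.r}error{FG._}")
```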