73 lines
2.1 KiB
Python
73 lines
2.1 KiB
Python
from .dockid import bs4
|
|
from cloudscraper import CloudScraper
|
|
from requests import Response
|
|
from retry import retry
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
class ScraperException(Exception):
    """Error that carries the HTTP response it relates to.

    Attributes:
        response: The response object passed to the constructor.
    """

    def __init__(self, res: "Response", *argv: object):
        """Create the exception.

        Args:
            res: Response associated with the failure; stored on
                ``self.response``.
            *argv: Regular exception arguments (typically a message).
        """
        # Forward only the caller's arguments. Passing ``self`` as well would
        # make the exception its own first argument, so ``args[0]`` and
        # ``str(exc)`` would no longer be the message.
        super().__init__(*argv)
        self.response = res
|
|
|
|
|
|
class CaptchaException(Exception):
    """Exception whose construction resets the module-level ``scraper``.

    NOTE(review): the reset happens when the exception is *instantiated*,
    not when it is raised or caught.
    """

    def __init__(self, *argv: object):
        """Reset the default scraper, then build the exception.

        Args:
            *argv: Regular exception arguments (typically a message).
        """
        # Reset the shared scraper before constructing the exception. The
        # name is only read here, so no ``global`` declaration is required.
        scraper.reset()

        # Forward only the caller's arguments. Passing ``self`` as well would
        # make the exception its own first argument and garble ``str(exc)``.
        super().__init__(*argv)
|
|
|
|
|
|
class Scraper:
    """Wrapper around a ``CloudScraper`` session.

    Recreates the underlying session when a request answers with HTTP 429,
    optionally retries failed requests through the ``retry`` decorator, and
    offers helpers returning either the response, its body, or parsed HTML.
    """

    def __init__(self):
        """Create the underlying ``CloudScraper`` session."""
        self._scraper: CloudScraper = CloudScraper()

    def __del__(self):
        """Close the underlying session when the wrapper is finalized."""
        # The finalizer also runs when ``__init__`` raised before
        # ``_scraper`` was assigned; skip the close in that case instead of
        # raising AttributeError from ``__del__``.
        if getattr(self, "_scraper", None) is not None:
            self.close()

    def _request(self, method: str, url: str, body: bool) -> bytes | Response:
        """Perform one request, recreating the session on HTTP 429.

        Args:
            method: HTTP method passed to the underlying session.
            url: Target URL.
            body: When true, return ``res.content`` and require status 200;
                when false, return the response object whatever its status
                (other than 429).

        Returns:
            The response content if ``body`` is true, else the response.

        Raises:
            ScraperException: If ``body`` is true and the status is not 200.
        """
        # NOTE(review): this loop has no delay and no attempt limit, so it
        # keeps issuing requests for as long as the server answers 429.
        while True:
            res: Response = self._scraper.request(method, url)
            # The response is closed right away; status and content are read
            # from the object afterwards. Presumably the body was already
            # read by the (non-streaming) request -- TODO confirm.
            res.close()

            if res.status_code == 429:
                # Start over with a brand-new session and try again.
                self.reset()
                continue

            if not body:
                return res

            if res.status_code != 200:
                raise ScraperException(res, f"Request did not succeed: {method} {url}")

            return res.content

    @retry(Exception, tries=5, delay=5)
    def _retry_request(self, method: str, url: str, body: bool) -> bytes | Response:
        """Call ``_request`` under ``retry(Exception, tries=5, delay=5)``."""
        return self._request(method, url, body=body)

    def close(self):
        """Close the underlying session."""
        self._scraper.close()

    def reset(self):
        """Close the current session and replace it with a new one."""
        self._scraper.close()
        self._scraper = CloudScraper()

    def request(
        self, method: str, url: str, retry: bool = True, body: bool = True
    ) -> bytes | Response:
        """Send a request, optionally through the retrying wrapper.

        Args:
            method: HTTP method.
            url: Target URL.
            retry: Use ``_retry_request`` when true, ``_request`` otherwise.
                (Inside this method the parameter shadows the imported
                ``retry`` decorator; the decorator is not used here.)
            body: Forwarded to ``_request``; selects content vs. response.

        Returns:
            The response content if ``body`` is true, else the response.
        """
        if retry:
            return self._retry_request(method, url, body=body)
        return self._request(method, url, body=body)

    def head(self, url: str, retry: bool = True, body: bool = True) -> bytes | Response:
        """Send a ``HEAD`` request; see ``request`` for the parameters."""
        return self.request("HEAD", url, retry=retry, body=body)

    def get(self, url: str, retry: bool = True, body: bool = True) -> bytes | Response:
        """Send a ``GET`` request; see ``request`` for the parameters."""
        return self.request("GET", url, retry=retry, body=body)

    def get_html(self, url: str, retry: bool = True) -> BeautifulSoup:
        """Fetch ``url`` with ``GET`` and pass the body to the ``bs4`` helper."""
        return bs4(self.get(url=url, retry=retry, body=True))
|
|
|
|
|
|
# Shared module-level Scraper instance, created at import time.
# CaptchaException resets this instance when it is constructed.
scraper: Scraper = Scraper()
|