# py34/py34/scraper.py — 73 lines, 2.1 KiB, Python
import time

from bs4 import BeautifulSoup
from cloudscraper import CloudScraper
from requests import Response
from retry import retry

from .dockid import bs4
class ScraperException(Exception):
    """Raised when an HTTP request did not succeed.

    The failing response object is kept on the exception as
    ``exc.response`` so callers can inspect status code and headers.
    """

    def __init__(self, res: Response, *argv: object):
        # Do NOT pass ``self`` to Exception.__init__ — doing so puts the
        # exception instance itself into ``args`` and garbles str(exc).
        super().__init__(*argv)
        self.response = res
class CaptchaException(Exception):
    """Raised when the site served a captcha instead of content.

    Constructing this exception resets the shared module-level scraper,
    on the assumption that the current session/fingerprint is burned and
    the next request should start with a fresh one.
    """

    def __init__(self, *argv: object):
        # Reset the shared scraper before anything else.
        global scraper
        scraper.reset()
        # Do NOT pass ``self`` to Exception.__init__ — it would end up in
        # ``args`` and garble str(exc).
        super().__init__(*argv)
class Scraper:
    """Thin wrapper around a CloudScraper session.

    Adds two behaviors on top of the raw session:
    - HTTP 429 (rate-limited) responses cause the session to be replaced
      with a fresh one and the request re-issued after a short backoff.
    - Optional retrying (5 tries, 5 s delay) on any exception.
    """

    def __init__(self):
        self._scraper: CloudScraper = CloudScraper()

    def __del__(self):
        # Best-effort cleanup only: __init__ may not have completed (so
        # ``_scraper`` may be missing), and __del__ can run during
        # interpreter teardown — never let it raise.
        try:
            self.close()
        except Exception:
            pass

    def _request(self, method: str, url: str, body: bool) -> bytes | Response:
        """Issue a single request, looping on HTTP 429.

        Returns the response body (bytes) when ``body`` is true, otherwise
        the (closed) Response object itself.

        Raises:
            ScraperException: when ``body`` is true and status is not 200.
        """
        while True:
            res: Response = self._scraper.request(method, url)
            # Safe to close before reading ``content``: non-streaming
            # responses are fully downloaded and cached at this point.
            res.close()
            if res.status_code == 429:
                # Rate-limited: swap in a fresh session and back off
                # briefly instead of hammering the server in a tight loop.
                # 5 s matches the delay used by the @retry decorator below.
                self.reset()
                time.sleep(5)
                continue
            if not body:
                return res
            if res.status_code != 200:
                raise ScraperException(res, f"Request did not succeed: {method} {url}")
            return res.content

    @retry(Exception, tries=5, delay=5)
    def _retry_request(self, method: str, url: str, body: bool) -> bytes | Response:
        """Same as ``_request`` but retried up to 5 times, 5 s apart."""
        return self._request(method, url, body=body)

    def close(self):
        """Close the underlying HTTP session."""
        self._scraper.close()

    def reset(self):
        """Discard the current session and start a brand-new one."""
        self._scraper.close()
        self._scraper = CloudScraper()

    def request(self, method: str, url: str, retry: bool = True, body: bool = True):
        """Perform a request; pass ``retry=False`` to fail fast."""
        if retry:
            return self._retry_request(method, url, body=body)
        return self._request(method, url, body=body)

    def head(self, url: str, retry: bool = True, body: bool = True):
        """HEAD request. NOTE: with body=True a non-200 status raises and
        the returned body is empty bytes — callers likely want body=False."""
        return self.request("HEAD", url, retry=retry, body=body)

    def get(self, url: str, retry: bool = True, body: bool = True):
        """GET request, returning the body bytes by default."""
        return self.request("GET", url, retry=retry, body=body)

    def get_html(self, url: str, retry: bool = True) -> BeautifulSoup:
        """GET ``url`` and parse the body into a BeautifulSoup document."""
        return bs4(self.get(url=url, retry=retry, body=True))
# Default scraper: a single shared module-level instance. CaptchaException
# resets it in place via this module-global reference.
scraper: Scraper = Scraper()