# py34/py34/scraper.py — 73 lines, 2.1 KiB, Python
import time

from bs4 import BeautifulSoup
from cloudscraper import CloudScraper
from requests import Response
from retry import retry

from .dockid import bs4
class ScraperException(Exception):
    """Raised when an HTTP request did not succeed.

    The failing response object is kept on the exception as
    ``exc.response`` so callers can inspect status code and headers.
    """

    def __init__(self, res: Response, *argv: object):
        # Do NOT pass ``self`` to Exception.__init__ — doing so puts the
        # exception instance itself into ``args`` and garbles str(exc).
        super().__init__(*argv)
        self.response = res
class CaptchaException(Exception):
    """Raised when the site served a captcha instead of content.

    Constructing this exception resets the shared module-level scraper,
    on the assumption that the current session/fingerprint is burned and
    the next request should start with a fresh one.
    """

    def __init__(self, *argv: object):
        # Reset the shared scraper before anything else.
        global scraper
        scraper.reset()
        # Do NOT pass ``self`` to Exception.__init__ — it would end up in
        # ``args`` and garble str(exc).
        super().__init__(*argv)
class Scraper:
    """Thin wrapper around a CloudScraper session.

    Adds two behaviors on top of the raw session:
    - HTTP 429 (rate-limited) responses cause the session to be replaced
      with a fresh one and the request re-issued after a short backoff.
    - Optional retrying (5 tries, 5 s delay) on any exception.
    """

    def __init__(self):
        self._scraper: CloudScraper = CloudScraper()

    def __del__(self):
        # Best-effort cleanup only: __init__ may not have completed (so
        # ``_scraper`` may be missing), and __del__ can run during
        # interpreter teardown — never let it raise.
        try:
            self.close()
        except Exception:
            pass

    def _request(self, method: str, url: str, body: bool) -> bytes | Response:
        """Issue a single request, looping on HTTP 429.

        Returns the response body (bytes) when ``body`` is true, otherwise
        the (closed) Response object itself.

        Raises:
            ScraperException: when ``body`` is true and status is not 200.
        """
        while True:
            res: Response = self._scraper.request(method, url)
            # Safe to close before reading ``content``: non-streaming
            # responses are fully downloaded and cached at this point.
            res.close()
            if res.status_code == 429:
                # Rate-limited: swap in a fresh session and back off
                # briefly instead of hammering the server in a tight loop.
                # 5 s matches the delay used by the @retry decorator below.
                self.reset()
                time.sleep(5)
                continue
            if not body:
                return res
            if res.status_code != 200:
                raise ScraperException(res, f"Request did not succeed: {method} {url}")
            return res.content

    @retry(Exception, tries=5, delay=5)
    def _retry_request(self, method: str, url: str, body: bool) -> bytes | Response:
        """Same as ``_request`` but retried up to 5 times, 5 s apart."""
        return self._request(method, url, body=body)

    def close(self):
        """Close the underlying HTTP session."""
        self._scraper.close()

    def reset(self):
        """Discard the current session and start a brand-new one."""
        self._scraper.close()
        self._scraper = CloudScraper()

    def request(self, method: str, url: str, retry: bool = True, body: bool = True):
        """Perform a request; pass ``retry=False`` to fail fast."""
        if retry:
            return self._retry_request(method, url, body=body)
        return self._request(method, url, body=body)

    def head(self, url: str, retry: bool = True, body: bool = True):
        """HEAD request. NOTE: with body=True a non-200 status raises and
        the returned body is empty bytes — callers likely want body=False."""
        return self.request("HEAD", url, retry=retry, body=body)

    def get(self, url: str, retry: bool = True, body: bool = True):
        """GET request, returning the body bytes by default."""
        return self.request("GET", url, retry=retry, body=body)

    def get_html(self, url: str, retry: bool = True) -> BeautifulSoup:
        """GET ``url`` and parse the body into a BeautifulSoup document."""
        return bs4(self.get(url=url, retry=retry, body=True))
# Default scraper: a single shared module-level instance. CaptchaException
# resets it in place via this module-global reference.
scraper: Scraper = Scraper()