from datetime import datetime
from enum import StrEnum
from concurrent.futures import ThreadPoolExecutor
import requests
import tomllib
import traceback
import gc

import py34
import scraper.block  # Block, BlockEntry, BlockEntryType, ext2enttype and dumps are used below
import scraper.job


# Tiny console spinner: each call returns the next frame of "/-\|".
_spinner = 0


def spinner() -> str:
    global _spinner
    _spinner += 1
    return "/-\\|"[_spinner % 4]


# ANSI foreground colour escape codes; FG._ resets the colour.
class FG(StrEnum):
    _ = "\x1B[0m"
    r = "\x1B[31m"
    g = "\x1B[32m"
    y = "\x1B[33m"
    b = "\x1B[34m"
    p = "\x1B[35m"
    c = "\x1B[36m"


def web_get(url: str) -> bytes:
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    content = res.content
    res.close()
    return content


def web_post(url: str, data: bytes) -> None:
    res = requests.post(url, data)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    res.close()

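
# As used below, the work server exposes two endpoints:
#   GET  {server}job    -> a serialized job, parsed with scraper.job.loads()
#   POST {server}block  -> upload of the finished block from scraper.block.dumps()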

# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

netthd = config["client"]["network-threads"]
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]

if server[-1] != "/":
    server += "/"
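
# For reference, config.toml is expected to look roughly like this;
# the keys are the ones read above, the values are only example guesses:
#
#   [client]
#   network-threads = 8
#   server = "http://localhost:8080/"
#   check-missing = true
#   patch-failed = true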


# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server:         ", server)
print("check missing:  ", check_missing)
print("patch failed:   ", patch_failed)
print("=========================================================")


# Enter main loop
# while True:
# It leaks, idk how to fix it, restart program for now :p
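# Until the leak is tracked down, one run handles a single job; for continuous
# scraping, restart the script from an outer shell loop, e.g.
# `while true; do python3 client.py; done` (the file name is assumed here).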
if True:
    # Fetch a job
    print("Requesting job...", end="")
    job = scraper.job.loads(web_get(server + "job"))

    # Print job stats
    deadline_date = datetime.fromtimestamp(job.deadline)
    print(
        f"\rGot work from {FG.c}{job.start}{FG._}" +
        f" to {FG.c}{job.start + job.size - 1}{FG._}," +
        f" with deadline at {FG.p}{deadline_date}{FG._}"
    )

    # Prepare a block
    block = scraper.block.Block(job.start, job.size)

    # Prepare a dictionary of expected post IDs (post id -> seen yet?)
    expected: dict[int, bool] = {}
    for n in range(block.start, block.start + block.size):
        expected[n] = False

    # Start scraping
    pid = 0
    end = False
    failed: list[py34.Post] = []
    while not end:
        # Get a list
        # print(f"Downloading listing page {FG.b}{pid}{FG._}")
        lst = py34.List(["id:<" + str(job.start + job.size)], pid)

        # Check if we have reached the end of the job range
        for post in lst.posts:
            if post.id <= job.start:
                end = True

        # If so, drop the posts that fall outside (below) the job range
        if end:
            lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))

        # Add posts to block, fetching extensions in the process
        def _add_post(block: scraper.block.Block, post: py34.Post):
            try:
                expected[post.id] = True
                entry = scraper.block.BlockEntry(post, None)
                block.add_entry(entry)
                if entry.type == scraper.block.BlockEntryType.ERROR:
                    failed.append(post)
                print(
                    f"\rPage {FG.b}{pid}{FG._}" +
                    f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}" +
                    (f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
                    end="",
                )
            except Exception as ex:
                print(f"{FG.r}{ex}{FG._}")
                print(f"{FG.y}{traceback.format_exc()}{FG._}")
                raise

        with ThreadPoolExecutor(max_workers=netthd) as pool:
            for post in lst.posts:
                pool.submit(_add_post, block, post)

        # Increase pid for next iteration
        pid += len(lst.posts)

        # Clean leaking connection handles
        py34.scraper.scraper.reset()
        gc.collect()
    print()

    # Patch failed block entries
    if patch_failed:
        for post in failed:
            print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
            try:
                view = py34.View(post.id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
                print(f" {FG.r}Entry does not exist.{FG._}")

    # Find missing block entries
    missing: list[int] = []
    for post_id in expected:
        if expected[post_id]:
            continue
        missing.append(post_id)

    if check_missing:
        found: int = 0
        for n, post_id in enumerate(missing):
            print(
                f"\x1B[2K\rSearching for missing entries" +
                f" {FG.y}{n+1}/{len(missing)}{FG._}" +
                f", found {FG.g}{found}{FG._}" +
                f", missing {FG.r}{len(missing)-found}{FG._}",
                end="",
            )
            try:
                view = py34.View(post_id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                found += 1
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
        print()
    else:
        for post_id in missing:
            block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)

    # Send block to server
    print(f"Block: {len(block.entries)}/{block.size}")
    print("Sending block to server...", end="")
    try:
        web_post(server + "block", scraper.block.dumps(block))
        print(f"{FG.g}Ok{FG._}")
    except Exception as ex:
        print(f"{FG.r}Fail{FG._} {ex}")