from datetime import datetime
from enum import StrEnum
import requests
import tomllib
import traceback
import gc

import py34
import scraper.block
import scraper.job


_spinner = 0


# Simple textual spinner; currently unused in this file.
def spinner() -> str:
    global _spinner
    _spinner += 1
    return "/-\\|"[_spinner % 4]


# ANSI foreground color codes; FG._ resets the color.
class FG(StrEnum):
    _ = "\x1B[0m"
    r = "\x1B[31m"
    g = "\x1B[32m"
    y = "\x1B[33m"
    b = "\x1B[34m"
    p = "\x1B[35m"
    c = "\x1B[36m"


def web_get(url: str) -> bytes:
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception(f"Failed to request {url} (HTTP {res.status_code})")
    content = res.content
    res.close()
    return content


def web_post(url: str, data: bytes) -> None:
    res = requests.post(url, data)
    if res.status_code != 200:
        raise Exception(f"Failed to request {url} (HTTP {res.status_code})")
    res.close()


# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
    config = tomllib.load(file)
netthd = config["client"]["network-threads"]  # only reported below, not otherwise used in this file
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]
if not server.endswith("/"):
    server += "/"

# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server:         ", server)
print("check missing:  ", check_missing)
print("patch failed:   ", patch_failed)
print("=========================================================")

# Enter main loop
# while True:  # It leaks, idk how to fix it, restart program for now :p
if True:
    # Fetch a job
    print("Requesting job...", end="")
    job = scraper.job.loads(web_get(server + "job"))

    # Print job stats
    deadline_date = datetime.fromtimestamp(job.deadline)
    print(
        f"\rGot work from {FG.c}{job.start}{FG._}"
        + f" to {FG.c}{job.start + job.size - 1}{FG._},"
        + f" with deadline at {FG.p}{deadline_date}{FG._}"
    )

    # Prepare a block
    block = scraper.block.Block(job.start, job.size)

    # Track which post IDs in the job range have been seen on a listing page
    expected: dict[int, bool] = {}
    for n in range(block.start, block.start + block.size):
        expected[n] = False

    # Start scraping
    pid = 0
    end = False
    failed: list[py34.Post] = []

    # Add a post to the block, fetching extensions in the process
    def _add_post(block: scraper.block.Block, post: py34.Post):
        try:
            expected[post.id] = True
            entry = scraper.block.BlockEntry(post, None)
            block.add_entry(entry)
            if entry.type == scraper.block.BlockEntryType.ERROR:
                failed.append(post)
            print(
                f"\rPage {FG.b}{pid}{FG._}"
                + f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}"
                + (f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
                end="",
            )
        except Exception as ex:
            print(f"{FG.r}{ex}{FG._}")
            print(f"{FG.y}{traceback.format_exc()}{FG._}")
            raise ex

    while not end:
        # Get a listing page
        # print(f"Downloading listing page {FG.b}{pid}{FG._}")
        lst = py34.List(["id:<" + str(job.start + job.size)], pid)

        # Guard against an empty page, which would otherwise loop forever
        if not lst.posts:
            break

        # Check if we have reached the end
        for post in lst.posts:
            if post.id <= job.start:
                end = True
        # If so, remove posts that fall below our job range
        if end:
            lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))

        for post in lst.posts:
            _add_post(block, post)

        # Increase pid for the next iteration
        pid += len(lst.posts)

        # Clean leaking connection handles
        py34.scraper.scraper.reset()
        gc.collect()
    print()

    # Patch failed block entries
    if patch_failed:
        for post in failed:
            print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
            try:
                view = py34.View(post.id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
                print(f" {FG.r}Entry does not exist.{FG._}")
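    # Note: "missing" below means IDs inside [job.start, job.start + job.size)
    # that never appeared on any listing page (typically deleted posts).
    # check_missing decides whether each one is confirmed with a View request
    # or just recorded as MAYBE_MISSING.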
    # Find missing block entries
    missing: list[int] = []
    for post_id in expected:
        if expected[post_id]:
            continue
        missing.append(post_id)

    if check_missing:
        found: int = 0
        for n, post_id in enumerate(missing):
            print(
                f"\x1B[2K\rSearching for missing entries"
                + f" {FG.y}{n + 1}/{len(missing)}{FG._}"
                + f", found {FG.g}{found}{FG._}"
                + f", missing {FG.r}{len(missing) - found}{FG._}",
                end="",
            )
            try:
                view = py34.View(post_id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                found += 1
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
        print()
    else:
        for post_id in missing:
            block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)

    # Send the block to the server
    print(f"Block: {len(block.entries)}/{block.size}")
    print("Sending block to server...", end="")
    try:
        web_post(server + "block", scraper.block.dumps(block))
        print(f"{FG.g}Ok{FG._}")
    except Exception as ex:
        print(f"{FG.r}Fail: {ex}{FG._}")
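# ---------------------------------------------------------------------------
# Example config.toml (a sketch: the keys match the lookups above, but the
# values are placeholder assumptions, not known defaults):
#
#   [client]
#   network-threads = 4
#   server = "http://localhost:8080/"
#   check-missing = true
#   patch-failed = true
#
# Since the main loop stays disabled because of the leak noted above, one
# stopgap (an assumption, not part of this project) is to restart the script
# externally, e.g. from a shell, where "client.py" is a placeholder name:
#
#   while true; do python3 client.py; done
# ---------------------------------------------------------------------------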