py34/scraper/client/__main__.py

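"""Scraper worker client.

Fetches a job (a contiguous range of post IDs) from the server named in
config.toml, scrapes that range into a block, optionally patches failed
entries and checks missing ones, then posts the finished block back.
"""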
from datetime import datetime
from enum import StrEnum
import gc
import tomllib
import traceback

import requests

import py34
# Submodules referenced by attribute below; imported explicitly in case the
# py34 package does not import them itself.
import py34.scraper
import py34.url
import py34.view
import scraper.block
import scraper.job


_spinner = 0


# Rotating ASCII spinner frame; advances one step per call (currently unused).
def spinner() -> str:
    global _spinner
    _spinner += 1
    return "/-\\|"[_spinner % 4]


# ANSI foreground color escape codes; FG._ is the style reset sequence.
class FG(StrEnum):
    _ = "\x1B[0m"
    r = "\x1B[31m"
    g = "\x1B[32m"
    y = "\x1B[33m"
    b = "\x1B[34m"
    p = "\x1B[35m"
    c = "\x1B[36m"


def web_get(url: str) -> bytes:
    """GET a URL and return the raw response body, raising on non-200."""
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    content = res.content
    res.close()
    return content


def web_post(url: str, data: bytes) -> None:
    """POST raw bytes to a URL, raising on non-200."""
    res = requests.post(url, data)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    res.close()


# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
    config = tomllib.load(file)
netthd = config["client"]["network-threads"]
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]
if not server.endswith("/"):
    server += "/"
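# Example config.toml (values illustrative; these keys are the ones read above):
#
#   [client]
#   network-threads = 4
#   server = "http://localhost:8080"
#   check-missing = true
#   patch-failed = true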
# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server: ", server)
print("check missing: ", check_missing)
print("patch failed: ", patch_failed)
print("=========================================================")
# Enter main loop
# while True:
# The loop body leaks memory and the cause is unknown, so for now each process
# runs a single job; restart the program to take another.
if True:
    # Fetch a job
    print("Requesting job...", end="")
    job = scraper.job.loads(web_get(server + "job"))
    # Print job stats
    deadline_date = datetime.fromtimestamp(job.deadline)
    print(
        f"\rGot work from {FG.c}{job.start}{FG._}"
        f" to {FG.c}{job.start + job.size - 1}{FG._},"
        f" with deadline at {FG.p}{deadline_date}{FG._}"
    )
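    # A Block appears to hold one entry per post ID in
    # [job.start, job.start + job.size), filled in as posts are scraped.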
    # Prepare a block
    block = scraper.block.Block(job.start, job.size)
    # Track which post IDs in the job range have been seen in listings
    expected: dict[int, bool] = {}
    for n in range(block.start, block.start + block.size):
        expected[n] = False
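    # Any ID still False once listing finishes was never returned by a
    # listing page and is handled by the missing-entry pass below.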
    # Start scraping
    pid = 0
    end = False
    failed: list[py34.Post] = []
    while not end:
        # Get a listing page, restricted to IDs below the end of our job range
        # print(f"Downloading listing page {FG.b}{pid}{FG._}")
        lst = py34.List(["id:<" + str(job.start + job.size)], pid)
        # Check if we have reached the end of the job range
        for post in lst.posts:
            if post.id <= job.start:
                end = True
        # If so, drop posts that fall below the start of our job range
        if end:
            lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))
# Add posts to block, fetching extensions in the process
def _add_post(block: scraper.block.Block, post: py34.Post):
try:
expected[post.id] = True
entry = scraper.block.BlockEntry(post, None)
block.add_entry(entry)
if entry.type == scraper.block.BlockEntryType.ERROR:
failed.append(post)
print(
f"\rPage {FG.b}{pid}{FG._}" +
f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}" +
(f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
end="",
)
except Exception as ex:
print(f"{FG.r}{ex}{FG._}")
print(f"{FG.y}{traceback.format_exc()}{FG._}")
raise ex
for post in lst.posts:
_add_post(block, post)
        # Advance the listing offset for the next iteration
        pid += len(lst.posts)
        # Clean up leaking connection handles
        py34.scraper.scraper.reset()
        gc.collect()
    print()
    # Patch failed block entries by re-fetching each post's view page
    if patch_failed:
        for post in failed:
            print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
            try:
                view = py34.View(post.id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
                print(f" {FG.r}Entry does not exist.{FG._}")
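    # "Failed" entries were returned by a listing but could not be scraped;
    # "missing" IDs below were never returned by any listing page at all.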
    # Find missing block entries
    missing: list[int] = [post_id for post_id, seen in expected.items() if not seen]
    if check_missing:
        found: int = 0
        for n, post_id in enumerate(missing):
            print(
                f"\x1B[2K\rSearching for missing entries"
                f" {FG.y}{n + 1}/{len(missing)}{FG._}"
                f", found {FG.g}{found}{FG._}"
                f", missing {FG.r}{len(missing) - found}{FG._}",
                end="",
            )
            try:
                view = py34.View(post_id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                found += 1
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
        print()
    else:
        for post_id in missing:
            block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)
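    # By now every ID in the job range has an entry in the block: a scraped
    # post, ERROR, MISSING, or MAYBE_MISSING.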
    # Send the block to the server
    print(f"Block: {len(block.entries)}/{block.size}")
    print("Sending block to server...", end="")
    try:
        web_post(server + "block", scraper.block.dumps(block))
        print(f"{FG.g}Ok{FG._}")
    except Exception:
        print(f"{FG.r}Fail{FG._}")
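        # The finished block is dropped on upload failure; presumably the
        # server reissues the job once the deadline passes.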