from datetime import datetime
from enum import StrEnum
from concurrent.futures import ThreadPoolExecutor
import requests
import tomllib
import traceback
import gc

import py34
import scraper.block  # Block, BlockEntry, BlockEntryType, ext2enttype and dumps are used below
import scraper.job


# Tiny console spinner: each call returns the next frame of "/-\|".
_spinner = 0


def spinner() -> str:
    global _spinner
    _spinner += 1
    return "/-\\|"[_spinner % 4]


# ANSI foreground colour escape codes; FG._ resets the colour.
class FG(StrEnum):
    _ = "\x1B[0m"
    r = "\x1B[31m"
    g = "\x1B[32m"
    y = "\x1B[33m"
    b = "\x1B[34m"
    p = "\x1B[35m"
    c = "\x1B[36m"


def web_get(url: str) -> bytes:
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    content = res.content
    res.close()
    return content


def web_post(url: str, data: bytes) -> None:
    res = requests.post(url, data)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    res.close()

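
# As used below, the work server exposes two endpoints:
#   GET  {server}job    -> a serialized job, parsed with scraper.job.loads()
#   POST {server}block  -> upload of the finished block from scraper.block.dumps()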

# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

netthd = config["client"]["network-threads"]
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]

if server[-1] != "/":
    server += "/"
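
# For reference, config.toml is expected to look roughly like this;
# the keys are the ones read above, the values are only example guesses:
#
#   [client]
#   network-threads = 8
#   server = "http://localhost:8080/"
#   check-missing = true
#   patch-failed = true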


# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server:         ", server)
print("check missing:  ", check_missing)
print("patch failed:   ", patch_failed)
print("=========================================================")


# Enter main loop
# while True:
# It leaks, idk how to fix it, restart program for now :p
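# Until the leak is tracked down, one run handles a single job; for continuous
# scraping, restart the script from an outer shell loop, e.g.
# `while true; do python3 client.py; done` (the file name is assumed here).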
if True:
    # Fetch a job
    print("Requesting job...", end="")
    job = scraper.job.loads(web_get(server + "job"))

    # Print job stats
    deadline_date = datetime.fromtimestamp(job.deadline)
    print(
        f"\rGot work from {FG.c}{job.start}{FG._}" +
        f" to {FG.c}{job.start + job.size - 1}{FG._}," +
        f" with deadline at {FG.p}{deadline_date}{FG._}"
    )

    # Prepare a block
    block = scraper.block.Block(job.start, job.size)

    # Prepare a dictionary of expected post IDs (post id -> seen yet?)
    expected: dict[int, bool] = {}
    for n in range(block.start, block.start + block.size):
        expected[n] = False

    # Start scraping
    pid = 0
    end = False
    failed: list[py34.Post] = []
    while not end:
        # Get a list
        # print(f"Downloading listing page {FG.b}{pid}{FG._}")
        lst = py34.List(["id:<" + str(job.start + job.size)], pid)

        # Check if we have reached the end of the job range
        for post in lst.posts:
            if post.id <= job.start:
                end = True

        # If so, drop the posts that fall outside (below) the job range
        if end:
            lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))

        # Add posts to block, fetching extensions in the process
        def _add_post(block: scraper.block.Block, post: py34.Post):
            try:
                expected[post.id] = True
                entry = scraper.block.BlockEntry(post, None)
                block.add_entry(entry)
                if entry.type == scraper.block.BlockEntryType.ERROR:
                    failed.append(post)
                print(
                    f"\rPage {FG.b}{pid}{FG._}" +
                    f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}" +
                    (f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
                    end="",
                )
            except Exception as ex:
                print(f"{FG.r}{ex}{FG._}")
                print(f"{FG.y}{traceback.format_exc()}{FG._}")
                raise

        with ThreadPoolExecutor(max_workers=netthd) as pool:
            for post in lst.posts:
                pool.submit(_add_post, block, post)

        # Increase pid for next iteration
        pid += len(lst.posts)

        # Clean leaking connection handles
        py34.scraper.scraper.reset()
        gc.collect()
    print()

    # Patch failed block entries
    if patch_failed:
        for post in failed:
            print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
            try:
                view = py34.View(post.id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
                print(f" {FG.r}Entry does not exist.{FG._}")

    # Find missing block entries
    missing: list[int] = []
    for post_id in expected:
        if expected[post_id]:
            continue
        missing.append(post_id)

    if check_missing:
        found: int = 0
        for n, post_id in enumerate(missing):
            print(
                f"\x1B[2K\rSearching for missing entries" +
                f" {FG.y}{n+1}/{len(missing)}{FG._}" +
                f", found {FG.g}{found}{FG._}" +
                f", missing {FG.r}{len(missing)-found}{FG._}",
                end="",
            )
            try:
                view = py34.View(post_id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                found += 1
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
        print()
    else:
        for post_id in missing:
            block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)

    # Send block to server
    print(f"Block: {len(block.entries)}/{block.size}")
    print("Sending block to server...", end="")
    try:
        web_post(server + "block", scraper.block.dumps(block))
        print(f"{FG.g}Ok{FG._}")
    except Exception as ex:
        print(f"{FG.r}Fail{FG._} {ex}")