Compare commits
26 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 3ead328a0f | |
| | f55f118271 | |
| | 1dc17d7670 | |
| | 6d6e13c231 | |
| | a4e3ced8ab | |
| | 0b3293eaa8 | |
| | 5473f92470 | |
| | ee4ba5d5f2 | |
| | 217595fcdd | |
| | be6740a24c | |
| | 71efaf7c25 | |
| | b81684de19 | |
| | f9fe6af292 | |
| | 6d1b1628f1 | |
| | 0143cc3999 | |
| | a39bfcd0b1 | |
| | 826c5c3473 | |
| | 265ec8d58e | |
| | 241bc90e82 | |
| | bdf77b2920 | |
| | 5f4358a3d1 | |
| | 8e3a7b105a | |
| | d0dec584a8 | |
| | fb06339cc7 | |
| | 48d17fcf7b | |
| | c1b8be46aa | |
config.toml (Normal file, 17 lines)
@@ -0,0 +1,17 @@
[scraper]
block-size = 1050 # Max 65536
job-deadline = 3600

[client]
server = "http://localhost:8000/"
network-threads = 12
check-missing = false
patch-failed = true

[server]
block-dir = "blocks"

[server.web]
debug = true
server-name = "localhost:8000"
application-root = "/"
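These settings are read straight from `config.toml` with the standard-library `tomllib` module (the same pattern `scraper/config.py` below uses); nested TOML tables become nested dicts keyed by the hyphenated names above. A minimal sketch of accessing them:

```python
import tomllib

# tomllib only accepts binary file objects
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

# Nested tables map to nested dicts; keys keep their hyphens
block_size = config["scraper"]["block-size"]        # 1050
server_url = config["client"]["server"]             # "http://localhost:8000/"
debug_mode = config["server"]["web"]["debug"]       # True
```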
@@ -1,10 +1,16 @@
beautifulsoup4==4.13.4
blinker==1.9.0
bs4==0.0.2
certifi==2025.7.14
charset-normalizer==3.4.2
click==8.2.1
cloudscraper==1.2.71
decorator==5.2.1
Flask==3.1.1
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
MarkupSafe==3.0.2
pillow==11.3.0
py==1.11.0
pyparsing==3.2.3
@@ -14,3 +20,4 @@ retry==0.9.2
soupsieve==2.7
typing_extensions==4.14.1
urllib3==2.5.0
Werkzeug==3.1.3
scraper/block.py (Normal file, 331 lines)
@@ -0,0 +1,331 @@
from py34 import Post
from py34.url import ImageURL, IMAGE_FORMATS, VIDEO_FORMATS
from py34.scraper import scraper
from typing import IO
from io import BytesIO
from enum import IntEnum
from bisect import insort
import math
import zlib
import time


class BlockEntryType(IntEnum):
    # Status
    ERROR = 0
    MISSING = 1
    MAYBE_MISSING = 2  # Did not appear on the listing, but the post was never checked

    # Image
    JPG = 100
    JPEG = 101
    PNG = 102

    # Animated Image
    GIF = 200

    # Audio

    # Video
    MP4 = 400
    WEBM = 401


def ext2enttype(ext: str) -> BlockEntryType:
    return BlockEntryType[ext.upper()]


def enttype2ext(enttype: BlockEntryType) -> str:
    if enttype.value < 50:
        raise Exception("Entry does not refer to a file")
    return enttype.name.lower()


def _compute_tag_bits(tags: list[str]) -> list[dict[str, int]]:
    # Compute tag bits: one 32-bit rank per 32 tags, mapping tag -> bit mask
    tag_bits = [{} for _ in range(math.ceil(len(tags) / 32))]
    for n in range(len(tags)):
        bit = n % 32
        rank = n // 32
        tag_bits[rank][tags[n]] = 1 << bit
    return tag_bits


def _pad_bytes(size: int, pad_addr: int) -> int:
    addr_bytes = pad_addr // 8
    overflow = size % addr_bytes
    if overflow == 0:
        return 0
    else:
        return addr_bytes - overflow


class Block:
    def __init__(self, start: int, size: int):
        self.start: int = start
        self.size: int = size
        self.entries: list[BlockEntry] = []

    def add_post(self, post: Post, enttype: BlockEntryType | None = None):
        self.add_entry(BlockEntry(post, enttype))

    def add_entry(self, entry: "BlockEntry"):
        # Check if entry belongs to this block
        if not self.belongs(entry.post):
            raise Exception("Given post does not belong to this block")
        # Remove entry with matching post-id, if it exists
        self.entries = list(filter(lambda e: e.post != entry.post, self.entries))
        # Insert the entry
        insort(self.entries, entry)

    def dump(self, fd: IO[bytes]):
        # Sort all entries
        self.entries.sort()

        # Combine all tags
        tags: set[str] = set()
        for entry in self.entries:
            tags |= set(entry.tags)
        tags: list[str] = list(tags)
        tags.sort()

        # Compute tag bits
        tag_bits = _compute_tag_bits(tags)

        # Compute length of tag strings
        tags_len_b = 0
        for tag in tags:
            tags_len_b += len(tag) + 1

        # Compress tag data
        tag_data = zlib.compress((b"\0".join(map(str.encode, tags))) + b"\0", level=9)

        # Magic 4B
        fd.write(b"34BK")

        # Entry amount 2B
        fd.write(len(self.entries).to_bytes(2, "little", signed=False))

        # Size 2B
        fd.write(self.size.to_bytes(2, "little", signed=False))

        # Start 4B
        fd.write(self.start.to_bytes(4, "little", signed=False))

        # Amount of different tags 2B
        fd.write(len(tags).to_bytes(2, "little", signed=False))

        # Size of tag data 2B
        fd.write(len(tag_data).to_bytes(2, "little", signed=False))

        # Write all the tags
        fd.write(tag_data)

        # Pad to the nearest 32-bit address
        for _ in range(_pad_bytes(len(tag_data), 32)):
            fd.write(b"\0")

        # Dump entries
        for entry in self.entries:
            image = bytearray.fromhex(entry.image)

            # Post ID 4B
            fd.write(entry.post.to_bytes(4, "little", signed=False))

            # Entry type enum 2B
            fd.write(entry.type.to_bytes(2, "little", signed=False))

            # Image id length 2B
            fd.write(len(image).to_bytes(2, "little", signed=False))

            # Image dir 4B
            fd.write(entry.dir.to_bytes(4, "little", signed=False))

            # Thumbnail size 4B
            fd.write(len(entry.thumbnail).to_bytes(4, "little", signed=False))

            # Tag bits 4B*ranks
            for rank in tag_bits:
                word = 0
                for tag in entry.tags:
                    if tag in rank:
                        word |= rank[tag]
                fd.write(word.to_bytes(4, "little", signed=False))

            # Image ID
            fd.write(image)

            # Thumbnail
            fd.write(entry.thumbnail)

            # Pad to the nearest 32-bit address
            for _ in range(_pad_bytes(len(image) + len(entry.thumbnail), 32)):
                fd.write(b"\0")

    def dumps(self) -> bytes:
        io = BytesIO()
        self.dump(io)
        return io.getvalue()

    def entry(self, post_id: int) -> "BlockEntry":
        for entry in self.entries:
            if entry.post == post_id:
                return entry
        raise IndexError("Entry not found")

    def belongs(self, post_id: int) -> bool:
        return post_id >= self.start and post_id < self.start + self.size

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "entries": list(map(BlockEntry.to_dict, self.entries)),
        }


class BlockEntry:
    def __init__(self, post: Post, enttype: BlockEntryType | None):
        self.post = post.id
        self.dir = post.image_dir
        self.image = post.image_id
        self.type = enttype
        self.tags = post.tags.copy()
        self.thumbnail = post.get_thumbnail_data()

        if self.type is None:
            for ext in VIDEO_FORMATS + IMAGE_FORMATS:
                image_url = ImageURL(post.image_dir, post.image_id, ext)
                status: int | None = None
                # CDN tends to return 503 : Service Unavailable
                while status != 200 and status != 404:
                    status = scraper.head(image_url, body=False).status_code
                    if status != 200 and status != 404:
                        scraper.reset()
                        time.sleep(1)
                # HEAD could fail, try with GET
                while status != 200 and status != 404:
                    status = scraper.get(image_url, body=False).status_code
                    if status != 200 and status != 404:
                        scraper.reset()
                        time.sleep(1)
                if status == 200:
                    self.type = ext2enttype(ext)
                    break
            if self.type is None:
                self.type = BlockEntryType.ERROR

    def to_dict(self):
        return {
            "post": self.post,
            "dir": self.dir,
            "image": self.image,
            "type": self.type.name,
            "tags": self.tags,
        }

    def __lt__(self, other):
        return self.post < other.post


class BlockHeader:
    def __init__(self, start: int, size: int, entries: int):
        self.start = start
        self.size = size
        self.entries = entries

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "entries": self.entries,
        }


def dump(block: Block, fd: IO[bytes]):
    block.dump(fd)


def dumps(block: Block) -> bytes:
    return block.dumps()


def load_header(fd: IO[bytes]) -> BlockHeader:
    if fd.read(4) != b"34BK":
        raise Exception("Stream is not Block data")

    def read_dword(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(4), "little", signed=False)

    def read_word(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(2), "little", signed=False)

    entries_len = read_word(fd)
    block_size = read_word(fd)
    block_start = read_dword(fd)

    return BlockHeader(block_start, block_size, entries_len)


def load(fd: IO[bytes]) -> Block:
    if fd.read(4) != b"34BK":
        raise Exception("Stream is not Block data")

    def read_dword(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(4), "little", signed=False)

    def read_word(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(2), "little", signed=False)

    # Read header
    entries_len = read_word(fd)
    block_size = read_word(fd)
    block_start = read_dword(fd)
    tags_len = read_word(fd)
    tags_data_len = read_word(fd)

    # Read tags
    tags_data = zlib.decompress(fd.read(tags_data_len))
    tags = list(map(bytes.decode, tags_data.rstrip(b"\0").split(b"\0")))

    # Slurp padding bytes
    fd.read(_pad_bytes(tags_data_len, 32))

    # Compute tag bits
    tag_bits = _compute_tag_bits(tags)

    # Load entries
    block = Block(block_start, block_size)
    for n in range(entries_len):
        # Read header
        post_id = read_dword(fd)
        enttype = BlockEntryType(read_word(fd))
        image_id_len = read_word(fd)
        image_dir = read_dword(fd)
        thumbnail_size = read_dword(fd)

        # Read tags
        post_tags = []
        for rank in tag_bits:
            bits = read_dword(fd)
            for tag in rank:
                if rank[tag] & bits:
                    post_tags.append(tag)

        # Read image id
        image = fd.read(image_id_len)
        image_id = image.hex()

        # Read image thumbnail
        thumbnail = fd.read(thumbnail_size)

        # Slurp padding bytes
        fd.read(_pad_bytes(image_id_len + thumbnail_size, 32))

        block.add_post(Post(post_id, image_dir, image_id, post_tags, thumbnail), enttype)

    return block


def loads(data: bytes) -> Block:
    return load(BytesIO(data))
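The byte layout written by `Block.dump` is, roughly: the `34BK` magic, entry count (2 B), block size (2 B), start id (4 B), tag count (2 B), compressed-tag-data length (2 B), the zlib-compressed tag strings padded to a 4-byte boundary, then one record per entry (post id, type, image-id length, dir, thumbnail size, one 32-bit tag word per rank, image id, thumbnail, padding). A small round-trip sketch, assuming the `py34` dependency is importable; an empty block is used here because constructing a `BlockEntry` triggers network calls:

```python
from io import BytesIO
import scraper.block as block

# Serialize an empty block covering post ids 1..1050
b = block.Block(start=1, size=1050)
data = block.dumps(b)
assert data[:4] == b"34BK"               # magic written by Block.dump

# Parse just the fixed-size header, without decoding tags or entries
header = block.load_header(BytesIO(data))
print(header.to_dict())                  # {'start': 1, 'size': 1050, 'entries': 0}

# A full parse restores an equivalent Block
restored = block.loads(data)
assert (restored.start, restored.size, len(restored.entries)) == (1, 1050, 0)
```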
scraper/client/__main__.py (Normal file, 187 lines)
@@ -0,0 +1,187 @@
from datetime import datetime
from enum import StrEnum
from concurrent.futures import ThreadPoolExecutor
import requests
import tomllib
import traceback
import gc

import py34
import scraper.job


_spinner = 0
def spinner() -> str:
    global _spinner
    _spinner += 1
    return "/-\\|"[_spinner % 4]


class FG(StrEnum):
    _ = "\x1B[0m"
    r = "\x1B[31m"
    g = "\x1B[32m"
    y = "\x1B[33m"
    b = "\x1B[34m"
    p = "\x1B[35m"
    c = "\x1B[36m"


def web_get(url: str) -> bytes:
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    content = res.content
    res.close()
    return content


def web_post(url: str, data: bytes):
    res = requests.post(url, data)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    res.close()


# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

netthd = config["client"]["network-threads"]
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]

if server[-1] != "/":
    server += "/"


# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server: ", server)
print("check missing: ", check_missing)
print("patch failed: ", patch_failed)
print("=========================================================")


# Enter main loop
# while True:
# It leaks, idk how to fix it, restart program for now :p
if True:
    # Fetch a job
    print("Requesting job...", end="")
    job = scraper.job.loads(web_get(server + "job"))

    # Print job stats
    deadline_date = datetime.fromtimestamp(job.deadline)
    print(
        f"\rGot work from {FG.c}{job.start}{FG._}" +
        f" to {FG.c}{job.start + job.size - 1}{FG._}," +
        f" with deadline at {FG.p}{deadline_date}{FG._}"
    )

    # Prepare a block
    block = scraper.block.Block(job.start, job.size)

    # Prepare dictionary for expected posts
    expected: dict[int, bool] = {}
    for n in range(block.start, block.start + block.size):
        expected[n] = False

    # Start scraping
    pid = 0
    end = False
    failed: list[py34.Post] = []
    while not end:
        # Get a list
        # print(f"Downloading listing page {FG.b}{pid}{FG._}")
        lst = py34.List(["id:<" + str(job.start + job.size)], pid)

        # Check if we have reached the end
        for post in lst.posts:
            if post.id <= job.start:
                end = True

        # If so, remove posts that exceed our job quota
        if end:
            lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))

        # Add posts to block, fetching extensions in the process
        def _add_post(block: scraper.block.Block, post: py34.Post):
            try:
                expected[post.id] = True
                entry = scraper.block.BlockEntry(post, None)
                block.add_entry(entry)
                if entry.type == scraper.block.BlockEntryType.ERROR:
                    failed.append(post)
                print(
                    f"\rPage {FG.b}{pid}{FG._}" +
                    f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}" +
                    (f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
                    end="",
                )
            except Exception as ex:
                print(f"{FG.r}{ex}{FG._}")
                print(f"{FG.y}{traceback.format_exc()}{FG._}")
                raise ex
        with ThreadPoolExecutor(max_workers=netthd) as pool:
            for post in lst.posts:
                pool.submit(_add_post, block, post)

        # Increase pid for next iteration
        pid += len(lst.posts)

        # Clean leaking connection handles
        py34.scraper.scraper.reset()
        gc.collect()
    print()

    # Patch failed block entries
    if patch_failed:
        for post in failed:
            print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
            try:
                view = py34.View(post.id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
            except py34.view.ViewMissingException as ex:
                block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
                print(f" {FG.r}Entry does not exist.{FG._}")

    # Find missing block entries
    missing: list[int] = []
    for post_id in expected:
        if expected[post_id]:
            continue
        missing.append(post_id)

    if check_missing:
        found: int = 0
        for n, post_id in zip(range(len(missing)), missing):
            print(
                f"\x1B[2K\rSearching for missing entries" +
                f" {FG.y}{n+1}/{len(missing)}{FG._}" +
                f", found {FG.g}{found}{FG._}" +
                f", missing {FG.r}{len(missing)-found}{FG._}",
                end="",
            )
            try:
                view = py34.View(post_id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                found += 1
            except py34.view.ViewMissingException as ex:
                block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
        print()
    else:
        for post_id in missing:
            block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)

    # Send block to server
    print(f"Block: {len(block.entries)}/{block.size}")
    print("Sending block to server...", end="")
    try:
        web_post(server + "block", scraper.block.dumps(block))
        print(f"{FG.g}Ok{FG._}")
    except Exception as ex:
        print(f"{FG.r}Fail{FG._}")
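Stripped of the scraping loop, the client's exchange with the server is just two HTTP calls: `GET /job` returns the JSON produced by `Job.to_dict()`, and `POST /block` uploads the serialized block bytes. A bare-bones sketch of that round trip, assuming a server is running at the default `http://localhost:8000/` and that a `config.toml` is present (importing `scraper.job` loads it):

```python
import requests
import scraper.block
import scraper.job

SERVER = "http://localhost:8000/"  # default from config.toml

# Fetch a work unit; the body is Job.to_dict() serialized as JSON
job = scraper.job.loads(requests.get(SERVER + "job").content)
print(job.start, job.size, job.deadline)

# Build a block for that range (empty here; the real client fills it by scraping)
blk = scraper.block.Block(job.start, job.size)

# Upload the binary block; the server merges it into its block files
requests.post(SERVER + "block", data=scraper.block.dumps(blk))
```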
scraper/config.py (Normal file, 5 lines)
@@ -0,0 +1,5 @@
import tomllib


with open("config.toml", "rb") as file:
    config = tomllib.load(file)
scraper/job.py (Normal file, 38 lines)
@@ -0,0 +1,38 @@
from .config import config
from .block import Block
from typing import IO
from datetime import datetime, timedelta
import json


class Job:
    def __init__(self, start, size, deadline=None):
        self.start = start
        self.size = size
        if deadline is None:
            self.deadline = (datetime.now() + timedelta(seconds=config["scraper"]["job-deadline"])).timestamp()
        else:
            self.deadline = deadline

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "deadline": self.deadline,
        }


def dump(job: Job, fd: IO[str | bytes]):
    json.dump(job.to_dict(), fd)


def dumps(job: Job) -> str:
    return json.dumps(job.to_dict())


def load(fd: IO[str | bytes]) -> Job:
    data = json.load(fd)
    return Job(data["start"], data["size"], data["deadline"])


def loads(data: str | bytes) -> Job:
    data = json.loads(data)
    return Job(data["start"], data["size"], data["deadline"])
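A `Job` is only a start/size pair plus an absolute Unix-timestamp deadline, and `dumps`/`loads` round-trip it through JSON. A quick sketch (importing `scraper.job` pulls in `scraper.config`, so a `config.toml` must be present):

```python
import scraper.job

job = scraper.job.Job(start=1, size=1050)   # deadline defaults to now + [scraper].job-deadline seconds
payload = scraper.job.dumps(job)            # e.g. '{"start": 1, "size": 1050, "deadline": ...}'
restored = scraper.job.loads(payload)
assert (restored.start, restored.size, restored.deadline) == (job.start, job.size, job.deadline)
```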
scraper/server/__main__.py (Normal file, 134 lines)
@@ -0,0 +1,134 @@
from scraper.config import config
from flask import Flask, Response, request, render_template, url_for, send_file
from pathlib import Path
import py34

from .block import BLOCK_SIZE, list_blocks, load_blocks, load_block, load_block_stats, save_block, loads as parse_block, enttype2ext
from .job import assign_job, working_on, any_job, jobs


# Create Flask application
app = Flask(__name__)


# Setup application configuration
for key in config["server"]["web"]:
    app.config[key.replace("-", "_").upper()] = config["server"]["web"][key]


@app.route("/")
def index():
    return render_template("index.j2")


@app.get("/job")
def get_job():
    blocks = list_blocks()
    blocks.sort(key=lambda b: b.start)
    for n, block in zip(range(len(blocks)), blocks):
        next_block = blocks[n+1] if len(blocks) != n+1 else None

        # Try to fill up hollow blocks
        if not block.full and not working_on(block.start):
            return assign_job(block.start, block.size).to_dict()

        # Try to append a new block
        virt_start = block.start + block.size
        virt_size = BLOCK_SIZE - virt_start % BLOCK_SIZE + 1
        if next_block is None:
            # We are the last block
            while True:
                if not working_on(virt_start):
                    return assign_job(virt_start, virt_size).to_dict()
                virt_start += virt_size
                virt_size = BLOCK_SIZE
        else:
            # We are not the last block
            while virt_start < next_block.start:
                if not working_on(virt_start):
                    return assign_job(virt_start, virt_size).to_dict()
                virt_start += virt_size
                virt_size = BLOCK_SIZE

    assert len(blocks) == 0  # We should not be here if blocks exist

    if not any_job():
        return assign_job(1, BLOCK_SIZE).to_dict()
    return assign_job(
        max(jobs(), key=lambda j: j.start).start + BLOCK_SIZE,
        BLOCK_SIZE,
    ).to_dict()


@app.get("/blocks")
def get_block_ids():
    return list(map(lambda l: l.to_dict(), list_blocks()))


@app.get("/block_stats")
def get_block_stats():
    return list(map(lambda h: h.to_dict(), load_block_stats()))


@app.get("/rawblock/<post_id>")
def get_rawblock(post_id: int = None):
    assert post_id is not None
    return Response(
        mimetype="application/octet-stream",
        response=load_block(int(post_id)).dumps(),
    )


@app.get("/block/<post_id>")
def get_block(post_id: int = None):
    assert post_id is not None
    return load_block(int(post_id)).to_dict()


@app.post("/block")
def put_block():
    save_block(parse_block(request.data))
    return "ok"


@app.get("/thumbnail/<post_id>")
def get_thumbnail(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    return Response(
        response=load_block(post_id).entry(post_id).thumbnail,
        content_type="image/jpeg",
    )


@app.get("/sample/<post_id>")
def get_sample(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    entry = load_block(post_id).entry(post_id)
    return Response(
        status=307,
        headers={
            "Location": str(py34.url.SampleURL(entry.dir, entry.image))
        }
    )


@app.get("/image/<post_id>")
def get_image(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    entry = load_block(post_id).entry(post_id)
    if entry.type < 100:
        return Response(
            "Image not found",
            status=404,
        )

    path = Path.cwd() / Path(f"image/{entry.dir}/{entry.image}.{enttype2ext(entry.type)}")
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        return send_file(path)
    else:
        image = py34.scraper.scraper.get(py34.url.ImageURL(entry.dir, entry.image, enttype2ext(entry.type)))
        with open(path, "wb") as file:
            file.write(image)
        return send_file(path)


# Run application
if __name__ == "__main__":
    app.run()
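One detail worth calling out: the startup loop above copies every `[server.web]` key into Flask's config after converting hyphens to underscores and upper-casing, so the TOML keys land on Flask's standard config names. With the `config.toml` in this diff that produces:

```python
# The same transformation app.config receives at startup
web = {"debug": True, "server-name": "localhost:8000", "application-root": "/"}
flask_config = {key.replace("-", "_").upper(): value for key, value in web.items()}
print(flask_config)  # {'DEBUG': True, 'SERVER_NAME': 'localhost:8000', 'APPLICATION_ROOT': '/'}
```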
scraper/server/block.py (Normal file, 123 lines)
@@ -0,0 +1,123 @@
from scraper.block import *
from scraper.config import config
from pathlib import Path


BLOCK_SIZE = config["scraper"]["block-size"]


class BlockListing:
    def __init__(self, start: int, size: int, full: bool):
        self.start = int(start)
        self.size = int(size)
        self.full = bool(full)

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "full": self.full,
        }


def _block_dir() -> Path:
    return Path(config["server"]["block-dir"])


def _ensure_block_dir() -> Path:
    path = _block_dir()
    if path.exists():
        return path
    path.mkdir(parents=True, exist_ok=True)
    return path


def list_blocks() -> list[BlockListing]:
    path = _ensure_block_dir()
    parts = map(lambda p: p.name.split("-"), path.glob("*"))
    return sorted(list(map(lambda p: BlockListing(*tuple(map(int, p))), parts)), key=lambda bl: bl.start)


def load_blocks() -> list[Block]:
    path = _ensure_block_dir()
    blocks = []
    for block_path in path.glob("*"):
        with open(block_path, "rb") as file:
            blocks.append(load(file))
    return blocks


def load_block_stats() -> list[BlockHeader]:
    path = _ensure_block_dir()
    headers = []
    for block in path.glob("*"):
        with open(block, "rb") as file:
            headers.append(load_header(file))
    return headers


def load_block(entry_id: int) -> Block:
    path = _ensure_block_dir()
    low_block: int | None = None
    low_block_size = 0
    high_block: int | None = None
    high_block_size = 0

    # Try to find an existing block file
    for file, start, size, full in map(lambda f: [f] + list(map(int, f.name.split("-"))), path.glob("*")):

        # Find closest non-matching lower block
        if entry_id >= start and (low_block is None or start >= low_block):
            low_block = start
            low_block_size = size

        # Find closest non-matching higher block
        if entry_id <= start and (high_block is None or start <= high_block):
            high_block = start
            high_block_size = size

        # Find matching block
        if entry_id >= start and entry_id < start + size:
            # Found it, return
            with open(file, "rb") as file:
                return load(file)
    # Failed...

    # Determine the range of a new block file
    block_size = BLOCK_SIZE
    virtual_block = entry_id // block_size * block_size + 1
    virtual_block_end = virtual_block + block_size  # Not inclusive

    # Clamp around lower block
    if low_block is not None:
        virtual_block = max(virtual_block, low_block + low_block_size)

    # Clamp around higher block
    if high_block is not None:
        virtual_block_end = min(virtual_block_end, high_block)

    # Create a new block
    return Block(virtual_block, virtual_block_end - virtual_block)


def save_block(new_block: Block):
    # Don't bother saving empty blocks
    if len(new_block.entries) == 0:
        return

    def write_block(b: Block):
        path = _ensure_block_dir()
        with open(path / f"{b.start}-{b.size}-{int(len(b.entries) == b.size)}", "wb") as file:
            dump(b, file)
        badfile = path / f"{b.start}-{b.size}-{int(len(b.entries) != b.size)}"
        if badfile.exists():
            badfile.unlink()

    block = load_block(new_block.entries[0].post)
    for entry in new_block.entries:
        if block.start + block.size <= entry.post:
            write_block(block)
            block = load_block(entry.post)
        block.add_entry(entry)
    write_block(block)
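Block files on disk are named `<start>-<size>-<full>`, which is why `list_blocks` and `load_block` can recover everything they need from the filename alone, without opening the file. A small sketch of that encoding (the filename shown is hypothetical):

```python
# Encode: how write_block names a partially filled block starting at post 1051
start, size, entries = 1051, 1050, 400
name = f"{start}-{size}-{int(entries == size)}"
print(name)                       # "1051-1050-0"

# Decode: how list_blocks turns the name back into a listing
start, size, full = map(int, name.split("-"))
print(start, size, bool(full))    # 1051 1050 False
```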
scraper/server/job.py (Normal file, 44 lines)
@@ -0,0 +1,44 @@
from scraper.job import *
from datetime import datetime


_jobs: list[Job] = []


def _clean_jobs():
    global _jobs
    _jobs = list(filter(lambda j: datetime.fromtimestamp(j.deadline) > datetime.now(), _jobs))


def working_on(block_start: int) -> bool:
    global _jobs

    _clean_jobs()

    for job in _jobs:
        if job.start <= block_start and job.start + job.size > block_start:
            return True
    return False


def assign_job(block_start: int, block_size: int) -> Job:
    global _jobs

    if working_on(block_start):
        raise Exception("Job already assigned")
    job = Job(block_start, block_size)
    _jobs.append(job)
    return job


def jobs() -> Job:
    global _jobs
    _clean_jobs()
    for job in _jobs:
        yield job


def any_job() -> bool:
    global _jobs
    _clean_jobs()
    return bool(len(_jobs))
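Job tracking is purely in-memory: `assign_job` appends to a module-level list, `working_on` checks whether a post id falls inside any live job's range, and `_clean_jobs` drops jobs whose deadline has passed. A usage sketch, assuming the `scraper.server` package imports resolve and a `config.toml` is present:

```python
from scraper.server import job as jobs

j = jobs.assign_job(1, 1050)      # register a job covering posts 1..1050
assert jobs.working_on(1)         # start of the range
assert jobs.working_on(1050)      # last id still inside [start, start + size)
assert not jobs.working_on(1051)  # first id past the range
assert jobs.any_job()
```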
scraper/server/static/d3.min.js (vendored, Normal file, 2 lines)
File diff suppressed because one or more lines are too long
scraper/server/static/sus.png (Normal file, binary)
Binary file not shown.
After: 18 KiB
scraper/server/templates/index.j2 (Normal file, 11 lines)
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
    <head>
        <title>Rule34 Scraper</title>
        <script src={{url_for("static", filename="d3.min.js")}}></script>
    </head>
    <body>
        <h1>Hello, World!</h1>
        <img src={{url_for("static", filename="sus.png")}}>
    </body>
</html>