26 Commits

SHA1 Message Date
3ead328a0f Merge branch 'master' into scraper 2025-10-09 20:37:55 +02:00
f55f118271 Merge branch 'master' into scraper 2025-10-09 20:07:02 +02:00
1dc17d7670 Don't create py34.Post for downloading images
py34.Post performs an extension scan, which is not needed here because the file
type is already saved inside the block data.
2025-10-09 20:05:36 +02:00
6d6e13c231 Merge branch 'master' into scraper 2025-10-09 19:25:12 +02:00
a4e3ced8ab Download videos too, prioritize video over image 2025-10-09 19:18:26 +02:00
0b3293eaa8 Merge branch 'master' into scraper 2025-10-09 19:17:26 +02:00
5473f92470 Merge branch 'master' into scraper 2025-10-03 00:09:06 +02:00
ee4ba5d5f2 Create file after the image was retrieved
`post.get_image_data()` can throw; creating the file first and calling the function afterwards can leave empty "image" files behind, confusing the program.
2025-10-02 23:40:33 +02:00
217595fcdd Download and cache images 2025-09-23 12:44:36 +02:00
be6740a24c Added api route for getting raw block 2025-09-23 02:24:57 +02:00
71efaf7c25 Merge branch 'master' into scraper 2025-08-27 19:22:17 +02:00
b81684de19 Merge branch 'master' into scraper 2025-08-27 17:47:50 +02:00
f9fe6af292 Merge branch 'master' into scraper 2025-08-26 18:20:06 +02:00
6d1b1628f1 Merge branch 'master' into scraper 2025-08-26 18:15:14 +02:00
0143cc3999 Merge branch 'master' into scraper 2025-08-26 18:02:07 +02:00
a39bfcd0b1 Merge branch 'master' into scraper 2025-08-15 19:32:33 +02:00
826c5c3473 Merge branch 'master' into scraper 2025-08-15 19:31:40 +02:00
265ec8d58e Merge branch 'master' into scraper 2025-08-15 19:30:42 +02:00
241bc90e82 Merge branch 'master' into scraper 2025-08-15 19:28:08 +02:00
bdf77b2920 Merge remote-tracking branch 'refs/remotes/origin/scraper' into scraper 2025-08-15 18:48:23 +02:00
5f4358a3d1 Probe with HEAD requests, not GET 2025-08-15 18:47:58 +02:00
8e3a7b105a Fixed invalid extension check 2025-08-10 15:50:40 +02:00
d0dec584a8 Removed debug print 2025-08-10 15:49:03 +02:00
fb06339cc7 Added flask 2025-08-10 15:26:14 +02:00
48d17fcf7b Merge branch 'master' into scraper 2025-08-09 17:23:49 +02:00
c1b8be46aa Initial scraper commit 2025-08-09 17:15:20 +02:00
12 changed files with 899 additions and 0 deletions

config.toml Normal file
@@ -0,0 +1,17 @@
[scraper]
block-size = 1050 # Max 65536
job-deadline = 3600

[client]
server = "http://localhost:8000/"
network-threads = 12
check-missing = false
patch-failed = true

[server]
block-dir = "blocks"

[server.web]
debug = true
server-name = "localhost:8000"
application-root = "/"

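For reference, both halves of the scraper read this file at startup: the server consumes the [scraper] and [server] tables, the client the [client] table. A minimal sketch of loading it the same way scraper/config.py and the client below do (tomllib needs Python 3.11+):

import tomllib

with open("config.toml", "rb") as file:
    config = tomllib.load(file)

server = config["client"]["server"]
if not server.endswith("/"):
    server += "/"  # the client appends routes such as "job" and "block" to this base URL
print(config["client"]["network-threads"], "worker threads against", server)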
@@ -1,10 +1,16 @@
beautifulsoup4==4.13.4
blinker==1.9.0
bs4==0.0.2
certifi==2025.7.14
charset-normalizer==3.4.2
click==8.2.1
cloudscraper==1.2.71
decorator==5.2.1
Flask==3.1.1
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
MarkupSafe==3.0.2
pillow==11.3.0
py==1.11.0
pyparsing==3.2.3
@@ -14,3 +20,4 @@ retry==0.9.2
soupsieve==2.7
typing_extensions==4.14.1
urllib3==2.5.0
Werkzeug==3.1.3

scraper/block.py Normal file
@@ -0,0 +1,331 @@
from py34 import Post
from py34.url import ImageURL, IMAGE_FORMATS, VIDEO_FORMATS
from py34.scraper import scraper
from typing import IO
from io import BytesIO
from enum import IntEnum
from bisect import insort
import math
import zlib
import time
class BlockEntryType(IntEnum):
# Status
ERROR = 0
MISSING = 1
MAYBE_MISSING = 2 # Did not appear on the listing, but the post was never checked
# Image
JPG = 100
JPEG = 101
PNG = 102
# Animated Image
GIF = 200
# Audio
# Video
MP4 = 400
WEBM = 401
def ext2enttype(ext: str) -> BlockEntryType:
return BlockEntryType[ext.upper()]
def enttype2ext(enttype: BlockEntryType) -> str:
if enttype.value < 50:
raise Exception("Entry does not refer to a file")
return enttype.name.lower()
def _compute_tag_bits(tags: list[str]) -> list[dict[str, int]]:
# Compute tag bits
tag_bits = [{} for _ in range(math.ceil(len(tags) / 32))]
for n in range(len(tags)):
bit = n % 32
rank = n // 32
tag_bits[rank][tags[n]] = 1 << bit
return tag_bits
def _pad_bytes(size: int, pad_addr: int) -> int:
addr_bytes = pad_addr // 8
overflow = size % addr_bytes
if overflow == 0:
return 0
else:
return addr_bytes - overflow
class Block:
def __init__(self, start: int, size: int):
self.start: int = start
self.size: int = size
self.entries: list[BlockEntry] = []
def add_post(self, post: Post, enttype: BlockEntryType | None = None):
self.add_entry(BlockEntry(post, enttype))
def add_entry(self, entry: "BlockEntry"):
# Check if entry belongs to this block
if not self.belongs(entry.post):
raise Exception("Given post does not belong to this block")
# Remove entry with matching post-id, if exists
self.entries = list(filter(lambda e: e.post != entry.post, self.entries))
# insert the entry
insort(self.entries, entry)
def dump(self, fd: IO[bytes]):
# Sort all entries
self.entries.sort()
# Combine all tags
tags: set[str] = set()
for entry in self.entries:
tags |= set(entry.tags)
tags: list[str] = list(tags)
tags.sort()
# Compute tag bits
tag_bits = _compute_tag_bits(tags)
# Compute length of tag strings
tags_len_b = 0
for tag in tags:
tags_len_b += len(tag) + 1
# Compress tag data
tag_data = zlib.compress((b"\0".join(map(str.encode, tags)))+b"\0", level=9)
# Magic 4B
fd.write(b"34BK")
# Entry amount 2B
fd.write(len(self.entries).to_bytes(2, "little", signed=False))
# Size 2B
fd.write(self.size.to_bytes(2, "little", signed=False))
# Start 4B
fd.write(self.start.to_bytes(4, "little", signed=False))
# Amount of different tags 2B
fd.write(len(tags).to_bytes(2, "little", signed=False))
# Size of tag data 2B
fd.write(len(tag_data).to_bytes(2, "little", signed=False))
# Write all the tags
fd.write(tag_data)
# Pad to the nearest 32bit address
for _ in range(_pad_bytes(len(tag_data), 32)):
fd.write(b"\0")
# Dump entries
for entry in self.entries:
image = bytearray.fromhex(entry.image)
# Post ID 4B
fd.write(entry.post.to_bytes(4, "little", signed=False))
# Entry type enum 2B
fd.write(entry.type.to_bytes(2, "little", signed=False))
# Image id length 2B
fd.write(len(image).to_bytes(2, "little", signed=False))
# Image dir 4B
fd.write(entry.dir.to_bytes(4, "little", signed=False))
# Thumbnail size 4B
fd.write(len(entry.thumbnail).to_bytes(4, "little", signed=False))
# Tag bits 4B*ranks
for rank in tag_bits:
word = 0
for tag in entry.tags:
if tag in rank:
word |= rank[tag]
fd.write(word.to_bytes(4, "little", signed=False))
# Image ID
fd.write(image)
# Thumbnail
fd.write(entry.thumbnail)
# Pad to the nearest 32bit address
for _ in range(_pad_bytes(len(image) + len(entry.thumbnail), 32)):
fd.write(b"\0")
def dumps(self) -> bytes:
io = BytesIO()
self.dump(io)
return io.getvalue()
def entry(self, post_id: int) -> "BlockEntry":
for entry in self.entries:
if entry.post == post_id:
return entry
raise IndexError("Entry not found")
def belongs(self, post_id: int) -> bool:
return post_id >= self.start and post_id < self.start+self.size
def to_dict(self):
return {
"start": self.start,
"size": self.size,
"entries": list(map(BlockEntry.to_dict, self.entries)),
}
class BlockEntry:
def __init__(self, post: Post, enttype: BlockEntryType | None):
self.post = post.id
self.dir = post.image_dir
self.image = post.image_id
self.type = enttype
self.tags = post.tags.copy()
self.thumbnail = post.get_thumbnail_data()
if self.type is None:
for ext in VIDEO_FORMATS + IMAGE_FORMATS:
image_url = ImageURL(post.image_dir, post.image_id, ext)
status: int | None = None
# CDN tends to return 503 : Service Unavailable
while status != 200 and status != 404:
status = scraper.head(image_url, body=False).status_code
if status != 200 and status != 404:
scraper.reset()
time.sleep(1)
# HEAD could fail, try with GET
while status != 200 and status != 404:
status = scraper.get(image_url, body=False).status_code
if status != 200 and status != 404:
scraper.reset()
time.sleep(1)
if status == 200:
self.type = ext2enttype(ext)
break
if self.type is None:
self.type = BlockEntryType.ERROR
def to_dict(self):
return {
"post": self.post,
"dir": self.dir,
"image": self.image,
"type": self.type.name,
"tags": self.tags,
}
def __lt__(self, other):
return self.post < other.post
class BlockHeader:
def __init__(self, start: int, size: int, entries: int):
self.start = start
self.size = size
self.entries = entries
def to_dict(self):
return {
"start": self.start,
"size": self.size,
"entries": self.entries,
}
def dump(block: Block, fd: IO[bytes]):
block.dump(fd)
def dumps(block: Block) -> bytes:
return block.dumps()
def load_header(fd: IO[bytes]) -> BlockHeader:
if fd.read(4) != b"34BK":
raise Exception("Stream is not Block data")
def read_dword(fd: IO[bytes]) -> int:
return int.from_bytes(fd.read(4), "little", signed=False)
def read_word(fd: IO[bytes]) -> int:
return int.from_bytes(fd.read(2), "little", signed=False)
entries_len = read_word(fd)
block_size = read_word(fd)
block_start = read_dword(fd)
return BlockHeader(block_start, block_size, entries_len)
def load(fd: IO[bytes]) -> Block:
if fd.read(4) != b"34BK":
raise Exception("Stream is not Block data")
def read_dword(fd: IO[bytes]) -> int:
return int.from_bytes(fd.read(4), "little", signed=False)
def read_word(fd: IO[bytes]) -> int:
return int.from_bytes(fd.read(2), "little", signed=False)
# Read header
entries_len = read_word(fd)
block_size = read_word(fd)
block_start = read_dword(fd)
tags_len = read_word(fd)
tags_data_len = read_word(fd)
# Read tags
tags_data = zlib.decompress(fd.read(tags_data_len))
tags = list(map(bytes.decode, tags_data.rstrip(b"\0").split(b"\0")))
# Slurp padding bytes
fd.read(_pad_bytes(tags_data_len, 32))
# Compute tag bits
tag_bits = _compute_tag_bits(tags)
# Load entries
block = Block(block_start, block_size)
for n in range(entries_len):
# Read header
post_id = read_dword(fd)
enttype = BlockEntryType(read_word(fd))
image_id_len = read_word(fd)
image_dir = read_dword(fd)
thumbnail_size = read_dword(fd)
# Read tags
post_tags = []
for rank in tag_bits:
bits = read_dword(fd)
for tag in rank:
if rank[tag] & bits:
post_tags.append(tag)
# Read image id
image = fd.read(image_id_len)
image_id = image.hex()
# Read image thumbnail
thumbnail = fd.read(thumbnail_size)
# Slurp padding bytes
fd.read(_pad_bytes(image_id_len + thumbnail_size, 32))
block.add_post(Post(post_id, image_dir, image_id, post_tags, thumbnail), enttype)
return block
def loads(data: bytes) -> Block:
return load(BytesIO(data))

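Summarising the on-disk format implemented by Block.dump()/load() above: a little-endian header (magic "34BK" 4 B, entry count 2 B, block size 2 B, block start 4 B, tag count 2 B, compressed tag-data length 2 B), then the zlib-compressed NUL-separated tag list padded to a 32-bit boundary, then one record per entry (post id 4 B, type 2 B, image-id length 2 B, image dir 4 B, thumbnail size 4 B, one 4 B tag-bitmask word per rank, image id, thumbnail, padding). A minimal sketch of peeking at just the header of a dumped block without going through load(); names and the dict shape are illustrative only:

import struct
import zlib

def peek_block_header(data: bytes) -> dict:
    # The fixed 16-byte prefix written by Block.dump(), all little-endian.
    magic, entries, size, start, tag_count, tag_data_len = struct.unpack_from("<4sHHIHH", data)
    if magic != b"34BK":
        raise ValueError("stream is not Block data")
    # The compressed tag list follows the fixed prefix directly.
    tags = zlib.decompress(data[16:16 + tag_data_len]).rstrip(b"\0").split(b"\0")
    return {
        "start": start,
        "size": size,
        "entries": entries,
        "tags": [t.decode() for t in tags][:tag_count],
    }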
scraper/client/__main__.py Normal file
@@ -0,0 +1,187 @@
from datetime import datetime
from enum import StrEnum
from concurrent.futures import ThreadPoolExecutor
import requests
import tomllib
import traceback
import gc
import py34
import scraper.block
import scraper.job
_spinner = 0
def spinner() -> str:
global _spinner
_spinner += 1
return "/-\\|"[_spinner % 4]
class FG(StrEnum):
_ = "\x1B[0m"
r = "\x1B[31m"
g = "\x1B[32m"
y = "\x1B[33m"
b = "\x1B[34m"
p = "\x1B[35m"
c = "\x1B[36m"
def web_get(url: str) -> bytes:
res = requests.get(url)
if res.status_code != 200:
raise Exception("Failed to request "+url)
content = res.content
res.close()
return content
def web_post(url: str, data: bytes):
res = requests.post(url, data)
if res.status_code != 200:
raise Exception("Failed to request "+url)
res.close()
# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
config = tomllib.load(file)
netthd = config["client"]["network-threads"]
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]
if server[-1] != "/":
server += "/"
# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server: ", server)
print("check missing: ", check_missing)
print("patch failed: ", patch_failed)
print("=========================================================")
# Enter main loop
# while True:
# It leaks, idk how to fix it, restart program for now :p
if True:
# Fetch a job
print("Requesting job...", end="")
job = scraper.job.loads(web_get(server + "job"))
# Print job stats
deadline_date = datetime.fromtimestamp(job.deadline)
print(
f"\rGot work from {FG.c}{job.start}{FG._}" +
f" to {FG.c}{job.start + job.size - 1}{FG._}," +
f" with deadline at {FG.p}{deadline_date}{FG._}"
)
# Prepare a block
block = scraper.block.Block(job.start, job.size)
# Prepare dictionary for expected blocks
expected: dict[int, bool] = {}
for n in range(block.start, block.start + block.size):
expected[n] = False
# Start scraping
pid = 0
end = False
failed: list[py34.Post] = []
while not end:
# Get a list
# print(f"Downloading listing page {FG.b}{pid}{FG._}")
lst = py34.List(["id:<"+str(job.start + job.size)], pid)
# Check if we have reached the end
for post in lst.posts:
if post.id <= job.start:
end = True
# If so, remove posts that exceed our job quota
if end:
lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))
# Add posts to block, fetching extensions in the process
def _add_post(block: scraper.block.Block, post: py34.Post):
try:
expected[post.id] = True
entry = scraper.block.BlockEntry(post, None)
block.add_entry(entry)
if entry.type == scraper.block.BlockEntryType.ERROR:
failed.append(post)
print(
f"\rPage {FG.b}{pid}{FG._}" +
f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}" +
(f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
end="",
)
except Exception as ex:
print(f"{FG.r}{ex}{FG._}")
print(f"{FG.y}{traceback.format_exc()}{FG._}")
raise ex
with ThreadPoolExecutor(max_workers=netthd) as pool:
for post in lst.posts:
pool.submit(_add_post, block, post)
# Increase pid for next iteration
pid += len(lst.posts)
# Clean leaking connection handles
py34.scraper.scraper.reset()
gc.collect()
print()
# Patch failed block entries
if patch_failed:
for post in failed:
print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
try:
view = py34.View(post.id)
block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
except py34.view.ViewMissingException as ex:
block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
print(f" {FG.r}Entry does not exist.{FG._}")
# Find missing block entries
missing: list[int] = []
for post_id in expected:
if expected[post_id]:
continue
missing.append(post_id)
if check_missing:
found: int = 0
for n, post_id in enumerate(missing):
print(
f"\x1B[2K\rSearching for missing entries" +
f" {FG.y}{n+1}/{len(missing)}{FG._}" +
f", found {FG.g}{found}{FG._}" +
f", missing {FG.r}{len(missing)-found}{FG._}",
end="",
)
try:
view = py34.View(post_id)
block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
found += 1
except py34.view.ViewMissingException as ex:
block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
print()
else:
for post_id in missing:
block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)
# Sending block to server
print(f"Block: {len(block.entries)}/{block.size}")
print("Sending block to server...", end="")
try:
web_post(server + "block", scraper.block.dumps(block))
print(f"{FG.g}Ok{FG._}")
except Exception as ex:
print(f"{FG.r}Fail{FG._}")

scraper/config.py Normal file
@@ -0,0 +1,5 @@
import tomllib
with open("config.toml", "rb") as file:
config = tomllib.load(file)

scraper/job.py Normal file
@@ -0,0 +1,38 @@
from .config import config
from .block import Block
from typing import IO
from datetime import datetime, timedelta
import json
class Job:
def __init__(self, start, size, deadline = None):
self.start = start
self.size = size
if deadline is None:
self.deadline = (datetime.now() + timedelta(seconds=config["scraper"]["job-deadline"])).timestamp()
else:
self.deadline = deadline
def to_dict(self):
return {
"start": self.start,
"size": self.size,
"deadline": self.deadline,
}
def dump(job: Job, fd: IO[str | bytes]):
json.dump(job.to_dict(), fd)
def dumps(job: Job) -> str:
return json.dumps(job.to_dict())
def load(fd: IO[str | bytes]) -> Job:
data = json.load(fd)
return Job(data["start"], data["size"], data["deadline"])
def loads(data: str | bytes) -> Job:
data = json.loads(data)
return Job(data["start"], data["size"], data["deadline"])

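A job is just three JSON fields. A minimal round-trip sketch, assuming py34 and a config.toml are available, since scraper.job pulls both in at import time:

from scraper.job import Job, dumps, loads

job = Job(1, 1050)            # deadline defaults to now + [scraper].job-deadline seconds
payload = dumps(job)          # e.g. '{"start": 1, "size": 1050, "deadline": 1760000000.0}'
assert loads(payload).to_dict() == job.to_dict()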
scraper/server/__main__.py Normal file
@@ -0,0 +1,134 @@
from scraper.config import config
from flask import Flask, Response, request, render_template, url_for, send_file
from pathlib import Path
import py34
from .block import BLOCK_SIZE, list_blocks, load_blocks, load_block, load_block_stats, save_block, loads as parse_block, enttype2ext
from .job import assign_job, working_on, any_job, jobs
# Create Flask application
app = Flask(__name__)
# Setup application configuration
for key in config["server"]["web"]:
app.config[key.replace("-", "_").upper()] = config["server"]["web"][key]
@app.route("/")
def index():
return render_template("index.j2")
@app.get("/job")
def get_job():
blocks = list_blocks()
blocks.sort(key = lambda b: b.start)
for n, block in enumerate(blocks):
next_block = blocks[n+1] if len(blocks) != n+1 else None
# Try to fill up blocks that are not yet full
if not block.full and not working_on(block.start):
return assign_job(block.start, block.size).to_dict()
# Try to append new block
virt_start = block.start + block.size
virt_size = BLOCK_SIZE - virt_start % BLOCK_SIZE + 1
if next_block is None:
# We are the last block
while True:
if not working_on(virt_start):
return assign_job(virt_start, virt_size).to_dict()
virt_start += virt_size
virt_size = BLOCK_SIZE
else:
# We are not the last block
while virt_start < next_block.start:
if not working_on(virt_start):
return assign_job(virt_start, virt_size).to_dict()
virt_start += virt_size
virt_size = BLOCK_SIZE
assert len(blocks) == 0 # We should not be here if any blocks exist
if not any_job():
return assign_job(1, BLOCK_SIZE).to_dict()
return assign_job(
max(jobs(), key = lambda j: j.start).start + BLOCK_SIZE,
BLOCK_SIZE,
).to_dict()
@app.get("/blocks")
def get_block_ids():
return list(map(lambda l: l.to_dict(), list_blocks()))
@app.get("/block_stats")
def get_block_stats():
return list(map(lambda h: h.to_dict(), load_block_stats()))
@app.get("/rawblock/<post_id>")
def get_rawblock(post_id: int = None):
assert post_id is not None
return Response(
mimetype = "application/octet-stream",
response = load_block(int(post_id)).dumps(),
)
@app.get("/block/<post_id>")
def get_block(post_id: int = None):
assert post_id is not None
return load_block(int(post_id)).to_dict()
@app.post("/block")
def put_block():
save_block(parse_block(request.data))
return "ok"
@app.get("/thumbnail/<post_id>")
def get_thumbnail(post_id: int = None):
assert post_id is not None
post_id = int(post_id)
return Response(
response=load_block(post_id).entry(post_id).thumbnail,
content_type="image/jpeg",
)
@app.get("/sample/<post_id>")
def get_sample(post_id: int = None):
assert post_id is not None
post_id = int(post_id)
entry = load_block(post_id).entry(post_id)
return Response(
status = 307,
headers = {
"Location": str(py34.url.SampleURL(entry.dir, entry.image))
}
)
@app.get("/image/<post_id>")
def get_image(post_id: int = None):
assert post_id is not None
post_id = int(post_id)
entry = load_block(post_id).entry(post_id)
if entry.type < 100:
return Response(
"Image not found",
status = 404,
)
path = Path.cwd() / Path(f"image/{entry.dir}/{entry.image}.{enttype2ext(entry.type)}")
path.parent.mkdir(parents=True, exist_ok=True)
if path.exists():
return send_file(path)
else:
image = py34.scraper.scraper.get(py34.url.ImageURL(entry.dir, entry.image, enttype2ext(entry.type)))
with open(path, "wb") as file:
file.write(image)
return send_file(path)
# Run application
if __name__ == "__main__":
app.run()

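The routes above are the whole HTTP surface the client talks to. A minimal sketch of exercising it with requests against the default localhost:8000, assuming the server is running and already holds a block that covers post 1:

import requests

base = "http://localhost:8000/"
job = requests.get(base + "job").json()            # {"start": ..., "size": ..., "deadline": ...}
stats = requests.get(base + "block_stats").json()  # list of block header summaries
raw = requests.get(base + "rawblock/1").content    # binary block containing post 1
requests.post(base + "block", data=raw)            # submit (or re-submit) a serialized block
print(job, len(stats), len(raw))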
scraper/server/block.py Normal file
@@ -0,0 +1,123 @@
from scraper.block import *
from scraper.config import config
from pathlib import Path
BLOCK_SIZE = config["scraper"]["block-size"]
class BlockListing:
def __init__(self, start: int, size: int, full: bool):
self.start = int(start)
self.size = int(size)
self.full = bool(full)
def to_dict(self):
return {
"start": self.start,
"size": self.size,
"full": self.full,
}
def _block_dir() -> Path:
return Path(config["server"]["block-dir"])
def _ensure_block_dir() -> Path:
path = _block_dir()
if path.exists():
return path
path.mkdir(parents=True, exist_ok=True)
return path
def list_blocks() -> list[BlockListing]:
path = _ensure_block_dir()
block_files = path.glob("*")
parts = map(lambda p: p.name.split("-"), path.glob("*"))
return sorted(list(map(lambda p: BlockListing(*tuple(map(int, p))), parts)), key=lambda bl: bl.start)
def load_blocks() -> list[Block]:
path = _ensure_block_dir()
blocks = []
for block_path in path.glob("*"):
with open(block_path, "rb") as file:
blocks.append(load(file))
return blocks
def load_block_stats() -> list[BlockHeader]:
path = _ensure_block_dir()
headers = []
for block in path.glob("*"):
with open(block, "rb") as file:
headers.append(load_header(file))
return headers
def load_block(entry_id: int) -> Block:
path = _ensure_block_dir()
block_files = path.glob("*")
low_block: int = None
low_block_size = 0
high_block: int = None
high_block_size = 0
# Try to find block file
for file, start, size, full in map(lambda f: [f]+list(map(int, f.name.split("-"))), path.glob("*")):
# Find closest non-matching lower block
if entry_id >= start and (low_block == None or start >= low_block):
low_block = start
low_block_size = size
# Find closest non-matching higher block
if entry_id <= start and (high_block == None or start <= high_block):
high_block = start
high_block_size = size
# Find matching block
if entry_id >= start and entry_id < start+size:
# Found it, return
with open(file, "rb") as file:
return load(file)
# Failed...
# Determine name of the block file
block_size = BLOCK_SIZE
virtual_block = entry_id // block_size * block_size + 1
virtual_block_end = virtual_block + block_size # Not inclusive
# Clamp around lower block
if low_block is not None:
virtual_block = max(virtual_block, low_block+low_block_size)
# Clamp around higher block
if high_block is not None:
virtual_block_end = min(virtual_block_end, high_block)
# Create a new block
return Block(virtual_block, virtual_block_end - virtual_block)
def save_block(new_block: Block):
# Don't bother saving empty blocks
if len(new_block.entries) == 0:
return
def write_block(b: Block):
path = _ensure_block_dir()
with open(path / f"{b.start}-{b.size}-{int(len(b.entries)==b.size)}", "wb") as file:
dump(b, file)
badfile = path / f"{b.start}-{b.size}-{int(len(b.entries)!=b.size)}"
if badfile.exists():
badfile.unlink()
block = load_block(new_block.entries[0].post)
for entry in new_block.entries:
if block.start + block.size <= entry.post:
write_block(block)
block = load_block(entry.post)
block.add_entry(entry)
write_block(block)

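Block files are named {start}-{size}-{full}, where the last field becomes 1 once the block holds size entries. A minimal sketch of scanning the block directory the same way list_blocks() does, with the directory name taken from the [server] block-dir setting in config.toml:

from pathlib import Path

for path in sorted(Path("blocks").glob("*"), key=lambda p: int(p.name.split("-")[0])):
    start, size, full = map(int, path.name.split("-"))
    print(f"block {start}..{start + size - 1}: {'full' if full else 'partial'}")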
scraper/server/job.py Normal file
@@ -0,0 +1,44 @@
from scraper.job import *
from datetime import datetime
from collections.abc import Iterator
_jobs: list[Job] = []
def _clean_jobs():
global _jobs
_jobs = list(filter(lambda j: datetime.fromtimestamp(j.deadline) > datetime.now(), _jobs))
def working_on(block_start: int) -> bool:
global _jobs
_clean_jobs()
for job in _jobs:
if job.start <= block_start and job.start + job.size > block_start:
return True
return False
def assign_job(block_start: int, block_size: int) -> Job:
global _jobs
if working_on(block_start):
raise Exception("Job already assigned")
job = Job(block_start, block_size)
_jobs.append(job)
return job
def jobs() -> Iterator[Job]:
global _jobs
_clean_jobs()
for job in _jobs:
yield job
def any_job() -> bool:
global _jobs
_clean_jobs()
return bool(len(_jobs))

scraper/server/static/d3.min.js vendored Normal file
File diff suppressed because one or more lines are too long

Binary file not shown (new image, 18 KiB).

@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>Rule34 Scraper</title>
<script src="{{ url_for('static', filename='d3.min.js') }}"></script>
</head>
<body>
<h1>Hello, World!</h1>
<img src="{{ url_for('static', filename='sus.png') }}">
</body>
</html>