Compare commits
26 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 3ead328a0f | |
| | f55f118271 | |
| | 1dc17d7670 | |
| | 6d6e13c231 | |
| | a4e3ced8ab | |
| | 0b3293eaa8 | |
| | 5473f92470 | |
| | ee4ba5d5f2 | |
| | 217595fcdd | |
| | be6740a24c | |
| | 71efaf7c25 | |
| | b81684de19 | |
| | f9fe6af292 | |
| | 6d1b1628f1 | |
| | 0143cc3999 | |
| | a39bfcd0b1 | |
| | 826c5c3473 | |
| | 265ec8d58e | |
| | 241bc90e82 | |
| | bdf77b2920 | |
| | 5f4358a3d1 | |
| | 8e3a7b105a | |
| | d0dec584a8 | |
| | fb06339cc7 | |
| | 48d17fcf7b | |
| | c1b8be46aa | |
config.toml (Normal file, 17 lines)
@@ -0,0 +1,17 @@
[scraper]
block-size = 1050 # Max 65536
job-deadline = 3600

[client]
server = "http://localhost:8000/"
network-threads = 12
check-missing = false
patch-failed = true

[server]
block-dir = "blocks"

[server.web]
debug = true
server-name = "localhost:8000"
application-root = "/"
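These settings are read straight from `config.toml` with the standard-library `tomllib` module (the same pattern `scraper/config.py` below uses); nested TOML tables become nested dicts keyed by the hyphenated names above. A minimal sketch of accessing them:

```python
import tomllib

# tomllib only accepts binary file objects
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

# Nested tables map to nested dicts; keys keep their hyphens
block_size = config["scraper"]["block-size"]        # 1050
server_url = config["client"]["server"]             # "http://localhost:8000/"
debug_mode = config["server"]["web"]["debug"]       # True
```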
@@ -1,10 +1,16 @@
beautifulsoup4==4.13.4
blinker==1.9.0
bs4==0.0.2
certifi==2025.7.14
charset-normalizer==3.4.2
click==8.2.1
cloudscraper==1.2.71
decorator==5.2.1
Flask==3.1.1
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
MarkupSafe==3.0.2
pillow==11.3.0
py==1.11.0
pyparsing==3.2.3
@@ -14,3 +20,4 @@ retry==0.9.2
soupsieve==2.7
typing_extensions==4.14.1
urllib3==2.5.0
Werkzeug==3.1.3
scraper/block.py (Normal file, 331 lines)
@@ -0,0 +1,331 @@
from py34 import Post
from py34.url import ImageURL, IMAGE_FORMATS, VIDEO_FORMATS
from py34.scraper import scraper
from typing import IO
from io import BytesIO
from enum import IntEnum
from bisect import insort
import math
import zlib
import time


class BlockEntryType(IntEnum):
    # Status
    ERROR = 0
    MISSING = 1
    MAYBE_MISSING = 2  # Did not appear on the listing, but the post was never checked

    # Image
    JPG = 100
    JPEG = 101
    PNG = 102

    # Animated Image
    GIF = 200

    # Audio

    # Video
    MP4 = 400
    WEBM = 401


def ext2enttype(ext: str) -> BlockEntryType:
    return BlockEntryType[ext.upper()]


def enttype2ext(enttype: BlockEntryType) -> str:
    if enttype.value < 50:
        raise Exception("Entry does not refer to a file")
    return enttype.name.lower()


def _compute_tag_bits(tags: list[str]) -> list[dict[str, int]]:
    # Compute tag bits: one 32-bit rank per 32 tags, mapping tag -> bit mask
    tag_bits = [{} for _ in range(math.ceil(len(tags) / 32))]
    for n in range(len(tags)):
        bit = n % 32
        rank = n // 32
        tag_bits[rank][tags[n]] = 1 << bit
    return tag_bits


def _pad_bytes(size: int, pad_addr: int) -> int:
    addr_bytes = pad_addr // 8
    overflow = size % addr_bytes
    if overflow == 0:
        return 0
    else:
        return addr_bytes - overflow


class Block:
    def __init__(self, start: int, size: int):
        self.start: int = start
        self.size: int = size
        self.entries: list[BlockEntry] = []

    def add_post(self, post: Post, enttype: BlockEntryType | None = None):
        self.add_entry(BlockEntry(post, enttype))

    def add_entry(self, entry: "BlockEntry"):
        # Check if entry belongs to this block
        if not self.belongs(entry.post):
            raise Exception("Given post does not belong to this block")
        # Remove entry with matching post-id, if it exists
        self.entries = list(filter(lambda e: e.post != entry.post, self.entries))
        # Insert the entry
        insort(self.entries, entry)

    def dump(self, fd: IO[bytes]):
        # Sort all entries
        self.entries.sort()

        # Combine all tags
        tags: set[str] = set()
        for entry in self.entries:
            tags |= set(entry.tags)
        tags: list[str] = list(tags)
        tags.sort()

        # Compute tag bits
        tag_bits = _compute_tag_bits(tags)

        # Compute length of tag strings
        tags_len_b = 0
        for tag in tags:
            tags_len_b += len(tag) + 1

        # Compress tag data
        tag_data = zlib.compress((b"\0".join(map(str.encode, tags))) + b"\0", level=9)

        # Magic 4B
        fd.write(b"34BK")

        # Entry amount 2B
        fd.write(len(self.entries).to_bytes(2, "little", signed=False))

        # Size 2B
        fd.write(self.size.to_bytes(2, "little", signed=False))

        # Start 4B
        fd.write(self.start.to_bytes(4, "little", signed=False))

        # Amount of different tags 2B
        fd.write(len(tags).to_bytes(2, "little", signed=False))

        # Size of tag data 2B
        fd.write(len(tag_data).to_bytes(2, "little", signed=False))

        # Write all the tags
        fd.write(tag_data)

        # Pad to the nearest 32-bit address
        for _ in range(_pad_bytes(len(tag_data), 32)):
            fd.write(b"\0")

        # Dump entries
        for entry in self.entries:
            image = bytearray.fromhex(entry.image)

            # Post ID 4B
            fd.write(entry.post.to_bytes(4, "little", signed=False))

            # Entry type enum 2B
            fd.write(entry.type.to_bytes(2, "little", signed=False))

            # Image id length 2B
            fd.write(len(image).to_bytes(2, "little", signed=False))

            # Image dir 4B
            fd.write(entry.dir.to_bytes(4, "little", signed=False))

            # Thumbnail size 4B
            fd.write(len(entry.thumbnail).to_bytes(4, "little", signed=False))

            # Tag bits 4B*ranks
            for rank in tag_bits:
                word = 0
                for tag in entry.tags:
                    if tag in rank:
                        word |= rank[tag]
                fd.write(word.to_bytes(4, "little", signed=False))

            # Image ID
            fd.write(image)

            # Thumbnail
            fd.write(entry.thumbnail)

            # Pad to the nearest 32-bit address
            for _ in range(_pad_bytes(len(image) + len(entry.thumbnail), 32)):
                fd.write(b"\0")

    def dumps(self) -> bytes:
        io = BytesIO()
        self.dump(io)
        return io.getvalue()

    def entry(self, post_id: int) -> "BlockEntry":
        for entry in self.entries:
            if entry.post == post_id:
                return entry
        raise IndexError("Entry not found")

    def belongs(self, post_id: int) -> bool:
        return post_id >= self.start and post_id < self.start + self.size

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "entries": list(map(BlockEntry.to_dict, self.entries)),
        }


class BlockEntry:
    def __init__(self, post: Post, enttype: BlockEntryType | None):
        self.post = post.id
        self.dir = post.image_dir
        self.image = post.image_id
        self.type = enttype
        self.tags = post.tags.copy()
        self.thumbnail = post.get_thumbnail_data()

        if self.type is None:
            for ext in VIDEO_FORMATS + IMAGE_FORMATS:
                image_url = ImageURL(post.image_dir, post.image_id, ext)
                status: int | None = None
                # CDN tends to return 503 : Service Unavailable
                while status != 200 and status != 404:
                    status = scraper.head(image_url, body=False).status_code
                    if status != 200 and status != 404:
                        scraper.reset()
                        time.sleep(1)
                # HEAD could fail, try with GET
                while status != 200 and status != 404:
                    status = scraper.get(image_url, body=False).status_code
                    if status != 200 and status != 404:
                        scraper.reset()
                        time.sleep(1)
                if status == 200:
                    self.type = ext2enttype(ext)
                    break
            if self.type is None:
                self.type = BlockEntryType.ERROR

    def to_dict(self):
        return {
            "post": self.post,
            "dir": self.dir,
            "image": self.image,
            "type": self.type.name,
            "tags": self.tags,
        }

    def __lt__(self, other):
        return self.post < other.post


class BlockHeader:
    def __init__(self, start: int, size: int, entries: int):
        self.start = start
        self.size = size
        self.entries = entries

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "entries": self.entries,
        }


def dump(block: Block, fd: IO[bytes]):
    block.dump(fd)


def dumps(block: Block) -> bytes:
    return block.dumps()


def load_header(fd: IO[bytes]) -> BlockHeader:
    if fd.read(4) != b"34BK":
        raise Exception("Stream is not Block data")

    def read_dword(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(4), "little", signed=False)

    def read_word(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(2), "little", signed=False)

    entries_len = read_word(fd)
    block_size = read_word(fd)
    block_start = read_dword(fd)

    return BlockHeader(block_start, block_size, entries_len)


def load(fd: IO[bytes]) -> Block:
    if fd.read(4) != b"34BK":
        raise Exception("Stream is not Block data")

    def read_dword(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(4), "little", signed=False)

    def read_word(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(2), "little", signed=False)

    # Read header
    entries_len = read_word(fd)
    block_size = read_word(fd)
    block_start = read_dword(fd)
    tags_len = read_word(fd)
    tags_data_len = read_word(fd)

    # Read tags
    tags_data = zlib.decompress(fd.read(tags_data_len))
    tags = list(map(bytes.decode, tags_data.rstrip(b"\0").split(b"\0")))

    # Slurp padding bytes
    fd.read(_pad_bytes(tags_data_len, 32))

    # Compute tag bits
    tag_bits = _compute_tag_bits(tags)

    # Load entries
    block = Block(block_start, block_size)
    for n in range(entries_len):
        # Read header
        post_id = read_dword(fd)
        enttype = BlockEntryType(read_word(fd))
        image_id_len = read_word(fd)
        image_dir = read_dword(fd)
        thumbnail_size = read_dword(fd)

        # Read tags
        post_tags = []
        for rank in tag_bits:
            bits = read_dword(fd)
            for tag in rank:
                if rank[tag] & bits:
                    post_tags.append(tag)

        # Read image id
        image = fd.read(image_id_len)
        image_id = image.hex()

        # Read image thumbnail
        thumbnail = fd.read(thumbnail_size)

        # Slurp padding bytes
        fd.read(_pad_bytes(image_id_len + thumbnail_size, 32))

        block.add_post(Post(post_id, image_dir, image_id, post_tags, thumbnail), enttype)

    return block


def loads(data: bytes) -> Block:
    return load(BytesIO(data))
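The byte layout written by `Block.dump` is, roughly: the `34BK` magic, entry count (2 B), block size (2 B), start id (4 B), tag count (2 B), compressed-tag-data length (2 B), the zlib-compressed tag strings padded to a 4-byte boundary, then one record per entry (post id, type, image-id length, dir, thumbnail size, one 32-bit tag word per rank, image id, thumbnail, padding). A small round-trip sketch, assuming the `py34` dependency is importable; an empty block is used here because constructing a `BlockEntry` triggers network calls:

```python
from io import BytesIO
import scraper.block as block

# Serialize an empty block covering post ids 1..1050
b = block.Block(start=1, size=1050)
data = block.dumps(b)
assert data[:4] == b"34BK"               # magic written by Block.dump

# Parse just the fixed-size header, without decoding tags or entries
header = block.load_header(BytesIO(data))
print(header.to_dict())                  # {'start': 1, 'size': 1050, 'entries': 0}

# A full parse restores an equivalent Block
restored = block.loads(data)
assert (restored.start, restored.size, len(restored.entries)) == (1, 1050, 0)
```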
scraper/client/__main__.py (Normal file, 187 lines)
@@ -0,0 +1,187 @@
from datetime import datetime
from enum import StrEnum
from concurrent.futures import ThreadPoolExecutor
import requests
import tomllib
import traceback
import gc

import py34
import scraper.job


_spinner = 0
def spinner() -> str:
    global _spinner
    _spinner += 1
    return "/-\\|"[_spinner % 4]


class FG(StrEnum):
    _ = "\x1B[0m"
    r = "\x1B[31m"
    g = "\x1B[32m"
    y = "\x1B[33m"
    b = "\x1B[34m"
    p = "\x1B[35m"
    c = "\x1B[36m"


def web_get(url: str) -> bytes:
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    content = res.content
    res.close()
    return content


def web_post(url: str, data: bytes):
    res = requests.post(url, data)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    res.close()


# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

netthd = config["client"]["network-threads"]
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]

if server[-1] != "/":
    server += "/"


# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server: ", server)
print("check missing: ", check_missing)
print("patch failed: ", patch_failed)
print("=========================================================")


# Enter main loop
# while True:
# It leaks, idk how to fix it, restart program for now :p
if True:
    # Fetch a job
    print("Requesting job...", end="")
    job = scraper.job.loads(web_get(server + "job"))

    # Print job stats
    deadline_date = datetime.fromtimestamp(job.deadline)
    print(
        f"\rGot work from {FG.c}{job.start}{FG._}" +
        f" to {FG.c}{job.start + job.size - 1}{FG._}," +
        f" with deadline at {FG.p}{deadline_date}{FG._}"
    )

    # Prepare a block
    block = scraper.block.Block(job.start, job.size)

    # Prepare dictionary for expected posts
    expected: dict[int, bool] = {}
    for n in range(block.start, block.start + block.size):
        expected[n] = False

    # Start scraping
    pid = 0
    end = False
    failed: list[py34.Post] = []
    while not end:
        # Get a list
        # print(f"Downloading listing page {FG.b}{pid}{FG._}")
        lst = py34.List(["id:<" + str(job.start + job.size)], pid)

        # Check if we have reached the end
        for post in lst.posts:
            if post.id <= job.start:
                end = True

        # If so, remove posts that exceed our job quota
        if end:
            lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))

        # Add posts to block, fetching extensions in the process
        def _add_post(block: scraper.block.Block, post: py34.Post):
            try:
                expected[post.id] = True
                entry = scraper.block.BlockEntry(post, None)
                block.add_entry(entry)
                if entry.type == scraper.block.BlockEntryType.ERROR:
                    failed.append(post)
                print(
                    f"\rPage {FG.b}{pid}{FG._}" +
                    f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}" +
                    (f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
                    end="",
                )
            except Exception as ex:
                print(f"{FG.r}{ex}{FG._}")
                print(f"{FG.y}{traceback.format_exc()}{FG._}")
                raise ex
        with ThreadPoolExecutor(max_workers=netthd) as pool:
            for post in lst.posts:
                pool.submit(_add_post, block, post)

        # Increase pid for next iteration
        pid += len(lst.posts)

        # Clean leaking connection handles
        py34.scraper.scraper.reset()
        gc.collect()
    print()

    # Patch failed block entries
    if patch_failed:
        for post in failed:
            print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
            try:
                view = py34.View(post.id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
            except py34.view.ViewMissingException as ex:
                block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
                print(f" {FG.r}Entry does not exist.{FG._}")

    # Find missing block entries
    missing: list[int] = []
    for post_id in expected:
        if expected[post_id]:
            continue
        missing.append(post_id)

    if check_missing:
        found: int = 0
        for n, post_id in zip(range(len(missing)), missing):
            print(
                f"\x1B[2K\rSearching for missing entries" +
                f" {FG.y}{n+1}/{len(missing)}{FG._}" +
                f", found {FG.g}{found}{FG._}" +
                f", missing {FG.r}{len(missing)-found}{FG._}",
                end="",
            )
            try:
                view = py34.View(post_id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                found += 1
            except py34.view.ViewMissingException as ex:
                block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
        print()
    else:
        for post_id in missing:
            block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)

    # Send block to server
    print(f"Block: {len(block.entries)}/{block.size}")
    print("Sending block to server...", end="")
    try:
        web_post(server + "block", scraper.block.dumps(block))
        print(f"{FG.g}Ok{FG._}")
    except Exception as ex:
        print(f"{FG.r}Fail{FG._}")
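Stripped of the scraping loop, the client's exchange with the server is just two HTTP calls: `GET /job` returns the JSON produced by `Job.to_dict()`, and `POST /block` uploads the serialized block bytes. A bare-bones sketch of that round trip, assuming a server is running at the default `http://localhost:8000/` and that a `config.toml` is present (importing `scraper.job` loads it):

```python
import requests
import scraper.block
import scraper.job

SERVER = "http://localhost:8000/"  # default from config.toml

# Fetch a work unit; the body is Job.to_dict() serialized as JSON
job = scraper.job.loads(requests.get(SERVER + "job").content)
print(job.start, job.size, job.deadline)

# Build a block for that range (empty here; the real client fills it by scraping)
blk = scraper.block.Block(job.start, job.size)

# Upload the binary block; the server merges it into its block files
requests.post(SERVER + "block", data=scraper.block.dumps(blk))
```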
scraper/config.py (Normal file, 5 lines)
@@ -0,0 +1,5 @@
import tomllib


with open("config.toml", "rb") as file:
    config = tomllib.load(file)
scraper/job.py (Normal file, 38 lines)
@@ -0,0 +1,38 @@
from .config import config
from .block import Block
from typing import IO
from datetime import datetime, timedelta
import json


class Job:
    def __init__(self, start, size, deadline=None):
        self.start = start
        self.size = size
        if deadline is None:
            self.deadline = (datetime.now() + timedelta(seconds=config["scraper"]["job-deadline"])).timestamp()
        else:
            self.deadline = deadline

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "deadline": self.deadline,
        }


def dump(job: Job, fd: IO[str | bytes]):
    json.dump(job.to_dict(), fd)


def dumps(job: Job) -> str:
    return json.dumps(job.to_dict())


def load(fd: IO[str | bytes]) -> Job:
    data = json.load(fd)
    return Job(data["start"], data["size"], data["deadline"])


def loads(data: str | bytes) -> Job:
    data = json.loads(data)
    return Job(data["start"], data["size"], data["deadline"])
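A `Job` is only a start/size pair plus an absolute Unix-timestamp deadline, and `dumps`/`loads` round-trip it through JSON. A quick sketch (importing `scraper.job` pulls in `scraper.config`, so a `config.toml` must be present):

```python
import scraper.job

job = scraper.job.Job(start=1, size=1050)   # deadline defaults to now + [scraper].job-deadline seconds
payload = scraper.job.dumps(job)            # e.g. '{"start": 1, "size": 1050, "deadline": ...}'
restored = scraper.job.loads(payload)
assert (restored.start, restored.size, restored.deadline) == (job.start, job.size, job.deadline)
```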
scraper/server/__main__.py (Normal file, 134 lines)
@@ -0,0 +1,134 @@
from scraper.config import config
from flask import Flask, Response, request, render_template, url_for, send_file
from pathlib import Path
import py34

from .block import BLOCK_SIZE, list_blocks, load_blocks, load_block, load_block_stats, save_block, loads as parse_block, enttype2ext
from .job import assign_job, working_on, any_job, jobs


# Create Flask application
app = Flask(__name__)


# Setup application configuration
for key in config["server"]["web"]:
    app.config[key.replace("-", "_").upper()] = config["server"]["web"][key]


@app.route("/")
def index():
    return render_template("index.j2")


@app.get("/job")
def get_job():
    blocks = list_blocks()
    blocks.sort(key=lambda b: b.start)
    for n, block in zip(range(len(blocks)), blocks):
        next_block = blocks[n+1] if len(blocks) != n+1 else None

        # Try to fill up hollow blocks
        if not block.full and not working_on(block.start):
            return assign_job(block.start, block.size).to_dict()

        # Try to append a new block
        virt_start = block.start + block.size
        virt_size = BLOCK_SIZE - virt_start % BLOCK_SIZE + 1
        if next_block is None:
            # We are the last block
            while True:
                if not working_on(virt_start):
                    return assign_job(virt_start, virt_size).to_dict()
                virt_start += virt_size
                virt_size = BLOCK_SIZE
        else:
            # We are not the last block
            while virt_start < next_block.start:
                if not working_on(virt_start):
                    return assign_job(virt_start, virt_size).to_dict()
                virt_start += virt_size
                virt_size = BLOCK_SIZE

    assert len(blocks) == 0  # We should not be here if blocks exist

    if not any_job():
        return assign_job(1, BLOCK_SIZE).to_dict()
    return assign_job(
        max(jobs(), key=lambda j: j.start).start + BLOCK_SIZE,
        BLOCK_SIZE,
    ).to_dict()


@app.get("/blocks")
def get_block_ids():
    return list(map(lambda l: l.to_dict(), list_blocks()))


@app.get("/block_stats")
def get_block_stats():
    return list(map(lambda h: h.to_dict(), load_block_stats()))


@app.get("/rawblock/<post_id>")
def get_rawblock(post_id: int = None):
    assert post_id is not None
    return Response(
        mimetype="application/octet-stream",
        response=load_block(int(post_id)).dumps(),
    )


@app.get("/block/<post_id>")
def get_block(post_id: int = None):
    assert post_id is not None
    return load_block(int(post_id)).to_dict()


@app.post("/block")
def put_block():
    save_block(parse_block(request.data))
    return "ok"


@app.get("/thumbnail/<post_id>")
def get_thumbnail(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    return Response(
        response=load_block(post_id).entry(post_id).thumbnail,
        content_type="image/jpeg",
    )


@app.get("/sample/<post_id>")
def get_sample(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    entry = load_block(post_id).entry(post_id)
    return Response(
        status=307,
        headers={
            "Location": str(py34.url.SampleURL(entry.dir, entry.image))
        }
    )


@app.get("/image/<post_id>")
def get_image(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    entry = load_block(post_id).entry(post_id)
    if entry.type < 100:
        return Response(
            "Image not found",
            status=404,
        )

    path = Path.cwd() / Path(f"image/{entry.dir}/{entry.image}.{enttype2ext(entry.type)}")
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        return send_file(path)
    else:
        image = py34.scraper.scraper.get(py34.url.ImageURL(entry.dir, entry.image, enttype2ext(entry.type)))
        with open(path, "wb") as file:
            file.write(image)
        return send_file(path)


# Run application
if __name__ == "__main__":
    app.run()
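One detail worth calling out: the startup loop above copies every `[server.web]` key into Flask's config after converting hyphens to underscores and upper-casing, so the TOML keys land on Flask's standard config names. With the `config.toml` in this diff that produces:

```python
# The same transformation app.config receives at startup
web = {"debug": True, "server-name": "localhost:8000", "application-root": "/"}
flask_config = {key.replace("-", "_").upper(): value for key, value in web.items()}
print(flask_config)  # {'DEBUG': True, 'SERVER_NAME': 'localhost:8000', 'APPLICATION_ROOT': '/'}
```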
scraper/server/block.py (Normal file, 123 lines)
@@ -0,0 +1,123 @@
from scraper.block import *
from scraper.config import config
from pathlib import Path


BLOCK_SIZE = config["scraper"]["block-size"]


class BlockListing:
    def __init__(self, start: int, size: int, full: bool):
        self.start = int(start)
        self.size = int(size)
        self.full = bool(full)

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "full": self.full,
        }


def _block_dir() -> Path:
    return Path(config["server"]["block-dir"])


def _ensure_block_dir() -> Path:
    path = _block_dir()
    if path.exists():
        return path
    path.mkdir(parents=True, exist_ok=True)
    return path


def list_blocks() -> list[BlockListing]:
    path = _ensure_block_dir()
    parts = map(lambda p: p.name.split("-"), path.glob("*"))
    return sorted(list(map(lambda p: BlockListing(*tuple(map(int, p))), parts)), key=lambda bl: bl.start)


def load_blocks() -> list[Block]:
    path = _ensure_block_dir()
    blocks = []
    for block_path in path.glob("*"):
        with open(block_path, "rb") as file:
            blocks.append(load(file))
    return blocks


def load_block_stats() -> list[BlockHeader]:
    path = _ensure_block_dir()
    headers = []
    for block in path.glob("*"):
        with open(block, "rb") as file:
            headers.append(load_header(file))
    return headers


def load_block(entry_id: int) -> Block:
    path = _ensure_block_dir()
    low_block: int | None = None
    low_block_size = 0
    high_block: int | None = None
    high_block_size = 0

    # Try to find an existing block file
    for file, start, size, full in map(lambda f: [f] + list(map(int, f.name.split("-"))), path.glob("*")):

        # Find closest non-matching lower block
        if entry_id >= start and (low_block is None or start >= low_block):
            low_block = start
            low_block_size = size

        # Find closest non-matching higher block
        if entry_id <= start and (high_block is None or start <= high_block):
            high_block = start
            high_block_size = size

        # Find matching block
        if entry_id >= start and entry_id < start + size:
            # Found it, return
            with open(file, "rb") as file:
                return load(file)
    # Failed...

    # Determine the range of a new block file
    block_size = BLOCK_SIZE
    virtual_block = entry_id // block_size * block_size + 1
    virtual_block_end = virtual_block + block_size  # Not inclusive

    # Clamp around lower block
    if low_block is not None:
        virtual_block = max(virtual_block, low_block + low_block_size)

    # Clamp around higher block
    if high_block is not None:
        virtual_block_end = min(virtual_block_end, high_block)

    # Create a new block
    return Block(virtual_block, virtual_block_end - virtual_block)


def save_block(new_block: Block):
    # Don't bother saving empty blocks
    if len(new_block.entries) == 0:
        return

    def write_block(b: Block):
        path = _ensure_block_dir()
        with open(path / f"{b.start}-{b.size}-{int(len(b.entries) == b.size)}", "wb") as file:
            dump(b, file)
        badfile = path / f"{b.start}-{b.size}-{int(len(b.entries) != b.size)}"
        if badfile.exists():
            badfile.unlink()

    block = load_block(new_block.entries[0].post)
    for entry in new_block.entries:
        if block.start + block.size <= entry.post:
            write_block(block)
            block = load_block(entry.post)
        block.add_entry(entry)
    write_block(block)
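Block files on disk are named `<start>-<size>-<full>`, which is why `list_blocks` and `load_block` can recover everything they need from the filename alone, without opening the file. A small sketch of that encoding (the filename shown is hypothetical):

```python
# Encode: how write_block names a partially filled block starting at post 1051
start, size, entries = 1051, 1050, 400
name = f"{start}-{size}-{int(entries == size)}"
print(name)                       # "1051-1050-0"

# Decode: how list_blocks turns the name back into a listing
start, size, full = map(int, name.split("-"))
print(start, size, bool(full))    # 1051 1050 False
```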
scraper/server/job.py (Normal file, 44 lines)
@@ -0,0 +1,44 @@
from scraper.job import *
from datetime import datetime


_jobs: list[Job] = []


def _clean_jobs():
    global _jobs
    _jobs = list(filter(lambda j: datetime.fromtimestamp(j.deadline) > datetime.now(), _jobs))


def working_on(block_start: int) -> bool:
    global _jobs

    _clean_jobs()

    for job in _jobs:
        if job.start <= block_start and job.start + job.size > block_start:
            return True
    return False


def assign_job(block_start: int, block_size: int) -> Job:
    global _jobs

    if working_on(block_start):
        raise Exception("Job already assigned")
    job = Job(block_start, block_size)
    _jobs.append(job)
    return job


def jobs() -> Job:
    global _jobs
    _clean_jobs()
    for job in _jobs:
        yield job


def any_job() -> bool:
    global _jobs
    _clean_jobs()
    return bool(len(_jobs))
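Job tracking is purely in-memory: `assign_job` appends to a module-level list, `working_on` checks whether a post id falls inside any live job's range, and `_clean_jobs` drops jobs whose deadline has passed. A usage sketch, assuming the `scraper.server` package imports resolve and a `config.toml` is present:

```python
from scraper.server import job as jobs

j = jobs.assign_job(1, 1050)      # register a job covering posts 1..1050
assert jobs.working_on(1)         # start of the range
assert jobs.working_on(1050)      # last id still inside [start, start + size)
assert not jobs.working_on(1051)  # first id past the range
assert jobs.any_job()
```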
scraper/server/static/d3.min.js (vendored, Normal file, 2 lines)
File diff suppressed because one or more lines are too long
scraper/server/static/sus.png (Normal file, binary)
Binary file not shown.
After: 18 KiB
scraper/server/templates/index.j2 (Normal file, 11 lines)
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
    <head>
        <title>Rule34 Scraper</title>
        <script src={{url_for("static", filename="d3.min.js")}}></script>
    </head>
    <body>
        <h1>Hello, World!</h1>
        <img src={{url_for("static", filename="sus.png")}}>
    </body>
</html>