Compare commits
26 Commits
master...3ead328a0f
| SHA1 |
| --- |
| 3ead328a0f |
| f55f118271 |
| 1dc17d7670 |
| 6d6e13c231 |
| a4e3ced8ab |
| 0b3293eaa8 |
| 5473f92470 |
| ee4ba5d5f2 |
| 217595fcdd |
| be6740a24c |
| 71efaf7c25 |
| b81684de19 |
| f9fe6af292 |
| 6d1b1628f1 |
| 0143cc3999 |
| a39bfcd0b1 |
| 826c5c3473 |
| 265ec8d58e |
| 241bc90e82 |
| bdf77b2920 |
| 5f4358a3d1 |
| 8e3a7b105a |
| d0dec584a8 |
| fb06339cc7 |
| 48d17fcf7b |
| c1b8be46aa |
17
config.toml
Normal file
@@ -0,0 +1,17 @@
[scraper]
block-size = 1050 # Max 65536
job-deadline = 3600

[client]
server = "http://localhost:8000/"
network-threads = 12
check-missing = false
patch-failed = true

[server]
block-dir = "blocks"

[server.web]
debug = true
server-name = "localhost:8000"
application-root = "/"
requirements.txt

@@ -1,10 +1,16 @@
 beautifulsoup4==4.13.4
+blinker==1.9.0
 bs4==0.0.2
 certifi==2025.7.14
 charset-normalizer==3.4.2
+click==8.2.1
 cloudscraper==1.2.71
 decorator==5.2.1
+Flask==3.1.1
 idna==3.10
+itsdangerous==2.2.0
+Jinja2==3.1.6
+MarkupSafe==3.0.2
 pillow==11.3.0
 py==1.11.0
 pyparsing==3.2.3
@@ -14,3 +20,4 @@ retry==0.9.2
 soupsieve==2.7
 typing_extensions==4.14.1
 urllib3==2.5.0
+Werkzeug==3.1.3
331
scraper/block.py
Normal file
@@ -0,0 +1,331 @@
from py34 import Post
from py34.url import ImageURL, IMAGE_FORMATS, VIDEO_FORMATS
from py34.scraper import scraper
from typing import IO
from io import BytesIO
from enum import IntEnum
from bisect import insort
import math
import zlib
import time


class BlockEntryType(IntEnum):
    # Status
    ERROR = 0
    MISSING = 1
    MAYBE_MISSING = 2  # Did not appear on the listing, but the post was never checked

    # Image
    JPG = 100
    JPEG = 101
    PNG = 102

    # Animated Image
    GIF = 200

    # Audio

    # Video
    MP4 = 400
    WEBM = 401


def ext2enttype(ext: str) -> BlockEntryType:
    return BlockEntryType[ext.upper()]


def enttype2ext(enttype: BlockEntryType) -> str:
    if enttype.value < 50:
        raise Exception("Entry does not refer to a file")
    return enttype.name.lower()


def _compute_tag_bits(tags: list[str]) -> list[dict[str, int]]:
    # Assign each tag a bit in a 32-bit word; every 32 tags start a new "rank"
    tag_bits: list[dict[str, int]] = [{} for _ in range(math.ceil(len(tags) / 32))]
    for n in range(len(tags)):
        bit = n % 32
        rank = n // 32
        tag_bits[rank][tags[n]] = 1 << bit
    return tag_bits


def _pad_bytes(size: int, pad_addr: int) -> int:
    # Number of padding bytes needed to align `size` to a pad_addr-bit boundary
    addr_bytes = pad_addr // 8
    overflow = size % addr_bytes
    if overflow == 0:
        return 0
    else:
        return addr_bytes - overflow


class Block:
    def __init__(self, start: int, size: int):
        self.start: int = start
        self.size: int = size
        self.entries: list[BlockEntry] = []

    def add_post(self, post: Post, enttype: BlockEntryType | None = None):
        self.add_entry(BlockEntry(post, enttype))

    def add_entry(self, entry: "BlockEntry"):
        # Check if entry belongs to this block
        if not self.belongs(entry.post):
            raise Exception("Given post does not belong to this block")
        # Remove entry with matching post-id, if it exists
        self.entries = list(filter(lambda e: e.post != entry.post, self.entries))
        # Insert the entry, keeping the list sorted
        insort(self.entries, entry)

    def dump(self, fd: IO[bytes]):
        # Sort all entries
        self.entries.sort()

        # Combine all tags
        tags: set[str] = set()
        for entry in self.entries:
            tags |= set(entry.tags)
        tags: list[str] = list(tags)
        tags.sort()

        # Compute tag bits
        tag_bits = _compute_tag_bits(tags)

        # Compute length of tag strings
        tags_len_b = 0
        for tag in tags:
            tags_len_b += len(tag) + 1

        # Compress tag data
        tag_data = zlib.compress((b"\0".join(map(str.encode, tags))) + b"\0", level=9)

        # Magic 4B
        fd.write(b"34BK")

        # Entry amount 2B
        fd.write(len(self.entries).to_bytes(2, "little", signed=False))

        # Size 2B
        fd.write(self.size.to_bytes(2, "little", signed=False))

        # Start 4B
        fd.write(self.start.to_bytes(4, "little", signed=False))

        # Amount of different tags 2B
        fd.write(len(tags).to_bytes(2, "little", signed=False))

        # Size of tag data 2B
        fd.write(len(tag_data).to_bytes(2, "little", signed=False))

        # Write all the tags
        fd.write(tag_data)

        # Pad to the nearest 32-bit address
        for _ in range(_pad_bytes(len(tag_data), 32)):
            fd.write(b"\0")

        # Dump entries
        for entry in self.entries:
            image = bytearray.fromhex(entry.image)

            # Post ID 4B
            fd.write(entry.post.to_bytes(4, "little", signed=False))

            # Entry type enum 2B
            fd.write(entry.type.to_bytes(2, "little", signed=False))

            # Image id length 2B
            fd.write(len(image).to_bytes(2, "little", signed=False))

            # Image dir 4B
            fd.write(entry.dir.to_bytes(4, "little", signed=False))

            # Thumbnail size 4B
            fd.write(len(entry.thumbnail).to_bytes(4, "little", signed=False))

            # Tag bits 4B*ranks
            for rank in tag_bits:
                word = 0
                for tag in entry.tags:
                    if tag in rank:
                        word |= rank[tag]
                fd.write(word.to_bytes(4, "little", signed=False))

            # Image ID
            fd.write(image)

            # Thumbnail
            fd.write(entry.thumbnail)

            # Pad to the nearest 32-bit address
            for _ in range(_pad_bytes(len(image) + len(entry.thumbnail), 32)):
                fd.write(b"\0")

    def dumps(self) -> bytes:
        io = BytesIO()
        self.dump(io)
        return io.getvalue()

    def entry(self, post_id: int) -> "BlockEntry":
        for entry in self.entries:
            if entry.post == post_id:
                return entry
        raise IndexError("Entry not found")

    def belongs(self, post_id: int) -> bool:
        return post_id >= self.start and post_id < self.start + self.size

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "entries": list(map(BlockEntry.to_dict, self.entries)),
        }


class BlockEntry:
    def __init__(self, post: Post, enttype: BlockEntryType | None):
        self.post = post.id
        self.dir = post.image_dir
        self.image = post.image_id
        self.type = enttype
        self.tags = post.tags.copy()
        self.thumbnail = post.get_thumbnail_data()

        if self.type is None:
            for ext in VIDEO_FORMATS + IMAGE_FORMATS:
                image_url = ImageURL(post.image_dir, post.image_id, ext)
                status: int | None = None
                # CDN tends to return 503 : Service Unavailable
                while status != 200 and status != 404:
                    status = scraper.head(image_url, body=False).status_code
                    if status != 200 and status != 404:
                        scraper.reset()
                        time.sleep(1)
                # HEAD could fail, try with GET
                while status != 200 and status != 404:
                    status = scraper.get(image_url, body=False).status_code
                    if status != 200 and status != 404:
                        scraper.reset()
                        time.sleep(1)
                if status == 200:
                    self.type = ext2enttype(ext)
                    break
            if self.type is None:
                self.type = BlockEntryType.ERROR

    def to_dict(self):
        return {
            "post": self.post,
            "dir": self.dir,
            "image": self.image,
            "type": self.type.name,
            "tags": self.tags,
        }

    def __lt__(self, other):
        return self.post < other.post


class BlockHeader:
    def __init__(self, start: int, size: int, entries: int):
        self.start = start
        self.size = size
        self.entries = entries

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "entries": self.entries,
        }


def dump(block: Block, fd: IO[bytes]):
    block.dump(fd)


def dumps(block: Block) -> bytes:
    return block.dumps()


def load_header(fd: IO[bytes]) -> BlockHeader:
    if fd.read(4) != b"34BK":
        raise Exception("Stream is not Block data")

    def read_dword(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(4), "little", signed=False)

    def read_word(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(2), "little", signed=False)

    entries_len = read_word(fd)
    block_size = read_word(fd)
    block_start = read_dword(fd)

    return BlockHeader(block_start, block_size, entries_len)


def load(fd: IO[bytes]) -> Block:
    if fd.read(4) != b"34BK":
        raise Exception("Stream is not Block data")

    def read_dword(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(4), "little", signed=False)

    def read_word(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(2), "little", signed=False)

    # Read header
    entries_len = read_word(fd)
    block_size = read_word(fd)
    block_start = read_dword(fd)
    tags_len = read_word(fd)  # tag count; consumed to advance the stream
    tags_data_len = read_word(fd)

    # Read tags
    tags_data = zlib.decompress(fd.read(tags_data_len))
    tags = list(map(bytes.decode, tags_data.rstrip(b"\0").split(b"\0")))

    # Slurp padding bytes
    fd.read(_pad_bytes(tags_data_len, 32))

    # Compute tag bits
    tag_bits = _compute_tag_bits(tags)

    # Load entries
    block = Block(block_start, block_size)
    for _ in range(entries_len):
        # Read entry header
        post_id = read_dword(fd)
        enttype = BlockEntryType(read_word(fd))
        image_id_len = read_word(fd)
        image_dir = read_dword(fd)
        thumbnail_size = read_dword(fd)

        # Read tags
        post_tags = []
        for rank in tag_bits:
            bits = read_dword(fd)
            for tag in rank:
                if rank[tag] & bits:
                    post_tags.append(tag)

        # Read image id
        image = fd.read(image_id_len)
        image_id = image.hex()

        # Read image thumbnail
        thumbnail = fd.read(thumbnail_size)

        # Slurp padding bytes
        fd.read(_pad_bytes(image_id_len + thumbnail_size, 32))

        block.add_post(Post(post_id, image_dir, image_id, post_tags, thumbnail), enttype)

    return block


def loads(data: bytes) -> Block:
    return load(BytesIO(data))
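For reference, dump() and load() above must agree on one byte layout, which is easy to lose among the write calls. The summary and helper below are an editorial sketch derived from those calls, not part of the repository; peek_header is a hypothetical name.

import struct

# Fixed part of the "34BK" header as written by Block.dump() above
# (little-endian, unsigned): 4s magic, H entry count, H block size, I block start.
# It is followed by H tag count, H tag-data size, the zlib-compressed
# NUL-separated tag names padded to a 4-byte boundary, then the entries.
def peek_header(data: bytes) -> tuple[int, int, int]:
    magic, entries, size, start = struct.unpack_from("<4sHHI", data, 0)
    if magic != b"34BK":
        raise ValueError("Stream is not Block data")
    return entries, size, start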
187
scraper/client/__main__.py
Normal file
@@ -0,0 +1,187 @@
from datetime import datetime
from enum import StrEnum
from concurrent.futures import ThreadPoolExecutor
import requests
import tomllib
import traceback
import gc

import py34
import scraper.block
import scraper.job


_spinner = 0
def spinner() -> str:
    global _spinner
    _spinner += 1
    return "/-\\|"[_spinner % 4]


class FG(StrEnum):
    _ = "\x1B[0m"
    r = "\x1B[31m"
    g = "\x1B[32m"
    y = "\x1B[33m"
    b = "\x1B[34m"
    p = "\x1B[35m"
    c = "\x1B[36m"


def web_get(url: str) -> bytes:
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    content = res.content
    res.close()
    return content


def web_post(url: str, data: bytes):
    res = requests.post(url, data)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    res.close()


# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

netthd = config["client"]["network-threads"]
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]

if server[-1] != "/":
    server += "/"


# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server: ", server)
print("check missing: ", check_missing)
print("patch failed: ", patch_failed)
print("=========================================================")


# Enter main loop
# while True:
# It leaks, idk how to fix it, restart program for now :p
if True:
    # Fetch a job
    print("Requesting job...", end="")
    job = scraper.job.loads(web_get(server + "job"))

    # Print job stats
    deadline_date = datetime.fromtimestamp(job.deadline)
    print(
        f"\rGot work from {FG.c}{job.start}{FG._}" +
        f" to {FG.c}{job.start + job.size - 1}{FG._}," +
        f" with deadline at {FG.p}{deadline_date}{FG._}"
    )

    # Prepare a block
    block = scraper.block.Block(job.start, job.size)

    # Prepare dictionary for expected posts
    expected: dict[int, bool] = {}
    for n in range(block.start, block.start + block.size):
        expected[n] = False

    # Start scraping
    pid = 0
    end = False
    failed: list[py34.Post] = []
    while not end:
        # Get a list
        # print(f"Downloading listing page {FG.b}{pid}{FG._}")
        lst = py34.List(["id:<" + str(job.start + job.size)], pid)

        # Check if we have reached the end
        for post in lst.posts:
            if post.id <= job.start:
                end = True

        # If so, remove posts that exceed our job quota
        if end:
            lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))

        # Add posts to block, fetching extensions in the process
        def _add_post(block: scraper.block.Block, post: py34.Post):
            try:
                expected[post.id] = True
                entry = scraper.block.BlockEntry(post, None)
                block.add_entry(entry)
                if entry.type == scraper.block.BlockEntryType.ERROR:
                    failed.append(post)
                print(
                    f"\rPage {FG.b}{pid}{FG._}" +
                    f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}" +
                    (f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
                    end="",
                )
            except Exception as ex:
                print(f"{FG.r}{ex}{FG._}")
                print(f"{FG.y}{traceback.format_exc()}{FG._}")
                raise ex
        with ThreadPoolExecutor(max_workers=netthd) as pool:
            for post in lst.posts:
                pool.submit(_add_post, block, post)

        # Increase pid for next iteration
        pid += len(lst.posts)

        # Clean leaking connection handles
        py34.scraper.scraper.reset()
        gc.collect()
    print()

    # Patch failed block entries
    if patch_failed:
        for post in failed:
            print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
            try:
                view = py34.View(post.id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
                print(f" {FG.r}Entry does not exist.{FG._}")

    # Find missing block entries
    missing: list[int] = []
    for post_id in expected:
        if expected[post_id]:
            continue
        missing.append(post_id)

    if check_missing:
        found: int = 0
        for n, post_id in enumerate(missing):
            print(
                f"\x1B[2K\rSearching for missing entries" +
                f" {FG.y}{n+1}/{len(missing)}{FG._}" +
                f", found {FG.g}{found}{FG._}" +
                f", missing {FG.r}{len(missing)-found}{FG._}",
                end="",
            )
            try:
                view = py34.View(post_id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                found += 1
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
        print()
    else:
        for post_id in missing:
            block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)

    # Sending block to server
    print(f"Block: {len(block.entries)}/{block.size}")
    print("Sending block to server...", end="")
    try:
        web_post(server + "block", scraper.block.dumps(block))
        print(f"{FG.g}Ok{FG._}")
    except Exception:
        print(f"{FG.r}Fail{FG._}")
5
scraper/config.py
Normal file
@@ -0,0 +1,5 @@
import tomllib


with open("config.toml", "rb") as file:
    config = tomllib.load(file)
38
scraper/job.py
Normal file
@@ -0,0 +1,38 @@
from .config import config
from .block import Block
from typing import IO
from datetime import datetime, timedelta
import json


class Job:
    def __init__(self, start, size, deadline=None):
        self.start = start
        self.size = size
        if deadline is None:
            self.deadline = (datetime.now() + timedelta(seconds=config["scraper"]["job-deadline"])).timestamp()
        else:
            self.deadline = deadline

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "deadline": self.deadline,
        }


def dump(job: Job, fd: IO[str]):
    json.dump(job.to_dict(), fd)


def dumps(job: Job) -> str:
    return json.dumps(job.to_dict())


def load(fd: IO[str | bytes]) -> Job:
    data = json.load(fd)
    return Job(data["start"], data["size"], data["deadline"])


def loads(data: str | bytes) -> Job:
    data = json.loads(data)
    return Job(data["start"], data["size"], data["deadline"])
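Since the scheduler and the client exchange jobs only as JSON, the wire shape is worth pinning down. A round-trip sketch of the format defined above; the values are illustrative, not from a real deployment.

import json
from datetime import datetime, timedelta

# A job as served by the scheduler: start/size delimit the post-id range,
# deadline is a POSIX timestamp after which the job may be reassigned.
payload = json.dumps({
    "start": 1051,
    "size": 1050,
    "deadline": (datetime.now() + timedelta(seconds=3600)).timestamp(),
})
job = json.loads(payload)
assert job["start"] + job["size"] - 1 == 2100  # last post id covered by the job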
134
scraper/server/__main__.py
Normal file
@@ -0,0 +1,134 @@
from scraper.config import config
from flask import Flask, Response, request, render_template, url_for, send_file
from pathlib import Path
import py34

from .block import BLOCK_SIZE, list_blocks, load_blocks, load_block, load_block_stats, save_block, loads as parse_block, enttype2ext
from .job import assign_job, working_on, any_job, jobs


# Create Flask application
app = Flask(__name__)


# Setup application configuration
for key in config["server"]["web"]:
    app.config[key.replace("-", "_").upper()] = config["server"]["web"][key]


@app.route("/")
def index():
    return render_template("index.j2")


@app.get("/job")
def get_job():
    blocks = list_blocks()
    blocks.sort(key=lambda b: b.start)
    for n, block in enumerate(blocks):
        next_block = blocks[n+1] if len(blocks) != n+1 else None

        # Try to fill up hollow blocks
        if not block.full and not working_on(block.start):
            return assign_job(block.start, block.size).to_dict()

        # Try to append a new block
        virt_start = block.start + block.size
        virt_size = BLOCK_SIZE - virt_start % BLOCK_SIZE + 1
        if next_block is None:

            # We are the last block
            while True:
                if not working_on(virt_start):
                    return assign_job(virt_start, virt_size).to_dict()
                virt_start += virt_size
                virt_size = BLOCK_SIZE
        else:

            # We are not the last block
            while virt_start < next_block.start:
                if not working_on(virt_start):
                    return assign_job(virt_start, virt_size).to_dict()
                virt_start += virt_size
                virt_size = BLOCK_SIZE

    assert len(blocks) == 0  # We should not be here if blocks exist

    if not any_job():
        return assign_job(1, BLOCK_SIZE).to_dict()
    return assign_job(
        max(jobs(), key=lambda j: j.start).start + BLOCK_SIZE,
        BLOCK_SIZE,
    ).to_dict()


@app.get("/blocks")
def get_block_ids():
    return list(map(lambda l: l.to_dict(), list_blocks()))


@app.get("/block_stats")
def get_block_stats():
    return list(map(lambda h: h.to_dict(), load_block_stats()))


@app.get("/rawblock/<post_id>")
def get_rawblock(post_id: int = None):
    assert post_id is not None
    return Response(
        mimetype="application/octet-stream",
        response=load_block(int(post_id)).dumps(),
    )


@app.get("/block/<post_id>")
def get_block(post_id: int = None):
    assert post_id is not None
    return load_block(int(post_id)).to_dict()


@app.post("/block")
def put_block():
    save_block(parse_block(request.data))
    return "ok"


@app.get("/thumbnail/<post_id>")
def get_thumbnail(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    return Response(
        response=load_block(post_id).entry(post_id).thumbnail,
        content_type="image/jpeg",
    )


@app.get("/sample/<post_id>")
def get_sample(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    entry = load_block(post_id).entry(post_id)
    return Response(
        status=307,
        headers={
            "Location": str(py34.url.SampleURL(entry.dir, entry.image))
        },
    )


@app.get("/image/<post_id>")
def get_image(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    entry = load_block(post_id).entry(post_id)
    if entry.type < 100:
        return Response(
            "Image not found",
            status=404,
        )

    path = Path.cwd() / Path(f"image/{entry.dir}/{entry.image}.{enttype2ext(entry.type)}")
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        return send_file(path)
    else:
        image = py34.scraper.scraper.get(py34.url.ImageURL(entry.dir, entry.image, enttype2ext(entry.type)))
        with open(path, "wb") as file:
            file.write(image)
        return send_file(path)


# Run application
if __name__ == "__main__":
    app.run()
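Taken together, the routes above define a small worker protocol. A minimal sketch with requests, assuming the server address from config.toml (http://localhost:8000/):

import requests

base = "http://localhost:8000/"

# 1. Ask the scheduler for a range of post ids to scrape
job = requests.get(base + "job").json()  # {"start": ..., "size": ..., "deadline": ...}

# 2. Inspect what the server already has for that range
listing = requests.get(base + "blocks").json()  # [{"start": ..., "size": ..., "full": ...}, ...]

# 3. Blocks travel in the binary "34BK" format; here we just echo one back
raw = requests.get(base + "rawblock/" + str(job["start"])).content
requests.post(base + "block", data=raw)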
123
scraper/server/block.py
Normal file
@@ -0,0 +1,123 @@
from scraper.block import *
from scraper.config import config
from pathlib import Path


BLOCK_SIZE = config["scraper"]["block-size"]


class BlockListing:
    def __init__(self, start: int, size: int, full: bool):
        self.start = int(start)
        self.size = int(size)
        self.full = bool(full)

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "full": self.full,
        }


def _block_dir() -> Path:
    return Path(config["server"]["block-dir"])


def _ensure_block_dir() -> Path:
    path = _block_dir()
    if path.exists():
        return path
    path.mkdir(parents=True, exist_ok=True)
    return path


def list_blocks() -> list[BlockListing]:
    path = _ensure_block_dir()
    parts = map(lambda p: p.name.split("-"), path.glob("*"))
    return sorted(list(map(lambda p: BlockListing(*tuple(map(int, p))), parts)), key=lambda bl: bl.start)


def load_blocks() -> list[Block]:
    path = _ensure_block_dir()
    blocks = []
    for block_path in path.glob("*"):
        with open(block_path, "rb") as file:
            blocks.append(load(file))
    return blocks


def load_block_stats() -> list[BlockHeader]:
    path = _ensure_block_dir()
    headers = []
    for block in path.glob("*"):
        with open(block, "rb") as file:
            headers.append(load_header(file))
    return headers


def load_block(entry_id: int) -> Block:
    path = _ensure_block_dir()
    low_block: int | None = None
    low_block_size = 0
    high_block: int | None = None
    high_block_size = 0

    # Try to find block file
    for file, start, size, full in map(lambda f: [f] + list(map(int, f.name.split("-"))), path.glob("*")):

        # Find closest non-matching lower block
        if entry_id >= start and (low_block is None or start >= low_block):
            low_block = start
            low_block_size = size

        # Find closest non-matching higher block
        if entry_id <= start and (high_block is None or start <= high_block):
            high_block = start
            high_block_size = size

        # Find matching block
        if entry_id >= start and entry_id < start + size:
            # Found it, return
            with open(file, "rb") as file:
                return load(file)
    # Failed...

    # Determine name of the block file
    block_size = BLOCK_SIZE
    virtual_block = entry_id // block_size * block_size + 1
    virtual_block_end = virtual_block + block_size  # Not inclusive

    # Clamp around lower block
    if low_block is not None:
        virtual_block = max(virtual_block, low_block + low_block_size)

    # Clamp around higher block
    if high_block is not None:
        virtual_block_end = min(virtual_block_end, high_block)

    # Create a new block
    return Block(virtual_block, virtual_block_end - virtual_block)


def save_block(new_block: Block):
    # Don't bother saving empty blocks
    if len(new_block.entries) == 0:
        return

    def write_block(b: Block):
        path = _ensure_block_dir()
        with open(path / f"{b.start}-{b.size}-{int(len(b.entries) == b.size)}", "wb") as file:
            dump(b, file)
        badfile = path / f"{b.start}-{b.size}-{int(len(b.entries) != b.size)}"
        if badfile.exists():
            badfile.unlink()

    block = load_block(new_block.entries[0].post)
    for entry in new_block.entries:
        if block.start + block.size <= entry.post:
            write_block(block)
            block = load_block(entry.post)
        block.add_entry(entry)
    write_block(block)
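The persistence scheme above leans entirely on file names: each block is stored under block-dir as "{start}-{size}-{full}", which list_blocks() and load_block() parse by splitting on "-". A small illustration (the path is hypothetical):

from pathlib import Path

# "1-1050-1": block starting at post id 1, covering 1050 ids, marked full
name = Path("blocks/1-1050-1")
start, size, full = map(int, name.name.split("-"))
assert (start, size, bool(full)) == (1, 1050, True)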
44
scraper/server/job.py
Normal file
@@ -0,0 +1,44 @@
from scraper.job import *
from typing import Iterator
from datetime import datetime


_jobs: list[Job] = []


def _clean_jobs():
    global _jobs
    _jobs = list(filter(lambda j: datetime.fromtimestamp(j.deadline) > datetime.now(), _jobs))


def working_on(block_start: int) -> bool:
    _clean_jobs()

    for job in _jobs:
        if job.start <= block_start and job.start + job.size > block_start:
            return True
    return False


def assign_job(block_start: int, block_size: int) -> Job:
    if working_on(block_start):
        raise Exception("Job already assigned")
    job = Job(block_start, block_size)
    _jobs.append(job)
    return job


def jobs() -> Iterator[Job]:
    _clean_jobs()
    for job in _jobs:
        yield job


def any_job() -> bool:
    _clean_jobs()
    return bool(len(_jobs))
2
scraper/server/static/d3.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
BIN
scraper/server/static/sus.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 18 KiB
11
scraper/server/templates/index.j2
Normal file
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>Rule34 Scraper</title>
<script src="{{url_for("static", filename="d3.min.js")}}"></script>
</head>
<body>
<h1>Hello, World!</h1>
<img src="{{url_for("static", filename="sus.png")}}">
</body>
</html>