Compare commits
26 Commits
master...3ead328a0f
| SHA1 |
| --- |
| 3ead328a0f |
| f55f118271 |
| 1dc17d7670 |
| 6d6e13c231 |
| a4e3ced8ab |
| 0b3293eaa8 |
| 5473f92470 |
| ee4ba5d5f2 |
| 217595fcdd |
| be6740a24c |
| 71efaf7c25 |
| b81684de19 |
| f9fe6af292 |
| 6d1b1628f1 |
| 0143cc3999 |
| a39bfcd0b1 |
| 826c5c3473 |
| 265ec8d58e |
| 241bc90e82 |
| bdf77b2920 |
| 5f4358a3d1 |
| 8e3a7b105a |
| d0dec584a8 |
| fb06339cc7 |
| 48d17fcf7b |
| c1b8be46aa |
17
config.toml
Normal file
@@ -0,0 +1,17 @@
[scraper]
block-size = 1050 # Max 65536
job-deadline = 3600

[client]
server = "http://localhost:8000/"
network-threads = 12
check-missing = false
patch-failed = true

[server]
block-dir = "blocks"

[server.web]
debug = true
server-name = "localhost:8000"
application-root = "/"
requirements.txt

@@ -1,10 +1,16 @@
 beautifulsoup4==4.13.4
+blinker==1.9.0
 bs4==0.0.2
 certifi==2025.7.14
 charset-normalizer==3.4.2
+click==8.2.1
 cloudscraper==1.2.71
 decorator==5.2.1
+Flask==3.1.1
 idna==3.10
+itsdangerous==2.2.0
+Jinja2==3.1.6
+MarkupSafe==3.0.2
 pillow==11.3.0
 py==1.11.0
 pyparsing==3.2.3
@@ -14,3 +20,4 @@ retry==0.9.2
 soupsieve==2.7
 typing_extensions==4.14.1
 urllib3==2.5.0
+Werkzeug==3.1.3
331
scraper/block.py
Normal file
@@ -0,0 +1,331 @@
from py34 import Post
from py34.url import ImageURL, IMAGE_FORMATS, VIDEO_FORMATS
from py34.scraper import scraper
from typing import IO
from io import BytesIO
from enum import IntEnum
from bisect import insort
import math
import zlib
import time


class BlockEntryType(IntEnum):
    # Status
    ERROR = 0
    MISSING = 1
    MAYBE_MISSING = 2  # Did not appear on the listing, but the post was never checked

    # Image
    JPG = 100
    JPEG = 101
    PNG = 102

    # Animated Image
    GIF = 200

    # Audio

    # Video
    MP4 = 400
    WEBM = 401


def ext2enttype(ext: str) -> BlockEntryType:
    return BlockEntryType[ext.upper()]


def enttype2ext(enttype: BlockEntryType) -> str:
    if enttype.value < 50:
        raise Exception("Entry does not refer to a file")
    return enttype.name.lower()


def _compute_tag_bits(tags: list[str]) -> list[dict[str, int]]:
    # Assign each tag a bit in a 32-bit word; every 32 tags start a new "rank"
    tag_bits: list[dict[str, int]] = [{} for _ in range(math.ceil(len(tags) / 32))]
    for n in range(len(tags)):
        bit = n % 32
        rank = n // 32
        tag_bits[rank][tags[n]] = 1 << bit
    return tag_bits


def _pad_bytes(size: int, pad_addr: int) -> int:
    # Number of padding bytes needed to align `size` to a pad_addr-bit boundary
    addr_bytes = pad_addr // 8
    overflow = size % addr_bytes
    if overflow == 0:
        return 0
    else:
        return addr_bytes - overflow


class Block:
    def __init__(self, start: int, size: int):
        self.start: int = start
        self.size: int = size
        self.entries: list[BlockEntry] = []

    def add_post(self, post: Post, enttype: BlockEntryType | None = None):
        self.add_entry(BlockEntry(post, enttype))

    def add_entry(self, entry: "BlockEntry"):
        # Check if entry belongs to this block
        if not self.belongs(entry.post):
            raise Exception("Given post does not belong to this block")
        # Remove entry with matching post-id, if it exists
        self.entries = list(filter(lambda e: e.post != entry.post, self.entries))
        # Insert the entry, keeping the list sorted
        insort(self.entries, entry)

    def dump(self, fd: IO[bytes]):
        # Sort all entries
        self.entries.sort()

        # Combine all tags
        tags: set[str] = set()
        for entry in self.entries:
            tags |= set(entry.tags)
        tags: list[str] = list(tags)
        tags.sort()

        # Compute tag bits
        tag_bits = _compute_tag_bits(tags)

        # Compute length of tag strings
        tags_len_b = 0
        for tag in tags:
            tags_len_b += len(tag) + 1

        # Compress tag data
        tag_data = zlib.compress((b"\0".join(map(str.encode, tags))) + b"\0", level=9)

        # Magic 4B
        fd.write(b"34BK")

        # Entry amount 2B
        fd.write(len(self.entries).to_bytes(2, "little", signed=False))

        # Size 2B
        fd.write(self.size.to_bytes(2, "little", signed=False))

        # Start 4B
        fd.write(self.start.to_bytes(4, "little", signed=False))

        # Amount of different tags 2B
        fd.write(len(tags).to_bytes(2, "little", signed=False))

        # Size of tag data 2B
        fd.write(len(tag_data).to_bytes(2, "little", signed=False))

        # Write all the tags
        fd.write(tag_data)

        # Pad to the nearest 32-bit address
        for _ in range(_pad_bytes(len(tag_data), 32)):
            fd.write(b"\0")

        # Dump entries
        for entry in self.entries:
            image = bytearray.fromhex(entry.image)

            # Post ID 4B
            fd.write(entry.post.to_bytes(4, "little", signed=False))

            # Entry type enum 2B
            fd.write(entry.type.to_bytes(2, "little", signed=False))

            # Image id length 2B
            fd.write(len(image).to_bytes(2, "little", signed=False))

            # Image dir 4B
            fd.write(entry.dir.to_bytes(4, "little", signed=False))

            # Thumbnail size 4B
            fd.write(len(entry.thumbnail).to_bytes(4, "little", signed=False))

            # Tag bits 4B*ranks
            for rank in tag_bits:
                word = 0
                for tag in entry.tags:
                    if tag in rank:
                        word |= rank[tag]
                fd.write(word.to_bytes(4, "little", signed=False))

            # Image ID
            fd.write(image)

            # Thumbnail
            fd.write(entry.thumbnail)

            # Pad to the nearest 32-bit address
            for _ in range(_pad_bytes(len(image) + len(entry.thumbnail), 32)):
                fd.write(b"\0")

    def dumps(self) -> bytes:
        io = BytesIO()
        self.dump(io)
        return io.getvalue()

    def entry(self, post_id: int) -> "BlockEntry":
        for entry in self.entries:
            if entry.post == post_id:
                return entry
        raise IndexError("Entry not found")

    def belongs(self, post_id: int) -> bool:
        return post_id >= self.start and post_id < self.start + self.size

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "entries": list(map(BlockEntry.to_dict, self.entries)),
        }


class BlockEntry:
    def __init__(self, post: Post, enttype: BlockEntryType | None):
        self.post = post.id
        self.dir = post.image_dir
        self.image = post.image_id
        self.type = enttype
        self.tags = post.tags.copy()
        self.thumbnail = post.get_thumbnail_data()

        if self.type is None:
            for ext in VIDEO_FORMATS + IMAGE_FORMATS:
                image_url = ImageURL(post.image_dir, post.image_id, ext)
                status: int | None = None
                # CDN tends to return 503 : Service Unavailable
                while status != 200 and status != 404:
                    status = scraper.head(image_url, body=False).status_code
                    if status != 200 and status != 404:
                        scraper.reset()
                        time.sleep(1)
                # HEAD could fail, try with GET
                while status != 200 and status != 404:
                    status = scraper.get(image_url, body=False).status_code
                    if status != 200 and status != 404:
                        scraper.reset()
                        time.sleep(1)
                if status == 200:
                    self.type = ext2enttype(ext)
                    break
            if self.type is None:
                self.type = BlockEntryType.ERROR

    def to_dict(self):
        return {
            "post": self.post,
            "dir": self.dir,
            "image": self.image,
            "type": self.type.name,
            "tags": self.tags,
        }

    def __lt__(self, other):
        return self.post < other.post


class BlockHeader:
    def __init__(self, start: int, size: int, entries: int):
        self.start = start
        self.size = size
        self.entries = entries

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "entries": self.entries,
        }


def dump(block: Block, fd: IO[bytes]):
    block.dump(fd)


def dumps(block: Block) -> bytes:
    return block.dumps()


def load_header(fd: IO[bytes]) -> BlockHeader:
    if fd.read(4) != b"34BK":
        raise Exception("Stream is not Block data")

    def read_dword(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(4), "little", signed=False)

    def read_word(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(2), "little", signed=False)

    entries_len = read_word(fd)
    block_size = read_word(fd)
    block_start = read_dword(fd)

    return BlockHeader(block_start, block_size, entries_len)


def load(fd: IO[bytes]) -> Block:
    if fd.read(4) != b"34BK":
        raise Exception("Stream is not Block data")

    def read_dword(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(4), "little", signed=False)

    def read_word(fd: IO[bytes]) -> int:
        return int.from_bytes(fd.read(2), "little", signed=False)

    # Read header
    entries_len = read_word(fd)
    block_size = read_word(fd)
    block_start = read_dword(fd)
    tags_len = read_word(fd)  # tag count; consumed to advance the stream
    tags_data_len = read_word(fd)

    # Read tags
    tags_data = zlib.decompress(fd.read(tags_data_len))
    tags = list(map(bytes.decode, tags_data.rstrip(b"\0").split(b"\0")))

    # Slurp padding bytes
    fd.read(_pad_bytes(tags_data_len, 32))

    # Compute tag bits
    tag_bits = _compute_tag_bits(tags)

    # Load entries
    block = Block(block_start, block_size)
    for _ in range(entries_len):
        # Read entry header
        post_id = read_dword(fd)
        enttype = BlockEntryType(read_word(fd))
        image_id_len = read_word(fd)
        image_dir = read_dword(fd)
        thumbnail_size = read_dword(fd)

        # Read tags
        post_tags = []
        for rank in tag_bits:
            bits = read_dword(fd)
            for tag in rank:
                if rank[tag] & bits:
                    post_tags.append(tag)

        # Read image id
        image = fd.read(image_id_len)
        image_id = image.hex()

        # Read image thumbnail
        thumbnail = fd.read(thumbnail_size)

        # Slurp padding bytes
        fd.read(_pad_bytes(image_id_len + thumbnail_size, 32))

        block.add_post(Post(post_id, image_dir, image_id, post_tags, thumbnail), enttype)

    return block


def loads(data: bytes) -> Block:
    return load(BytesIO(data))
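For reference, dump() and load() above must agree on one byte layout, which is easy to lose among the write calls. The summary and helper below are an editorial sketch derived from those calls, not part of the repository; peek_header is a hypothetical name.

import struct

# Fixed part of the "34BK" header as written by Block.dump() above
# (little-endian, unsigned): 4s magic, H entry count, H block size, I block start.
# It is followed by H tag count, H tag-data size, the zlib-compressed
# NUL-separated tag names padded to a 4-byte boundary, then the entries.
def peek_header(data: bytes) -> tuple[int, int, int]:
    magic, entries, size, start = struct.unpack_from("<4sHHI", data, 0)
    if magic != b"34BK":
        raise ValueError("Stream is not Block data")
    return entries, size, start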
187
scraper/client/__main__.py
Normal file
@@ -0,0 +1,187 @@
from datetime import datetime
from enum import StrEnum
from concurrent.futures import ThreadPoolExecutor
import requests
import tomllib
import traceback
import gc

import py34
import scraper.block
import scraper.job


_spinner = 0
def spinner() -> str:
    global _spinner
    _spinner += 1
    return "/-\\|"[_spinner % 4]


class FG(StrEnum):
    _ = "\x1B[0m"
    r = "\x1B[31m"
    g = "\x1B[32m"
    y = "\x1B[33m"
    b = "\x1B[34m"
    p = "\x1B[35m"
    c = "\x1B[36m"


def web_get(url: str) -> bytes:
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    content = res.content
    res.close()
    return content


def web_post(url: str, data: bytes):
    res = requests.post(url, data)
    if res.status_code != 200:
        raise Exception("Failed to request " + url)
    res.close()


# Load config
print("Loading config.toml")
with open("config.toml", "rb") as file:
    config = tomllib.load(file)

netthd = config["client"]["network-threads"]
server = config["client"]["server"]
check_missing = config["client"]["check-missing"]
patch_failed = config["client"]["patch-failed"]

if server[-1] != "/":
    server += "/"


# Print config stats
print("=========================================================")
print("network threads:", netthd)
print("server: ", server)
print("check missing: ", check_missing)
print("patch failed: ", patch_failed)
print("=========================================================")


# Enter main loop
# while True:
# It leaks, idk how to fix it, restart program for now :p
if True:
    # Fetch a job
    print("Requesting job...", end="")
    job = scraper.job.loads(web_get(server + "job"))

    # Print job stats
    deadline_date = datetime.fromtimestamp(job.deadline)
    print(
        f"\rGot work from {FG.c}{job.start}{FG._}" +
        f" to {FG.c}{job.start + job.size - 1}{FG._}," +
        f" with deadline at {FG.p}{deadline_date}{FG._}"
    )

    # Prepare a block
    block = scraper.block.Block(job.start, job.size)

    # Prepare dictionary for expected posts
    expected: dict[int, bool] = {}
    for n in range(block.start, block.start + block.size):
        expected[n] = False

    # Start scraping
    pid = 0
    end = False
    failed: list[py34.Post] = []
    while not end:
        # Get a list
        # print(f"Downloading listing page {FG.b}{pid}{FG._}")
        lst = py34.List(["id:<" + str(job.start + job.size)], pid)

        # Check if we have reached the end
        for post in lst.posts:
            if post.id <= job.start:
                end = True

        # If so, remove posts that exceed our job quota
        if end:
            lst.posts = list(filter(lambda p: p.id >= job.start, lst.posts))

        # Add posts to block, fetching extensions in the process
        def _add_post(block: scraper.block.Block, post: py34.Post):
            try:
                expected[post.id] = True
                entry = scraper.block.BlockEntry(post, None)
                block.add_entry(entry)
                if entry.type == scraper.block.BlockEntryType.ERROR:
                    failed.append(post)
                print(
                    f"\rPage {FG.b}{pid}{FG._}" +
                    f" Constructing block {FG.y}{len(block.entries)}/{block.size}{FG._}" +
                    (f", failed {FG.r}{len(failed)}{FG._}" if len(failed) > 0 else ""),
                    end="",
                )
            except Exception as ex:
                print(f"{FG.r}{ex}{FG._}")
                print(f"{FG.y}{traceback.format_exc()}{FG._}")
                raise ex
        with ThreadPoolExecutor(max_workers=netthd) as pool:
            for post in lst.posts:
                pool.submit(_add_post, block, post)

        # Increase pid for next iteration
        pid += len(lst.posts)

        # Clean leaking connection handles
        py34.scraper.scraper.reset()
        gc.collect()
    print()

    # Patch failed block entries
    if patch_failed:
        for post in failed:
            print(f"Investigating {py34.url.ViewURL(post.id)}...", end="")
            try:
                view = py34.View(post.id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                print(f" {FG.g}Found {view.image_url.format.upper()}!{FG._}")
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post.id), scraper.block.BlockEntryType.MISSING)
                print(f" {FG.r}Entry does not exist.{FG._}")

    # Find missing block entries
    missing: list[int] = []
    for post_id in expected:
        if expected[post_id]:
            continue
        missing.append(post_id)

    if check_missing:
        found: int = 0
        for n, post_id in enumerate(missing):
            print(
                f"\x1B[2K\rSearching for missing entries" +
                f" {FG.y}{n+1}/{len(missing)}{FG._}" +
                f", found {FG.g}{found}{FG._}" +
                f", missing {FG.r}{len(missing)-found}{FG._}",
                end="",
            )
            try:
                view = py34.View(post_id)
                block.add_post(view.to_post(), scraper.block.ext2enttype(view.image_url.format))
                found += 1
            except py34.view.ViewMissingException:
                block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MISSING)
        print()
    else:
        for post_id in missing:
            block.add_post(py34.Post.empty(post_id), scraper.block.BlockEntryType.MAYBE_MISSING)

    # Sending block to server
    print(f"Block: {len(block.entries)}/{block.size}")
    print("Sending block to server...", end="")
    try:
        web_post(server + "block", scraper.block.dumps(block))
        print(f"{FG.g}Ok{FG._}")
    except Exception:
        print(f"{FG.r}Fail{FG._}")
5
scraper/config.py
Normal file
@@ -0,0 +1,5 @@
import tomllib


with open("config.toml", "rb") as file:
    config = tomllib.load(file)
38
scraper/job.py
Normal file
@@ -0,0 +1,38 @@
from .config import config
from .block import Block
from typing import IO
from datetime import datetime, timedelta
import json


class Job:
    def __init__(self, start, size, deadline=None):
        self.start = start
        self.size = size
        if deadline is None:
            self.deadline = (datetime.now() + timedelta(seconds=config["scraper"]["job-deadline"])).timestamp()
        else:
            self.deadline = deadline

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "deadline": self.deadline,
        }


def dump(job: Job, fd: IO[str]):
    json.dump(job.to_dict(), fd)


def dumps(job: Job) -> str:
    return json.dumps(job.to_dict())


def load(fd: IO[str | bytes]) -> Job:
    data = json.load(fd)
    return Job(data["start"], data["size"], data["deadline"])


def loads(data: str | bytes) -> Job:
    data = json.loads(data)
    return Job(data["start"], data["size"], data["deadline"])
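Since the scheduler and the client exchange jobs only as JSON, the wire shape is worth pinning down. A round-trip sketch of the format defined above; the values are illustrative, not from a real deployment.

import json
from datetime import datetime, timedelta

# A job as served by the scheduler: start/size delimit the post-id range,
# deadline is a POSIX timestamp after which the job may be reassigned.
payload = json.dumps({
    "start": 1051,
    "size": 1050,
    "deadline": (datetime.now() + timedelta(seconds=3600)).timestamp(),
})
job = json.loads(payload)
assert job["start"] + job["size"] - 1 == 2100  # last post id covered by the job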
134
scraper/server/__main__.py
Normal file
@@ -0,0 +1,134 @@
from scraper.config import config
from flask import Flask, Response, request, render_template, url_for, send_file
from pathlib import Path
import py34

from .block import BLOCK_SIZE, list_blocks, load_blocks, load_block, load_block_stats, save_block, loads as parse_block, enttype2ext
from .job import assign_job, working_on, any_job, jobs


# Create Flask application
app = Flask(__name__)


# Setup application configuration
for key in config["server"]["web"]:
    app.config[key.replace("-", "_").upper()] = config["server"]["web"][key]


@app.route("/")
def index():
    return render_template("index.j2")


@app.get("/job")
def get_job():
    blocks = list_blocks()
    blocks.sort(key=lambda b: b.start)
    for n, block in enumerate(blocks):
        next_block = blocks[n+1] if len(blocks) != n+1 else None

        # Try to fill up hollow blocks
        if not block.full and not working_on(block.start):
            return assign_job(block.start, block.size).to_dict()

        # Try to append a new block
        virt_start = block.start + block.size
        virt_size = BLOCK_SIZE - virt_start % BLOCK_SIZE + 1
        if next_block is None:

            # We are the last block
            while True:
                if not working_on(virt_start):
                    return assign_job(virt_start, virt_size).to_dict()
                virt_start += virt_size
                virt_size = BLOCK_SIZE
        else:

            # We are not the last block
            while virt_start < next_block.start:
                if not working_on(virt_start):
                    return assign_job(virt_start, virt_size).to_dict()
                virt_start += virt_size
                virt_size = BLOCK_SIZE

    assert len(blocks) == 0  # We should not be here if blocks exist

    if not any_job():
        return assign_job(1, BLOCK_SIZE).to_dict()
    return assign_job(
        max(jobs(), key=lambda j: j.start).start + BLOCK_SIZE,
        BLOCK_SIZE,
    ).to_dict()


@app.get("/blocks")
def get_block_ids():
    return list(map(lambda l: l.to_dict(), list_blocks()))


@app.get("/block_stats")
def get_block_stats():
    return list(map(lambda h: h.to_dict(), load_block_stats()))


@app.get("/rawblock/<post_id>")
def get_rawblock(post_id: int = None):
    assert post_id is not None
    return Response(
        mimetype="application/octet-stream",
        response=load_block(int(post_id)).dumps(),
    )


@app.get("/block/<post_id>")
def get_block(post_id: int = None):
    assert post_id is not None
    return load_block(int(post_id)).to_dict()


@app.post("/block")
def put_block():
    save_block(parse_block(request.data))
    return "ok"


@app.get("/thumbnail/<post_id>")
def get_thumbnail(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    return Response(
        response=load_block(post_id).entry(post_id).thumbnail,
        content_type="image/jpeg",
    )


@app.get("/sample/<post_id>")
def get_sample(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    entry = load_block(post_id).entry(post_id)
    return Response(
        status=307,
        headers={
            "Location": str(py34.url.SampleURL(entry.dir, entry.image))
        },
    )


@app.get("/image/<post_id>")
def get_image(post_id: int = None):
    assert post_id is not None
    post_id = int(post_id)
    entry = load_block(post_id).entry(post_id)
    if entry.type < 100:
        return Response(
            "Image not found",
            status=404,
        )

    path = Path.cwd() / Path(f"image/{entry.dir}/{entry.image}.{enttype2ext(entry.type)}")
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        return send_file(path)
    else:
        image = py34.scraper.scraper.get(py34.url.ImageURL(entry.dir, entry.image, enttype2ext(entry.type)))
        with open(path, "wb") as file:
            file.write(image)
        return send_file(path)


# Run application
if __name__ == "__main__":
    app.run()
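Taken together, the routes above define a small worker protocol. A minimal sketch with requests, assuming the server address from config.toml (http://localhost:8000/):

import requests

base = "http://localhost:8000/"

# 1. Ask the scheduler for a range of post ids to scrape
job = requests.get(base + "job").json()  # {"start": ..., "size": ..., "deadline": ...}

# 2. Inspect what the server already has for that range
listing = requests.get(base + "blocks").json()  # [{"start": ..., "size": ..., "full": ...}, ...]

# 3. Blocks travel in the binary "34BK" format; here we just echo one back
raw = requests.get(base + "rawblock/" + str(job["start"])).content
requests.post(base + "block", data=raw)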
123
scraper/server/block.py
Normal file
@@ -0,0 +1,123 @@
from scraper.block import *
from scraper.config import config
from pathlib import Path


BLOCK_SIZE = config["scraper"]["block-size"]


class BlockListing:
    def __init__(self, start: int, size: int, full: bool):
        self.start = int(start)
        self.size = int(size)
        self.full = bool(full)

    def to_dict(self):
        return {
            "start": self.start,
            "size": self.size,
            "full": self.full,
        }


def _block_dir() -> Path:
    return Path(config["server"]["block-dir"])


def _ensure_block_dir() -> Path:
    path = _block_dir()
    if path.exists():
        return path
    path.mkdir(parents=True, exist_ok=True)
    return path


def list_blocks() -> list[BlockListing]:
    path = _ensure_block_dir()
    parts = map(lambda p: p.name.split("-"), path.glob("*"))
    return sorted(list(map(lambda p: BlockListing(*tuple(map(int, p))), parts)), key=lambda bl: bl.start)


def load_blocks() -> list[Block]:
    path = _ensure_block_dir()
    blocks = []
    for block_path in path.glob("*"):
        with open(block_path, "rb") as file:
            blocks.append(load(file))
    return blocks


def load_block_stats() -> list[BlockHeader]:
    path = _ensure_block_dir()
    headers = []
    for block in path.glob("*"):
        with open(block, "rb") as file:
            headers.append(load_header(file))
    return headers


def load_block(entry_id: int) -> Block:
    path = _ensure_block_dir()
    low_block: int | None = None
    low_block_size = 0
    high_block: int | None = None
    high_block_size = 0

    # Try to find block file
    for file, start, size, full in map(lambda f: [f] + list(map(int, f.name.split("-"))), path.glob("*")):

        # Find closest non-matching lower block
        if entry_id >= start and (low_block is None or start >= low_block):
            low_block = start
            low_block_size = size

        # Find closest non-matching higher block
        if entry_id <= start and (high_block is None or start <= high_block):
            high_block = start
            high_block_size = size

        # Find matching block
        if entry_id >= start and entry_id < start + size:
            # Found it, return
            with open(file, "rb") as file:
                return load(file)
    # Failed...

    # Determine name of the block file
    block_size = BLOCK_SIZE
    virtual_block = entry_id // block_size * block_size + 1
    virtual_block_end = virtual_block + block_size  # Not inclusive

    # Clamp around lower block
    if low_block is not None:
        virtual_block = max(virtual_block, low_block + low_block_size)

    # Clamp around higher block
    if high_block is not None:
        virtual_block_end = min(virtual_block_end, high_block)

    # Create a new block
    return Block(virtual_block, virtual_block_end - virtual_block)


def save_block(new_block: Block):
    # Don't bother saving empty blocks
    if len(new_block.entries) == 0:
        return

    def write_block(b: Block):
        path = _ensure_block_dir()
        with open(path / f"{b.start}-{b.size}-{int(len(b.entries) == b.size)}", "wb") as file:
            dump(b, file)
        badfile = path / f"{b.start}-{b.size}-{int(len(b.entries) != b.size)}"
        if badfile.exists():
            badfile.unlink()

    block = load_block(new_block.entries[0].post)
    for entry in new_block.entries:
        if block.start + block.size <= entry.post:
            write_block(block)
            block = load_block(entry.post)
        block.add_entry(entry)
    write_block(block)
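The persistence scheme above leans entirely on file names: each block is stored under block-dir as "{start}-{size}-{full}", which list_blocks() and load_block() parse by splitting on "-". A small illustration (the path is hypothetical):

from pathlib import Path

# "1-1050-1": block starting at post id 1, covering 1050 ids, marked full
name = Path("blocks/1-1050-1")
start, size, full = map(int, name.name.split("-"))
assert (start, size, bool(full)) == (1, 1050, True)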
44
scraper/server/job.py
Normal file
@@ -0,0 +1,44 @@
from scraper.job import *
from typing import Iterator
from datetime import datetime


_jobs: list[Job] = []


def _clean_jobs():
    global _jobs
    _jobs = list(filter(lambda j: datetime.fromtimestamp(j.deadline) > datetime.now(), _jobs))


def working_on(block_start: int) -> bool:
    _clean_jobs()

    for job in _jobs:
        if job.start <= block_start and job.start + job.size > block_start:
            return True
    return False


def assign_job(block_start: int, block_size: int) -> Job:
    if working_on(block_start):
        raise Exception("Job already assigned")
    job = Job(block_start, block_size)
    _jobs.append(job)
    return job


def jobs() -> Iterator[Job]:
    _clean_jobs()
    for job in _jobs:
        yield job


def any_job() -> bool:
    _clean_jobs()
    return bool(len(_jobs))
2
scraper/server/static/d3.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
BIN
scraper/server/static/sus.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 18 KiB
11
scraper/server/templates/index.j2
Normal file
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>Rule34 Scraper</title>
<script src="{{url_for("static", filename="d3.min.js")}}"></script>
</head>
<body>
<h1>Hello, World!</h1>
<img src="{{url_for("static", filename="sus.png")}}">
</body>
</html>