import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import pdfplumber


# Station codes in the timeline are 2-5 uppercase letters (e.g. KLO, DAV, SELF).
UPPER_CODE_RE = re.compile(r"^[A-Z]{2,5}$")
# Distance annotations in the FZG column, e.g. "123 km".
KM_RE = re.compile(r"(?P<km>\d+)\s*km\b")

# Fill colors of timeline bands, normalized to 3 decimals by norm_color().
GREEN_COLORS = {(0.0, 0.624, 0.0)}
RED_BAND_COLORS = {(0.875, 0.0, 0.0), (0.922, 0.361, 0.247)}
# Pure red is treated as a border color, not a band fill.
IGNORE_BORDER_RED = {(1.0, 0.0, 0.0)}
# Timeline annotation lines matching this (repair notes) are filtered from debug output.
IGNORED_TIMELINE_ANNOTATION_RE = re.compile(r"\bReparatur\b", re.IGNORECASE)


@dataclass
class Metrics:
    """Page-geometry calibration derived from the header row (see extract_metrics)."""

    fzg_x0: float  # left edge of the "FZG" header word / vehicle column
    von_x0: float  # left edge of the "Von" column
    timeline_x0: float  # x of the first hour label (timeline origin)
    timeline_last00_x0: float  # x of the last "00" hour label (timeline end)
    nach_x0: float  # left edge of the "Nach" column
    hour_xs: List[float]  # x positions of all two-digit hour labels
    slot_w: float  # width of one 10-minute slot (timeline span / 144)
    header_bottom: float  # bottom y of the header row
    hour_w: float  # width of one hour column (timeline span / 24)
    train_text_offset_x: float  # calibrated x shift of horizontal train-number text
    vertical_train_x_offset: float  # calibrated x shift of vertical train labels


def norm_color(color: Any) -> Optional[Tuple[float, ...]]:
    """Normalize a PDF color spec to a rounded, comparable RGB-like tuple.

    pdfplumber/pdfminer may report ``non_stroking_color`` as a tuple, a list
    (depending on version), or a single gray-level number.  Components are
    rounded to 3 decimals so results can be compared against the module's
    color constants.

    Args:
        color: Raw color value from a pdfplumber object dict (or None).

    Returns:
        A tuple of rounded floats — gray levels are expanded to an RGB
        triple — or None for missing/unsupported specs.
    """
    if color is None:
        return None
    # Accept lists as well as tuples: pdfminer versions differ in which they emit.
    if isinstance(color, (tuple, list)):
        return tuple(round(float(v), 3) for v in color)
    if isinstance(color, (int, float)):
        v = round(float(color), 3)
        return (v, v, v)
    return None


def group_words_by_line(words: List[Dict[str, Any]], tol: float = 1.6) -> List[List[Dict[str, Any]]]:
    """Cluster words into visual text lines by vertical proximity.

    Words are processed top-to-bottom; a word joins the first existing line
    whose running-average ``top`` is within ``tol``, otherwise it opens a new
    line.  Each line is kept sorted left-to-right by ``x0``.
    """
    grouped: List[List[Dict[str, Any]]] = []
    ordered = sorted(words, key=lambda item: (item["top"], item["x0"]))
    for word in ordered:
        target = None
        for candidate in grouped:
            mean_top = sum(entry["top"] for entry in candidate) / len(candidate)
            if abs(word["top"] - mean_top) <= tol:
                target = candidate
                break
        if target is None:
            grouped.append([word])
        else:
            target.append(word)
            target.sort(key=lambda item: item["x0"])
    return grouped


def line_text(words: Iterable[Dict[str, Any]]) -> str:
    """Join word texts left-to-right into a single stripped string."""
    ordered = sorted(words, key=lambda item: item["x0"])
    return " ".join(item["text"] for item in ordered).strip()


def line_top(words: List[Dict[str, Any]]) -> float:
    """Return the topmost y coordinate among the given words."""
    tops = (item["top"] for item in words)
    return min(tops)


def line_bottom(words: List[Dict[str, Any]]) -> float:
    """Return the bottommost y coordinate among the given words."""
    bottoms = (item["bottom"] for item in words)
    return max(bottoms)


def extract_metrics(page) -> Metrics:
    """Calibrate column/timeline geometry from a page's header row.

    Args:
        page: A pdfplumber page object.

    Returns:
        Metrics with the x anchors of the FZG/Von/Nach columns and the
        timeline, plus derived slot/hour widths and calibration offsets.

    Raises:
        ValueError: If a required header word or the hour row is missing.
    """
    words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)
    # The header row's vertical position can drift between daily PDFs, so instead
    # of a fixed y band we search for the line containing FZG/Von/Nach together.
    header_lines = group_words_by_line([w for w in words if w["top"] < 80], tol=2.0)
    header_words: List[Dict[str, Any]] = []
    for line in header_lines:
        texts = {w["text"] for w in line}
        if {"FZG", "Von", "Nach"}.issubset(texts):
            header_words = line
            break
    if not header_words:
        # Fallback: legacy fixed-band behavior (if the search above finds nothing).
        header_words = [w for w in words if 40 <= w["top"] <= 65]

    def find_word(txt: str, nth: int = 0) -> Dict[str, Any]:
        matches = [w for w in header_words if w["text"] == txt]
        if len(matches) <= nth:
            raise ValueError(f"Header word not found: {txt}[{nth}]")
        return sorted(matches, key=lambda x: x["x0"])[nth]

    fzg = find_word("FZG")
    von = find_word("Von")
    nach = find_word("Nach")
    hour_words = [w for w in header_words if re.fullmatch(r"\d{2}", w["text"])]
    hour_words = sorted(hour_words, key=lambda x: x["x0"])
    # Fail loudly instead of raising a cryptic IndexError when the hour row is absent.
    if not hour_words:
        raise ValueError("Header hour row not found (no two-digit hour labels)")
    # Assumes the leftmost hour label is the leading "00" — TODO confirm on all layouts.
    first_00 = hour_words[0]
    midnight_words = [w for w in hour_words if w["text"] == "00"]
    if not midnight_words:
        raise ValueError("Header hour row has no '00' label to anchor the timeline end")
    last_00 = midnight_words[-1]
    # 24 hours between the first and last "00" label -> 144 ten-minute slots.
    slot_w = (last_00["x0"] - first_00["x0"]) / 144.0
    hour_w = (last_00["x0"] - first_00["x0"]) / 24.0
    # Empirical calibration: the left edge of horizontal train-number text sits
    # roughly this far to the right of the true time position.
    train_text_offset_x = hour_w * 0.299  # ~8.23 px on this layout
    # Plain (non-dense) vertical train labels' x0 typically falls left of the true time column.
    vertical_train_x_offset = slot_w * 2.25
    return Metrics(
        fzg_x0=fzg["x0"],
        von_x0=von["x0"],
        timeline_x0=first_00["x0"],
        timeline_last00_x0=last_00["x0"],
        nach_x0=nach["x0"],
        hour_xs=[w["x0"] for w in hour_words],
        slot_w=slot_w,
        header_bottom=max(w["bottom"] for w in header_words),
        hour_w=hour_w,
        train_text_offset_x=train_text_offset_x,
        vertical_train_x_offset=vertical_train_x_offset,
    )


def words_in_box(words: List[Dict[str, Any]], x0: float, x1: float, y0: float, y1: float) -> List[Dict[str, Any]]:
    """Return words whose x0/top anchor lies inside the half-open box."""
    selected: List[Dict[str, Any]] = []
    for word in words:
        if not (x0 <= word["x0"] < x1):
            continue
        if y0 <= word["top"] < y1:
            selected.append(word)
    return selected


def chars_in_box(chars: List[Dict[str, Any]], x0: float, x1: float, y0: float, y1: float) -> List[Dict[str, Any]]:
    """Return chars whose x0/top anchor lies inside the half-open box."""
    def _inside(ch: Dict[str, Any]) -> bool:
        return x0 <= ch["x0"] < x1 and y0 <= ch["top"] < y1

    return [ch for ch in chars if _inside(ch)]


def extract_page_meta(words: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Parse date, weekday label and page numbering from the page header.

    The title header and the hour row can sit close together, so the specific
    line containing "Seite:" is located instead of using a fixed y band.
    Returns an empty dict when the header pattern cannot be matched.
    """
    header_txt = ""
    for candidate in group_words_by_line([w for w in words if w["top"] < 60], tol=2.0):
        rendered = line_text(candidate)
        if "Seite:" in rendered:
            header_txt = rendered
            break
    if not header_txt:
        # Fall back to the legacy fixed-band behavior.
        header_txt = line_text([w for w in words if w["top"] < 50])

    match = re.search(r"(\d{2}\.\d{2}\.\d{4})\s+\[(.+?)\].*Seite:\s*(\d+)/(\d+)", header_txt)
    if match is None:
        return {}
    return {
        "date_str": match.group(1),
        "weekday_label": match.group(2),
        "page_no": int(match.group(3)),
        "page_total": int(match.group(4)),
    }


def build_block_index(page_words: List[Dict[str, Any]], metrics: Metrics, page_height: float) -> List[Dict[str, Any]]:
    """Locate vehicle blocks on a page and compute their y windows.

    A block begins at a line whose left (FZG) column is a bare 3-5 digit
    vehicle number and is terminated by the next "Bemerkungen:" line; each
    remark line is paired with the nearest unused start line above it.

    Args:
        page_words: All words extracted from the page.
        metrics: Column geometry from extract_metrics().
        page_height: Used as the lower bound of the last block's remark area.

    Returns:
        One dict per block with the start/remark rows and the y ranges of the
        block body ("body_y0"/"body_y1") and remark area ("remark_y0"/"remark_y1").
    """
    lines = group_words_by_line([w for w in page_words if w["top"] > metrics.header_bottom + 1])
    line_rows: List[Dict[str, Any]] = []
    for lw in lines:
        txt = line_text(lw)
        line_rows.append(
            {
                "top": line_top(lw),
                "bottom": line_bottom(lw),
                "text": txt,
                "words": lw,
            }
        )

    start_lines: List[Dict[str, Any]] = []
    remark_lines: List[Dict[str, Any]] = []
    for row in line_rows:
        txt = row["text"]
        if "Bemerkungen:" in txt:
            remark_lines.append(row)
            continue
        # Lines containing " km" are distance annotations, not block starts.
        if " km" in txt:
            continue
        left_words = [w for w in row["words"] if w["x0"] < metrics.von_x0 - 2]
        if not left_words:
            continue
        left_text = line_text(left_words)
        # A bare 3-5 digit number in the FZG column marks a block start.
        if re.fullmatch(r"\d{3,5}", left_text):
            start_lines.append(row)

    start_lines = sorted(start_lines, key=lambda r: r["top"])
    remark_lines = sorted(remark_lines, key=lambda r: r["top"])

    blocks: List[Dict[str, Any]] = []
    used_starts = set()
    for remark_idx, remark in enumerate(remark_lines):
        # Pair this remark with the closest start line above it that is still unused.
        candidates = [s for i, s in enumerate(start_lines) if i not in used_starts and s["top"] < remark["top"]]
        if not candidates:
            continue
        start = candidates[-1]
        used_starts.add(start_lines.index(start))

        # The remark area extends down to the next block's start (or the page bottom).
        next_start = next((s for s in start_lines if s["top"] > remark["top"]), None)
        body_y0 = max(metrics.header_bottom + 1, start["top"] - 1)
        body_y1 = max(body_y0, remark["top"] - 1)
        remark_y0 = remark["top"] - 0.5
        remark_y1 = (next_start["top"] - 1) if next_start else (page_height - 1)

        blocks.append(
            {
                "seq_on_page": len(blocks) + 1,
                "start_row": start,
                "remark_row": remark,
                "body_y0": body_y0,
                "body_y1": body_y1,
                "remark_y0": remark_y0,
                "remark_y1": remark_y1,
            }
        )
    return blocks


def parse_triplet(lines: List[str]) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Map a column's text lines onto a (leading, middle, last) triple.

    The last non-empty line is always the final element, the one before it
    the middle element, and any earlier lines are joined into the leading
    element.  Missing positions are returned as None.
    """
    cleaned = [entry.strip() for entry in lines if entry and entry.strip()]
    count = len(cleaned)
    if count == 0:
        return None, None, None
    if count == 1:
        return None, None, cleaned[0]
    if count == 2:
        return None, cleaned[0], cleaned[1]
    return " ".join(cleaned[:-2]), cleaned[-2], cleaned[-1]


def parse_block_columns(page_words: List[Dict[str, Any]], block: Dict[str, Any], metrics: Metrics) -> Dict[str, Any]:
    """Split a vehicle block's body into FZG / Von / Nach column fields.

    Each body line is partitioned by x position into the three columns;
    the FZG column yields the vehicle number, rotation label and distance,
    while Von/Nach are decomposed via parse_triplet().
    """
    in_body = [w for w in page_words if block["body_y0"] <= w["top"] < block["body_y1"]]

    fzg_texts: List[str] = []
    von_texts: List[str] = []
    nach_texts: List[str] = []
    distance_km: Optional[int] = None

    for row in group_words_by_line(in_body):
        left = line_text([w for w in row if w["x0"] < metrics.von_x0 - 2])
        middle = line_text([w for w in row if metrics.von_x0 <= w["x0"] < metrics.timeline_x0 - 2])
        right = line_text([w for w in row if w["x0"] >= metrics.nach_x0])

        if left:
            fzg_texts.append(left)
            found = KM_RE.search(left)
            if found:
                distance_km = int(found.group("km"))
        if middle:
            von_texts.append(middle)
        if right:
            nach_texts.append(right)

    vehicle_no = next((t for t in fzg_texts if re.fullmatch(r"\d{3,5}", t)), None)
    rotation_raw = next((t for t in fzg_texts if re.fullmatch(r"\(.+\)", t)), None)
    rotation_no = rotation_raw.strip("()") if rotation_raw else None

    von_prev_train, von_prev_forda, von_location = parse_triplet(von_texts)
    nach_next_train, nach_next_forda, nach_location = parse_triplet(nach_texts)

    return {
        "vehicle_no": vehicle_no,
        "rotation_no": rotation_no,
        "distance_km": distance_km,
        "von_prev_train": von_prev_train,
        "von_prev_forda": von_prev_forda,
        "von_location": von_location,
        "nach_next_train": nach_next_train,
        "nach_next_forda": nach_next_forda,
        "nach_location": nach_location,
        "debug_fzg_lines": fzg_texts,
        "debug_von_lines": von_texts,
        "debug_nach_lines": nach_texts,
    }


def parse_remark(page_words: List[Dict[str, Any]], block: Dict[str, Any]) -> str:
    """Collect a block's remark ("Bemerkungen") text, label prefix removed."""
    in_window = [w for w in page_words if block["remark_y0"] <= w["top"] < block["remark_y1"]]
    rendered = [line_text(lw) for lw in group_words_by_line(in_window)]
    joined = "\n".join(t for t in rendered if t).strip()
    return re.sub(r"^Bemerkungen:\s*", "", joined).strip()


def cluster_single_char_stacks(
    words: List[Dict[str, Any]], excluded_tops: Optional[List[float]] = None
) -> List[Dict[str, Any]]:
    """Reassemble vertically printed labels from stacked single characters.

    Single alphanumeric/pipe glyphs are clustered into columns by x position;
    a column whose top-to-bottom concatenation looks like a train label
    (all digits, letter+digits, or ``X|nnnn``) is emitted as a
    ``vertical_train`` entry.  Words on rows listed in ``excluded_tops``
    (e.g. marker rows) are skipped.
    """
    skip_tops = excluded_tops or []

    def _eligible(w: Dict[str, Any]) -> bool:
        if len(w["text"]) != 1 or not re.fullmatch(r"[A-Za-z0-9|]", w["text"]):
            return False
        return all(abs(w["top"] - yt) > 1.2 for yt in skip_tops)

    columns: List[List[Dict[str, Any]]] = []
    for w in sorted((x for x in words if _eligible(x)), key=lambda x: (x["x0"], x["top"])):
        home = None
        for column in columns:
            center_x = sum(entry["x0"] for entry in column) / len(column)
            if abs(w["x0"] - center_x) <= 2.2:
                home = column
                break
        if home is None:
            columns.append([w])
        else:
            home.append(w)

    results: List[Dict[str, Any]] = []
    for column in columns:
        stack = sorted(column, key=lambda x: x["top"])
        if len(stack) < 2:
            continue
        label = "".join(ch["text"] for ch in stack)
        looks_like_train = (
            label.isdigit()
            or (label[0].isalpha() and label[1:].isdigit())
            or bool(re.fullmatch(r"[A-Z]\|\d{2,5}", label))
        )
        if not looks_like_train:
            continue
        results.append(
            {
                "kind": "vertical_train",
                "text": label,
                "x0": min(ch["x0"] for ch in stack),
                "x1": max(ch["x1"] for ch in stack),
                "top": min(ch["top"] for ch in stack),
                "bottom": max(ch["bottom"] for ch in stack),
                "parts": [ch["text"] for ch in stack],
                "part_keys": [(round(ch["x0"], 2), round(ch["top"], 2), ch["text"]) for ch in stack],
            }
        )
    return results


def extract_dense_vertical_matrix_trains(t_words: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], set]:
    """
    Special-case parser for rows like FZG 642 where train numbers are built from a dense matrix
    of stacked single digits (e.g. 2 / 1 / 2 -> 212) across many columns.

    Returns:
        (trains, consumed): reconstructed "vertical_dense_train" dicts, and the set of
        (rounded x0, rounded top, text) keys of the digit words that were merged, so
        callers can exclude them from the other label-reconstruction passes.
    """
    lines = group_words_by_line(t_words)
    # Candidate rows: long runs (>= 20 words) made up exclusively of single digits.
    dense_digit_lines: List[List[Dict[str, Any]]] = []
    for lw in lines:
        sw = sorted(lw, key=lambda x: x["x0"])
        if len(sw) < 20:
            continue
        if all(len(w["text"]) == 1 and w["text"].isdigit() for w in sw):
            dense_digit_lines.append(sw)

    # Need at least 2 rows (e.g. 11,12,13...) or 3 rows (e.g. 212,213...).
    if len(dense_digit_lines) < 2:
        return [], set()

    def align_run_columns(run: List[List[Dict[str, Any]]], tol_x: float = 2.0) -> List[List[Dict[str, Any]]]:
        """
        Align stacked digit rows by X position (not by raw index). This is more robust when the
        top digit row slightly overlaps the remark area and gains/loses a digit near the edges.
        """
        # Use the middle row of the run as the alignment base.
        base_idx = min(range(len(run)), key=lambda i: abs(i - (len(run) - 1) / 2))
        base = sorted(run[base_idx], key=lambda x: x["x0"])
        others = [sorted(r, key=lambda x: x["x0"]) for r in run]

        aligned_cols: List[List[Dict[str, Any]]] = []
        for b in base:
            col = [None] * len(run)  # type: ignore[list-item]
            col[base_idx] = b
            ok = True
            for ridx, row in enumerate(others):
                if ridx == base_idx:
                    continue
                candidates = [w for w in row if abs(w["x0"] - b["x0"]) <= tol_x]
                if not candidates:
                    # A column is only kept when every row contributes a digit.
                    ok = False
                    break
                # Prefer the closest x; tie-break on smaller vertical distance.
                chosen = sorted(
                    candidates,
                    key=lambda w: (abs(w["x0"] - b["x0"]), abs(w["top"] - b["top"]))
                )[0]
                col[ridx] = chosen
            if ok:
                aligned_cols.append([w for w in col if w is not None])

        # Deduplicate if multiple base points snapped to the same stack.
        dedup: List[List[Dict[str, Any]]] = []
        seen = set()
        for col in aligned_cols:
            key = tuple((round(w["x0"], 2), round(w["top"], 2), w["text"]) for w in col)
            if key in seen:
                continue
            seen.add(key)
            dedup.append(col)
        return dedup

    # Pick the best aligned run, preferring 3 rows over 2 rows and the one with the most matched columns.
    chosen = None
    chosen_cols: List[List[Dict[str, Any]]] = []
    for run_len in (3, 2):
        if len(dense_digit_lines) < run_len:
            continue
        best_for_len = None
        best_cols_for_len: List[List[Dict[str, Any]]] = []
        best_score = -1
        for i in range(len(dense_digit_lines) - run_len + 1):
            run = dense_digit_lines[i : i + run_len]
            cols = align_run_columns(run)
            if len(cols) < 2:
                continue
            # Score: matched column count first, then compact x-spread agreement.
            spread_penalty = 0.0
            for col in cols:
                xs = [w["x0"] for w in col]
                spread_penalty += (max(xs) - min(xs))
            score = len(cols) * 1000 - spread_penalty
            if score > best_score:
                best_score = score
                best_for_len = run
                best_cols_for_len = cols
        if best_for_len is not None:
            chosen = best_for_len
            chosen_cols = best_cols_for_len
            break

    if not chosen:
        return [], set()

    # Reject false positives: prefixed vertical train labels (e.g. D/F/C + 10xx) can produce a
    # 3-row digit run in the middle. In that case, a row of single uppercase letters is typically
    # aligned directly above the candidate digit columns.
    run_top = min(min(w["top"] for w in row) for row in chosen)
    uppercase_singles = [
        w for w in t_words
        if len(w["text"]) == 1 and w["text"].isalpha() and w["text"].isupper() and (run_top - 8.0) <= w["top"] <= (run_top + 1.0)
    ]
    if uppercase_singles and chosen_cols:
        aligned_upper = 0
        for col in chosen_cols:
            col_x = sum(w["x0"] for w in col) / len(col)
            if any(abs(u["x0"] - col_x) <= 2.0 for u in uppercase_singles):
                aligned_upper += 1
        # Bail out when enough columns have a letter directly above them.
        if aligned_upper >= max(4, int(len(chosen_cols) * 0.2)):
            return [], set()

    trains: List[Dict[str, Any]] = []
    consumed = set()
    for parts in sorted(chosen_cols, key=lambda col: min(p["x0"] for p in col)):
        # Mark every merged digit word so other passes skip it, even when the
        # concatenated text is rejected below.
        for p in parts:
            consumed.add((round(p["x0"], 2), round(p["top"], 2), p["text"]))
        txt = "".join(p["text"] for p in parts)
        if not txt.isdigit():
            continue
        trains.append(
            {
                "kind": "vertical_dense_train",
                "text": txt,
                "x0": min(p["x0"] for p in parts),
                "x1": max(p["x1"] for p in parts),
                "top": min(p["top"] for p in parts),
                "bottom": max(p["bottom"] for p in parts),
                "parts": [p["text"] for p in parts],
                "prefix": None,
                "train_no": txt,
            }
        )

    return trains, consumed


def parse_train_token(token_text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Normalize a train label into (raw_text, prefix, number).

    Accepts plain 2-5 digit numbers, letter-prefixed numbers ("D105") and
    pipe-separated forms ("D|105"), tolerating alternate pipe glyphs and
    stray spacing that PDF extraction sometimes produces.  Returns
    (None, None, None) when the token does not look like a train label.
    """
    stripped = (token_text or "").strip()
    if not stripped:
        return None, None, None

    # PDF text extraction sometimes uses alternate pipe glyphs or injects spaces around the separator.
    normalized = re.sub(r"\s*([|¦｜∣])\s*", r"|", stripped)

    if "|" in normalized:
        piped = re.fullmatch(r"([A-Z])\|(\d{2,5})", normalized)
        if piped:
            return normalized, piped.group(1), piped.group(2)
        # Fallback: keep arbitrary RHS if it is not purely numeric but still preserve prefix split.
        piped = re.fullmatch(r"([A-Z])\|(.+)", normalized)
        if piped:
            return normalized, piped.group(1), piped.group(2).strip()

    if normalized and normalized[0].isalpha() and normalized[1:].isdigit() and len(normalized) >= 2:
        return normalized, normalized[0], normalized[1:]
    if re.fullmatch(r"\d{2,5}", normalized):
        return normalized, None, normalized
    return None, None, None


def color_name(color: Optional[Tuple[float, ...]]) -> Optional[str]:
    """Map a normalized color tuple to a semantic label, or an rgb: fallback."""
    if color is None:
        return None
    named = (
        (GREEN_COLORS, "green"),
        (RED_BAND_COLORS, "red"),
        (IGNORE_BORDER_RED, "red_border"),
    )
    for palette, label in named:
        if color in palette:
            return label
    return f"rgb:{','.join(str(v) for v in color)}"


def slot_index_from_x(x: float, metrics: Metrics) -> int:
    """Convert an x coordinate into a 10-minute timeline slot index (0..143)."""
    raw = (x - metrics.timeline_x0) / metrics.slot_w
    return int(round(raw)) % 144


def precise_minute_from_train_x(x: float, metrics: Metrics, train_kind: str) -> int:
    """Convert a train label's x position to a minute of day (0..1439).

    Horizontal train labels are drawn slightly right of the true time, so
    their calibrated text offset is subtracted first.
    """
    shift = 0.0
    if train_kind == "horizontal_train":
        shift = metrics.train_text_offset_x
    relative = x - metrics.timeline_x0 - shift
    return int(round((relative / metrics.hour_w) * 60)) % 1440


def minute_from_timeline_x(x: float, metrics: Metrics) -> int:
    """Convert a raw timeline x coordinate to a minute of day (0..1439)."""
    relative = x - metrics.timeline_x0
    return int(round((relative / metrics.hour_w) * 60)) % 1440


def duration_minutes_from_x_span(x0: float, x1: float, metrics: Metrics) -> int:
    """Convert a horizontal span into whole minutes, clamped to [0, 1439].

    Spans wider than half a point are never reported as zero minutes.
    """
    span = max(0.0, x1 - x0)
    minutes = int(round((span / metrics.hour_w) * 60))
    if minutes <= 0 and span > 0.5:
        minutes = 1
    return max(0, min(minutes, 1439))


def extract_timeline(page_words: List[Dict[str, Any]], page_chars: List[Dict[str, Any]], page_rects: List[Dict[str, Any]], block: Dict[str, Any], metrics: Metrics) -> Dict[str, Any]:
    timeline_clip_x0 = metrics.timeline_x0 - max(12.0, metrics.slot_w * 3)
    # The topmost character of vertical train labels can slightly overlap upward; leave extra headroom.
    timeline_clip_y0 = max(metrics.header_bottom - 0.5, block["body_y0"] - 10.0)
    t_words = [
        w for w in page_words
        if timeline_clip_y0 <= w["top"] < block["body_y1"]
        and timeline_clip_x0 <= w["x0"] < metrics.nach_x0
    ]
    t_chars = chars_in_box(page_chars, timeline_clip_x0, metrics.nach_x0, timeline_clip_y0, block["body_y1"])

    colored_rects = []
    for r in page_rects:
        c = norm_color(r.get("non_stroking_color"))
        if c not in (GREEN_COLORS | RED_BAND_COLORS):
            continue
        if r["height"] < 2 or r["width"] < 2:
            continue
        if not (timeline_clip_y0 <= r["top"] < block["body_y1"] + 4):
            continue
        if r["x1"] <= metrics.timeline_x0 or r["x0"] >= metrics.nach_x0:
            continue
        colored_rects.append(
            {
                "x0": r["x0"],
                "x1": r["x1"],
                "top": r["top"],
                "bottom": r["bottom"],
                "color": c,
                "color_name": color_name(c),
            }
        )

    dense_vertical_trains, dense_consumed = extract_dense_vertical_matrix_trains(t_words)

    def wkey(w: Dict[str, Any]) -> Tuple[float, float, str]:
        return (round(w["x0"], 2), round(w["top"], 2), w["text"])

    horizontal_trains = []
    for w in t_words:
        if wkey(w) in dense_consumed:
            continue
        txt = w["text"]
        if len(txt) == 1:
            continue
        raw, prefix, num = parse_train_token(txt)
        if raw is None:
            continue
        # Exclude station-like all-caps codes that accidentally match regex
        if UPPER_CODE_RE.fullmatch(txt):
            continue
        horizontal_trains.append(
            {
                "kind": "horizontal_train",
                "text": raw,
                "prefix": prefix,
                "train_no": num,
                "x0": w["x0"],
                "x1": w["x1"],
                "top": w["top"],
                "bottom": w["bottom"],
            }
        )

    # Marker-only rows contain many single-digit coupling-order markers (often 2/4 alternating).
    # Exclude the whole row from vertical train reconstruction so they do not get merged into labels.
    marker_heavy_tops: List[float] = []
    grouped_non_dense = group_words_by_line([w for w in t_words if wkey(w) not in dense_consumed])
    line_infos = []
    for lw in grouped_non_dense:
        top_avg = sum(w["top"] for w in lw) / len(lw)
        line_infos.append(
            {
                "top": top_avg,
                "is_digit_only": len(lw) >= 4 and all(len(w["text"]) == 1 and w["text"].isdigit() for w in lw),
                "has_station_like": any(UPPER_CODE_RE.fullmatch(w["text"]) for w in lw),
            }
        )
    line_infos = sorted(line_infos, key=lambda x: x["top"])
    # Marker row is typically the digit-only row directly above the station-code row (KLO/DAV/SELF/...).
    for i, li in enumerate(line_infos):
        if not li["is_digit_only"]:
            continue
        next_line = line_infos[i + 1] if i + 1 < len(line_infos) else None
        if next_line and next_line["has_station_like"] and (next_line["top"] - li["top"]) <= 8.5:
            marker_heavy_tops.append(li["top"])

    vertical_trains = []
    for v in cluster_single_char_stacks([w for w in t_words if wkey(w) not in dense_consumed], excluded_tops=marker_heavy_tops):
        raw, prefix, num = parse_train_token(v["text"])
        if raw is None:
            continue
        vertical_trains.append(
            {
                **v,
                "text": raw,
                "prefix": prefix,
                "train_no": num,
            }
        )

    trains = horizontal_trains[:] + dense_vertical_trains[:]
    vertical_train_part_keys = set()
    for vt in vertical_trains:
        for pk in vt.get("part_keys", []):
            vertical_train_part_keys.add(pk)
        dup = any(abs(vt["x0"] - ht["x0"]) <= 3 and abs(vt["top"] - ht["top"]) <= 8 for ht in trains)
        if not dup:
            trains.append(vt)
    trains.sort(key=lambda t: (t["x0"], t["top"]))

    station_tokens = [
        w for w in t_words
        if wkey(w) not in dense_consumed
        if UPPER_CODE_RE.fullmatch(w["text"])
    ]
    marker_tokens = [
        w for w in t_words
        if wkey(w) not in dense_consumed
        and wkey(w) not in vertical_train_part_keys
        and re.fullmatch(r"[1-9]", w["text"])
    ]

    lower_annotations = []
    for w in t_words:
        txt = w["text"]
        if wkey(w) in dense_consumed:
            continue
        if any(w is s for s in station_tokens):
            continue
        if txt == "2":
            continue
        if any(abs(w["x0"] - t["x0"]) <= 1 and abs(w["top"] - t["top"]) <= 1 and txt == t["text"] for t in horizontal_trains):
            continue
        if re.search(r"[a-z]", txt) or txt in {"-", "/"}:
            lower_annotations.append(w)

    slots = []
    used_station_ids = set()
    used_marker_ids = set()

    def wid(w: Dict[str, Any]) -> Tuple[float, float, str]:
        return (round(w["x0"], 2), round(w["top"], 2), w["text"])

    for idx, tr in enumerate(trains):
        # Start station: prefer uppercase station token aligned with the train/event and below it.
        station_candidates = []
        vertical_like = tr["kind"] in {"vertical_train", "vertical_dense_train"}
        dx_left_max = 18 if vertical_like else 45
        fallback_dx_max = 16 if vertical_like else 35
        if tr["kind"] == "vertical_dense_train":
            for s in station_tokens:
                if wid(s) in used_station_ids:
                    continue
                dx = abs(tr["x0"] - s["x0"])
                dy = abs(s["top"] - tr["bottom"])
                if dx <= 2.0 and dy <= 40:
                    station_candidates.append((dx + dy * 0.1, s))
        for s in station_tokens:
            if wid(s) in used_station_ids:
                continue
            dx = tr["x0"] - s["x0"]
            if dx < -5 or dx > dx_left_max:
                continue
            dy_top = s["top"] - tr["top"]
            dy_bottom = s["top"] - tr["bottom"]
            if -3 <= dy_top <= 40 or (-2 <= dy_bottom <= 28):
                station_candidates.append((abs(dx) + max(0, min(dy_top, max(dy_bottom, 0))) * 0.4, s))
        if not station_candidates:
            for s in station_tokens:
                if wid(s) in used_station_ids:
                    continue
                dx = abs((tr["x0"] + tr["x1"]) / 2 - (s["x0"] + s["x1"]) / 2)
                dy = min(abs(s["top"] - tr["top"]), abs(s["top"] - tr["bottom"]))
                if dx <= fallback_dx_max and dy <= 34:
                    station_candidates.append((dx + dy * 0.5 + 100, s))
        station = sorted(station_candidates, key=lambda x: x[0])[0][1] if station_candidates else None

        marker = None
        marker_candidates = []
        if tr["kind"] != "vertical_dense_train":
            for m in marker_tokens:
                key = wid(m)
                if key in used_marker_ids:
                    continue
                if key in vertical_train_part_keys:
                    continue
                if tr["kind"] == "vertical_train":
                    dx = abs(m["x0"] - tr["x0"])
                    dy = m["top"] - tr["bottom"]
                    if dx <= 6 and -1 <= dy <= 10:
                        marker_candidates.append((dx + max(0, dy) * 0.4, m))
                else:
                    dx = abs(m["x0"] - tr["x0"])
                    dy = m["top"] - tr["top"]
                    if dx <= 28 and 0 <= dy <= 18:
                        marker_candidates.append((dx + dy * 0.5, m))
            if marker_candidates:
                marker = sorted(marker_candidates, key=lambda x: x[0])[0][1]
                used_marker_ids.add(wid(marker))

        if tr["kind"] == "vertical_train":
            anchor_x = (tr["x0"] + tr["x1"]) / 2 + metrics.vertical_train_x_offset
        elif vertical_like:
            anchor_x = (tr["x0"] + tr["x1"]) / 2
        else:
            # Horizontal train timing is more stable from the train label than the station label.
            anchor_x = tr["x0"]
        slot_idx = slot_index_from_x(anchor_x, metrics)
        if tr["kind"] == "vertical_dense_train":
            start_minute = slot_idx * 10
        elif tr["kind"] == "vertical_train":
            start_minute = precise_minute_from_train_x(tr["x0"] + metrics.vertical_train_x_offset, metrics, tr["kind"])
        else:
            time_x = anchor_x if vertical_like else tr["x0"]
            start_minute = precise_minute_from_train_x(time_x, metrics, tr["kind"])

        next_train_x = trains[idx + 1]["x0"] if idx + 1 < len(trains) else metrics.nach_x0 - 1
        next_word_x = min(
            [w["x0"] for w in t_words if w["x0"] > tr["x1"] + 1 and abs(w["top"] - tr["top"]) <= 28] or [metrics.nach_x0 - 1]
        )
        event_x0 = min(anchor_x, tr["x0"])
        if tr["kind"] == "vertical_train":
            local_cap = min(next_train_x - 2, next_word_x - 2, tr["x1"] + metrics.slot_w * 1.8)
            event_x1 = max(local_cap, tr["x1"])
        else:
            local_cap = min(next_train_x - 2, next_word_x - 2)
            # If no natural boundary was found, do not stretch the event to the right page edge.
            if next_train_x >= metrics.nach_x0 - 2 and next_word_x >= metrics.nach_x0 - 2:
                local_cap = tr["x1"] + metrics.hour_w * 2.0
            event_x1 = max(local_cap, tr["x1"])

        # Refine station assignment using the computed event window (helps dense/vertical rows).
        if station is None:
            event_station_candidates = []
            for s in station_tokens:
                if wid(s) in used_station_ids:
                    continue
                sx = (s["x0"] + s["x1"]) / 2
                overlapish = (event_x0 - 8) <= sx <= (event_x1 + 8)
                if not overlapish:
                    continue
                dyb = s["top"] - tr["bottom"]
                dyt = s["top"] - tr["top"]
                if vertical_like:
                    if -2 <= dyb <= 50:
                        event_station_candidates.append((abs(sx - event_x0) * 0.2 + max(0, dyb) * 0.2, s))
                else:
                    if -2 <= dyb <= 34 or 0 <= dyt <= 46:
                        event_station_candidates.append((abs(sx - event_x0) * 0.2 + max(0, dyb) * 0.25, s))
            if event_station_candidates:
                station = sorted(event_station_candidates, key=lambda x: x[0])[0][1]
        if station is None and not vertical_like:
            # Last-resort fallback for sparse shunting rows (R1/R3/K4...), allow wider search and token reuse.
            relaxed_candidates = []
            for s in station_tokens:
                sx = (s["x0"] + s["x1"]) / 2
                dxe = min(abs(sx - event_x0), abs(sx - event_x1))
                dyb = abs(s["top"] - tr["bottom"])
                if dxe <= 110 and dyb <= 70:
                    relaxed_candidates.append((dxe + dyb * 0.6 + (20 if wid(s) in used_station_ids else 0), s))
            if relaxed_candidates:
                station = sorted(relaxed_candidates, key=lambda x: x[0])[0][1]
        if station:
            used_station_ids.add(wid(station))

        slot_colors = []
        for r in colored_rects:
            overlap = min(event_x1, r["x1"]) - max(event_x0, r["x0"])
            if overlap > 1:
                slot_colors.append({"color": r["color_name"], "x0": r["x0"], "x1": r["x1"], "rect": r})
        primary_color = None
        if slot_colors:
            primary_color = max(slot_colors, key=lambda c: min(event_x1, c["x1"]) - max(event_x0, c["x0"]))["color"]

        duration_min = duration_minutes_from_x_span(event_x0, event_x1, metrics)
        end_minute = (start_minute + duration_min) % 1440 if duration_min > 0 else start_minute
        slots.append(
            {
                "slot_index": slot_idx,
                "start_minute": start_minute,
                "start_hhmm": f"{start_minute // 60:02d}:{start_minute % 60:02d}",
                "end_minute": end_minute,
                "end_hhmm": f"{end_minute // 60:02d}:{end_minute % 60:02d}",
                "station_code": station["text"] if station else None,
                "train_prefix": tr.get("prefix"),
                "train_no": tr.get("train_no"),
                "raw_train_token": tr["text"],
                # A csíkban megjelenő szám a kapcsolt gépek sorrendjét jelzi (1, 2, ...).
                "strip_marker": int(marker["text"]) if marker is not None and str(marker.get("text", "")).isdigit() else None,
                "color": primary_color,
                "x_anchor": anchor_x,
                "x_train": tr["x0"],
                "x_train_end": tr["x1"],
                "x_event0": event_x0,
                "x_event1": event_x1,
                "colors": [c["color"] for c in slot_colors],
                "train_kind": tr["kind"],
            }
        )

    # Remaining colored segments that could not be attached to any train slot (e.g. repair blocks)
    used_rect_ids = set()
    for s in slots:
        for r in colored_rects:
            overlap = min(s["x_event1"], r["x1"]) - max(s["x_event0"], r["x0"])
            if overlap > 1:
                used_rect_ids.add((round(r["x0"], 1), round(r["x1"], 1), round(r["top"], 1)))
    orphan_bands = [
        r for r in colored_rects
        if (round(r["x0"], 1), round(r["x1"], 1), round(r["top"], 1)) not in used_rect_ids
    ]

    # Build human-readable annotation rows for debug
    ann_lines = []
    for lw in group_words_by_line(lower_annotations):
        txt = line_text(lw)
        if not txt:
            continue
        if IGNORED_TIMELINE_ANNOTATION_RE.search(txt):
            continue
        ann_lines.append(txt)

    def band_text_for_rect(rect: Dict[str, Any]) -> Optional[str]:
        band_words = [
            w for w in t_words
            if (rect["x0"] - 2) <= w["x0"] <= (rect["x1"] + 2)
            and (rect["top"] - 3) <= w["top"] <= (rect["bottom"] + 3)
            and w["text"] != "2"
        ]
        if not band_words:
            return None
        lines = []
        for lw in group_words_by_line(band_words):
            txt = line_text(lw)
            if not txt:
                continue
            # Színes sávon belüli label lehet vegyes (pl. "Reparatur - LQ")
            lines.append(txt)
        if not lines:
            return None
        return " | ".join(lines)

    synthetic_slots = []
    for r in orphan_bands:
        slot_idx = slot_index_from_x(r["x0"], metrics)
        # A sávok a grid-balhatárhoz igazodnak; a vertikális offset itt közelebb visz a vizuális kezdőoszlophoz.
        minute = precise_minute_from_train_x(r["x0"] + metrics.vertical_train_x_offset, metrics, "vertical_train")
        duration_min = duration_minutes_from_x_span(r["x0"], r["x1"], metrics)
        end_minute = (minute + duration_min) % 1440 if duration_min > 0 else minute
        band_label = band_text_for_rect(r)
        synthetic_slots.append(
            {
                "slot_index": slot_idx,
                "start_minute": minute,
                "start_hhmm": f"{minute // 60:02d}:{minute % 60:02d}",
                "end_minute": end_minute,
                "end_hhmm": f"{end_minute // 60:02d}:{end_minute % 60:02d}",
                "station_code": None,
                "train_prefix": None,
                "train_no": None,
                "raw_train_token": band_label,
                "strip_marker": None,
                "color": r["color_name"],
                "x_anchor": r["x0"],
                "x_train": r["x0"],
                "x_train_end": r["x1"],
                "x_event0": r["x0"],
                "x_event1": r["x1"],
                "colors": [r["color_name"]],
                "train_kind": "band_only",
            }
        )

    return {
        "slots": sorted(slots + synthetic_slots, key=lambda s: (s["slot_index"], s["x_train"])),
        "bands": colored_rects,
        "orphan_bands": orphan_bands,
        "annotations": ann_lines,
        "char_count_timeline": len(t_chars),
    }


def load_mysql_config(config_path: str) -> Dict[str, Any]:
    """Extract MySQL connection settings from a PHP-style config file.

    Scans the raw file text for ``'key' => 'value'`` (string) or
    ``'key' => 123`` (numeric) pairs, refuses any non-MySQL driver and
    fills in sensible defaults for missing keys.
    """
    text = Path(config_path).read_text(encoding="utf-8")
    cfg: Dict[str, Any] = {}
    for name in ("host", "port", "dbname", "user", "pass", "charset", "socket", "driver"):
        # String form wins over the numeric form when both would match.
        quoted = re.search(rf"'{re.escape(name)}'\s*=>\s*'([^']*)'", text)
        if quoted:
            cfg[name] = quoted.group(1)
            continue
        bare = re.search(rf"'{re.escape(name)}'\s*=>\s*(\d+)", text)
        if bare:
            cfg[name] = int(bare.group(1))
    if cfg.get("driver", "mysql") != "mysql":
        raise RuntimeError("Only MySQL is supported (db.driver must be 'mysql').")
    defaults: Dict[str, Any] = {
        "host": "127.0.0.1",
        "port": 3306,
        "dbname": "",
        "user": "",
        "pass": "",
        "charset": "utf8mb4",
    }
    for name, value in defaults.items():
        cfg.setdefault(name, value)
    return cfg


def mysql_connect(cfg: Dict[str, Any]):
    """Open a MySQL connection, preferring mysql-connector over PyMySQL.

    Autocommit is disabled so the caller controls transaction boundaries.
    Raises RuntimeError with install instructions when neither driver can
    be imported.
    """
    try:
        import mysql.connector  # type: ignore
        options: Dict[str, Any] = {
            "host": cfg["host"],
            "port": int(cfg["port"]),
            "database": cfg["dbname"],
            "user": cfg["user"],
            "password": cfg["pass"],
            "charset": cfg.get("charset", "utf8mb4"),
            "use_unicode": True,
            "autocommit": False,
        }
        # Unix-socket connection takes precedence when configured.
        if cfg.get("socket"):
            options["unix_socket"] = cfg["socket"]
        return mysql.connector.connect(**options)
    except ImportError:
        pass
    try:
        import pymysql  # type: ignore
    except ImportError as exc:
        raise RuntimeError(
            "MySQL driver missing. Install one of these manually:\n"
            "python3 -m pip install --user --break-system-packages PyMySQL\n"
            "python3 -m pip install --user --break-system-packages mysql-connector-python"
        ) from exc
    return pymysql.connect(
        host=cfg["host"],
        port=int(cfg["port"]),
        db=cfg["dbname"],
        user=cfg["user"],
        password=cfg["pass"],
        charset=cfg.get("charset", "utf8mb4"),
        unix_socket=cfg.get("socket") or None,
        autocommit=False,
    )


def create_schema(conn) -> None:
    """Create all schedule tables if they do not exist yet (idempotent).

    Rows hanging off ``schedule_day`` (page -> duty -> slot/band/annotation)
    all cascade on delete, so dropping a day removes its whole import.
    ``station`` / ``locomotive_type`` references deliberately do NOT
    cascade; orphan cleanup is handled by separate functions.
    """
    ddls = [
        # One row per imported PDF file (day-level metadata).
        """
        CREATE TABLE IF NOT EXISTS schedule_day (
            id INT AUTO_INCREMENT PRIMARY KEY,
            source_pdf VARCHAR(255) NOT NULL,
            service_date VARCHAR(32) NULL,
            weekday_label VARCHAR(32) NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Pages belonging to a day; cascades away with its day.
        """
        CREATE TABLE IF NOT EXISTS schedule_page (
            id INT AUTO_INCREMENT PRIMARY KEY,
            day_id INT NOT NULL,
            page_number INT NOT NULL,
            page_total INT NULL,
            width DOUBLE NULL,
            height DOUBLE NULL,
            CONSTRAINT fk_schedule_page_day FOREIGN KEY (day_id) REFERENCES schedule_day(id) ON DELETE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Master data: locomotive types, referenced from locomotive.
        """
        CREATE TABLE IF NOT EXISTS locomotive_type (
            id INT AUTO_INCREMENT PRIMARY KEY,
            code VARCHAR(64) NULL UNIQUE,
            name VARCHAR(255) NOT NULL,
            sort_order INT NOT NULL DEFAULT 0,
            active TINYINT(1) NOT NULL DEFAULT 1,
            notes TEXT NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Vehicles seen in duties, keyed by their vehicle number.
        """
        CREATE TABLE IF NOT EXISTS locomotive (
            vehicle_no VARCHAR(64) PRIMARY KEY,
            locomotive_type_id INT NULL,
            locomotive_type VARCHAR(255) NULL,
            notes TEXT NULL,
            CONSTRAINT fk_locomotive_type FOREIGN KEY (locomotive_type_id) REFERENCES locomotive_type(id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Station master data, populated from codes found in the PDF.
        """
        CREATE TABLE IF NOT EXISTS station (
            id INT AUTO_INCREMENT PRIMARY KEY,
            code VARCHAR(64) NOT NULL UNIQUE,
            name VARCHAR(255) NULL,
            sort_order INT NOT NULL DEFAULT 0,
            active TINYINT(1) NOT NULL DEFAULT 1,
            notes TEXT NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # One locomotive duty block on a page (von/nach columns + debug JSON).
        """
        CREATE TABLE IF NOT EXISTS duty (
            id INT AUTO_INCREMENT PRIMARY KEY,
            page_id INT NOT NULL,
            seq_on_page INT NOT NULL,
            vehicle_no VARCHAR(64) NULL,
            rotation_no VARCHAR(64) NULL,
            distance_km INT NULL,
            von_prev_train VARCHAR(64) NULL,
            von_prev_forda VARCHAR(64) NULL,
            von_location VARCHAR(64) NULL,
            von_station_id INT NULL,
            nach_next_train VARCHAR(64) NULL,
            nach_next_forda VARCHAR(64) NULL,
            nach_location VARCHAR(64) NULL,
            nach_station_id INT NULL,
            remark TEXT NULL,
            body_y0 DOUBLE NULL,
            body_y1 DOUBLE NULL,
            debug_fzg_lines_json LONGTEXT NULL,
            debug_von_lines_json LONGTEXT NULL,
            debug_nach_lines_json LONGTEXT NULL,
            CONSTRAINT fk_duty_page FOREIGN KEY (page_id) REFERENCES schedule_page(id) ON DELETE CASCADE,
            CONSTRAINT fk_duty_von_station FOREIGN KEY (von_station_id) REFERENCES station(id),
            CONSTRAINT fk_duty_nach_station FOREIGN KEY (nach_station_id) REFERENCES station(id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Timed events on a duty's timeline, with raw x-geometry for debugging.
        """
        CREATE TABLE IF NOT EXISTS duty_slot (
            id INT AUTO_INCREMENT PRIMARY KEY,
            duty_id INT NOT NULL,
            slot_index INT NOT NULL,
            start_minute INT NOT NULL,
            start_hhmm VARCHAR(16) NOT NULL,
            end_minute INT NULL,
            end_hhmm VARCHAR(16) NULL,
            station_code VARCHAR(64) NULL,
            station_id INT NULL,
            train_prefix VARCHAR(16) NULL,
            train_no VARCHAR(64) NULL,
            raw_train_token VARCHAR(255) NULL,
            strip_marker INT NULL,
            color VARCHAR(64) NULL,
            x_anchor DOUBLE NULL,
            x_event0 DOUBLE NULL,
            x_event1 DOUBLE NULL,
            x_train DOUBLE NULL,
            x_train_end DOUBLE NULL,
            train_kind VARCHAR(64) NULL,
            debug_colors_json LONGTEXT NULL,
            CONSTRAINT fk_duty_slot_duty FOREIGN KEY (duty_id) REFERENCES duty(id) ON DELETE CASCADE,
            CONSTRAINT fk_duty_slot_station FOREIGN KEY (station_id) REFERENCES station(id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Colored background bands of the timeline; orphan=1 marks bands not attached to a slot.
        """
        CREATE TABLE IF NOT EXISTS duty_timeline_band (
            id INT AUTO_INCREMENT PRIMARY KEY,
            duty_id INT NOT NULL,
            color VARCHAR(64) NOT NULL,
            x0 DOUBLE NOT NULL,
            x1 DOUBLE NOT NULL,
            y0 DOUBLE NOT NULL,
            y1 DOUBLE NOT NULL,
            orphan TINYINT(1) NOT NULL DEFAULT 0,
            CONSTRAINT fk_duty_timeline_band_duty FOREIGN KEY (duty_id) REFERENCES duty(id) ON DELETE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Free-text annotation lines found below a duty's timeline.
        """
        CREATE TABLE IF NOT EXISTS duty_annotation (
            id INT AUTO_INCREMENT PRIMARY KEY,
            duty_id INT NOT NULL,
            text TEXT NOT NULL,
            CONSTRAINT fk_duty_annotation_duty FOREIGN KEY (duty_id) REFERENCES duty(id) ON DELETE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
    ]
    cur = conn.cursor()
    for ddl in ddls:
        cur.execute(ddl)
    cur.close()


def insert_day(conn, pdf_path: str, meta: Dict[str, Any]) -> int:
    """Insert one schedule_day row and return its auto-generated id."""
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO schedule_day (source_pdf, service_date, weekday_label) VALUES (%s, %s, %s)",
        (pdf_path, meta.get("date_str"), meta.get("weekday_label")),
    )
    new_id = int(cursor.lastrowid)
    cursor.close()
    return new_id


def sync_locomotive_master(conn) -> None:
    """Ensure every vehicle number referenced by a duty exists in locomotive."""
    statement = """
        INSERT IGNORE INTO locomotive (vehicle_no)
        SELECT DISTINCT vehicle_no
        FROM duty
        WHERE vehicle_no IS NOT NULL AND TRIM(vehicle_no) <> ''
        """
    cursor = conn.cursor()
    cursor.execute(statement)
    cursor.close()


def cleanup_orphan_locomotives(conn) -> None:
    """Delete locomotive rows no longer referenced by any duty.

    Safe against NULL pitfalls of NOT IN: the subquery filters NULLs out.
    """
    statement = """
        DELETE FROM locomotive
        WHERE vehicle_no NOT IN (
            SELECT DISTINCT vehicle_no
            FROM duty
            WHERE vehicle_no IS NOT NULL AND TRIM(vehicle_no) <> ''
        )
        """
    cursor = conn.cursor()
    cursor.execute(statement)
    cursor.close()


def sync_station_master(conn) -> None:
    """Collect station codes from duty_slot/duty into station, backfill names.

    Only all-uppercase alphanumeric codes are accepted; a station's name
    defaults to its code when empty.
    """
    statements = (
        """
        INSERT IGNORE INTO station (code, name)
        SELECT code, code
        FROM (
            SELECT DISTINCT TRIM(station_code) AS code
            FROM duty_slot
            WHERE station_code IS NOT NULL AND TRIM(station_code) <> ''
            UNION
            SELECT DISTINCT TRIM(von_location) AS code
            FROM duty
            WHERE von_location IS NOT NULL AND TRIM(von_location) <> ''
            UNION
            SELECT DISTINCT TRIM(nach_location) AS code
            FROM duty
            WHERE nach_location IS NOT NULL AND TRIM(nach_location) <> ''
        ) src
        WHERE code REGEXP '^[A-Z0-9]+$'
        """,
        """
        UPDATE station
        SET name = code
        WHERE (name IS NULL OR TRIM(name) = '')
        """,
    )
    cursor = conn.cursor()
    for statement in statements:
        cursor.execute(statement)
    cursor.close()


def assign_station_foreign_keys(conn) -> None:
    """Resolve textual station codes into station.id foreign keys.

    Reads the station master table once, then runs the three code->id
    UPDATEs (duty_slot, duty.von, duty.nach) for every known code.
    """
    cursor = conn.cursor()
    cursor.execute("SELECT id, code FROM station")
    rows = cursor.fetchall()
    cursor.close()

    lookup: Dict[str, int] = {}
    for station_id, code in rows:
        if code is None:
            continue
        trimmed = str(code).strip()
        if trimmed:
            lookup[trimmed] = int(station_id)
    if not lookup:
        return

    updates = (
        "UPDATE duty_slot SET station_id = %s WHERE station_code IS NOT NULL AND TRIM(station_code) = %s",
        "UPDATE duty SET von_station_id = %s WHERE von_location IS NOT NULL AND TRIM(von_location) = %s",
        "UPDATE duty SET nach_station_id = %s WHERE nach_location IS NOT NULL AND TRIM(nach_location) = %s",
    )
    cursor = conn.cursor()
    for code, station_id in lookup.items():
        for statement in updates:
            cursor.execute(statement, (station_id, code))
    cursor.close()


def cleanup_orphan_stations(conn) -> None:
    """Delete station rows not referenced by any duty_slot or duty FK."""
    statement = """
        DELETE FROM station
        WHERE id NOT IN (
            SELECT DISTINCT station_id FROM duty_slot WHERE station_id IS NOT NULL
            UNION
            SELECT DISTINCT von_station_id FROM duty WHERE von_station_id IS NOT NULL
            UNION
            SELECT DISTINCT nach_station_id FROM duty WHERE nach_station_id IS NOT NULL
        )
        """
    cursor = conn.cursor()
    cursor.execute(statement)
    cursor.close()


def reset_all_schedule_data(conn) -> None:
    """Truncate every schedule table (children first).

    Foreign-key checks are disabled around the TRUNCATEs because MySQL
    refuses to truncate tables referenced by other tables' foreign keys.
    Fix over the original: the checks are re-enabled in a ``finally``
    block, so a failed TRUNCATE can no longer leave the session with FK
    enforcement silently off, and the cursor is always closed.
    """
    tables = ["duty_annotation", "duty_timeline_band", "duty_slot", "duty", "schedule_page", "schedule_day"]
    cur = conn.cursor()
    try:
        cur.execute("SET FOREIGN_KEY_CHECKS=0")
        try:
            for table in tables:
                cur.execute(f"TRUNCATE TABLE {table}")
        finally:
            cur.execute("SET FOREIGN_KEY_CHECKS=1")
    finally:
        cur.close()


def delete_existing_day_by_source_pdf(conn, source_pdf: str) -> None:
    """Remove every schedule_day row for this PDF; FK cascades drop children."""
    cursor = conn.cursor()
    cursor.execute("SELECT id FROM schedule_day WHERE source_pdf = %s", (source_pdf,))
    matching_ids = [int(row[0]) for row in cursor.fetchall()]
    for existing_id in matching_ids:
        cursor.execute("DELETE FROM schedule_day WHERE id = %s", (existing_id,))
    cursor.close()


def debug_print_duty(page_no: int, duty_data: Dict[str, Any], timeline: Dict[str, Any]) -> None:
    """Print a human-readable dump of one parsed duty block (debug mode)."""
    header = (
        f"[DEBUG] p{page_no} duty#{duty_data['seq_on_page']:02d} "
        f"FZG={duty_data.get('vehicle_no')} forda={duty_data.get('rotation_no')} "
        f"km={duty_data.get('distance_km')} "
        f"von=({duty_data.get('von_prev_train')},{duty_data.get('von_prev_forda')},{duty_data.get('von_location')}) "
        f"nach=({duty_data.get('nach_next_train')},{duty_data.get('nach_next_forda')},{duty_data.get('nach_location')})"
    )
    print(header)
    if duty_data.get("remark"):
        print(f"        remark: {duty_data['remark']}")
    for ann in timeline["annotations"]:
        print(f"        annotation: {ann}")
    for slot in timeline["slots"]:
        line = (
            f"        slot {slot['start_hhmm']} idx={slot['slot_index']:03d} "
            f"end={slot.get('end_hhmm') or '-'} "
            f"st={slot['station_code'] or '-'} train={slot['raw_train_token'] or '-'} "
            f"prefix={slot['train_prefix'] or '-'} no={slot['train_no'] or '-'} "
            f"strip={slot['strip_marker'] or '-'} color={slot['color'] or '-'} "
            f"kind={slot['train_kind']}"
        )
        print(line)
    for band in timeline["orphan_bands"]:
        print(
            f"        orphan_band color={band['color_name']} "
            f"x={band['x0']:.1f}-{band['x1']:.1f} y={band['top']:.1f}-{band['bottom']:.1f}"
        )


def convert_pdf(
    pdf_path: str,
    mysql_config_path: str,
    debug: bool = True,
    limit_pages: Optional[int] = None,
    append: bool = False,
) -> None:
    """Parse a Lokdienst PDF and load it into the MySQL schedule schema.

    Pipeline: open DB + create schema, optionally wipe old data, then per
    page -> per duty block: insert duty row, its timeline slots, colored
    bands and annotation lines. Master data (locomotive, station) is
    synchronized in a second pass after all rows are committed.

    Args:
        pdf_path: Path of the input PDF.
        mysql_config_path: PHP config file parsed by load_mysql_config().
        debug: When True, dump each parsed duty via debug_print_duty().
        limit_pages: Only parse the first N pages (None = all pages).
        append: When True, keep existing DB content and only replace rows
            belonging to this PDF (matched by file name); when False, ALL
            schedule tables are truncated first.

    Raises:
        FileNotFoundError: pdf_path does not exist.
        RuntimeError: configuration/driver problems raised by
            load_mysql_config() or mysql_connect().
    """
    pdf_path_obj = Path(pdf_path)
    if not pdf_path_obj.exists():
        raise FileNotFoundError(pdf_path)

    cfg = load_mysql_config(mysql_config_path)
    conn = mysql_connect(cfg)
    create_schema(conn)
    if not append:
        reset_all_schedule_data(conn)

    with pdfplumber.open(str(pdf_path_obj)) as pdf:
        # Day-level metadata (date, weekday label) is read from page 1 only.
        first_page_words = pdf.pages[0].extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)
        day_meta = extract_page_meta(first_page_words)
        if append:
            # Replace semantics: drop any previous import of the same file name
            # before inserting the new day (children removed via FK cascade).
            delete_existing_day_by_source_pdf(conn, str(pdf_path_obj.name))
        day_id = insert_day(conn, str(pdf_path_obj.name), day_meta)

        # Grid metrics come from page 1 — presumably all pages share the same
        # layout; TODO confirm for multi-layout PDFs.
        metrics = extract_metrics(pdf.pages[0])

        total_pages = len(pdf.pages)
        # NOTE(review): limit_pages=0 is falsy and therefore means "all pages".
        pages_to_process = pdf.pages[: limit_pages or total_pages]
        total_duties = 0
        total_slots = 0

        for page_idx, page in enumerate(pages_to_process, start=1):
            page_words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)
            page_chars = page.chars
            page_rects = page.rects
            meta = extract_page_meta(page_words)
            page_no_meta = meta.get("page_no")
            page_total_meta = meta.get("page_total")
            # Prefer the page numbering printed on the page itself; fall back
            # to the positional index when the header could not be parsed.
            page_no = int(page_no_meta) if page_no_meta is not None else page_idx
            page_total = int(page_total_meta) if page_total_meta is not None else total_pages

            cur = conn.cursor()
            cur.execute(
                "INSERT INTO schedule_page (day_id, page_number, page_total, width, height) VALUES (%s, %s, %s, %s, %s)",
                (day_id, page_no, page_total, page.width, page.height),
            )
            page_id = int(cur.lastrowid)
            cur.close()

            # Each block corresponds to one locomotive duty row on the page.
            blocks = build_block_index(page_words, metrics, page.height)
            for block in blocks:
                cols = parse_block_columns(page_words, block, metrics)
                remark = parse_remark(page_words, block)
                duty_data = {
                    **cols,
                    "remark": remark,
                    "seq_on_page": block["seq_on_page"],
                }
                cur = conn.cursor()
                cur.execute(
                    """
                    INSERT INTO duty (
                        page_id, seq_on_page, vehicle_no, rotation_no, distance_km,
                        von_prev_train, von_prev_forda, von_location,
                        nach_next_train, nach_next_forda, nach_location,
                        remark, body_y0, body_y1,
                        debug_fzg_lines_json, debug_von_lines_json, debug_nach_lines_json
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """,
                    (
                        page_id,
                        block["seq_on_page"],
                        cols["vehicle_no"],
                        cols["rotation_no"],
                        cols["distance_km"],
                        cols["von_prev_train"],
                        cols["von_prev_forda"],
                        cols["von_location"],
                        cols["nach_next_train"],
                        cols["nach_next_forda"],
                        cols["nach_location"],
                        remark,
                        block["body_y0"],
                        block["body_y1"],
                        json.dumps(cols["debug_fzg_lines"], ensure_ascii=False),
                        json.dumps(cols["debug_von_lines"], ensure_ascii=False),
                        json.dumps(cols["debug_nach_lines"], ensure_ascii=False),
                    ),
                )
                duty_id = int(cur.lastrowid)
                cur.close()

                timeline = extract_timeline(page_words, page_chars, page_rects, block, metrics)

                for s in timeline["slots"]:
                    cur = conn.cursor()
                    cur.execute(
                        """
                        INSERT INTO duty_slot (
                            duty_id, slot_index, start_minute, start_hhmm, end_minute, end_hhmm, station_code,
                            train_prefix, train_no, raw_train_token, strip_marker, color,
                            x_anchor, x_event0, x_event1, x_train, x_train_end, train_kind, debug_colors_json
                        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """,
                        (
                            duty_id,
                            s["slot_index"],
                            s["start_minute"],
                            s["start_hhmm"],
                            s.get("end_minute"),
                            s.get("end_hhmm"),
                            s["station_code"],
                            s["train_prefix"],
                            s["train_no"],
                            s["raw_train_token"],
                            s["strip_marker"],
                            s["color"],
                            s["x_anchor"],
                            s["x_event0"],
                            s["x_event1"],
                            s["x_train"],
                            s["x_train_end"],
                            s["train_kind"],
                            json.dumps(s["colors"], ensure_ascii=False),
                        ),
                    )
                    cur.close()
                for r in timeline["bands"]:
                    # Bands not attached to any slot are flagged as orphans (repair blocks etc.).
                    orphan = 1 if r in timeline["orphan_bands"] else 0
                    cur = conn.cursor()
                    cur.execute(
                        """
                        INSERT INTO duty_timeline_band (duty_id, color, x0, x1, y0, y1, orphan)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)
                        """,
                        (duty_id, r["color_name"], r["x0"], r["x1"], r["top"], r["bottom"], orphan),
                    )
                    cur.close()
                for ann in timeline["annotations"]:
                    cur = conn.cursor()
                    cur.execute("INSERT INTO duty_annotation (duty_id, text) VALUES (%s, %s)", (duty_id, ann))
                    cur.close()

                total_duties += 1
                total_slots += len(timeline["slots"])
                if debug:
                    debug_print_duty(page_no, duty_data, timeline)

    # Commit parsed rows first.
    conn.commit()
    # Master-data sync runs against the committed rows, then commits again.
    sync_locomotive_master(conn)
    cleanup_orphan_locomotives(conn)
    sync_station_master(conn)
    assign_station_foreign_keys(conn)
    cleanup_orphan_stations(conn)
    conn.commit()
    print(
        f"[SUMMARY] mode={'append' if append else 'replace'} pages={len(pages_to_process)}/{total_pages} "
        f"duties={total_duties} slots={total_slots} db=mysql:{cfg.get('host')}:{cfg.get('dbname')}"
    )
    conn.close()


def main() -> None:
    """CLI entry point: parse arguments and run the PDF -> MySQL conversion."""
    argp = argparse.ArgumentParser(description="Lokdienst PDF -> MySQL parser (debug-friendly)")
    argp.add_argument("pdf", nargs="?", default="Lokdienst_25.02.2026.pdf", help="Input PDF")
    argp.add_argument("--mysql-config", default="php_web/config.php", help="MySQL config file path")
    argp.add_argument("--db", default=None, help="Deprecated. SQLite is removed; use --mysql-config.")
    argp.add_argument("--no-debug", action="store_true", help="Disable debug print")
    argp.add_argument("--limit-pages", type=int, default=None, help="Only parse first N pages")
    argp.add_argument("--append", action="store_true", help="Append day into existing DB (replace same source_pdf only)")
    opts = argp.parse_args()
    if opts.db:
        # SQLite backend was removed; fail fast with a migration hint.
        raise RuntimeError("`--db` már nem támogatott. SQLite eltávolítva, használd: --mysql-config php_web/config.php")

    convert_pdf(opts.pdf, opts.mysql_config, debug=not opts.no_debug, limit_pages=opts.limit_pages, append=opts.append)


# Script entry point; importing this module stays side-effect free.
if __name__ == "__main__":
    main()
