import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import pdfplumber


# Station codes in the timeline are 2-5 uppercase letters (e.g. KLO, DAV, SELF).
UPPER_CODE_RE = re.compile(r"^[A-Z]{2,5}$")
# Distance annotations in the FZG column, e.g. "123 km".
KM_RE = re.compile(r"(?P<km>\d+)\s*km\b")

# Fill colors of timeline bands, normalized to 3 decimals by norm_color().
GREEN_COLORS = {(0.0, 0.624, 0.0)}
RED_BAND_COLORS = {(0.875, 0.0, 0.0), (0.922, 0.361, 0.247)}
# Pure red is treated as a border color, not a band fill.
IGNORE_BORDER_RED = {(1.0, 0.0, 0.0)}
# Timeline annotation lines matching this (repair notes) are filtered from debug output.
IGNORED_TIMELINE_ANNOTATION_RE = re.compile(r"\bReparatur\b", re.IGNORECASE)


@dataclass
class Metrics:
    """Page-geometry calibration derived from the header row (see extract_metrics)."""

    fzg_x0: float  # left edge of the "FZG" header word / vehicle column
    von_x0: float  # left edge of the "Von" column
    timeline_x0: float  # x of the first hour label (timeline origin)
    timeline_last00_x0: float  # x of the last "00" hour label (timeline end)
    nach_x0: float  # left edge of the "Nach" column
    hour_xs: List[float]  # x positions of all two-digit hour labels
    slot_w: float  # width of one 10-minute slot (timeline span / 144)
    header_bottom: float  # bottom y of the header row
    hour_w: float  # width of one hour column (timeline span / 24)
    train_text_offset_x: float  # calibrated x shift of horizontal train-number text
    vertical_train_x_offset: float  # calibrated x shift of vertical train labels


def norm_color(color: Any) -> Optional[Tuple[float, ...]]:
    """Normalize a PDF color spec to a rounded, comparable RGB-like tuple.

    pdfplumber/pdfminer may report ``non_stroking_color`` as a tuple, a list
    (depending on version), or a single gray-level number.  Components are
    rounded to 3 decimals so results can be compared against the module's
    color constants.

    Args:
        color: Raw color value from a pdfplumber object dict (or None).

    Returns:
        A tuple of rounded floats — gray levels are expanded to an RGB
        triple — or None for missing/unsupported specs.
    """
    if color is None:
        return None
    # Accept lists as well as tuples: pdfminer versions differ in which they emit.
    if isinstance(color, (tuple, list)):
        return tuple(round(float(v), 3) for v in color)
    if isinstance(color, (int, float)):
        v = round(float(color), 3)
        return (v, v, v)
    return None


def group_words_by_line(words: List[Dict[str, Any]], tol: float = 1.6) -> List[List[Dict[str, Any]]]:
    """Cluster words into visual text lines by vertical proximity.

    Words are processed top-to-bottom; a word joins the first existing line
    whose running-average ``top`` is within ``tol``, otherwise it opens a new
    line.  Each line is kept sorted left-to-right by ``x0``.
    """
    grouped: List[List[Dict[str, Any]]] = []
    ordered = sorted(words, key=lambda item: (item["top"], item["x0"]))
    for word in ordered:
        target = None
        for candidate in grouped:
            mean_top = sum(entry["top"] for entry in candidate) / len(candidate)
            if abs(word["top"] - mean_top) <= tol:
                target = candidate
                break
        if target is None:
            grouped.append([word])
        else:
            target.append(word)
            target.sort(key=lambda item: item["x0"])
    return grouped


def line_text(words: Iterable[Dict[str, Any]]) -> str:
    """Join word texts left-to-right into a single stripped string."""
    ordered = sorted(words, key=lambda item: item["x0"])
    return " ".join(item["text"] for item in ordered).strip()


def line_top(words: List[Dict[str, Any]]) -> float:
    """Return the topmost y coordinate among the given words."""
    tops = (item["top"] for item in words)
    return min(tops)


def line_bottom(words: List[Dict[str, Any]]) -> float:
    """Return the bottommost y coordinate among the given words."""
    bottoms = (item["bottom"] for item in words)
    return max(bottoms)


def extract_metrics(page) -> Metrics:
    """Calibrate column/timeline geometry from a page's header row.

    Args:
        page: A pdfplumber page object.

    Returns:
        Metrics with the x anchors of the FZG/Von/Nach columns and the
        timeline, plus derived slot/hour widths and calibration offsets.

    Raises:
        ValueError: If a required header word or the hour row is missing.
    """
    words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)
    # The header row's vertical position can drift between daily PDFs, so instead
    # of a fixed y band we search for the line containing FZG/Von/Nach together.
    header_lines = group_words_by_line([w for w in words if w["top"] < 80], tol=2.0)
    header_words: List[Dict[str, Any]] = []
    for line in header_lines:
        texts = {w["text"] for w in line}
        if {"FZG", "Von", "Nach"}.issubset(texts):
            header_words = line
            break
    if not header_words:
        # Fallback: legacy fixed-band behavior (if the search above finds nothing).
        header_words = [w for w in words if 40 <= w["top"] <= 65]

    def find_word(txt: str, nth: int = 0) -> Dict[str, Any]:
        matches = [w for w in header_words if w["text"] == txt]
        if len(matches) <= nth:
            raise ValueError(f"Header word not found: {txt}[{nth}]")
        return sorted(matches, key=lambda x: x["x0"])[nth]

    fzg = find_word("FZG")
    von = find_word("Von")
    nach = find_word("Nach")
    hour_words = [w for w in header_words if re.fullmatch(r"\d{2}", w["text"])]
    hour_words = sorted(hour_words, key=lambda x: x["x0"])
    # Fail loudly instead of raising a cryptic IndexError when the hour row is absent.
    if not hour_words:
        raise ValueError("Header hour row not found (no two-digit hour labels)")
    # Assumes the leftmost hour label is the leading "00" — TODO confirm on all layouts.
    first_00 = hour_words[0]
    midnight_words = [w for w in hour_words if w["text"] == "00"]
    if not midnight_words:
        raise ValueError("Header hour row has no '00' label to anchor the timeline end")
    last_00 = midnight_words[-1]
    # 24 hours between the first and last "00" label -> 144 ten-minute slots.
    slot_w = (last_00["x0"] - first_00["x0"]) / 144.0
    hour_w = (last_00["x0"] - first_00["x0"]) / 24.0
    # Empirical calibration: the left edge of horizontal train-number text sits
    # roughly this far to the right of the true time position.
    train_text_offset_x = hour_w * 0.299  # ~8.23 px on this layout
    # Plain (non-dense) vertical train labels' x0 typically falls left of the true time column.
    vertical_train_x_offset = slot_w * 2.25
    return Metrics(
        fzg_x0=fzg["x0"],
        von_x0=von["x0"],
        timeline_x0=first_00["x0"],
        timeline_last00_x0=last_00["x0"],
        nach_x0=nach["x0"],
        hour_xs=[w["x0"] for w in hour_words],
        slot_w=slot_w,
        header_bottom=max(w["bottom"] for w in header_words),
        hour_w=hour_w,
        train_text_offset_x=train_text_offset_x,
        vertical_train_x_offset=vertical_train_x_offset,
    )


def words_in_box(words: List[Dict[str, Any]], x0: float, x1: float, y0: float, y1: float) -> List[Dict[str, Any]]:
    """Return words whose x0/top anchor lies inside the half-open box."""
    selected: List[Dict[str, Any]] = []
    for word in words:
        if not (x0 <= word["x0"] < x1):
            continue
        if y0 <= word["top"] < y1:
            selected.append(word)
    return selected


def chars_in_box(chars: List[Dict[str, Any]], x0: float, x1: float, y0: float, y1: float) -> List[Dict[str, Any]]:
    """Return chars whose x0/top anchor lies inside the half-open box."""
    def _inside(ch: Dict[str, Any]) -> bool:
        return x0 <= ch["x0"] < x1 and y0 <= ch["top"] < y1

    return [ch for ch in chars if _inside(ch)]


def extract_page_meta(words: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Parse date, weekday label and page numbering from the page header.

    The title header and the hour row can sit close together, so the specific
    line containing "Seite:" is located instead of using a fixed y band.
    Returns an empty dict when the header pattern cannot be matched.
    """
    header_txt = ""
    for candidate in group_words_by_line([w for w in words if w["top"] < 60], tol=2.0):
        rendered = line_text(candidate)
        if "Seite:" in rendered:
            header_txt = rendered
            break
    if not header_txt:
        # Fall back to the legacy fixed-band behavior.
        header_txt = line_text([w for w in words if w["top"] < 50])

    match = re.search(r"(\d{2}\.\d{2}\.\d{4})\s+\[(.+?)\].*Seite:\s*(\d+)/(\d+)", header_txt)
    if match is None:
        return {}
    return {
        "date_str": match.group(1),
        "weekday_label": match.group(2),
        "page_no": int(match.group(3)),
        "page_total": int(match.group(4)),
    }


def build_block_index(page_words: List[Dict[str, Any]], metrics: Metrics, page_height: float) -> List[Dict[str, Any]]:
    """Locate vehicle blocks on a page and compute their y windows.

    A block begins at a line whose left (FZG) column is a bare 3-5 digit
    vehicle number and is terminated by the next "Bemerkungen:" line; each
    remark line is paired with the nearest unused start line above it.

    Args:
        page_words: All words extracted from the page.
        metrics: Column geometry from extract_metrics().
        page_height: Used as the lower bound of the last block's remark area.

    Returns:
        One dict per block with the start/remark rows and the y ranges of the
        block body ("body_y0"/"body_y1") and remark area ("remark_y0"/"remark_y1").
    """
    lines = group_words_by_line([w for w in page_words if w["top"] > metrics.header_bottom + 1])
    line_rows: List[Dict[str, Any]] = []
    for lw in lines:
        txt = line_text(lw)
        line_rows.append(
            {
                "top": line_top(lw),
                "bottom": line_bottom(lw),
                "text": txt,
                "words": lw,
            }
        )

    start_lines: List[Dict[str, Any]] = []
    remark_lines: List[Dict[str, Any]] = []
    for row in line_rows:
        txt = row["text"]
        if "Bemerkungen:" in txt:
            remark_lines.append(row)
            continue
        # Lines containing " km" are distance annotations, not block starts.
        if " km" in txt:
            continue
        left_words = [w for w in row["words"] if w["x0"] < metrics.von_x0 - 2]
        if not left_words:
            continue
        left_text = line_text(left_words)
        # A bare 3-5 digit number in the FZG column marks a block start.
        if re.fullmatch(r"\d{3,5}", left_text):
            start_lines.append(row)

    start_lines = sorted(start_lines, key=lambda r: r["top"])
    remark_lines = sorted(remark_lines, key=lambda r: r["top"])

    blocks: List[Dict[str, Any]] = []
    used_starts = set()
    for remark_idx, remark in enumerate(remark_lines):
        # Pair this remark with the closest start line above it that is still unused.
        candidates = [s for i, s in enumerate(start_lines) if i not in used_starts and s["top"] < remark["top"]]
        if not candidates:
            continue
        start = candidates[-1]
        used_starts.add(start_lines.index(start))

        # The remark area extends down to the next block's start (or the page bottom).
        next_start = next((s for s in start_lines if s["top"] > remark["top"]), None)
        body_y0 = max(metrics.header_bottom + 1, start["top"] - 1)
        body_y1 = max(body_y0, remark["top"] - 1)
        remark_y0 = remark["top"] - 0.5
        remark_y1 = (next_start["top"] - 1) if next_start else (page_height - 1)

        blocks.append(
            {
                "seq_on_page": len(blocks) + 1,
                "start_row": start,
                "remark_row": remark,
                "body_y0": body_y0,
                "body_y1": body_y1,
                "remark_y0": remark_y0,
                "remark_y1": remark_y1,
            }
        )
    return blocks


def parse_triplet(lines: List[str]) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Map a column's text lines onto a (leading, middle, last) triple.

    The last non-empty line is always the final element, the one before it
    the middle element, and any earlier lines are joined into the leading
    element.  Missing positions are returned as None.
    """
    cleaned = [entry.strip() for entry in lines if entry and entry.strip()]
    count = len(cleaned)
    if count == 0:
        return None, None, None
    if count == 1:
        return None, None, cleaned[0]
    if count == 2:
        return None, cleaned[0], cleaned[1]
    return " ".join(cleaned[:-2]), cleaned[-2], cleaned[-1]


def parse_block_columns(page_words: List[Dict[str, Any]], block: Dict[str, Any], metrics: Metrics) -> Dict[str, Any]:
    """Split a vehicle block's body into FZG / Von / Nach column fields.

    Each body line is partitioned by x position into the three columns;
    the FZG column yields the vehicle number, rotation label and distance,
    while Von/Nach are decomposed via parse_triplet().
    """
    in_body = [w for w in page_words if block["body_y0"] <= w["top"] < block["body_y1"]]

    fzg_texts: List[str] = []
    von_texts: List[str] = []
    nach_texts: List[str] = []
    distance_km: Optional[int] = None

    for row in group_words_by_line(in_body):
        left = line_text([w for w in row if w["x0"] < metrics.von_x0 - 2])
        middle = line_text([w for w in row if metrics.von_x0 <= w["x0"] < metrics.timeline_x0 - 2])
        right = line_text([w for w in row if w["x0"] >= metrics.nach_x0])

        if left:
            fzg_texts.append(left)
            found = KM_RE.search(left)
            if found:
                distance_km = int(found.group("km"))
        if middle:
            von_texts.append(middle)
        if right:
            nach_texts.append(right)

    vehicle_no = next((t for t in fzg_texts if re.fullmatch(r"\d{3,5}", t)), None)
    rotation_raw = next((t for t in fzg_texts if re.fullmatch(r"\(.+\)", t)), None)
    rotation_no = rotation_raw.strip("()") if rotation_raw else None

    von_prev_train, von_prev_forda, von_location = parse_triplet(von_texts)
    nach_next_train, nach_next_forda, nach_location = parse_triplet(nach_texts)

    return {
        "vehicle_no": vehicle_no,
        "rotation_no": rotation_no,
        "distance_km": distance_km,
        "von_prev_train": von_prev_train,
        "von_prev_forda": von_prev_forda,
        "von_location": von_location,
        "nach_next_train": nach_next_train,
        "nach_next_forda": nach_next_forda,
        "nach_location": nach_location,
        "debug_fzg_lines": fzg_texts,
        "debug_von_lines": von_texts,
        "debug_nach_lines": nach_texts,
    }


def parse_remark(page_words: List[Dict[str, Any]], block: Dict[str, Any]) -> str:
    """Collect a block's remark ("Bemerkungen") text, label prefix removed."""
    in_window = [w for w in page_words if block["remark_y0"] <= w["top"] < block["remark_y1"]]
    rendered = [line_text(lw) for lw in group_words_by_line(in_window)]
    joined = "\n".join(t for t in rendered if t).strip()
    return re.sub(r"^Bemerkungen:\s*", "", joined).strip()


def cluster_single_char_stacks(
    words: List[Dict[str, Any]], excluded_tops: Optional[List[float]] = None
) -> List[Dict[str, Any]]:
    """Reassemble vertically printed labels from stacked single characters.

    Single alphanumeric/pipe glyphs are clustered into columns by x position;
    a column whose top-to-bottom concatenation looks like a train label
    (all digits, letter+digits, or ``X|nnnn``) is emitted as a
    ``vertical_train`` entry.  Words on rows listed in ``excluded_tops``
    (e.g. marker rows) are skipped.
    """
    skip_tops = excluded_tops or []

    def _eligible(w: Dict[str, Any]) -> bool:
        if len(w["text"]) != 1 or not re.fullmatch(r"[A-Za-z0-9|]", w["text"]):
            return False
        return all(abs(w["top"] - yt) > 1.2 for yt in skip_tops)

    columns: List[List[Dict[str, Any]]] = []
    for w in sorted((x for x in words if _eligible(x)), key=lambda x: (x["x0"], x["top"])):
        home = None
        for column in columns:
            center_x = sum(entry["x0"] for entry in column) / len(column)
            if abs(w["x0"] - center_x) <= 2.2:
                home = column
                break
        if home is None:
            columns.append([w])
        else:
            home.append(w)

    results: List[Dict[str, Any]] = []
    for column in columns:
        stack = sorted(column, key=lambda x: x["top"])
        if len(stack) < 2:
            continue
        label = "".join(ch["text"] for ch in stack)
        looks_like_train = (
            label.isdigit()
            or (label[0].isalpha() and label[1:].isdigit())
            or bool(re.fullmatch(r"[A-Z]\|\d{2,5}", label))
        )
        if not looks_like_train:
            continue
        results.append(
            {
                "kind": "vertical_train",
                "text": label,
                "x0": min(ch["x0"] for ch in stack),
                "x1": max(ch["x1"] for ch in stack),
                "top": min(ch["top"] for ch in stack),
                "bottom": max(ch["bottom"] for ch in stack),
                "parts": [ch["text"] for ch in stack],
                "part_keys": [(round(ch["x0"], 2), round(ch["top"], 2), ch["text"]) for ch in stack],
            }
        )
    return results


def extract_dense_vertical_matrix_trains(t_words: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], set]:
    """
    Special-case parser for rows like FZG 642 where train numbers are built from a dense matrix
    of stacked single digits (e.g. 2 / 1 / 2 -> 212) across many columns.

    Returns:
        (trains, consumed): reconstructed "vertical_dense_train" dicts, and the set of
        (rounded x0, rounded top, text) keys of the digit words that were merged, so
        callers can exclude them from the other label-reconstruction passes.
    """
    lines = group_words_by_line(t_words)
    # Candidate rows: long runs (>= 20 words) made up exclusively of single digits.
    dense_digit_lines: List[List[Dict[str, Any]]] = []
    for lw in lines:
        sw = sorted(lw, key=lambda x: x["x0"])
        if len(sw) < 20:
            continue
        if all(len(w["text"]) == 1 and w["text"].isdigit() for w in sw):
            dense_digit_lines.append(sw)

    # Need at least 2 rows (e.g. 11,12,13...) or 3 rows (e.g. 212,213...).
    if len(dense_digit_lines) < 2:
        return [], set()

    def align_run_columns(run: List[List[Dict[str, Any]]], tol_x: float = 2.0) -> List[List[Dict[str, Any]]]:
        """
        Align stacked digit rows by X position (not by raw index). This is more robust when the
        top digit row slightly overlaps the remark area and gains/loses a digit near the edges.
        """
        # Use the middle row of the run as the alignment base.
        base_idx = min(range(len(run)), key=lambda i: abs(i - (len(run) - 1) / 2))
        base = sorted(run[base_idx], key=lambda x: x["x0"])
        others = [sorted(r, key=lambda x: x["x0"]) for r in run]

        aligned_cols: List[List[Dict[str, Any]]] = []
        for b in base:
            col = [None] * len(run)  # type: ignore[list-item]
            col[base_idx] = b
            ok = True
            for ridx, row in enumerate(others):
                if ridx == base_idx:
                    continue
                candidates = [w for w in row if abs(w["x0"] - b["x0"]) <= tol_x]
                if not candidates:
                    # A column is only kept when every row contributes a digit.
                    ok = False
                    break
                # Prefer the closest x; tie-break on smaller vertical distance.
                chosen = sorted(
                    candidates,
                    key=lambda w: (abs(w["x0"] - b["x0"]), abs(w["top"] - b["top"]))
                )[0]
                col[ridx] = chosen
            if ok:
                aligned_cols.append([w for w in col if w is not None])

        # Deduplicate if multiple base points snapped to the same stack.
        dedup: List[List[Dict[str, Any]]] = []
        seen = set()
        for col in aligned_cols:
            key = tuple((round(w["x0"], 2), round(w["top"], 2), w["text"]) for w in col)
            if key in seen:
                continue
            seen.add(key)
            dedup.append(col)
        return dedup

    # Pick the best aligned run, preferring 3 rows over 2 rows and the one with the most matched columns.
    chosen = None
    chosen_cols: List[List[Dict[str, Any]]] = []
    for run_len in (3, 2):
        if len(dense_digit_lines) < run_len:
            continue
        best_for_len = None
        best_cols_for_len: List[List[Dict[str, Any]]] = []
        best_score = -1
        for i in range(len(dense_digit_lines) - run_len + 1):
            run = dense_digit_lines[i : i + run_len]
            cols = align_run_columns(run)
            if len(cols) < 2:
                continue
            # Score: matched column count first, then compact x-spread agreement.
            spread_penalty = 0.0
            for col in cols:
                xs = [w["x0"] for w in col]
                spread_penalty += (max(xs) - min(xs))
            score = len(cols) * 1000 - spread_penalty
            if score > best_score:
                best_score = score
                best_for_len = run
                best_cols_for_len = cols
        if best_for_len is not None:
            chosen = best_for_len
            chosen_cols = best_cols_for_len
            break

    if not chosen:
        return [], set()

    # Reject false positives: prefixed vertical train labels (e.g. D/F/C + 10xx) can produce a
    # 3-row digit run in the middle. In that case, a row of single uppercase letters is typically
    # aligned directly above the candidate digit columns.
    run_top = min(min(w["top"] for w in row) for row in chosen)
    uppercase_singles = [
        w for w in t_words
        if len(w["text"]) == 1 and w["text"].isalpha() and w["text"].isupper() and (run_top - 8.0) <= w["top"] <= (run_top + 1.0)
    ]
    if uppercase_singles and chosen_cols:
        aligned_upper = 0
        for col in chosen_cols:
            col_x = sum(w["x0"] for w in col) / len(col)
            if any(abs(u["x0"] - col_x) <= 2.0 for u in uppercase_singles):
                aligned_upper += 1
        # Bail out when enough columns have a letter directly above them.
        if aligned_upper >= max(4, int(len(chosen_cols) * 0.2)):
            return [], set()

    trains: List[Dict[str, Any]] = []
    consumed = set()
    for parts in sorted(chosen_cols, key=lambda col: min(p["x0"] for p in col)):
        # Mark every merged digit word so other passes skip it, even when the
        # concatenated text is rejected below.
        for p in parts:
            consumed.add((round(p["x0"], 2), round(p["top"], 2), p["text"]))
        txt = "".join(p["text"] for p in parts)
        if not txt.isdigit():
            continue
        trains.append(
            {
                "kind": "vertical_dense_train",
                "text": txt,
                "x0": min(p["x0"] for p in parts),
                "x1": max(p["x1"] for p in parts),
                "top": min(p["top"] for p in parts),
                "bottom": max(p["bottom"] for p in parts),
                "parts": [p["text"] for p in parts],
                "prefix": None,
                "train_no": txt,
            }
        )

    return trains, consumed


def parse_train_token(token_text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Normalize a train label into (raw_text, prefix, number).

    Accepts plain 2-5 digit numbers, letter-prefixed numbers ("D105") and
    pipe-separated forms ("D|105"), tolerating alternate pipe glyphs and
    stray spacing that PDF extraction sometimes produces.  Returns
    (None, None, None) when the token does not look like a train label.
    """
    stripped = (token_text or "").strip()
    if not stripped:
        return None, None, None

    # PDF text extraction sometimes uses alternate pipe glyphs or injects spaces around the separator.
    normalized = re.sub(r"\s*([|¦｜∣])\s*", r"|", stripped)

    if "|" in normalized:
        piped = re.fullmatch(r"([A-Z])\|(\d{2,5})", normalized)
        if piped:
            return normalized, piped.group(1), piped.group(2)
        # Fallback: keep arbitrary RHS if it is not purely numeric but still preserve prefix split.
        piped = re.fullmatch(r"([A-Z])\|(.+)", normalized)
        if piped:
            return normalized, piped.group(1), piped.group(2).strip()

    if normalized and normalized[0].isalpha() and normalized[1:].isdigit() and len(normalized) >= 2:
        return normalized, normalized[0], normalized[1:]
    if re.fullmatch(r"\d{2,5}", normalized):
        return normalized, None, normalized
    return None, None, None


def color_name(color: Optional[Tuple[float, ...]]) -> Optional[str]:
    """Map a normalized color tuple to a semantic label, or an rgb: fallback."""
    if color is None:
        return None
    named = (
        (GREEN_COLORS, "green"),
        (RED_BAND_COLORS, "red"),
        (IGNORE_BORDER_RED, "red_border"),
    )
    for palette, label in named:
        if color in palette:
            return label
    return f"rgb:{','.join(str(v) for v in color)}"


def slot_index_from_x(x: float, metrics: Metrics) -> int:
    """Convert an x coordinate into a 10-minute timeline slot index (0..143)."""
    raw = (x - metrics.timeline_x0) / metrics.slot_w
    return int(round(raw)) % 144


def precise_minute_from_train_x(x: float, metrics: Metrics, train_kind: str) -> int:
    """Convert a train label's x position to a minute of day (0..1439).

    Horizontal train labels are drawn slightly right of the true time, so
    their calibrated text offset is subtracted first.
    """
    shift = 0.0
    if train_kind == "horizontal_train":
        shift = metrics.train_text_offset_x
    relative = x - metrics.timeline_x0 - shift
    return int(round((relative / metrics.hour_w) * 60)) % 1440


def minute_from_timeline_x(x: float, metrics: Metrics) -> int:
    """Convert a raw timeline x coordinate to a minute of day (0..1439)."""
    relative = x - metrics.timeline_x0
    return int(round((relative / metrics.hour_w) * 60)) % 1440


def duration_minutes_from_x_span(x0: float, x1: float, metrics: Metrics) -> int:
    """Convert a horizontal span into whole minutes, clamped to [0, 1439].

    Spans wider than half a point are never reported as zero minutes.
    """
    span = max(0.0, x1 - x0)
    minutes = int(round((span / metrics.hour_w) * 60))
    if minutes <= 0 and span > 0.5:
        minutes = 1
    return max(0, min(minutes, 1439))


def extract_timeline(page_words: List[Dict[str, Any]], page_chars: List[Dict[str, Any]], page_rects: List[Dict[str, Any]], block: Dict[str, Any], metrics: Metrics) -> Dict[str, Any]:
    timeline_clip_x0 = metrics.timeline_x0 - max(12.0, metrics.slot_w * 3)
    # The topmost character of vertical train labels can slightly overlap upward; leave extra headroom.
    timeline_clip_y0 = max(metrics.header_bottom - 0.5, block["body_y0"] - 10.0)
    t_words = [
        w for w in page_words
        if timeline_clip_y0 <= w["top"] < block["body_y1"]
        and timeline_clip_x0 <= w["x0"] < metrics.nach_x0
    ]
    t_chars = chars_in_box(page_chars, timeline_clip_x0, metrics.nach_x0, timeline_clip_y0, block["body_y1"])

    colored_rects = []
    for r in page_rects:
        c = norm_color(r.get("non_stroking_color"))
        if c not in (GREEN_COLORS | RED_BAND_COLORS):
            continue
        if r["height"] < 2 or r["width"] < 2:
            continue
        if not (timeline_clip_y0 <= r["top"] < block["body_y1"] + 4):
            continue
        if r["x1"] <= metrics.timeline_x0 or r["x0"] >= metrics.nach_x0:
            continue
        colored_rects.append(
            {
                "x0": r["x0"],
                "x1": r["x1"],
                "top": r["top"],
                "bottom": r["bottom"],
                "color": c,
                "color_name": color_name(c),
            }
        )

    dense_vertical_trains, dense_consumed = extract_dense_vertical_matrix_trains(t_words)

    def wkey(w: Dict[str, Any]) -> Tuple[float, float, str]:
        return (round(w["x0"], 2), round(w["top"], 2), w["text"])

    horizontal_trains = []
    for w in t_words:
        if wkey(w) in dense_consumed:
            continue
        txt = w["text"]
        if len(txt) == 1:
            continue
        raw, prefix, num = parse_train_token(txt)
        if raw is None:
            continue
        # Exclude station-like all-caps codes that accidentally match regex
        if UPPER_CODE_RE.fullmatch(txt):
            continue
        horizontal_trains.append(
            {
                "kind": "horizontal_train",
                "text": raw,
                "prefix": prefix,
                "train_no": num,
                "x0": w["x0"],
                "x1": w["x1"],
                "top": w["top"],
                "bottom": w["bottom"],
            }
        )

    # Marker-only rows contain many single-digit coupling-order markers (often 2/4 alternating).
    # Exclude the whole row from vertical train reconstruction so they do not get merged into labels.
    marker_heavy_tops: List[float] = []
    grouped_non_dense = group_words_by_line([w for w in t_words if wkey(w) not in dense_consumed])
    line_infos = []
    for lw in grouped_non_dense:
        top_avg = sum(w["top"] for w in lw) / len(lw)
        line_infos.append(
            {
                "top": top_avg,
                "is_digit_only": len(lw) >= 4 and all(len(w["text"]) == 1 and w["text"].isdigit() for w in lw),
                "has_station_like": any(UPPER_CODE_RE.fullmatch(w["text"]) for w in lw),
            }
        )
    line_infos = sorted(line_infos, key=lambda x: x["top"])
    # Marker row is typically the digit-only row directly above the station-code row (KLO/DAV/SELF/...).
    for i, li in enumerate(line_infos):
        if not li["is_digit_only"]:
            continue
        next_line = line_infos[i + 1] if i + 1 < len(line_infos) else None
        if next_line and next_line["has_station_like"] and (next_line["top"] - li["top"]) <= 8.5:
            marker_heavy_tops.append(li["top"])

    vertical_trains = []
    for v in cluster_single_char_stacks([w for w in t_words if wkey(w) not in dense_consumed], excluded_tops=marker_heavy_tops):
        raw, prefix, num = parse_train_token(v["text"])
        if raw is None:
            continue
        vertical_trains.append(
            {
                **v,
                "text": raw,
                "prefix": prefix,
                "train_no": num,
            }
        )

    trains = horizontal_trains[:] + dense_vertical_trains[:]
    vertical_train_part_keys = set()
    for vt in vertical_trains:
        for pk in vt.get("part_keys", []):
            vertical_train_part_keys.add(pk)
        dup = any(abs(vt["x0"] - ht["x0"]) <= 3 and abs(vt["top"] - ht["top"]) <= 8 for ht in trains)
        if not dup:
            trains.append(vt)
    trains.sort(key=lambda t: (t["x0"], t["top"]))

    station_tokens = [
        w for w in t_words
        if wkey(w) not in dense_consumed
        if UPPER_CODE_RE.fullmatch(w["text"])
    ]
    marker_tokens = [
        w for w in t_words
        if wkey(w) not in dense_consumed
        and wkey(w) not in vertical_train_part_keys
        and re.fullmatch(r"[1-9]", w["text"])
    ]

    lower_annotations = []
    for w in t_words:
        txt = w["text"]
        if wkey(w) in dense_consumed:
            continue
        if any(w is s for s in station_tokens):
            continue
        if txt == "2":
            continue
        if any(abs(w["x0"] - t["x0"]) <= 1 and abs(w["top"] - t["top"]) <= 1 and txt == t["text"] for t in horizontal_trains):
            continue
        if re.search(r"[a-z]", txt) or txt in {"-", "/"}:
            lower_annotations.append(w)

    slots = []
    used_station_ids = set()
    used_marker_ids = set()

    def wid(w: Dict[str, Any]) -> Tuple[float, float, str]:
        return (round(w["x0"], 2), round(w["top"], 2), w["text"])

    for idx, tr in enumerate(trains):
        # Start station: prefer uppercase station token aligned with the train/event and below it.
        station_candidates = []
        vertical_like = tr["kind"] in {"vertical_train", "vertical_dense_train"}
        dx_left_max = 18 if vertical_like else 45
        fallback_dx_max = 16 if vertical_like else 35
        if tr["kind"] == "vertical_dense_train":
            for s in station_tokens:
                if wid(s) in used_station_ids:
                    continue
                dx = abs(tr["x0"] - s["x0"])
                dy = abs(s["top"] - tr["bottom"])
                if dx <= 2.0 and dy <= 40:
                    station_candidates.append((dx + dy * 0.1, s))
        for s in station_tokens:
            if wid(s) in used_station_ids:
                continue
            dx = tr["x0"] - s["x0"]
            if dx < -5 or dx > dx_left_max:
                continue
            dy_top = s["top"] - tr["top"]
            dy_bottom = s["top"] - tr["bottom"]
            if -3 <= dy_top <= 40 or (-2 <= dy_bottom <= 28):
                station_candidates.append((abs(dx) + max(0, min(dy_top, max(dy_bottom, 0))) * 0.4, s))
        if not station_candidates:
            for s in station_tokens:
                if wid(s) in used_station_ids:
                    continue
                dx = abs((tr["x0"] + tr["x1"]) / 2 - (s["x0"] + s["x1"]) / 2)
                dy = min(abs(s["top"] - tr["top"]), abs(s["top"] - tr["bottom"]))
                if dx <= fallback_dx_max and dy <= 34:
                    station_candidates.append((dx + dy * 0.5 + 100, s))
        station = sorted(station_candidates, key=lambda x: x[0])[0][1] if station_candidates else None

        marker = None
        marker_candidates = []
        if tr["kind"] != "vertical_dense_train":
            for m in marker_tokens:
                key = wid(m)
                if key in used_marker_ids:
                    continue
                if key in vertical_train_part_keys:
                    continue
                if tr["kind"] == "vertical_train":
                    dx = abs(m["x0"] - tr["x0"])
                    dy = m["top"] - tr["bottom"]
                    if dx <= 6 and -1 <= dy <= 10:
                        marker_candidates.append((dx + max(0, dy) * 0.4, m))
                else:
                    dx = abs(m["x0"] - tr["x0"])
                    dy = m["top"] - tr["top"]
                    if dx <= 28 and 0 <= dy <= 18:
                        marker_candidates.append((dx + dy * 0.5, m))
            if marker_candidates:
                marker = sorted(marker_candidates, key=lambda x: x[0])[0][1]
                used_marker_ids.add(wid(marker))

        if tr["kind"] == "vertical_train":
            anchor_x = (tr["x0"] + tr["x1"]) / 2 + metrics.vertical_train_x_offset
        elif vertical_like:
            anchor_x = (tr["x0"] + tr["x1"]) / 2
        else:
            # Horizontal train timing is more stable from the train label than the station label.
            anchor_x = tr["x0"]
        slot_idx = slot_index_from_x(anchor_x, metrics)
        if tr["kind"] == "vertical_dense_train":
            start_minute = slot_idx * 10
        elif tr["kind"] == "vertical_train":
            start_minute = precise_minute_from_train_x(tr["x0"] + metrics.vertical_train_x_offset, metrics, tr["kind"])
        else:
            time_x = anchor_x if vertical_like else tr["x0"]
            start_minute = precise_minute_from_train_x(time_x, metrics, tr["kind"])

        next_train_x = trains[idx + 1]["x0"] if idx + 1 < len(trains) else metrics.nach_x0 - 1
        next_word_x = min(
            [w["x0"] for w in t_words if w["x0"] > tr["x1"] + 1 and abs(w["top"] - tr["top"]) <= 28] or [metrics.nach_x0 - 1]
        )
        event_x0 = min(anchor_x, tr["x0"])
        if tr["kind"] == "vertical_train":
            local_cap = min(next_train_x - 2, next_word_x - 2, tr["x1"] + metrics.slot_w * 1.8)
            event_x1 = max(local_cap, tr["x1"])
        else:
            local_cap = min(next_train_x - 2, next_word_x - 2)
            # If no natural boundary was found, do not stretch the event to the right page edge.
            if next_train_x >= metrics.nach_x0 - 2 and next_word_x >= metrics.nach_x0 - 2:
                local_cap = tr["x1"] + metrics.hour_w * 2.0
            event_x1 = max(local_cap, tr["x1"])

        # Refine station assignment using the computed event window (helps dense/vertical rows).
        if station is None:
            event_station_candidates = []
            for s in station_tokens:
                if wid(s) in used_station_ids:
                    continue
                sx = (s["x0"] + s["x1"]) / 2
                overlapish = (event_x0 - 8) <= sx <= (event_x1 + 8)
                if not overlapish:
                    continue
                dyb = s["top"] - tr["bottom"]
                dyt = s["top"] - tr["top"]
                if vertical_like:
                    if -2 <= dyb <= 50:
                        event_station_candidates.append((abs(sx - event_x0) * 0.2 + max(0, dyb) * 0.2, s))
                else:
                    if -2 <= dyb <= 34 or 0 <= dyt <= 46:
                        event_station_candidates.append((abs(sx - event_x0) * 0.2 + max(0, dyb) * 0.25, s))
            if event_station_candidates:
                station = sorted(event_station_candidates, key=lambda x: x[0])[0][1]
        if station is None and not vertical_like:
            # Last-resort fallback for sparse shunting rows (R1/R3/K4...), allow wider search and token reuse.
            relaxed_candidates = []
            for s in station_tokens:
                sx = (s["x0"] + s["x1"]) / 2
                dxe = min(abs(sx - event_x0), abs(sx - event_x1))
                dyb = abs(s["top"] - tr["bottom"])
                if dxe <= 110 and dyb <= 70:
                    relaxed_candidates.append((dxe + dyb * 0.6 + (20 if wid(s) in used_station_ids else 0), s))
            if relaxed_candidates:
                station = sorted(relaxed_candidates, key=lambda x: x[0])[0][1]
        if station:
            used_station_ids.add(wid(station))

        slot_colors = []
        for r in colored_rects:
            overlap = min(event_x1, r["x1"]) - max(event_x0, r["x0"])
            if overlap > 1:
                slot_colors.append({"color": r["color_name"], "x0": r["x0"], "x1": r["x1"], "rect": r})
        primary_color = None
        if slot_colors:
            primary_color = max(slot_colors, key=lambda c: min(event_x1, c["x1"]) - max(event_x0, c["x0"]))["color"]

        duration_min = duration_minutes_from_x_span(event_x0, event_x1, metrics)
        end_minute = (start_minute + duration_min) % 1440 if duration_min > 0 else start_minute
        slots.append(
            {
                "slot_index": slot_idx,
                "start_minute": start_minute,
                "start_hhmm": f"{start_minute // 60:02d}:{start_minute % 60:02d}",
                "end_minute": end_minute,
                "end_hhmm": f"{end_minute // 60:02d}:{end_minute % 60:02d}",
                "station_code": station["text"] if station else None,
                "train_prefix": tr.get("prefix"),
                "train_no": tr.get("train_no"),
                "raw_train_token": tr["text"],
                # A csíkban megjelenő szám a kapcsolt gépek sorrendjét jelzi (1, 2, ...).
                "strip_marker": int(marker["text"]) if marker is not None and str(marker.get("text", "")).isdigit() else None,
                "color": primary_color,
                "x_anchor": anchor_x,
                "x_train": tr["x0"],
                "x_train_end": tr["x1"],
                "x_event0": event_x0,
                "x_event1": event_x1,
                "colors": [c["color"] for c in slot_colors],
                "train_kind": tr["kind"],
            }
        )

    # Remaining colored segments that could not be attached to any train slot (e.g. repair blocks)
    used_rect_ids = set()
    for s in slots:
        for r in colored_rects:
            overlap = min(s["x_event1"], r["x1"]) - max(s["x_event0"], r["x0"])
            if overlap > 1:
                used_rect_ids.add((round(r["x0"], 1), round(r["x1"], 1), round(r["top"], 1)))
    orphan_bands = [
        r for r in colored_rects
        if (round(r["x0"], 1), round(r["x1"], 1), round(r["top"], 1)) not in used_rect_ids
    ]

    # Build human-readable annotation rows for debug
    ann_lines = []
    for lw in group_words_by_line(lower_annotations):
        txt = line_text(lw)
        if not txt:
            continue
        if IGNORED_TIMELINE_ANNOTATION_RE.search(txt):
            continue
        ann_lines.append(txt)

    def band_text_for_rect(rect: Dict[str, Any]) -> Optional[str]:
        band_words = [
            w for w in t_words
            if (rect["x0"] - 2) <= w["x0"] <= (rect["x1"] + 2)
            and (rect["top"] - 3) <= w["top"] <= (rect["bottom"] + 3)
            and w["text"] != "2"
        ]
        if not band_words:
            return None
        lines = []
        for lw in group_words_by_line(band_words):
            txt = line_text(lw)
            if not txt:
                continue
            # Színes sávon belüli label lehet vegyes (pl. "Reparatur - LQ")
            lines.append(txt)
        if not lines:
            return None
        return " | ".join(lines)

    synthetic_slots = []
    for r in orphan_bands:
        slot_idx = slot_index_from_x(r["x0"], metrics)
        # A sávok a grid-balhatárhoz igazodnak; a vertikális offset itt közelebb visz a vizuális kezdőoszlophoz.
        minute = precise_minute_from_train_x(r["x0"] + metrics.vertical_train_x_offset, metrics, "vertical_train")
        duration_min = duration_minutes_from_x_span(r["x0"], r["x1"], metrics)
        end_minute = (minute + duration_min) % 1440 if duration_min > 0 else minute
        band_label = band_text_for_rect(r)
        synthetic_slots.append(
            {
                "slot_index": slot_idx,
                "start_minute": minute,
                "start_hhmm": f"{minute // 60:02d}:{minute % 60:02d}",
                "end_minute": end_minute,
                "end_hhmm": f"{end_minute // 60:02d}:{end_minute % 60:02d}",
                "station_code": None,
                "train_prefix": None,
                "train_no": None,
                "raw_train_token": band_label,
                "strip_marker": None,
                "color": r["color_name"],
                "x_anchor": r["x0"],
                "x_train": r["x0"],
                "x_train_end": r["x1"],
                "x_event0": r["x0"],
                "x_event1": r["x1"],
                "colors": [r["color_name"]],
                "train_kind": "band_only",
            }
        )

    return {
        "slots": sorted(slots + synthetic_slots, key=lambda s: (s["slot_index"], s["x_train"])),
        "bands": colored_rects,
        "orphan_bands": orphan_bands,
        "annotations": ann_lines,
        "char_count_timeline": len(t_chars),
    }


def load_mysql_config(config_path: str) -> Dict[str, Any]:
    """Extract MySQL connection settings from a PHP-style config file.

    Scans the raw file text for ``'key' => 'value'`` (string) or
    ``'key' => 123`` (numeric) pairs, refuses any non-MySQL driver and
    fills in sensible defaults for missing keys.
    """
    text = Path(config_path).read_text(encoding="utf-8")
    cfg: Dict[str, Any] = {}
    for name in ("host", "port", "dbname", "user", "pass", "charset", "socket", "driver"):
        # String form wins over the numeric form when both would match.
        quoted = re.search(rf"'{re.escape(name)}'\s*=>\s*'([^']*)'", text)
        if quoted:
            cfg[name] = quoted.group(1)
            continue
        bare = re.search(rf"'{re.escape(name)}'\s*=>\s*(\d+)", text)
        if bare:
            cfg[name] = int(bare.group(1))
    if cfg.get("driver", "mysql") != "mysql":
        raise RuntimeError("Only MySQL is supported (db.driver must be 'mysql').")
    defaults: Dict[str, Any] = {
        "host": "127.0.0.1",
        "port": 3306,
        "dbname": "",
        "user": "",
        "pass": "",
        "charset": "utf8mb4",
    }
    for name, value in defaults.items():
        cfg.setdefault(name, value)
    return cfg


def mysql_connect(cfg: Dict[str, Any]):
    """Open a MySQL connection, preferring mysql-connector over PyMySQL.

    Autocommit is disabled so the caller controls transaction boundaries.
    Raises RuntimeError with install instructions when neither driver can
    be imported.
    """
    try:
        import mysql.connector  # type: ignore
        options: Dict[str, Any] = {
            "host": cfg["host"],
            "port": int(cfg["port"]),
            "database": cfg["dbname"],
            "user": cfg["user"],
            "password": cfg["pass"],
            "charset": cfg.get("charset", "utf8mb4"),
            "use_unicode": True,
            "autocommit": False,
        }
        # Unix-socket connection takes precedence when configured.
        if cfg.get("socket"):
            options["unix_socket"] = cfg["socket"]
        return mysql.connector.connect(**options)
    except ImportError:
        pass
    try:
        import pymysql  # type: ignore
    except ImportError as exc:
        raise RuntimeError(
            "MySQL driver missing. Install one of these manually:\n"
            "python3 -m pip install --user --break-system-packages PyMySQL\n"
            "python3 -m pip install --user --break-system-packages mysql-connector-python"
        ) from exc
    return pymysql.connect(
        host=cfg["host"],
        port=int(cfg["port"]),
        db=cfg["dbname"],
        user=cfg["user"],
        password=cfg["pass"],
        charset=cfg.get("charset", "utf8mb4"),
        unix_socket=cfg.get("socket") or None,
        autocommit=False,
    )


def create_schema(conn) -> None:
    """Create all schedule tables if they do not exist yet (idempotent).

    Rows hanging off ``schedule_day`` (page -> duty -> slot/band/annotation)
    all cascade on delete, so dropping a day removes its whole import.
    ``station`` / ``locomotive_type`` references deliberately do NOT
    cascade; orphan cleanup is handled by separate functions.
    """
    ddls = [
        # One row per imported PDF file (day-level metadata).
        """
        CREATE TABLE IF NOT EXISTS schedule_day (
            id INT AUTO_INCREMENT PRIMARY KEY,
            source_pdf VARCHAR(255) NOT NULL,
            service_date VARCHAR(32) NULL,
            weekday_label VARCHAR(32) NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Pages belonging to a day; cascades away with its day.
        """
        CREATE TABLE IF NOT EXISTS schedule_page (
            id INT AUTO_INCREMENT PRIMARY KEY,
            day_id INT NOT NULL,
            page_number INT NOT NULL,
            page_total INT NULL,
            width DOUBLE NULL,
            height DOUBLE NULL,
            CONSTRAINT fk_schedule_page_day FOREIGN KEY (day_id) REFERENCES schedule_day(id) ON DELETE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Master data: locomotive types, referenced from locomotive.
        """
        CREATE TABLE IF NOT EXISTS locomotive_type (
            id INT AUTO_INCREMENT PRIMARY KEY,
            code VARCHAR(64) NULL UNIQUE,
            name VARCHAR(255) NOT NULL,
            sort_order INT NOT NULL DEFAULT 0,
            active TINYINT(1) NOT NULL DEFAULT 1,
            notes TEXT NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Vehicles seen in duties, keyed by their vehicle number.
        """
        CREATE TABLE IF NOT EXISTS locomotive (
            vehicle_no VARCHAR(64) PRIMARY KEY,
            locomotive_type_id INT NULL,
            locomotive_type VARCHAR(255) NULL,
            notes TEXT NULL,
            CONSTRAINT fk_locomotive_type FOREIGN KEY (locomotive_type_id) REFERENCES locomotive_type(id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Station master data, populated from codes found in the PDF.
        """
        CREATE TABLE IF NOT EXISTS station (
            id INT AUTO_INCREMENT PRIMARY KEY,
            code VARCHAR(64) NOT NULL UNIQUE,
            name VARCHAR(255) NULL,
            sort_order INT NOT NULL DEFAULT 0,
            active TINYINT(1) NOT NULL DEFAULT 1,
            notes TEXT NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # One locomotive duty block on a page (von/nach columns + debug JSON).
        """
        CREATE TABLE IF NOT EXISTS duty (
            id INT AUTO_INCREMENT PRIMARY KEY,
            page_id INT NOT NULL,
            seq_on_page INT NOT NULL,
            vehicle_no VARCHAR(64) NULL,
            rotation_no VARCHAR(64) NULL,
            distance_km INT NULL,
            von_prev_train VARCHAR(64) NULL,
            von_prev_forda VARCHAR(64) NULL,
            von_location VARCHAR(64) NULL,
            von_station_id INT NULL,
            nach_next_train VARCHAR(64) NULL,
            nach_next_forda VARCHAR(64) NULL,
            nach_location VARCHAR(64) NULL,
            nach_station_id INT NULL,
            remark TEXT NULL,
            body_y0 DOUBLE NULL,
            body_y1 DOUBLE NULL,
            debug_fzg_lines_json LONGTEXT NULL,
            debug_von_lines_json LONGTEXT NULL,
            debug_nach_lines_json LONGTEXT NULL,
            CONSTRAINT fk_duty_page FOREIGN KEY (page_id) REFERENCES schedule_page(id) ON DELETE CASCADE,
            CONSTRAINT fk_duty_von_station FOREIGN KEY (von_station_id) REFERENCES station(id),
            CONSTRAINT fk_duty_nach_station FOREIGN KEY (nach_station_id) REFERENCES station(id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Timed events on a duty's timeline, with raw x-geometry for debugging.
        """
        CREATE TABLE IF NOT EXISTS duty_slot (
            id INT AUTO_INCREMENT PRIMARY KEY,
            duty_id INT NOT NULL,
            slot_index INT NOT NULL,
            start_minute INT NOT NULL,
            start_hhmm VARCHAR(16) NOT NULL,
            end_minute INT NULL,
            end_hhmm VARCHAR(16) NULL,
            station_code VARCHAR(64) NULL,
            station_id INT NULL,
            train_prefix VARCHAR(16) NULL,
            train_no VARCHAR(64) NULL,
            raw_train_token VARCHAR(255) NULL,
            strip_marker INT NULL,
            color VARCHAR(64) NULL,
            x_anchor DOUBLE NULL,
            x_event0 DOUBLE NULL,
            x_event1 DOUBLE NULL,
            x_train DOUBLE NULL,
            x_train_end DOUBLE NULL,
            train_kind VARCHAR(64) NULL,
            debug_colors_json LONGTEXT NULL,
            CONSTRAINT fk_duty_slot_duty FOREIGN KEY (duty_id) REFERENCES duty(id) ON DELETE CASCADE,
            CONSTRAINT fk_duty_slot_station FOREIGN KEY (station_id) REFERENCES station(id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Colored background bands of the timeline; orphan=1 marks bands not attached to a slot.
        """
        CREATE TABLE IF NOT EXISTS duty_timeline_band (
            id INT AUTO_INCREMENT PRIMARY KEY,
            duty_id INT NOT NULL,
            color VARCHAR(64) NOT NULL,
            x0 DOUBLE NOT NULL,
            x1 DOUBLE NOT NULL,
            y0 DOUBLE NOT NULL,
            y1 DOUBLE NOT NULL,
            orphan TINYINT(1) NOT NULL DEFAULT 0,
            CONSTRAINT fk_duty_timeline_band_duty FOREIGN KEY (duty_id) REFERENCES duty(id) ON DELETE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
        # Free-text annotation lines found below a duty's timeline.
        """
        CREATE TABLE IF NOT EXISTS duty_annotation (
            id INT AUTO_INCREMENT PRIMARY KEY,
            duty_id INT NOT NULL,
            text TEXT NOT NULL,
            CONSTRAINT fk_duty_annotation_duty FOREIGN KEY (duty_id) REFERENCES duty(id) ON DELETE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """,
    ]
    cur = conn.cursor()
    for ddl in ddls:
        cur.execute(ddl)
    cur.close()


def insert_day(conn, pdf_path: str, meta: Dict[str, Any]) -> int:
    """Insert one schedule_day row and return its auto-generated id."""
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO schedule_day (source_pdf, service_date, weekday_label) VALUES (%s, %s, %s)",
        (pdf_path, meta.get("date_str"), meta.get("weekday_label")),
    )
    new_id = int(cursor.lastrowid)
    cursor.close()
    return new_id


def sync_locomotive_master(conn) -> None:
    """Ensure every vehicle number referenced by a duty exists in locomotive."""
    statement = """
        INSERT IGNORE INTO locomotive (vehicle_no)
        SELECT DISTINCT vehicle_no
        FROM duty
        WHERE vehicle_no IS NOT NULL AND TRIM(vehicle_no) <> ''
        """
    cursor = conn.cursor()
    cursor.execute(statement)
    cursor.close()


def cleanup_orphan_locomotives(conn) -> None:
    """Delete locomotive rows no longer referenced by any duty.

    Safe against NULL pitfalls of NOT IN: the subquery filters NULLs out.
    """
    statement = """
        DELETE FROM locomotive
        WHERE vehicle_no NOT IN (
            SELECT DISTINCT vehicle_no
            FROM duty
            WHERE vehicle_no IS NOT NULL AND TRIM(vehicle_no) <> ''
        )
        """
    cursor = conn.cursor()
    cursor.execute(statement)
    cursor.close()


def sync_station_master(conn) -> None:
    """Collect station codes from duty_slot/duty into station, backfill names.

    Only all-uppercase alphanumeric codes are accepted; a station's name
    defaults to its code when empty.
    """
    statements = (
        """
        INSERT IGNORE INTO station (code, name)
        SELECT code, code
        FROM (
            SELECT DISTINCT TRIM(station_code) AS code
            FROM duty_slot
            WHERE station_code IS NOT NULL AND TRIM(station_code) <> ''
            UNION
            SELECT DISTINCT TRIM(von_location) AS code
            FROM duty
            WHERE von_location IS NOT NULL AND TRIM(von_location) <> ''
            UNION
            SELECT DISTINCT TRIM(nach_location) AS code
            FROM duty
            WHERE nach_location IS NOT NULL AND TRIM(nach_location) <> ''
        ) src
        WHERE code REGEXP '^[A-Z0-9]+$'
        """,
        """
        UPDATE station
        SET name = code
        WHERE (name IS NULL OR TRIM(name) = '')
        """,
    )
    cursor = conn.cursor()
    for statement in statements:
        cursor.execute(statement)
    cursor.close()


def assign_station_foreign_keys(conn) -> None:
    """Resolve textual station codes into station.id foreign keys.

    Reads the station master table once, then runs the three code->id
    UPDATEs (duty_slot, duty.von, duty.nach) for every known code.
    """
    cursor = conn.cursor()
    cursor.execute("SELECT id, code FROM station")
    rows = cursor.fetchall()
    cursor.close()

    lookup: Dict[str, int] = {}
    for station_id, code in rows:
        if code is None:
            continue
        trimmed = str(code).strip()
        if trimmed:
            lookup[trimmed] = int(station_id)
    if not lookup:
        return

    updates = (
        "UPDATE duty_slot SET station_id = %s WHERE station_code IS NOT NULL AND TRIM(station_code) = %s",
        "UPDATE duty SET von_station_id = %s WHERE von_location IS NOT NULL AND TRIM(von_location) = %s",
        "UPDATE duty SET nach_station_id = %s WHERE nach_location IS NOT NULL AND TRIM(nach_location) = %s",
    )
    cursor = conn.cursor()
    for code, station_id in lookup.items():
        for statement in updates:
            cursor.execute(statement, (station_id, code))
    cursor.close()


def cleanup_orphan_stations(conn) -> None:
    """Delete station rows not referenced by any duty_slot or duty FK."""
    statement = """
        DELETE FROM station
        WHERE id NOT IN (
            SELECT DISTINCT station_id FROM duty_slot WHERE station_id IS NOT NULL
            UNION
            SELECT DISTINCT von_station_id FROM duty WHERE von_station_id IS NOT NULL
            UNION
            SELECT DISTINCT nach_station_id FROM duty WHERE nach_station_id IS NOT NULL
        )
        """
    cursor = conn.cursor()
    cursor.execute(statement)
    cursor.close()


def reset_all_schedule_data(conn) -> None:
    """Truncate every schedule table (children first).

    Foreign-key checks are disabled around the TRUNCATEs because MySQL
    refuses to truncate tables referenced by other tables' foreign keys.
    Fix over the original: the checks are re-enabled in a ``finally``
    block, so a failed TRUNCATE can no longer leave the session with FK
    enforcement silently off, and the cursor is always closed.
    """
    tables = ["duty_annotation", "duty_timeline_band", "duty_slot", "duty", "schedule_page", "schedule_day"]
    cur = conn.cursor()
    try:
        cur.execute("SET FOREIGN_KEY_CHECKS=0")
        try:
            for table in tables:
                cur.execute(f"TRUNCATE TABLE {table}")
        finally:
            cur.execute("SET FOREIGN_KEY_CHECKS=1")
    finally:
        cur.close()


def delete_existing_day_by_source_pdf(conn, source_pdf: str) -> None:
    """Remove every schedule_day row for this PDF; FK cascades drop children."""
    cursor = conn.cursor()
    cursor.execute("SELECT id FROM schedule_day WHERE source_pdf = %s", (source_pdf,))
    matching_ids = [int(row[0]) for row in cursor.fetchall()]
    for existing_id in matching_ids:
        cursor.execute("DELETE FROM schedule_day WHERE id = %s", (existing_id,))
    cursor.close()


def debug_print_duty(page_no: int, duty_data: Dict[str, Any], timeline: Dict[str, Any]) -> None:
    """Print a human-readable dump of one parsed duty block (debug mode)."""
    header = (
        f"[DEBUG] p{page_no} duty#{duty_data['seq_on_page']:02d} "
        f"FZG={duty_data.get('vehicle_no')} forda={duty_data.get('rotation_no')} "
        f"km={duty_data.get('distance_km')} "
        f"von=({duty_data.get('von_prev_train')},{duty_data.get('von_prev_forda')},{duty_data.get('von_location')}) "
        f"nach=({duty_data.get('nach_next_train')},{duty_data.get('nach_next_forda')},{duty_data.get('nach_location')})"
    )
    print(header)
    if duty_data.get("remark"):
        print(f"        remark: {duty_data['remark']}")
    for ann in timeline["annotations"]:
        print(f"        annotation: {ann}")
    for slot in timeline["slots"]:
        line = (
            f"        slot {slot['start_hhmm']} idx={slot['slot_index']:03d} "
            f"end={slot.get('end_hhmm') or '-'} "
            f"st={slot['station_code'] or '-'} train={slot['raw_train_token'] or '-'} "
            f"prefix={slot['train_prefix'] or '-'} no={slot['train_no'] or '-'} "
            f"strip={slot['strip_marker'] or '-'} color={slot['color'] or '-'} "
            f"kind={slot['train_kind']}"
        )
        print(line)
    for band in timeline["orphan_bands"]:
        print(
            f"        orphan_band color={band['color_name']} "
            f"x={band['x0']:.1f}-{band['x1']:.1f} y={band['top']:.1f}-{band['bottom']:.1f}"
        )


def convert_pdf(
    pdf_path: str,
    mysql_config_path: str,
    debug: bool = True,
    limit_pages: Optional[int] = None,
    append: bool = False,
) -> None:
    """Parse a Lokdienst PDF and load it into the MySQL schedule schema.

    Pipeline: open DB + create schema, optionally wipe old data, then per
    page -> per duty block: insert duty row, its timeline slots, colored
    bands and annotation lines. Master data (locomotive, station) is
    synchronized in a second pass after all rows are committed.

    Args:
        pdf_path: Path of the input PDF.
        mysql_config_path: PHP config file parsed by load_mysql_config().
        debug: When True, dump each parsed duty via debug_print_duty().
        limit_pages: Only parse the first N pages (None = all pages).
        append: When True, keep existing DB content and only replace rows
            belonging to this PDF (matched by file name); when False, ALL
            schedule tables are truncated first.

    Raises:
        FileNotFoundError: pdf_path does not exist.
        RuntimeError: configuration/driver problems raised by
            load_mysql_config() or mysql_connect().
    """
    pdf_path_obj = Path(pdf_path)
    if not pdf_path_obj.exists():
        raise FileNotFoundError(pdf_path)

    cfg = load_mysql_config(mysql_config_path)
    conn = mysql_connect(cfg)
    create_schema(conn)
    if not append:
        reset_all_schedule_data(conn)

    with pdfplumber.open(str(pdf_path_obj)) as pdf:
        # Day-level metadata (date, weekday label) is read from page 1 only.
        first_page_words = pdf.pages[0].extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)
        day_meta = extract_page_meta(first_page_words)
        if append:
            # Replace semantics: drop any previous import of the same file name
            # before inserting the new day (children removed via FK cascade).
            delete_existing_day_by_source_pdf(conn, str(pdf_path_obj.name))
        day_id = insert_day(conn, str(pdf_path_obj.name), day_meta)

        # Grid metrics come from page 1 — presumably all pages share the same
        # layout; TODO confirm for multi-layout PDFs.
        metrics = extract_metrics(pdf.pages[0])

        total_pages = len(pdf.pages)
        # NOTE(review): limit_pages=0 is falsy and therefore means "all pages".
        pages_to_process = pdf.pages[: limit_pages or total_pages]
        total_duties = 0
        total_slots = 0

        for page_idx, page in enumerate(pages_to_process, start=1):
            page_words = page.extract_words(x_tolerance=1, y_tolerance=1, keep_blank_chars=False)
            page_chars = page.chars
            page_rects = page.rects
            meta = extract_page_meta(page_words)
            page_no_meta = meta.get("page_no")
            page_total_meta = meta.get("page_total")
            # Prefer the page numbering printed on the page itself; fall back
            # to the positional index when the header could not be parsed.
            page_no = int(page_no_meta) if page_no_meta is not None else page_idx
            page_total = int(page_total_meta) if page_total_meta is not None else total_pages

            cur = conn.cursor()
            cur.execute(
                "INSERT INTO schedule_page (day_id, page_number, page_total, width, height) VALUES (%s, %s, %s, %s, %s)",
                (day_id, page_no, page_total, page.width, page.height),
            )
            page_id = int(cur.lastrowid)
            cur.close()

            # Each block corresponds to one locomotive duty row on the page.
            blocks = build_block_index(page_words, metrics, page.height)
            for block in blocks:
                cols = parse_block_columns(page_words, block, metrics)
                remark = parse_remark(page_words, block)
                duty_data = {
                    **cols,
                    "remark": remark,
                    "seq_on_page": block["seq_on_page"],
                }
                cur = conn.cursor()
                cur.execute(
                    """
                    INSERT INTO duty (
                        page_id, seq_on_page, vehicle_no, rotation_no, distance_km,
                        von_prev_train, von_prev_forda, von_location,
                        nach_next_train, nach_next_forda, nach_location,
                        remark, body_y0, body_y1,
                        debug_fzg_lines_json, debug_von_lines_json, debug_nach_lines_json
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """,
                    (
                        page_id,
                        block["seq_on_page"],
                        cols["vehicle_no"],
                        cols["rotation_no"],
                        cols["distance_km"],
                        cols["von_prev_train"],
                        cols["von_prev_forda"],
                        cols["von_location"],
                        cols["nach_next_train"],
                        cols["nach_next_forda"],
                        cols["nach_location"],
                        remark,
                        block["body_y0"],
                        block["body_y1"],
                        json.dumps(cols["debug_fzg_lines"], ensure_ascii=False),
                        json.dumps(cols["debug_von_lines"], ensure_ascii=False),
                        json.dumps(cols["debug_nach_lines"], ensure_ascii=False),
                    ),
                )
                duty_id = int(cur.lastrowid)
                cur.close()

                timeline = extract_timeline(page_words, page_chars, page_rects, block, metrics)

                for s in timeline["slots"]:
                    cur = conn.cursor()
                    cur.execute(
                        """
                        INSERT INTO duty_slot (
                            duty_id, slot_index, start_minute, start_hhmm, end_minute, end_hhmm, station_code,
                            train_prefix, train_no, raw_train_token, strip_marker, color,
                            x_anchor, x_event0, x_event1, x_train, x_train_end, train_kind, debug_colors_json
                        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """,
                        (
                            duty_id,
                            s["slot_index"],
                            s["start_minute"],
                            s["start_hhmm"],
                            s.get("end_minute"),
                            s.get("end_hhmm"),
                            s["station_code"],
                            s["train_prefix"],
                            s["train_no"],
                            s["raw_train_token"],
                            s["strip_marker"],
                            s["color"],
                            s["x_anchor"],
                            s["x_event0"],
                            s["x_event1"],
                            s["x_train"],
                            s["x_train_end"],
                            s["train_kind"],
                            json.dumps(s["colors"], ensure_ascii=False),
                        ),
                    )
                    cur.close()
                for r in timeline["bands"]:
                    # Bands not attached to any slot are flagged as orphans (repair blocks etc.).
                    orphan = 1 if r in timeline["orphan_bands"] else 0
                    cur = conn.cursor()
                    cur.execute(
                        """
                        INSERT INTO duty_timeline_band (duty_id, color, x0, x1, y0, y1, orphan)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)
                        """,
                        (duty_id, r["color_name"], r["x0"], r["x1"], r["top"], r["bottom"], orphan),
                    )
                    cur.close()
                for ann in timeline["annotations"]:
                    cur = conn.cursor()
                    cur.execute("INSERT INTO duty_annotation (duty_id, text) VALUES (%s, %s)", (duty_id, ann))
                    cur.close()

                total_duties += 1
                total_slots += len(timeline["slots"])
                if debug:
                    debug_print_duty(page_no, duty_data, timeline)

    # Commit parsed rows first.
    conn.commit()
    # Master-data sync runs against the committed rows, then commits again.
    sync_locomotive_master(conn)
    cleanup_orphan_locomotives(conn)
    sync_station_master(conn)
    assign_station_foreign_keys(conn)
    cleanup_orphan_stations(conn)
    conn.commit()
    print(
        f"[SUMMARY] mode={'append' if append else 'replace'} pages={len(pages_to_process)}/{total_pages} "
        f"duties={total_duties} slots={total_slots} db=mysql:{cfg.get('host')}:{cfg.get('dbname')}"
    )
    conn.close()


def main() -> None:
    """CLI entry point: parse arguments and run the PDF -> MySQL conversion."""
    argp = argparse.ArgumentParser(description="Lokdienst PDF -> MySQL parser (debug-friendly)")
    argp.add_argument("pdf", nargs="?", default="Lokdienst_25.02.2026.pdf", help="Input PDF")
    argp.add_argument("--mysql-config", default="php_web/config.php", help="MySQL config file path")
    argp.add_argument("--db", default=None, help="Deprecated. SQLite is removed; use --mysql-config.")
    argp.add_argument("--no-debug", action="store_true", help="Disable debug print")
    argp.add_argument("--limit-pages", type=int, default=None, help="Only parse first N pages")
    argp.add_argument("--append", action="store_true", help="Append day into existing DB (replace same source_pdf only)")
    opts = argp.parse_args()
    if opts.db:
        # SQLite backend was removed; fail fast with a migration hint.
        raise RuntimeError("`--db` már nem támogatott. SQLite eltávolítva, használd: --mysql-config php_web/config.php")

    convert_pdf(opts.pdf, opts.mysql_config, debug=not opts.no_debug, limit_pages=opts.limit_pages, append=opts.append)


# Script entry point; importing this module stays side-effect free.
if __name__ == "__main__":
    main()
