#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
extract_content_tokens.py

Dump content image URL tokens using the same policy as `Z_Batch_Create_Content_Token.py`.

Usage:
  python extract_content_tokens.py --file sample.html --show-dropped
  type sample.html | python extract_content_tokens.py --stdin --show-dropped
"""

from __future__ import annotations

import argparse
import sys
from typing import Any


def _load_text_from_stdin() -> str:
    """Read everything from stdin and return it as text.

    Input is decoded as UTF-8 with undecodable bytes replaced, the same
    settings ``main`` uses when it opens ``--file`` input. Without this, piped
    input is decoded with the interpreter's default stdin encoding and strict
    error handling, so UTF-8 HTML piped on a non-UTF-8 console can raise
    ``UnicodeDecodeError``.

    Returns:
        The full stdin contents, or ``""`` when stdin is missing or empty.
    """
    stream = sys.stdin
    if stream is None:
        # No stdin attached to the process (e.g. windowed interpreter).
        return ""

    # Only real TextIOWrapper streams offer reconfigure(); replacement streams
    # such as io.StringIO already hold decoded text and are read as-is.
    reconfigure = getattr(stream, "reconfigure", None)
    if reconfigure is not None:
        try:
            reconfigure(encoding="utf-8", errors="replace")
        except (OSError, ValueError):
            # io.UnsupportedOperation (a subclass of both) is raised when the
            # stream was already partially read; keep its current encoding.
            pass

    return stream.read() or ""


def _stop_reason(z: Any, url: str) -> str:
    """Return the stop-policy label that rejects *url*, or ``""`` if none does.

    The checks and their order follow the stop lists exposed by the
    ``Z_Batch_Create_Content_Token`` module passed in as *z*.

    Args:
        z: The imported ``Z_Batch_Create_Content_Token`` module. It must expose
            ``urlparse``, ``unquote``, ``os`` and the ``STOP_IMG_*`` collections.
        url: The raw image URL to test.

    Returns:
        One of the ``stop_filename(...)`` / ``stop_stem(...)`` labels printed by
        ``--show-dropped``, or an empty string when the URL passes every check
        (including when no filename could be derived from it).
    """
    try:
        parsed = z.urlparse(url)
        filename = z.os.path.basename(parsed.path or "")
        filename = z.unquote(filename or "").strip().lower()
    except Exception:
        # Best-effort: a URL that cannot be parsed skips the stop lists and is
        # left for the normalizer to accept or reject.
        return ""

    if not filename:
        return ""

    # Filename without its last extension, e.g. "logo.min.png" -> "logo.min".
    stem = filename.rsplit(".", 1)[0].strip()

    if filename in z.STOP_IMG_FILENAMES:
        return "stop_filename(eq)"
    if any(sf in filename for sf in z.STOP_IMG_FILENAMES):
        return "stop_filename(substr)"
    if stem in z.STOP_IMG_STEMS:
        return "stop_stem(eq)"
    if any(stem.startswith(pfx) for pfx in z.STOP_IMG_STEM_PREFIXES):
        return "stop_stem(prefix)"
    if any(ss in stem for ss in z.STOP_IMG_STEM_SUBSTRINGS):
        return "stop_stem(substr)"
    return ""


def main() -> None:
    """Command-line entry point: print the image URL tokens found in some HTML.

    HTML is read from ``--stdin``, from ``--file``, or from
    ``tmp_sample_user_html.html`` when neither is given. Image URLs are
    extracted, filtered through the stop lists, normalized into tokens,
    de-duplicated in first-seen order and capped at ``--max`` tokens.
    A ``--max`` of zero or less keeps no tokens.

    Raises:
        OSError: If the input file cannot be opened.
        ImportError: If ``Z_Batch_Create_Content_Token`` is not importable.
    """
    ap = argparse.ArgumentParser()
    g = ap.add_mutually_exclusive_group(required=False)
    g.add_argument("--file", help="Path to HTML file (default: tmp_sample_user_html.html when no args)")
    g.add_argument("--stdin", action="store_true", help="Read HTML from stdin")
    ap.add_argument("--show-urls", action="store_true", help="Print extracted raw URLs from HTML before filtering")
    ap.add_argument("--show-dropped", action="store_true", help="Print drop reasons (stoplist/normalize failure)")
    ap.add_argument("--max", type=int, default=8, help="Max tokens to keep (default: 8)")
    args = ap.parse_args()

    # Imported after argument parsing, so argparse exits (--help, bad usage)
    # happen before the project module is loaded.
    import Z_Batch_Create_Content_Token as z  # noqa: N812

    # Default behavior: if no args are provided, read from tmp_sample_user_html.html
    default_file = "tmp_sample_user_html.html"
    if args.stdin:
        html = _load_text_from_stdin()
    else:
        file_path = args.file or default_file
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            html = f.read()

    urls = z._extract_img_urls_from_html(html)  # type: ignore[attr-defined]
    if args.show_urls:
        print(f"[urls] cnt={len(urls)}")
        for u in urls:
            print(" -", u)

    kept: list[str] = []
    seen: set[str] = set()
    limit = args.max  # already an int thanks to argparse's type=int

    for raw_url in urls:
        # Check the cap before doing any work so that --max 0 (or a negative
        # value) keeps nothing instead of letting one token slip through.
        if len(kept) >= limit:
            break

        url = str(raw_url)

        reason = _stop_reason(z, url)
        if reason:
            if args.show_dropped:
                print(f"[drop] {reason}: {url}")
            continue

        tok = z._normalize_img_url_token(url)  # type: ignore[attr-defined]
        if not tok:
            if args.show_dropped:
                print(f"[drop] normalize(None): {url}")
            continue
        if tok in seen:
            # Duplicate token: skipped silently, not reported as a drop.
            continue
        seen.add(tok)
        kept.append(tok)

    print(f"[tokens] cnt={len(kept)}")
    for t in kept:
        print(" -", t)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

