# -*- coding: utf-8 -*-
"""
Analyze_Content_Token_Missing.py

export_*.csv (1컬럼 content)에서 content token(이미지 URL 토큰) 누락 케이스를 샘플링/집계한다.

사용 예:
  python Analyze_Content_Token_Missing.py --csv export_202602030907.csv --max-rows 200000
  python Analyze_Content_Token_Missing.py --csv export_202602030907.csv --max-rows 50000 --show-samples 20
"""

from __future__ import annotations

import argparse
import csv
import html as _html
import os
import re
from collections import Counter
from urllib.parse import unquote, urlparse


# ---- keep policy aligned with Z_Batch_Create_Content_Token.py ----
# Exact basenames (lowercased) of known template/boilerplate images; in
# extract_url_tokens() a URL is skipped when its filename equals — or even
# just contains — one of these entries.
STOP_IMG_FILENAMES: set[str] = {
    "배송공지.jpg",
    "delivery.jpg",
    "notice.jpg",
    "noti.jpg",
    "top_open_dome.jpg",
    "down_open_dome.jpg",
    "dome_bottom.jpg",
    "vender_top.jpg",
    "top.jpg",
    "title.jpg",
    "info-title.jpg",
    "info.jpg",
    "information.jpg",
    "product_info_bottom.jpg",
    "info_issue.jpg",
    "도매의신.jpg",
    "dome.jpg",
    # derived from shopprod_group_content_token_202602101706.txt (high-frequency template images)
    "privacy_policy.jpg",
    "lawprivacy.jpg",
    "bslogo-on.jpg",
    "sllogo.jpg",
    "info-dome.jpg",
    "info-slmall.jpg",
    "all_top_img.jpg",
    "openmarket_top_oc.jpg",
    "openmarket_bottom_oc.jpg",
    "outletbanner.jpg",
    "coatingbottom.jpg",
    "acftop.jpg",
    "acfbottom.jpg",
    "foodnine_parcel_info_001.jpg",
    "20231214165047_product_bottom.jpg",
}
# Filename stems (basename without extension) rejected by exact match.
STOP_IMG_STEMS: set[str] = {
    "notice",
    # overly generic filename stems are excluded from tokenization
    "img",
    "image",
    "photo",
    "pic",
    "logo",
    "banner",
    # derived from shopprod_group_content_token_202602101706.txt (high-frequency stems)
    "privacy_policy",
    "lawprivacy",
    "info-dome",
    "info-slmall",
    "bslogo-on",
    "sllogo",
    "all_top_img",
    "openmarket_top_oc",
    "openmarket_bottom_oc",
    "outletbanner",
    "coatingbottom",
    "acftop",
    "acfbottom",
    "foodnine_parcel_info_001",
    "20231214165047_product_bottom",
}
# A stem starting with any of these prefixes is rejected.
STOP_IMG_STEM_PREFIXES: set[str] = {
    "top",
    "bottom",
    "title",
    "noti",
    "intro",
    "dome",
    "도매의신",
    "무료배송",
    "개인정보",
    "교환반품",
    "안내서",
    "공지",
    "도매신",
}
# A stem containing any of these fragments anywhere is rejected.
STOP_IMG_STEM_SUBSTRINGS: set[str] = {
    "배송공지",
    "delivery",
    "notice",
    "top_open_dome",
    "down_open_dome",
    "dome_bottom",
    "vender_top",
    "info-title",
    "information",
    "product_info_bottom",
    "info_issue",
    "도매의신",
    "정품인증",
    "스튜디오",
    # derived from shopprod_group_content_token_202602101706.txt (template keyword substrings)
    "privacy",
    "policy",
    "return",
    "exchange",
    "refund",
    "parcel_info",
    "product_bottom",
}


def _extract_img_candidate_urls_from_html(html_text: str | None) -> list[str]:
    raw = str(html_text or "")
    raw = raw.replace('\\"', '"').replace("\\'", "'")
    raw = _html.unescape(raw)

    attr_names = ("src", "data-src", "data-original", "data-lazy-src", "data-zoom-image", "data-url")
    urls: list[str] = []
    for m in re.finditer(r"<img\b[^>]*>", raw, flags=re.IGNORECASE):
        tag = m.group(0) or ""
        for an in attr_names:
            mm = re.search(
                rf"""\b{re.escape(an)}\s*=\s*(?:(["'])(.*?)\1|([^'"\s>]+))""",
                tag,
                flags=re.IGNORECASE | re.DOTALL,
            )
            if mm:
                u = (mm.group(2) or mm.group(3) or "").strip()
                if u:
                    urls.append(u)
                break
        mm2 = re.search(
            r"""\bsrcset\s*=\s*(?:(["'])(.*?)\1|([^'"\s>]+))""",
            tag,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if mm2:
            ss = (mm2.group(2) or mm2.group(3) or "").strip()
            if ss:
                first = ss.split(",")[0].strip()
                first = first.split(" ")[0].strip()
                if first:
                    urls.append(first)

        # 깨진 HTML 방어:
        # - 예: <img src="https:<img src="https://ai.esmplus.com/...jpg">
        for u2 in re.findall(r"""https?://[^\s<>"']+""", tag, flags=re.IGNORECASE):
            u2 = str(u2).strip()
            if u2:
                urls.append(u2)

    # dedup (order-preserving)
    seen = set()
    out: list[str] = []
    for u in urls:
        if u in seen:
            continue
        seen.add(u)
        out.append(u)
    if not out:
        # <img> 없이 URL만 들어오는 케이스 보완
        for u in re.findall(
            r"""https?://[^\s<>"']+\.(?:jpg|jpeg|png|gif|webp|bmp|svg)(?:\?[^\s<>"']*)?""",
            raw,
            flags=re.IGNORECASE,
        ):
            out.append(str(u).strip())
    return out


def _normalize_img_url_token(u: str) -> str | None:
    raw = str(u or "").strip()
    if not raw:
        return None
    if raw.startswith("//"):
        raw = "https:" + raw
    # 깨진 스키마 보정
    if raw.startswith("http:////"):
        raw = "http://" + raw[len("http:////") :]
    if raw.startswith("https:////"):
        raw = "https://" + raw[len("https:////") :]
    if raw.startswith("http:/") and (not raw.startswith("http://")):
        raw = "http://" + raw[len("http:/") :]
    if raw.startswith("https:/") and (not raw.startswith("https://")):
        raw = "https://" + raw[len("https:/") :]

    # 상대경로 허용
    if raw.startswith("/"):
        return "rel:" + unquote(raw).strip().lower()
    if raw.startswith("./") or raw.startswith("../"):
        return "rel:" + unquote(raw).strip().lower()
    # "shopimages/..." 처럼 leading slash가 없는 상대경로도 일부 존재 → rel:로 허용
    if (not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*:", raw)) and ("/" in raw):
        return "rel:" + unquote(raw).strip().lower()

    p = urlparse(raw)
    if p.scheme not in ("http", "https"):
        return None
    netloc = (p.netloc or "").strip().lower()
    if not netloc:
        return None
    path = unquote(p.path or "").strip()
    if not path:
        return None
    path = path.lower()
    # http/https는 동일 리소스로 취급(토큰 분산 방지): scheme을 https로 통일
    return f"https://{netloc}{path}"


def extract_url_tokens(html_text: str | None, *, max_tokens: int = 8) -> list[str]:
    """Extract up to *max_tokens* normalized image-URL tokens from HTML.

    Candidate URLs are filtered against the STOP_IMG_* policy first, then
    normalized via _normalize_img_url_token(); duplicates are dropped while
    preserving first-seen order.
    """

    def _is_stop_image(candidate: str) -> bool:
        # The STOP policy must match the DB tokenization policy
        # (Z_Batch_Create_Content_Token.py), otherwise the "missing"
        # analysis is misleading.
        try:
            parsed = urlparse(str(candidate))
            fname = unquote(os.path.basename(parsed.path or "") or "").strip().lower()
        except Exception:
            return False
        if not fname:
            return False
        stem = fname.rsplit(".", 1)[0].strip()
        if fname in STOP_IMG_FILENAMES:
            return True
        if any(known in fname for known in STOP_IMG_FILENAMES):
            return True
        if stem in STOP_IMG_STEMS:
            return True
        if any(stem.startswith(prefix) for prefix in STOP_IMG_STEM_PREFIXES):
            return True
        return any(fragment in stem for fragment in STOP_IMG_STEM_SUBSTRINGS)

    tokens: list[str] = []
    emitted: set[str] = set()
    limit = int(max_tokens)
    for candidate in _extract_img_candidate_urls_from_html(html_text):
        if _is_stop_image(candidate):
            continue
        token = _normalize_img_url_token(candidate)
        if token is None or token in emitted:
            continue
        emitted.add(token)
        tokens.append(token)
        if len(tokens) >= limit:
            break
    return tokens


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", required=True)
    ap.add_argument("--max-rows", type=int, default=200000)
    ap.add_argument("--show-samples", type=int, default=10)
    args = ap.parse_args()

    c = Counter()
    samples: list[tuple[str, str]] = []

    def classify(html: str) -> str:
        low = html.lower()
        if "<img" not in low:
            return "no_img_tag"
        if re.search(r"<img[^>]+\\b(src|data-src|data-original|data-lazy-src|data-zoom-image|srcset)\\b", html, flags=re.I) is None:
            return "no_img_attr"
        # has img attr but no usable tokens
        # quick heuristics
        if re.search(r"data:image/", low):
            return "data_uri_only"
        if re.search(r"javascript:", low):
            return "javascript_only"
        return "has_img_but_token0"

    with open(args.csv, "r", encoding="utf-8-sig", errors="replace", newline="") as f:
        r = csv.reader(f)
        header = next(r, None)
        if not header:
            print("[ERR] empty csv")
            return
        # 1컬럼 content 전제
        for i, row in enumerate(r):
            if i >= int(args.max_rows):
                break
            html = row[0] if row else ""
            toks = extract_url_tokens(html)
            if not toks:
                key = classify(html)
                c[key] += 1
                if len(samples) < int(args.show_samples):
                    samples.append((key, html[:350].replace("\n", " ")))
            else:
                c["token_ok"] += 1

    print(f"[INFO] scanned_rows={sum(c.values()):,}")
    for k, v in c.most_common():
        print(f"[INFO] {k}={v:,}")
    if samples:
        print("[INFO] samples:")
        for k, s in samples:
            print(f"--- {k} ---")
            print(s)


if __name__ == "__main__":
    main()

