import psycopg2
import psycopg2.extras
import os
import re
import html as _html
import time
from urllib.parse import urlparse, unquote
from typing import List, Iterable


# ---- content token extraction (winner content -> img URL tokens) ----
# NOTE: contains only lightweight logic with the same purpose as
#       Z_Batch_Create_Content_Token.py (kept aligned / evolving in the same direction).
# Hard cap on tokens emitted per group (prevents one content page from dominating).
MAX_CONTENT_TOKENS_PER_GROUP = 8

# Very common (shipping/notice/info template) image filenames are excluded:
# using them as tokens would over-inflate the candidate set.
STOP_IMG_FILENAMES: set[str] = {
    "배송공지.jpg",
    "delivery.jpg",
    "notice.jpg",
    "noti.jpg",
    "top_open_dome.jpg",
    "down_open_dome.jpg",
    "dome_bottom.jpg",
    "vender_top.jpg",
    "top.jpg",
    "title.jpg",
    "info-title.jpg",
    "info.jpg",
    "information.jpg",
    "product_info_bottom.jpg",
    "info_issue.jpg",
    "도매의신.jpg",
    "dome.jpg",
    # derived from shopprod_group_content_token_202602101706.txt (high-frequency template images)
    "privacy_policy.jpg",
    "lawprivacy.jpg",
    "bslogo-on.jpg",
    "sllogo.jpg",
    "info-dome.jpg",
    "info-slmall.jpg",
    "all_top_img.jpg",
    "openmarket_top_oc.jpg",
    "openmarket_bottom_oc.jpg",
    "outletbanner.jpg",
    "coatingbottom.jpg",
    "acftop.jpg",
    "acfbottom.jpg",
    "foodnine_parcel_info_001.jpg",
    "20231214165047_product_bottom.jpg",
}
# Exact-match stoplist on the filename stem (filename without extension).
STOP_IMG_STEMS: set[str] = {
    "notice",
    # overly generic filename stems are excluded
    "img",
    "image",
    "photo",
    "pic",
    "logo",
    "banner",
    # derived from shopprod_group_content_token_202602101706.txt (high-frequency stems)
    "privacy_policy",
    "lawprivacy",
    "info-dome",
    "info-slmall",
    "bslogo-on",
    "sllogo",
    "all_top_img",
    "openmarket_top_oc",
    "openmarket_bottom_oc",
    "outletbanner",
    "coatingbottom",
    "acftop",
    "acfbottom",
    "foodnine_parcel_info_001",
    "20231214165047_product_bottom",
}
# Prefix stoplist on the stem (e.g. "top_banner_01" is dropped via "top").
STOP_IMG_STEM_PREFIXES: set[str] = {
    "top",
    "bottom",
    "title",
    "noti",
    "intro",
    "dome",
    "도매의신",
    "무료배송",
    "개인정보",
    "교환반품",
    "안내서",
    "공지",
    "도매신",
}
# Substring stoplist on the stem (broadest filter; applied last).
STOP_IMG_STEM_SUBSTRINGS: set[str] = {
    "배송공지",
    "delivery",
    "notice",
    "top_open_dome",
    "down_open_dome",
    "dome_bottom",
    "vender_top",
    "info-title",
    "information",
    "product_info_bottom",
    "info_issue",
    "도매의신",
    "정품인증",
    "스튜디오",
    # derived from shopprod_group_content_token_202602101706.txt (template keyword substrings)
    "privacy",
    "policy",
    "return",
    "exchange",
    "refund",
    "parcel_info",
    "product_bottom",
}


def _extract_img_urls_from_html(html_text: str | None) -> List[str]:
    raw = str(html_text or "")
    # DB/JSON 등을 거치며 \" 형태로 이스케이프된 HTML 대응
    raw = raw.replace('\\"', '"').replace("\\'", "'")
    raw = _html.unescape(raw)
    # <img ...> 태그를 먼저 뽑고, 속성(src/data-src/srcset 등)을 안전하게 추출한다.
    attr_names = ("src", "data-src", "data-original", "data-lazy-src", "data-zoom-image", "data-url")
    urls: List[str] = []
    for m in re.finditer(r"<img\b[^>]*>", raw, flags=re.IGNORECASE):
        tag = m.group(0) or ""
        for an in attr_names:
            mm = re.search(
                rf"""\b{re.escape(an)}\s*=\s*(?:(["'])(.*?)\1|([^'"\s>]+))""",
                tag,
                flags=re.IGNORECASE | re.DOTALL,
            )
            if mm:
                u = (mm.group(2) or mm.group(3) or "").strip()
                if u:
                    urls.append(u)
                break
        mm2 = re.search(
            r"""\bsrcset\s*=\s*(?:(["'])(.*?)\1|([^'"\s>]+))""",
            tag,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if mm2:
            ss = (mm2.group(2) or mm2.group(3) or "").strip()
            if ss:
                first = ss.split(",")[0].strip()
                first = first.split(" ")[0].strip()
                if first:
                    urls.append(first)

        # 깨진 HTML 방어:
        # - 예: <img src="https:<img src="https://ai.esmplus.com/...jpg">
        for u2 in re.findall(r"""https?://[^\s<>"']+""", tag, flags=re.IGNORECASE):
            u2 = str(u2).strip()
            if u2:
                urls.append(u2)
    if not urls:
        for u in re.findall(
            r"""https?://[^\s<>"']+\.(?:jpg|jpeg|png|gif|webp|bmp|svg)(?:\?[^\s<>"']*)?""",
            raw,
            flags=re.IGNORECASE,
        ):
            urls.append(str(u).strip())

    # dedup
    seen = set()
    out: List[str] = []
    for u in urls:
        if u in seen:
            continue
        seen.add(u)
        out.append(u)
    return out


def _normalize_img_url_token(u: str) -> str | None:
    """
    content_token용 이미지 URL 정규화.
    - query/fragment 제거(추적 파라미터 등으로 토큰 분산 방지)
    - scheme/host는 소문자
    - path는 unquote 후 소문자(매칭 우선)
    """
    try:
        # Guard against Postgres btree index entry size limit (~8KB).
        # Very long URL tokens can crash inserts.
        MAX_LEN = 512
        raw = str(u).strip()
        if not raw:
            return None
        # scheme-less URL: //cdn.example.com/x.jpg
        if raw.startswith("//"):
            raw = "https:" + raw
        if raw.startswith("http:////"):
            raw = "http://" + raw[len("http:////") :]
        if raw.startswith("https:////"):
            raw = "https://" + raw[len("https:////") :]
        if raw.startswith("http:/") and (not raw.startswith("http://")):
            raw = "http://" + raw[len("http:/") :]
        if raw.startswith("https:/") and (not raw.startswith("https://")):
            raw = "https://" + raw[len("https:/") :]
        if raw.startswith("/"):
            path = unquote(raw).strip().lower()
            tok = f"rel:{path}"
            return tok if len(tok) <= MAX_LEN else None
        if raw.startswith("./") or raw.startswith("../"):
            path = unquote(raw).strip().lower()
            tok = f"rel:{path}"
            return tok if len(tok) <= MAX_LEN else None
        if (not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*:", raw)) and ("/" in raw):
            path = unquote(raw).strip().lower()
            tok = f"rel:{path}"
            return tok if len(tok) <= MAX_LEN else None
        p = urlparse(raw)
        if p.scheme not in ("http", "https"):
            return None
        netloc = (p.netloc or "").strip().lower()
        if not netloc:
            return None
        path = unquote(p.path or "").strip()
        if not path:
            return None
        path = path.lower()
        # http/https는 동일 리소스로 취급(토큰 분산 방지): scheme을 https로 통일
        tok = f"https://{netloc}{path}"
        return tok if len(tok) <= MAX_LEN else None
    except Exception:
        return None


def _extract_img_filenames_from_html(html_text: str | None) -> List[str]:
    """(Name kept for compatibility.) Return normalized URL tokens, not filenames.

    Extracts image URLs from the winner content HTML, drops template/notice
    images via the stoplists (exact filename, filename substring, exact stem,
    stem prefix, stem substring — in that order), then normalizes the
    survivors into tokens. Order-preserving dedup, capped at
    MAX_CONTENT_TOKENS_PER_GROUP entries.
    """
    candidate_urls = _extract_img_urls_from_html(html_text)
    if not candidate_urls:
        return []

    limit = int(MAX_CONTENT_TOKENS_PER_GROUP)
    emitted: set[str] = set()
    tokens: List[str] = []

    for url in candidate_urls:
        try:
            parsed = urlparse(str(url))
            filename = unquote(os.path.basename(parsed.path or "") or "").strip().lower()
        except Exception:
            continue
        if not filename:
            continue
        stem = filename.rsplit(".", 1)[0].strip()

        # Stoplists: exact filename match first...
        if filename in STOP_IMG_FILENAMES:
            continue
        # ...then variants that merely contain a stop filename,
        # e.g. steadylady_notice.jpg, NOTICE_end.jpg, ...
        if any(stop_fn in filename for stop_fn in STOP_IMG_FILENAMES):
            continue
        # ...then stem-based filters (exact, prefix, substring).
        if stem in STOP_IMG_STEMS:
            continue
        if any(stem.startswith(prefix) for prefix in STOP_IMG_STEM_PREFIXES):
            continue
        if any(sub in stem for sub in STOP_IMG_STEM_SUBSTRINGS):
            continue

        token = _normalize_img_url_token(str(url))
        if not token or token in emitted:
            continue
        emitted.add(token)
        tokens.append(token)
        if len(tokens) >= limit:
            break
    return tokens


def _chunks(xs: Iterable, n: int) -> Iterable[list]:
    buf = []
    for x in xs:
        buf.append(x)
        if len(buf) >= n:
            yield buf
            buf = []
    if buf:
        yield buf

def refresh_winner_views(conn):
    """Refresh the winner-selection materialized views in dependency order.

    Refreshes candidate -> rank -> winner with CONCURRENTLY (readers are not
    blocked), then commits once all three succeed.

    Args:
        conn: an open psycopg2 connection.
    """
    # Fix: the original called cur.close() only on the success path, leaking
    # the cursor whenever a REFRESH raised. The context manager closes it
    # unconditionally.
    with conn.cursor() as cur:
        cur.execute("REFRESH MATERIALIZED VIEW CONCURRENTLY mlinkdw.mv_group_winner_candidate;")
        cur.execute("REFRESH MATERIALIZED VIEW CONCURRENTLY mlinkdw.mv_group_winner_rank;")
        cur.execute("REFRESH MATERIALIZED VIEW CONCURRENTLY mlinkdw.mv_group_winner;")
    conn.commit()

def apply_winner_with_history(conn):
    """Apply newly selected winners to shopprod_group2, with history + token resync.

    Steps (single transaction, committed at the end):
      1. Diff mv_group_winner against shopprod_group2, insert one row per
         changed group into shopprod_group_winner_hist, and update
         shopprod_group2's winner columns — all in one CTE statement that
         returns the changed group_ids.
      2. For the changed groups only, delete their content tokens and
         rebuild them from the current winner's content HTML via
         _extract_img_filenames_from_html, upserting in chunks.

    Args:
        conn: an open psycopg2 connection.

    NOTE(review): the cursor is closed only on the success path; an exception
    mid-run leaves it (and the open transaction) to the caller to clean up —
    confirm whether that is intended.
    """
    cur = conn.cursor()
    t0 = time.perf_counter()

    # 1) record history + 2) apply winner (returns the changed group_ids)
    cur.execute(
        """
        WITH changed AS (
            SELECT
                g.group_id,
                g.winner_icode AS old_icode,
                w.winner_icode AS new_icode,
                g.winner_price AS old_price,
                w.winner_price AS new_price,
                g.winner_vender_code AS old_vender_code,
                w.winner_vender_code AS new_vender_code
            FROM mlinkdw.shopprod_group2 g
            JOIN mlinkdw.mv_group_winner w
              ON g.group_id = w.group_id
            WHERE g.winner_icode IS DISTINCT FROM w.winner_icode
        ),
        hist AS (
            INSERT INTO mlinkdw.shopprod_group_winner_hist (
                group_id,
                old_icode, new_icode,
                old_price, new_price,
                old_vender_code, new_vender_code,
                change_reason,
                change_desc
            )
            SELECT
                group_id,
                old_icode, new_icode,
                old_price, new_price,
                old_vender_code, new_vender_code,
                'AUTO_WINNER_BATCH',
                '2단계 위너 자동 재선정'
            FROM changed
            RETURNING 1
        ),
        upd AS (
            UPDATE mlinkdw.shopprod_group2 g
            SET
                winner_icode = c.new_icode,
                winner_price = c.new_price,
                winner_vender_code = c.new_vender_code,
                winner_updated_at = now()
            FROM changed c
            WHERE g.group_id = c.group_id
            RETURNING c.group_id
        )
        SELECT group_id FROM upd;
        """
    )
    changed_gids = [int(r[0]) for r in (cur.fetchall() or []) if r and r[0] is not None]
    print(f"[WINNER] changed_groups={len(changed_gids):,}")

    # 3) when winner_icode changed, resync content_token to the current winner
    #    (delete then recreate)
    if changed_gids:
        # delete existing tokens first (wiping whole groups is the safe option)
        cur.execute(
            "DELETE FROM mlinkdw.shopprod_group_content_token WHERE group_id = ANY(%s);",
            (changed_gids,),
        )
        deleted_tokens = cur.rowcount if cur.rowcount is not None else 0
        print(f"[CONTENT_TOKEN] deleted_rows={deleted_tokens:,}")

        # fetch the current winner's content and regenerate its tokens
        cur.execute(
            """
            SELECT
              g.group_id,
              g.winner_vender_code,
              g.winner_icode,
              gm.content,
              gm.dome_code
            FROM mlinkdw.shopprod_group2 g
            JOIN mlinkdw.shopprod_group_map2 gm
              ON gm.group_id = g.group_id
             AND gm.vender_code = g.winner_vender_code
             AND gm.icode = g.winner_icode
            WHERE g.group_id = ANY(%s);
            """,
            (changed_gids,),
        )
        winner_rows = cur.fetchall() or []
        print(f"[CONTENT_TOKEN] refetch_winner_rows={len(winner_rows):,}")

        insert_rows = []
        for group_id, vender_code, icode, content_html, dome_code in winner_rows:
            # despite the name, this returns normalized URL tokens
            fns = _extract_img_filenames_from_html(content_html)
            for tok in (fns or []):
                if not tok:
                    continue
                # NULLs are coerced to "" so the composite unique key stays usable
                insert_rows.append((
                    int(group_id),
                    str(vender_code) if vender_code is not None else "",
                    str(icode) if icode is not None else "",
                    str(tok),
                    str(dome_code) if dome_code is not None else "",
                ))

        if insert_rows:
            print(f"[CONTENT_TOKEN] upsert_prepare_rows={len(insert_rows):,}")
            sql_ins = """
                INSERT INTO mlinkdw.shopprod_group_content_token
                (group_id, vender_code, icode, token, dome_code)
                VALUES %s
                ON CONFLICT (group_id, vender_code, icode, token)
                DO UPDATE SET
                  dome_code = EXCLUDED.dome_code
            """
            upserted = 0
            # chunked execute_values keeps statement size bounded on big batches
            for chunk in _chunks(insert_rows, 5000):
                psycopg2.extras.execute_values(cur, sql_ins, chunk, page_size=1000)
                upserted += len(chunk)
            print(f"[CONTENT_TOKEN] upsert_rows={upserted:,}")
        else:
            print("[CONTENT_TOKEN] upsert_prepare_rows=0")
    else:
        print("[CONTENT_TOKEN] skipped (no winner changes)")

    conn.commit()
    print(f"[WINNER] apply_done elapsed={time.perf_counter() - t0:.2f}s")
    cur.close()


if __name__ == "__main__":
    from db_config import DB_INFO_PSYCOPG2
    conn = psycopg2.connect(**DB_INFO_PSYCOPG2)
    refresh_winner_views(conn)
    apply_winner_with_history(conn)
    conn.close()
