# -*- coding: utf-8 -*-
"""
Union_Content_Token.py

- token 기반 group merge
- 1차: PostgreSQL pg_trgm similarity (>= 0.3)
- 2차: Python difflib SequenceMatcher (>= 70)
"""

import re
import difflib
import os
import time
from datetime import datetime
from typing import List, Dict
import psycopg2

from db_config import DB_INFO_PSYCOPG2 as DB_INFO

# ----------------------------------------------------------------------
# logging helpers
# ----------------------------------------------------------------------

def _now_str() -> str:
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def _fmt_seconds(sec: float) -> str:
    try:
        sec = float(sec)
    except Exception:
        return str(sec)
    if sec < 60:
        return f"{sec:.1f}s"
    m = int(sec // 60)
    s = int(sec % 60)
    if m < 60:
        return f"{m}m {s:02d}s"
    h = int(m // 60)
    m2 = int(m % 60)
    return f"{h}h {m2:02d}m {s:02d}s"

# ----------------------------------------------------------------------
# 문자열 정규화 & 유사도
# ----------------------------------------------------------------------

def _norm_iname(s: str | None) -> str:
    if not s:
        return ""
    s = s.lower()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^0-9a-z가-힣 ]", "", s)
    return s.strip()


def iname_similarity_pct(a: str | None, b: str | None) -> float:
    """
    Similarity of two product names as a percentage (0.0–100.0).

    Both names are normalized with _norm_iname first; if either normalizes
    to an empty string the result is 0.0.
    """
    left, right = _norm_iname(a), _norm_iname(b)
    if left and right:
        return 100.0 * difflib.SequenceMatcher(None, left, right).ratio()
    return 0.0


# ----------------------------------------------------------------------
# DB 1차 후보 조회
# ----------------------------------------------------------------------

# Candidate query for the merge plan.
#
# token_master: for every content token, elect a "master" group — the group
# with the lowest winner_price (ties broken by lowest group_id).
# merge_plan: propose merging every other group that shares a token into its
# master, scoring the pair with pg_trgm similarity() between the two groups'
# winner item names (ms/mt join on winner_icode + winner_vender_code).
# Final SELECT: one row per (src, master) pair, keeping the best name_sim and
# a representative iname for each side; pairs below the coarse pre-filter
# (HAVING MAX(name_sim) >= 0.1) are discarded.  The precise difflib-based
# filter is applied later in Python (run_union_content_token).
SQL_MERGE_CANDIDATE = """
WITH token_master AS (
    SELECT DISTINCT
        t.token,
        FIRST_VALUE(g.group_id) OVER (
            PARTITION BY t.token
            ORDER BY g.winner_price ASC, g.group_id ASC
        ) AS master_group_id
    FROM mlinkdw.shopprod_group_content_token t
    JOIN mlinkdw.shopprod_group2 g
      ON g.group_id = t.group_id
),
merge_plan AS (
    SELECT
        t.group_id        AS src_group_id,
        tm.master_group_id,
        ms.iname          AS src_iname,
        mt.iname          AS master_iname,
        mlinkdw.similarity(ms.iname::text, mt.iname::text) AS name_sim
    FROM mlinkdw.shopprod_group_content_token t
    JOIN token_master tm ON tm.token = t.token
    JOIN mlinkdw.shopprod_group2 g ON g.group_id = t.group_id
    JOIN mlinkdw.shopprod_group2 mg ON mg.group_id = tm.master_group_id
    JOIN mlinkdw.shopprod_group_map2 ms ON ms.group_id = g.group_id AND ms.icode = g.winner_icode AND ms.vender_code = g.winner_vender_code
    JOIN mlinkdw.shopprod_group_map2 mt ON mt.group_id = mg.group_id AND mt.icode = mg.winner_icode AND mt.vender_code = mg.winner_vender_code
    WHERE t.group_id <> tm.master_group_id
)
SELECT
    src_group_id,
    master_group_id,
    MAX(name_sim) AS max_name_sim,
    MIN(src_iname) AS src_iname,
    MIN(master_iname) AS master_iname
FROM merge_plan
GROUP BY
    src_group_id,
    master_group_id
HAVING MAX(name_sim) >= 0.1
ORDER BY
    master_group_id,
    src_group_id;
"""


# ----------------------------------------------------------------------
# merge_cluster (기존 로직 그대로)
# ----------------------------------------------------------------------

def merge_cluster(conn, *, target: int, sources: List[int], dry_run: bool) -> None:
    """
    Merge the given source groups into the target group.

    Re-points shopprod_group_map2 / shopprod_sub_group2 rows from the source
    group_ids to ``target``, cleans up content-token rows, and finally deletes
    source groups left with no references.  Commits on success; rolls back and
    re-raises on failure.

    Args:
        conn: open psycopg2 connection (this function commits/rolls back it).
        target: surviving group_id.
        sources: group_ids to fold into ``target``.
        dry_run: when True (or when ``sources`` is empty) nothing is executed.
    """
    if not sources or dry_run:
        return

    srcs = [int(x) for x in sources]
    tgt = int(target)
    print(f"[{_now_str()}] [MERGE] start target={tgt} sources={len(srcs)}")
    cur = conn.cursor()
    try:
        # Best-effort content_token cleanup, wrapped in a SAVEPOINT.
        # BUG FIX: the original used a bare ``except Exception: pass`` here,
        # but on PostgreSQL a failed statement aborts the whole transaction,
        # so the UPDATEs below would then fail with InFailedSqlTransaction —
        # swallowing the exception could not actually keep the merge alive.
        cur.execute("SAVEPOINT merge_token_cleanup")
        try:
            # Drop token rows of the groups being absorbed.
            cur.execute(
                "DELETE FROM mlinkdw.shopprod_group_content_token WHERE group_id = ANY(%s)",
                (srcs,),
            )
            # For the target group, keep only token rows that match its
            # current winner (icode + vender_code).
            cur.execute(
                """
                DELETE FROM mlinkdw.shopprod_group_content_token t
                WHERE t.group_id = %s
                  AND EXISTS (
                    SELECT 1
                    FROM mlinkdw.shopprod_group2 g
                    WHERE g.group_id = t.group_id
                      AND (t.vender_code IS DISTINCT FROM g.winner_vender_code
                           OR t.icode IS DISTINCT FROM g.winner_icode)
                  )
                """,
                (tgt,),
            )
            cur.execute("RELEASE SAVEPOINT merge_token_cleanup")
        except Exception:
            cur.execute("ROLLBACK TO SAVEPOINT merge_token_cleanup")

        # Re-point item mappings and sub-groups to the surviving group.
        cur.execute(
            """
            UPDATE mlinkdw.shopprod_group_map2
            SET group_id = %s
            WHERE group_id = ANY(%s)
            """,
            (tgt, srcs),
        )
        cur.execute(
            """
            UPDATE mlinkdw.shopprod_sub_group2
            SET group_id = %s
            WHERE group_id = ANY(%s)
            """,
            (tgt, srcs),
        )

        # Best-effort deletion of now-orphaned source groups (same SAVEPOINT
        # rationale as above: a failure here must not poison the commit).
        cur.execute("SAVEPOINT merge_orphan_cleanup")
        try:
            cur.execute(
                """
                DELETE FROM mlinkdw.shopprod_group2 g
                WHERE g.group_id = ANY(%s)
                  AND NOT EXISTS (SELECT 1 FROM mlinkdw.shopprod_group_map2 gm WHERE gm.group_id = g.group_id)
                  AND NOT EXISTS (SELECT 1 FROM mlinkdw.shopprod_sub_group2 sg WHERE sg.group_id = g.group_id)
                  AND NOT EXISTS (SELECT 1 FROM mlinkdw.shopprod_group_content_token ct WHERE ct.group_id = g.group_id)
                """,
                (srcs,),
            )
            cur.execute("RELEASE SAVEPOINT merge_orphan_cleanup")
        except Exception:
            cur.execute("ROLLBACK TO SAVEPOINT merge_orphan_cleanup")

        conn.commit()
        print(f"[{_now_str()}] [MERGE] committed target={tgt} sources={len(srcs)}")
    except Exception:
        conn.rollback()
        print(f"[{_now_str()}] [MERGE] rollback target={tgt} sources={len(srcs)}")
        raise
    finally:
        cur.close()


# ----------------------------------------------------------------------
# 메인 실행 함수
# ----------------------------------------------------------------------

def run_union_content_token(conn, *, dry_run: bool = False) -> None:
    """
    Run the token + product-name-similarity based group merge end to end.

    Steps:
      1. Fetch merge candidates via SQL_MERGE_CANDIDATE (pg_trgm pre-filter
         is applied inside the SQL).
      2. Re-score each candidate pair with difflib (iname_similarity_pct)
         and keep pairs scoring >= 15.
      3. Merge each source group into its master via merge_cluster.

    Args:
        conn: open psycopg2 connection.
        dry_run: when True, candidates are computed and logged but no merge
            statements are executed (merge_cluster no-ops).
    """
    t0 = time.perf_counter()
    print(f"[{_now_str()}] [UNION] start dry_run={bool(dry_run)}")
    cur = conn.cursor()
    try:
        t_sql0 = time.perf_counter()
        print(f"[{_now_str()}] [UNION] fetch_candidates(sql) start")
        cur.execute(SQL_MERGE_CANDIDATE)
        rows = cur.fetchall()
        print(f"[{_now_str()}] [UNION] fetch_candidates(sql) done rows={len(rows):,} elapsed={_fmt_seconds(time.perf_counter()-t_sql0)}")
    finally:
        cur.close()

    # master_group_id -> list of source group_ids to merge into it
    merge_map: Dict[int, List[int]] = {}
    passed = 0
    dropped = 0
    # Detail logging is sampled by default to avoid flooding stdout; both
    # knobs come from the environment.
    LOG_DETAIL_LIMIT = int(os.environ.get("UNION_CONTENT_TOKEN_LOG_LIMIT", "200") or "200")
    LOG_ALL = str(os.environ.get("UNION_CONTENT_TOKEN_LOG_ALL", "")).strip().lower() in ("1", "true", "yes", "y", "on")
    passed_details = []
    dropped_details = []

    # rows: (src_group_id, master_group_id, max_name_sim, src_iname, master_iname)
    for src_gid, master_gid, name_sim, src_iname, master_iname in rows:
        sim = iname_similarity_pct(src_iname, master_iname)

        if sim >= 15:
            merge_map.setdefault(master_gid, []).append(src_gid)
            passed += 1
            if LOG_ALL or len(passed_details) < LOG_DETAIL_LIMIT:
                passed_details.append((int(master_gid), int(src_gid), float(name_sim or 0.0), float(sim)))
        else:
            dropped += 1
            if LOG_ALL or len(dropped_details) < LOG_DETAIL_LIMIT:
                dropped_details.append((int(master_gid), int(src_gid), float(name_sim or 0.0), float(sim)))

    print(f"[{_now_str()}] [UNION] similarity_filter done passed={passed:,} dropped={dropped:,} masters={len(merge_map):,}")

    if passed_details:
        # BUG FIX: the original interpolated the leaked loop variable
        # `name_sim` into this header line (`{name_sim and ''}`), which
        # polluted the log whenever the last row's name_sim was falsy; it also
        # had a dead `trunc = "" if ... else ""` assignment just above.
        trunc = ""
        if (not LOG_ALL) and (passed > len(passed_details)):
            trunc = f" (showing first {len(passed_details):,}/{passed:,}; set UNION_CONTENT_TOKEN_LOG_ALL=1 for all)"
        print(f"[{_now_str()}] [UNION] passed_details{trunc}")
        for master_gid, src_gid, ns, s in passed_details:
            print(f"[{_now_str()}] [PASS] master_gid={master_gid} src_gid={src_gid} name_sim={ns:.4f} sim={s:.2f}")

    if dropped_details:
        trunc = ""
        if (not LOG_ALL) and (dropped > len(dropped_details)):
            trunc = f" (showing first {len(dropped_details):,}/{dropped:,}; set UNION_CONTENT_TOKEN_LOG_ALL=1 for all)"
        print(f"[{_now_str()}] [UNION] dropped_details{trunc}")
        for master_gid, src_gid, ns, s in dropped_details:
            print(f"[{_now_str()}] [DROP] master_gid={master_gid} src_gid={src_gid} name_sim={ns:.4f} sim={s:.2f}")

    # Preview only the first few masters to keep the log readable.
    preview_items = list(merge_map.items())[:10]
    if preview_items:
        pv = ", ".join([f"{k}->{len(v)}" for k, v in preview_items])
        more = "" if len(merge_map) <= 10 else f", ... (+{len(merge_map)-10} masters)"
        print(f"[{_now_str()}] [UNION] merge_plan preview: {pv}{more}")

    merge_targets = 0
    merge_sources = 0
    for target, sources in merge_map.items():
        merge_targets += 1
        merge_sources += len(sources)
        merge_cluster(
            conn,
            target=target,
            sources=sources,
            dry_run=dry_run
        )

    print(f"[{_now_str()}] [UNION] done targets={merge_targets:,} sources={merge_sources:,} elapsed={_fmt_seconds(time.perf_counter()-t0)}")

if __name__ == "__main__":
    conn = psycopg2.connect(**DB_INFO)
    run_union_content_token(conn, dry_run=False)

