#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Propose_Split_Group_FromCSV.py

목적:
  shopprod_group_map export CSV(예: shopprod_group_map_202602031844.csv)에서
  특정 group_id가 "잘못 병합된" 케이스를 자동으로 재클러스터링(분리 후보)해준다.

주의:
  - 이 스크립트는 "분리 제안"만 출력한다. DB를 직접 수정하지 않는다.
  - winner 재선정은 DB 반영 후 Search_DomeOrchestration.py 등 기존 파이프라인을 권장.

사용 예:
  python Propose_Split_Group_FromCSV.py --csv shopprod_group_map_202602031844.csv --group-id 2859133 --top 30
"""

from __future__ import annotations

import argparse
import csv
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple


# ---- small union-find ----
class DSU:
    """Disjoint-set (union-find) structure over the integers ``0 .. n-1``.

    ``p[i]`` holds the parent of element ``i`` (a root is its own parent) and
    ``sz[r]`` holds the number of elements in the set whose root is ``r``.
    """

    def __init__(self, n: int):
        # Every element starts out as the root of its own one-element set.
        self.p = [i for i in range(n)]
        self.sz = [1 for _ in range(n)]

    def find(self, x: int) -> int:
        """Return the root of the set containing ``x``.

        While walking upward, every visited node is re-pointed at its
        grandparent, which shortens the chain for later lookups.
        """
        parent = self.p
        node = x
        while parent[node] != node:
            grandparent = parent[parent[node]]
            parent[node] = grandparent
            node = grandparent
        return node

    def union(self, a: int, b: int) -> None:
        """Merge the sets containing ``a`` and ``b`` (no-op if already joined)."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a != root_b:
            # Attach the smaller tree beneath the larger one; on a size tie
            # the root of ``a`` stays the root.
            if self.sz[root_a] < self.sz[root_b]:
                keep, absorb = root_b, root_a
            else:
                keep, absorb = root_a, root_b
            self.p[absorb] = keep
            self.sz[keep] += self.sz[absorb]


@dataclass
class Row:
    """One CSV record that belongs to the group being inspected.

    The CSV loader in this script fills each field from the CSV column of the
    same name; text fields are stripped and fall back to ``""`` when the
    column is missing or empty.
    """

    # Parsed from the ``group_id`` column; only rows equal to --group-id are kept.
    group_id: int
    # Vendor code exactly as spelled in the CSV column; shown in the report.
    vender_code: str
    # Item code; shown in the report and used as the tie-breaker after price
    # when choosing a cluster's winner candidate.
    icode: str
    # Item name; split into words for the report's top-words summary.
    iname: str
    # Parsed with ``_safe_int``; ``None`` when the cell is empty or unparseable.
    price: int | None
    # Thumbnail URL; normalized into at most one extra token per item.
    img_url: str
    # Passed to the HTML image-URL token extractor (presumably raw detail HTML).
    content: str
    # Read from the CSV but not otherwise used by this script.
    dome_code: str


def _safe_int(x) -> int | None:
    """Convert a loosely formatted numeric value to ``int``; ``None`` on failure.

    The value is stringified, stripped, and has thousands separators (``,``)
    removed.  Plain integer text is parsed exactly; anything else (decimals,
    exponent notation) goes through ``float`` and is truncated toward zero.

    Args:
        x: Any value, typically a CSV cell (``str`` or ``None``).

    Returns:
        The parsed integer, or ``None`` when ``x`` is ``None``, blank, not a
        number, or not finite (``inf`` / ``nan``).
    """
    if x is None:
        return None
    try:
        s = str(x).strip().replace(",", "")
        if not s:
            return None
        try:
            # Exact path first: going through float would silently round
            # integers above 2**53.
            return int(s)
        except ValueError:
            # Fallback for "12.5", "1e3", ...; int() truncates toward zero.
            return int(float(s))
    except (TypeError, ValueError, OverflowError):
        # ValueError: not numeric / nan; OverflowError: inf.
        return None


# Upper bound for a single CSV field; 2**31 - 1 fits in a C long on every platform.
_CSV_FIELD_LIMIT = 2**31 - 1


def _load_group_rows(csv_path: str, group_id: int, max_rows: int) -> Tuple[List[Row], bool]:
    """Collect the CSV records whose ``group_id`` column equals ``group_id``.

    Args:
        csv_path: Path of the exported CSV (read as UTF-8, BOM tolerated).
        group_id: Group to extract.
        max_rows: Safety cap on the number of CSV records scanned.

    Returns:
        ``(rows, truncated)`` where ``truncated`` is ``True`` when the scan
        stopped at ``max_rows`` with records still unread.
    """
    # ``content`` is handed to an HTML token extractor, so it is presumably raw
    # HTML and can exceed the csv module's default field limit.  Only ever
    # raise the limit, never lower it.
    if csv.field_size_limit() < _CSV_FIELD_LIMIT:
        csv.field_size_limit(_CSV_FIELD_LIMIT)

    def text(record: dict, key: str) -> str:
        return str(record.get(key) or "").strip()

    rows: List[Row] = []
    truncated = False
    with open(csv_path, "r", encoding="utf-8-sig", errors="replace", newline="") as f:
        for i, d in enumerate(csv.DictReader(f)):
            if i >= max_rows:
                truncated = True
                break
            try:
                gid = int(text(d, "group_id"))
            except ValueError:
                # Blank or non-numeric group_id: the record cannot match.
                continue
            if gid != group_id:
                continue
            rows.append(
                Row(
                    group_id=gid,
                    vender_code=text(d, "vender_code"),
                    icode=text(d, "icode"),
                    iname=text(d, "iname"),
                    price=_safe_int(d.get("price")),
                    img_url=text(d, "img_url"),
                    content=text(d, "content"),
                    dome_code=text(d, "dome_code"),
                )
            )
    return rows, truncated


def _extract_item_tokens(rows: List[Row], tokenizer) -> Tuple[List[List[str]], int]:
    """Build the de-duplicated token list of every row.

    Tokenizing is best-effort: an exception from either tokenizer helper is
    tolerated for that row, but the row is counted so the caller can report it.

    Returns:
        ``(item_tokens, failed)`` with ``item_tokens[i]`` belonging to
        ``rows[i]`` and ``failed`` the number of rows where a helper raised.
    """
    item_tokens: List[List[str]] = []
    failed = 0
    for row in rows:
        toks: list = []
        ok = True
        try:
            toks.extend(tokenizer._extract_img_url_tokens_from_html(row.content))  # type: ignore[attr-defined]
        except Exception:
            ok = False
        # The thumbnail adds at most one (weak) token; an overly common CDN
        # placeholder is dropped later by the document-frequency filter.
        try:
            thumb = tokenizer._normalize_img_url_token(row.img_url)  # type: ignore[attr-defined]
            if thumb:
                toks.append(thumb)
        except Exception:
            ok = False
        if not ok:
            failed += 1
        # dict.fromkeys removes duplicates while keeping first-seen order.
        item_tokens.append(list(dict.fromkeys(toks)))
    return item_tokens, failed


def _cluster_items(item_tokens: List[List[str]], max_df: int, min_shared: int) -> List[List[int]]:
    """Group item indices into connected components of shared tokens.

    Tokens present in more than ``max_df`` items are ignored.  Two items are
    linked when they share at least ``min_shared`` of the remaining tokens,
    and linked items are merged transitively.

    Returns:
        Components as ascending index lists, ordered by size (largest first)
        and then by smallest member index.
    """
    from itertools import combinations

    # Document frequency: in how many items each token occurs.
    df = Counter(t for toks in item_tokens for t in toks)

    # Inverted index token -> item indices, skipping overly common tokens.
    inv: Dict[str, List[int]] = defaultdict(list)
    for idx, toks in enumerate(item_tokens):
        for t in toks:
            if df[t] <= max_df:
                inv[t].append(idx)

    # Count shared tokens per item pair via the sparse index.  Each posting
    # list is ascending and duplicate-free (built in index order from
    # de-duplicated tokens), so every pair comes out as (low, high).
    shared: Counter = Counter()
    for ids in inv.values():
        shared.update(combinations(ids, 2))

    dsu = DSU(len(item_tokens))
    for (a, b), cnt in shared.items():
        if cnt >= min_shared:
            dsu.union(a, b)

    comps: Dict[int, List[int]] = defaultdict(list)
    for i in range(len(item_tokens)):
        comps[dsu.find(i)].append(i)
    return sorted(comps.values(), key=lambda xs: (-len(xs), min(xs)))


def _print_report(rows: List[Row], comp_list: List[List[int]], top: int) -> None:
    """Print the first ``top`` clusters: winner candidate, top words, samples."""
    for ci, ids in enumerate(comp_list[:top], start=1):
        # Simple winner candidate: lowest price first, then icode; rows
        # without a price sort last.  min() keeps the first of equal keys.
        w = rows[
            min(
                ids,
                key=lambda i: (
                    rows[i].price if rows[i].price is not None else 10**18,
                    rows[i].icode,
                ),
            )
        ]

        # Most frequent words (2+ characters) across the cluster's item names.
        wc = Counter(
            tok
            for i in ids
            for tok in str(rows[i].iname).replace("/", " ").replace("-", " ").split()
            if len(tok) >= 2
        )

        print(f"\n--- cluster #{ci} size={len(ids)} ---")
        print(f"[winner_candidate] vender={w.vender_code} icode={w.icode} price={w.price} iname={w.iname[:60]}")
        print("[top_words]", ", ".join([f"{k}({v})" for k, v in wc.most_common(8)]))
        # Up to five sample members.
        for i in ids[:5]:
            rr = rows[i]
            print(f" - {rr.vender_code}/{rr.icode} price={rr.price} iname={rr.iname[:80]}")

    if len(comp_list) > top:
        print(f"\n[INFO] ... {len(comp_list) - top} more clusters not shown (use --top to increase)")


def main() -> None:
    """Parse CLI options, re-cluster one group's items and print a split proposal.

    Reads the rows of ``--group-id`` from ``--csv``, tokenizes each item with
    ``Z_Batch_Create_Content_Token``, links items sharing at least
    ``--min-shared`` tokens (ignoring tokens found in more than
    ``--max-token-df`` items) and prints the resulting clusters.  Output goes
    to stdout only; no file or database is written.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", required=True)
    ap.add_argument("--group-id", type=int, required=True)
    ap.add_argument("--top", type=int, default=20, help="출력할 상위 클러스터 수")
    ap.add_argument("--max-rows", type=int, default=500000, help="CSV scan 상한(안전장치)")
    ap.add_argument("--max-token-df", type=int, default=200, help="너무 흔한 토큰(df>k)은 연결에 사용하지 않음")
    ap.add_argument("--min-shared", type=int, default=2, help="아이템 연결 최소 shared token 수(기본 2)")
    args = ap.parse_args()

    # The token policy must match the script that generates the DB tokens,
    # so its helpers are reused directly.
    import Z_Batch_Create_Content_Token as z  # noqa: N812

    max_rows = int(args.max_rows)
    rows, truncated = _load_group_rows(args.csv, int(args.group_id), max_rows)
    if truncated:
        print(f"[WARN] stopped scanning after {max_rows} rows (--max-rows); the group may be incomplete")

    if not rows:
        print(f"[ERR] no rows found for group_id={args.group_id}")
        return

    item_tokens, failed = _extract_item_tokens(rows, z)
    if failed:
        print(f"[WARN] token extraction raised for {failed} of {len(rows)} items; they may show up as singleton clusters")

    comp_list = _cluster_items(item_tokens, int(args.max_token_df), int(args.min_shared))
    print(f"[INFO] group_id={args.group_id} items={len(rows)} comps={len(comp_list)}")

    # A negative --top would otherwise slice from the end of the list.
    _print_report(rows, comp_list, max(0, int(args.top)))

# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

