#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Quick smoke test for content image URL token extraction.

Usage:
  python test_content_token_extract_samples.py
"""

from __future__ import annotations

from typing import Callable


def _run(name: str, html: str, extract: Callable[[str], list[str]]) -> None:
    """Run *extract* on one HTML sample and print the tokens it returns.

    Args:
        name: Label printed in the section header for this sample.
        html: Sample text handed unchanged to *extract*.
        extract: Callable that takes the sample text and returns a list of
            tokens.

    Prints a blank line, a ``=== name ===`` header, the token count, and then
    one `` - token`` line per token. Nothing is returned.
    """
    toks = extract(html)
    print(f"\n=== {name} ===")
    print(f"tokens({len(toks)}):")
    for t in toks:
        print(" -", t)


def main() -> None:
    """Feed every built-in sample through the batch tokenizer and print results.

    The tokenizer module is imported inside the function, so importing this
    file does not by itself require ``Z_Batch_Create_Content_Token`` to be
    importable; an ImportError surfaces only when ``main`` is called.
    """
    # Prefer the batch tokenizer (it is the canonical policy for DB tokens)
    import Z_Batch_Create_Content_Token as z  # noqa: N812

    # Bind the function directly; wrapping it in a lambda only to forward the
    # single argument adds nothing.
    extract = z._extract_img_url_tokens_from_html  # type: ignore[attr-defined]

    # NOTE(review): every payload below is whitespace-only (or empty) in the
    # version of the file under review -- the HTML snippets that the case
    # names describe are not present. Confirm against the original file and
    # restore them; until then this smoke test exercises nothing meaningful.
    # The literals carried an ``r`` prefix originally; it is omitted here
    # because the visible payloads consist solely of line breaks. Reinstate
    # raw literals when real HTML (with backslash-escaped quotes) is restored.
    samples: list[tuple[str, str]] = [
        (
            "escaped_quotes_and_no_ext",
            "\n\n"
            "\n\n",
        ),
        (
            "unquoted_src",
            "\n\n",
        ),
        (
            "comma_broken_img_tag",
            "",
        ),
        (
            "srcset",
            "\n\n",
        ),
        (
            "relative_path",
            "\n\n",
        ),
        (
            "broken_scheme",
            "\n\n",
        ),
        (
            "scheme_less",
            "\n\n",
        ),
        (
            "data_uri_should_be_dropped_by_normalize",
            "\n\n",
        ),
    ]

    for name, html in samples:
        _run(name, html, extract)


if __name__ == "__main__":
    main()