# Source code for wcwidth._wcswidth

"""This is a python implementation of wcswidth()."""

from __future__ import annotations

from typing import Optional

__lazy_modules__ = [
    "wcwidth._constants",
    "wcwidth._wcwidth",
    "wcwidth.bisearch",
    "wcwidth.table_grapheme",
    "wcwidth.table_vs16",
]
# local
from . import table_grapheme_overrides
from ._wcwidth import wcwidth
from .bisearch import bisearch
from ._constants import (_EMOJI_ZWJ_SET,
                         _ISC_VIRAMA_SET,
                         _CATEGORY_MC_TABLE,
                         _FITZPATRICK_RANGE,
                         _REGIONAL_INDICATOR_SET,
                         resolve_terminal,
                         get_term_overrides)
from .table_vs15 import VS15_WIDE_TO_NARROW
from .table_vs16 import VS16_NARROW_TO_WIDE
from .table_grapheme import GRAPHEME_EXTEND


def _scan_zwj_cluster_end(text: str, start: int, end: int) -> int:
    """
    Scan forward from *start* (base character) to end of a ZWJ grapheme cluster.

    Follows the UAX #29 GB11 pattern (ExtPict Extend* ZWJ x ExtPict) chained repeatedly until no
    more ZWJ joins are found.
    """
    idx = start + 1
    # Skip Extend characters (Fitzpatrick modifiers, etc.) before first ZWJ
    while idx < end and bisearch(ord(text[idx]), GRAPHEME_EXTEND):
        idx += 1
    # Follow ZWJ chains
    while idx < end:
        if ord(text[idx]) != 0x200D:
            break
        idx += 1
        # GB11: \p{ExtPict} Extend* ZWJ × \p{ExtPict}
        # Extend modifiers (VS16, Fitzpatrick skin tones, etc.) attach to
        # the ExtPict *before* the ZWJ, not after it.  After ZWJ the next
        # codepoint is always an ExtPict directly, no Extend skip needed.
        if idx < end and ord(text[idx]) in _EMOJI_ZWJ_SET:
            idx += 1
            # Skip trailing Extend (VS16, etc.) after ExtPict before next ZWJ
            while idx < end and bisearch(ord(text[idx]), GRAPHEME_EXTEND):
                idx += 1
            continue
        break
    return idx


def wcswidth(
    pwcs: str,
    n: Optional[int] = None,
    unicode_version: str = 'auto',
    ambiguous_width: int = 1,
) -> int:
    """
    Given a unicode string, return its printable length on a terminal.

    See :ref:`Specification` for details of cell measurement.

    This implementation differs from Markus Kuhn's original POSIX C implementation, in that this
    ``wcswidth()`` processes grapheme strings yielded by :func:`wcwidth.iter_graphemes` defined by
    `Unicode Standard Annex #29`_. POSIX wcswidth(3) is not grapheme-aware and does not measure
    many kinds of Emojis or complex scripts correctly.

    :param pwcs: Measure width of given unicode string.
    :param n: When ``n`` is None (default), return the length of the entire string, otherwise only
        the first ``n`` characters are measured. Values larger than ``len(pwcs)`` are clamped to
        the length of the string.
    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
           Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A) characters. Default is ``1``
        (narrow). Set to ``2`` for CJK contexts.
    :returns: The width, in cells, needed to display the first ``n`` characters of the unicode
        string ``pwcs``. Returns ``-1`` for C0 and C1 control characters!

    .. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/
    """
    # pylint: disable=unused-argument,too-many-locals,too-many-statements,redefined-variable-type
    # pylint: disable=too-complex,too-many-branches,duplicate-code,too-many-nested-blocks

    # Fast path: pure ASCII printable strings are always width == length
    if n is None and pwcs.isascii() and pwcs.isprintable():
        return len(pwcs)

    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    # Clamp to the string length: "at most n characters" must not index past the end of pwcs.
    end = len(pwcs) if n is None else min(n, len(pwcs))
    total_width = 0
    idx = 0
    # Widths accumulate in cluster_width and are flushed into total_width when the next
    # width-bearing character begins a new cluster (and once more after the loop).
    last_measured_idx = -2  # -2 sentinel blocks VS16/VS15 (no base available)
    last_measured_ucs = -1
    last_measured_w = 0
    prev_was_virama = False
    cluster_width = 0
    vs16_nw_table = VS16_NARROW_TO_WIDE['9.0.0']
    vs15_wn_table = VS15_WIDE_TO_NARROW['9.0.0']
    _bisearch = bisearch
    while idx < end:
        char = pwcs[idx]
        ucs = ord(char)

        # 5. ZWJ (U+200D): consumed without contributing width.
        # Virama codepoints are treated as zero-width combining marks (Mn). When a
        # virama+consonant sequence forms a conjunct, its width is capped at 2 cells.
        # ZWJ (U+200D)
        if ucs == 0x200D:
            if prev_was_virama:
                # Keep the virama state so the following consonant joins the conjunct.
                idx += 1
            elif idx + 1 < end:
                # Consume the ZWJ together with the character it joins.
                last_measured_w = 0
                prev_was_virama = False
                idx += 2
            else:
                prev_was_virama = False
                idx += 1
            continue

        # 6. VS16 (U+FE0F): converts preceding narrow character to wide.
        if ucs == 0xFE0F and last_measured_idx >= 0:
            if _bisearch(last_measured_ucs, vs16_nw_table):
                cluster_width = 2
            last_measured_idx = -2  # prevent double application
            idx += 1
            continue

        # VS15 (U+FE0E): text variation selector, requests narrow presentation.
        if ucs == 0xFE0E and last_measured_idx >= 0:
            if _bisearch(last_measured_ucs, vs15_wn_table) and last_measured_w == 2:
                # The pending cluster still carries width 2; offset it by one cell.
                total_width -= 1
            idx += 1
            continue

        # 7. Regional Indicator & Fitzpatrick (both above BMP)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Count the run of regional indicators immediately before this one.
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # Second half of a pair: adds no width of its own.
                    last_measured_ucs = ucs
                    idx += 1
                    continue
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                # Skin-tone modifier following an emoji base: adds no width.
                idx += 1
                continue

        # 8. Normal character: measure with wcwidth
        w = _wcwidth(char)
        if w < 0:
            # C0/C1 control character
            return -1
        if w > 0:
            if prev_was_virama:
                # virama+consonant extends the current cluster, capped at 2 cells
                cluster_width = 2
            elif cluster_width:
                # flush the previous cluster and start a new one
                total_width += cluster_width
                cluster_width = w
            else:
                cluster_width = w
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_measured_w = w
            prev_was_virama = False
        elif ucs in _ISC_VIRAMA_SET:
            prev_was_virama = True
        elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character
            cluster_width = 2
            last_measured_idx = -2
            prev_was_virama = False
        else:
            prev_was_virama = False
        idx += 1

    # Flush the final pending cluster.
    if cluster_width:
        total_width += cluster_width
    return total_width
[docs] def wcstwidth( pwcs: str, n: Optional[int] = None, unicode_version: str = 'auto', ambiguous_width: int = 1, term_program: bool | str = True, ) -> int: """ Given a unicode string, return its printable length on a terminal given by ``term_program``. See :ref:`Specification` for details of cell measurement. Unlike :func:`wcswidth`, this function applies per-terminal correction tables for emoji presentation and grapheme clusters. :param pwcs: Measure width of given unicode string. :param n: When ``n`` is None (default), return the length of the entire string, otherwise only the first ``n`` characters are measured. :param unicode_version: Ignored. Retained for backwards compatibility. :param ambiguous_width: Width to use for East Asian Ambiguous (A) characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. :param term_program: Terminal software identifier for table correction. ``True`` (default) reads the ``TERM_PROGRAM`` or ``TERM`` environment variable for auto-detection. ``False`` disables override lookup. Accepts a canonical terminal name matching :func:`list_term_programs`, such as from XTVERSION_, ENQ_, or ``TERM_PROGRAM``. .. versionadded:: 0.8.0 :returns: The width, in cells, needed to display the first ``n`` characters of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control characters! """ # pylint: disable=unused-argument,too-many-locals,too-many-statements,redefined-variable-type # pylint: disable=too-complex,too-many-branches,duplicate-code,too-many-nested-blocks # This function intentionally keeps all logic inline for performance. # Fast path: pure ASCII printable strings are always width == length if n is None and pwcs.isascii() and pwcs.isprintable(): return len(pwcs) # Resolve terminal software for override lookup term_canonical = resolve_terminal(term_program) # Skip override lookup when no terminal detected (avoids lru_cache call overhead). # Extract locals for hot-loop performance (NamedTuple attribute access is slow). 
if term_canonical: overrides = get_term_overrides(term_canonical) _narrower = overrides.narrower _vs16_narrower = overrides.vs16_narrower _vs15_wider = overrides.vs15_wider _zeroer = overrides.zeroer _narrow_wider = overrides.narrow_wider _narrow_zeroer = overrides.narrow_zeroer _grapheme_overrides = table_grapheme_overrides.get(term_canonical) else: _narrower = () _vs16_narrower = () _vs15_wider = () _zeroer = () _narrow_wider = () _narrow_zeroer = () _grapheme_overrides = {} # Select wcwidth call pattern for best lru_cache performance _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) end = len(pwcs) if n is None else n total_width = 0 idx = 0 # grapheme-clustering state and local re-binding for performance. # Widths accumulate in cluster_width and flush at boundaries. A cluster is a base character # plus combining marks, deferring the flush lets grapheme overrides replace the measured width # retrospectively. last_measured_idx = -2 # -2 sentinel blocks VS16/VS15 (no base available) last_measured_ucs = -1 last_measured_w = 0 prev_was_virama = False cluster_start = -1 total_before_cluster = 0 cluster_width = 0 vs16_nw_table = VS16_NARROW_TO_WIDE['9.0.0'] vs15_wn_table = VS15_WIDE_TO_NARROW['9.0.0'] _bisearch = bisearch while idx < end: char = pwcs[idx] ucs = ord(char) # # Much of the logic below matches the logic in width(), but is repeated for improved # performance, they are given matching index reference numbers (starting at #5). # # 5. ZWJ (U+200D): consumed without contributing width. # Virama codepoints are treated as zero-width combining marks (Mn). 
When a # virama+consonant sequence forms a conjunct, its width is capped at 2 cells # matching behavior of popular terminals (PR #224) # ZWJ (U+200D) if ucs == 0x200D: if prev_was_virama: idx += 1 elif idx + 1 < end: # Check for terminal grapheme override when base char is ExtPict/RI if (_grapheme_overrides and last_measured_idx >= 0 and last_measured_ucs in _EMOJI_ZWJ_SET): cluster_end = _scan_zwj_cluster_end(pwcs, last_measured_idx, end) cluster = pwcs[last_measured_idx:cluster_end] override_w = _grapheme_overrides.get(cluster) if override_w is not None: total_width += (override_w - last_measured_w) last_measured_idx = -2 last_measured_ucs = -1 last_measured_w = 0 prev_was_virama = False cluster_start = -1 idx = cluster_end continue # No override; ZWJ breaks VS adjacency. # VS16 already set last_measured_idx = -2, blocking further VS16. last_measured_w = 0 prev_was_virama = False idx += 2 else: prev_was_virama = False idx += 1 continue # 6. VS16 (U+FE0F): converts preceding narrow character to wide. if ucs == 0xFE0F and last_measured_idx >= 0: if _vs16_narrower and _bisearch(last_measured_ucs, _vs16_narrower): pass elif _bisearch(last_measured_ucs, vs16_nw_table): cluster_width = 2 last_measured_idx = -2 # prevent double application idx += 1 continue # VS15 (U+FE0E): text variation selector, requests narrow presentation. if ucs == 0xFE0E and last_measured_idx >= 0: base_ucs = last_measured_ucs vs15_narrow = bisearch(base_ucs, vs15_wn_table) if _vs15_wider and bisearch(base_ucs, _vs15_wider): vs15_narrow = False if vs15_narrow and last_measured_w == 2: total_width -= 1 idx += 1 continue # 7. 
Regional Indicator & Fitzpatrick (both above BMP) if ucs > 0xFFFF: if ucs in _REGIONAL_INDICATOR_SET: ri_before = 0 j = idx - 1 while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET: ri_before += 1 j -= 1 if ri_before % 2 == 1: last_measured_ucs = ucs idx += 1 continue elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] and last_measured_ucs in _EMOJI_ZWJ_SET): idx += 1 continue # 8. Normal character: measure with wcwidth w = _wcwidth(char) if w < 0: # C0/C1 control character return -1 # Apply single-codepoint terminal overrides (pre-merged tuples) if w == 2 and _narrower and bisearch(ucs, _narrower): w = 1 elif w == 2 and _zeroer and bisearch(ucs, _zeroer): w = 0 if w == 1 and _narrow_wider and bisearch(ucs, _narrow_wider): w = 2 elif w == 1 and _narrow_zeroer and bisearch(ucs, _narrow_zeroer): w = 0 if w > 0: # virama+consonant extends current cluster; otherwise start new if prev_was_virama: cluster_width = 2 elif cluster_width: # flush previous cluster, check for grapheme overrides flushed = False if _grapheme_overrides and cluster_start >= 0: # Two-phase override lookup: candidate (cluster+current) catches Lo+Lo pairs # where both chars bear width (Thai KO KAI + SARA AM). cluster_text (cluster # alone) catches C+Mc clusters where the override key is shorter. 
candidate = pwcs[cluster_start:idx + 1] override_w = _grapheme_overrides.get(candidate) if override_w is not None: total_width = total_before_cluster + override_w flushed = True cluster_width = 0 else: cluster_text = pwcs[cluster_start:idx] override_w = _grapheme_overrides.get(cluster_text) if override_w is not None: total_width = total_before_cluster + override_w else: total_width += cluster_width else: total_width += cluster_width if not flushed: cluster_width = w cluster_start = idx total_before_cluster = total_width else: cluster_width = w cluster_start = idx total_before_cluster = total_width last_measured_idx = idx last_measured_ucs = ucs last_measured_w = w prev_was_virama = False elif ucs in _ISC_VIRAMA_SET: prev_was_virama = True elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE): # Spacing Combining Mark (Mc) following a base character cluster_width = 2 last_measured_idx = -2 prev_was_virama = False else: prev_was_virama = False idx += 1 if cluster_width: if _grapheme_overrides and cluster_start >= 0: cluster_text = pwcs[cluster_start:end] override_w = _grapheme_overrides.get(cluster_text) if override_w is not None: total_width = total_before_cluster + override_w else: total_width += cluster_width else: total_width += cluster_width return total_width