# Source code for pycantonese.parsing

import concurrent.futures as cf
import functools
import multiprocessing as mp
import re
import sys
from string import ascii_uppercase

from pycantonese.corpus import CHAT
from pycantonese.jyutping.characters import characters_to_jyutping
from pycantonese.pos_tagging.tagger import pos_tag

# Punctuation marks for utterance segmentation of a single input string.
# ``parse_text`` documents segmentation on the Chinese full-width marks, so
# the full-width "!" (U+FF01) and "?" (U+FF1F) are listed next to their ASCII
# look-alikes; text typed with either form is split the same way.
_UTTERANCE_PUNCT_MARKS = frozenset(("。", "!", "?", "\uff01", "\uff1f"))
# Upper-case ASCII letters, used to decide whether a POS tag starts with a
# letter (and is therefore written to the %mor tier as ``TAG|jyutping``).
_ASCII_UPPERCASE = frozenset(ascii_uppercase)

# Participant code used when the caller does not supply one.
_UNKNOWN_PARTICIPANT = "X"

# True when running on the Emscripten platform (e.g. Pyodide).
_IS_WASM = sys.platform == "emscripten"
try:
    _CPU_COUNT = mp.cpu_count()
except NotImplementedError:
    # cpu_count() raises when the number of CPUs cannot be determined;
    # importing this module must not fail because of that.
    _CPU_COUNT = 1
# Number of utterances handed to a worker process at a time; together with
# _CPU_COUNT it also sets the minimum input size worth parallelizing.
_CHUNK_SIZE = 4


def _parse_text(text: str, pos_tag_kwargs):
    """Split ``text`` into words and return words, POS tags and Jyutping.

    Args:
        text: Raw text handed to ``characters_to_jyutping``.
        pos_tag_kwargs: Optional dict of keyword arguments forwarded to
            ``pos_tag``; a falsy value means "no extra arguments".

    Returns:
        A ``(segmented, tags, jyutping)`` tuple of three lists: the word
        chunks, the tag taken from each pair returned by ``pos_tag``, and
        the Jyutping paired with each word chunk.
    """
    # Each item produced by characters_to_jyutping is a (chars, jps) pair.
    word_jp_pairs = [(chars, jps) for chars, jps in characters_to_jyutping(text)]
    segmented = [chars for chars, _ in word_jp_pairs]
    jyutping = [jps for _, jps in word_jp_pairs]

    tagger_options = pos_tag_kwargs if pos_tag_kwargs else {}
    tagged = pos_tag(segmented, **tagger_options)
    tags = [pos for _, pos in tagged]

    return segmented, tags, jyutping


def _get_utterance(unparsed_sent, pos_tag_kwargs, participant):
    """Parse one utterance into a ``(participant, words_str, mor_str)`` tuple.

    Args:
        unparsed_sent: Either the raw utterance text, or a tuple whose first
            two items are ``(participant, text)`` (further items are
            ignored).
        pos_tag_kwargs: Optional dict of keyword arguments forwarded to the
            part-of-speech tagger via ``_parse_text``.
        participant: Explicit participant. When it is not ``None`` it takes
            precedence over a participant carried inside a tuple, which is
            the behavior ``parse_text`` documents. When it is ``None``, the
            tuple's participant is used, and the dummy participant
            ``_UNKNOWN_PARTICIPANT`` is the last resort.

    Returns:
        A tuple of three strings: the participant, the space-joined words,
        and the space-joined %mor items. The last two are empty strings when
        there is no text to parse.
    """
    tuple_participant = None
    # Unpack before the emptiness check so that ("A", "") or ("A", None) is
    # recognized as an empty utterance instead of being sent to the parser.
    # An empty tuple is left alone and handled by the emptiness check below.
    if unparsed_sent and isinstance(unparsed_sent, tuple):
        tuple_participant, unparsed_sent, *_ = unparsed_sent

    if participant is None:
        participant = tuple_participant
    if participant is None:
        participant = _UNKNOWN_PARTICIPANT
    participant = str(participant)

    if not unparsed_sent:
        return (participant, "", "")

    words, tags, jps = _parse_text(unparsed_sent, pos_tag_kwargs)

    words_str = " ".join(words)
    mor_items = []
    for word, pos, jp in zip(words, tags, jps):
        # Punctuation, a missing tag, or a tag that does not start with an
        # ASCII letter: keep the word itself on the %mor tier.
        if not pos or pos == "PUNCT" or pos[0].upper() not in _ASCII_UPPERCASE:
            mor_items.append(word)
        else:
            # CHAT %mor uses space as the inter-word delimiter, so the
            # per-word Jyutping must be space-free.
            mor_items.append(f"{pos}|{(jp or '').replace(' ', '')}")
    mor_str = " ".join(mor_items)

    return (participant, words_str, mor_str)


def parse_text(
    data,
    *,
    pos_tag_kwargs=None,
    participant: str | None = None,
    parallel: bool = True,
) -> CHAT:
    """Parse raw Cantonese text.

    Args:
        data (str or Iterable[str] or Iterable[tuple[str, str]]):
            Raw Cantonese text data, in one of the following formats:

            - A single string, e.g., ``"廣東話好難學?都唔係吖!"`` (which would
              be two utterances). Basic utterance segmentation (i.e.,
              splitting by the end-of-line character ``\\n`` or after one of
              the punctuation marks in ``_UTTERANCE_PUNCT_MARKS``, e.g.,
              "。", "!", "?") will be applied to this string, and each
              segmented utterance will be an utterance in the resulting
              CHAT reader. Segments made up only of whitespace are dropped.
            - An iterable of strings, e.g.,
              ``["廣東話好難學?", "都唔係吖!"]``. No utterance segmentation
              will be done. Use this option to pass in data that's
              utterance-segmented to your liking.
            - An iterable of tuples, where each tuple has two strings,
              one for the participant and the other for the utterance, e.g.,
              ``[("小芬", "你食咗飯未呀?"), ("小明", "我食咗喇。")]``.

            Any iterable (including a generator) is accepted; it is
            consumed once. If an empty input or ``None`` is provided, then
            an empty :class:`~pycantonese.CHAT` instance is returned.
        pos_tag_kwargs (dict, optional): To customize part-of-speech tagging,
            provide a dictionary here which would then be passed as keyword
            arguments to :func:`~pycantonese.pos_tag`.
        participant (str, optional): If provided, this will be the
            participant in the output CHAT-formatted data (and will override
            all the participants if your input to ``data`` is an iterable of
            tuples). If not provided, a default dummy participant ``"X"`` is
            used when your ``data`` is either a single string or an iterable
            of strings.
        parallel (bool, optional): If ``True`` (the default), this function
            attempts to parallelize parsing for speed-up. (In case the data
            volume is very small, the parsing is not parallelized even if
            you pass in ``True``.) Under certain circumstances (e.g., your
            application is already parallelized and further parallelization
            from within this function might be undesirable), you may like to
            consider setting this parameter to ``False``.

    Returns:
        :class:`~pycantonese.CHAT`
    """
    if not data:
        return CHAT()

    if isinstance(data, str):
        # Perform basic sentence segmentation.
        for punct in _UTTERANCE_PUNCT_MARKS:
            data = data.replace(punct, f"{punct}\n")
        data = data.replace("\r\n", "\n")
        data = re.sub(r"\n{2,}", "\n", data)
        # Whitespace-only segments carry no utterance; drop them so that a
        # blank string counts as empty input.
        data = [seg for seg in data.strip().split("\n") if seg.strip()]
    elif not isinstance(data, (list, tuple)):
        # Generators and other one-shot iterables have no len(); materialize
        # them so the size check and the executor can both use the data.
        data = list(data)

    if not data:
        return CHAT()

    # Disable parallelization in Pyodide (no subprocess/thread support).
    if _IS_WASM:
        parallel = False

    # If there's not much data, don't bother with parallelization.
    if parallel and len(data) < (_CPU_COUNT * _CHUNK_SIZE):
        parallel = False

    if parallel:
        func = functools.partial(
            _get_utterance,
            pos_tag_kwargs=pos_tag_kwargs,
            participant=participant,
        )
        with cf.ProcessPoolExecutor() as executor:
            utterances = list(executor.map(func, data, chunksize=_CHUNK_SIZE))
    else:
        utterances = [
            _get_utterance(sent, pos_tag_kwargs, participant) for sent in data
        ]

    # Build CHAT-format string: headers, then one main tier per utterance
    # followed by its %mor tier when there is one.
    all_participants = sorted({p for p, _, _ in utterances})
    lines = ["@Begin"]
    parts = ", ".join(f"{p} Other" for p in all_participants)
    lines.append(f"@Participants:\t{parts}")
    for p, words_str, mor_str in utterances:
        lines.append(f"*{p}:\t{words_str}")
        if mor_str:
            lines.append(f"%mor:\t{mor_str}")
    lines.append("@End")

    return CHAT.from_strs(["\n".join(lines)], strict=False)