Source code for pycantonese.corpus

from __future__ import annotations

import functools
import itertools
import os
import re
import sys
from collections.abc import Sequence
from typing import cast

from pycantonese._rust import Chat as _RustChat, Token, Utterance
from pycantonese.search import _perform_search

# True when the interpreter reports the "emscripten" platform (a WebAssembly
# build). The loaders in this module pass ``parallel=not _IS_WASM`` so that
# parallel parsing is switched off in that environment.
_IS_WASM = sys.platform == "emscripten"


def _flatten(iterable):
    """Flatten one level of nesting."""
    return list(itertools.chain.from_iterable(iterable))


class CHAT:
    """A reader for Cantonese CHAT corpus data.

    This class wraps a Rust-backed CHAT parser and provides
    Cantonese-specific functionality such as Jyutping extraction,
    character-level access, and corpus search.

    Attributes that are not defined on this class are looked up on the
    wrapped reader (see :meth:`__getattr__`).
    """

    def __init__(
        self,
        chat: _RustChat | None = None,
    ):
        """Wrap an existing Rust-backed reader, or create an empty one.

        Args:
            chat (_RustChat, optional): The underlying reader to wrap.
                If None, a new ``_RustChat()`` is created.
        """
        self._chat = chat if chat is not None else _RustChat()

    @classmethod
    def from_zip(
        cls,
        path: str | os.PathLike[str],
        *,
        match: str | None = None,
        extension=".cha",
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from a ZIP file.

        Args:
            path (str or os.PathLike[str]): Path to the ZIP file.
            match (str, optional): Glob pattern to match filenames within
                the ZIP.
            extension (str, optional): File extension to match.
                Default is ``".cha"``.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to
                treat as the morphology tier, e.g. ``"%mor"`` or
                ``"%xmor"``. Default is ``"%mor"``. Set to None to disable
                mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to
                treat as the grammatical relation tier, e.g. ``"%gra"`` or
                ``"%xgra"``. Default is ``"%gra"``. Set to None to disable
                mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_zip(
                os.fspath(path),
                match=match,
                extension=extension,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_dir(
        cls,
        path: str | os.PathLike[str],
        *,
        match: str | None = None,
        extension=".cha",
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from a directory.

        Args:
            path (str or os.PathLike[str]): Path to the directory.
            match (str, optional): Glob pattern to match filenames within
                the directory.
            extension (str, optional): File extension to match.
                Default is ``".cha"``.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to
                treat as the morphology tier, e.g. ``"%mor"`` or
                ``"%xmor"``. Default is ``"%mor"``. Set to None to disable
                mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to
                treat as the grammatical relation tier, e.g. ``"%gra"`` or
                ``"%xgra"``. Default is ``"%gra"``. Set to None to disable
                mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_dir(
                os.fspath(path),
                match=match,
                extension=extension,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_files(
        cls,
        paths: Sequence[str | os.PathLike[str]],
        *,
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from file paths.

        Args:
            paths (Sequence[str | os.PathLike[str]]): Paths to CHAT files.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to
                treat as the morphology tier, e.g. ``"%mor"`` or
                ``"%xmor"``. Default is ``"%mor"``. Set to None to disable
                mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to
                treat as the grammatical relation tier, e.g. ``"%gra"`` or
                ``"%xgra"``. Default is ``"%gra"``. Set to None to disable
                mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_files(
                [os.fspath(p) for p in paths],
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_strs(
        cls,
        strs,
        *,
        ids=None,
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from strings.

        Args:
            strs (list[str]): CHAT-formatted strings.
            ids (list[str], optional): Identifiers for each string.
            parallel (bool, optional): If True, parse strings in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to
                treat as the morphology tier, e.g. ``"%mor"`` or
                ``"%xmor"``. Default is ``"%mor"``. Set to None to disable
                mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to
                treat as the grammatical relation tier, e.g. ``"%gra"`` or
                ``"%xgra"``. Default is ``"%gra"``. Set to None to disable
                mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_strs(
                strs,
                ids=ids,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_git(
        cls,
        url: str,
        *,
        rev: str | None = None,
        depth: int | None = None,
        match: str | None = None,
        extension=".cha",
        cache_dir: str | os.PathLike[str] | None = None,
        force_download=False,
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from a Git repository.

        Args:
            url (str): URL of the Git repository.
            rev (str, optional): Git revision (branch, tag, or commit hash).
            depth (int, optional): Clone depth for shallow clones.
            match (str, optional): Glob pattern to match filenames within
                the repository.
            extension (str, optional): File extension to match.
                Default is ``".cha"``.
            cache_dir (str or os.PathLike[str], optional): Directory to
                cache the cloned repository.
            force_download (bool, optional): If True, force re-download
                even if cached.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to
                treat as the morphology tier, e.g. ``"%mor"`` or
                ``"%xmor"``. Default is ``"%mor"``. Set to None to disable
                mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to
                treat as the grammatical relation tier, e.g. ``"%gra"`` or
                ``"%xgra"``. Default is ``"%gra"``. Set to None to disable
                mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_git(
                url,
                rev=rev,
                depth=depth,
                match=match,
                extension=extension,
                cache_dir=os.fspath(cache_dir) if cache_dir is not None else None,
                force_download=force_download,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_url(
        cls,
        url: str,
        *,
        match: str | None = None,
        extension=".cha",
        cache_dir: str | os.PathLike[str] | None = None,
        force_download=False,
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from a URL pointing to a ZIP archive.

        Args:
            url (str): URL of the ZIP archive.
            match (str, optional): Glob pattern to match filenames within
                the archive.
            extension (str, optional): File extension to match.
                Default is ``".cha"``.
            cache_dir (str or os.PathLike[str], optional): Directory to
                cache the downloaded archive.
            force_download (bool, optional): If True, force re-download
                even if cached.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to
                treat as the morphology tier, e.g. ``"%mor"`` or
                ``"%xmor"``. Default is ``"%mor"``. Set to None to disable
                mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to
                treat as the grammatical relation tier, e.g. ``"%gra"`` or
                ``"%xgra"``. Default is ``"%gra"``. Set to None to disable
                mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_url(
                url,
                match=match,
                extension=extension,
                cache_dir=os.fspath(cache_dir) if cache_dir is not None else None,
                force_download=force_download,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_utterances(cls, utterances):
        """Construct a CHAT reader from a list of utterances.

        Creates a new reader containing a single virtual file with the
        given utterances. Useful for splitting a reader into sub-readers
        based on utterance boundaries.

        Args:
            utterances (Sequence[Utterance]): Utterance objects to include.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(_RustChat.from_utterances(utterances))

    def __getattr__(self, name):
        """Delegate attribute lookups that fail on this object to ``_chat``.

        Python calls this hook only after normal attribute lookup has
        failed.

        Args:
            name (str): The attribute being looked up.

        Returns:
            The attribute of the same name on the wrapped reader.

        Raises:
            AttributeError: If the wrapped reader has no such attribute, or
                if ``_chat`` itself has not been set on this instance.
        """
        # ``_chat`` only reaches this hook when ``__init__`` has not run,
        # e.g. on an instance made with ``__new__`` alone, as ``copy`` and
        # ``pickle`` do before probing it for ``__setstate__``. Reading
        # ``self._chat`` below would then re-enter this method without end
        # (RecursionError), so report the attribute as missing instead.
        if name == "_chat":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self._chat, name)

    def tokens(
        self,
        *,
        by_utterance=False,
        by_file=False,
    ) -> list[Token] | list[list[Token]] | list[list[list[Token]]]:
        """Return the tokens.

        Args:
            by_utterance (bool, optional): If True, return tokens grouped
                by utterance.
            by_file (bool, optional): If True, return tokens grouped by
                file.

        Returns:
            list
        """
        return self._chat.tokens(
            by_utterance=by_utterance,
            by_file=by_file,
        )

    def words(
        self,
        *,
        by_utterance=False,
        by_file=False,
    ) -> list[str] | list[list[str]] | list[list[list[str]]]:
        """Return the words.

        Args:
            by_utterance (bool, optional): If True, return words grouped by
                utterance.
            by_file (bool, optional): If True, return words grouped by
                file.

        Returns:
            list
        """
        return self._chat.words(by_utterance=by_utterance, by_file=by_file)

    def jyutping(
        self,
        *,
        by_utterance=False,
        by_file=False,
    ) -> list[str | None] | list[list[str | None]] | list[list[list[str | None]]]:
        """Return the data in Jyutping romanization.

        Args:
            by_utterance (bool, optional): If True, return Jyutping grouped
                by utterance.
            by_file (bool, optional): If True, return Jyutping grouped by
                file.

        Returns:
            list
        """
        return self._chat.jyutping(by_utterance=by_utterance, by_file=by_file)

    @staticmethod
    def _get_chars_from_sent(sent: list[str]) -> list[str]:
        """Split the words of one utterance into characters.

        A non-empty word whose first character lies in U+4E00..U+9FFF is
        broken into its individual characters; every other word (including
        the empty string) is kept whole.

        Args:
            sent (list[str]): The words of one utterance.

        Returns:
            list[str]: Characters and unsplit words, in order.
        """
        result = []
        for word in sent:
            # Only the first character decides whether the word is split.
            if word and "\u4e00" <= word[0] <= "\u9fff":
                result.extend(list(word))
            else:
                result.append(word)
        return result

    def characters(
        self,
        *,
        by_utterance=False,
        by_file=False,
    ) -> list[str] | list[list[str]] | list[list[list[str]]]:
        """Return the data in individual Chinese characters.

        Args:
            by_utterance (bool, optional): If True, return characters
                grouped by utterance.
            by_file (bool, optional): If True, return characters grouped by
                file.

        Returns:
            list
        """
        # Always start from the fully nested form (file > utterance > word)
        # and flatten afterwards according to the requested grouping.
        sents = cast(
            list[list[list[str]]],
            self.words(by_utterance=True, by_file=True),
        )
        result = [
            [self._get_chars_from_sent(sent) for sent in sents_for_file]
            for sents_for_file in sents
        ]
        if by_file and by_utterance:
            pass
        elif by_file and not by_utterance:
            result = [_flatten(f) for f in result]
        elif not by_file and by_utterance:
            result = _flatten(result)
        else:
            result = _flatten(_flatten(f) for f in result)
        return result

    def word_ngrams(self, n: int):
        """Return word n-grams across all utterances.

        N-grams do not cross utterance boundaries.

        Args:
            n (int): The n-gram order (1 for unigrams, 2 for bigrams, etc.).

        Returns:
            Ngrams
        """
        return self._chat.word_ngrams(n)

    def search(
        self,
        *,
        onset=None,
        nucleus=None,
        coda=None,
        tone=None,
        initial=None,
        final=None,
        jyutping=None,
        character=None,
        pos=None,
        word_range=(0, 0),
        utterance_range=(0, 0),
        by_token=True,
        by_utterance=False,
        by_file=False,
    ):
        """Search the data for the given criteria.

        Args:
            onset (str, optional): Onset to search for. A regex is
                supported.
            nucleus (str, optional): Nucleus to search for. A regex is
                supported.
            coda (str, optional): Coda to search for. A regex is supported.
            tone (str, optional): Tone to search for. A regex is supported.
            initial (str, optional): Initial to search for. A regex is
                supported.
            final (str, optional): Final to search for.
            jyutping (str, optional): Jyutping romanization of one
                Cantonese character to search for.
            character (str, optional): One or more Cantonese characters to
                search for.
            pos (str, optional): A part-of-speech tag to search for.
                A regex is supported.
            word_range (tuple[int, int], optional): Span of words around a
                match. Default is ``(0, 0)``.
            utterance_range (tuple[int, int], optional): Span of utterances
                around a match. Default is ``(0, 0)``.
            by_token (bool, optional): If True, return Token objects.
                Otherwise return word strings.
            by_utterance (bool, optional): If True, return full utterances
                containing matches.
            by_file (bool, optional): If True, return data organized by
                file.

        Returns:
            list
        """
        tagged_sents = self.tokens(
            by_utterance=True,
            by_file=True,
        )
        result_by_files = _perform_search(
            tagged_sents,
            onset=onset,
            nucleus=nucleus,
            coda=coda,
            tone=tone,
            initial=initial,
            final=final,
            jyutping=jyutping,
            character=character,
            pos=pos,
            word_range=word_range,
            utterance_range=utterance_range,
            by_token=by_token,
            by_utterance=by_utterance,
        )
        if by_file:
            return result_by_files
        else:
            return _flatten(result_by_files)

    def utterances(self, *, by_file=False) -> list[Utterance] | list[list[Utterance]]:
        """Return the utterances.

        Args:
            by_file (bool, optional): If True, return utterances grouped by
                file.

        Returns:
            list[Utterance] | list[list[Utterance]]
        """
        return self._chat.utterances(by_file=by_file)

    def to_strs(self):
        """Return the data as CHAT-formatted strings.

        Returns:
            list[str]
        """
        return self._chat.to_strs()

    def to_files(self, dir_path: str | os.PathLike[str], *, filenames=None):
        """Write CHAT (.cha) files to a directory.

        Args:
            dir_path (str or os.PathLike[str]): Output directory path.
            filenames (list[str], optional): Filenames for each file.
        """
        self._chat.to_files(os.fspath(dir_path), filenames=filenames)

    @property
    def n_files(self):
        """The number of files."""
        return self._chat.n_files

    @property
    def file_paths(self):
        """The file paths."""
        return self._chat.file_paths

    def filter(self, *, participants=None, files=None):
        """Filter the data by participants and/or files.

        Args:
            participants (str, optional): Regex pattern to match
                participant codes.
            files (str, optional): Glob pattern to match file paths.

        Returns:
            CHAT
        """
        filtered = self._chat.filter(participants=participants, files=files)
        return CHAT(filtered)

    def append(self, other):
        """Append another CHAT object's data.

        Args:
            other (CHAT): The object whose wrapped reader is appended to
                this one's.
        """
        self._chat.append(other._chat)

    def extend(self, others):
        """Extend with data from multiple CHAT objects.

        Args:
            others (Iterable[CHAT]): The objects whose wrapped readers are
                added to this one's.
        """
        self._chat.extend([o._chat for o in others])

    def info(self, verbose=False):
        """Print summary information.

        Args:
            verbose (bool, optional): Passed through to the wrapped reader.
        """
        self._chat.info(verbose=verbose)

    def headers(self):
        """Return the headers."""
        return self._chat.headers()

    def participants(self, *, by_file=False):
        """Return the participants.

        Args:
            by_file (bool, optional): Passed through to the wrapped reader.
        """
        return self._chat.participants(by_file=by_file)

    def ages(self):
        """Return the ages."""
        return self._chat.ages()

    def head(self, n=5):
        """Return the first n utterances with a formatted display."""
        return self._chat.head(n=n)

    def tail(self, n=5):
        """Return the last n utterances with a formatted display."""
        return self._chat.tail(n=n)

    def languages(self, *, by_file=False):
        """Return the languages.

        Args:
            by_file (bool, optional): Passed through to the wrapped reader.
        """
        return self._chat.languages(by_file=by_file)
@functools.lru_cache(maxsize=1)
def hkcancor() -> CHAT:
    """Load the Hong Kong Cantonese Corpus as a corpus object.

    The data is read from the ``data/hkcancor`` directory located next to
    this module. Because of the cache decorator, repeated calls return the
    same object.

    Returns:
        :class:`~pycantonese.CHAT`
    """
    package_dir = os.path.dirname(__file__)
    corpus_dir = os.path.join(package_dir, "data", "hkcancor")
    use_parallel = not _IS_WASM
    return CHAT(_RustChat.from_dir(corpus_dir, parallel=use_parallel))
@functools.lru_cache(maxsize=1)
def cantomap() -> CHAT:
    """Load the CantoMap corpus as a corpus object.

    The data is read from the ``data/cantomap/extracted`` directory located
    next to this module. Because of the cache decorator, repeated calls
    return the same object.

    Returns:
        :class:`~pycantonese.CHAT`
    """
    package_dir = os.path.dirname(__file__)
    corpus_dir = os.path.join(package_dir, "data", "cantomap", "extracted")
    use_parallel = not _IS_WASM
    return CHAT(_RustChat.from_dir(corpus_dir, parallel=use_parallel))
def _normalize_filter( value: str | Sequence[str] | None, ) -> str | None: """Convert a filter value to a single regex pattern string.""" if value is None: return None if isinstance(value, str): return value return "|".join(re.escape(v) for v in value)
def read_chat(
    path: str | os.PathLike[str],
    *,
    filter_files: str | Sequence[str] | None = None,
    filter_participants: str | Sequence[str] | None = None,
    strict: bool = True,
) -> CHAT:
    """Read Cantonese CHAT data files.

    Args:
        path (str or os.PathLike[str]): A path that points to one of the
            following:

            - A local ``.zip`` file path.
            - A local directory, for files under this directory
              recursively.
            - A single ``.cha`` CHAT file.

        filter_files (str or Sequence[str], optional): Filename(s) to keep.
            Regular expression matching is supported. If ``None``, all
            files are included.
        filter_participants (str or Sequence[str], optional): Participant
            code(s) to keep. Regular expression matching is supported.
            If ``None``, all participants are included.
        strict (bool, optional): If ``True``, enforce strict parsing of the
            CHAT data.

    Returns:
        :class:`~pycantonese.CHAT`
    """
    path = os.fspath(path)
    # The same two options go to whichever loader is selected below.
    load_options = {"parallel": not _IS_WASM, "strict": strict}

    if path.endswith(".zip"):
        chat = _RustChat.from_zip(path, **load_options)
    elif os.path.isdir(path):
        chat = _RustChat.from_dir(path, **load_options)
    else:
        # Anything else is treated as a single CHAT file.
        chat = _RustChat.from_files([path], **load_options)

    files_pattern = _normalize_filter(filter_files)
    participants_pattern = _normalize_filter(filter_participants)
    no_filter_requested = files_pattern is None and participants_pattern is None
    if not no_filter_requested:
        chat = chat.filter(files=files_pattern, participants=participants_pattern)

    return CHAT(chat)