from __future__ import annotations
import functools
import itertools
import os
import re
import sys
from collections.abc import Sequence
from typing import cast
from pycantonese._rust import Chat as _RustChat, Token, Utterance
from pycantonese.search import _perform_search
# True when the interpreter reports the "emscripten" platform (a WebAssembly
# build of Python). The corpus loaders in this module pass
# ``parallel=not _IS_WASM`` so that parallel parsing is switched off there.
_IS_WASM = sys.platform == "emscripten"
def _flatten(iterable):
"""Flatten one level of nesting."""
return list(itertools.chain.from_iterable(iterable))
[docs]
class CHAT:
    """A reader for Cantonese CHAT corpus data.

    This class wraps a Rust-backed CHAT parser and provides
    Cantonese-specific functionality such as Jyutping extraction,
    character-level access, and corpus search.

    Attribute lookups that are not defined on this class are forwarded to
    the wrapped parser object (see :meth:`__getattr__`).
    """

    def __init__(
        self,
        chat: _RustChat | None = None,
    ):
        """Wrap an existing parser object, or create an empty one.

        Args:
            chat (_RustChat, optional): The Rust-backed parser object to wrap.
                If None, a new ``_RustChat()`` is created.
        """
        if chat is None:
            chat = _RustChat()
        self._chat = chat

    @classmethod
    def from_zip(
        cls,
        path: str | os.PathLike[str],
        *,
        match: str | None = None,
        extension=".cha",
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from a ZIP file.

        Args:
            path (str or os.PathLike[str]): Path to the ZIP file.
            match (str, optional): Glob pattern to match filenames within the ZIP.
            extension (str, optional): File extension to match.
                Default is ``".cha"``.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to treat
                as the morphology tier, e.g. ``"%mor"`` or ``"%xmor"``. Default is
                ``"%mor"``. Set to None to disable mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to treat
                as the grammatical relation tier, e.g. ``"%gra"`` or ``"%xgra"``.
                Default is ``"%gra"``. Set to None to disable mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_zip(
                os.fspath(path),
                match=match,
                extension=extension,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_dir(
        cls,
        path: str | os.PathLike[str],
        *,
        match: str | None = None,
        extension=".cha",
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from a directory.

        Args:
            path (str or os.PathLike[str]): Path to the directory.
            match (str, optional): Glob pattern to match filenames within
                the directory.
            extension (str, optional): File extension to match.
                Default is ``".cha"``.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to treat
                as the morphology tier, e.g. ``"%mor"`` or ``"%xmor"``. Default is
                ``"%mor"``. Set to None to disable mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to treat
                as the grammatical relation tier, e.g. ``"%gra"`` or ``"%xgra"``.
                Default is ``"%gra"``. Set to None to disable mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_dir(
                os.fspath(path),
                match=match,
                extension=extension,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_files(
        cls,
        paths: Sequence[str | os.PathLike[str]],
        *,
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from file paths.

        Args:
            paths (Sequence[str | os.PathLike[str]]): Paths to CHAT files.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to treat
                as the morphology tier, e.g. ``"%mor"`` or ``"%xmor"``. Default is
                ``"%mor"``. Set to None to disable mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to treat
                as the grammatical relation tier, e.g. ``"%gra"`` or ``"%xgra"``.
                Default is ``"%gra"``. Set to None to disable mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_files(
                # The parser is handed plain path strings, not PathLike objects.
                [os.fspath(p) for p in paths],
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_strs(
        cls,
        strs,
        *,
        ids=None,
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from strings.

        Args:
            strs (list[str]): CHAT-formatted strings.
            ids (list[str], optional): Identifiers for each string.
            parallel (bool, optional): If True, parse strings in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to treat
                as the morphology tier, e.g. ``"%mor"`` or ``"%xmor"``. Default is
                ``"%mor"``. Set to None to disable mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to treat
                as the grammatical relation tier, e.g. ``"%gra"`` or ``"%xgra"``.
                Default is ``"%gra"``. Set to None to disable mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(
            _RustChat.from_strs(
                strs,
                ids=ids,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_git(
        cls,
        url: str,
        *,
        rev: str | None = None,
        depth: int | None = None,
        match: str | None = None,
        extension=".cha",
        cache_dir: str | os.PathLike[str] | None = None,
        force_download=False,
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from a Git repository.

        Args:
            url (str): URL of the Git repository.
            rev (str, optional): Git revision (branch, tag, or commit hash).
            depth (int, optional): Clone depth for shallow clones.
            match (str, optional): Glob pattern to match filenames within
                the repository.
            extension (str, optional): File extension to match.
                Default is ``".cha"``.
            cache_dir (str or os.PathLike[str], optional): Directory to cache
                the cloned repository.
            force_download (bool, optional): If True, force re-download
                even if cached.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to treat
                as the morphology tier, e.g. ``"%mor"`` or ``"%xmor"``. Default is
                ``"%mor"``. Set to None to disable mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to treat
                as the grammatical relation tier, e.g. ``"%gra"`` or ``"%xgra"``.
                Default is ``"%gra"``. Set to None to disable mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        # ``None`` is forwarded unchanged; only a real path is converted to str.
        cache_path = os.fspath(cache_dir) if cache_dir is not None else None
        return cls(
            _RustChat.from_git(
                url,
                rev=rev,
                depth=depth,
                match=match,
                extension=extension,
                cache_dir=cache_path,
                force_download=force_download,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_url(
        cls,
        url: str,
        *,
        match: str | None = None,
        extension=".cha",
        cache_dir: str | os.PathLike[str] | None = None,
        force_download=False,
        parallel=True,
        strict=True,
        mor_tier="%mor",
        gra_tier="%gra",
    ):
        """Read CHAT data from a URL pointing to a ZIP archive.

        Args:
            url (str): URL of the ZIP archive.
            match (str, optional): Glob pattern to match filenames within
                the archive.
            extension (str, optional): File extension to match.
                Default is ``".cha"``.
            cache_dir (str or os.PathLike[str], optional): Directory to cache
                the downloaded archive.
            force_download (bool, optional): If True, force re-download
                even if cached.
            parallel (bool, optional): If True, parse files in parallel.
            strict (bool, optional): If True, enforce strict parsing.
            mor_tier (str or None, optional): Name of the dependent tier to treat
                as the morphology tier, e.g. ``"%mor"`` or ``"%xmor"``. Default is
                ``"%mor"``. Set to None to disable mor+gra handling.
            gra_tier (str or None, optional): Name of the dependent tier to treat
                as the grammatical relation tier, e.g. ``"%gra"`` or ``"%xgra"``.
                Default is ``"%gra"``. Set to None to disable mor+gra handling.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        # ``None`` is forwarded unchanged; only a real path is converted to str.
        cache_path = os.fspath(cache_dir) if cache_dir is not None else None
        return cls(
            _RustChat.from_url(
                url,
                match=match,
                extension=extension,
                cache_dir=cache_path,
                force_download=force_download,
                parallel=parallel,
                strict=strict,
                mor_tier=mor_tier,
                gra_tier=gra_tier,
            )
        )

    @classmethod
    def from_utterances(cls, utterances):
        """Construct a CHAT reader from a list of utterances.

        Creates a new reader containing a single virtual file with the given
        utterances. Useful for splitting a reader into sub-readers based on
        utterance boundaries.

        Args:
            utterances (Sequence[Utterance]): Utterance objects to include.

        Returns:
            :class:`~pycantonese.CHAT`
        """
        return cls(_RustChat.from_utterances(utterances))

    def __getattr__(self, name):
        """Forward unknown attribute lookups to the wrapped parser object.

        Python calls this only after normal attribute lookup has failed.

        Args:
            name (str): The attribute being looked up.

        Returns:
            The attribute of the same name on the wrapped parser object.

        Raises:
            AttributeError: If the wrapped parser object has no such
                attribute, or if this instance has no ``_chat`` yet.
        """
        # If ``_chat`` itself is missing (an instance created through
        # ``__new__`` without ``__init__``, which is what ``copy`` and
        # ``pickle`` do before restoring state), reading ``self._chat`` below
        # would land back in this method and recurse until RecursionError.
        # Fail with the normal AttributeError instead so ``hasattr`` works.
        if name == "_chat":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self._chat, name)

    def tokens(
        self,
        *,
        by_utterance=False,
        by_file=False,
    ) -> list[Token] | list[list[Token]] | list[list[list[Token]]]:
        """Return the tokens.

        Args:
            by_utterance (bool, optional): If True, return tokens grouped
                by utterance.
            by_file (bool, optional): If True, return tokens grouped by file.

        Returns:
            list
        """
        return self._chat.tokens(
            by_utterance=by_utterance,
            by_file=by_file,
        )

    def words(
        self,
        *,
        by_utterance=False,
        by_file=False,
    ) -> list[str] | list[list[str]] | list[list[list[str]]]:
        """Return the words.

        Args:
            by_utterance (bool, optional): If True, return words grouped
                by utterance.
            by_file (bool, optional): If True, return words grouped by file.

        Returns:
            list
        """
        return self._chat.words(by_utterance=by_utterance, by_file=by_file)

    def jyutping(
        self,
        *,
        by_utterance=False,
        by_file=False,
    ) -> list[str | None] | list[list[str | None]] | list[list[list[str | None]]]:
        """Return the data in Jyutping romanization.

        Args:
            by_utterance (bool, optional): If True, return Jyutping grouped
                by utterance.
            by_file (bool, optional): If True, return Jyutping grouped by file.

        Returns:
            list
        """
        return self._chat.jyutping(by_utterance=by_utterance, by_file=by_file)

    @staticmethod
    def _get_chars_from_sent(sent: list[str]) -> list[str]:
        """Split the words of one utterance into individual characters.

        A word is split only when its first character lies in the range
        U+4E00..U+9FFF; every other word (including the empty string) is
        kept whole.

        Args:
            sent (list[str]): The words of one utterance.

        Returns:
            list[str]: Characters and unsplit words, in the original order.
        """
        result = []
        for word in sent:
            if word and "\u4e00" <= word[0] <= "\u9fff":
                result.extend(word)
            else:
                result.append(word)
        return result

    def characters(
        self,
        *,
        by_utterance=False,
        by_file=False,
    ) -> list[str] | list[list[str]] | list[list[list[str]]]:
        """Return the data in individual Chinese characters.

        Args:
            by_utterance (bool, optional): If True, return characters grouped
                by utterance.
            by_file (bool, optional): If True, return characters grouped
                by file.

        Returns:
            list
        """
        # Always start from the fully nested (file -> utterance -> word) shape
        # and flatten afterwards according to the requested grouping.
        sents = cast(
            list[list[list[str]]],
            self.words(by_utterance=True, by_file=True),
        )
        nested = [
            [self._get_chars_from_sent(sent) for sent in sents_for_file]
            for sents_for_file in sents
        ]
        if by_file and by_utterance:
            return nested
        if by_file:
            # One flat list of characters per file.
            return [_flatten(chars_for_file) for chars_for_file in nested]
        if by_utterance:
            # One list per utterance, with file boundaries removed.
            return _flatten(nested)
        return _flatten(_flatten(chars_for_file) for chars_for_file in nested)

    def word_ngrams(self, n: int):
        """Return word n-grams across all utterances.

        N-grams do not cross utterance boundaries.

        Args:
            n (int): The n-gram order (1 for unigrams, 2 for bigrams, etc.).

        Returns:
            Ngrams
        """
        return self._chat.word_ngrams(n)

    def search(
        self,
        *,
        onset=None,
        nucleus=None,
        coda=None,
        tone=None,
        initial=None,
        final=None,
        jyutping=None,
        character=None,
        pos=None,
        word_range=(0, 0),
        utterance_range=(0, 0),
        by_token=True,
        by_utterance=False,
        by_file=False,
    ):
        """Search the data for the given criteria.

        Args:
            onset (str, optional): Onset to search for. A regex is supported.
            nucleus (str, optional): Nucleus to search for.
                A regex is supported.
            coda (str, optional): Coda to search for. A regex is supported.
            tone (str, optional): Tone to search for. A regex is supported.
            initial (str, optional): Initial to search for.
                A regex is supported.
            final (str, optional): Final to search for.
            jyutping (str, optional): Jyutping romanization of one Cantonese
                character to search for.
            character (str, optional): One or more Cantonese characters to
                search for.
            pos (str, optional): A part-of-speech tag to search for.
                A regex is supported.
            word_range (tuple[int, int], optional): Span of words around
                a match. Default is ``(0, 0)``.
            utterance_range (tuple[int, int], optional): Span of utterances
                around a match. Default is ``(0, 0)``.
            by_token (bool, optional): If True, return Token objects.
                Otherwise return word strings.
            by_utterance (bool, optional): If True, return full utterances
                containing matches.
            by_file (bool, optional): If True, return data organized by file.

        Returns:
            list
        """
        # The search helper always receives tokens nested by file and by
        # utterance; the per-file grouping is removed afterwards if unwanted.
        tagged_sents = self.tokens(
            by_utterance=True,
            by_file=True,
        )
        result_by_files = _perform_search(
            tagged_sents,
            onset=onset,
            nucleus=nucleus,
            coda=coda,
            tone=tone,
            initial=initial,
            final=final,
            jyutping=jyutping,
            character=character,
            pos=pos,
            word_range=word_range,
            utterance_range=utterance_range,
            by_token=by_token,
            by_utterance=by_utterance,
        )
        if by_file:
            return result_by_files
        return _flatten(result_by_files)

    def utterances(self, *, by_file=False) -> list[Utterance] | list[list[Utterance]]:
        """Return the utterances.

        Args:
            by_file (bool, optional): If True, return utterances grouped
                by file.

        Returns:
            list[Utterance] | list[list[Utterance]]
        """
        return self._chat.utterances(by_file=by_file)

    def to_strs(self):
        """Return the data as CHAT-formatted strings.

        Returns:
            list[str]
        """
        return self._chat.to_strs()

    def to_files(self, dir_path: str | os.PathLike[str], *, filenames=None):
        """Write CHAT (.cha) files to a directory.

        Args:
            dir_path (str or os.PathLike[str]): Output directory path.
            filenames (list[str], optional): Filenames for each file.
        """
        self._chat.to_files(os.fspath(dir_path), filenames=filenames)

    @property
    def n_files(self):
        """The number of files."""
        return self._chat.n_files

    @property
    def file_paths(self):
        """The file paths."""
        return self._chat.file_paths

    def filter(self, *, participants=None, files=None):
        """Filter the data by participants and/or files.

        Args:
            participants (str, optional): Regex pattern to match participant
                codes.
            files (str, optional): Glob pattern to match file paths.

        Returns:
            CHAT
        """
        filtered = self._chat.filter(participants=participants, files=files)
        return CHAT(filtered)

    def append(self, other):
        """Append another CHAT object's data.

        Args:
            other (CHAT): The reader whose wrapped parser object is appended
                to this one's.
        """
        self._chat.append(other._chat)

    def extend(self, others):
        """Extend with data from multiple CHAT objects.

        Args:
            others (Iterable[CHAT]): The readers whose wrapped parser objects
                are added to this one's.
        """
        self._chat.extend([o._chat for o in others])

    def info(self, verbose=False):
        """Print summary information.

        Args:
            verbose (bool, optional): Forwarded to the wrapped parser's
                ``info``. Default is False.
        """
        self._chat.info(verbose=verbose)

    def participants(self, *, by_file=False):
        """Return the participants.

        Args:
            by_file (bool, optional): If True, return participants grouped
                by file.
        """
        return self._chat.participants(by_file=by_file)

    def ages(self):
        """Return the ages."""
        return self._chat.ages()

    def head(self, n=5):
        """Return the first n utterances with a formatted display.

        Args:
            n (int, optional): Number of utterances. Default is 5.
        """
        return self._chat.head(n=n)

    def tail(self, n=5):
        """Return the last n utterances with a formatted display.

        Args:
            n (int, optional): Number of utterances. Default is 5.
        """
        return self._chat.tail(n=n)

    def languages(self, *, by_file=False):
        """Return the languages.

        Args:
            by_file (bool, optional): If True, return languages grouped
                by file.
        """
        return self._chat.languages(by_file=by_file)
[docs]
@functools.lru_cache(maxsize=1)
def hkcancor() -> CHAT:
    """Load the Hong Kong Cantonese Corpus bundled next to this module.

    The data is read from the ``data/hkcancor`` directory beside this file.
    Because of the ``lru_cache`` decorator, the reader is built on the first
    call and the same object is returned on later calls.

    Returns:
        :class:`~pycantonese.CHAT`
    """
    package_dir = os.path.dirname(__file__)
    corpus_dir = os.path.join(package_dir, "data", "hkcancor")
    return CHAT(_RustChat.from_dir(corpus_dir, parallel=not _IS_WASM))
[docs]
@functools.lru_cache(maxsize=1)
def cantomap() -> CHAT:
    """Load the CantoMap corpus bundled next to this module.

    The data is read from the ``data/cantomap/extracted`` directory beside
    this file. Because of the ``lru_cache`` decorator, the reader is built on
    the first call and the same object is returned on later calls.

    Returns:
        :class:`~pycantonese.CHAT`
    """
    package_dir = os.path.dirname(__file__)
    corpus_dir = os.path.join(package_dir, "data", "cantomap", "extracted")
    return CHAT(_RustChat.from_dir(corpus_dir, parallel=not _IS_WASM))
def _normalize_filter(
value: str | Sequence[str] | None,
) -> str | None:
"""Convert a filter value to a single regex pattern string."""
if value is None:
return None
if isinstance(value, str):
return value
return "|".join(re.escape(v) for v in value)
[docs]
def read_chat(
    path: str | os.PathLike[str],
    *,
    filter_files: str | Sequence[str] | None = None,
    filter_participants: str | Sequence[str] | None = None,
    strict: bool = True,
) -> CHAT:
    """Read Cantonese CHAT data files.

    Args:
        path (str or os.PathLike[str]): A path that points to one of the
            following:

            - A local file whose name ends in ``.zip``.
            - A local directory.
            - Anything else is read as a single CHAT file.
        filter_files (str or Sequence[str], optional): Filename(s) to keep.
            A single string is used as the pattern as given; a sequence of
            strings is combined into one ``|`` alternation with each item
            escaped so that it matches literally.
            If ``None``, no file filtering is applied.
        filter_participants (str or Sequence[str], optional): Participant
            code(s) to keep, handled the same way as ``filter_files``.
            If ``None``, no participant filtering is applied.
        strict (bool, optional): If ``True``, enforce strict parsing of the
            CHAT data.

    Returns:
        :class:`~pycantonese.CHAT`
    """
    path = os.fspath(path)
    parse_options = {"parallel": not _IS_WASM, "strict": strict}

    # Pick the loader from the shape of the path: ZIP suffix first, then an
    # existing directory, otherwise treat it as one file.
    if path.endswith(".zip"):
        chat = _RustChat.from_zip(path, **parse_options)
    elif os.path.isdir(path):
        chat = _RustChat.from_dir(path, **parse_options)
    else:
        chat = _RustChat.from_files([path], **parse_options)

    files_pattern = _normalize_filter(filter_files)
    participants_pattern = _normalize_filter(filter_participants)
    no_filtering = files_pattern is None and participants_pattern is None
    if not no_filtering:
        chat = chat.filter(files=files_pattern, participants=participants_pattern)
    return CHAT(chat)