# Source code for pycantonese.pos_tagging.tagger

import functools
import os

from rustling.perceptron_pos_tagger import AveragedPerceptron

from pycantonese._punctuation_marks import _PUNCTUATION_MARKS
from pycantonese.pos_tagging.hkcancor_to_ud import hkcancor_to_ud

# Absolute path of the directory that contains this module.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Model file loaded by the tagger; it is located next to this module.
_MODEL_PATH = os.path.join(_THIS_DIR, "tagger.fb.zst")


class _POSTagger(AveragedPerceptron):
    """Averaged-perceptron part-of-speech tagger for Cantonese.

    Subclasses ``rustling.perceptron_pos_tagger.AveragedPerceptron`` and
    layers Cantonese-specific behavior on top of it, namely the handling
    of Chinese full-width punctuation marks.
    """

    def __new__(
        cls,
        *,
        frequency_threshold=10,
        ambiguity_threshold=0.95,
        n_iter=5,
        random_seed=None,
    ):
        # All hyperparameters are handed to the base class constructor
        # unchanged; see ``__init__`` for what each of them means.
        hyperparameters = {
            "frequency_threshold": frequency_threshold,
            "ambiguity_threshold": ambiguity_threshold,
            "n_iter": n_iter,
            "random_seed": random_seed,
        }
        return super().__new__(cls, **hyperparameters)

    def __init__(
        self,
        *,
        frequency_threshold=10,
        ambiguity_threshold=0.95,
        n_iter=5,
        random_seed=None,
    ):
        """Set up a part-of-speech tagger.

        The hyperparameters below are forwarded to the base class in
        ``__new__``; this method itself only prepares the punctuation
        handling.

        Args:
            frequency_threshold (int, optional): Many words are almost
                unambiguously tied to a single tag. A word whose frequency
                of occurrence in the training data is above this threshold
                is directly associated with its tag in the model.
            ambiguity_threshold (float, optional): Many words are almost
                unambiguously tied to a single tag. A word is directly
                associated with a tag in the model if the ratio
                (# of occurrences of this word with this tag) /
                (# of occurrences of this word) in the training data is
                equal to or greater than this threshold.
            n_iter (int, optional): How many times training iterates
                through the data. The data is randomly shuffled at each
                new iteration.
            random_seed (int | None, optional): Seed for reproducible
                training. If None, a random seed is used.
        """
        # The Chinese full-width punctuation marks are absent from HKCanCor,
        # so they are tagged here rather than by the rustling tagger.
        # Each punctuation mark is mapped to itself as its tag.
        identity_tags = {}
        for mark in _PUNCTUATION_MARKS:
            identity_tags[mark] = mark
        self._punctuation_tags = identity_tags

    def predict(self, sequences):
        """Predict the tags for the sequences.

        The base class produces the tags first; afterwards, the tag of
        every word found in the punctuation mapping is overwritten with
        the tag stored there.

        Args:
            sequences (list[list[str]]): Segmented sentences, each one a
                list of words in Cantonese characters.

        Returns:
            list[list[str]]: One predicted tag sequence per input sentence.
        """
        predicted = super().predict(sequences)
        for sentence_tags, sentence in zip(predicted, sequences):
            for position, token in enumerate(sentence):
                if token not in self._punctuation_tags:
                    continue
                # Override whatever the model predicted for this mark.
                sentence_tags[position] = self._punctuation_tags[token]
        return predicted


@functools.lru_cache(maxsize=1)
def _get_tagger():
    """Return a tagger with the model at ``_MODEL_PATH`` loaded.

    The result is memoized by ``functools.lru_cache``, so the tagger is
    only constructed and loaded on the first call.
    """
    model = _POSTagger()
    model.load(_MODEL_PATH)
    return model



# [docs]
def pos_tag(words, tagset="universal"):
    """Tag the words for their parts of speech.

    The part-of-speech tagger uses an averaged perceptron model,
    and is trained on the HKCanCor data.

    .. versionadded:: 3.1.0

    Args:
        words (list[str]): A segmented sentence or phrase, where each word
            is a string of Cantonese characters.
        tagset (str, {"universal", "hkcancor"}): The part-of-speech tagset
            that the returned tags are in. Supported options:

            * ``"hkcancor"``, for the tagset used by the original HKCanCor
              data. There are over 100 tags, 46 of which are described at
              https://github.com/fcbond/hkcancor.
            * ``"universal"`` (default option), for the Universal Dependencies
              v2 tagset. There are 17 tags; see
              https://universaldependencies.org/u/pos/index.html.
              Internally, this option applies
              :func:`~pycantonese.pos_tagging.hkcancor_to_ud` to convert
              HKCanCor tags to UD tags.

    Returns:
        list[tuple[str, str]]: The segmented sentence/phrase where each word
        is paired with its predicted POS tag.

    Raises:
        TypeError: If the input is a string (e.g., an unsegmented string of
            Cantonese).
        ValueError: If the ``tagset`` argument is not one of the allowed
            options from ``{"universal", "hkcancor"}``. This is checked
            before any tagging work is done.

    Examples:
        >>> words = ['我', '噚日', '買', '嗰', '對', '鞋', '。']  # I bought that pair of shoes yesterday.
        >>> pos_tag(words)
        [('我', 'PRON'), ('噚日', 'ADV'), ('買', 'VERB'), ('嗰', 'PRON'), ('對', 'NOUN'), ('鞋', 'NOUN'), ('。', 'PUNCT')]
        >>> pos_tag(words, tagset="hkcancor")
        [('我', 'r'), ('噚日', 't'), ('買', 'v'), ('嗰', 'r'), ('對', 'q'), ('鞋', 'n'), ('。', '。')]
    """  # noqa: E501
    # A plain string would be iterated character by character, which is
    # almost certainly an unsegmented sentence passed in by mistake.
    if isinstance(words, str):
        raise TypeError(
            f"Input must be a list of segmented words, not a string: {words}"
        )
    # Validate the tagset up front so that an invalid option fails fast,
    # before the model is loaded and the prediction is run.
    if tagset not in ("universal", "hkcancor"):
        raise ValueError(f"tagset must be one of {{'universal', 'hkcancor'}}: {tagset}")
    # The tagger works on a batch of sentences; wrap the single sentence
    # in a list and unwrap the single result.
    tags = _get_tagger().predict([words])[0]
    if tagset == "universal":
        # The model predicts HKCanCor tags; convert them to UD tags.
        tags = [hkcancor_to_ud(tag) for tag in tags]
    return list(zip(words, tags))