Source code for pycantonese.pos_tagging.tagger

import functools
import os

from rustling.perceptron_pos_tagger import AveragedPerceptron

from pycantonese._punctuation_marks import _PUNCTUATION_MARKS
from pycantonese.pos_tagging.hkcancor_to_ud import hkcancor_to_ud

# Absolute path of the directory that contains this module.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Location of the "tagger.fb.zst" model file, which sits next to this module
# and is what `_get_tagger` passes to the tagger's `load` method.
_MODEL_PATH = os.path.join(_THIS_DIR, "tagger.fb.zst")


class _POSTagger(AveragedPerceptron):
    """Averaged-perceptron part-of-speech tagger tailored to Cantonese.

    Builds on ``rustling.perceptron_pos_tagger.AveragedPerceptron`` and
    layers Cantonese-specific behavior on top of it, namely the handling
    of Chinese full-width punctuation marks.
    """

    def __new__(
        cls,
        *,
        frequency_threshold=10,
        ambiguity_threshold=0.95,
        n_iter=5,
        random_seed=None,
    ):
        # Object creation is delegated to the parent class, which receives
        # every hyperparameter unchanged.
        hyperparameters = {
            "frequency_threshold": frequency_threshold,
            "ambiguity_threshold": ambiguity_threshold,
            "n_iter": n_iter,
            "random_seed": random_seed,
        }
        return super().__new__(cls, **hyperparameters)

    def __init__(
        self,
        *,
        frequency_threshold=10,
        ambiguity_threshold=0.95,
        n_iter=5,
        random_seed=None,
    ):
        """Set up a part-of-speech tagger.

        The keyword arguments mirror those of ``__new__``, which forwards
        them to the parent class; this method itself only prepares the
        punctuation lookup table.

        Args:
            frequency_threshold (int, optional): Many words are almost
                unambiguously tied to a single tag. When such a word occurs
                in the training data more often than this threshold, it is
                directly associated with its tag in the model.
            ambiguity_threshold (float, optional): Many words are almost
                unambiguously tied to a single tag. When the ratio
                (# of occurrences of the word with the tag) /
                (# of occurrences of the word) in the training data is
                equal to or greater than this threshold, the word is
                directly associated with the tag in the model.
            n_iter (int, optional): How many passes the training phase makes
                over the data. The data is randomly shuffled at each new
                pass.
            random_seed (int | None, optional): Seed for reproducible
                training. With None, a random seed is used.
        """
        # The Chinese full-width punctuation marks are absent from HKCanCor,
        # so the rustling tagger cannot handle them; each mark is simply
        # tagged as itself by this class instead.
        self._punctuation_tags = {mark: mark for mark in _PUNCTUATION_MARKS}

    def predict(self, sequences):
        """Predict a tag for every word of every sequence.

        Args:
            sequences (list[list[str]]): Segmented sentences, each one a
                list of words written in Cantonese characters.

        Returns:
            list[list[str]]: One list of predicted tags per input sentence.
        """
        predicted = super().predict(sequences)
        for sentence_tags, sentence in zip(predicted, sequences):
            for position, token in enumerate(sentence):
                if token not in self._punctuation_tags:
                    continue
                # Overwrite whatever the parent tagger guessed for a
                # punctuation mark with the mark's own dedicated tag.
                sentence_tags[position] = self._punctuation_tags[token]
        return predicted


@functools.lru_cache(maxsize=1)
def _get_tagger():
    """Return a ``_POSTagger`` with the model at ``_MODEL_PATH`` loaded.

    The result is memoized by ``functools.lru_cache``, so the tagger is
    constructed and the model file is read only on the first call; later
    calls get the same object back.
    """
    loaded_tagger = _POSTagger()
    loaded_tagger.load(_MODEL_PATH)
    return loaded_tagger


def pos_tag(words, tagset="universal"):
    """Tag the words for their parts of speech.

    The part-of-speech tagger uses an averaged perceptron model,
    and is trained by the HKCanCor data.

    .. versionadded:: 3.1.0

    Args:
        words (list[str]): A segmented sentence or phrase, where each word
            is a string of Cantonese characters. Any non-string iterable of
            words is accepted; it is copied into a list before tagging.
        tagset (str, {"universal", "hkcancor"}): The part-of-speech tagset
            that the returned tags are in. Supported options:

            * ``"hkcancor"``, for the tagset used by the original HKCanCor
              data. There are over 100 tags, 46 of which are described at
              https://github.com/fcbond/hkcancor.
            * ``"universal"`` (default option), for the Universal
              Dependencies v2 tagset. There are 17 tags; see
              https://universaldependencies.org/u/pos/index.html.
              Internally, this option applies
              :func:`~pycantonese.pos_tagging.hkcancor_to_ud` to convert
              HKCanCor tags to UD tags.

    Returns:
        list[tuple[str, str]]: The segmented sentence/phrase where each word
        is paired with its predicted POS tag. An empty input gives an
        empty list.

    Raises:
        TypeError: If the input is a string (e.g., an unsegmented string of
            Cantonese).
        ValueError: If the ``tagset`` argument is not one of the allowed
            options from ``{"universal", "hkcancor"}``.

    Examples:
        >>> words = ['我', '噚日', '買', '嗰', '對', '鞋', '。']  # I bought that pair of shoes yesterday.
        >>> pos_tag(words)
        [('我', 'PRON'), ('噚日', 'ADV'), ('買', 'VERB'), ('嗰', 'PRON'), ('對', 'NOUN'), ('鞋', 'NOUN'), ('。', 'PUNCT')]
        >>> pos_tag(words, tagset="hkcancor")
        [('我', 'r'), ('噚日', 't'), ('買', 'v'), ('嗰', 'r'), ('對', 'q'), ('鞋', 'n'), ('。', '。')]
    """  # noqa: E501
    if isinstance(words, str):
        raise TypeError(
            f"Input must be a list of segmented words, not a string: {words}"
        )
    # Reject a bad tagset before doing any work, so that a typo does not
    # cost a model load and a full prediction pass first.
    if tagset not in ("universal", "hkcancor"):
        raise ValueError(f"tagset must be one of {{'universal', 'hkcancor'}}: {tagset}")

    # Copy into a list: the words are needed twice (once for prediction and
    # once for pairing with the tags), which a one-shot iterable cannot do.
    words = list(words)
    if not words:
        # Nothing to tag; skip loading the model altogether.
        return []

    tags = _get_tagger().predict([words])[0]
    if tagset == "universal":
        tags = [hkcancor_to_ud(tag) for tag in tags]
    return list(zip(words, tags))