# Source code for pycantonese.pos_tagging.tagger

import functools
import os

from rustling.perceptron_pos_tagger import AveragedPerceptron

from pycantonese._punctuation_marks import _PUNCTUATION_MARKS
from pycantonese.pos_tagging.hkcancor_to_ud import hkcancor_to_ud

# Absolute path of the directory that contains this module.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Model file located next to this module; ``_get_tagger`` loads it.
_MODEL_PATH = os.path.join(_THIS_DIR, "tagger.fb.zst")


class _POSTagger:
    """Cantonese part-of-speech tagger built on an averaged perceptron.

    The statistical work is delegated to
    ``rustling.perceptron_pos_tagger.AveragedPerceptron``. On top of that,
    this wrapper adds Cantonese-specific behavior, namely the handling of
    Chinese full-width punctuation marks.
    """

    def __init__(
        self,
        *,
        frequency_threshold=10,
        ambiguity_threshold=0.95,
        n_iter=5,
        random_seed=None,
    ):
        """Set up the underlying perceptron and the punctuation lookup.

        Parameters
        ----------
        frequency_threshold : int, optional
            Many words are almost unambiguously associated with a single
            tag. A word whose frequency in the training data is above this
            threshold is directly associated with its tag in the model.
        ambiguity_threshold : float, optional
            Many words are almost unambiguously associated with a single
            tag. A word is directly associated with a tag in the model if
            (# of occurrences of the word with the tag) /
            (# of occurrences of the word) in the training data is equal to
            or greater than this threshold.
        n_iter : int, optional
            How many passes the training phase makes over the data.
            The data is randomly shuffled at each new pass.
        random_seed : int | None, optional
            Seed for reproducible training. With None, a random seed
            is used.
        """
        hyperparameters = {
            "frequency_threshold": frequency_threshold,
            "ambiguity_threshold": ambiguity_threshold,
            "n_iter": n_iter,
            "random_seed": random_seed,
        }
        self._tagger = AveragedPerceptron(**hyperparameters)
        # The Chinese full-width punctuation marks are absent from HKCanCor,
        # so the rustling tagger cannot learn them; each mark is simply
        # tagged as itself on this side.
        self._punctuation_tags = {mark: mark for mark in _PUNCTUATION_MARKS}

    def predict(self, sequences):
        """Tag every sentence in ``sequences``.

        Parameters
        ----------
        sequences : list[list[str]]
            Segmented sentences; each sentence is a list of words written
            in Cantonese characters.

        Returns
        -------
        list[list[str]]
            One tag sequence per input sentence.
        """
        predicted = self._tagger.predict(sequences)
        overrides = self._punctuation_tags
        for sentence_tags, sentence in zip(predicted, sequences):
            for position, token in enumerate(sentence):
                if token not in overrides:
                    continue
                # Known punctuation mark: overwrite the model's guess in place.
                sentence_tags[position] = overrides[token]
        return predicted

    def fit(self, sequences, tags):
        """Train the underlying model.

        Parameters
        ----------
        sequences : list[list[str]]
            Segmented sentences used as training data.
        tags : list[list[str]]
            Tag sequences aligned one-to-one with ``sequences``.
        """
        self._tagger.fit(sequences, tags)

    def save(self, path):
        """Write the model to a binary model file.

        Parameters
        ----------
        path : str
            Destination path of the model file.
        """
        self._tagger.save(path)

    def load(self, path):
        """Read a model from a binary model file.

        Parameters
        ----------
        path : str
            Location of the binary model file.
        """
        self._tagger.load(path)


@functools.lru_cache(maxsize=1)
def _get_tagger():
    """Return a ``_POSTagger`` with the model at ``_MODEL_PATH`` loaded.

    The result is cached, so the model file is read on the first call only
    and the same tagger instance is handed back afterwards.
    """
    pretrained = _POSTagger()
    pretrained.load(_MODEL_PATH)
    return pretrained


def pos_tag(words, tagset="universal"):
    """Tag the words for their parts of speech.

    The part-of-speech tagger uses an averaged perceptron model,
    and is trained by the HKCanCor data.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    words : list[str]
        A segmented sentence or phrase, where each word is a string of
        Cantonese characters.
    tagset : str, {"universal", "hkcancor"}
        The part-of-speech tagset that the returned tags are in.
        Supported options:

        * ``"hkcancor"``, for the tagset used by the original HKCanCor data.
          There are over 100 tags, 46 of which are described at
          https://github.com/fcbond/hkcancor.
        * ``"universal"`` (default option), for the Universal Dependencies v2
          tagset. There are 17 tags; see
          https://universaldependencies.org/u/pos/index.html.
          Internally, this option applies
          :func:`~pycantonese.pos_tagging.hkcancor_to_ud` to convert
          HKCanCor tags to UD tags.

    Returns
    -------
    list[tuple[str, str]]
        The segmented sentence/phrase where each word is paired with
        its predicted POS tag.

    Raises
    ------
    TypeError
        If the input is a string (e.g., an unsegmented string of Cantonese).
    ValueError
        If the ``tagset`` argument is not one of the allowed options from
        ``{"universal", "hkcancor"}``. This is checked before the model is
        loaded or run.

    Examples
    --------
    >>> words = ['我', '噚日', '買', '嗰', '對', '鞋', '。']  # I bought that pair of shoes yesterday.
    >>> pos_tag(words)
    [('我', 'PRON'), ('噚日', 'ADV'), ('買', 'VERB'), ('嗰', 'PRON'), ('對', 'NOUN'), ('鞋', 'NOUN'), ('。', 'PUNCT')]
    >>> pos_tag(words, tagset="hkcancor")
    [('我', 'r'), ('噚日', 't'), ('買', 'v'), ('嗰', 'r'), ('對', 'q'), ('鞋', 'n'), ('。', '。')]
    """  # noqa: E501
    # A bare string would be iterated character by character, so reject it.
    if isinstance(words, str):
        raise TypeError(
            f"Input must be a list of segmented words, not a string: {words}"
        )
    # Validate ``tagset`` up front so that a bad argument fails fast,
    # before the model is loaded and the input is tagged.
    if tagset not in ("universal", "hkcancor"):
        raise ValueError(f"tagset must be one of {{'universal', 'hkcancor'}}: {tagset}")

    # The tagger takes a batch of sentences: wrap the single sentence in a
    # list and take the first (only) tag sequence from the result.
    tags = _get_tagger().predict([words])[0]
    if tagset == "universal":
        tags = [hkcancor_to_ud(tag) for tag in tags]
    return list(zip(words, tags))