# Source code for pycantonese.pos_tagging.tagger

import functools
import os

from rustling.perceptron_pos_tagger import AveragedPerceptron

from pycantonese._punctuation_marks import _PUNCTUATION_MARKS
from pycantonese.pos_tagging.hkcancor_to_ud import hkcancor_to_ud

# Absolute path of the directory holding this module; the model file that
# `_get_tagger` loads is looked up next to it.
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
_MODEL_PATH = os.path.join(_THIS_DIR, "tagger.fb.zst")


class _POSTagger:
    """Cantonese part-of-speech tagger built on an averaged perceptron.

    The statistical model is delegated to
    ``rustling.perceptron_pos_tagger.AveragedPerceptron``. This wrapper adds
    the Cantonese-specific handling of Chinese full-width punctuation marks
    on top of it.
    """

    def __init__(
        self,
        *,
        frequency_threshold=10,
        ambiguity_threshold=0.95,
        n_iter=5,
        random_seed=None,
    ):
        """Set up an untrained tagger with the given hyperparameters.

        Parameters
        ----------
        frequency_threshold : int, optional
            Minimum frequency in the training data for a word to be
            considered for direct word-to-tag association. Many words are
            almost unambiguously tied to a single tag; those occurring more
            often than this threshold are mapped straight to that tag in
            the model.
        ambiguity_threshold : float, optional
            Minimum ratio of (# of occurrences of a word with a given tag)
            / (# of occurrences of the word) in the training data for the
            word to be directly associated with that tag in the model.
            A ratio equal to the threshold qualifies.
        n_iter : int, optional
            How many passes the training phase makes over the data.
            The data is randomly shuffled at each new pass.
        random_seed : int | None, optional
            Seed for reproducible training. With None, a random seed
            is used.
        """
        perceptron_options = {
            "frequency_threshold": frequency_threshold,
            "ambiguity_threshold": ambiguity_threshold,
            "n_iter": n_iter,
            "random_seed": random_seed,
        }
        self._tagger = AveragedPerceptron(**perceptron_options)
        # The Chinese full-width punctuation marks are absent from HKCanCor,
        # so the rustling tagger cannot deal with them; this wrapper tags
        # each mark as itself instead.
        self._punctuation_tags = {mark: mark for mark in _PUNCTUATION_MARKS}

    def predict(self, sequences):
        """Tag every sentence in ``sequences``.

        Parameters
        ----------
        sequences : list[list[str]]
            Segmented sentences, each one a list of words written in
            Cantonese characters.

        Returns
        -------
        list[list[str]]
            One tag sequence per input sentence. Words that are known
            punctuation marks get the tag stored for them in this wrapper,
            overriding whatever the underlying model predicted.
        """
        predicted = self._tagger.predict(sequences)
        overrides = self._punctuation_tags
        for sentence, sentence_tags in zip(sequences, predicted):
            # Positions holding a punctuation mark are rewritten in place.
            punctuation_hits = (
                (position, token)
                for position, token in enumerate(sentence)
                if token in overrides
            )
            for position, token in punctuation_hits:
                sentence_tags[position] = overrides[token]
        return predicted

    def fit(self, sequences, tags):
        """Train the underlying model on tagged sentences.

        Parameters
        ----------
        sequences : list[list[str]]
            Segmented sentences used as training data.
        tags : list[list[str]]
            Tag sequences aligned one-to-one with ``sequences``.
        """
        self._tagger.fit(sequences, tags)

    def save(self, path):
        """Write the model to a binary model file.

        Parameters
        ----------
        path : str
            Destination path of the model file.
        """
        self._tagger.save(path)

    def load(self, path):
        """Read a model from a binary model file.

        Parameters
        ----------
        path : str
            Location of the binary model file.
        """
        self._tagger.load(path)


@functools.lru_cache(maxsize=1)
def _get_tagger():
    """Return the shared tagger, loaded from the model file at ``_MODEL_PATH``.

    The result is memoized, so the model file is read on the first call and
    later calls reuse the same ``_POSTagger`` instance.
    """
    shared_tagger = _POSTagger()
    shared_tagger.load(_MODEL_PATH)
    return shared_tagger



# [docs] (documentation-link marker left over from the rendered source page)
def pos_tag(words, tagset="universal"):
    """Tag the words for their parts of speech.

    The part-of-speech tagger uses an averaged perceptron model,
    and is trained by the HKCanCor data.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    words : list[str]
        A segmented sentence or phrase, where each word is a string of
        Cantonese characters. Any non-string iterable of words is accepted;
        it is copied into a list before tagging.
    tagset : str, {"universal", "hkcancor"}
        The part-of-speech tagset that the returned tags are in.
        Supported options:

        * ``"hkcancor"``, for the tagset used by the original HKCanCor data.
          There are over 100 tags, 46 of which are described at
          https://github.com/fcbond/hkcancor.
        * ``"universal"`` (default option), for the Universal Dependencies v2
          tagset. There are 17 tags; see
          https://universaldependencies.org/u/pos/index.html.
          Internally, this option applies
          :func:`~pycantonese.pos_tagging.hkcancor_to_ud` to convert HKCanCor
          tags to UD tags.

    Returns
    -------
    list[tuple[str, str]]
        The segmented sentence/phrase where each word is paired with its
        predicted POS tag.

    Raises
    ------
    TypeError
        If the input is a string (e.g., an unsegmented string of Cantonese).
    ValueError
        If the ``tagset`` argument is not one of the allowed options from
        ``{"universal", "hkcancor"}``. This is checked before the model is
        loaded or run.

    Examples
    --------
    >>> words = ['我', '噚日', '買', '嗰', '對', '鞋', '。']  # I bought that pair of shoes yesterday.
    >>> pos_tag(words)
    [('我', 'PRON'), ('噚日', 'ADV'), ('買', 'VERB'), ('嗰', 'PRON'), ('對', 'NOUN'), ('鞋', 'NOUN'), ('。', 'PUNCT')]
    >>> pos_tag(words, tagset="hkcancor")
    [('我', 'r'), ('噚日', 't'), ('買', 'v'), ('嗰', 'r'), ('對', 'q'), ('鞋', 'n'), ('。', '。')]
    """  # noqa: E501
    if isinstance(words, str):
        raise TypeError(
            f"Input must be a list of segmented words, not a string: {words}"
        )
    # Validate the tagset up front so that a bad argument fails fast, rather
    # than after the model has been loaded and the prediction has been run.
    if tagset not in ("universal", "hkcancor"):
        raise ValueError(f"tagset must be one of {{'universal', 'hkcancor'}}: {tagset}")
    # The words are read twice (by the tagger, then when pairing them with
    # their tags). Copying them into a list keeps a one-shot iterable from
    # being exhausted by the first read, which would yield an empty result.
    words = list(words)
    tags = _get_tagger().predict([words])[0]
    if tagset == "universal":
        tags = [hkcancor_to_ud(tag) for tag in tags]
    return list(zip(words, tags))