import functools
import os
from rustling.perceptron_pos_tagger import AveragedPerceptron
from pycantonese._punctuation_marks import _PUNCTUATION_MARKS
from pycantonese.pos_tagging.hkcancor_to_ud import hkcancor_to_ud
# Absolute path of the directory that contains this module.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Location of the tagger model file ("tagger.fb.zst") next to this module;
# ``_get_tagger`` passes this path to the tagger's ``load`` method.
_MODEL_PATH = os.path.join(_THIS_DIR, "tagger.fb.zst")
class _POSTagger(AveragedPerceptron):
    """Averaged-perceptron part-of-speech tagger tailored to Cantonese.

    Subclasses ``rustling.perceptron_pos_tagger.AveragedPerceptron`` and
    layers Cantonese-specific behavior on top of it, namely the handling of
    Chinese full-width punctuation marks.
    """

    def __new__(
        cls,
        *,
        frequency_threshold=10,
        ambiguity_threshold=0.95,
        n_iter=5,
        random_seed=None,
    ):
        """Create the instance, forwarding every hyperparameter to the base class.

        The keyword arguments have the meaning documented on ``__init__``.
        """
        hyperparameters = {
            "frequency_threshold": frequency_threshold,
            "ambiguity_threshold": ambiguity_threshold,
            "n_iter": n_iter,
            "random_seed": random_seed,
        }
        return super().__new__(cls, **hyperparameters)

    def __init__(
        self,
        *,
        frequency_threshold=10,
        ambiguity_threshold=0.95,
        n_iter=5,
        random_seed=None,
    ):
        """Initialize a part-of-speech tagger.

        The hyperparameters below are forwarded to the base class by
        ``__new__``; this method itself only builds the punctuation lookup.

        Args:
            frequency_threshold (int, optional): Many words are almost
                unambiguously associated with a single tag. When such a
                word occurs in the training data more often than this
                threshold, it is directly associated with its tag in the
                model.
            ambiguity_threshold (float, optional): Many words are almost
                unambiguously associated with a single tag. When the ratio
                (# of occurrences of the word with the tag) /
                (# of occurrences of the word) in the training data is
                equal to or greater than this threshold, the word is
                directly associated with the tag in the model.
            n_iter (int, optional): How many times the training phase
                iterates through the data. The data is randomly shuffled
                at each new iteration.
            random_seed (int | None, optional): Random seed for
                reproducible training. If None, a random seed is used.
        """
        # The Chinese full-width punctuation marks are absent from HKCanCor,
        # so the rustling tagger cannot handle them; each mark is simply
        # tagged as itself.
        self._punctuation_tags = dict(
            (mark, mark) for mark in _PUNCTUATION_MARKS
        )

    def predict(self, sequences):
        """Predict the tags for the sequences.

        Args:
            sequences (list[list[str]]): Segmented sentences, each one a
                list of words in Cantonese characters.

        Returns:
            list[list[str]]: One list of predicted tags per input sentence.
        """
        predictions = super().predict(sequences)
        # Overwrite, in place, whatever the base tagger predicted for a known
        # punctuation mark with that mark's own tag.
        for sentence, sentence_tags in zip(sequences, predictions):
            for position, token in enumerate(sentence):
                if token not in self._punctuation_tags:
                    continue
                sentence_tags[position] = self._punctuation_tags[token]
        return predictions
@functools.lru_cache(maxsize=1)
def _get_tagger():
    """Return a tagger with the model at ``_MODEL_PATH`` loaded.

    The result is memoized with ``functools.lru_cache``, so the tagger is
    built and its model loaded on the first call only; later calls get the
    same instance back.
    """
    pretrained = _POSTagger()
    pretrained.load(_MODEL_PATH)
    return pretrained
# Public part-of-speech tagging entry point.
def pos_tag(words, tagset="universal"):
    """Tag the words for their parts of speech.

    The part-of-speech tagger uses an averaged perceptron model,
    and is trained by the HKCanCor data.

    .. versionadded:: 3.1.0

    Args:
        words (list[str]): A segmented sentence or phrase, where each word
            is a string of Cantonese characters.
        tagset (str, {"universal", "hkcancor"}): The part-of-speech tagset
            that the returned tags are in. Supported options:

            * ``"hkcancor"``, for the tagset used by the original HKCanCor
              data. There are over 100 tags, 46 of which are described at
              https://github.com/fcbond/hkcancor.
            * ``"universal"`` (default option), for the Universal Dependencies
              v2 tagset. There are 17 tags; see
              https://universaldependencies.org/u/pos/index.html.
              Internally, this option applies
              :func:`~pycantonese.pos_tagging.hkcancor_to_ud` to convert
              HKCanCor tags to UD tags.

    Returns:
        list[tuple[str, str]]: The segmented sentence/phrase where each word
        is paired with its predicted POS tag.

    Raises:
        TypeError: If the input is a string (e.g., an unsegmented string of
            Cantonese).
        ValueError: If the ``tagset`` argument is not one of the allowed
            options from ``{"universal", "hkcancor"}``. This is checked
            before any tagging work is done.

    Examples:
        >>> words = ['我', '噚日', '買', '嗰', '對', '鞋', '。']  # I bought that pair of shoes yesterday.
        >>> pos_tag(words)
        [('我', 'PRON'), ('噚日', 'ADV'), ('買', 'VERB'), ('嗰', 'PRON'), ('對', 'NOUN'), ('鞋', 'NOUN'), ('。', 'PUNCT')]
        >>> pos_tag(words, tagset="hkcancor")
        [('我', 'r'), ('噚日', 't'), ('買', 'v'), ('嗰', 'r'), ('對', 'q'), ('鞋', 'n'), ('。', '。')]
    """  # noqa: E501
    if isinstance(words, str):
        raise TypeError(
            f"Input must be a list of segmented words, not a string: {words}"
        )
    # Validate the tagset before touching the tagger, so that a bad argument
    # fails fast instead of first loading the model and tagging the input.
    if tagset not in ("universal", "hkcancor"):
        raise ValueError(f"tagset must be one of {{'universal', 'hkcancor'}}: {tagset}")

    # The tagger works on batches of sentences: wrap the single sentence in a
    # one-element batch and unwrap the single result.
    tags = _get_tagger().predict([words])[0]
    if tagset == "universal":
        tags = [hkcancor_to_ud(tag) for tag in tags]
    return list(zip(words, tags))