Source code for nefnir.wrapper

# -*- coding: utf-8 -*-
import pkgutil
import json

from typing import Optional, Tuple

from .nefnir import Nefnir as _Nefnir

class _NefnirPackage(_Nefnir):
    def __init__(self, rules, tagmap):
        # Load resources from package
        self.rules = rules
        self.tagmap = tagmap

        self.proper = {t for t in self.tagmap if t[0] == 'n' and t[-1] in {'m', 'ö', 's'}}
        self.unanalyzed = {t for t in self.tagmap if t[:2] == 'nx'} | {'x', 'e', 'as'}

_nefnir: Optional[_NefnirPackage] = None

def _get_nefnir() -> _NefnirPackage:
    if _nefnir is None:
    assert _nefnir is not None, "Failed initalization"
    return _nefnir

[docs]def init() -> None: """Read configuration files.""" global _nefnir rule_data = pkgutil.get_data(__package__, 'rules.json') tag_data = pkgutil.get_data(__package__, 'tags.json') if rule_data is not None and tag_data is not None: rules = json.loads(rule_data) tagmap = json.loads(tag_data) _nefnir = _NefnirPackage(rules, tagmap)
[docs]def lemmatize(form: str, tag: str) -> str: """ Lemmatize a word form given its part-of-speech tag. :param form: A word form. :param tag: The word form's part-of-speech tag. :return: The word form's lemma. """ return _get_nefnir().lemmatize(form, tag)
[docs]def recase(form: str, tag: str, lemma: str) -> str: """ Determine how to properly case a lemma given the word form and part of speech tag it was derived from. Nefnir transforms words into lowercase prior to lemmatization. Some words, such as proper nouns, abbreviations and foreign words therefore need to be re-capitalized or changed back into uppercase. :param form: A word form, cased as it was written. :param tag: The word form's part-of-speech tag. :param lemma: The word form's lemma, in lowercase. :return: A properly cased lemma. """ return _get_nefnir().recase(form, tag, lemma)
[docs]def lemmatize_line(line: str, separator: str = '\t') -> Tuple[Optional[str], Optional[str], Optional[str]]: """ Lemmatize a word form given its part-of-speech tag. :param line: A line with form and tag separated by seperator. :param separator: The token separator. :return: Tuple with form, tag, lemma (any can be None if data invalid). """ form = None tag = None lemma = None line = line.strip() if len(line) > 0: if separator in line: form, tag = line.split(separator) lemma = lemmatize(form, tag) return form, tag, lemma