Source code for finalfusion.vocab.subword

"""
Finalfusion Subword Vocabularies
"""

import struct
from abc import abstractmethod
from os import PathLike
from typing import List, Optional, Tuple, Any, Union, Dict, BinaryIO, cast

from finalfusion.io import ChunkIdentifier, find_chunk, _write_binary, _read_required_binary
from finalfusion.subword import ExplicitIndexer, FastTextIndexer, FinalfusionHashIndexer, ngrams
from finalfusion.vocab.vocab import Vocab, _validate_items_and_create_index, \
    _calculate_binary_list_size, _write_words_binary, _read_items, _read_items_with_indices


class SubwordVocab(Vocab):
    """
    Interface for vocabularies with subword lookups.
    """
    def idx(self, item: str, default: Optional[Union[List[int], int]] = None
            ) -> Optional[Union[List[int], int]]:
        idx = self.word_index.get(item)
        if idx is not None:
            return idx
        subwords = cast(List[int], self.subword_indices(item))
        if subwords:
            return subwords
        return default

    @property
    def upper_bound(self) -> int:
        return len(self) + self.subword_indexer.upper_bound

    @property
    def min_n(self) -> int:
        """
        Get the lower bound of the range of extracted n-grams.

        Returns
        -------
        min_n : int
            lower bound of n-gram range.
        """
        return self.subword_indexer.min_n

    @property
    def max_n(self) -> int:
        """
        Get the upper bound of the range of extracted n-grams.

        Returns
        -------
        max_n : int
            upper bound of n-gram range.
        """
        return self.subword_indexer.max_n

    @property
    @abstractmethod
    def subword_indexer(
            self
    ) -> Union[ExplicitIndexer, FinalfusionHashIndexer, FastTextIndexer]:
        """
        Get this vocab's subword indexer.

        The subword indexer produces indices for n-grams.

        In case of bucket vocabularies, this is a hash-based indexer
        (:class:`.FinalfusionHashIndexer`, :class:`.FastTextIndexer`). For
        explicit subword vocabularies, this is an :class:`.ExplicitIndexer`.

        Returns
        -------
        subword_indexer : ExplicitIndexer, FinalfusionHashIndexer, FastTextIndexer
            The subword indexer of the vocabulary.
        """

    def subwords(self, item: str, bracket: bool = True) -> List[str]:
        """
        Get the n-grams of the given item as a list.

        The n-gram range is determined by the `min_n` and `max_n` values.

        Parameters
        ----------
        item : str
            The query item to extract n-grams from.
        bracket : bool
            Toggles bracketing the item with '<' and '>' before extraction.

        Returns
        -------
        ngrams : List[str]
            List of n-grams.
        """
        return ngrams(item, self.min_n, self.max_n, bracket)

    def subword_indices(self,
                        item: str,
                        bracket: bool = True,
                        with_ngrams: bool = False
                        ) -> List[Union[int, Tuple[str, int]]]:
        """
        Get the subword indices for the given item.

        This list does not contain the index for known items.

        Parameters
        ----------
        item : str
            The query item.
        bracket : bool
            Toggles bracketing the item with '<' and '>' before extraction.
        with_ngrams : bool
            Toggles returning ngrams together with indices.

        Returns
        -------
        indices : List[Union[int, Tuple[str, int]]]
            The list of subword indices.
        """
        return self.subword_indexer.subword_indices(item,
                                                    offset=len(self.words),
                                                    bracket=bracket,
                                                    with_ngrams=with_ngrams)

    def __getitem__(self, item: str) -> Union[int, List[int]]:
        idx = self.word_index.get(item)
        if idx is not None:
            return idx
        subwords = cast(List[int], self.subword_indices(item))
        if subwords:
            return subwords
        raise KeyError(f"No indices found for {item}")

    def __repr__(self) -> str:
        return f"{type(self).__name__}(\n" \
               f"\tn_words={len(self)},\n" \
               f"\tupper_bound={self.upper_bound},\n" \
               f"\tindexer={self.subword_indexer}\n" \
               ")"

    def __eq__(self, other: Any) -> bool:
        return isinstance(other, type(self)) and \
               self.subword_indexer == other.subword_indexer and \
               super(SubwordVocab, self).__eq__(other)
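# Usage sketch (illustrative, not part of the original module): SubwordVocab
# lookups fall back to subword indices. Known words map to a single index;
# unknown words map to the indices of their n-grams, offset by the number of
# known words. The word list below is hypothetical.
#
#     vocab = FinalfusionBucketVocab(["hello", "world"])
#     vocab["hello"]            # -> 0, index of a known word
#     vocab["unknown"]          # -> list of subword indices >= len(vocab)
#     vocab.idx("unknown", [])  # -> same list; [] only if no n-grams exist
#     vocab.subwords("hi")      # -> n-grams of "<hi>" within the range 3-6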
class FinalfusionBucketVocab(SubwordVocab):
    """
    Finalfusion Bucket Vocabulary.
    """
    def __init__(self,
                 words: List[str],
                 indexer: Optional[FinalfusionHashIndexer] = None):
        """
        Initialize a FinalfusionBucketVocab.

        Initializes the vocabulary with the given words.

        If no indexer is passed, a FinalfusionHashIndexer with bucket exponent
        21 is used.

        The word list cannot contain duplicate entries.

        Parameters
        ----------
        words : List[str]
            List of unique words.
        indexer : FinalfusionHashIndexer, optional
            Subword indexer to use for the vocabulary. Defaults to an indexer
            with 2^21 buckets and n-gram range 3-6.

        Raises
        ------
        AssertionError
            If the indexer is not a FinalfusionHashIndexer or ``words``
            contains duplicate entries.
        """
        if indexer is None:
            indexer = FinalfusionHashIndexer(21)
        assert isinstance(indexer, FinalfusionHashIndexer), \
            f"indexer needs to be FinalfusionHashIndexer, not {type(indexer)}"
        super().__init__()
        self._index = _validate_items_and_create_index(words)
        self._words = words
        self._indexer = indexer

    def to_explicit(self) -> 'ExplicitVocab':
        """
        Return an ExplicitVocab built from this vocab.

        This method iterates over the known words and extracts all n-grams
        within this vocab's bounds. Each n-gram is hashed and mapped to an
        index. This index is not necessarily unique per n-gram: if hashes
        collide, multiple n-grams are mapped to the same index.

        The returned vocab will be unable to produce indices for unknown
        n-grams.

        The indices of the converted vocabulary cover the range
        ``[0, explicit_vocab.upper_bound)``.

        Returns
        -------
        explicit_vocab : ExplicitVocab
            The converted vocabulary.
        """
        return _bucket_to_explicit(self)

    def write_chunk(self, file: BinaryIO):
        _write_bucket_vocab(file, self)

    @property
    def words(self) -> List[str]:
        return self._words

    @property
    def subword_indexer(self) -> FinalfusionHashIndexer:
        return self._indexer

    @property
    def word_index(self) -> Dict[str, int]:
        return self._index

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'FinalfusionBucketVocab':
        length, min_n, max_n, buckets = _read_required_binary(file, "<QIII")
        words = _read_items(file, length)
        indexer = FinalfusionHashIndexer(buckets, min_n, max_n)
        return FinalfusionBucketVocab(words, indexer)

    @staticmethod
    def chunk_identifier() -> ChunkIdentifier:
        return ChunkIdentifier.BucketSubwordVocab
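# Usage sketch (illustrative, not part of the original module): constructing a
# FinalfusionBucketVocab with a custom bucket exponent and converting it to an
# ExplicitVocab. The words and the exponent are arbitrary examples.
#
#     indexer = FinalfusionHashIndexer(20)   # 2^20 buckets, n-gram range 3-6
#     vocab = FinalfusionBucketVocab(["first", "second"], indexer)
#     explicit = vocab.to_explicit()
#     # The explicit vocab only stores the n-grams of "first" and "second";
#     # its upper_bound is at most len(vocab) + number of distinct n-grams.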
class FastTextVocab(SubwordVocab):
    """
    FastText vocabulary.
    """
    def __init__(self,
                 words: List[str],
                 indexer: Optional[FastTextIndexer] = None):
        """
        Initialize a FastTextVocab.

        Initializes the vocabulary with the given words.

        If no indexer is passed, a FastTextIndexer with 2_000_000 buckets is
        used.

        The word list cannot contain duplicate entries.

        Parameters
        ----------
        words : List[str]
            List of unique words.
        indexer : FastTextIndexer, optional
            Subword indexer to use for the vocabulary. Defaults to an indexer
            with 2_000_000 buckets and n-gram range 3-6.

        Raises
        ------
        AssertionError
            If the indexer is not a FastTextIndexer or ``words`` contains
            duplicate entries.
        """
        if indexer is None:
            indexer = FastTextIndexer(2000000)
        assert isinstance(indexer, FastTextIndexer)
        super().__init__()
        self._index = _validate_items_and_create_index(words)
        self._words = words
        self._indexer = indexer

    def to_explicit(self) -> 'ExplicitVocab':
        """
        Return an ExplicitVocab built from this vocab.

        This method iterates over the known words and extracts all n-grams
        within this vocab's bounds. Each n-gram is hashed and mapped to an
        index. This index is not necessarily unique per n-gram: if hashes
        collide, multiple n-grams are mapped to the same index.

        The returned vocab will be unable to produce indices for unknown
        n-grams.

        The indices of the converted vocabulary cover the range
        ``[0, explicit_vocab.upper_bound)``.

        Returns
        -------
        explicit_vocab : ExplicitVocab
            The converted vocabulary.
        """
        return _bucket_to_explicit(self)

    @property
    def subword_indexer(self) -> FastTextIndexer:
        return self._indexer

    @property
    def word_index(self) -> Dict[str, int]:
        return self._index

    @property
    def words(self) -> List[str]:
        return self._words

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'FastTextVocab':
        length, min_n, max_n, buckets = _read_required_binary(file, "<QIII")
        words = _read_items(file, length)
        indexer = FastTextIndexer(buckets, min_n, max_n)
        return FastTextVocab(words, indexer)

    def write_chunk(self, file: BinaryIO):
        _write_bucket_vocab(file, self)

    @staticmethod
    def chunk_identifier():
        return ChunkIdentifier.FastTextSubwordVocab
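# Usage sketch (illustrative, not part of the original module): with the
# default indexer, a FastTextVocab hashes n-grams into 2_000_000 buckets using
# the default n-gram range of 3-6. The words are hypothetical.
#
#     ft_vocab = FastTextVocab(["some", "words"])
#     ft_vocab.upper_bound                 # len(ft_vocab) + 2_000_000
#     ft_vocab.subword_indices("words")    # bucket indices, offset by len(ft_vocab)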
class ExplicitVocab(SubwordVocab):
    """
    A vocabulary with explicitly stored n-grams.
    """
    def __init__(self, words: List[str], indexer: ExplicitIndexer):
        """
        Initialize an ExplicitVocab.

        Initializes the vocabulary with the given words and ExplicitIndexer.

        The word list cannot contain duplicate entries.

        Parameters
        ----------
        words : List[str]
            List of unique words.
        indexer : ExplicitIndexer
            Subword indexer to use for the vocabulary.

        Raises
        ------
        AssertionError
            If the indexer is not an ExplicitIndexer.

        See Also
        --------
        :class:`.ExplicitIndexer`
        """
        assert isinstance(indexer, ExplicitIndexer)
        super().__init__()
        self._index = _validate_items_and_create_index(words)
        self._words = words
        self._indexer = indexer

    @property
    def word_index(self) -> Dict[str, int]:
        return self._index

    @property
    def subword_indexer(self) -> ExplicitIndexer:
        return self._indexer

    @property
    def words(self) -> List[str]:
        return self._words

    @staticmethod
    def chunk_identifier() -> ChunkIdentifier:
        return ChunkIdentifier.ExplicitSubwordVocab

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'ExplicitVocab':
        length, ngram_length, min_n, max_n = _read_required_binary(
            file, "<QQII")
        words = _read_items(file, length)
        ngram_list, ngram_index = _read_items_with_indices(file, ngram_length)
        indexer = ExplicitIndexer(ngram_list, min_n, max_n, ngram_index)
        return ExplicitVocab(words, indexer)

    def write_chunk(self, file) -> None:
        chunk_length = _calculate_binary_list_size(self.words)
        chunk_length += _calculate_binary_list_size(
            self.subword_indexer.ngrams)
        min_n_max_n_size = struct.calcsize("<II")
        chunk_length += min_n_max_n_size
        chunk_header = (int(self.chunk_identifier()), chunk_length,
                        len(self.words), len(self.subword_indexer.ngrams),
                        self.min_n, self.max_n)
        _write_binary(file, "<IQQQII", *chunk_header)
        _write_words_binary((bytes(word, "utf-8") for word in self.words),
                            file)
        for ngram in self.subword_indexer.ngrams:
            b_ngram = ngram.encode("utf-8")
            _write_binary(file, "<I", len(b_ngram))
            file.write(b_ngram)
            _write_binary(file, "<Q", self.subword_indexer.ngram_index[ngram])
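# Usage sketch (illustrative, not part of the original module): in this module
# an ExplicitVocab is typically obtained by converting a bucket vocabulary,
# which enumerates the n-grams of the known words and stores their indices
# explicitly. The words are hypothetical.
#
#     bucket_vocab = FinalfusionBucketVocab(["explicit", "ngrams"])
#     explicit_vocab = bucket_vocab.to_explicit()
#     explicit_vocab.subword_indexer.ngrams        # the stored n-grams
#     explicit_vocab.subword_indices("explicit")   # indices of stored n-grams
#     explicit_vocab.subword_indices("unrelated")  # only stored n-grams resolve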
def load_finalfusion_bucket_vocab(file: Union[str, bytes, int, PathLike]
                                  ) -> FinalfusionBucketVocab:
    """
    Load a FinalfusionBucketVocab from the given finalfusion file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file containing a FinalfusionBucketVocab chunk.

    Returns
    -------
    vocab : FinalfusionBucketVocab
        Returns the first FinalfusionBucketVocab in the file.
    """
    with open(file, "rb") as inf:
        chunk = find_chunk(inf, [ChunkIdentifier.BucketSubwordVocab])
        if chunk is None:
            raise ValueError("File did not contain a FinalfusionBucketVocab")
        return FinalfusionBucketVocab.read_chunk(inf)


def load_fasttext_vocab(file: Union[str, bytes, int, PathLike]
                        ) -> FastTextVocab:
    """
    Load a FastTextVocab from the given finalfusion file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file containing a FastTextVocab chunk.

    Returns
    -------
    vocab : FastTextVocab
        Returns the first FastTextVocab in the file.
    """
    with open(file, "rb") as inf:
        chunk = find_chunk(inf, [ChunkIdentifier.FastTextSubwordVocab])
        if chunk is None:
            raise ValueError("File did not contain a FastTextVocab")
        return FastTextVocab.read_chunk(inf)


def load_explicit_vocab(file: Union[str, bytes, int, PathLike]
                        ) -> ExplicitVocab:
    """
    Load an ExplicitVocab from the given finalfusion file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file containing an ExplicitVocab chunk.

    Returns
    -------
    vocab : ExplicitVocab
        Returns the first ExplicitVocab in the file.
    """
    with open(file, "rb") as inf:
        chunk = find_chunk(inf, [ChunkIdentifier.ExplicitSubwordVocab])
        if chunk is None:
            raise ValueError("File did not contain an ExplicitVocab")
        return ExplicitVocab.read_chunk(inf)
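# Usage sketch (illustrative, not part of the original module): each loader
# returns the first matching vocabulary chunk in a finalfusion file and raises
# a ValueError if no such chunk exists. The path below is hypothetical.
#
#     vocab = load_finalfusion_bucket_vocab("embeddings.fifu")
#     len(vocab)          # number of known words
#     vocab.upper_bound   # known words + subword buckets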
def _bucket_to_explicit(vocab: Union[FinalfusionBucketVocab, FastTextVocab]
                        ) -> 'ExplicitVocab':
    ngram_index = dict()
    idx_index = dict()  # type: Dict[int, int]
    ngram_list = []
    for word in vocab.words:
        token_ngrams = vocab.subwords(word)
        for ngram in token_ngrams:
            if ngram not in ngram_index:
                ngram_list.append(ngram)
                idx = vocab.subword_indexer(ngram)
                if idx not in idx_index:
                    idx_index[idx] = len(idx_index)
                ngram_index[ngram] = idx_index[idx]
    indexer = ExplicitIndexer(ngram_list, vocab.min_n, vocab.max_n,
                              ngram_index)
    return ExplicitVocab(vocab.words, indexer)


def _write_bucket_vocab(file: BinaryIO,
                        vocab: Union[FastTextVocab, FinalfusionBucketVocab]):
    min_n_max_n_size = struct.calcsize("<II")
    buckets_size = struct.calcsize("<I")
    chunk_length = _calculate_binary_list_size(vocab.words)
    chunk_length += min_n_max_n_size
    chunk_length += buckets_size

    chunk_id = vocab.chunk_identifier()
    if chunk_id == ChunkIdentifier.FastTextSubwordVocab:
        buckets = vocab.subword_indexer.upper_bound
    else:
        buckets = cast(FinalfusionHashIndexer,
                       vocab.subword_indexer).buckets_exp

    chunk_header = (int(chunk_id), chunk_length, len(vocab.words),
                    vocab.min_n, vocab.max_n, buckets)
    _write_binary(file, "<IQQIII", *chunk_header)
    _write_words_binary((bytes(word, "utf-8") for word in vocab.words), file)


__all__ = [
    'SubwordVocab', 'FinalfusionBucketVocab', 'load_finalfusion_bucket_vocab',
    'FastTextVocab', 'load_fasttext_vocab', 'ExplicitVocab',
    'load_explicit_vocab'
]
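# Layout note (derived from _write_bucket_vocab above, added for illustration):
# bucket vocabulary chunks are written with the little-endian header "<IQQIII",
# i.e.
#
#     u32  chunk identifier
#     u64  chunk length (covers the word list plus the min_n, max_n and
#          buckets fields)
#     u64  number of words
#     u32  min_n
#     u32  max_n
#     u32  buckets (bucket exponent for FinalfusionBucketVocab,
#                   upper_bound of the indexer for FastTextVocab)
#
# followed by the length-prefixed UTF-8 words written by _write_words_binary.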