Source code for finalfusion.vocab.simple_vocab

"""
Finalfusion SimpleVocab
"""
import struct
from os import PathLike
from typing import List, Optional, Union, BinaryIO, Dict

from finalfusion.io import ChunkIdentifier, find_chunk, _write_binary, _read_required_binary
from finalfusion.vocab.vocab import Vocab, _validate_items_and_create_index, _read_items,\
    _write_words_binary


[docs]class SimpleVocab(Vocab): """ Simple vocabulary. SimpleVocabs provide a simple string to index mapping and index to string mapping. """
[docs] def __init__(self, words: List[str]): """ Initialize a SimpleVocab. Initializes the vocabulary with the given words and optional index. If no index is given, the nth word in the `words` list is assigned index `n`. The word list cannot contain duplicate entries and it needs to be of same length as the index. Parameters ---------- words : List[str] List of unique words Raises ------ AssertionError if ``words`` contains duplicate entries. """ self._index = _validate_items_and_create_index(words) self._words = words
@property def words(self) -> List[str]: return self._words @property def word_index(self) -> Dict[str, int]: return self._index @property def upper_bound(self) -> int: return len(self.word_index)
[docs] def idx(self, item: str, default: Union[list, int, None] = None ) -> Optional[Union[list, int]]: return self.word_index.get(item, default)
[docs] @staticmethod def read_chunk(file: BinaryIO) -> 'SimpleVocab': length = _read_required_binary(file, "<Q")[0] words = _read_items(file, length) return SimpleVocab(words)
[docs] def write_chunk(self, file: BinaryIO): _write_binary(file, "<I", int(self.chunk_identifier())) b_word_len_sum = sum(len(bytes(word, "utf-8")) for word in self.words) n_words_size = struct.calcsize("<Q") word_lens_size = len(self.words) * struct.calcsize("<I") chunk_length = n_words_size + word_lens_size + b_word_len_sum _write_binary(file, "<QQ", chunk_length, len(self.words)) _write_words_binary((bytes(word, "utf-8") for word in self.words), file)
[docs] @staticmethod def chunk_identifier() -> ChunkIdentifier: return ChunkIdentifier.SimpleVocab
[docs]def load_simple_vocab(file: Union[str, bytes, int, PathLike]) -> SimpleVocab: """ Load a SimpleVocab from the given finalfusion file. Parameters ---------- file : str Path to file containing a SimpleVocab chunk. Returns ------- vocab : SimpleVocab Returns the first SimpleVocab in the file. """ with open(file, "rb") as inf: chunk = find_chunk(inf, [ChunkIdentifier.SimpleVocab]) if chunk is None: raise ValueError('File did not contain a SimpleVocab}') return SimpleVocab.read_chunk(inf)
__all__ = ['SimpleVocab', 'load_simple_vocab']