Source code for finalfusion.io

"""
This module defines some common IO operations and types.

:class:`Chunk` is the building block of finalfusion embeddings, each component
is serialized as its own, non-overlapping, chunk in finalfusion files.

:class:`ChunkIdentifier` is a unique integer identifier for a :class:`Chunk`.

:class:`TypeId` is used to uniquely identify numerical types.

The :class:`Header` handles the preamble of finalfusion files.

:class:`FinalfusionFormatError` is raised upon reading from malformed finalfusion
files.
"""
import struct
import sys
from abc import ABC, abstractmethod
from enum import unique, IntEnum
from os import PathLike
from typing import Optional, Tuple, List, BinaryIO, Union, Any

import numpy as np

_MAGIC = b'FiFu'
VERSION = 0


class Chunk(ABC):
    """
    Basic building blocks of finalfusion files.

    Concrete chunks provide their identifier, a reader that consumes the
    chunk's content from a binary file, and a writer that emits the chunk to
    a binary file.
    """
    def write(self, file: Union[str, bytes, int, PathLike]):
        """
        Write the Chunk as a standalone finalfusion file.

        The output consists of a :class:`Header` listing only this chunk's
        identifier, followed by the chunk itself.

        Parameters
        ----------
        file: Union[str, bytes, int, PathLike]
            Output path

        Raises
        ------
        TypeError
            If the Chunk is a :class:`Header`. The check is done before the
            output is opened, so a rejected call neither creates nor
            truncates the target file.
        """
        chunk_id = self.chunk_identifier()
        # Validate first: opening with "wb" truncates an existing file, which
        # must not happen when the write is going to be refused anyway.
        if chunk_id == ChunkIdentifier.Header:
            raise TypeError("Cannot write header to file by itself")
        with open(file, "wb") as outf:
            Header([chunk_id]).write_chunk(outf)
            self.write_chunk(outf)

    @staticmethod
    @abstractmethod
    def chunk_identifier() -> 'ChunkIdentifier':
        """
        Get the ChunkIdentifier for this Chunk.

        Returns
        --------
        chunk_identifier : ChunkIdentifier
        """

    @staticmethod
    @abstractmethod
    def read_chunk(file: BinaryIO) -> 'Chunk':
        """
        Read the Chunk and return it.

        The file must be positioned before the contents of the :class:`Chunk`
        but after its header.

        Parameters
        -----------
        file : BinaryIO
            a finalfusion file containing the given Chunk

        Returns
        --------
        chunk: Chunk
            The chunk read from the file.
        """

    @abstractmethod
    def write_chunk(self, file: BinaryIO):
        """
        Write the Chunk to a file.

        Parameters
        ----------
        file : BinaryIO
            Output file for the Chunk
        """
class Header(Chunk):
    """
    Header Chunk

    Reads and writes the preamble of a finalfusion file: the magic bytes,
    the format version, the number of chunks and the chunk identifiers.
    """
    def __init__(self, chunk_ids: List['ChunkIdentifier']):
        self._chunk_ids = chunk_ids

    @property
    def chunk_ids(self) -> List['ChunkIdentifier']:
        """
        Get the chunk IDs from the header

        Returns
        -------
        chunk_ids : List[ChunkIdentifier]
            List of ChunkIdentifiers in the Header.
        """
        return self._chunk_ids

    @staticmethod
    def chunk_identifier() -> 'ChunkIdentifier':
        return ChunkIdentifier.Header

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'Header':
        # The preamble starts with the 4 magic bytes.
        magic = file.read(4)
        if magic != _MAGIC:
            invalid_magic = magic.decode('ascii', errors='ignore')
            raise FinalfusionFormatError(
                f'Magic should be b\'FiFu\', not: {invalid_magic}')

        # Next comes the format version as a little-endian unsigned 32-bit int.
        (version,) = _read_required_binary(file, "<I")
        if version != VERSION:
            raise FinalfusionFormatError(f'Unknown model version: {version}')

        # Finally the chunk count, followed by that many 32-bit identifiers.
        (n_chunks,) = _read_required_binary(file, "<I")
        id_format = f'<{n_chunks}I'
        chunk_ids = list(_read_required_binary(file, id_format))
        return Header(chunk_ids)

    def write_chunk(self, file: BinaryIO):
        file.write(_MAGIC)
        n_chunks = len(self.chunk_ids)
        # Version, chunk count and all identifiers go out in a single pack.
        preamble_format = f'<II{n_chunks}I'
        _write_binary(file, preamble_format, VERSION, n_chunks,
                      *self.chunk_ids)


def find_chunk(file: BinaryIO, chunks: List['ChunkIdentifier']
               ) -> Optional['ChunkIdentifier']:
    """
    Find a :class:`Chunk` in a file.

    Rewinds the file, consumes the :class:`Header` and then walks over the
    chunk headers until one with an identifier from `chunks` turns up. On
    success the file is left right behind that chunk's header, i.e. before
    its content, so :func:`Chunk.read_chunk` of the Chunk belonging to the
    returned :class:`ChunkIdentifier` can be invoked directly. Chunks that are
    not requested are skipped by seeking past their content.

    Parameters
    ----------
    file : BinaryIO
        finalfusion file
    chunks : List[ChunkIdentifier]
        List of Chunks to look for in the input file.

    Returns
    -------
    chunk_id : Optional[ChunkIdentifier]
        The first ChunkIdentifier found in the file. None if the end of the
        file is reached without encountering any of the requested chunks.
    """
    file.seek(0)
    Header.read_chunk(file)
    chunk_header = _read_chunk_header(file)
    while chunk_header is not None:
        chunk_id, chunk_size = chunk_header
        if chunk_id in chunks:
            return chunk_id
        # whence=1: skip the chunk content relative to the current position.
        file.seek(chunk_size, 1)
        chunk_header = _read_chunk_header(file)
    return None
@unique
class ChunkIdentifier(IntEnum):
    """
    Integer identifiers of the Chunk types known to finalfusion.

    ``@unique`` guarantees that no two chunk types share a value.
    """
    Header = 0
    SimpleVocab = 1
    NdArray = 2
    BucketSubwordVocab = 3
    QuantizedArray = 4
    Metadata = 5
    NdNorms = 6
    FastTextSubwordVocab = 7
    ExplicitSubwordVocab = 8
@unique
class TypeId(IntEnum):
    """
    Integer identifiers of the data types known to finalfusion.
    """
    u8 = 1
    f32 = 10
class FinalfusionFormatError(Exception):
    """
    Raised when the content of a finalfusion file does not match the
    expected format.
    """
def _pad_float32(pos: int) -> int:
    """
    Helper method to compute the padding up to the next multiple of the
    size of a 32-bit float.

    The result is always in ``1..float_size``: a position that is already a
    multiple of the float size yields a full ``float_size``, not 0.

    Parameters
    ----------
    pos : int
        Current offset

    Returns
    -------
    padding : int
        Required padding in bytes.
    """
    float_size = struct.calcsize('<f')
    # NOTE(review): an aligned position gets a full float of padding. This is
    # presumably what the on-disk layout expects -- confirm before changing.
    return float_size - (pos % float_size)


def _write_binary(file: BinaryIO, struct_fmt: str, *args):
    """
    Helper method to write binary data according to the format string.

    Parameters
    ----------
    file : BinaryIO
        Output file
    struct_fmt : str
        struct format string
    *args
        Values to pack according to `struct_fmt`.
    """
    file.write(struct.pack(struct_fmt, *args))


def _read_binary(file: BinaryIO,
                 struct_fmt: str) -> Optional[Tuple[Any, ...]]:
    """
    Helper method to read binary data from a file according to the format
    string.

    Parameters
    ----------
    file : BinaryIO
        Input file
    struct_fmt : str
        struct format string

    Returns
    -------
    data : tuple, optional
        Returns the unpacked data as a tuple. If **no** data could be read,
        None is returned

    Raises
    ------
    FinalfusionFormatError
        If data could only be read partially.
    """
    size = struct.calcsize(struct_fmt)
    buf = file.read(size)
    # An empty read means nothing was available; callers treat this as EOF.
    if not buf:
        return None
    if len(buf) != size:
        raise FinalfusionFormatError(f'Could not read {size} bytes from file')
    return struct.unpack(struct_fmt, buf)


def _read_required_binary(file: BinaryIO, struct_fmt: str) -> Tuple[Any, ...]:
    """
    Like :func:`_read_binary`, but treats missing data as an error.

    Parameters
    ----------
    file : BinaryIO
        Input file
    struct_fmt : str
        struct format string

    Returns
    -------
    data : tuple
        The unpacked data.

    Raises
    ------
    FinalfusionFormatError
        If no data or only part of the data could be read.
    """
    val = _read_binary(file, struct_fmt)
    if val is None:
        # Report the number of bytes that were expected, not the format
        # string itself.
        size = struct.calcsize(struct_fmt)
        raise FinalfusionFormatError(f'Could not read {size} bytes from file')
    return val


def _read_chunk_header(file: BinaryIO
                       ) -> Optional[Tuple['ChunkIdentifier', int]]:
    """
    Reads the chunk header.

    After successfully reading the header, a tuple containing
    :class:`.ChunkIdentifier` and an integer specifying the chunk size in
    bytes is returned.

    Parameters
    ----------
    file : BinaryIO
        a finalfusion file positioned before a chunk header.

    Returns
    -------
    chunk_header : Optional[(ChunkIdentifier, int)]
        None is returned iff the reader is at EOF.

    Raises
    ------
    FinalfusionFormatError
        If only part of the header could be read.
    ValueError
        If the identifier read from the file is not a member of
        :class:`.ChunkIdentifier`.
    """
    # Little-endian: 32-bit chunk identifier followed by 64-bit chunk size.
    val = _read_binary(file, "<IQ")
    if val is None:
        return None
    return ChunkIdentifier(val[0]), val[1]


def _read_required_chunk_header(file: BinaryIO
                                ) -> Tuple['ChunkIdentifier', int]:
    """
    Like :func:`_read_chunk_header`, but treats EOF as an error.

    Raises
    ------
    FinalfusionFormatError
        If the reader is at EOF or only part of the header could be read.
    """
    val = _read_chunk_header(file)
    if val is None:
        raise FinalfusionFormatError('could not read chunk header.')
    return val


def _serialize_array_as_le(file: BinaryIO, array: np.ndarray):
    """
    Write the raw array data to a file in little-endian byte order.

    Parameters
    ----------
    file : BinaryIO
        Output file
    array : np.ndarray
        Array to serialize.
    """
    native_is_le = sys.byteorder == "little"
    array_bo = array.dtype.byteorder
    # "<" is explicit little-endian, "|" means byte order is not applicable
    # and "=" is native order, which is only little-endian on LE hosts.
    array_is_le = array_bo in "<|" or (array_bo == "=" and native_is_le)
    if array_is_le:
        array.tofile(file)
    else:
        if array.ndim == 2:
            # byteswap(inplace=False) returns a swapped copy; going row by
            # row keeps that temporary copy to the size of a single row.
            for row in array:
                row.byteswap(inplace=False).tofile(file)
        else:
            array.byteswap(inplace=False).tofile(file)


def _read_array_as_native(file: BinaryIO, dtype: np.dtype,
                          count: int) -> np.ndarray:
    """
    Read `count` items of `dtype` from a file and return them as an array.

    On big-endian hosts the bytes of the array are swapped in place after
    reading.

    NOTE(review): this presumably expects little-endian data in the file and
    a `dtype` in native byte order -- confirm against the callers.

    Parameters
    ----------
    file : BinaryIO
        Input file
    dtype : np.dtype
        Data type of the items to read.
    count : int
        Number of items to read.

    Returns
    -------
    array : np.ndarray
        The array read from the file.
    """
    array = np.fromfile(file=file, count=count, dtype=dtype)
    if sys.byteorder == "big":
        array.byteswap(inplace=True)
    return array


# export nothing from this module since it's not part of the public API
__all__ = []  # type: List[str]