Source code for finalfusion.compat.text

"""
Text based embedding formats.
"""

import re
from os import PathLike
from typing import Union, TextIO

import numpy as np

from finalfusion.embeddings import Embeddings
from finalfusion._util import _normalize_matrix
from finalfusion.storage import NdArray
from finalfusion.vocab import SimpleVocab

_ASCII_WHITESPACE_PAT = re.compile(r'(?a)\s+')


[docs]def load_text_dims(file: Union[str, bytes, int, PathLike],
                   lossy: bool = False) -> Embeddings:
    """
    Read emebddings in text-dims format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    The first line contains whitespace separated rows and cols, the rest of the file contains
    whitespace separated word and vector components.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in text format with dimensions on the first line.
    lossy : bool
        If set to true, malformed UTF-8 sequences in words will be replaced with the `U+FFFD`
        REPLACEMENT character.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, encoding='utf8',
              errors='replace' if lossy else 'strict') as inf:
        rows, cols = next(inf).split()
        return _load_text(inf, int(rows), int(cols))


[docs]def load_text(file: Union[str, bytes, int, PathLike],
              lossy: bool = False) -> Embeddings:
    """
    Read embeddings in text format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    Expects a file with utf-8 encoded lines with:

    * word at the start of the line
    * followed by whitespace
    * followed by whitespace separated vector components

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.
    lossy : bool
        If set to true, malformed UTF-8 sequences in words will be replaced with the `U+FFFD`
        REPLACEMENT character.

    Returns
    -------
    embeddings : Embeddings
        Embeddings from the input file. The resulting Embeddings will have a
        SimpleVocab, NdArray and Norms.
    """
    with open(file, encoding='utf8',
              errors='replace' if lossy else 'strict') as inf:
        try:
            first = next(inf)
        except StopIteration:
            raise ValueError("Can't read from empty embeddings file.")
        line = _ASCII_WHITESPACE_PAT.split(first.rstrip())
        cols = len(line[1:])
        rows = sum(1 for _ in inf) + 1
        inf.seek(0)
        return _load_text(inf, rows, cols)


[docs]def write_text(file: Union[str, bytes, int, PathLike],
               embeddings: Embeddings,
               sep=" "):
    """
    Write embeddings in text format.

    Embeddings are un-normalized before serialization, if norms are present, each embedding is
    scaled by the associated norm.

    The output consists of utf-8 encoded lines with:
        * word at the start of the line
        * followed by whitespace
        * followed by whitespace separated vector components

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        Embeddings to write
    sep : str
        Separator of word and embeddings.
    """
    _write_text(file, embeddings, False, sep=sep)


[docs]def write_text_dims(file: Union[str, bytes, int, PathLike],
                    embeddings: Embeddings,
                    sep=" "):
    """
    Write embeddings in text-dims format.

    Embeddings are un-normalized before serialization, if norms are present, each embedding is
    scaled by the associated norm.

    The output consists of utf-8 encoded lines with:
        * `rows cols` on the **first** line
        * word at the start of the line
        * followed by whitespace
        * followed by whitespace separated vector components

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        Embeddings to write
    sep : str
        Separator of word and embeddings.
    """
    _write_text(file, embeddings, True, sep=sep)


def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    for row, line in zip(matrix, file):
        parts = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(parts[0])
        row[:] = parts[1:]
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_matrix(storage),
                      vocab=SimpleVocab(words),
                      origin=file.name)


def _write_text(file: Union[str, bytes, int, PathLike],
                embeddings: Embeddings,
                dims: bool,
                sep=" "):
    vocab = embeddings.vocab
    matrix = embeddings.storage[:len(vocab)]
    with open(file, 'w', encoding='utf8') as outf:
        if dims:
            print(*matrix.shape, file=outf)
        for idx, word in enumerate(vocab):
            row = matrix[idx]  # type: np.ndarray
            if embeddings.norms is not None:
                row = row * embeddings.norms[idx]
            print(word, ' '.join(map(str, row)), sep=sep, file=outf)


__all__ = ['load_text', 'load_text_dims', 'write_text', 'write_text_dims']