Source code for finalfusion.compat.word2vec

"""
Word2vec binary format.
"""

import sys
from os import PathLike
from typing import Union, BinaryIO, AnyStr

import numpy as np

from finalfusion.embeddings import Embeddings
from finalfusion.io import _serialize_array_as_le
from finalfusion.storage import NdArray
from finalfusion._util import _normalize_matrix
from finalfusion.vocab import SimpleVocab


def load_word2vec(file: Union[str, bytes, int, PathLike],
                  lossy: bool = False) -> Embeddings:
    """
    Read embeddings in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms
    chunk. The storage is l2-normalized per default and the corresponding
    norms are stored in the Norms.

    Files are expected to start with a line containing rows and cols in utf-8.
    Words are encoded in utf-8 followed by a single whitespace. After the
    whitespace, the embedding components are expected as little-endian
    single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.
    lossy : bool
        If set to true, malformed UTF-8 sequences in words will be replaced
        with the `U+FFFD` REPLACEMENT character.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.

    Raises
    ------
    EOFError
        If the file ends before all rows announced in the header could be
        read, either in the middle of a word or in the middle of a vector.
    """
    words = []
    with open(file, 'rb') as inf:
        rows, cols = map(int, inf.readline().decode("ascii").split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            # The delimiter is a single space; strip() drops the newline that
            # terminates the previous row and therefore precedes this word.
            words.append(_read_binary_word(inf, b' ', lossy).strip())
            array = np.fromfile(file=inf, count=cols, dtype=np.float32)
            # np.fromfile returns a shorter array instead of raising when the
            # file is truncated. A one-element result would otherwise be
            # broadcast silently across the whole row.
            if array.size != cols:
                raise EOFError(
                    f"Expected {cols} components for row {len(words) - 1}, "
                    f"but only {array.size} could be read.")
            # Components are stored little-endian; fromfile read them in
            # native byte order, so swap on big-endian hosts.
            if sys.byteorder == "big":
                array.byteswap(inplace=True)
            row[:] = array
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_matrix(storage),
                      vocab=SimpleVocab(words),
                      origin=inf.name)


def write_word2vec(file: Union[str, bytes, int, PathLike],
                   embeddings: Embeddings):
    r"""
    Serialize embeddings to a file in word2vec binary format.

    Only the rows of the storage that belong to entries of the vocabulary are
    written. For embeddings that do not fit the w2v format (e.g. those with a
    SubwordVocab) this means the subword matrix is dropped.

    If norms are present, every embedding is multiplied by its norm before
    being written, i.e. the embeddings are un-normalized.

    Layout of the output file:

    * first line: ``rows columns`` followed by ``'\n'``
    * then, for every word:

      * the utf-8 encoded word
      * a single space ``' '``
      * ``cols`` single-precision floating point numbers
      * a ``'\n'`` newline

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        The embeddings to serialize.
    """
    known_words = embeddings.vocab
    vectors = embeddings.storage[:len(known_words)]
    with open(file, 'wb') as sink:
        header = f'{vectors.shape[0]} {vectors.shape[1]}\n'
        sink.write(header.encode('ascii'))
        for position, token in enumerate(known_words):
            vector = vectors[position]  # type: np.ndarray
            if embeddings.norms is not None:
                # Undo the normalization by scaling with the stored norm.
                vector = vector * embeddings.norms[position]
            encoded_token = token.encode('utf-8')
            sink.write(encoded_token)
            sink.write(b' ')
            _serialize_array_as_le(sink, vector)
            sink.write(b'\n')


def _read_binary_word(inf: BinaryIO, delim: AnyStr, lossy: bool):
    """
    Read single bytes from ``inf`` up to ``delim`` and decode them as utf-8.

    The delimiter is consumed but is not part of the returned word.

    Parameters
    ----------
    inf : BinaryIO
        Stream to read from, one byte at a time.
    delim : AnyStr
        Value that terminates the word; it is compared against each
        single-byte read.
    lossy : bool
        If true, malformed utf-8 is decoded with ``errors='replace'``,
        otherwise with ``errors='strict'``.

    Returns
    -------
    word : str
        The decoded bytes that preceded the delimiter.

    Raises
    ------
    EOFError
        If the stream is exhausted before the delimiter is found.
    """
    pieces = []
    # iter(callable, sentinel) keeps reading one byte at a time and stops as
    # soon as a read equals the delimiter.
    for piece in iter(lambda: inf.read(1), delim):
        if piece == b'':
            raise EOFError
        pieces.append(piece)
    error_mode = 'replace' if lossy else 'strict'
    return b''.join(pieces).decode('utf-8', errors=error_mode)


__all__ = ['load_word2vec', 'write_word2vec']