# -*- coding: utf-8 -*-

import numpy as np

from ..common.logger import open_file
from .vocab import Tokens


def read_embedding_file(embedding_filename, encoding,
                        vocab=None, add_unk=True, tensor_fn=np.array):
    """Load word embeddings from a whitespace-separated text file.

    Every non-blank line must contain a word followed by at least one
    numeric component, all separated by whitespace. Blank lines are skipped.

    Args:
        embedding_filename: File name handed to ``open_file``.
        encoding: Text encoding handed to ``open_file``.
        vocab: Optional container supporting ``in``. When given, lines whose
            word is not in it are skipped.
        add_unk: When true, ``Tokens.UNK`` is stored at index 0 with an
            all-zero vector as long as the first loaded vector.
        tensor_fn: Callable applied to the list of vectors (a list of lists
            of floats) to build the returned tensor.

    Returns:
        A ``(words, tensor)`` pair where ``words[i]`` is the word belonging
        to row ``i`` of ``tensor_fn(vectors)``.

    Raises:
        ValueError: If a non-blank line has no vector components, if a
            component cannot be parsed as a float, or if ``add_unk`` is true
            but no vector was loaded (the UNK dimension cannot be inferred).
    """
    words, vectors = [], []
    if add_unk:
        words.append(Tokens.UNK)
        # Placeholder: the dimension is only known after the first real
        # vector has been read, so the zero vector is filled in afterwards.
        vectors.append(None)

    with open_file(embedding_filename, 'r', encoding=encoding) as fp:
        for line_no, line in enumerate(fp, start=1):
            fields = line.split()
            if not fields:
                continue
            # Explicit check instead of ``assert``: assertions are removed
            # under ``python -O`` and malformed lines would slip through.
            if len(fields) < 2:
                raise ValueError(
                    'Malformed embedding line %d in %r: expected a word '
                    'followed by at least one value, got %r'
                    % (line_no, embedding_filename, line))
            word, *vector = fields
            if vocab is not None and word not in vocab:
                continue

            words.append(word)
            vectors.append([float(value) for value in vector])

    # Key off ``add_unk`` rather than peeking at ``vectors[0]`` so that an
    # empty result with ``add_unk=False`` does not raise IndexError.
    if add_unk:
        if len(vectors) < 2:
            raise ValueError(
                'No embeddings were loaded from %r; cannot infer the '
                'dimension of the UNK vector' % (embedding_filename,))
        vectors[0] = [0.0] * len(vectors[1])

    return words, tensor_fn(vectors)
