import os
import argparse

def gen_lexicon(wrd_path, dict_path, lexicon_path):
    """Generate a character-level lexicon for KenLM decoding in wav2vec (fairseq).

    Every distinct whitespace-separated word found in ``wrd_path`` is spelled
    out as its individual characters. A word is skipped when any of its
    characters is not a symbol listed in ``dict_path``. Each emitted line has
    the form ``word<TAB>c1 c2 ... cn |`` and lines appear in the order the
    words were first seen.

    Args:
        wrd_path: wav2vec training manifest transcript file (``xxx.wrd``).
        dict_path: wav2vec training dict file; the first whitespace-separated
            token of each non-blank line is taken as a symbol.
        lexicon_path: Output path for the lexicon file (overwritten if it
            already exists).

    Side effects:
        Prints every dict symbol, then a line of 20 asterisks, to stdout.
    """
    dict_set = set()
    with open(dict_path, 'r', encoding='utf-8') as dict_file:
        for line in dict_file:
            fields = line.split()
            if not fields:
                # Blank line: there is no symbol to read.
                continue
            symbol = fields[0]
            # Echo the loaded symbol so the vocabulary shows up in the run log.
            print(symbol)
            dict_set.add(symbol)
    print('*' * 20)

    seen_words = set()
    lexicon_lines = []
    with open(wrd_path, 'r', encoding='utf-8') as wrd_file:
        for line in wrd_file:
            # split() already discards the trailing newline. Slicing off the
            # last character instead would truncate the final word of a file
            # that does not end with a newline.
            for word in line.split():
                if word in seen_words:
                    continue
                seen_words.add(word)
                # Keep the word only if every character is a known symbol.
                if all(char in dict_set for char in word):
                    lexicon_lines.append(
                        "{}\t{} |\n".format(word, " ".join(word))
                    )

    with open(lexicon_path, 'w', encoding='utf-8') as lexicon_file:
        lexicon_file.writelines(lexicon_lines)


if __name__ == "__main__":
    # Command-line entry point: all three path options are mandatory.
    cli = argparse.ArgumentParser()
    for flag in ("--wrd-path", "--dict-path", "--lexicon-path"):
        cli.add_argument(flag, required=True)
    opts = cli.parse_args()

    gen_lexicon(
        wrd_path=opts.wrd_path,
        dict_path=opts.dict_path,
        lexicon_path=opts.lexicon_path,
    )

