#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Helper script to write word (.wrd) and letter (.ltr) transcription labels for
the audio files listed in a wav2letter++ dataset manifest (.tsv)
"""

import argparse
import os


def _read_transcript(path):
    """Parse one ``*.trans.txt`` file into a dict of utterance id -> text.

    Each non-blank line is read as ``<utterance-id> <word> <word> ...``,
    e.g. ``374-180298-0000 CHAPTER``. Runs of whitespace between words are
    collapsed to single spaces; blank lines are ignored.
    """
    texts = {}
    with open(path, "r") as trans_f:
        for tline in trans_f:
            items = tline.split()
            if not items:
                # Tolerate blank lines instead of failing on items[0].
                continue
            texts[items[0]] = " ".join(items[1:])
    return texts


def _to_letters(text):
    """Spell a word sequence as space-separated characters.

    Spaces between words become ``|`` and a trailing ``|`` closes the last
    word, e.g. ``"HI THERE"`` -> ``"H I | T H E R E |"``.
    """
    return " ".join(text.replace(" ", "|")) + " |"


def main(argv=None):
    """Write ``.wrd`` and ``.ltr`` label files for every entry of a manifest.

    The manifest's first line is the dataset root directory. Every following
    non-blank line starts with an audio path relative to that root, such as
    ``374/180298/374-180298-0037.flac``; anything after a tab is ignored.
    The transcript for an entry is looked up in
    ``<root>/<dir>/<parent>-<leaf>.trans.txt``, where ``<parent>`` and
    ``<leaf>`` are the last two components of the entry's directory.

    One line per manifest entry is written, in manifest order, to
    ``<output-dir>/<output-name>.wrd`` (words) and
    ``<output-dir>/<output-name>.ltr`` (letters).

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            argparse falls back to ``sys.argv[1:]``.

    Raises:
        ValueError: If the manifest is empty, an entry has fewer than two
            directory components, or an utterance id is missing from its
            transcript file.
        FileNotFoundError: If an expected transcript file does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv")
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--output-name", required=True)
    args = parser.parse_args(argv)

    os.makedirs(args.output_dir, exist_ok=True)

    # Cache of parsed transcript files, keyed by the entry's relative
    # directory, so each transcript is read only once.
    transcriptions = {}

    with open(args.tsv, "r") as tsv, open(
        os.path.join(args.output_dir, args.output_name + ".ltr"), "w"
    ) as ltr_out, open(
        os.path.join(args.output_dir, args.output_name + ".wrd"), "w"
    ) as wrd_out:
        header = next(tsv, None)
        if header is None:
            raise ValueError(f"manifest {args.tsv!r} is empty")
        root = header.strip()

        for line in tsv:
            # Keep only the path column; drop any tab-separated extras.
            rel_path = line.strip().split("\t", 1)[0]
            if not rel_path:
                continue

            rel_dir = os.path.dirname(rel_path)  # e.g. '374/180298'
            if rel_dir not in transcriptions:
                parts = rel_dir.split(os.path.sep)  # e.g. ['374', '180298']
                if len(parts) < 2:
                    raise ValueError(
                        f"manifest entry {rel_path!r} needs at least two "
                        "directory components to locate its transcript"
                    )
                trans_name = f"{parts[-2]}-{parts[-1]}.trans.txt"
                trans_path = os.path.join(root, rel_dir, trans_name)
                # Explicit check rather than `assert`, which -O strips.
                if not os.path.exists(trans_path):
                    raise FileNotFoundError(
                        f"transcript file not found: {trans_path}"
                    )
                transcriptions[rel_dir] = _read_transcript(trans_path)

            # Utterance id is the file name up to its first dot,
            # e.g. '374-180298-0037'.
            utt_id = os.path.basename(rel_path).split(".")[0]
            texts = transcriptions[rel_dir]
            if utt_id not in texts:
                raise ValueError(
                    f"utterance {utt_id!r} has no transcript for "
                    f"directory {rel_dir!r}"
                )

            words = texts[utt_id]
            print(words, file=wrd_out)
            print(_to_letters(words), file=ltr_out)
# Note: the transcripts are upper-case with punctuation removed.

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
