import os
import sys

def readTxt(filename):
    """Load a text file as a list of whitespace-trimmed lines.

    A ``Reading <filename>`` progress message is printed before the file
    is opened.

    Args:
        filename: Path of the text file to read.

    Returns:
        One string per line, in file order, each with leading and trailing
        whitespace removed (a blank line becomes ``""``).
    """
    print("Reading " + filename)
    with open(filename) as handle:
        return [raw.strip() for raw in handle]

def main(argv):
    """Print the largest token counts near the end of two parallel files.

    ``argv`` is laid out like ``sys.argv``:
    ``[prog, input1, input2, outputdir, maxlen]``.

    Both inputs are loaded with ``readTxt`` and walked in lockstep (the
    walk stops at the shorter file). Every line is split on single spaces
    and, for each side, the maximum token count among the last 100000
    pairs is printed (``0`` when there are no pairs).

    NOTE: the step that wrote out only the pairs in which both sides have
    fewer than ``maxlen`` tokens is disabled in this revision. The output
    directory and the two output files (named after the inputs) are still
    created/truncated, as before, but nothing is written to them.

    Args:
        argv: Command-line arguments, including the program name.

    Returns:
        0 on success, 1 when too few arguments were supplied.

    Raises:
        ValueError: If ``<maxlen>`` is not an integer.
        OSError: If an input cannot be read or an output cannot be created.
    """
    if len(argv) < 5:
        print("Usage: python3 filterByMaxSize.py <input1> <input2> <outputdir> <maxlen>")
        # Stop here: carrying on would only fail with an IndexError below.
        return 1

    input1, input2, outdir = argv[1], argv[2], argv[3]

    data1 = readTxt(input1)
    data2 = readTxt(input2)

    os.makedirs(outdir, exist_ok=True)

    # Create (or truncate) the output files and close them right away so
    # no handle is leaked while the write step is disabled.
    out_path1 = os.path.join(outdir, os.path.basename(input1))
    out_path2 = os.path.join(outdir, os.path.basename(input2))
    with open(out_path1, 'w'), open(out_path2, 'w'):
        pass

    # Not used while the write step is disabled; still parsed so that a
    # non-integer <maxlen> is rejected.
    maxlen = int(argv[4])

    tail = 100000  # only the last `tail` pairs are inspected
    xlengths, ylengths = [], []
    for x, y in zip(data1, data2):
        xlengths.append(len(x.split(' ')))
        ylengths.append(len(y.split(' ')))

    # default=0 keeps empty inputs from raising ValueError in max().
    print(max(xlengths[-tail:], default=0))
    print(max(ylengths[-tail:], default=0))
    return 0


if __name__ == "__main__":
    # Example invocation:
    # python3 filterByMaxSize.py /mnt/bd/lab-wxz/clt/langReverse/MSPM/train.merged.spm.doc /mnt/bd/lab-wxz/clt/langReverse/MSPM/train.merged.spm.sum /mnt/bd/lab-wxz/clt/langReverse/MSPM/filtered/ 512
    sys.exit(main(sys.argv))