import glob
import struct
import json
import re

# Replacement table consulted by clean(): each key is a token matched by
# clean()'s regex and the value is the text substituted for it.
REMAP = {
    "-lrb-": "(",
    "-rrb-": ")",
    "-lcb-": "{",
    "-rcb-": "}",
    "-lsb-": "[",
    "-rsb-": "]",
    "``": '"',
    "''": '"',
    "\n": '',
    ":": ',',
}

def clean(x):
    """Return ``x`` with every token matched by the pattern below replaced.

    Each match (bracket placeholders such as ``-lrb-``, doubled backticks,
    doubled apostrophes, newlines and colons) is swapped for the value stored
    under that exact text in the module-level ``REMAP`` table.
    """
    def _lookup(match):
        # The matched text itself is the key into REMAP.
        return REMAP.get(match.group())

    pattern = r"-lrb-|-rrb-|-lcb-|-rcb-|-lsb-|-rsb-|``|''|\n|:"
    return re.sub(pattern, _lookup, x)

def clean_str(s):
    """Return ``s`` with a fixed list of substrings deleted, one after another.

    The removals are applied sequentially in the order listed, so the result
    of each deletion is the input of the next one.
    """
    # NOTE(review): because the deletions run in order, several later entries
    # can never match: '``', "''" and "b'" come after their single quote
    # characters were already stripped, the backslash-n pair comes after the
    # lone backslash, '</s>' comes after '/', and '-' is listed twice.
    # The list is kept exactly as-is to preserve the existing output.
    unwanted = (
        'b"', '-lrb-', '-rrb-', '-', '“', '"', "'", "`", '``', "''",
        "b'", '/', '\\', '\\n', '-', '<s>', '</s>',
    )
    stripped = s
    for token in unwanted:
        stripped = stripped.replace(token, '')
    return stripped

def end_replace(s):
    """Return ``s`` with every '!', '?' and ';' turned into a '.'."""
    # A single translation pass maps each of the three characters to a period.
    to_period = str.maketrans('!?;', '...')
    return s.translate(to_period)


def example_generator_MUTIL(data_path):
    """Yield cleaned ``[multi_article, summary]`` examples from paired files.

    Reads ``data_path + '.txt.src.tokenized.fixed.cleaned.final.truncated.txt'``
    and ``data_path + '.txt.tgt.tokenized.fixed.cleaned.final.truncated.txt'``
    in lockstep, one line per example, and stops as soon as either file has
    no more lines.

    Every line is decoded as UTF-8, run through ``clean`` and ``end_replace``,
    stripped of double quotes, and has square brackets turned into
    parentheses. The source line is then split on
    ``'story_separator_special_tag'`` and each piece, as well as the
    reference line, is split on ``'.'``.

    Both files are opened inside a ``with`` block so they are closed when the
    generator is exhausted, closed early, or fails (previously the handles
    were never closed).

    Args:
        data_path: Path prefix shared by the source and reference files.

    Yields:
        A two-element list ``[multi_article, summary]``: ``multi_article``
        holds one list of '.'-separated fragments per source piece and
        ``summary`` is the list of fragments of the reference line.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    src_path = data_path + '.txt.src.tokenized.fixed.cleaned.final.truncated.txt'
    ref_path = data_path + '.txt.tgt.tokenized.fixed.cleaned.final.truncated.txt'

    def normalize(raw_line):
        # Same cleaning chain for source and reference lines.
        text = end_replace(clean(raw_line.decode('utf-8')))
        return text.replace('"', '').replace('[', '(').replace(']', ')')

    def split_fragments(text):
        fragments = text.split('.')
        # Only the first empty fragment is dropped (list.remove deletes a
        # single occurrence); kept that way so the output stays unchanged.
        if '' in fragments:
            fragments.remove('')
        return fragments

    with open(src_path, 'rb') as reader_src, open(ref_path, 'rb') as reader_ref:
        # zip stops at the shorter file, matching the original "both lines
        # must be non-empty" loop condition.
        for raw_src, raw_ref in zip(reader_src, reader_ref):
            src = normalize(raw_src)
            ref = normalize(raw_ref)
            multi_article = [
                split_fragments(one_src)
                for one_src in src.split('story_separator_special_tag')
            ]
            summary = split_fragments(ref)
            yield [multi_article, summary]


def example_generator_MUTIL_raw(data_path):
    """Yield cleaned ``[multi_article, summary]`` examples from raw paired files.

    Reads ``data_path + '.txt.src'`` and ``data_path + '.txt.tgt'`` in
    lockstep, one line per example, and stops as soon as either file has no
    more lines.

    Every line is decoded as UTF-8, run through ``clean`` and ``end_replace``,
    stripped of double quotes, and has square brackets turned into
    parentheses. The source line is then split on
    ``'story_separator_special_tag'`` and each piece, as well as the
    reference line, is split on ``'.'``.

    Both files are opened inside a ``with`` block so they are closed when the
    generator is exhausted, closed early, or fails (previously the handles
    were never closed).

    Args:
        data_path: Path prefix shared by the source and reference files.

    Yields:
        A two-element list ``[multi_article, summary]``: ``multi_article``
        holds one list of '.'-separated fragments per source piece and
        ``summary`` is the list of fragments of the reference line.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    src_path = data_path + '.txt.src'
    ref_path = data_path + '.txt.tgt'

    def normalize(raw_line):
        # Same cleaning chain for source and reference lines.
        text = end_replace(clean(raw_line.decode('utf-8')))
        return text.replace('"', '').replace('[', '(').replace(']', ')')

    def split_fragments(text):
        fragments = text.split('.')
        # Only the first empty fragment is dropped (list.remove deletes a
        # single occurrence); kept that way so the output stays unchanged.
        if '' in fragments:
            fragments.remove('')
        return fragments

    with open(src_path, 'rb') as reader_src, open(ref_path, 'rb') as reader_ref:
        # zip stops at the shorter file, matching the original "both lines
        # must be non-empty" loop condition.
        for raw_src, raw_ref in zip(reader_src, reader_ref):
            src = normalize(raw_src)
            ref = normalize(raw_ref)
            multi_article = [
                split_fragments(one_src)
                for one_src in src.split('story_separator_special_tag')
            ]
            summary = split_fragments(ref)
            yield [multi_article, summary]