import os
import glob
import argparse
from pytesseract import pytesseract
from PIL import Image
from tqdm import tqdm

def extract_ocr(filepath, output_path, config=''):
    """Run Tesseract OCR over every file in a folder and save the results.

    A sub-folder named after the input folder is created under
    ``output_path`` and one result per input file is written into it.

    Args:
        filepath: Folder containing the files to process. A trailing path
            separator is tolerated.
        output_path: Base folder; results go to
            ``output_path/<name of the input folder>``.
        config: ``'hocr'`` runs Tesseract with the ``hocr`` config through
            ``pytesseract.run_tesseract``. Any other value extracts plain
            text with ``image_to_string`` and writes it to a ``.txt`` file.

    Raises:
        OSError: If the output folder cannot be created or an input file
            cannot be opened.
    """
    # normpath drops a trailing separator; otherwise the folder name of
    # "scans/" would be the empty string and results would be written
    # straight into output_path.
    folder_name = os.path.basename(os.path.normpath(filepath))
    new_output_path = os.path.join(output_path, folder_name)

    # exist_ok tolerates a pre-existing folder while still letting real
    # failures (e.g. permission denied) propagate instead of being
    # reported as "Folder exists".
    already_present = os.path.isdir(new_output_path)
    os.makedirs(new_output_path, exist_ok=True)
    if already_present:
        print("Folder exists")

    # Sub-directories match the wildcard too, but cannot be OCR'd.
    all_files = [
        path for path in glob.glob(os.path.join(filepath, '*'))
        if os.path.isfile(path)
    ]
    if not all_files:
        print("No input files found in", filepath)
        return

    for file in tqdm(all_files):
        # splitext removes only the final extension, so "page.1.png" and
        # "page.2.png" no longer collapse onto the same output name.
        stem = os.path.splitext(os.path.basename(file))[0]
        target_base = os.path.join(new_output_path, stem)

        if config != 'hocr':
            # The context manager releases the file handle after each image.
            with Image.open(file) as image:
                text = pytesseract.image_to_string(image)

            # OCR text may contain non-ASCII characters; fix the encoding
            # rather than depending on the platform default.
            with open(target_base + '.txt', 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            pytesseract.run_tesseract(file, target_base, config='hocr', lang=None, extension='html')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='Extract OCR', conflict_handler='resolve')
    # Both paths are mandatory: without them extract_ocr would receive None.
    parser.add_argument('--input_path', type=str, required=True, help='Input files path')
    parser.add_argument('--output_path', type=str, required=True, help='Base output path')
    parser.add_argument('--config', type=str, help='type of output: hocr or ocr', default='')

    args = parser.parse_args()

    extract_ocr(args.input_path, args.output_path, config=args.config)