# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re


# Matches any run of one or more whitespace characters. The tokenizers in
# this module substitute a single space for each match before splitting.
SPACE_NORMALIZER = re.compile(r"\s+")


def tokenize_line(line, convert_lower=False):
    """Split ``line`` into a list of whitespace-delimited tokens.

    Every run of whitespace is collapsed to a single space and the ends are
    trimmed; the text is then optionally lowercased and split.

    Args:
        line: Text to tokenize.
        convert_lower: When true, lowercase the text before splitting.

    Returns:
        A list of token strings; empty when ``line`` is empty or contains
        only whitespace.
    """
    normalized = SPACE_NORMALIZER.sub(" ", line).strip()
    text = normalized.lower() if convert_lower else normalized
    return text.split()


def tokenize_line_to_word(line, convert_lower=False):
    """Tokenize ``line`` on whitespace, detaching ``'s`` and ``ly`` suffixes.

    Whitespace runs are collapsed and the ends trimmed, the text is optionally
    lowercased, and then two plain-text rewrites are applied before splitting:

    * `` ##`` is inserted before every occurrence of ``'s``;
    * `` ##`` is inserted before every ``ly`` that is followed by whitespace
      or that ends the text.

    Args:
        line: Text to tokenize.
        convert_lower: When true, lowercase the text before the rewrites.

    Returns:
        A list of token strings, e.g. ``"She's quickly"`` yields
        ``["She", "##'s", "quick", "##ly"]``.
    """
    line = SPACE_NORMALIZER.sub(" ", line).strip()
    if convert_lower:
        line = line.lower()
    line = line.replace("'s", " ##'s").replace("ly ", " ##ly ")
    # Whitespace normalization has already turned every newline into a space,
    # so an "ly" at the very end of the text has no trailing separator to
    # match on. Handle it explicitly so the result does not depend on whether
    # the input carried trailing whitespace (e.g. a newline).
    if line.endswith("ly"):
        line = line[:-2] + " ##ly"
    return line.split()