import re

# At each position token_re tries, in order: a run of digits, a run of ASCII
# letters, or a run of any characters other than the literal space. The last
# alternative is greedy, so ",abc" is returned as one token.
# NOTE(review): `[^ ]` excludes only ' ', so tabs/newlines end up inside
# "other" tokens -- confirm this is intended before changing the pattern.
token_re = re.compile(r'\d+|[a-zA-Z]+|[^ ]+')
digits_re = re.compile(r'\d+')
word_re = re.compile(r'\w+')


def tokenize(text, subword_len=3):
    """
    Split *text* into tokens, each described by a frozenset of attributes.

    Every attribute is a ``(name, value)`` tuple. Raw tokens are produced by
    ``token_re`` and then classified:

    * all-digit tokens yield one set ``{('type', T)}`` where ``T`` is
      ``'1-digit'``, ``'2-digit'``, ``'3-digit'`` or ``'4+-digit'``; the
      digits themselves are not kept;
    * tokens fully matching ``\\w+`` yield one set per sliding window of
      ``subword_len`` characters (or a single set for the whole token when it
      is not longer than ``subword_len``). Each set holds
      ``('type', 'word')``, ``('sub', <lowercased window>)`` and a ``'pos'``
      attribute: ``'start'`` for the first window, ``'end'`` for the last,
      ``'middle'`` otherwise. A token with a single window carries both
      ``'start'`` and ``'end'``;
    * anything else yields ``{('type', 'other'), ('sub', <token>)}`` with the
      token text unchanged.

    Args:
        text: The string to tokenize.
        subword_len: Window size used to split word tokens; must be >= 1.

    Returns:
        A list of frozensets, in order of appearance in *text*.

    Raises:
        ValueError: If ``subword_len`` is smaller than 1.
    """
    if subword_len < 1:
        # A zero or negative window would silently produce empty or
        # meaningless subwords for every word token.
        raise ValueError(f'subword_len must be >= 1, got {subword_len!r}')

    r = []
    for s in token_re.findall(text):
        if digits_re.fullmatch(s):
            tk_type = f'{len(s)}-digit' if len(s) <= 3 else '4+-digit'
            r.append(frozenset({('type', tk_type)}))
        elif word_re.fullmatch(s):
            if len(s) > subword_len:
                # Slice first, then lowercase: lower() may change the length
                # of some Unicode strings, which would shift the windows.
                subwords = [
                    s[i:i + subword_len].lower()
                    for i in range(len(s) - subword_len + 1)
                ]
            else:
                subwords = [s.lower()]

            last = len(subwords) - 1
            for i, sub in enumerate(subwords):
                tk = {('type', 'word'), ('sub', sub)}
                if i == 0:
                    tk.add(('pos', 'start'))
                if i == last:
                    tk.add(('pos', 'end'))
                if 0 < i < last:
                    tk.add(('pos', 'middle'))
                r.append(frozenset(tk))
        else:
            r.append(frozenset({('type', 'other'), ('sub', s)}))

    return r


def tok1(text):
    """
    Tokenizer tok1, as defined by the original paper.

    Scans *text* left to right. Whitespace and non-printable characters are
    skipped. Any other character opens a token, which is then extended over
    the alphanumeric characters that immediately follow it. As a result a
    punctuation character directly followed by letters/digits stays glued to
    them (``"a.b"`` gives ``['a', '.b']``).

    Returns the list of token strings in order of appearance.
    """
    pieces = []
    size = len(text)
    start = 0
    while start < size:
        ch = text[start]
        if ch.isspace() or not ch.isprintable():
            # Separator or control character: never part of a token.
            start += 1
            continue

        # The opening character is taken unconditionally; only the
        # continuation is restricted to alphanumerics.
        stop = start + 1
        while stop < size and text[stop].isalnum():
            stop += 1

        pieces.append(text[start:stop])
        start = stop
    return pieces


# Separator: one or more whitespace characters, dots, commas, colons or
# hyphens.
tok2_split_re = re.compile(r'[\s\.,:-]+')

def tok2(text):
    """
    Tokenizer tok2, as defined by the original paper.

    Splits *text* on runs of whitespace, '.', ',', ':' and '-' and returns
    the pieces in between, in order.

    ``re.split`` yields an empty string when the text is empty or begins or
    ends with a separator; those empty pieces are dropped so that the result
    contains only real tokens (``tok2("")`` is ``[]``).
    """
    return [piece for piece in tok2_split_re.split(text) if piece]
