import re
from typing import List, Tuple

from cryptoaddress import BitcoinAddress, EthereumAddress, LitecoinAddress

from normalizer_utils import extract_spans_with_context, check_overlap_with_spans, remove_overlapping_spans_from_back


# Leading strings a candidate token must start with before the corresponding
# address validator is attempted in extract_crypto_addresses().
BTC_ADDRESS_PREFIXES = ['1', '3', 'bc1']
# No Ethereum prefix list is in use: extract_crypto_addresses() runs the
# Ethereum validator on every candidate token.
#ETH_ADDRESS_PREFIXES = []
LTC_ADDRESS_PREFIXES = ['L', 'M', '3']

# Identifier tokens for crypto-address categories.
# NOTE(review): presumably these tokens stand in for detected spans during
# text normalisation -- confirm against the callers of this module.
IDENTIFIER_BTC_ADDRESS = 'ID_BTC_ADDRESS'
IDENTIFIER_ETH_ADDRESS = 'ID_ETH_ADDRESS'
IDENTIFIER_LTC_ADDRESS = 'ID_LTC_ADDRESS'
IDENTIFIER_OTHER_ADDRESS = 'ID_OTHER_ADDRESS'

# Identifier tokens for the remaining entity categories.
IDENTIFIER_EMAIL = 'ID_EMAIL'
IDENTIFIER_ONION_URL = 'ID_ONION_URL'
IDENTIFIER_NORMAL_URL = 'ID_NORMAL_URL'
IDENTIFIER_FILENAME = 'ID_FILENAME'
IDENTIFIER_NUMBER = 'ID_NUMBER'
IDENTIFIER_CRYPTO_MONEY = 'ID_CRYPTO_MONEY'
IDENTIFIER_GENERAL_MONEY = 'ID_GENERAL_MONEY'
IDENTIFIER_LENGTH = 'ID_LENGTH'
IDENTIFIER_WEIGHT = 'ID_WEIGHT'
IDENTIFIER_VOLUME = 'ID_VOLUME'
IDENTIFIER_PERCENTAGE = 'ID_PERCENTAGE'
IDENTIFIER_IP_ADDRESS = 'ID_IP_ADDRESS'
IDENTIFIER_TIME = 'ID_TIME'
IDENTIFIER_FILESIZE = 'ID_FILESIZE'
IDENTIFIER_VERSION = 'ID_VERSION'
IDENTIFIER_BRAND_NAME = 'ID_BRAND_NAME'

# Set of all identifier values, gathered by scanning the module globals for
# names starting with 'IDENTIFIER_' whose value starts with 'ID_'.
# Only names already defined when this line runs are picked up, so any new
# IDENTIFIER_* constant must be declared ABOVE this statement.
IDENTIFIERS_ALL = frozenset(value for key, value in globals().items() if key.startswith('IDENTIFIER_') and value.startswith('ID_'))


# Organised collection of common file extensions (without the leading dot),
# grouped by category.
# Source list: https://github.com/dyne/file-extension-list
# Some extensions appear under more than one category (e.g. 'yuv', 'sh',
# 'html', 'js', 'php'); the flattened FILE_EXTENSIONS set below de-duplicates them.
# NOTE(review): the key 'archiv' is spelled without a trailing 'e'; it is a
# runtime key and is left unchanged here.
FILE_EXTENSIONS_BY_CATEGORY = {
    'sheet': ['ods', 'xls', 'xlsx', 'csv', 'ics', 'vcf'],
    'db': ['db', 'sql'],
    'image': ['3dm', '3ds', 'max', 'bmp', 'dds', 'gif', 'jpg', 'jpeg', 'png', 'psd', 'xcf', 'tga', 'thm', 'tif', 'tiff',
              'yuv', 'ai', 'eps', 'ps', 'svg', 'dwg', 'dxf', 'gpx', 'kml', 'kmz', 'webp'],
    'video': ['3g2', '3gp', 'aaf', 'asf', 'avchd', 'avi', 'drc', 'flv', 'm2v', 'm4p', 'm4v', 'mkv', 'mng', 'mov', 'mp2',
              'mp4', 'mpe', 'mpeg', 'mpg', 'mpv', 'mxf', 'nsv', 'ogg', 'ogv', 'ogm', 'qt', 'rm', 'rmvb', 'roq', 'srt',
              'svi', 'vob', 'webm', 'wmv', 'yuv'],
    'audio': ['aac', 'aiff', 'ape', 'au', 'flac', 'gsm', 'it', 'm3u', 'm4a', 'mid', 'mod', 'mp3', 'mpa', 'pls', 'ra',
              's3m', 'sid', 'wav', 'wma', 'xm'],
    'archiv': ['7z', 'a', 'apk', 'ar', 'bz2', 'cab', 'cpio', 'deb', 'dmg', 'egg', 'gz', 'iso', 'jar', 'lha', 'mar',
               'pea', 'rar', 'rpm', 's7z', 'shar', 'tar', 'tbz2', 'tgz', 'tlz', 'war', 'whl', 'xpi', 'zip', 'zipx',
               'xz', 'pak'],
    'exec': ['exe', 'msi', 'bin', 'command', 'sh', 'bat', 'crx'],
    'code': ['c', 'cc', 'class', 'clj', 'cpp', 'cs', 'cxx', 'el', 'go', 'h', 'java', 'lua', 'm', 'm4', 'php', 'pl',
             'po', 'py', 'rb', 'rs', 'sh', 'swift', 'vb', 'vcxproj', 'xcodeproj', 'xml', 'diff', 'patch', 'html', 'js'],
    'web': ['html', 'htm', 'css', 'js', 'jsx', 'less', 'scss', 'wasm', 'php'],
    'font': ['eot', 'otf', 'ttf', 'woff', 'woff2'],
    'slide': ['ppt', 'pptx', 'odp'],
    'text': ['doc', 'docx', 'ebook', 'log', 'md', 'msg', 'odt', 'org', 'pages', 'pdf', 'rtf', 'rst', 'tex', 'txt', 'wpd',
             'wps'],
    'book': ['mobi', 'epub', 'azw1', 'azw3', 'azw4', 'azw6', 'azw', 'cbr', 'cbz'],
}

# Flat, de-duplicated set of every extension listed above; used to build the
# filename regexes further down.
FILE_EXTENSIONS = frozenset(ext for exts in FILE_EXTENSIONS_BY_CATEGORY.values() for ext in exts)





# Candidate token for a crypto address: a run of 25-50 ASCII letters/digits
# delimited on both sides by a non-alphanumeric character or the string
# boundary. Group 1 is the token itself; it is only a candidate and is
# validated afterwards in extract_crypto_addresses().
REGEX_CRYPTO_ADDRESS_CANDIDATE = re.compile(r'(?:[^a-zA-Z0-9]|^)([a-zA-Z0-9]{25,50})(?=[^a-zA-Z0-9]|$)')

# Explicit http(s) URL. The last two characters are constrained so the match
# does not end in Hangul, whitespace, '.' or ',' -- Korean particles are often
# glued to a URL ("~에", "~로" as seen in "https://xxx.yyy에 들어가서 ...").
REGEX_NORMAL_URL = re.compile(r"(https?://[^\s]+[^ㄱ-ㅣ가-힣\s][^\.,\s])")
# Cheap indicator used to locate places where an http(s) URL may start.
RAW_REGEX_NORMAL_URL_INDICATOR = r'(?:[^\w]|^)http'
# Onion hosts with a 16- or 56-character alphanumeric label, optionally
# prefixed with "http://". Group 1 is the URL.
REGEX_ONION_16_URL = re.compile(r"(?:[^a-zA-Z0-9]|^)((?:http://)?[a-zA-Z0-9]{16}\.onion)")
REGEX_ONION_56_URL = re.compile(r"(?:[^a-zA-Z0-9]|^)((?:http://)?[a-zA-Z0-9]{56}\.onion)")
RAW_REGEX_ONION_URL_INDICATOR = r'\.onion(?=[^\w]|$)'
# Popular domain suffixes for scheme-less URLs such as "example.com".
# The compound suffixes (co.kr, go.kr) MUST be tried before the single-label
# ones: regex alternation is leftmost-first and the trailing lookahead accepts
# '.', so with 'co' listed first "naver.co.kr" would be cut off at "naver.co".
RAW_REGEX_POPULAR_URL_INDICATOR = (r'\.(?:'
                                  r'co\.kr|go\.kr'
                                  r'|com|org|net|int|edu|gov|mil|info|biz|xyz|ly|io|tv|ws|co|me|cc'
                                  r'|uk|us|tk|de|cn|ru|jp|tw|ca'
                                  r')(?=[^\w#]|$)')
# Scheme-less URL: 3-30 host characters followed by a popular suffix.
# The left boundary rejects '@', '/' and '.' so e-mail domains and parts of
# longer URLs/paths are not picked up. Group 1 is the URL.
REGEX_POPULAR_URL = re.compile((r"(?:[^a-zA-Z0-9@/.]|^)([a-zA-Z0-9._-]{3,30}" + RAW_REGEX_POPULAR_URL_INDICATOR + ")"))

# E-mail address: local part, '@', domain and an alphabetic final label.
# Group 1 is the address. The character before the match must not be one of
# '.', '+', '_' or '-' (or the match starts at the beginning of the text).
REGEX_EMAIL_ADDR = re.compile(r"(?:[^.+_-]|^)([a-zA-Z0-9.+_-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]+)")

# A dot followed by any known file extension, which must be followed by a
# non-word character or the end of the text. FILE_EXTENSIONS is a frozenset, so
# the order of alternatives in the pattern text is not fixed; the trailing
# lookahead makes the match independent of that order.
RAW_REGEX_FILENAME_INDICATOR = r"\.(?:" + '|'.join(FILE_EXTENSIONS) + r")(?=[^\w]|$)"
# File name: a run without whitespace, '/' or ':' that ends in a known
# extension. Group 1 is the file name (directory parts are excluded by the
# '/' restriction).
REGEX_FILENAME = re.compile(r"(?:[\s/:]|^)([^\s/:]+" + RAW_REGEX_FILENAME_INDICATOR + r")")

# Unit spellings for the quantity regexes defined further down. Each list is
# joined with '|' into a regex alternation, so entries are regex fragments
# (note the escaped r'\$'). Where one spelling is a prefix of another, the
# longer one is listed first (e.g. 'bitcoins' before 'bitcoin').

# Crypto currency symbols and names: Bitcoin (with SI-style prefixes), Ether, Monero.
UNITS_CRYPTO_CURRENCY = \
    ['₿', 'Ƀ', '฿', 'bitcoins', 'bitcoin', 'satoshi',
     'uBTC', 'μBTC', 'mBTC', 'cBTC', 'dBTC', 'deBTC', 'hBTC', 'kBTC', 'MBTC', 'BTC',] \
    + ['Ethers', 'Ether', 'ETH', ] \
    + ['XMR', ]
# Fiat currency symbols and codes.
UNITS_GENERAL_CURRENCY = \
    [r'\$', 'USD', '€', 'Euros', 'Euro', 'EUR', '£', 'GBP', '¥', 'JPY', '¥', 'CNY', '₽', 'руб', 'RUB']
UNITS_LENGTH = ['cm', 'mm', 'm']           # for weapons and drugs
UNITS_WEIGHT = ['µg', 'ug', 'grams', 'gram', 'g', 'mcg', 'mg',  'kilograms', 'kilogram', 'kg', 'lbs', 'lb']     # for weapons and drugs
UNITS_VOLUME = ['l', 'ml']                 # for drugs
UNITS_PERCENTAGE = ['%']                   # for drugs, etc.
UNITS_FILESIZE = ['kb', 'mb', 'gb', 'tb', 'bytes']  # for hacking, leaks
# Boundaries shared by the quantity regexes. The left boundary consumes one
# non-alphanumeric character (or anchors at the start of the text); the right
# boundary is a lookahead and consumes nothing.
UNIT_LEFT_BOUNDARY = r"(?:^|[^a-zA-Z0-9])"
UNIT_RIGHT_BOUNDARY = r"(?=$|[^a-zA-Z0-9])"

# Character class of hyphen-like range separators, and the same class extended
# with a blank so that "3 5" is also read as a range.
REGEX_HYPHENS = r'[\-᠆‐‑‒–—―−⁓﹘﹣－~]'
REGEX_HYPHENS_OR_BLANK = REGEX_HYPHENS.replace('[', '[ ')

# One number: digits with optional thousands commas and an optional decimal
# part ("1,000.50"), or a bare decimal fraction (".5").
RAW_REGEX_NUMBER_SINGLE = r"(?:(?:\d[\d,]*(?:\.\d+)?)|(?:\.\d+))"
# A single number, optionally followed by a separator and a second number
# (a range such as "3-5" or "3 - 5").
RAW_REGEX_NUMBER = f"{RAW_REGEX_NUMBER_SINGLE}(?: ?{REGEX_HYPHENS_OR_BLANK} ?{RAW_REGEX_NUMBER_SINGLE})?"

# Stand-alone number or range; group 1 is the numeric text.
REGEX_NUMBER = re.compile(UNIT_LEFT_BOUNDARY + "(" + RAW_REGEX_NUMBER + ")" + UNIT_RIGHT_BOUNDARY)
# English month names followed by their abbreviations. Full names come first
# so the longest spelling is tried first in the alternation ('May' needs no
# separate abbreviation). Dotted abbreviations such as "Aug." are not listed:
# REGEX_TIME allows an optional '.' after the month instead.
UNITS_MONTH = [
    'January', 'February', 'March', 'April', 'May', 'June', 'July',
    'August', 'September', 'October', 'November', 'December',
    'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sept', 'Sep', 'Oct', 'Nov', 'Dec',
]
# Non-capturing alternation over all month spellings.
# The group is closed with a plain ')': a stray ':' before it would turn the
# last alternative into the literal "Dec:" and plain "Dec" would never match.
RAW_REGEX_MONTH = r'(?:' + r'|'.join(UNITS_MONTH) + r')'

# Dotted-quad IPv4 address with an optional ":port" suffix; group 1 is the
# address. Octets are only checked for length (1-3 digits), not for the 0-255
# range, and the match must not be adjacent to another digit.
REGEX_IP_ADDRESS = re.compile(r"""(?:^|[^\d])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?::\d{1,5})?)(?=$|[^\d])""")
# Date (optionally followed by a time of day) in several layouts; group 1 is
# the matched text. Compiled with re.VERBOSE, so whitespace and '#' comments
# inside the pattern are ignored except inside character classes such as [ -].
# Layouts: ISO-like "YYYY-MM-DD" / "YYYY-Mon-DD", dot- or slash-separated
# numeric dates, "DD-MM-YYYY", "21-Aug-2020" and "Aug-21-2020".
# Month names are matched case-sensitively (no re.IGNORECASE here).
REGEX_TIME = re.compile(UNIT_LEFT_BOUNDARY + r"""(
        (?: 
            [12]\d{3}-(?:[0-1]\d|""" + RAW_REGEX_MONTH + r""")-[0-3]\d,?  # "2020-09-30" or "2020-Sep-30"
                (?:\s+[0-2]\d:[0-5]\d:[0-5]\d|\s+[0-2]\d:[0-5]\d)?   # "13:35:02" or "13:35"
            |\d{2,4}[.]\d{1,2}[.]\d{2,4}   
            |\d{2,4}[/]\d{1,2}[/]\d{2,4}   
            |[0-3]\d[-][0-3]\d[-][12]\d{3}   # DD-MM-YYYY or MM-DD-YYYY
            |(?:[0-3]\d|[1-9])[ -]""" + RAW_REGEX_MONTH + r"""\.?,?[ -][12]\d{3}  # "21-Aug-2020"
                (?:\s+[0-2]\d:[0-5]\d:[0-5]\d|\s+[0-2]\d:[0-5]\d)?              
            |""" + RAW_REGEX_MONTH + r"""\.?[ -](?:[0-3]\d|[1-9]),?[ -][12]\d{3}   # "Aug-21-2020"
                (?:\s+[0-2]\d:[0-5]\d:[0-5]\d|\s+[0-2]\d:[0-5]\d)?
        )
    )"""
    + UNIT_RIGHT_BOUNDARY, flags=re.VERBOSE)
# Money and measurement patterns. All are case-insensitive; group 1 is the
# full "unit + number" or "number + unit" text. Between number and unit any
# amount of spaces / non-breaking spaces (\xa0) is allowed.

# Crypto money, unit first (e.g. "BTC 0.5"): left boundary only.
REGEX_CRYPTO_MONEY1 = re.compile(UNIT_LEFT_BOUNDARY
                                 + rf"((?:{'|'.join(UNITS_CRYPTO_CURRENCY)})[ \xa0]*{RAW_REGEX_NUMBER})"
                                 , flags=re.IGNORECASE)
# Crypto money, number first with an optional '+' (e.g. "0.5 BTC", "10+ BTC"): right boundary only.
REGEX_CRYPTO_MONEY2 = re.compile(rf"({RAW_REGEX_NUMBER}\+?[ \xa0]*(?:{'|'.join(UNITS_CRYPTO_CURRENCY)}))"  # e.g. "0.5 BTC"
                                 + UNIT_RIGHT_BOUNDARY, flags=re.IGNORECASE)
# Fiat money, unit first (e.g. "$100", "USD 50"): left boundary only.
REGEX_GENERAL_MONEY1 = re.compile(UNIT_LEFT_BOUNDARY
                                  + rf"((?:{'|'.join(UNITS_GENERAL_CURRENCY)})[ \xa0]*{RAW_REGEX_NUMBER})"
                                  , flags=re.IGNORECASE)
# Fiat money, number first with an optional '+' (e.g. "100 USD"): right boundary only.
REGEX_GENERAL_MONEY2 = re.compile(rf"({RAW_REGEX_NUMBER}\+?[ \xa0]*(?:{'|'.join(UNITS_GENERAL_CURRENCY)}))"  # e.g. "100 USD"
                                  + UNIT_RIGHT_BOUNDARY, flags=re.IGNORECASE)
# Measurements below are "number [+] unit" with a right boundary only; the
# left boundary check is commented out in each of them.
REGEX_LENGTH = re.compile(#UNIT_LEFT_BOUNDARY +
                          rf"({RAW_REGEX_NUMBER}\+?[ \xa0]*(?:{'|'.join(UNITS_LENGTH)}))"
                          + UNIT_RIGHT_BOUNDARY, flags=re.IGNORECASE)
REGEX_WEIGHT = re.compile(#UNIT_LEFT_BOUNDARY +
                          rf"({RAW_REGEX_NUMBER}\+?[ \xa0]*(?:{'|'.join(UNITS_WEIGHT)}))"
                          + UNIT_RIGHT_BOUNDARY, flags=re.IGNORECASE)
REGEX_VOLUME = re.compile(#UNIT_LEFT_BOUNDARY +
                          rf"({RAW_REGEX_NUMBER}\+?[ \xa0]*(?:{'|'.join(UNITS_VOLUME)}))"
                          + UNIT_RIGHT_BOUNDARY, flags=re.IGNORECASE)
REGEX_PERCENTAGE = re.compile(#UNIT_LEFT_BOUNDARY +
                              rf"({RAW_REGEX_NUMBER}\+?[ \xa0]*(?:{'|'.join(UNITS_PERCENTAGE)}))"
                              + UNIT_RIGHT_BOUNDARY, flags=re.IGNORECASE)
# File size: same shape as above but without the optional '+'.
REGEX_FILESIZE = re.compile(#UNIT_LEFT_BOUNDARY +
                            rf"({RAW_REGEX_NUMBER}[ \xa0]*(?:{'|'.join(UNITS_FILESIZE)}))"
                            + UNIT_RIGHT_BOUNDARY, flags=re.IGNORECASE)
# Version string: "version" / "ver" / "v" followed by 1-4 dot-separated
# integers (e.g. "v2", "ver 1.2.3"). There is no right boundary check.
REGEX_VERSION = re.compile(UNIT_LEFT_BOUNDARY
                           + r"((?:version|ver|v)[ \xa0]*\d+(?:\.\d+){0,3})"
                           , flags=re.IGNORECASE)
# Lower-cased list of known brand / product names written in CamelCase.
# Entries prefixed with '#' inside the list are currently disabled.
# NOTE(review): presumably these names are meant to be kept as one token
# instead of being split at case changes -- confirm against the caller.
# The list may contain duplicates (e.g. 'EarPods' appears twice).
ATOMIC_BRAND_NAMES_LOWERED = [n.lower() for n in [
    'BitCoin', 'BitCoins', 'PayPal', 'PayPals', 'MoneyGram', 'MoneyGrams', 'MasterCard', 'MasterCards',
    'UnionPay', 'FedEx', 'FullZ', #'AmericanExpress', 'WesternUnion', 'WesternUnions',
    'AliExpress', 'AliPay', #'PrePaid', 'GiftCard', 'GiftCards'

    'AltCoin', 'AltCoins', 'LiteCoin', 'LiteCoins',
    #'MonaCoin', 'NameCoin', 'NavCoin', 'PandaCoin', 'PiggyCoin', 'SpectroCoin', 'StableCoin',

    'AnonChat', 'JavaScript', 'JavaScripts', 'AdBlock', 'AdBlocker', 'AdGuard', # 'DoS', 'DDoS', 'AdblockPlus',
    'AngularJS',  'GitHub', 'GitLab', 'AntiVirus', 'AntiSpam', 'AutoCAD',
    'BluRay', #'CentOS', 'CloudFlare', #'CodeQL', 'CoinBase', 'CoffeeScript', 'TypeScript',
    'DropBox',  # 'DropDown', 'DrugStore', 'ExploitDB',
    'FaceBook', 'FaceTime', 'FastCGI', 'FileZilla',  'FireFox', 'FreeBSD', 'FreeDOS', #'FinTech',
    'FujiFilm', 'GandCrab', 'GeForce',  #'BackEnd', 'FrontEnd', 'FullHD',
    #'KeyBase', 'KeyGen', 'KeyLogger', 'KeyStroke',
    'LinkedIn', #'InstaGram',
    'McAfee', 'MariaDB', 'MongoDB', 'MonetDB', 'MySQL', 'MySpace',
    'NumPy', 'SciPy', 'OneDrive', 'OnePlus', #'NetFlix',
    #'OpenBSD', 'OpenJDK', 'OpenSSH', 'OpenSSL',
    'MobaXterm', 'PageRank', 'PhpMyAdmin',  'PlayStation',  #'PhotoShop',
    'PowerPoint', 'PowerShell', 'ProtonMail', #'PrintScreen',
    'PyPI', 'QicPic', 'RaaS', 'RedHat', 'SecMail',  # 'ReadMe', 'SecureDrop', 'SecureDrops', 'StackOverflow',
    'SharePoint', 'SnapChat', 'SoundCloud', 'SourceForge', 'TikTok',
    'VirtualBox', 'VueJS', 'WannaCry', 'WeChat', 'WebGL', # 'WickrME',
    'WinScp', 'YouTube', 'YouTuber', 'ZeroNet',

    #'iPhone', 'iPhones', 'MacBook',  'iPad', 'iPads', 'iMac', 'iMacs', 'iPod', 'iPods', 'iOS',
    #'iCloud', 'iMessage', 'iTunes',
    'MacBooks', 'AirDrop', 'AirPods', 'AirPod', 'AppleCare', 'EarPods', 'EarPods', 'MacOS',
    'BlackBerry',

    'BlackHats', 'BlackHat',  #'BlockChain', 'BookMark', 'CapsLock', 'CocaCola', 'DataBase', 'DashBoard',
    'WhatsApp', 'WordPress',  #'HardCore', 'HawkEye', 'HashTag', 'HashTags', 'HelpDesk',
    'DefCon', 'DuckDuckGo', 'GoPro', 'HeBe',  # 'LockDown', 'NaCl',
    'MarketPlace', 'McDonald', 'McDonalds', 'NeurIPS', 'NewEgg', #'NewYork', 'PostCard', 'ScreenLock',
    'RegEx', 'SignUp', 'LogIn', 'SsangYong', 'ThinQ', 'ThinkPad', 'WalMart',
    'arXiv',

]]
# CamelCase-shaped token: one capitalised word ([A-Z][a-z]+) followed by either
# one or more further capitalised words ("PayPal") or a run of two or more
# upper-case letters ("MySQL"). Group 1 is the token. The pattern is
# case-sensitive; re.VERBOSE is set although the pattern itself contains no
# whitespace or comments. The commented-out variant below additionally
# accepted a lower-case first part.
REGEX_BRAND_NAMES = re.compile(UNIT_LEFT_BOUNDARY
                               #+ r"""((?:[A-Z][a-z]+|[a-z]{2,})(?:(?:[A-Z][a-z]+)+|[A-Z]{2,}))"""
                               + r"""((?:[A-Z][a-z]+)(?:(?:[A-Z][a-z]+)+|[A-Z]{2,}))"""
                               + UNIT_RIGHT_BOUNDARY, flags=re.VERBOSE)


def extract_urls(raw_text):
    """Collect URL spans from ``raw_text``.

    Four URL shapes are searched: explicit http(s) URLs, 56-character onion
    hosts, 16-character onion hosts and scheme-less hosts with a popular
    domain suffix. Overlapping hits are reduced with
    ``remove_overlapping_spans_from_back`` and the survivors are returned
    sorted.

    Returns:
        A sorted list of ``(offsets, text)`` pairs
        (``List[Tuple[Tuple[int, int], str]]``).
    """
    # The order of this table is significant: the per-pattern results are
    # concatenated in exactly this order before overlap removal.
    # NOTE(review): presumably earlier entries win over later overlapping ones
    # -- confirm in remove_overlapping_spans_from_back.
    search_plan = (
        (RAW_REGEX_NORMAL_URL_INDICATOR, (0, 300), REGEX_NORMAL_URL),
        (RAW_REGEX_ONION_URL_INDICATOR, (67, 0), REGEX_ONION_56_URL),
        (RAW_REGEX_ONION_URL_INDICATOR, (37, 0), REGEX_ONION_16_URL),
        (RAW_REGEX_POPULAR_URL_INDICATOR, (30, 0), REGEX_POPULAR_URL),
    )
    http_hits, onion56_hits, onion16_hits, popular_hits = [
        extract_spans_with_context(raw_text, indicator=indicator,
                                   indicator_context_range=context_range, regex=regex)
        for indicator, context_range, regex in search_plan
    ]

    merged_hits = http_hits + onion56_hits + onion16_hits + popular_hits
    surviving_hits = remove_overlapping_spans_from_back(merged_hits)

    # Drop the context element and return the spans in sorted order.
    return [(offsets, text) for offsets, text, _context in sorted(surviving_hits)]


def extract_emails(raw_text: str) -> List[Tuple[Tuple[int, int], str]]:
    """Return ``(offsets, text)`` pairs for the e-mail addresses in ``raw_text``.

    Every '@' is used as an indicator, with a context range of ``(20, 30)``
    passed to ``extract_spans_with_context``; ``REGEX_EMAIL_ADDR`` is applied
    to find the address.
    """
    hits = extract_spans_with_context(
        raw_text,
        indicator=r'@',
        indicator_context_range=(20, 30),
        regex=REGEX_EMAIL_ADDR,
    )
    # Only offsets and text are returned; the context element is dropped.
    return [(offsets, text) for offsets, text, _context in hits]


def extract_crypto_addresses(raw_text: str):
    """Find Bitcoin, Ethereum and Litecoin addresses in ``raw_text``.

    Every token matched by ``REGEX_CRYPTO_ADDRESS_CANDIDATE`` is offered to the
    address classes in the order BTC, ETH, LTC. A class accepts a token when
    its constructor does not raise ``ValueError``. BTC and LTC are only tried
    when the token starts with one of their known prefixes; ETH is tried for
    every token.

    Returns:
        A list of ``(offsets, text, label)`` triples where ``label`` is
        ``'btc'``, ``'eth'`` or ``'ltc'``. A token accepted by several
        validators is reported once per validator.
    """
    # (label, validating class, required prefixes or None for "always try")
    validators = (
        ('btc', BitcoinAddress, BTC_ADDRESS_PREFIXES),
        ('eth', EthereumAddress, None),
        ('ltc', LitecoinAddress, LTC_ADDRESS_PREFIXES),
    )

    candidates = extract_spans_with_context(raw_text, regex=REGEX_CRYPTO_ADDRESS_CANDIDATE)
    accepted = []

    for offsets, token, _context in candidates:
        for label, address_cls, prefixes in validators:
            if prefixes is not None and not token.startswith(tuple(prefixes)):
                continue
            try:
                # Constructing the address object performs the validation.
                address_cls(token)
            except ValueError:
                continue
            accepted.append((offsets, token, label))

    return accepted


def extract_filenames(raw_text: str) -> List[Tuple[Tuple[int, int], str]]:
    """Return ``(offsets, text)`` pairs for file names found in ``raw_text``.

    A known file extension (``RAW_REGEX_FILENAME_INDICATOR``) acts as the
    indicator, with a context range of ``(30, 5)`` passed to
    ``extract_spans_with_context``; ``REGEX_FILENAME`` is applied to find the
    name.
    """
    hits = extract_spans_with_context(
        raw_text,
        indicator=RAW_REGEX_FILENAME_INDICATOR,
        indicator_context_range=(30, 5),
        regex=REGEX_FILENAME,
    )
    return [(offsets, text) for offsets, text, _context in hits]


def extract_crypto_money(raw_text):
    """Return ``(offsets, text)`` spans of crypto-currency amounts in ``raw_text``.

    Delegates to ``extract_money_with_regex`` with the two crypto money
    patterns (unit-first and number-first).
    """
    return extract_money_with_regex(raw_text,
                                    money1_regex=REGEX_CRYPTO_MONEY1,
                                    money2_regex=REGEX_CRYPTO_MONEY2)


def extract_general_money(raw_text):
    """Return ``(offsets, text)`` spans of fiat-currency amounts in ``raw_text``.

    Delegates to ``extract_money_with_regex`` with the two general money
    patterns (unit-first and number-first).
    """
    return extract_money_with_regex(raw_text,
                                    money1_regex=REGEX_GENERAL_MONEY1,
                                    money2_regex=REGEX_GENERAL_MONEY2)


def extract_money_with_regex(raw_text, money1_regex, money2_regex):
    """Extract money spans (Bitcoin, Ethereum, USD, EUR, ...) using two patterns.

    All matches of ``money1_regex`` are accepted first. Matches of
    ``money2_regex`` are then added one by one, but only when they do not
    overlap any span accepted so far.

    With the patterns defined in this module, ``money1_regex`` is the
    "{unit}{number}" form ("USD 50") and ``money2_regex`` the "{number}{unit}"
    form ("100 USD"), so for "100 USD 50" only "USD 50" is returned.
    NOTE(review): the original comment here described the opposite priority
    (number-first winning, i.e. "100 USD"); confirm which one is intended.

    Returns:
        A list of ``(offsets, text)`` pairs: first-pattern hits followed by
        the accepted second-pattern hits.
    """
    accepted = [(offsets, text)
                for offsets, text, _context in extract_spans_with_context(raw_text, regex=money1_regex)]

    fallback_hits = extract_spans_with_context(raw_text, regex=money2_regex)
    for offsets, text, _context in fallback_hits:
        start_offset, end_offset = offsets
        # The overlap test runs against the growing result list, so it also
        # covers second-pattern hits accepted earlier in this loop.
        if check_overlap_with_spans(start_offset, end_offset, accepted):
            continue
        accepted.append((offsets, text))

    return accepted


def extract_lengths(raw_text):
    """Return ``(offsets, text)`` spans of lengths: a number followed by cm, mm or m."""
    length_spans = extract_spans_by_regex(raw_text, REGEX_LENGTH)
    return length_spans


def extract_volume(raw_text):
    """Return ``(offsets, text)`` spans of volumes: a number followed by l or ml."""
    volume_spans = extract_spans_by_regex(raw_text, REGEX_VOLUME)
    return volume_spans


def extract_weights(raw_text):
    """Return ``(offsets, text)`` spans of weights: a number followed by a unit from ``UNITS_WEIGHT``."""
    weight_spans = extract_spans_by_regex(raw_text, REGEX_WEIGHT)
    return weight_spans


def extract_percentage(raw_text):
    """Return ``(offsets, text)`` spans of percentages: a number followed by '%'."""
    percentage_spans = extract_spans_by_regex(raw_text, REGEX_PERCENTAGE)
    return percentage_spans


def extract_filesize(raw_text):
    """Return ``(offsets, text)`` spans of file sizes: a number followed by kb, mb, gb, tb or bytes."""
    filesize_spans = extract_spans_by_regex(raw_text, REGEX_FILESIZE)
    return filesize_spans


def extract_version(raw_text):
    """Return ``(offsets, text)`` spans of version strings such as "v2" or "version 1.2.3"."""
    version_spans = extract_spans_by_regex(raw_text, REGEX_VERSION)
    return version_spans


def extract_temperature(raw_text):
    """Placeholder for temperature extraction; not used yet and always returns ``None``."""
    return None


def extract_ip_address(raw_text):
    """Return ``(offsets, text)`` spans of IPv4 addresses (optionally with ":port")."""
    ip_spans = extract_spans_by_regex(raw_text, REGEX_IP_ADDRESS)
    return ip_spans


def extract_time(raw_text):
    """Return ``(offsets, text)`` spans of dates (optionally with a time of day) matched by ``REGEX_TIME``."""
    time_spans = extract_spans_by_regex(raw_text, REGEX_TIME)
    return time_spans


def extract_decimal_numbers(raw_text: str):
    """Return ``(offsets, text)`` spans of stand-alone numbers and numeric ranges (``REGEX_NUMBER``)."""
    number_spans = extract_spans_by_regex(raw_text, REGEX_NUMBER)
    return number_spans


def extract_spans_by_regex(raw_text, regex):
    """Apply ``regex`` to ``raw_text`` and return the hits as ``(offsets, text)`` pairs.

    Thin wrapper around ``extract_spans_with_context`` (called without an
    indicator) that drops the context element of every hit.
    """
    stripped_hits = []
    for offsets, text, _context in extract_spans_with_context(raw_text, regex=regex):
        stripped_hits.append((offsets, text))

    return stripped_hits


