import os
import re
from collections import Counter

import langdetect
from langdetect import detect

# Directory listed by the main script; every entry in it is read as a raw text file.
# The trailing slash matters: paths are built by plain string concatenation.
RAW_DIR = "txt_raw/"
# Directory the cleaned files are written to (same concatenation convention).
PREPROCESS_DIR = "txt_preprocessed/"


# Regular expressions marking the start of trailing boilerplate (credits,
# adverts, licence notes, ...). remove_end() truncates the document at the
# first match of each pattern, in list order.
#
# The patterns are applied to the raw text before any whitespace
# normalisation, so leading/inner runs of spaces are significant.
#
# Every entry is a raw string so that regex escapes such as \s, \d and \[
# reach the regex engine untouched instead of relying on Python passing
# unknown string escapes through.
USELESS_ENDING = [
    r"Story Attribution:",
    r"Please help spread the word by introducing your friends to our products.",
    r"I hope you enjoyed my story",
    r"This book was made possible by",
    r"Share this book to help",
    r"Thank You\s*My dear reader",
    r"                                About the Author",
    r"This edition of this free ebook",
    r"Want to find more books like this?",
    r"This book is shared online by Free Kids Books at https://www.freekidsbooks.org",
    r"The Mustard Seed Books project uses an open-source",
    r"This is a PDF E-book, provided by",
    r"Advertisement",
    r"Flesch-Kincaid Grade Level \d.\d",
    r"MORE FROM THE CREATOR",
    r"Simply great free books -",
    r"We appreciate your feedback therefore the first ",
    r"dear reader,",
    r"                   Learn More about the Mathematics in the Story",
    r"THINGS TO REMEMBER",
    r"              About Free Kids Books",
    r"    Scan the code below to listen to the song,",
    r"Written by\s*Judie McEwen",
    r"Bookbot books are free, high quality decodable readers based on the order",
    r" Copyright 1933\s*By G.    L.   Freeman\s*all   rights reserved",
    r"Check for more illustrated stories at www.BubuTales.com",
    r"www.icharacter.org\s*info@icharacter.org",
    r"### Thank you for reading my book.",
    r"                         Flesch-Kincaid Grade Level -\d.\d",
    r"Copyright © 2017 by the Nebraska Department of Education.",
    r"                    About this Book",
    r"© Copyright by Mayan Books 2010",
    # One pattern split over two literals (implicit concatenation is intended here).
    r"THE END\s*COPYRIGHT © Danielle Bruckert 2015\s*FREE KIDS BOOKS\s*Licensed in Creative Commons: BY-NC-SA\s*MANY "
    r"MORE BOOKS LIKE THIS ONE\s*ARE AVAILABLE AT",
    r"info@icharacter.org\s*By Agnes and Salem de Bezenac",
    r"Please help spread the word by\s*introducing your friends to our products.",
    r"THE\s*END\s*Attributions and Photo Index",
    r"This free ebook\s*was\s*brought to you\s*by Free Kids Books!",
    r"Just simply great free children's ebooks",
    r"This book is the intellectual copywrite of the author. While it",
    r"Boodlebobs  \[EP-01\] - RAJ THE RAMBLING RADIATOR\s*8",
    r"ABOUT PUSS IN BOOTS",
    r"This book came from Free Kids Books...\s*Looking for more books like this one?",
    r"This book, which is available for free download at",
    r"More children’s books\s*To see preview pages",
    r"THE\s*END\s*The Tale of Jemima Puddle-Duck",
    r"This version of this free ebook is published by ",
    r"             About the creators of this ebook:",
    r"THE\s*END\s*Beatrix Potter available",
    r"End\s*About these books",
]

# Regular expressions marking leading boilerplate (copyright pages, ISBNs,
# dedications, ...). For each pattern, remove_start() repeatedly discards
# everything up to and including the match until the pattern no longer occurs.
#
# The patterns are applied to the raw text before any whitespace
# normalisation, so leading/inner runs of spaces are significant.
#
# Every entry is a raw string so regex escapes (\s, \d, \n, ...) reach the
# regex engine untouched. Entries that are meant as literal text containing
# parentheses have those parentheses escaped; unescaped they form a regex
# group, the match stops before the closing ")" and that ")" is left behind
# at the start of the kept text.
USELESS_STARTING = [
    r"you\s*use\s*the\s*material.",
    r"ISBN( |:)*[0-9-]*",
    r"Book Ref[^\n]*",
    r"More books like this one are available at\s*http://www.freekidsbooks.org",
    r"We are not associated with any product or vendor in this book.",
    r"without express permission of the\s*author.",
    r"written permission of the author.",
    r"Monkey Pens Free Book Project:",
    r"for commercial and non-commercial purposes.",
    r"Free Printable Worksheets: 2003 to 2009",
    r"Stories and illustrations are copyrighted.",
    r"All rights reserved, copyright ©\d+",
    r"            About this Book",
    r"Thank you for respecting the hard work of this author.",
    r" write to the publisher at the email address of michelledennise@yahoo.com.",
    r"copyright and the above publishers.",
    r"Copyright \d+ by Kanika G",
    r"Copyright \d+ By Kanika G",
    r"This ebook published with permission on Free Kids Books Website",
    r"hashtag #BreakTheSilenceNS.",
    r"                Copyright © 2001",
    r"\(CC BY-NC-ND International\)",
    r"Text and illustrations by Ivan Parvov",
    r"Copyright © 2014 by Gabe Fankhauser\s*All Rights Reserved",
    r"Discover other titles by Carmen Saptouw:",
    r"TABLE OF CONTENTS",
    r"Reading Level: Flesch-Kincaid Grade Level -\d.\d",
    r"    may be a correct response.",
    r"\n\n\nMayan Books 2010",
    r"First Edition, 2019.",
    r"       This edition of this free ebook was",
    r"Dedicated to:\s+Lumberjacks,\s+Forest Animals,\s+And Kids Everywhere.",
    # One pattern split over two literals (implicit concatenation is intended here).
    r"An introduction to dog ownership for children.\s*COPYRIGHT © Danielle Bruckert 2015\s*FREE KIDS BOOKS\s*Licensed "
    r"in Creative Commons: BY-NC-SA\s*MANY MORE BOOKS LIKE THIS ONE\s*ARE AVAILABLE AT",
    r"Eric Stone 11/19/2010",
    r"Click Here For More Books By Carmen Saptouw",
    r"This story is dedicated to Domino, who has provided us with years of\s*companionship and love.",
    r"The author must be notified of any use.",
    r"E-mail and Facebook:\s*n.miranda_books@outlook.com",
    r"\(Pre release access, shout outs & more\)",
    r"Sleeping Beauty – Common Core® Foundation ",
    r"Sleeping Beauty – Common Core® Foundation                                         Page 4",
    r"This is a free book from megamousebooks.com. ",
    r"Cars-by-Numbers Series\s*COUNTING CARS\s*IS FUN!",
    r"First Published in 1910",
    r"5 - short stories for teenage to adults",
]

# Regular expressions whose matches are replaced by a newline wherever they
# occur (see remove_unwanted). Order matters: specific phrases that contain a
# URL must come before the generic http/www patterns, which would otherwise
# consume part of them first.
REMOVE_ANYWHERE = [
    # remove_unwanted() runs after clean_file() has collapsed runs of spaces,
    # so the gap between the two parts is matched with \s+ rather than a
    # fixed number of spaces. (The comma after this entry was missing, which
    # silently glued it to the next pattern.)
    r"www.breakthesilenceNS.ca\s+#BreakTheSilenceNS",
    r"This book is brought to you by https://www.freekidsbooks.org Page \d+",
    r"http[^\s]*",
    r"www.[^\s]*",
    "Copyright ©2018 Marsha Landau, illustrations by S.D. Monahan",
    "All right reseved.",
    "Copyright 2009 by Latrija",
    "Copyright 2012 Emma Laybourn",
    "sites.google.com/site/kanikagebooks/home",
    "Free ebook from ",
    "tonyonthemoon.co.uk",
    "megamousebooks.com",
    "This is a free book from ",
]

# File names (as returned by os.listdir(RAW_DIR)) that the main script skips
# without reading. Matching is by exact name.
# NOTE(review): the reason each file is excluded is not recorded here —
# several names suggest colouring/drawing books or duplicates; confirm before editing.
FILES_TO_AVOID = [
    "CALI_Coloring_Book-Images-of-the-law-FKB.txt",
    "Robert-FKB-Stories.txt",
    "Robert.txt",
    "african-animals-alphabet-colouring-FKB.txt",
    "y-single-letter-ebook-FKB.txt",
    "who_will_save_the_planet-peter_mclennan-FKB.txt",
    "free-drawing-book-for-kids-cartooning-with-letters-numbers-words.txt",
    "Turtle-Trouble.txt",
    "The-Mouse-That-Was.txt",
    "LATEST-mirror-mirror_FKB.txt",
    "Kung-Fu-Grasshoppers-short-kids-ebook-FKB.txt",
    "FKB-Stories-free-drawing-book-for-kids-cartooning-with-letters-numbers-words.txt",
    "Young_Adult-Coloring_Book-Healing_from_burnout-FKB.txt",
    "Vegepedia-Colouring-R-FKB-Stories.txt",
    "Vegepedia-Colouring-R-FKB-Kids-Stories.txt",
    "Uncovering-Earths-Secrets-FKB-Stories.txt",
    "Uncovering-Earths-Secrets-FKB-Kids-Stories.txt",
    "The_Red_Panda-Wordscientists-FKB-CC-NC_compressed.txt",
    "Sitcky-Brains-ebook-dflip.txt",
    "Sitcky-Brains-ebook-.txt",
    "Sarbassts-Tales-3.txt",
    "Sarbassts-Tales-3-FKB-Stories.txt",
    "Red_Pandas_Like_to_Nap-Wordscientists-FKB.txt",
    "The-Kaptain-Uke-Silly-ABC-Book.txt",
    "The-Kaptain-Uke-Book-of-Shapes.txt",
    "Printable_Coloring_Pages_Alphabet_Older_Children-Peaksel-FKB-copy.txt",
    "FKB-Stories-The-Kaptain-Uke-Silly-ABC-Book.txt",
    "FKB-Stories-The-Kaptain-Uke-Book-of-Shapes.txt",
    "ColoringBookTrentoPublicLibrary-Classic_Colouring-FKB.txt",
    "CKLA_G4_U6_Geology_Reader.txt"
]

# Page numbers at which a document's numbering is assumed to possibly start;
# remove_page_numbers() tries each of them in turn.
POTENTIAL_START_NUMBER = [1,2]
# NOTE(review): not referenced by any code visible in this file
# (remove_page_numbers() passes " " directly) — confirm whether it is still needed.
SPLIT_TOKENS = [" "]


def remove_header(content):
    """Drop every line whose exact text occurs five or more times in content.

    Lines are compared verbatim (including whitespace), and blank lines are
    counted like any other line. The surviving lines are re-joined with
    "\\n", so a trailing newline in the input is not preserved.
    """
    lines = content.splitlines()
    occurrences = Counter(lines)
    kept = []
    for line in lines:
        if occurrences[line] >= 5:
            # Repeated this often: treated as a running header/footer.
            continue
        kept.append(line)
    return "\n".join(kept)

def remove_page_numbers(content):
    """Strip leading page numbers from lines, trying every known numbering scheme.

    For each candidate first page in POTENTIAL_START_NUMBER, two passes of
    remove_page_numbers_sub() are run over the text: first with
    with_slash=False, then with with_slash=True. Each pass receives the
    output of the previous one.
    """
    attempts = [
        (first_page, slash_only)
        for first_page in POTENTIAL_START_NUMBER
        for slash_only in (False, True)
    ]
    for first_page, slash_only in attempts:
        content = remove_page_numbers_sub(
            content,
            current_number=first_page,
            split_token=" ",
            with_slash=slash_only,
        )
    return content


def remove_page_numbers_sub(content, current_number=1, split_token=" ", with_slash=False):
    """Remove a consecutive run of page numbers found at the start of lines.

    Each line is stripped and split on split_token. The first token is
    treated as a page-number candidate when it is either of the form
    "N/M" (N is used) or, only if with_slash is False, any token that
    parses as an integer.

    Args:
        content: Text to process.
        current_number: Page number expected on the first numbered line;
            the expectation is incremented after every removed number.
        split_token: Separator used to split each line into tokens and to
            re-assemble it afterwards.
        with_slash: When True, only "N/M" tokens are considered; lines
            starting with anything else are kept as they are (stripped).

    Returns:
        The text with the numbering tokens removed and every line stripped,
        or the original content unchanged as soon as a leading number does
        not match the expected sequence (the numbers are then assumed not
        to be page numbers).
    """
    cleaned = []
    for raw_line in content.splitlines():
        tokens = raw_line.strip().split(split_token)
        head = tokens[0]

        if re.fullmatch(r"\d+/\d+", head):
            candidate = head.split("/")[0]
        elif not with_slash:
            candidate = head
        else:
            # Slash-only mode and no "N/M" prefix: nothing to inspect.
            cleaned.append(split_token.join(tokens))
            continue

        try:
            page_number = int(candidate)
        except ValueError:
            # First token is ordinary text; keep the line.
            cleaned.append(split_token.join(tokens))
            continue

        if page_number != current_number:
            # Out-of-sequence number: give up and leave the text untouched.
            return content

        current_number += 1
        # Re-join with the same token the line was split on, so the line's
        # remaining separators are preserved.
        cleaned.append(split_token.join(tokens[1:]))
    return "\n".join(cleaned)


def merge_lines(content):
    """Join consecutive non-blank lines into one line, keeping blank lines as breaks.

    Every line is stripped; non-blank lines are joined with a single space,
    while each blank line contributes a line break. The spaces left around
    those breaks by the join are trimmed from the resulting lines.
    """
    # A blank (or whitespace-only) line becomes a newline marker.
    pieces = [line.strip() or "\n" for line in content.splitlines()]
    joined = " ".join(pieces)
    return "\n".join(segment.strip(" ") for segment in joined.splitlines())


def remove_end(content):
    """Cut off trailing boilerplate.

    Each pattern in USELESS_ENDING is searched in turn; when it is found,
    the text is truncated at the start of its first match. Later patterns
    are applied to the already truncated text.
    """
    for pattern in USELESS_ENDING:
        found = re.search(pattern, content)
        if found is None:
            continue
        content = content[:found.start()]
    return content


def remove_start(content):
    """Cut off leading boilerplate.

    For each pattern in USELESS_STARTING, everything up to and including
    a match is discarded, and the search is repeated on the remainder until
    the pattern no longer occurs. Later patterns are applied to what is left.
    """
    for pattern in USELESS_STARTING:
        while True:
            found = re.search(pattern, content)
            if found is None:
                break
            content = content[found.end():]
    return content


def remove_unwanted(content):
    """Replace every match of each REMOVE_ANYWHERE pattern with a newline.

    Patterns are applied one after another, in list order, each on the
    output of the previous substitution.
    """
    cleaned = content
    for pattern in REMOVE_ANYWHERE:
        cleaned = re.compile(pattern).sub("\n", cleaned)
    return cleaned


def clean_file(content):
    """Run the full cleaning pipeline on one raw document and return the result.

    Steps, in order:
      1. turn form feeds into newlines;
      2. truncate trailing boilerplate (remove_end) and drop leading
         boilerplate (remove_start) — both work on the raw layout;
      3. merge wrapped lines (merge_lines) and strip page numbers
         (remove_page_numbers);
      4. collapse runs of spaces to one space and runs of three or more
         newlines to a single newline;
      5. blank out unwanted phrases/URLs (remove_unwanted);
      6. drop lines repeated five or more times (remove_header).
    """
    # NOTE(review): the search string here appeared as an empty literal,
    # presumably a raw form-feed character that did not survive as visible
    # text. An empty search string would make str.replace insert a newline
    # between every character, so the explicit "\f" escape is used instead —
    # confirm form feed is the intended character.
    content = content.replace("\f", "\n")
    content = remove_end(content)
    content = remove_start(content)
    content = merge_lines(content)
    content = remove_page_numbers(content)
    content = re.sub(r"  +", " ", content)
    content = re.sub(r"\n\n\n+", "\n", content)
    content = remove_unwanted(content)
    content = remove_header(content)
    return content


if __name__ == "__main__":
    # Batch driver: clean every file in RAW_DIR and write the survivors to
    # PREPROCESS_DIR. A file is skipped when it is blacklisted, when its
    # cleaned text is shorter than 200 or longer than 10000 characters, when
    # language detection fails or reports something other than English, or
    # when its raw content duplicates a file already written.
    debug_index = 65  # directory-listing index whose cleaned text is printed for inspection
    seen_raw_texts = set()
    for index, raw_name in enumerate(os.listdir(RAW_DIR)):
        if raw_name in FILES_TO_AVOID:
            continue

        with open(RAW_DIR + raw_name) as source:
            raw_text = source.read()

        cleaned = clean_file(raw_text)
        cleaned_length = len(cleaned)
        if not 200 <= cleaned_length <= 10000:
            print(raw_name, "DROPPED")
            continue

        try:
            language = detect(cleaned)
        except langdetect.lang_detect_exception.LangDetectException:
            # Detection failed on this text: skip the file silently.
            continue
        if language != "en":
            print(raw_name, "DROPPED FOR NOT ENGLISH")
            continue

        if index == debug_index:
            print("\n\n\n\n")
            print(raw_name)
            print(index)
            print(cleaned_length)
            print(cleaned)

        # De-duplicate on the raw content, not on the cleaned text.
        if raw_text in seen_raw_texts:
            continue
        seen_raw_texts.add(raw_text)

        # Strip the site suffixes from the output name (longer marker first).
        out_name = raw_name
        for marker in ("FKB-Kids-Stories", "FKB-Stories"):
            out_name = re.sub(marker, "", out_name)

        with open(PREPROCESS_DIR + out_name, "w") as sink:
            sink.write(cleaned)
