import os
import codecs
import random
from collections import defaultdict
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

#Every three-letter language code expected in the data. The list is only used by
#the coverage check further down, which prints any code that is neither assigned
#to a region, nor in the international list, nor in delete_list.
#NOTE(review): the codes look like ISO 639-3 identifiers (plus a few collective
#codes such as "bat" or "roa") -- confirm against the data source.
all_languages = [
"aai", "aak", "aau", "abk", "abt", "abx", "aby", "aca", "acc", "ace", "acf", "ach", "acr", "acu", 
"ada", "ady", "aey", "afr", "agd", "agg", "agm", "agr", "agu", "aia", "aii", "ake", "akh", "alp", 
"alq", "als", "alt", "alz", "ame", "amf", "amh", "ami", "amk", "amm", "amn", "amp", "amr", "amu", 
"ang", "anv", "aoj", "aom", "aon", "ape", "apn", "apr", "apu", "apw", "apy", "apz", "ara", "arb", 
"arc", "arg", "arl", "arn", "arz", "asm", "aso", "ast", "ata", "atb", "atd", "atj", "att", "auc", 
"auy", "ava", "avt", "awb", "aym", "ayp", "azb", "aze", "azg", "azz", "bak", "bam", "bao", "bar", 
"bas", "bat", "bba", "bbb", "bbc", "bbr", "bch", "bci", "bcl", "bdd", "bef", "bel", "bem", "ben", 
"ber", "bgs", "bhl", "bhw", "big", "bih", "bin", "bis", "bjn", "bjr", "bjv", "bkd", "bkq", "blw", 
"blz", "bmh", "bmr", "bmu", "bnp", "boa", "bod", "boj", "bon", "bos", "box", "bpr", "bpy", "bqc", 
"bqp", "bre", "bsn", "bss", "bts", "btx", "bug", "buk", "bul", "bum", "bus", "bxr", "byr", "byx", 
"bzd", "bzh", "bzj", "caa", "cab", "caf", "cak", "cao", "cap", "car", "cat", "cav", "cax", "cbc", 
"cbi", "cbk", "cbm", "cbr", "cbs", "cbt", "cbu", "cbv", "cce", "cco", "cdo", "ceb", "ceg", "ces", 
"cgc", "cha", "chd", "che", "chf", "chk", "chq", "chr", "chu", "chv", "chw", "chz", "cjk", "cjo", 
"cjp", "cjv", "ckb", "cke", "cki", "ckw", "cle", "clu", "cme", "cmn", "cnh", "cni", "cnl", "coe", 
"cof", "cop", "cor", "cos", "cot", "cpa", "cpb", "cpc", "cpu", "crh", "crm", "crn", "crs", "crx", 
"csb", "cso", "cta", "cti", "ctp", "ctu", "cub", "cuc", "cui", "cuk", "cul", "cut", "cux", "cym", 
"daa", "dad", "dah", "dan", "ded", "des", "deu", "dgc", "dgz", "dhv", "dik", "din", "diq", "div", 
"dje", "djk", "dob", "dop", "dsb", "dty", "dua", "dwr", "dww", "dyu", "ebk", "efi", "ell", "emi", 
"eml", "emp", "eng", "enq", "epo", "ese", "esi", "esk", "est", "etr", "eus", "ewe", "ext", "faa", 
"fai", "fao", "fas", "ffm", "fij", "fil", "fin", "fiu", "fon", "for", "fra", "frp", "frr", "fry", 
"fue", "fuh", "fur", "gaa", "gag", "gah", "gam", "gaw", "gbi", "gdn", "gdr", "gfk", "ghs", "gil", 
"gla", "gle", "glg", "glk", "glv", "gmv", "gng", "gnw", "gof", "gom", "gor", "grn", "gub", "guc", 
"gug", "guh", "gui", "guj", "gul", "gum", "gun", "guo", "guw", "gvc", "gvf", "gwi", "gym", "gyr", 
"hak", "hat", "hau", "haw", "hch", "heb", "heg", "her", "hif", "hil", "hin", "hix", "hla", "hmn", 
"hmo", "hns", "hop", "hot", "hrv", "hsb", "hto", "hub", "hui", "hun", "hus", "huu", "huv", "hva", 
"hwc", "hye", "ian", "iba", "ibg", "ibo", "icr", "ido", "ign", "ikk", "ikw", "ile", "ilo", "imo", 
"ina", "inb", "ind", "inh", "ino", "iou", "ipi", "ish", "isl", "iso", "ita", "iws", "ixl", "jac", 
"jae", "jai", "jam", "jav", "jbo", "jic", "jiv", "jpn", "jvn", "kaa", "kab", "kac", "kal", "kam", 
"kan", "kaq", "kat", "kaz", "kbc", "kbd", "kbh", "kbm", "kbp", "kde", "kdl", "kea", "kek", "ken", 
"kew", "kgf", "kgk", "kgp", "khm", "khz", "kik", "kin", "kir", "kjb", "kje", "kjs", "kkc", "klv", 
"kmb", "kmg", "kmh", "kmo", "kmr", "kms", "kmu", "knf", "knj", "knv", "koi", "kom", "kon", "koo", 
"kor", "kos", "kpf", "kpg", "kpj", "kpr", "kpw", "kpx", "kqc", "kqn", "krc", "kri", "ksd", "ksh", 
"ksr", "kss", "ksw", "ktj", "kto", "kua", "kud", "kue", "kup", "kur", "kvn", "kwi", "kwj", "kwn", 
"kwy", "kyc", "kyf", "kyg", "kyq", "kyz", "kze", "lac", "lad", "lam", "lao", "lat", "lav", "lbe", 
"lbk", "lcm", "leu", "lex", "lez", "lfn", "lid", "lif", "lij", "lim", "lin", "lit", "lmo", "loz", 
"lrc", "ltg", "ltz", "lua", "lub", "lue", "lug", "lun", "luo", "lus", "maa", "mah", "mai", "maj", 
"mal", "mam", "map", "maq", "mar", "mau", "mav", "maz", "mbb", "mbc", "mbh", "mbj", "mbl", "mbt", 
"mca", "mcb", "mcd", "mcf", "mco", "mcp", "mcq", "mdf", "med", "mee", "mek", "meq", "meu", "mfe", 
"mgr", "mhl", "mhr", "mib", "mie", "mig", "mih", "mil", "min", "mio", "mir", "mit", "miz", "mjc", 
"mkd", "mkl", "mkn", "mks", "mlg", "mlh", "mlp", "mlt", "mmo", "mmx", "mna", "mon", "mop", "mos", 
"mox", "mpm", "mpp", "mps", "mpt", "mpx", "mqb", "mqj", "mri", "msa", "msm", "msy", "mti", "mto", 
"mux", "muy", "mva", "mvc", "mvj", "mvn", "mwl", "mxb", "mxp", "mxq", "mxt", "mxv", "mya", "myu", 
"myv", "myw", "myy", "mzn", "nab", "naf", "nah", "nak", "nap", "nas", "nav", "nba", "nbl", "nbq", 
"nca", "nch", "ncj", "ncl", "ncu", "ncx", "ndc", "nde", "ndo", "nds", "nep", "new", "ngl", "ngu", 
"nhe", "nhg", "nhi", "nhu", "nhw", "nhy", "nia", "nif", "nii", "nij", "nin", "niu", "nko", "nld", 
"nno", "noa", "nob", "nop", "nor", "not", "nou", "nov", "npl", "nrm", "nsn", "nso", "ntp", "nvm", 
"nwi", "nya", "nyk", "nyn", "nyu", "nzi", "obo", "oci", "ojb", "oke", "okv", "olo", "omw", "ong", 
"ons", "ood", "opm", "ori", "orm", "oss", "ote", "otm", "otn", "otq", "ots", "pab", "pad", "pag", 
"pah", "pam", "pan", "pao", "pap", "pbb", "pbc", "pcd", "pck", "pcm", "pdc", "pdt", "pes", "pfl", 
"pib", "pio", "pir", "pis", "pli", "pls", "plt", "plu", "pma", "pms", "pnb", "pob", "poe", "poh", 
"poi", "pol", "pon", "por", "ppk", "ppo", "prf", "pri", "ptp", "ptu", "pus", "pwg", "qub", "quc", 
"que", "quf", "qug", "quh", "qul", "qup", "qut", "quw", "quy", "quz", "qvc", "qve", "qvh", "qvi", 
"qvm", "qvn", "qvs", "qvw", "qvz", "qwh", "qxh", "qxn", "qxo", "rai", "ram", "rar", "rkb", "rmn", 
"rmy", "rnd", "roa", "roh", "ron", "roo", "rro", "rue", "run", "rus", "rwo", "sab", "sag", "sah", 
"san", "sat", "scn", "sco", "seh", "sey", "sgz", "shi", "shn", "sid", "sim", "sin", "sja", "slk", 
"sll", "slv", "sme", "smk", "smo", "sna", "snc", "snd", "snn", "snp", "sny", "som", "sop", "soq", 
"sot", "soy", "spa", "spl", "spp", "sps", "spy", "sqi", "srd", "sri", "srm", "srn", "srp", "srq", 
"ssd", "ssg", "ssw", "ssx", "stp", "stq", "sua", "sue", "sun", "suz", "swa", "swc", "swe", "swh", 
"swp", "sxb", "sxn", "szl", "tac", "tah", "taj", "tam", "tar", "tat", "tav", "taw", "tbc", "tbg", 
"tbl", "tbo", "tbz", "tca", "tcf", "tcy", "tdt", "tee", "tel", "ter", "tet", "tew", "tfr", "tgk", 
"tgl", "tgp", "tha", "tif", "tim", "tir", "tiv", "tku", "tlf", "tlh", "tll", "tmh", "tna", "tnc", 
"tnk", "tnn", "toc", "tog", "toh", "toi", "toj", "tok", "ton", "too", "top", "tos", "tot", "tpi", 
"tpt", "tpz", "trc", "trq", "tsc", "tsn", "tso", "tsw", "tsz", "ttc", "tte", "ttj", "tuc", "tue", 
"tuf", "tuk", "tum", "tuo", "tur", "tvk", "tvl", "twi", "txu", "tyv", "tzc", "tze", "tzh", "tzj", 
"tzo", "tzs", "tzt", "tzu", "tzz", "ubr", "ubu", "udm", "udu", "uig", "ukr", "umb", "ura", "urb", 
"urd", "urh", "usa", "usp", "uvl", "uzb", "vec", "ven", "vep", "vie", "viv", "vls", "vmw", "vmy", 
"vol", "waj", "wal", "wap", "war", "wer", "wes", "wiu", "wln", "wls", "wmw", "wnc", "wnu", "wol", 
"wos", "wrs", "wsk", "wuu", "wuv", "xal", "xav", "xed", "xho", "xla", "xmf", "xon", "xsi", "xtd", 
"xtm", "yaa", "yad", "yam", "yao", "yap", "yaq", "yby", "ycn", "yid", "yle", "yml", "yon", "yor", 
"yrb", "yre", "yss", "yua", "yuj", "yut", "yuw", "yva", "zaa", "zab", "zac", "zad", "zai", "zam", 
"zao", "zar", "zas", "zat", "zav", "zaw", "zca", "zea", "zho", "zhs", "zht", "zia", "zlm", "zne", 
"zos", "zpc", "zpi", "zpl", "zpm", "zpo", "zpq", "zpt", "zpu", "zpv", "zpz", "zsr", "ztq", "zty", 
"zul", "zyp"
]

manual_additions = [
    ("orm", "africa_sub"),
    ("mlg", "africa_sub"),
    ("kon", "africa_sub"),
    ("tot",	"america_central"),
    ("nah",	"america_central"),
    ("grn",	"america_south"),
    ("que",	"america_south"),
    ("que", "america_central"),
    ("ori",	"asia_south"),
    ("hmn",	"asia_southeast"), 
    ("hmn", "asia_east"),
    ("msa",	"asia_southeast"), 
    ("msa", "oceania"),
    ("srd",	"europe_west"),
    ("nno",	"europe_west"),
    ("ile",	"europe_west"),
    ("est",	"europe_west"), 
    ("est", "europe_east"),
    ("yid",	"europe_west"),
    ("yid", "europe_east"), 
    ("yid", "america_north"), 
    ("yid", "middle_east"),
    ("aze",	"middle_east"), 
    ("aze", "europe_russia"),
    ("ady", "europe_russia"),
    ("alt", "europe_russia"),
    ("atj", "america_north"),
    ("bam", "africa_sub"),
    ("chu", "europe_russia"),
    ("chu", "europe_east"),
    ("dwr", "africa_north"),
    ("etr", "asia_southeast"),
    ("fue", "africa_sub"),
    ("ish", "africa_sub"),
    ("kea", "africa_sub"),
    ("lbe", "europe_russia"),
    ("mxv", "america_central"),
    ("pdt", "america_north"),
    ("rmy", "europe_east"),
    ("toh", "africa_sub"),
    ("toh", "africa_southern"),
    ("tsz", "america_central"),
]

#Language codes that are excluded everywhere: they are skipped when the region
#table is built, when the corpus is written, and in the coverage check.
#Each code is listed exactly once (the earlier version repeated "arc", "pdc",
#"wuu" and "lfn"); the order of first occurrence is kept.
delete_list = [
    "bat", "ber", "bih", "fiu",
    "map", "roa", "twi", "zhs",
    "zht", "din", "tlh", "tmh",
    "ang", "arc", "icr", "pdc",
    "wes", "bod", "wuu", "jai",
    "myv", "tot", "tzc", "tze",
    "tzo", "tzs", "tzz", "lfn",
    "alz", "inh", "nif", "nov",
    "xal", "arg", "ady",
]

#Maps a language code found in the data to the code it is merged into.
#Codes without an entry are used as they are. Entries are grouped by target.
mapping_dict = {
    #-> ara
    "arb": "ara", "arz": "ara", "ayp": "ara",
    #-> aze, esi, fas
    "azb": "aze",
    "esk": "esi",
    "pes": "fas",
    #-> gug
    "gnw": "gug", "grn": "gug", "gui": "gug",
    #-> hin, jav, ind
    "hif": "hin",
    "jvn": "jav",
    "zlm": "ind",
    #-> nah
    "azz": "nah", "nch": "nah", "ncj": "nah", "ncx": "nah", "ngu": "nah",
    "nhe": "nah", "nhi": "nah", "nhw": "nah", "nhy": "nah", "npl": "nah",
    #-> nor
    "nno": "nor", "nob": "nor",
    #-> que
    "qub": "que", "quc": "que", "quf": "que", "qug": "que", "quh": "que",
    "qul": "que", "qup": "que", "quw": "que", "quy": "que", "quz": "que",
    "qvc": "que", "qve": "que", "qvh": "que", "qvi": "que", "qvm": "que",
    "qvn": "que", "qvs": "que", "qvw": "que", "qvz": "que", "qwh": "que",
    "qxh": "que", "qxn": "que", "qxo": "que",
    #-> sqi, swa, zho
    "als": "sqi",
    "swc": "swa", "swh": "swa",
    "cmn": "zho",
    #-> kur, tgl
    "ckb": "kur", "kmr": "kur",
    "fil": "tgl",
    #-> hbs
    "bos": "hbs", "cnr": "hbs", "hrv": "hbs", "srp": "hbs",
    "svm": "hbs", "ckm": "hbs", "kjv": "hbs",
    #-> cpu, ind, kom
    "cpb": "cpu",
    "msa": "ind",
    "koi": "kom",
}

#--------------------------------------------------------------------------------------------
# This function scans the LID data and outputs a corpus in fastText's format
def _split_line(line, split_at = 50):
    """
    Cut one stripped line of text into at most two samples.

    The first sample ends at the first space at or after `split_at`. The second
    runs from there to the first space at or after another `split_at`
    characters, or to the end of the line when there is no such space. Words are
    never split and anything after the second sample is discarded.

    Returns a list of non-empty strings; the list is empty when the line has no
    space at or after `split_at`.
    """
    first_end = line.find(" ", split_at)
    if first_end == -1:
        return []

    second_end = line.find(" ", first_end + split_at)
    if second_end == -1:
        second_end = len(line)

    pieces = (line[:first_end].strip(), line[first_end:second_end].strip())
    return [piece for piece in pieces if piece]


def _write_counts(counts, path):
    """Save a {language: sample count} mapping as a CSV with one "Samples" column."""
    #Built column-wise so that an empty mapping still yields a valid (header-only) file
    count_df = pd.DataFrame({"Samples": list(counts.values())}, index = list(counts.keys()))
    count_df.to_csv(path)


def get_corpus(source, languages, name, international_languages, mapping_dict):
    """
    Scan the LID data under `source` and write a corpus in fastText's format.

    `source` is expected to contain one directory per register (entries with a
    "." in their name are skipped), each holding "<language>.<...>.txt" files.
    The language code is the part of the file name before the first ".", after
    translation through `mapping_dict`. A file is used when its language is in
    `languages` or `international_languages` and is not in the module-level
    delete_list.

    Lines of at least 100 characters with more than two spaces are cut into up
    to two samples (see _split_line) and written to `name` as
    "__label__<language>\\t<text>". The file is then shuffled and rewritten with
    a cap on international languages:

      * languages that are not international, and "jpn"/"zho", are never capped;
      * international languages not in `languages` keep at most `threshold` samples;
      * international languages that are also in `languages` keep twice as many.

    `threshold` is 15000 when `name` contains "test", 100000 when it contains
    "train", and unlimited otherwise. Sample counts before and after capping are
    written to `name + ".before.csv"` and `name + ".after.csv"`.

    Returns None.
    """
    #Set thresholds
    if "test" in name:
        int_threshold = 15000
    elif "train" in name:
        int_threshold = 100000
    else:
        #No hint in the file name: keep every sample instead of failing later
        int_threshold = float("inf")

    #Sets make the per-file and per-line membership tests cheap
    local_codes = set(languages)
    international_codes = set(international_languages)
    deleted_codes = set(delete_list)

    count_dict = defaultdict(int) #Keep a count of how many samples per language, just for info
    print("Getting corpus for ", len(languages), "local and", len(international_languages), "international")

    #Open the corpus for writing
    with codecs.open(name, "w", encoding = "utf-8") as fw:

        #Iterate over registers
        for register in os.listdir(source):
            register_path = os.path.join(source, register)
            if "." in register or not os.path.isdir(register_path):
                continue

            #Iterate over files
            for file in os.listdir(register_path):
                if not file.endswith(".txt"):
                    continue

                language = file.split(".")[0]
                language = mapping_dict.get(language, language)

                #Deleted languages are excluded no matter which list they appear in
                if language in deleted_codes:
                    continue
                if language not in local_codes and language not in international_codes:
                    continue

                with codecs.open(os.path.join(register_path, file), "r", encoding = "utf-8") as fr:
                    for line in fr:
                        line = line.strip()

                        #Don't process lines that are too short or have almost no spaces
                        if len(line) <= 99 or line.count(" ") <= 2:
                            continue

                        for sample in _split_line(line):
                            fw.write("__label__" + language + "\t" + sample + "\n")
                            count_dict[language] += 1

    _write_counts(count_dict, name+".before.csv")

    #Load the whole corpus so that it can be shuffled before capping
    with codecs.open(name, "r", encoding = "utf-8") as f:
        data = list(f)

    random.shuffle(data)

    counts = defaultdict(int)
    with codecs.open(name, "w", encoding = "utf-8") as f:
        for line in data:
            current_language = line.split("\t")[0].replace("__label__", "")

            #Only international languages other than jpn/zho are capped
            if current_language in international_codes and current_language not in ("jpn", "zho"):
                if current_language in local_codes:
                    limit = int_threshold*2
                else:
                    limit = int_threshold
                if counts[current_language] >= limit:
                    continue

            counts[current_language] += 1
            f.write(line)

    #Write final sample counts
    _write_counts(counts, name+".after.csv")

    return
#--------------------------------------------------------------------------------------------
     
#--------------------------------------------------------------------------------------------
def separate_labels(name):
    """
    Split a fastText-format file into its labels and an unlabelled text file.

    Each line of `name` is expected to look like "__label__<language>\\t<text>".
    The text of every line is written to `name + ".nolabels.txt"`, one line per
    input line, so the output stays aligned with the returned labels.

    A line without a tab-separated text field is reported on stdout and its
    label (without the "__label__" prefix) is written as the text instead.
    Everything after the first tab is kept as text, including further tabs.

    Returns the list of labels, with the "__label__" prefix removed, in file order.
    """
    true = []

    #Open test file and the test file to write without labels
    with codecs.open(name, "r", encoding = "utf-8") as fr:
        with codecs.open(name+".nolabels.txt", "w", encoding = "utf-8") as fw:

            for line in fr:

                #Separate into label and text at the first tab
                label_field, _, text_line = line.strip().partition("\t")
                label = label_field.replace("__label__","")

                if not text_line:
                    print("FIRST", "no text field", repr(line))
                    text_line = label

                true.append(label)

                #Write text to unlabelled file
                fw.write(text_line)
                fw.write("\n")

    return true
#--------------------------------------------------------------------------------------------

#--------------------------------------------------------------------------------------------
def _fasttext(arguments, capture = False):
    """
    Run ./fastText/fasttext with the given list of arguments.

    The command is printed before it runs. With `capture` set, standard output
    is returned as text decoded as UTF-8; otherwise it goes to the terminal and
    None is returned. Raises subprocess.CalledProcessError when fastText exits
    with a non-zero status.
    """
    import subprocess #Local import: nothing else in this file needs it

    command = ["./fastText/fasttext"] + list(arguments)
    print(" ".join(command))

    #An argument list (no shell) keeps file names with spaces or shell characters intact
    completed = subprocess.run(command, check = True, stdout = subprocess.PIPE if capture else None)

    if capture:
        return completed.stdout.decode("utf-8")
    return None


def _predict_labels(model_path, text_path):
    """Return fastText's predicted label for each line of `text_path`, without the "__label__" prefix."""
    output = _fasttext(["predict", model_path, text_path], capture = True)
    return [line.strip().replace("__label__","") for line in output.splitlines()]


def _save_confusion_matrix(true, predicted, path):
    """Write the confusion matrix of `true` against `predicted` to `path` as a labelled CSV."""
    #Predicted labels that never occur in `true` still need a row and a column
    labels = sorted(set(true) | set(predicted))
    matrix = confusion_matrix(true, predicted, labels = labels)
    pd.DataFrame(matrix, index = labels, columns = labels).to_csv(path)


def run_experiment(name, languages, n_workers = 10, international_list = None, mapping_dict = None):
    """
    Build the corpora for one experiment, train and quantize a fastText model,
    and evaluate both models on the test corpus.

    Parameters:
        name: prefix for every file the experiment reads or writes.
        languages: local language codes to include.
        n_workers: number of threads passed to fastText.
        international_list: international language codes (defaults to none).
        mapping_dict: code-to-code mapping handed to get_corpus (defaults to none).

    Nothing is done when `name + ".ftz_results.txt"` already exists. That file is
    written last, so it only exists for experiments that ran to completion.
    Training is skipped when `name + ".full.bin"` exists and quantization when
    `name + ".reduced.ftz"` exists.

    Outputs: classification reports in `name + ".full_results.txt"` and
    `name + ".ftz_results.txt"`, confusion matrices in `name + ".full_errors.csv"`
    and `name + ".ftz_errors.csv"`. The corpora, the unlabelled test file and the
    ".full.vec" file are deleted at the end to save space.

    Raises subprocess.CalledProcessError when a fastText command fails.
    Returns None.
    """
    if international_list is None:
        international_list = []
    if mapping_dict is None:
        mapping_dict = {}

    #Only if not finished already
    if os.path.exists(name+".ftz_results.txt"):
        return

    PATH_TO_TRAIN = os.path.join(".", "Data_Train")
    PATH_TO_TEST = os.path.join(".", "Data_Test")

    train_file = name+".train.txt"
    test_file = name+".test.txt"
    unlabelled_file = test_file+".nolabels.txt"

    #Get training and testing corpora
    get_corpus(PATH_TO_TEST, languages, name = test_file, international_languages = international_list, mapping_dict = mapping_dict)
    get_corpus(PATH_TO_TRAIN, languages, name = train_file, international_languages = international_list, mapping_dict = mapping_dict)

    #Only train a new model
    if not os.path.exists(name+".full.bin"):
        _fasttext([
            "supervised",
            "-input", train_file, "-output", name+".full",
            "-neg", "100", "-dim", "100", "-bucket", "4000000", "-loss", "ns",
            "-minCount", "2", "-epoch", "20", "-thread", str(n_workers),
            "-autotune-metric", "f1",
        ])

    #Only quantize a new model
    if not os.path.exists(name+".reduced.ftz"):
        #Quantize model to reduce size
        _fasttext([
            "quantize", "-thread", str(n_workers),
            "-input", train_file, "-output", name+".full",
        ])

        #Rename the reduced file
        os.rename(name+".full.ftz", name+".reduced.ftz")

    #Get labels and test samples
    true = separate_labels(test_file)

    #Test both the full size and the quantized model
    predict_full = _predict_labels(name+".full.bin", unlabelled_file)
    predict_ftz = _predict_labels(name+".reduced.ftz", unlabelled_file)

    #Get classifier reports
    report_full = classification_report(true, predict_full, digits = 2)
    report_ftz = classification_report(true, predict_ftz, digits = 2)

    #Confusion matrices
    _save_confusion_matrix(true, predict_full, name+".full_errors.csv")
    _save_confusion_matrix(true, predict_ftz, name+".ftz_errors.csv")

    #Save classifier reports; the .ftz report doubles as the "finished" marker, so it goes last
    with codecs.open(name+".full_results.txt", "w", encoding = "utf-8") as f:
        f.write(report_full)

    with codecs.open(name+".ftz_results.txt", "w", encoding = "utf-8") as f:
        f.write(report_ftz)

    #Now delete unnecessary files to save space
    for leftover in (name+".full.vec", train_file, test_file, unlabelled_file):
        if os.path.exists(leftover):
            os.remove(leftover)

    return
#--------------------------------------------------------------------------------------------

#Load region list
region_df = pd.read_csv("Region_Languages_Unique.csv", header = None, index_col = 0)
region_df.columns = ["Language"]

#Make dict of region: language pairs, leaving out deleted languages
region_dict = {}
for row in region_df.itertuples():
    region = row[0]
    language = row[1]
    if language not in delete_list:
        region_dict.setdefault(region, []).append(language)

#Add manual additions (these may name a region that is not in the CSV)
for language, region in manual_additions:
    if language not in delete_list:
        region_dict.setdefault(region, []).append(language)

#Load international list
international_df = pd.read_csv("International_Languages.csv", header = None, index_col = 0)
international_df.columns = ["Family", "Speakers", "Code"]
international_list = international_df.loc[:,"Code"].tolist()

#Make list with all languages in all regions
check_list = list(set(international_list).union(*region_dict.values()))

#Check full coverage: print every known language that no region or list covers
covered_languages = set(check_list)
for language in all_languages:
    if language not in covered_languages and language not in delete_list:
        print(language, end=", ")


def _prepare_languages(codes):
    """
    Return the unique language codes to train on for the given raw codes.

    Codes are translated through mapping_dict; a code is dropped when either the
    raw code or its mapped form is in delete_list.
    """
    prepared = set()
    for code in codes:
        if code in delete_list:
            continue
        code = mapping_dict.get(code, code)
        if code not in delete_list:
            prepared.add(code)
    return list(prepared)


#Go through regions one at a time
for region in region_dict:

    #Get list of languages, with language codes changed if necessary
    current_languages = _prepare_languages(region_dict[region])

    print(region, len(current_languages))

    #Run it all
    name = "round4."+region
    run_experiment(name, current_languages, n_workers = 32, international_list = international_list, mapping_dict = mapping_dict)

#Now do all as baseline: the languages of every region together
current_languages = _prepare_languages(
    language for region in region_dict for language in region_dict[region]
)

print("baseline", len(current_languages))

#Run it all
name = "round4.baseline"
run_experiment(name, current_languages, n_workers = 32, international_list = international_list, mapping_dict = mapping_dict)