import codecs
import os
import random

# Directory holding one raw sample file per dialect, and the directory the
# ".dev.txt" / ".train.txt" / ".test.txt" split files are written to.
in_directory = os.path.join(".", "!Dialect_CC_Samples")
out_directory = os.path.join(".")


def _split_sizes(n_lines):
    """Return ``(dev_size, train_size, test_size)`` for a file of *n_lines* lines.

    The development set takes up to 2000 lines. What is left is divided
    80% / 20% between training and testing, capped at 25000 and 5000 lines
    respectively. The development size is clamped to the number of available
    lines so that short files yield a smaller split instead of failing.
    """
    dev_size = min(2000, n_lines)
    remaining = n_lines - dev_size
    train_size = min(int(0.8 * remaining), 25000)
    test_size = min(int(0.2 * remaining), 5000)
    return dev_size, train_size, test_size


def _write_split(path, lines, count):
    """Pop *count* lines off the end of *lines* and write them to *path*.

    *lines* is modified in place. The output file is created even when
    *count* is 0, and is closed even if writing fails.
    """
    # codecs.open writes "\n" untranslated on every platform.
    with codecs.open(path, "w", encoding="utf-8") as f_out:
        for _ in range(count):
            f_out.write(lines.pop() + "\n")


for filename in os.listdir(in_directory):

    in_path = os.path.join(in_directory, filename)

    # os.listdir also returns sub-directories; only regular files are samples.
    if not os.path.isfile(in_path):
        continue

    print("\tStarting " + str(filename))

    # Read as bytes and decode per line so malformed UTF-8 is replaced
    # rather than aborting the whole file.
    with open(in_path, "rb") as fo:
        line_list = [
            raw.decode("utf-8", errors="replace").strip() for raw in fo
        ]

    random.shuffle(line_list)

    # Get number of training / testing / development samples
    dev_size, train_size, test_size = _split_sizes(len(line_list))

    print(len(line_list), dev_size, train_size, test_size)

    out_stem = filename.replace(".txt", "")

    print(len(line_list))

    # Each split consumes lines from the end of the shuffled list, so the
    # three sets never overlap.
    _write_split(
        os.path.join(out_directory, out_stem + ".dev.txt"), line_list, dev_size
    )
    _write_split(
        os.path.join(out_directory, out_stem + ".train.txt"), line_list, train_size
    )
    _write_split(
        os.path.join(out_directory, out_stem + ".test.txt"), line_list, test_size
    )

    # Number of lines left unused after the three splits.
    print(len(line_list))