/**
 Copyright (C) 2014 Alina Maria Ciobanu
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or any
 later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package main;
import handlers.NeedlemanWunschHandler;
import handlers.OrthographicMetricHandler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import utils.Utils;
import utils.enums.Language;
import utils.enums.Metric;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.instance.NonSparseToSparse;

//  software issued under the GNU General Public License

public class FeatureExtractor 
{
	/** separator between the fields of an input line: word1____word2____label */
	private static final String FIELD_SEPARATOR = "____";
	
	/** class label of a pair of cognates */
	private static final String COGNATES = "cognates";
	
	/** class label of a pair of non-cognates */
	private static final String NON_COGNATES = "non_cognates";
	
	/** name of the placeholder attribute and of the placeholder class value, both stored at index 0 */
	private static final String DUMMY = "dummy";
	
	/** relation name of every generated dataset */
	private static final String RELATION_NAME = "COGNATES/NONCOGNATES";
	
	/**
	 * get set of features for alignment of strings a and b, using n-grams of size n
	 * <p>
	 * both aligned strings are wrapped in '$' boundary markers; for every position where the
	 * aligned strings differ (or where one of them has a gap '-'), each window of n consecutive
	 * positions that contains that position and fits inside the aligned strings yields one
	 * feature of the form ngramA_ngramB
	 * <p>
	 * the alignment returned by the handler is split on '_' and its first two parts are used as
	 * the aligned forms of a and b; they are expected to have the same length
	 * @param a first word
	 * @param b second word
	 * @param n size of n-grams; values smaller than 1 produce no features
	 * @return set of features extracted from aligning strings a and b
	 */
	public static Set<String> getFeatures(String a, String b, int n)
	{
		Set<String> features = new HashSet<String>();
		
		NeedlemanWunschHandler nw = new NeedlemanWunschHandler(a, b);

		// split the alignment only once
		String[] aligned = nw.computeAlignment().split("_");
		String alignedA = "$" + aligned[0] + "$";
		String alignedB = "$" + aligned[1] + "$";
		
		if (n < 1)
			return features;
		
		int length = alignedA.length();
		
		for (int i = 0; i < length; i++) 
		{
			char charA = alignedA.charAt(i);
			char charB = alignedB.charAt(i);
			
			boolean misaligned = charA == '-' || charB == '-' || charA != charB;
			
			if (!misaligned)
				continue;
			
			// all windows of size n that contain position i and fit inside the aligned strings
			int firstStart = Math.max(0, i - n + 1);
			int lastStart = Math.min(i, length - n);
			
			for (int start = firstStart; start <= lastStart; start++)
			{
				features.add(alignedA.substring(start, start + n) + "_"
						+ alignedB.substring(start, start + n));
			}
		}
		return features;
	}
	
	/**
	 * get set of features for all files of input pairs
	 * lines that do not contain at least two fields are skipped
	 * @param paths to the files with pairs of words
	 * @param n size of n-grams
	 * @return set of features for all files in the list given as parameter
	 */
	public static Set<String> getAllFeaturesForFiles(List<String> paths, int n)
	{
		Set<String> features = new HashSet<String>();
		
		for (String path : paths)
		{
			for (String pair : Utils.getLines(path))
			{
				String[] fields = splitPair(pair, 2);
				
				if (fields != null)
					features.addAll(getFeatures(fields[0], fields[1], n));
			}
		}
		
		return features;
	}
	
	/**
	 * get set of features from all files and build an arff file for each input path
	 * file format: word1____word2____label
	 * the output file is saved in the same location as the input file, with the trailing .txt
	 * extension (if any) replaced by .n=x.arff
	 * pairs without any feature and lines with fewer than three fields are not written
	 * @param paths to the files with pairs of words
	 * @param n size of n-grams
	 */
	public static void writeAlignmentArffForFiles(List<String> paths, int n)
	{
		Set<String> features = getAllFeaturesForFiles(paths, n);
		
		FastVector attributes = new FastVector();
		
		// add dummy attribute
		attributes.addElement(new Attribute(DUMMY));
		
		// add n-gram features
		for (String feature : features)
			attributes.addElement(new Attribute(feature));
		
		// add class label
		FastVector classes = buildClasses();
		attributes.addElement(new Attribute("class", classes));
		
		Map<String, Integer> map = buildFeaturesMap(features);
		
		for (String path : paths)
		{
			Instances dataset = new Instances(RELATION_NAME, attributes, 0);
			
			for (String pair : Utils.getLines(path))
			{
				String[] fields = splitPair(pair, 3);
				
				if (fields == null)
					continue;
				
				Set<String> currentFeatures = getFeatures(fields[0], fields[1], n);
				
				if (currentFeatures.isEmpty())
					continue;
				
				// + 1 for dummy (first) attribute
				// + 1 for class label (last) attribute
				double[] values = new double[features.size() + 2];
				
				for (String currentFeature : currentFeatures)
					values[map.get(currentFeature)] = 1;
				
				values[values.length - 1] = classValue(classes, fields[2]);
				
				dataset.add(new Instance(1.0, values));
			}
			
			writeSparseArff(dataset, path, ".n=" + n + ".arff");
		}
	}
	
	
	/**
	 * build map of features with pairs (feature, index), where index starts at 1 because of a weka known issue
	 * (index 0 is taken by the dummy attribute of the generated datasets)
	 * @param features set of features to index, in the iteration order of the set
	 * @return map of features
	 */
	public static Map<String, Integer> buildFeaturesMap(Set<String> features)
	{
		Map<String, Integer> map = new HashMap<String, Integer>();
		
		int index = 1;
		
		for (String feature : features)
			map.put(feature, index++);
		
		return map;
	}
	
	/**
	 * print features ordered by index, one "index. feature" line per entry
	 * @param map of features; its values are expected to be non-null
	 */
	public static void printFeaturesMap(Map<String, Integer> map)
	{
		// a HashMap has no defined iteration order, so sort the entries by index explicitly
		List<Entry<String, Integer>> entries = new ArrayList<Entry<String, Integer>>(map.entrySet());
		
		Collections.sort(entries, new Comparator<Entry<String, Integer>>()
		{
			@Override
			public int compare(Entry<String, Integer> first, Entry<String, Integer> second)
			{
				return first.getValue().compareTo(second.getValue());
			}
		});
		
		for (Entry<String, Integer> entry : entries)
		{
			System.out.println(entry.getValue() + ". " + entry.getKey());
		}
	}
	
	/**
	 * get features from all files and build an arff file for each input path
	 * file format: word1____word2____label
	 * the output file is saved in the same location as the input file, with the trailing .txt
	 * extension (if any) replaced by .metric=M.arff
	 * lines with fewer than three fields are not written
	 * @param paths to the files with pairs of words
	 * @param metric orthographic metric to compute
	 */
	public static void writeMetricArffFileForFiles(List<String> paths, Metric metric)
	{
		FastVector attributes = new FastVector();
		
		// add dummy attribute
		attributes.addElement(new Attribute(DUMMY));
		attributes.addElement(new Attribute(metric.name()));
		
		// add class label
		FastVector classes = buildClasses();
		attributes.addElement(new Attribute("class", classes));
		
		OrthographicMetricHandler metricHandler = new OrthographicMetricHandler();
		
		for (String path : paths)
		{
			Instances dataset = new Instances(RELATION_NAME, attributes, 0);
			
			for (String pair : Utils.getLines(path))
			{
				String[] fields = splitPair(pair, 3);
				
				if (fields == null)
					continue;
				
				// dummy (first) attribute, metric value, class label (last) attribute
				double[] values = new double[3];
				
				values[1] = metricHandler.computeMetric(fields[0], fields[1], metric);
				values[2] = classValue(classes, fields[2]);
				
				dataset.add(new Instance(1.0, values));
			}

			writeSparseArff(dataset, path, ".metric=" + metric + ".arff");
		}
	}

	/** compute orthography-based features for input files with pairs of cognates and non-cognates
	 * and write output arff files
	 */
	public static void main(String[] args) throws Exception
	{
		for (Language language : Arrays.asList(Language.FRENCH, Language.ITALIAN, Language.SPANISH, Language.PORTUGUESE))
		{
			// Locale.ROOT keeps the file names independent of the default locale
			// (a Turkish locale would lower-case 'I' to a dotless 'i')
			String languageName = language.toString().toLowerCase(Locale.ROOT);
			
			String trainFilePath = "files/" + languageName + "_train.txt";
			String testFilePath = "files/" + languageName + "_test.txt";
			
			for (int i = 1; i <= 3; i++)
				writeAlignmentArffForFiles(Arrays.asList(trainFilePath, testFilePath), i);
			
			for (Metric metric : Arrays.asList(Metric.EDIT, Metric.LCSR, Metric.XDICE))
				writeMetricArffFileForFiles(Arrays.asList(trainFilePath, testFilePath), metric);
		}
	}
	
	/**
	 * split an input line into its fields
	 * @param pair input line
	 * @param minFields minimum number of fields the line must contain
	 * @return fields of the line, or null when the line has fewer than minFields fields
	 */
	private static String[] splitPair(String pair, int minFields)
	{
		String[] fields = pair.split(FIELD_SEPARATOR);
		
		if (fields.length >= minFields)
			return fields;
		
		// blank lines are skipped silently, other malformed lines are reported in verbose mode
		if (Utils.verbose && pair.trim().length() > 0)
			System.out.println("Skipping malformed line '" + pair + "'");
		
		return null;
	}
	
	/**
	 * build the values of the nominal class attribute: dummy (index 0), cognates, non_cognates
	 * @return vector of class values
	 */
	private static FastVector buildClasses()
	{
		FastVector classes = new FastVector();
		classes.addElement(DUMMY);
		classes.addElement(COGNATES);
		classes.addElement(NON_COGNATES);
		return classes;
	}
	
	/**
	 * get the value of the class attribute for a label
	 * @param classes values of the class attribute
	 * @param label label read from the input file
	 * @return index of the label in classes, or 0 (the dummy value) for any other label
	 */
	private static double classValue(FastVector classes, String label)
	{
		if (label.equals(COGNATES) || label.equals(NON_COGNATES))
			return classes.indexOf(label);
		
		return 0;
	}
	
	/**
	 * convert a dataset to sparse format and write it, UTF-8 encoded, next to the input file
	 * any exception is reported on standard output (with its stack trace in verbose mode)
	 * @param dataset instances to write
	 * @param path input file the dataset was built from
	 * @param suffix extension that replaces the trailing .txt of the input path
	 */
	private static void writeSparseArff(Instances dataset, String path, String suffix)
	{
		try
		{
			NonSparseToSparse nonSparseToSparseInstance = new NonSparseToSparse(); 
			nonSparseToSparseInstance.setInputFormat(dataset); 
			Instances sparseDataset = Filter.useFilter(dataset, nonSparseToSparseInstance);
			
			BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(
					stripTxtExtension(path) + suffix)), Charset.forName("UTF8")));
			try
			{
				out.write(sparseDataset.toString());
			}
			finally
			{
				// close the writer even when writing fails
				out.close();
			}
		}
		catch (Exception e)
		{
			System.out.println("Exception while writing arff file for input file '" + path + "'");
			
			if (Utils.verbose)
				e.printStackTrace();
		}
	}
	
	/**
	 * remove a trailing .txt extension from a path, leaving the rest of the path untouched
	 * @param path path of an input file
	 * @return path without its trailing .txt extension, or the unchanged path if it has none
	 */
	private static String stripTxtExtension(String path)
	{
		String extension = ".txt";
		
		if (path.endsWith(extension))
			return path.substring(0, path.length() - extension.length());
		
		return path;
	}
}