package segmenter;

/*
	APS - Affinity Propagation for Segmentation, a linear text segmenter.

Copyright (C) 2011, Anna Kazantseva

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/


import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;




import com.aliasi.tokenizer.LowerCaseTokenizerFactory;
import com.aliasi.tokenizer.PorterStemmerTokenizerFactory;
import com.aliasi.tokenizer.RegExFilteredTokenizerFactory;
import com.aliasi.tokenizer.RegExTokenizerFactory;
import com.aliasi.tokenizer.StopTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

import commandln.*;

import similarity.DfDictionary;
import similarity.Document;
import similarity.CosineSimComputer;
import similarity.ISimComputer;
import similarity.TokenDictionary;


public class RunSegmenter {
	
	
	
	// Tokenizer configuration consulted by CreateTokenizerFactory():
	// when true, the tokenizer is wrapped in a LowerCaseTokenizerFactory.
	static boolean lowerCase = true;
	// when true, the tokenizer is wrapped in a PorterStemmerTokenizerFactory.
	static boolean useStemmer = true;
	// when true, the tokenizer is wrapped in a StopTokenizerFactory built from stopWordsFile.
	static boolean removeStopWords = true;
	// Stop-word list read by CreateTokenizerFactory(); the path is relative to the working directory.
	static File stopWordsFile = new File("./STOPWORD.list");
	
	// Passed to every SimpleFileDataSource this class creates.
	// NOTE(review): presumably a regex matching the segment-separator lines in the input files - confirm in SimpleFileDataSource.
	String segmPattern = "[=]+";
	
	/**
	 * Creates a runner; no state is initialised beyond the field defaults.
	 */
	public RunSegmenter()
	{
		
	}
	
	/**
	 * Builds a ready-to-run affinity propagation segmenter for one document.
	 *
	 * The method collects document frequencies (from the whole corpus, or from
	 * the document itself when segment df is requested), weights the document
	 * with tf.idf, optionally smooths the sentence counts, computes cosine
	 * similarities and hands them to
	 * {@link #CreateAffinityPropagationSegmenter}.
	 *
	 * @param corpus data sources used for document frequencies; only read when
	 *               useSegmentDf is false
	 * @param curDataSource the document to segment
	 * @param useSegmentDf if false, global tf.idf over corpus is used
	 * @param numSegments number of "documents" assumed when using segment df
	 * @param tokenFact tokenizer factory handed to the dictionaries
	 * @param useSparseSegmenter whether to use the sparse or the dense segmenter
	 * @param simWindowSize window size for computing similarities
	 * @param useSmoothing whether SmoothSentCounts is applied
	 * @param parzenAlpha alpha parameter for smoothing
	 * @param parzenWindowSize window size for smoothing
	 * @param pref preference value passed to the segmenter
	 * @param damp damping factor passed to the segmenter
	 * @param outDir directory passed to the similarity computer's SetUp
	 * @return the configured segmenter, or null if any step after the df
	 *         dictionary was built threw an exception (the exception is printed)
	 */
	public AbstractAPSegmenterDP CreateConfiguredSegmenter(
			IDataSource[] corpus,
			IDataSource curDataSource,
			boolean useSegmentDf,
			Integer numSegments,
			TokenizerFactory tokenFact, boolean useSparseSegmenter,
			int simWindowSize,
			boolean useSmoothing, Double parzenAlpha,
			Integer parzenWindowSize, double pref, double damp, File outDir)	{
		// Decide where document frequencies come from and how many
		// "documents" the idf denominator should assume.
		final IDataSource[] dfSources;
		final int numDocs;
		final String dfFailureMsg;
		if (useSegmentDf)
		{
			numDocs = numSegments;
			dfSources = new IDataSource[] {curDataSource};
			dfFailureMsg = "Exception in CreateConfiguredSegmenter when processing dfDict with segments:";
		}
		else
		{
			dfSources = corpus;
			numDocs = corpus.length;
			dfFailureMsg = "Exception in CreateConfiguredSegmenter when processing dfDict globally:";
		}

		DfDictionary dfDict = new DfDictionary(dfSources, tokenFact, numDocs);
		try
		{
			dfDict.ProcessCorpus();
		}
		catch (Exception dfError)
		{
			// A failure here is reported but does not stop the pipeline.
			System.out.println(dfFailureMsg);
			dfError.printStackTrace();
		}
		dfDict.ForgetCorpus();

		try
		{
			TokenDictionary vocabulary = new TokenDictionary(curDataSource, tokenFact);
			vocabulary.ProcessText();

			// tf.idf weighting of the sentence vectors
			Document weightedDoc = new Document(vocabulary, numDocs, curDataSource);
			weightedDoc.setDocFreqs(dfDict);
			weightedDoc.Init();
			weightedDoc.ComputeTfIdf();
			if (useSmoothing)
			{
				weightedDoc.SmoothSentCounts(parzenWindowSize, parzenAlpha);
			}
			weightedDoc.ApplyTfIdfWeighting();

			// pairwise similarities inside the window
			CosineSimComputer similarities = new CosineSimComputer();
			similarities.Init(curDataSource);
			similarities.SetUp(vocabulary, simWindowSize, useSparseSegmenter, outDir);
			similarities.SetSentenceVectors(weightedDoc.getSentVectors());
			similarities.ComputeSimilarities();

			// drop references that are no longer needed before segmenting
			weightedDoc = null;
			similarities.ForgetSentVectors();

			return this.CreateAffinityPropagationSegmenter(similarities, pref, damp, useSparseSegmenter);
		}
		catch (Exception failure)
		{
			System.out.println("Exception processing " + curDataSource.GetName());
			failure.printStackTrace();
			return null;
		}
	}
	
	/**
	 * Segments every matching file in a directory, evaluates each hypothesis
	 * against the reference breaks of the same file, writes the segmented
	 * file as {@code <name>.seg} into the output directory and finally writes
	 * the averaged window-diff scores to the result file.
	 *
	 * Averages are taken over the files that were actually evaluated; files
	 * that fail are reported on stdout and skipped. If no file succeeds the
	 * averages are NaN.
	 *
	 * @param inputFilePath directory holding the input (and corpus) files
	 * @param outputDirPath directory receiving the .seg files and the result file
	 * @param resultFileName name of the summary file inside outputDirPath
	 * @param pref preference value passed to the segmenter
	 * @param damp damping factor passed to the segmenter
	 * @param useSegmentDf if false, regular global tf.idf over the corpus files is used
	 * @param numSegments how many segments a document is split into when using segment df
	 * @param tokenFact not read by this method; the tokenizer comes from
	 *                  CreateTokenizerFactory() (NOTE(review): confirm this is intended)
	 * @param useSparseSegmenter whether to use the sparse or the dense segmenter
	 * @param simWindowSize absolute window size for similarities; takes precedence over simWindowRatio
	 * @param simWindowRatio window size as a fraction of the number of chunks; values >= 1 are replaced by 0.5
	 * @param useSmoothing whether sentence counts are smoothed
	 * @param parzenAlpha alpha parameter for smoothing
	 * @param parzenWindowSize window size for smoothing
	 * @param inputExtensions extensions of the files to segment
	 * @param corpusExtensions extensions of the files used for document frequencies
	 * @throws IllegalArgumentException if inputFilePath cannot be listed as a directory
	 */
	public void Run(String inputFilePath, 
			String outputDirPath, 
			String resultFileName, 
			double pref, 
			double damp,
			boolean useSegmentDf, //if false, we use regular global tf.idf 
			Integer numSegments, //how many segments we split a document into if using segment df
			TokenizerFactory tokenFact,
			boolean useSparseSegmenter, //should we use sparse or dense segmenter
			Integer simWindowSize, //window size for computing similarities
			Double simWindowRatio,
			boolean useSmoothing,
			Double parzenAlpha, //alpha parameter for smoothing
			Integer parzenWindowSize,
			String[] inputExtensions,
			String[] corpusExtensions)
	{
		File inDir = new File(inputFilePath);
		File outDir = new File(outputDirPath);
		File resultsFile = new File(outDir, resultFileName);
		
		double netRegular = 0.0;
		double netFP = 0.0;
		double netFN = 0.0;
		
		TokenizerFactory tokenizerFactory = this.CreateTokenizerFactory();
		
		IDataSource[] corpus = new IDataSource[0];
		
		if (useSegmentDf == false)
		{
			File[] dfFiles = inDir.listFiles(new SegFileFilter(corpusExtensions));
			// listFiles returns null when the path is not a readable directory
			if (dfFiles == null)
			{
				throw new IllegalArgumentException("Cannot list corpus files in " + inDir.getAbsolutePath());
			}
			corpus = new IDataSource[dfFiles.length];
			for (int i = 0; i < dfFiles.length; i++)
			{
				File cFile = dfFiles[i];
				try
				{
					corpus[i] = new SimpleFileDataSource(cFile, segmPattern);
					corpus[i].Init(IDataSource.SENT_LEVEL);
				}
				catch (Exception e)
				{
					// keep going with the remaining files, but do not hide the failure
					System.out.println("Exception in Run when loading corpus file " + cFile.getAbsolutePath());
					e.printStackTrace();
				}
			}
		}
		
		File[] files = inDir.listFiles(new SegFileFilter(inputExtensions));
		if (files == null)
		{
			throw new IllegalArgumentException("Cannot list input files in " + inDir.getAbsolutePath());
		}
		ArrayList<EvalResult> results = new ArrayList<EvalResult>();
		
		for (int i = 0; i < files.length; i++)
		{
			File curFile = files[i];
			try
			{
				System.out.println("\n\nprocessing " + curFile.getAbsolutePath());
				
				IDataSource dataSource = new SimpleFileDataSource(curFile, segmPattern);
				dataSource.Init(IDataSource.SENT_LEVEL);
				
				int curWindowSize;
				
				if (simWindowSize != null)
				{
					curWindowSize = simWindowSize;
				}
				else if (simWindowRatio != null)
				{
					if (simWindowRatio >= 1)
					{
						System.out.println("windowRatio >= 1. Setting it to default value 0.5");
						// the corrected ratio is kept for the remaining files as well
						simWindowRatio = 0.5;
					}
					curWindowSize = (int) Math.round(simWindowRatio * dataSource.GetNumChunks());
				}
				else
				{
					throw new Exception("Either simWindowSize or simWindowRatio has to be defined");
				}
				
				if (curWindowSize > 400)
				{
					System.out.println("WARNING: large window for computing similarity " + String.valueOf(curWindowSize) + " Convergence may take a while.");
				}
				
				//check that the size of the window is not more than the length of the file
				if (curWindowSize >= dataSource.GetNumChunks())
				{
					System.out.println("window too large (" + String.valueOf(curWindowSize) + 
							") when numChunks " + String.valueOf(dataSource.GetNumChunks()) );
					curWindowSize = dataSource.GetNumChunks() - 1;
					System.out.println("New win size" + String.valueOf(curWindowSize) );
				}
				
				if (useSegmentDf == true)
				{
					corpus = new IDataSource[] {dataSource};
				}
				
				AbstractAPSegmenterDP segmenter = this.CreateConfiguredSegmenter(corpus, 
						dataSource, 
						useSegmentDf, 
						numSegments, 
						tokenizerFactory, 
						useSparseSegmenter, 
						curWindowSize, 
						useSmoothing, 
						parzenAlpha, 
						parzenWindowSize, 
						pref, 
						damp, 
						outDir);
				
				// CreateConfiguredSegmenter reports its own failure and returns null
				if (segmenter == null)
				{
					System.out.println("Could not create a segmenter for " + curFile.getAbsolutePath() + ", skipping");
					continue;
				}
				
				segmenter.Run();
				
				// NOTE(review): the result of GetAssignments() is not used here; the call is
				// kept in case GetNonConflictingAssignments() relies on it - confirm.
				segmenter.GetAssignments();
				TreeMap<Integer, TreeSet<Integer>> assigns = segmenter.GetNonConflictingAssignments();
				
				Integer[] hypo = this.GetHypoBreaks(assigns);
				
				LinearEvaluator eval = new LinearEvaluator();
				eval.Init(dataSource, hypo);
				
				String res = eval.evaluate();
				System.out.println(res);
				
				double curWinDiff = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.REGULAR);
				double curFP = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.FP);
				double curFN = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.FN);
				
				// record whichever window setting the caller supplied
				double chosenWindow = 0;
				if (simWindowSize != null)
					chosenWindow = simWindowSize;
				else if (simWindowRatio != null)
					chosenWindow = simWindowRatio;
				
				EvalResult curResult = new EvalResult(damp, pref, curWinDiff, chosenWindow,
						( double ) dataSource.GetReferenceSegmentBreaks().length, ( double ) hypo.length);
				curResult.SetFPFN(curFP, curFN);
				results.add(curResult);
				
				netRegular += curWinDiff;
				netFP += curFP;
				netFN += curFN;
				
				//output segmentation results
				dataSource.Output(new File(outDir, curFile.getName() + ".seg"), hypo);
			}
			catch (Exception e)
			{
				System.out.println("Exception in RunAPSeg");
				System.out.println(e.getMessage());
				e.printStackTrace();
			}
		}
		
		// Average over the files that were actually evaluated: the net sums only
		// contain those, so dividing by files.length would understate the scores.
		int counter = results.size();
		
		double stdDev = this.ComputeStdDevWinDiff(results);
		String resultStr = "WinDiffRegular: " + String.valueOf( netRegular / counter) + "\n";
		resultStr += "WinDiffFP: " + String.valueOf( netFP / counter) + "\n";
		resultStr += "WinDiffFN: " + String.valueOf( netFN / counter) + "\n";
		resultStr += "WinDiff std dev: " + String.valueOf( stdDev) + "\n";
		
		TextFileIO.OutputFile(resultsFile, resultStr);
	}
	
	/**
	 * Evaluates existing hypothesis segmentations against reference
	 * segmentations. For every matching file in refDir, the file with the
	 * same name in hypoDir is loaded, its breaks are scored against the
	 * reference, and the averaged window-diff scores are written to
	 * {@code compare_segm_results_corrected.txt} in outputDir.
	 *
	 * Reference files without a hypothesis counterpart, and files that fail
	 * to load or evaluate, are reported on stdout and excluded from the
	 * averages. If no pair is evaluated the averages are NaN.
	 *
	 * @param refDir directory with the reference segmentations
	 * @param hypoDir directory with the hypothesis segmentations
	 * @param outputDir directory receiving the summary file
	 * @param validExtensions extensions of the reference files to consider
	 * @throws IllegalArgumentException if refDir cannot be listed as a directory
	 */
	public void CompareSegmentations(File refDir, File hypoDir, File outputDir, String[] validExtensions)
	{
		double netRegular = 0.0;
		double netFP = 0.0;
		double netFN = 0.0;
		
		File resultsFile = new File(outputDir, "compare_segm_results_corrected.txt");
		
		FileFilter filter = new SegFileFilter(validExtensions);
		File[] refFiles = refDir.listFiles(filter);
		// listFiles returns null when the path is not a readable directory
		if (refFiles == null)
		{
			throw new IllegalArgumentException("Cannot list reference files in " + refDir.getAbsolutePath());
		}
		ArrayList<EvalResult> results = new ArrayList<EvalResult>();
		
		for (int i = 0; i < refFiles.length; i++)
		{
			File refFile = refFiles[i];
			try
			{
				System.out.println("\n\nprocessing ref: " + refFile.getAbsolutePath());
				
				String name = refFile.getName();
				File hypoFile = new File(hypoDir, name);
				if (hypoFile.exists() == false )
				{
					System.out.println("hypo file does not exist: " + hypoFile.getAbsolutePath());
					continue;
				}
				
				IDataSource refDataSource = new SimpleFileDataSource(refFile, segmPattern);
				refDataSource.Init(IDataSource.SENT_LEVEL);
				
				// the hypothesis file is read like a reference file; its "reference"
				// breaks are the hypothesis being scored
				IDataSource hypoDataSource = new SimpleFileDataSource(hypoFile, segmPattern);
				hypoDataSource.Init(IDataSource.SENT_LEVEL);
				Integer[] hypoBreaks = hypoDataSource.GetReferenceSegmentBreaks();
				
				LinearEvaluator eval = new LinearEvaluator();
				eval.Init(refDataSource, hypoBreaks);
				String res = eval.evaluate();
				System.out.println(res);
				
				double curWinDiff = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.REGULAR);
				double curFP = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.FP);
				double curFN = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.FN);
				
				// damp, pref and window size do not apply when only comparing files
				EvalResult curResult = new EvalResult(0.0, 0.0, curWinDiff, 0,
						( double ) refDataSource.GetReferenceSegmentBreaks().length, ( double ) hypoBreaks.length);
				curResult.SetFPFN(curFP, curFN);
				results.add(curResult);
				
				netRegular += curWinDiff;
				netFP += curFP;
				netFN += curFN;
				
				Date endTime = new Date();
				System.out.println("END: " + endTime.toString());
			}
			catch (Exception e)
			{
				System.out.println("Exception in CompareSegmentations");
				System.out.println(e.getMessage());
				e.printStackTrace();
			}
		}
		
		// Only evaluated pairs contribute to the net sums, so only they may be
		// counted; skipped or failed files would otherwise dilute the averages.
		int counter = results.size();
		
		double stdDev = this.ComputeStdDevWinDiff(results);
		String resultStr = "WinDiffRegular: " + String.valueOf( netRegular / counter) + "\n";
		resultStr += "WinDiffFP: " + String.valueOf( netFP / counter) + "\n";
		resultStr += "WinDiffFN: " + String.valueOf( netFN / counter) + "\n";
		resultStr += "WinDiff std dev: " + String.valueOf( stdDev) + "\n";
		
		TextFileIO.OutputFile(resultsFile, resultStr);
	}
	
	/**
	 * Computes the population standard deviation (divisor is the number of
	 * results, not n - 1) of the winDiff values and prints mean, variance
	 * and standard deviation to stdout as one tab-separated line.
	 *
	 * @param results evaluation results whose winDiff field is read
	 * @return the standard deviation; NaN when results is empty, because the
	 *         mean is then 0.0 / 0
	 */
	public double ComputeStdDevWinDiff(ArrayList<EvalResult> results)
	{
		double ave = 0.0;
		for (EvalResult curResult: results)
		{
			ave += curResult.winDiff;
		}
		ave = ave / results.size();
		
		double variance = 0.0;
		for (EvalResult curResult: results)
		{
			double curDif = (ave - curResult.winDiff);
			variance += Math.pow( curDif , 2);
		}
		variance = variance / results.size();
		
		double stdDev = Math.sqrt(variance);
		// the last label previously ended in a literal "t" instead of a tab
		System.out.println("Ave:\t" + String.valueOf(ave) + 
				"\tVar:\t" + String.valueOf(variance) + 
				"\tStd dev:\t" + String.valueOf(stdDev));
		
		return stdDev;
	}
	
	/**
	 * Grid search over preference values, damping factors and absolute
	 * similarity window sizes. Every combination is run on every matching
	 * file in the development directory; the per-combination averages of the
	 * window-diff scores and of the reference / hypothesis break counts are
	 * printed and finally written as a tab-separated table to the result file.
	 *
	 * Averages are taken over the files that were evaluated successfully for
	 * the combination; if none succeeded the averages are NaN.
	 *
	 * @param devDirIn directory holding the development (and corpus) files
	 * @param devDirOut directory receiving the result file; also passed on as outDir
	 * @param resultFileName name of the result table inside devDirOut
	 * @param useSegmentDf if false, regular global tf.idf over the corpus files is used
	 * @param numSegments how many segments a document is split into when using segment df
	 * @param tokenFact not read by this method; the tokenizer comes from
	 *                  CreateTokenizerFactory() (NOTE(review): confirm this is intended)
	 * @param useSparseSegmenter whether to use the sparse or the dense segmenter
	 * @param useSmoothing whether sentence counts are smoothed
	 * @param parzenAlpha alpha parameter for smoothing
	 * @param parzenWindowSize window size for smoothing
	 * @param prefs preference values to try
	 * @param damps damping factors to try
	 * @param winSizes window sizes to try; clipped per file to numChunks - 1
	 * @param validExtensions extensions of the files to segment
	 * @param validCorpusExtensions extensions of the files used for document frequencies
	 * @throws IllegalArgumentException if devDirIn cannot be listed as a directory
	 */
	public void TuneParameters(String devDirIn, String devDirOut,
			String resultFileName, 
			boolean useSegmentDf, //if false, we use regular global tf.idf 
			Integer numSegments, //how many segments we split a document into if using segment df
			TokenizerFactory tokenFact,
			boolean useSparseSegmenter, //should we use sparse or dense segmenter
			boolean useSmoothing,
			Double parzenAlpha, //alpha parameter for smoothing
			Integer parzenWindowSize,
			double[] prefs,
			double[] damps,
			int[] winSizes,
			String[] validExtensions,
			String[] validCorpusExtensions)
	{
		File devDir = new File(devDirIn);
		File devOut = new File(devDirOut);
		File resultsFile = new File(devOut, resultFileName);
		
		SegFileFilter dfFilter = new SegFileFilter(validCorpusExtensions);
		TokenizerFactory tokenizerFactory = this.CreateTokenizerFactory();
		
		IDataSource[] corpus = new IDataSource[0];
		
		if (useSegmentDf == false)
		{
			File[] dfFiles = devDir.listFiles(dfFilter);
			// listFiles returns null when the path is not a readable directory
			if (dfFiles == null)
			{
				throw new IllegalArgumentException("Cannot list corpus files in " + devDir.getAbsolutePath());
			}
			corpus = new IDataSource[dfFiles.length];
			for (int i = 0; i < dfFiles.length; i++)
			{
				File cFile = dfFiles[i];
				try
				{
					corpus[i] = new SimpleFileDataSource(cFile, segmPattern);
					corpus[i].Init(IDataSource.SENT_LEVEL);
				}
				catch (Exception e)
				{
					// keep going with the remaining files, but do not hide the failure
					System.out.println("Exception in RunSegmenter.TuneParameters when loading corpus file " + cFile.getAbsolutePath());
					e.printStackTrace();
				}
			}
			// NOTE(review): this dictionary is not read again in this method
			// (CreateConfiguredSegmenter builds its own); the calls are kept in case
			// ProcessCorpus/ForgetCorpus affect the corpus data sources - confirm.
			DfDictionary dfDict = new DfDictionary(corpus, tokenizerFactory, corpus.length);
			try 
			{
				dfDict.ProcessCorpus();
			} 
			catch (Exception e1) 
			{
				System.out.println("Exception in RUnSegmenter.TuneParameters when Processing dfDict:");
				e1.printStackTrace();
			}
			dfDict.ForgetCorpus();
		}
		
		SegFileFilter filter = new SegFileFilter(validExtensions);
		File[] files = devDir.listFiles(filter);
		if (files == null)
		{
			throw new IllegalArgumentException("Cannot list development files in " + devDir.getAbsolutePath());
		}
		
		ArrayList<EvalResult> results = new ArrayList<EvalResult>();
		
		for (int p = 0; p < prefs.length; p++)
		{
			for (int d = 0; d < damps.length; d++)
			{
				for (int w = 0; w < winSizes.length; w++)
				{
					double netWinDiff = 0;
					double netFP = 0;
					double netFN = 0;
					int fileCounter = 0;
					int netRef = 0;
					int netHypo = 0;
					
					double curPref = prefs[p];
					double curDamp = damps[d];
					int curWinSize = winSizes[w];
					
					for (int i = 0; i < files.length; i++)
					{
						File curFile = files[i];
						// per-file copy: a short file may need a smaller window than configured
						int fileWinSize = curWinSize;
						
						System.out.println("\n\nprocessing " + curFile.getAbsolutePath());
						System.out.println("parameters: p" + String.valueOf(curPref) + " and d " + String.valueOf(curDamp) + 
								" and win " + String.valueOf(fileWinSize));
						try
						{
							IDataSource dataSource = new SimpleFileDataSource(curFile, segmPattern);
							dataSource.Init(IDataSource.SENT_LEVEL);
							
							//check that the size of the window is not more than the length of the file
							if (fileWinSize >= dataSource.GetNumChunks())
							{
								System.out.println("window too large (" + String.valueOf(fileWinSize) + 
										") when numChunks " + String.valueOf(dataSource.GetNumChunks()) );
								fileWinSize = dataSource.GetNumChunks() - 1;
								System.out.println("New win size" + String.valueOf(fileWinSize) );
							}
							
							int refBreakCount = dataSource.GetReferenceSegmentBreaks().length;
							
							if (useSegmentDf == true)
							{
								corpus = new IDataSource[] {dataSource};
							}
							
							AbstractAPSegmenterDP segmenter = this.CreateConfiguredSegmenter(corpus, 
									dataSource, 
									useSegmentDf, 
									numSegments, 
									tokenizerFactory, 
									useSparseSegmenter, 
									fileWinSize, 
									useSmoothing, 
									parzenAlpha, 
									parzenWindowSize, 
									curPref, 
									curDamp, 
									devOut);
							
							// CreateConfiguredSegmenter reports its own failure and returns null
							if (segmenter == null)
							{
								System.out.println("Could not create a segmenter for " + curFile.getAbsolutePath() + ", skipping");
								continue;
							}
							
							segmenter.Run();
							
							// NOTE(review): the result of GetAssignments() is not used here; the call is
							// kept in case GetNonConflictingAssignments() relies on it - confirm.
							segmenter.GetAssignments();
							TreeMap<Integer, TreeSet<Integer>> assigns = segmenter.GetNonConflictingAssignments();
							
							Integer[] hypo = this.GetHypoBreaks(assigns);
							
							LinearEvaluator eval = new LinearEvaluator();
							eval.Init(dataSource, hypo);
							// textual report is not printed while tuning; call kept as in the original flow
							eval.evaluate();
							
							double curWinDiff = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.REGULAR );
							double curFP = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.FP);
							double curFN = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.FN);
							
							// All totals are updated together, only after the file succeeded,
							// so that they match fileCounter.
							netWinDiff += curWinDiff;
							netFP += curFP;
							netFN += curFN;
							netRef += refBreakCount;
							netHypo += hypo.length;
							fileCounter++;
						}
						catch (Exception e)
						{
							System.out.println("Exception in RunAPSeg.TuneParameters");
							System.out.println(e.getMessage());
							e.printStackTrace();
						}
					}//end looping over files
					
					// Floating-point division: no truncation of the break counts and
					// NaN (instead of ArithmeticException) when every file failed.
					double aveRef = (double) netRef / fileCounter;
					double aveHypo = (double) netHypo / fileCounter;
					double aveWinDiff = netWinDiff / fileCounter;
					double aveFP = netFP / fileCounter;
					double aveFN = netFN / fileCounter;
					EvalResult er = new EvalResult(curDamp, curPref, aveWinDiff, curWinSize, aveRef, aveHypo);
					er.SetFPFN(aveFP, aveFN);
					results.add(er);
					System.out.println(String.valueOf(er.dampFactor) + "\t" 
							+ String.valueOf(er.prefValue) + "\t" 
							+ String.valueOf(er.winDiff) + "\t" 
							+ "FP:\t"+ String.valueOf(er.winFP) + "\t"
							+ "FN:\t"+ String.valueOf(er.winFN) + "\t"
							+ String.valueOf(curWinSize) + "\t"
							+ String.valueOf(aveRef) + "\t" + String.valueOf(aveHypo));
				}//end loop over window sizes
			}//end loop over damp fact
		}//end loop over prefs
		
		StringBuilder resText = new StringBuilder();
		// header now names all eight columns written below
		resText.append("damp\tpref\twinSize\twinDiff\twinFP\twinFN\tref\thypo\n");
		for (EvalResult res: results)
		{
			resText.append(String.valueOf(res.dampFactor) + "\t" + 
					String.valueOf(res.prefValue) + "\t" + 
					String.valueOf(res.winSize) + "\t" + 
					String.valueOf(res.winDiff)+ "\t" + 
					String.valueOf(res.winFP)+ "\t" + 
					String.valueOf(res.winFN)+ "\t" + 
					String.valueOf(res.ref) + "\t" +
					String.valueOf(res.hypo) + "\n");
		}
		TextFileIO.OutputFile(resultsFile, resText.toString());
	}
	
	/**
	 * Grid search over preference values, damping factors and similarity
	 * window ratios (window size as a fraction of the number of chunks in
	 * each file). Every combination is run on every matching file in the
	 * development directory; the per-combination averages are printed and
	 * finally written as a tab-separated table to the result file.
	 *
	 * Per file, the configured ratio is adjusted as follows: for files with
	 * more than 400 chunks it is capped at 0.7; for smaller files a ratio
	 * above 0.95 switches to the dense segmenter. These adjustments apply to
	 * that file only; the configured ratio is what is recorded in the result.
	 *
	 * Averages are taken over the files that were evaluated successfully for
	 * the combination; if none succeeded the averages are NaN.
	 *
	 * @param devDirIn directory holding the development (and corpus) files
	 * @param devDirOut directory receiving the result file; also passed on as outDir
	 * @param resultFileName name of the result table inside devDirOut
	 * @param useSegmentDf if false, regular global tf.idf over the corpus files is used
	 * @param numSegments how many segments a document is split into when using segment df
	 * @param tokenFact not read by this method; the tokenizer comes from
	 *                  CreateTokenizerFactory() (NOTE(review): confirm this is intended)
	 * @param useSparseSegmenter whether to use the sparse or the dense segmenter
	 * @param useSmoothing whether sentence counts are smoothed
	 * @param parzenAlpha alpha parameter for smoothing
	 * @param parzenWindowSize window size for smoothing
	 * @param prefs preference values to try
	 * @param damps damping factors to try
	 * @param winRatios window ratios to try
	 * @param validExtensions extensions of the files to segment
	 * @param validCorpusExtensions extensions of the files used for document frequencies
	 * @throws IllegalArgumentException if devDirIn cannot be listed as a directory
	 */
	public void TuneParametersWinRatios(String devDirIn, String devDirOut,
			String resultFileName, 
			boolean useSegmentDf, //if false, we use regular global tf.idf 
			Integer numSegments, //how many segments we split a document into if using segment df
			TokenizerFactory tokenFact,
			boolean useSparseSegmenter, //should we use sparse or dense segmenter
			boolean useSmoothing,
			Double parzenAlpha, //alpha parameter for smoothing
			Integer parzenWindowSize,
			double[] prefs,
			double[] damps,
			double[] winRatios,
			String[] validExtensions,
			String[] validCorpusExtensions)
	{
		File devDir = new File(devDirIn);
		File devOut = new File(devDirOut);
		File resultsFile = new File(devOut, resultFileName);
		
		SegFileFilter dfFilter = new SegFileFilter(validCorpusExtensions);
		TokenizerFactory tokenizerFactory = this.CreateTokenizerFactory();
		
		IDataSource[] corpus = new IDataSource[0];
		
		if (useSegmentDf == false)
		{
			File[] dfFiles = devDir.listFiles(dfFilter);
			// listFiles returns null when the path is not a readable directory
			if (dfFiles == null)
			{
				throw new IllegalArgumentException("Cannot list corpus files in " + devDir.getAbsolutePath());
			}
			corpus = new IDataSource[dfFiles.length];
			for (int i = 0; i < dfFiles.length; i++)
			{
				File cFile = dfFiles[i];
				try
				{
					corpus[i] = new SimpleFileDataSource(cFile, segmPattern);
					corpus[i].Init(IDataSource.SENT_LEVEL);
				}
				catch (Exception e)
				{
					// keep going with the remaining files, but do not hide the failure
					System.out.println("Exception in RunSegmenter.TuneParametersWinRatios when loading corpus file " + cFile.getAbsolutePath());
					e.printStackTrace();
				}
			}
			// NOTE(review): this dictionary is not read again in this method
			// (CreateConfiguredSegmenter builds its own); the calls are kept in case
			// ProcessCorpus/ForgetCorpus affect the corpus data sources - confirm.
			DfDictionary dfDict = new DfDictionary(corpus, tokenizerFactory, corpus.length);
			try 
			{
				dfDict.ProcessCorpus();
			} 
			catch (Exception e1) 
			{
				System.out.println("Exception in RUnSegmenter.TuneParameters when Processing dfDict:");
				e1.printStackTrace();
			}
			dfDict.ForgetCorpus();
		}
		
		SegFileFilter filter = new SegFileFilter(validExtensions);
		File[] files = devDir.listFiles(filter);
		if (files == null)
		{
			throw new IllegalArgumentException("Cannot list development files in " + devDir.getAbsolutePath());
		}
		
		ArrayList<EvalResult> results = new ArrayList<EvalResult>();
		
		for (int p = 0; p < prefs.length; p++)
		{
			for (int d = 0; d < damps.length; d++)
			{
				for (int w = 0; w < winRatios.length; w++)
				{
					double netWinDiff = 0;
					double netFP = 0;
					double netFN = 0;
					int fileCounter = 0;
					int netRef = 0;
					int netHypo = 0;
					
					double curPref = prefs[p];
					double curDamp = damps[d];
					double curRatio = winRatios[w];
					
					for (int i = 0; i < files.length; i++)
					{
						File curFile = files[i];
						
						// Per-file copies: adjustments made for one file must not leak into
						// the next file or into the recorded result, even after an exception.
						boolean useSparse = useSparseSegmenter;
						double fileRatio = curRatio;
						
						try
						{
							IDataSource dataSource = new SimpleFileDataSource(curFile, segmPattern);
							dataSource.Init(IDataSource.SENT_LEVEL);
							
							int numChunks = dataSource.GetNumChunks();
							
							if (numChunks > 400)
							{
								// large file: limit the window
								if (fileRatio > 0.7)
									fileRatio = 0.7;
							}
							else if (fileRatio > 0.95)
							{
								// small file with (almost) full window: use the dense segmenter
								useSparse = false;
							}
							
							// float arithmetic is kept so that rounding matches earlier runs
							int curWinSize = Math.round((float) fileRatio * numChunks);
							
							System.out.println("\n\nprocessing " + curFile.getAbsolutePath());
							System.out.println("parameters: p" + String.valueOf(curPref) + " and d " + String.valueOf(curDamp) + 
									" and win ratio" + String.valueOf(winRatios[w]) + " curWinSize " + curWinSize);
							
							//check that the size of the window is not more than the length of the file
							if (curWinSize >= numChunks)
							{
								System.out.println("window too large (" + String.valueOf(curWinSize) + 
										") when numChunks " + String.valueOf(numChunks) );
								curWinSize = numChunks - 1;
								System.out.println("New win size" + String.valueOf(curWinSize) );
							}
							
							int refBreakCount = dataSource.GetReferenceSegmentBreaks().length;
							
							if (useSegmentDf == true)
							{
								corpus = new IDataSource[] {dataSource};
							}
							
							AbstractAPSegmenterDP segmenter = this.CreateConfiguredSegmenter(corpus, 
									dataSource, 
									useSegmentDf, 
									numSegments, 
									tokenizerFactory, 
									useSparse, 
									curWinSize, 
									useSmoothing, 
									parzenAlpha, 
									parzenWindowSize, 
									curPref, 
									curDamp, 
									devOut);
							
							// CreateConfiguredSegmenter reports its own failure and returns null
							if (segmenter == null)
							{
								System.out.println("Could not create a segmenter for " + curFile.getAbsolutePath() + ", skipping");
								continue;
							}
							
							segmenter.Run();
							
							// NOTE(review): the result of GetAssignments() is not used here; the call is
							// kept in case GetNonConflictingAssignments() relies on it - confirm.
							segmenter.GetAssignments();
							TreeMap<Integer, TreeSet<Integer>> assigns = segmenter.GetNonConflictingAssignments();
							
							Integer[] hypo = this.GetHypoBreaks(assigns);
							
							LinearEvaluator eval = new LinearEvaluator();
							eval.Init(dataSource, hypo);
							// textual report is not printed while tuning; call kept as in the original flow
							eval.evaluate();
							
							double curWinDiff = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.REGULAR );
							double curFP = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.FP);
							double curFN = eval.ComputeWinDiff(LinearEvaluator.WINDIFF_MODE.FN);
							
							// All totals are updated together, only after the file succeeded,
							// so that they match fileCounter.
							netWinDiff += curWinDiff;
							netFP += curFP;
							netFN += curFN;
							netRef += refBreakCount;
							netHypo += hypo.length;
							fileCounter++;
						}
						catch (Exception e)
						{
							System.out.println("Exception in RunSegmenter.TuneParametersWinRatios");
							System.out.println(e.getMessage());
							e.printStackTrace();
						}
					}//end looping over files
					
					// Floating-point division: no truncation of the break counts and
					// NaN (instead of ArithmeticException) when every file failed.
					double aveRef = (double) netRef / fileCounter;
					double aveHypo = (double) netHypo / fileCounter;
					double aveWinDiff = netWinDiff / fileCounter;
					double aveFP = netFP / fileCounter;
					double aveFN = netFN / fileCounter;
					EvalResult er = new EvalResult(curDamp, curPref, aveWinDiff, curRatio, aveRef, aveHypo);
					er.SetFPFN(aveFP, aveFN);
					results.add(er);
					System.out.println(String.valueOf(er.dampFactor) + "\t" 
							+ String.valueOf(er.prefValue) + "\t" 
							+ String.valueOf(er.winDiff) + "\t" 
							+ "FP:\t"+ String.valueOf(er.winFP) + "\t"
							+ "FN:\t"+ String.valueOf(er.winFN) + "\t"
							+ String.valueOf(curRatio) + "\t"
							+ String.valueOf(aveRef) + "\t" + String.valueOf(aveHypo));
					System.out.println("fileCounter: " + String.valueOf(fileCounter));
				}//end loop over window ratios
			}//end loop over damp fact
		}//end loop over prefs
		
		StringBuilder resText = new StringBuilder();
		// header now names all eight columns written below; the winSize column holds the ratio
		resText.append("damp\tpref\twinSize\twinDiff\twinFP\twinFN\tref\thypo\n");
		for (EvalResult res: results)
		{
			resText.append(String.valueOf(res.dampFactor) + "\t" + 
					String.valueOf(res.prefValue) + "\t" + 
					String.valueOf(res.winSize) + "\t" + 
					String.valueOf(res.winDiff)+ "\t" + 
					String.valueOf(res.winFP)+ "\t" + 
					String.valueOf(res.winFN)+ "\t" + 
					String.valueOf(res.ref) + "\t" +
					String.valueOf(res.hypo) + "\n");
		}
		TextFileIO.OutputFile(resultsFile, resText.toString());
	}
	
	/**
	 * Turns exemplar assignments into hypothesised break positions: the
	 * largest member of every assignment set is taken as a break.
	 *
	 * @param assigns map from exemplar to the set of items assigned to it;
	 *                may be null or empty
	 * @return the distinct break positions in ascending order; an empty array
	 *         when there are no assignments
	 */
	Integer[] GetHypoBreaks(TreeMap<Integer, TreeSet<Integer>> assigns)
	{
		boolean nothingAssigned = (assigns == null) || assigns.isEmpty();
		if (nothingAssigned)
		{
			return new Integer[0];
		}

		// a TreeSet keeps the breaks sorted and removes duplicates
		TreeSet<Integer> breakPositions = new TreeSet<Integer>();
		for (TreeSet<Integer> members : assigns.values())
		{
			breakPositions.add(members.last());
		}

		return breakPositions.toArray(new Integer[breakPositions.size()]);
	}
	
	/**
	 * Builds an affinity propagation segmenter, initializes it with the given
	 * similarities and configures damping, iteration limit and preference.
	 *
	 * @param sims      similarity computer handed to the segmenter's {@code Init}
	 * @param pref      preference value passed to {@code SetPreferences}
	 * @param damp      damping factor passed to {@code setDampFactor}
	 * @param useSparse {@code true} to create the sparse implementation,
	 *                  {@code false} to create the dense one
	 * @return the configured segmenter
	 * @throws Exception declared by this method; the visible code does not show which call raises it
	 */
	AbstractAPSegmenterDP CreateAffinityPropagationSegmenter(ISimComputer sims, double pref, double damp, boolean useSparse) throws Exception
	{
		// Fixed upper bound on the number of iterations given to the segmenter.
		final int iterationCap = 1000;

		AbstractAPSegmenterDP apSegmenter;
		if (useSparse)
			apSegmenter = new AffinityPropagationSegmenterSparse();
		else
			apSegmenter = new AffinityPropagationSegmenterDense();

		// Init comes first; the configuration calls keep the original order.
		apSegmenter.Init(sims);
		apSegmenter.setDampFactor(damp);
		apSegmenter.setMaxIterations(iterationCap);
		apSegmenter.SetPreferences(pref);
		return apSegmenter;
	}
	
	/**
	 * Prints every reference segment break of a data source to standard output,
	 * one line per break.
	 *
	 * @param ds data source whose reference breaks are printed
	 */
	public void PrintRefBreaks(IDataSource ds)
	{
		Integer[] refBreaks = ds.GetReferenceSegmentBreaks();
		for (int i = 0; i < refBreaks.length; i++)
		{
			System.out.println("Ref break after " + refBreaks[i].toString());
		}
	}
	
	/**
	 * Assembles the tokenizer factory used by the segmenter. A regular-expression
	 * tokenizer is wrapped, depending on the static flags {@code lowerCase},
	 * {@code useStemmer} and {@code removeStopWords}, in lower-casing, Porter stemming
	 * and stop-word removal, and finally in a regular-expression token filter.
	 *
	 * The stop-word file ({@code stopWordsFile}) is read on every call, whether or
	 * not {@code removeStopWords} is set.
	 *
	 * @return the fully wrapped tokenizer factory
	 */
	TokenizerFactory CreateTokenizerFactory()
	{
		// Collect the non-blank, trimmed entries of the stop-word file.
		HashSet<String> stopWords = new HashSet<String>();
		String[] stopLines = TextFileIO.LinesToArray(TextFileIO.ReadTextFile(stopWordsFile));
		for (String line : stopLines)
		{
			String candidate = line.trim();
			if (!candidate.isEmpty())
				stopWords.add(candidate);
		}

		// Pattern handed to the final filter stage (the original author's note: removes punctuation).
		Pattern punctRegex = Pattern.compile("[\\_]+|[^\\W]+");

		// Base tokenizer: alphabetic runs of 2+ letters, digit runs, or any 2+ non-space characters.
		TokenizerFactory factory = new RegExTokenizerFactory("[a-zA-Z]{2,}|[0-9]+|\\S{2,}");

		if (lowerCase)
			factory = new LowerCaseTokenizerFactory(factory, new Locale("en"));

		if (useStemmer)
			factory = new PorterStemmerTokenizerFactory(factory);

		// NOTE(review): the stop filter wraps the stemmer, so it presumably sees stemmed
		// tokens while the stop list is used as read from the file - confirm this is intended.
		if (removeStopWords)
			factory = new StopTokenizerFactory(factory, stopWords);

		return new RegExFilteredTokenizerFactory(factory, punctRegex);
	}

	/**
	 * Command-line entry point. Parses the parameters, echoes them to standard
	 * output and then either segments the input with one fixed parameter setting
	 * ({@code -run}) or evaluates a grid of settings ({@code -tune}).
	 *
	 * If the parameters cannot be read, the reason and a usage summary are printed
	 * and the method returns without doing any work.
	 *
	 * @param args command-line arguments as described by the usage summary
	 */
	public static void main(String[] args) {

		// Declaration of all accepted parameters: exactly one of the -run / -tune
		// groups, the input/output locations and the optional feature switches.
		CommandLineParser cmdLine = new CommandLineParser(new ParamGroup(new IParam[]
		{
			new OrGroup(new IParam[]
			{
				new IfGroup("-run", new IParam[]
				{
					new DoubleParam("-preference", null),
					new DoubleParam("-damping", null),
					new OrGroup(new IParam[]
					{
						new IntParam("-windowSize", null),
						new DoubleParam("-windowRatio", null),
					}),
				}),
				new IfGroup("-tune", new IParam[]
				{
					new DoubleArrayParam("-tunePrefs", null),
					new DoubleArrayParam("-tuneDamps", null),
					new OrGroup(new IParam[]
					{
						new DoubleArrayParam("-tuneWinRatios", null),
						new IntArrayParam("-tuneWinSizes", null),
					}),
				}),
			}),

			new DirectoryParam("-inputDir", null),
			new DirectoryParam("-outputDir", null),
			new StringParam("-resultFile", ".+", "results.txt"),
			new StringArrayParam("-inputExtensions", ".+", new String[]{""}),
			new StringArrayParam("-corpusExtensions", ".+", new String[]{""}),

			new BoolParam("-sparse", true),

			new IfGroup("-useSegmentDf", new IParam[]
			{
				new IntParam("-numTFIDFsegments", null),
			}),

			new IfGroup("-smoothing", new IParam[]
			{
				new DoubleParam("-smoothingAlpha", null),
				new IntParam("-smoothingWindow", null),
			}),
		}));

		String inputFilePath;
		String outputDirPath;
		String resultFileName;
		String[] corpusExt;
		String[] inputExt;

		boolean useSegmentDf; //if false, we use regular global tf.idf
		Integer numSegments = null; //how many segments we split a document into if using segment df
		boolean useSparseSegmenter; //should we use sparse or dense segmenter
		boolean useSmoothing;
		Double parzenAlpha = null; //alpha parameter for smoothing
		Integer parzenWindowSize = null;

		Double curPref = null; // preference
		Double curDamp = null; // damping factor
		Integer curWinSize = null; // window size
		Double curWinRatio = null; // window ratio

		double[] allPrefs = null;
		double[] allDamps = null;
		double[] allWinRatios = null;
		int[] allWinSizes = null;

		boolean doRun;

		// Human-readable echo of every parameter value that is actually used.
		StringBuilder params = new StringBuilder("Parameters in use:\n");
		try
		{
			cmdLine.Parse(args);

			inputFilePath = cmdLine.getStringValue("-inputDir");
			outputDirPath = cmdLine.getStringValue("-outputDir");
			resultFileName = cmdLine.getStringValue("-resultFile");
			corpusExt = cmdLine.getStringArrayValue("-corpusExtensions");
			inputExt = cmdLine.getStringArrayValue("-inputExtensions");

			params.append("inputFilePath: " + inputFilePath + "\n");
			params.append("outputDirPath: " + outputDirPath + "\n");
			params.append("resultFileName: " + resultFileName + "\n");

			for (String ext: corpusExt)
				params.append("corpusExt: " + ext + "\n");
			for (String ext: inputExt)
				params.append("inputExt: " + ext + "\n");

			useSegmentDf = cmdLine.getBoolValue("-useSegmentDf");
			params.append("useSegmentDf: " + useSegmentDf + "\n");
			if (useSegmentDf)
			{
				numSegments = cmdLine.getIntValue("-numTFIDFsegments");
				params.append("\tnumTFIDFsegments: " + numSegments + "\n");
			}

			useSparseSegmenter = cmdLine.getBoolValue("-sparse");
			params.append("useSparseSegmenter: " + useSparseSegmenter + "\n");

			useSmoothing = cmdLine.getBoolValue("-smoothing");
			params.append("useSmoothing: " + useSmoothing + "\n");

			if (useSmoothing)
			{
				parzenAlpha = cmdLine.getDoubleValue("-smoothingAlpha");
				parzenWindowSize = cmdLine.getIntValue("-smoothingWindow");
				params.append("\tparzenAlpha: " + parzenAlpha + "\n");
				params.append("\tparzenWindowSize: " + parzenWindowSize + "\n");
			}

			// Mode detection relies on the parser throwing for the alternative that was
			// not supplied (presumably - confirm against CommandLineParser). If neither
			// -run nor -tune can be read, the second exception reaches the outer catch.
			try
			{
				cmdLine.getBoolValue("-run");
				doRun = true;
			}
			catch (Exception e)
			{
				cmdLine.getBoolValue("-tune");
				doRun = false;
			}

			if (doRun)
			{
				params.append("run: true \n");
				curPref = cmdLine.getDoubleValue("-preference");
				curDamp = cmdLine.getDoubleValue("-damping");
				params.append("\tcurPref: " + curPref + "\n");
				params.append("\tcurDamp: " + curDamp + "\n");

				// Exactly one of -windowSize / -windowRatio is expected; fall back to the
				// ratio when the size cannot be read.
				try
				{
					curWinSize = cmdLine.getIntValue("-windowSize");
					params.append("\tcurWinSize: " + curWinSize + "\n");
				}
				catch (Exception ex)
				{
					curWinRatio = cmdLine.getDoubleValue("-windowRatio");
					params.append("\tcurWinRatio: " + curWinRatio + "\n");
				}
			}
			else
			{
				params.append("tune parameters: true \n");
				allPrefs = cmdLine.getDoubleArrayValue("-tunePrefs");
				StringBuilder pr = new StringBuilder("\tallPrefs: ");
				for (double p: allPrefs)
				{
					pr.append(p + " ");
				}
				params.append(pr + "\n");

				allDamps = cmdLine.getDoubleArrayValue("-tuneDamps");
				StringBuilder damps = new StringBuilder("\tallDamps: ");
				for (double d: allDamps)
				{
					damps.append(d + " ");
				}
				params.append(damps + "\n");

				// Exactly one of -tuneWinRatios / -tuneWinSizes is expected; fall back to
				// the sizes when the ratios cannot be read.
				try
				{
					allWinRatios = cmdLine.getDoubleArrayValue("-tuneWinRatios");
					StringBuilder ratios = new StringBuilder("\tallWinRatios: ");
					for (double r: allWinRatios)
					{
						ratios.append(r + " ");
					}
					params.append(ratios + "\n");
				}
				catch (Exception ex)
				{
					allWinSizes = cmdLine.getIntArrayValue("-tuneWinSizes");
					StringBuilder wSizes = new StringBuilder("\tallWinSizes: ");
					// Iterate as int so that sizes are echoed as "5", not "5.0".
					for (int w: allWinSizes)
					{
						wSizes.append(w + " ");
					}
					params.append(wSizes + "\n");
				}
			}

			System.out.println("Parameters: \n" + params.toString());

		}
		catch (Exception ex)
		{
			// Tell the user what went wrong before showing the usage summary.
			System.out.println("Could not read the parameters: " + ex);

			System.out.println("Usage: \n"+
					"[ -run " +
						"-preference <double> -damping <double> [-windowSize <int> | -windowRatio <double>] ]\n" +
					"\tOR\n"+
					"[ -tune " +
						"-tunePrefs <comma separated doubles> -tuneDamps <comma separated doubles> [-tuneWinSizes <comma separated ints> | -tuneWinRatios <comma separated doubles>] ]\n" +
					"-inputDir <directory>\n" +
					"-outputDir <directory>\n" +
					"-resultFile <file name>\n" +
					"-sparse <true | false>\n" +
					"(-useSegmentDf)\n" +
						"\t-numTFIDFsegments <int>\n" +
					"(-smoothing)\n" +
						"\t-smoothingWindow <int> -smoothingAlpha <double>\n" +
					"-corpusExtensions <comma separated extensions>\n" +
					"-inputExtensions <comma separated extensions>\n" +
					"(Indented parameters are mandatory if the preceding unindented option is set.)\n" +
					"Please read README and see examples of configuration files in ./config");

			return;
		}

		RunSegmenter apSeg;
		TokenizerFactory tokenFact;

		apSeg = new RunSegmenter();
		tokenFact = apSeg.CreateTokenizerFactory();

		if (doRun)
		{
			// Single run with one fixed parameter setting.
			apSeg.Run(inputFilePath,
					outputDirPath,
					resultFileName,
					curPref,
					curDamp,
					useSegmentDf,
					numSegments,
					tokenFact,
					useSparseSegmenter,
					curWinSize,
					curWinRatio,
					useSmoothing,
					parzenAlpha,
					parzenWindowSize,
					inputExt,
					corpusExt);
		}
		else if (allWinSizes != null)
		{
			// Tuning over explicit window sizes.
			apSeg.TuneParameters(inputFilePath, outputDirPath,
					resultFileName,
					useSegmentDf, //if false, we use regular global tf.idf
					numSegments, //how many segments we split a document into if using segment df
					tokenFact,
					useSparseSegmenter, //should we use sparse or dense segmenter
					useSmoothing,
					parzenAlpha, //alpha parameter for smoothing
					parzenWindowSize,
					allPrefs,
					allDamps,
					allWinSizes,
					inputExt,
					corpusExt);
		}
		else if (allWinRatios != null)
		{
			// Tuning over window ratios.
			apSeg.TuneParametersWinRatios(inputFilePath, outputDirPath,
				resultFileName,
				useSegmentDf, //if false, we use regular global tf.idf
				numSegments, //how many segments we split a document into if using segment df
				tokenFact,
				useSparseSegmenter, //should we use sparse or dense segmenter
				useSmoothing,
				parzenAlpha, //alpha parameter for smoothing
				parzenWindowSize,
				allPrefs,
				allDamps,
				allWinRatios,
				inputExt,
				corpusExt);
		}
	}
	
	/**
	 * File filter that accepts a file when its name ends with one of the
	 * configured extensions. The comparison ignores case on both the file name
	 * and the extension. An empty extension matches every file name.
	 */
	private class SegFileFilter implements FileFilter
	{
		String[] okExtensions = null;

		/**
		 * @param validExtensions name suffixes to accept, e.g. ".txt"; stored as given
		 */
		public SegFileFilter(String[] validExtensions)
		{
			this.okExtensions = validExtensions;
		}

		/**
		 * @param file the candidate file; only its name is inspected
		 * @return true if the name ends, ignoring case, with any configured extension
		 */
		@Override
		public boolean accept(File file)
		{
			String fileName = file.getName();
			for (String goodExtension: this.okExtensions)
			{
				// A null entry cannot match anything; skip it instead of failing.
				if (goodExtension == null)
					continue;

				// Case-insensitive suffix test. regionMatches returns false for a negative
				// offset, i.e. when the extension is longer than the file name, and unlike
				// toLowerCase() it does not depend on the default locale.
				int suffixStart = fileName.length() - goodExtension.length();
				if (fileName.regionMatches(true, suffixStart, goodExtension, 0, goodExtension.length()))
					return true;
			}
			return false;
		}
	}
	
	/**
	 * Plain value holder for the outcome of evaluating one parameter setting.
	 * The false-positive and false-negative figures stay at 1.0 until
	 * {@link #SetFPFN(double, double)} is called.
	 */
	private class EvalResult
	{
		/** Damping factor of the evaluated setting. */
		public double dampFactor;
		/** Preference value of the evaluated setting. */
		public double prefValue;
		/** Window size; the visible tuning code stores a window ratio here instead. */
		public double winSize;
		/** windowDiff score passed to the constructor. */
		public double winDiff;
		/** False-positive figure; 1.0 until set. */
		public double winFP = 1.0;
		/** False-negative figure; 1.0 until set. */
		public double winFN = 1.0;
		/** Reference segment count as passed to the constructor. */
		public double ref;
		/** Hypothetical segment count as passed to the constructor. */
		public double hypo;

		/**
		 * @param df              damping factor
		 * @param pv              preference value
		 * @param wd              windowDiff score
		 * @param windowSize      window size (or ratio) used for the evaluation
		 * @param numRefSegm      number of reference segments
		 * @param numHypoSegments number of hypothetical segments
		 */
		EvalResult(double df, double pv, double wd, double windowSize, double numRefSegm, double numHypoSegments)
		{
			dampFactor = df;
			prefValue = pv;
			winDiff = wd;
			winSize = windowSize;
			ref = numRefSegm;
			hypo = numHypoSegments;
		}

		/**
		 * Records the false-positive and false-negative figures.
		 *
		 * @param FP false-positive value
		 * @param FN false-negative value
		 */
		public void SetFPFN(double FP, double FN)
		{
			winFP = FP;
			winFN = FN;
		}

	}

}



