/*
 	APS - Affinity Propagation for Segmentation, a linear text segmenter.
 
    Copyright (C) 2011, Anna Kazantseva

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    */

/**
 * 
 */
package segmenter;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * @author anna
 *
 */
public class SimpleFileDataSource implements IDataSource {
	
	private ArrayList<String> chunks = new ArrayList<String>();
	private ArrayList<Integer> refBreaks = new ArrayList<Integer>();
	private Pattern segmPattern;
	private String segmString = "==========";
	private File inputFile;
	
	/**
	 * 
	 */
	public SimpleFileDataSource(File inputF, String segmPatternStr)
	{
		this.segmPattern = Pattern.compile( segmPatternStr );
		this.inputFile = inputF;
		
	}
	
	
	public void Init(int basicUnits) throws Exception
	{
		if (basicUnits == IDataSource.SENT_LEVEL)
			this.InitSentLevel();
		else if (basicUnits == IDataSource.PAR_LEVEL)
			this.InitParLevel();
		else
		{
			Exception e = new Exception ("Exception in SimpleFileDataSource.Init(): Invalid value of basic units " + String.valueOf(basicUnits) );
			throw e;
		}
	}
	
	private void InitSentLevel()
	{
		try
		{
			
			BufferedReader reader = new BufferedReader(new FileReader(this.inputFile));
			String line;
			while ((line = reader.readLine()) != null)
			{
				if (line.isEmpty() )
					continue;
				Matcher m = this.segmPattern.matcher(line);
				if (m.matches())
				{
					//do not include a break before beginning of the text 
					if (this.chunks.size() > 0)
						
					this.refBreaks.add( this.chunks.size() - 1 );
				}
				else
					this.chunks.add(line);
			
			}
			
			//by convention always add a break at the end, if there was not one
			Integer lastRefBreak = this.refBreaks.get(this.refBreaks.size() - 1);
			if (lastRefBreak < this.GetNumChunks() - 1)
				this.refBreaks.add( new Integer (this.GetNumChunks() - 1));
			
			
		}
		catch (Exception e)
		{
			System.out.println("Exception in SimpleFileDataSource.InitSentLevel(): " + e.getMessage());
		}
	}
	
	private void InitParLevel()
	{
		//was the last line a paragraph break?
		boolean parEnd = false;
		
		try
		{
			BufferedReader reader = new BufferedReader(new FileReader(this.inputFile));
			String line;
			StringBuilder curPar = new StringBuilder();
			
			while ((line = reader.readLine()) != null)
			{
				if (line.isEmpty() )
				{
					if ( parEnd == true)
					{
						continue;
					}
					else // we encountered the end of a paragraph
					{
						parEnd = true;
						this.chunks.add(curPar.toString());
						curPar = new StringBuilder();
						continue;
					}
				}
				
				parEnd = false;
				Matcher m = this.segmPattern.matcher(line);
				if (m.matches())
					this.refBreaks.add( this.chunks.size() - 1 );
				else
					curPar.append(" " + line);
			
			}
		}
		catch (Exception e)
		{
			System.out.println("Exception in SimpleFileDataSource.InitParLevel(): " + e.getMessage());
		}
	}

	/**
	 * @see org.kazantseva.segmentor.IDataSource#GetChunk(int)
	 */
	public String GetChunk(int index) {
		// TODO Auto-generated method stub
		return this.chunks.get(index);
	}

	/* (non-Javadoc)
	 * @see segmentor.IDataSource#GetNumChunks()
	 */
	public int GetNumChunks() {
		// TODO Auto-generated method stub
		return this.chunks.size();
	}

	/* (non-Javadoc)
	 * @see segmentor.IDataSource#GetReferenceSegmentBreaks()
	 */
	public Integer[] GetReferenceSegmentBreaks() {
		// TODO Auto-generated method stub
		return this.refBreaks.toArray(new Integer[this.refBreaks.size()]);
	}


	public String GetName() {
		// TODO Auto-generated method stub
		return this.inputFile.getName();
	}
	
	public void PrintChunks()
	{
		for (int i = 0; i < this.chunks.size(); i++ )
		{
			System.out.println("<" + String.valueOf(i) + ">\t" + this.chunks.get(i) );
		}
	}


	
	public void Output(File outputFile, Integer[] breaks) {
		if (breaks == null || breaks.length == 0)
			breaks = new Integer[]{ new Integer (this.GetNumChunks() - 1) };
		StringBuilder str = new StringBuilder();
		
		int breakIndex = 0;
		int breakValue = breaks[breakIndex];
		
		for (int i = 0; i < this.GetNumChunks() - 1; i++)
		{
			if (i <= breakValue)
			{
				str.append(this.GetChunk(i) + "\n");
			}
			else
			{
				str.append(this.segmString + "\n");
				
				if (breakIndex < breaks.length - 1)
				{
					breakIndex ++;
					breakValue = breaks[breakIndex];
					
				}
				else
				{
					breakValue = Integer.MAX_VALUE;
				}	
			}
		}
		str.append(this.segmString + "\n");
		
		TextFileIO.OutputFile(outputFile, str.toString());
		System.out.println("output " + outputFile.getAbsolutePath());
		
		
	}

}
