/*
 	APS - Affinity Propagation for Segmentation, a linear text segmenter.
 
    Copyright (C) 2011, Anna Kazantseva

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    */


package similarity;

import java.util.Set;
import java.util.TreeMap;

import segmenter.IDataSource;


import com.aliasi.tokenizer.TokenizerFactory;

/**
 * Keeps document frequencies (df) of terms in a corpus.
 *
 * The constructor feeds every chunk of every data source into a shared
 * {@code TokenDictionary} and creates a zero df entry for each token id that
 * dictionary reports. {@link #ProcessCorpus()} then raises the df of a token by
 * one for every document (or, when the corpus holds a single document, every
 * segment of that document) whose own token dictionary lists the token.
 */
public class DfDictionary {
	
	//PrintDf() stops right after printing the first token whose id exceeds this value
	private static final int MAX_PRINTED_TOKEN_ID = 400;
	
	//the data sources the frequencies are computed from; null after ForgetCorpus()
	IDataSource[] corpus = null;
	//dictionary built from all chunks of all data sources in the corpus
	TokenDictionary tokenDict = null;
	//token id -> document frequency
	TreeMap<Integer, Double> dfDictionary = null;
	TokenizerFactory tFactory = null;
	//number of documents, or the number of segments requested for a single-document corpus
	int numDocuments = -1;
	
	/**
	 * Builds the shared token dictionary and initializes every df to zero.
	 * No frequencies are counted here; call {@link #ProcessCorpus()} for that.
	 *
	 * @param corpus the data sources; they must have been initialized already
	 * @param tokenFactory tokenizer factory handed to every TokenDictionary created by this class
	 * @param numDocs the number of documents in the corpus or, if there is only one
	 *        document, the number of segments it should be split into
	 */
	public DfDictionary( IDataSource[] corpus, //the data sources must have been initialized
			TokenizerFactory tokenFactory, int numDocs)
	{
		this.corpus = corpus;
		this.dfDictionary = new TreeMap<Integer, Double>();
		this.tFactory = tokenFactory;
		this.numDocuments = numDocs;
		
		//start from an empty text and then add the chunks one by one
		this.tokenDict = new TokenDictionary("", this.tFactory);
		this.tokenDict.ProcessText();
		
		//create a token dictionary containing all tokens in the corpus
		for (IDataSource curData: this.corpus)
		{
			int numChunks = curData.GetNumChunks();
			for (int i = 0; i < numChunks; i++)
			{
				this.tokenDict.AddText(curData.GetChunk(i));
			}
		}
		
		//initialize df map
		for (Integer id: this.tokenDict.GetAllTokenIds() )
		{
			this.dfDictionary.put(id, Double.valueOf(0.0));
		}
	}
	
	/**
	 * Drops the reference to the corpus so that it can be garbage collected.
	 * {@link #ProcessCorpus()} cannot be called afterwards.
	 */
	public void ForgetCorpus()
	{
		this.corpus = null;
	}
	
	/**
	 * Computes the document frequencies. A corpus with exactly one data source is
	 * delegated to {@link #ProcessDocSegments(IDataSource, int)} using the numDocs
	 * value given to the constructor; otherwise each data source counts as one document.
	 *
	 * @throws IllegalStateException if the corpus has been released by ForgetCorpus()
	 * @throws IllegalArgumentException if the corpus has a single document and the
	 *         numDocs value given to the constructor is not positive
	 * @throws Exception declared for backward compatibility with existing callers
	 */
	public void ProcessCorpus() throws Exception
	{
		if (this.corpus == null)
		{
			throw new IllegalStateException(
					"DfDictionary.ProcessCorpus: corpus is null (was ForgetCorpus() called?)");
		}
		
		if (this.corpus.length == 1)
		{
			this.ProcessDocSegments(this.corpus[0], this.numDocuments);
			return;
		}
		
		for (IDataSource curData: this.corpus)
		{
			//collect the tokens of this document
			TokenDictionary curDict = new TokenDictionary(curData, this.tFactory);
			curDict.ProcessText();
			this.countTokens(curDict, "ProcessCorpus");
		}
	}
	
	/**
	 * Use this if we only have one document: the document is cut into consecutive
	 * runs of chunks and each run is treated as a document when counting dfs.
	 *
	 * Every run spans (number of chunks / numSegments) + 1 chunks (the last one may
	 * be shorter), so the document yields at most numSegments runs.
	 *
	 * @param data the document to split
	 * @param numSegments the requested number of segments; must be positive
	 * @throws IllegalArgumentException if numSegments is zero or negative
	 */
	public void ProcessDocSegments(IDataSource data, int numSegments)
	{
		//zero would divide by zero below and a negative value would make the loop run backwards
		if (numSegments <= 0)
		{
			throw new IllegalArgumentException(
					"DfDictionary.ProcessDocSegments: numSegments must be positive, got " + numSegments);
		}
		
		int numChunks = data.GetNumChunks();
		//integer division already rounds down for non-negative operands
		int segmSize = numChunks / numSegments;
		
		int curStart = 0;
		while (curStart < numChunks)
		{
			//curEnd is inclusive and never runs past the last chunk
			int curEnd = Math.min(curStart + segmSize, numChunks - 1);
			
			StringBuilder sb = new StringBuilder();
			for (int i = curStart; i <= curEnd; i++)
			{
				//pad with spaces so that neighbouring chunks do not run together
				sb.append(' ').append(data.GetChunk(i)).append(' ');
			}
			
			TokenDictionary segmDict = new TokenDictionary(sb.toString(), this.tFactory);
			segmDict.ProcessText();
			//the label keeps the original wording of the warning message
			this.countTokens(segmDict, "ProcessSegments");
			
			curStart = curEnd + 1;
		}
	}
	
	//Raises by one the df of every token string listed by unitDict. Tokens that the
	//corpus-wide dictionary does not know are reported on stdout and skipped.
	//callerName is only used in the warning text.
	private void countTokens(TokenDictionary unitDict, String callerName)
	{
		Set<String> tokens = unitDict.GetAllTokenStrings();
		for (String token : tokens)
		{
			Integer id = this.tokenDict.GetTokenId(token);
			if (id == null)
			{
				System.out.println("Warning in DfDictionary." + callerName + ". Token in doc not found in this.tokenDict:\t" + token);
				continue;
			}
			//an id without an entry (e.g. removed through getDfDictionary()) starts from zero
			Double oldCount = this.dfDictionary.get(id);
			double base = (oldCount == null) ? 0.0 : oldCount.doubleValue();
			this.dfDictionary.put(id, Double.valueOf(base + 1));
		}
	}
	
	/**
	 * Debugging aid: prints "id, token, df" lines to stdout in the order in which
	 * the token dictionary returns its ids, stopping right after the first id above 400.
	 */
	public void PrintDf()
	{
		Set<Integer> tokenIds = this.tokenDict.GetAllTokenIds();
		for (Integer curId: tokenIds)
		{
			String curToken = this.tokenDict.GetTokenString(curId);
			System.out.println(curId.toString() + "\t" + curToken + ":\t" + String.valueOf(this.dfDictionary.get( curId)) );
			if (curId > MAX_PRINTED_TOKEN_ID)
				break;
		}
	}
	
	/**
	 * @return the live map from token id to document frequency (not a copy)
	 */
	public TreeMap<Integer, Double> getDfDictionary() {
		return dfDictionary;
	}
	
	/**
	 * @param token a token string
	 * @return whatever the corpus-wide token dictionary returns for this token; the
	 *         callers in this class treat null as "token unknown"
	 */
	public Integer GetTokenId(String token)
	{
		return this.tokenDict.GetTokenId(token);
	}
	
	/**
	 * @param id a token id
	 * @return whatever the corpus-wide token dictionary returns as the string for this id
	 */
	public String GetTokenString(Integer id)
	{
		return this.tokenDict.GetTokenString(id);
	}
	
}
