package szte.csd.indicatorsel;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import cc.mallet.types.AugmentableFeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Label;

import szte.nlputils.MapUtils;

import szte.datamining.DataHandler;
import szte.datamining.DataMiningException;
import szte.datamining.mallet.MalletDataHandler;

/**
 * Greedy selection of indicator features driven by an information-theoretic feature evaluator.
 *
 * <p>In every round the instances that already contain a selected feature are set aside,
 * per-label document frequencies are counted on the remaining instances, every sufficiently
 * frequent feature is scored (via {@link #calcFeatureScore(Map, Map)}) and the top-ranked one is
 * added to the selection. The loop ends when no feature qualifies or when the best score falls
 * below {@link #threshold}.
 *
 * <p>Subclasses supply the concrete evaluator by implementing
 * {@link #calcFeatureScore(Map, Map)}.
 */
abstract class InformtaionTheoryIndicatorSelection implements IndicatorSelector {
  /** A candidate must occur in strictly more than this many positive (label {@code true}) instances. */
  private static final double MIN_POSITIVE_COUNT = 3.0;

  /** A candidate must occur in at least this fraction of the remaining positive instances. */
  private static final double MIN_POSITIVE_RATIO = 0.2;

  /**
   * Indicator set returned by {@link #getTopRankedFeatures(DataHandler)} and cleared by
   * {@link #reset()}. Nothing in this class adds elements to it.
   * NOTE(review): presumably subclasses or callers populate it - confirm.
   */
  protected Set<String> indicators = new HashSet<String>();

  /** Minimum score the best feature of a round must reach to be selected; see {@link #setThreshold(double)}. */
  protected double threshold = 0.75;

  /**
   * Computes {@code a * ln(b)}, defining the result as {@code 0.0} when {@code a} is zero so
   * that a zero weight never yields {@code NaN} (e.g. {@code 0 * ln(0)}).
   *
   * @param a the weight of the term
   * @param b the value whose natural logarithm is taken
   * @return {@code 0.0} if {@code a == 0.0}, otherwise {@code a * Math.log(b)}
   */
  protected double safeentropy(double a, double b) {
    if (a == 0.0) {
      return 0.0;
    }
    return a * Math.log(b);
  }

  /**
   * Scores a single feature.
   *
   * @param df per-label count of the instances (among those still considered) that contain the
   *     feature; when called from {@link #getIndicators(DataHandler)} both the {@code true} and
   *     the {@code false} key are present
   * @param n per-label count of all instances still considered
   * @return the score of the feature; {@link #getIndicators(DataHandler)} picks the top-ranked
   *     feature of each round and compares its score against the threshold
   */
  abstract public double calcFeatureScore(Map<Boolean, Double> df, Map<Boolean, Double> n);

  /**
   * Returns the current content of the {@link #indicators} field. The {@code vsm} argument is
   * not used by this implementation.
   *
   * @param vsm ignored
   * @return the {@link #indicators} set itself (not a copy)
   * @throws DataMiningException never thrown by this implementation
   */
  public Set<String> getTopRankedFeatures(DataHandler vsm)
      throws DataMiningException {
    return indicators;
  }

  /** Replaces {@link #indicators} with a fresh empty set. */
  public void reset() {
    indicators = new HashSet<String>();
  }

  // Earlier variant written against the generic DataHandler interface, kept for reference in
  // case a learner other than MALLET is used through NLPCommons.
  // NOTE(review): its score sign, sort order and stopping limits differ from the active
  // MALLET-based implementation below; reconcile them before re-enabling this code.
/*
  public Set<String> getIndicators(DataHandler vsm) throws DataMiningException {
    Set<String> dnf = new HashSet<String>();
    while(true){
      Map<String, Double> scores = new HashMap<String, Double>();
      Map<Boolean, Double> n = new HashMap<Boolean, Double>();
      for(String feature : vsm.getFeatureNames()){
        n = new HashMap<Boolean, Double>();
        Map<Boolean, Double> df = new HashMap<Boolean, Double>();
        instances: for(String id : vsm.getInstanceIds())
        {
          for(String term : dnf)
            if(vsm.getBinaryValue(id, term))
              continue instances;
          MapUtils.addToMap(n, (Boolean)vsm.getLabel(id), 1.0);
          if(vsm.getNumericValue(id, feature) > 0.0)
          {
            MapUtils.addToMap(df, (Boolean)vsm.getLabel(id), 1.0);
          }
        }
        if(!df.containsKey(true) || df.get(true)<=3.0 || df.get(true)/n.get(true)<0.2) 
        {
          continue;
        }
        if(!df.containsKey(false)){
            System.out.println("N: "+feature+" "+df+" "+n);
            scores.put(feature, -df.get(true)/n.get(true));
            continue;
        }
        scores.put(feature, calcFeatureScore(df,n));
      }
      if(scores.size()==0)
        break;
      String bestfeature = MapUtils.sortMapByValue(scores,false).firstKey();
      System.out.println(MapUtils.sortMapByValue(scores,false));
      double limit = threshold;
      if(n.get(true) < 10.0)
        limit = 0.5;
      if(scores.get(bestfeature) > limit)
        break;
      dnf.add(bestfeature);
      System.out.println(bestfeature+" "+scores.get(bestfeature)+" "+n);
    }
    return dnf;
  }
*/

  // This implementation works directly on the MALLET data structures instead of going through
  // the NLPCommons DataHandler interface; the original author reports a roughly 10x speed-up.
  /**
   * Greedily selects features based on {@link #calcFeatureScore(Map, Map)}. It calculates the
   * basic per-label frequencies that the different feature evaluators need.
   *
   * <p>A feature is a candidate in a round only if it occurs in more than
   * {@value #MIN_POSITIVE_COUNT} remaining positive instances and in at least
   * {@value #MIN_POSITIVE_RATIO} of all remaining positive instances. A candidate that never
   * occurs in a negative instance is not passed to the evaluator; it is scored
   * {@code 1.0 + positiveRatio} instead.
   *
   * @param vsm the data set; must be a {@link MalletDataHandler} whose instances carry
   *     {@link AugmentableFeatureVector} data and {@link Label} targets with {@code Boolean}
   *     entries
   * @return the names of the selected features (possibly empty)
   * @throws DataMiningException declared by the interface; not thrown directly here
   * @throws ClassCastException if {@code vsm} or its instances are not of the types listed above
   */
  public Set<String> getIndicators(DataHandler vsm) throws DataMiningException {
    MalletDataHandler d = (MalletDataHandler) vsm;
    Set<Integer> dnf = new HashSet<Integer>();
    while (true) {
      // df: feature index -> (label -> number of remaining instances containing the feature)
      Map<Integer, Map<Boolean, Double>> df = new HashMap<Integer, Map<Boolean, Double>>();
      // n: label -> number of remaining instances
      Map<Boolean, Double> n = new HashMap<Boolean, Double>();
      for (int i = 0; i < d.data.size(); ++i) {
        Instance inst = d.data.get(i);
        AugmentableFeatureVector fv = (AugmentableFeatureVector) inst.getData();
        // Instances already matched by a selected feature do not take part in this round.
        if (containsAnyFeature(fv, dnf)) {
          continue;
        }
        // The label is the same for every feature of the instance, so read it only once.
        Boolean label = (Boolean) ((Label) inst.getTarget()).getEntry();
        MapUtils.addToMap(n, label, 1.0);
        int locations = fv.numLocations();
        for (int j = 0; j < locations; ++j) {
          Integer featureIndex = fv.indexAtLocation(j);
          Map<Boolean, Double> counts = df.get(featureIndex);
          if (counts == null) {
            counts = new HashMap<Boolean, Double>();
            df.put(featureIndex, counts);
          }
          MapUtils.addToMap(counts, label, 1.0);
        }
      }

      Map<Integer, Double> scores = new HashMap<Integer, Double>();
      for (Entry<Integer, Map<Boolean, Double>> term : df.entrySet()) {
        Map<Boolean, Double> counts = term.getValue();
        if (!counts.containsKey(true) || counts.get(true) <= MIN_POSITIVE_COUNT) {
          continue;
        }
        // n contains the true key here because this feature was seen in a positive instance.
        double positiveRatio = counts.get(true) / n.get(true);
        if (positiveRatio < MIN_POSITIVE_RATIO) {
          continue;
        }
        if (!counts.containsKey(false)) {
          // Feature seen only in positive instances: scored without calling the evaluator.
          // NOTE(review): the 1.0 offset presumably ranks such features above evaluator
          // scores - confirm against the score range of the subclasses.
          scores.put(term.getKey(), 1.0 + positiveRatio);
          continue;
        }
        scores.put(term.getKey(), calcFeatureScore(counts, n));
      }
      if (scores.isEmpty()) {
        break;
      }
      Integer bestfeature = MapUtils.sortMapByValue(scores, true).firstKey();
      if (scores.get(bestfeature) < threshold) {
        break;
      }
      dnf.add(bestfeature);
    }

    // Translate the selected feature indices back to feature names.
    Set<String> features = new HashSet<String>();
    for (Integer i : dnf) {
      features.add((String) d.data.getAlphabet().lookupObject(i));
    }
    return features;
  }

  /**
   * Tells whether the feature vector has a location for at least one of the given feature
   * indices.
   *
   * <p>The scan for one feature stops at the first location whose index is larger than the
   * feature looked for, i.e. it relies on the locations being ordered by ascending index. The
   * loop bound is {@code numLocations()}, the same bound the frequency counting uses, and it is
   * evaluated once per vector instead of calling {@code getIndices()} on every iteration.
   *
   * @param fv the feature vector of an instance
   * @param selected the feature indices to look for
   * @return {@code true} if any index of {@code selected} is present in {@code fv}
   */
  private static boolean containsAnyFeature(AugmentableFeatureVector fv, Set<Integer> selected) {
    int locations = fv.numLocations();
    for (Integer term : selected) {
      int wanted = term;
      for (int j = 0; j < locations; ++j) {
        int index = fv.indexAtLocation(j);
        if (index == wanted) {
          return true;
        }
        if (index > wanted) {
          break;
        }
      }
    }
    return false;
  }

  /**
   * Sets the minimum score the best feature of a round must reach to be selected by
   * {@link #getIndicators(DataHandler)}; selection stops as soon as the best score is lower.
   *
   * @param t the new threshold
   */
  public void setThreshold(double t) {
    threshold = t;
  }

}
