* Returns a list of featured thresholded by minPrecision and sorted by their frequency of
   * occurrence. precision in this case, is defined as the frequency of majority label over total
   * frequency for that feature.
   * @return list of high precision features.
  private List<F> getHighPrecisionFeatures(
      GeneralDataset<L, F> dataset, double minPrecision, int maxNumFeatures) {
    int[][] feature2label = new int[dataset.numFeatures()][dataset.numClasses()];
    for (int f = 0; f < dataset.numFeatures(); f++) Arrays.fill(feature2label[f], 0);

    int[][] data = dataset.data;
    int[] labels = dataset.labels;
    for (int d = 0; d < data.length; d++) {
      int label = labels[d];
      // System.out.println("datum id:"+d+" label id: "+label);
      if (data[d] != null) {
        // System.out.println(" number of features:"+data[d].length);
        for (int n = 0; n < data[d].length; n++) {
    Counter<F> feature2freq = new ClassicCounter<F>();
    for (int f = 0; f < dataset.numFeatures(); f++) {
      int maxF = ArrayMath.max(feature2label[f]);
      int total = ArrayMath.sum(feature2label[f]);
      double precision = ((double) maxF) / total;
      F feature = dataset.featureIndex.get(f);
      if (precision >= minPrecision) {
        feature2freq.incrementCount(feature, total);
    if (feature2freq.size() > maxNumFeatures) {
      Counters.retainTop(feature2freq, maxNumFeatures);
    // for(F feature : feature2freq.keySet())
    // System.out.println(feature+" "+feature2freq.getCount(feature));
    // System.exit(0);
    return Counters.toSortedList(feature2freq);
 public LinearClassifier<L, F> trainClassifier(GeneralDataset<L, F> dataset, double[] initial) {
   if (dataset instanceof RVFDataset) ((RVFDataset<L, F>) dataset).ensureRealValues();
   double[][] weights = trainWeights(dataset, initial, false);
   LinearClassifier<L, F> classifier =
       new LinearClassifier<L, F>(weights, dataset.featureIndex(), dataset.labelIndex());
   return classifier;
 /** IMPORTANT: dataset and biasedDataset must have same featureIndex, labelIndex */
 public Classifier<L, F> trainClassifierSemiSup(
     GeneralDataset<L, F> data,
     GeneralDataset<L, F> biasedData,
     double[][] confusionMatrix,
     double[] initial) {
   double[][] weights = trainWeightsSemiSup(data, biasedData, confusionMatrix, initial);
   LinearClassifier<L, F> classifier =
       new LinearClassifier<L, F>(weights, data.featureIndex(), data.labelIndex());
   return classifier;
  * Train a classifier with a sigma tuned on a validation set. In this case we are fitting on the
  * last 30% of the training data.
  * @param train The data to train (and validate) on.
  * @return The constructed classifier
 public LinearClassifier<L, F> trainClassifierV(
     GeneralDataset<L, F> train, double min, double max, boolean accuracy) {
   labelIndex = train.labelIndex();
   featureIndex = train.featureIndex();
   tuneSigmaHeldOut = true;
   this.min = min;
   this.max = max;
   double[][] weights = trainWeights(train);
   return new LinearClassifier<L, F>(weights, train.featureIndex(), train.labelIndex());
  public Classifier<L, F> trainClassifier(
      GeneralDataset<L, F> dataset, float[] dataWeights, LogPrior prior) {
    Minimizer<DiffFunction> minimizer = getMinimizer();
    if (dataset instanceof RVFDataset) ((RVFDataset<L, F>) dataset).ensureRealValues();
    LogConditionalObjectiveFunction<L, F> objective =
        new LogConditionalObjectiveFunction<L, F>(dataset, dataWeights, logPrior);

    double[] initial = objective.initial();
    double[] weights = minimizer.minimize(objective, TOL, initial);

    LinearClassifier<L, F> classifier =
        new LinearClassifier<L, F>(
            objective.to2D(weights), dataset.featureIndex(), dataset.labelIndex());
    return classifier;
  * Trains the linear classifier using Generalized Expectation criteria as described in
  * <tt>Generalized Expectation Criteria for Semi Supervised Learning of Conditional Random
  * Fields</tt>, Mann and McCallum, ACL 2008. The original algorithm is proposed for CRFs but has
  * been adopted to LinearClassifier (which is a simpler special case of a CRF). IMPORTANT: the
  * labeled features that are passed as an argument are assumed to be binary valued, although other
  * features are allowed to be real valued.
 public LinearClassifier<L, F> trainSemiSupGE(
     GeneralDataset<L, F> labeledDataset,
     List<? extends Datum<L, F>> unlabeledDataList,
     List<F> GEFeatures,
     double convexComboCoeff) {
   Minimizer<DiffFunction> minimizer = minimizerCreator.create();
   LogConditionalObjectiveFunction<L, F> objective =
       new LogConditionalObjectiveFunction<L, F>(
           labeledDataset, new LogPrior(LogPrior.LogPriorType.NULL));
   GeneralizedExpectationObjectiveFunction<L, F> geObjective =
       new GeneralizedExpectationObjectiveFunction<L, F>(
           labeledDataset, unlabeledDataList, GEFeatures);
   SemiSupervisedLogConditionalObjectiveFunction semiSupObjective =
       new SemiSupervisedLogConditionalObjectiveFunction(
           objective, geObjective, null, convexComboCoeff);
   double[] initial = objective.initial();
   double[] weights = minimizer.minimize(semiSupObjective, TOL, initial);
   return new LinearClassifier<L, F>(
       objective.to2D(weights), labeledDataset.featureIndex(), labeledDataset.labelIndex());
 public double[] heldOutSetSigma(GeneralDataset<L, F> train, Scorer<L> scorer) {
   Pair<GeneralDataset<L, F>, GeneralDataset<L, F>> data = train.split(0.3);
   return heldOutSetSigma(data.first(), data.second(), scorer);