示例#1
0
  private int[] applyNewFeatures(List<RankList> samples, int[] features) {
    int totalFeatureCount = samples.get(0).get(0).getFeatureCount();
    int[] newFeatures = new int[features.length + lcList.size()];
    System.arraycopy(features, 0, newFeatures, 0, features.length);
    // for(int i=0;i<features.length;i++)
    // newFeatures[i] = features[i];
    for (int k = 0; k < lcList.size(); k++)
      newFeatures[features.length + k] = totalFeatureCount + k + 1;

    float[] addedFeatures = new float[lcList.size()];
    for (int i = 0; i < samples.size(); i++) {
      RankList rl = samples.get(i);
      for (int j = 0; j < rl.size(); j++) {
        DataPoint p = rl.get(j);
        for (int k = 0; k < lcList.size(); k++)
          addedFeatures[k] = lcList.get(k).compute(p.getExternalFeatureVector());

        p.addFeatures(addedFeatures);
      }
    }

    int[] newFeatures2 = new int[lcList.size()];
    for (int i = 0; i < lcList.size(); i++) newFeatures2[i] = newFeatures[i + features.length];

    if (keepOrigFeatures) return newFeatures;
    return newFeatures2;
  }
示例#2
0
  /** @param args */
  public static void main(String[] args) {

    String[] rType =
        new String[] {
          "MART",
          "RankNet",
          "RankBoost",
          "AdaRank",
          "Coordinate Ascent",
          "LambdaRank",
          "LambdaMART",
          "ListNet",
          "Random Forests"
        };
    RANKER_TYPE[] rType2 =
        new RANKER_TYPE[] {
          RANKER_TYPE.MART,
          RANKER_TYPE.RANKNET,
          RANKER_TYPE.RANKBOOST,
          RANKER_TYPE.ADARANK,
          RANKER_TYPE.COOR_ASCENT,
          RANKER_TYPE.LAMBDARANK,
          RANKER_TYPE.LAMBDAMART,
          RANKER_TYPE.LISTNET,
          RANKER_TYPE.RANDOM_FOREST
        };

    String trainFile = "";
    String featureDescriptionFile = "";
    double ttSplit = 0.0; // train-test split
    double tvSplit = 0.0; // train-validation split
    int foldCV = -1;
    String validationFile = "";
    String testFile = "";
    int rankerType = 4;
    String trainMetric = "ERR@10";
    String testMetric = "";
    Evaluator.normalize = false;
    String savedModelFile = "";
    String rankFile = "";
    boolean printIndividual = false;

    // for my personal use
    String indriRankingFile = "";
    String scoreFile = "";

    if (args.length < 2) {
      System.out.println("Usage: java -jar RankLib.jar <Params>");
      System.out.println("Params:");
      System.out.println("  [+] Training (+ tuning and evaluation)");
      System.out.println("\t-train <file>\t\tTraining data");
      System.out.println("\t-ranker <type>\t\tSpecify which ranking algorithm to use");
      System.out.println("\t\t\t\t0: MART (gradient boosted regression tree)");
      System.out.println("\t\t\t\t1: RankNet");
      System.out.println("\t\t\t\t2: RankBoost");
      System.out.println("\t\t\t\t3: AdaRank");
      System.out.println("\t\t\t\t4: Coordinate Ascent");
      System.out.println("\t\t\t\t6: LambdaMART");
      System.out.println("\t\t\t\t7: ListNet");
      System.out.println("\t\t\t\t8: Random Forests");
      System.out.println(
          "\t[ -feature <file> ]\tFeature description file: list features to be considered by the learner, each on a separate line");
      System.out.println("\t\t\t\tIf not specified, all features will be used.");
      // System.out.println("\t[ -metric2t <metric> ]\tMetric to optimize on the training data.
      // Supported: MAP, NDCG@k, DCG@k, P@k, RR@k, BEST@k, ERR@k (default=" + trainMetric + ")");
      System.out.println(
          "\t[ -metric2t <metric> ]\tMetric to optimize on the training data. Supported: MAP, NDCG@k, DCG@k, P@k, RR@k, ERR@k (default="
              + trainMetric
              + ")");
      System.out.println(
          "\t[ -metric2T <metric> ]\tMetric to evaluate on the test data (default to the same as specified for -metric2t)");
      System.out.println(
          "\t[ -gmax <label> ]\tHighest judged relevance label. It affects the calculation of ERR (default="
              + (int) SimpleMath.logBase2(ERRScorer.MAX)
              + ", i.e. 5-point scale {0,1,2,3,4})");
      // System.out.println("\t[ -qrel <file> ]\tTREC-style relevance judgment file. It only affects
      // MAP and NDCG (default=unspecified)");

      System.out.println(
          "\t[ -test <file> ]\tSpecify if you want to evaluate the trained model on this data (default=unspecified)");
      System.out.println(
          "\t[ -validate <file> ]\tSpecify if you want to tune your system on the validation data (default=unspecified)");
      System.out.println(
          "\t\t\t\tIf specified, the final model will be the one that performs best on the validation data");
      System.out.println("\t[ -tvs <x \\in [0..1]> ]\tSet train-validation split to be (x)(1.0-x)");
      System.out.println(
          "\t[ -tts <x \\in [0..1]> ]\tSet train-test split to be (x)(1.0-x). -tts will override -tvs");
      System.out.println(
          "\t[ -kcv <k> ]\t\tSpecify if you want to perform k-fold cross validation using ONLY the specified training data (default=NoCV)");

      System.out.println(
          "\t[ -norm <method>]\tNormalize feature vectors (default=no-normalization). Method can be:");
      System.out.println("\t\t\t\tsum: normalize each feature by the sum of all its values");
      System.out.println("\t\t\t\tzscore: normalize each feature by its mean/standard deviation");

      System.out.println(
          "\t[ -save <model> ]\tSave the learned model to the specified file (default=not-save)");

      System.out.println(
          "\t[ -silent ]\t\tDo not print progress messages (which are printed by default)");

      System.out.println("");
      System.out.println("    [-] RankNet-specific parameters");
      System.out.println(
          "\t[ -epoch <T> ]\t\tThe number of epochs to train (default=" + RankNet.nIteration + ")");
      System.out.println(
          "\t[ -layer <layer> ]\tThe number of hidden layers (default="
              + RankNet.nHiddenLayer
              + ")");
      System.out.println(
          "\t[ -node <node> ]\tThe number of hidden nodes per layer (default="
              + RankNet.nHiddenNodePerLayer
              + ")");
      System.out.println(
          "\t[ -lr <rate> ]\t\tLearning rate (default="
              + (new DecimalFormat("###.########")).format(RankNet.learningRate)
              + ")");

      System.out.println("");
      System.out.println("    [-] RankBoost-specific parameters");
      System.out.println(
          "\t[ -round <T> ]\t\tThe number of rounds to train (default="
              + RankBoost.nIteration
              + ")");
      System.out.println(
          "\t[ -tc <k> ]\t\tNumber of threshold candidates to search. -1 to use all feature values (default="
              + RankBoost.nThreshold
              + ")");

      System.out.println("");
      System.out.println("    [-] AdaRank-specific parameters");
      System.out.println(
          "\t[ -round <T> ]\t\tThe number of rounds to train (default=" + AdaRank.nIteration + ")");
      System.out.println(
          "\t[ -noeq ]\t\tTrain without enqueuing too-strong features (default=unspecified)");
      System.out.println(
          "\t[ -tolerance <t> ]\tTolerance between two consecutive rounds of learning (default="
              + AdaRank.tolerance
              + ")");
      System.out.println(
          "\t[ -max <times> ]\tThe maximum number of times can a feature be consecutively selected without changing performance (default="
              + AdaRank.maxSelCount
              + ")");

      System.out.println("");
      System.out.println("    [-] Coordinate Ascent-specific parameters");
      System.out.println(
          "\t[ -r <k> ]\t\tThe number of random restarts (default=" + CoorAscent.nRestart + ")");
      System.out.println(
          "\t[ -i <iteration> ]\tThe number of iterations to search in each dimension (default="
              + CoorAscent.nMaxIteration
              + ")");
      System.out.println(
          "\t[ -tolerance <t> ]\tPerformance tolerance between two solutions (default="
              + CoorAscent.tolerance
              + ")");
      System.out.println(
          "\t[ -reg <slack> ]\tRegularization parameter (default=no-regularization)");

      System.out.println("");
      System.out.println("    [-] {MART, LambdaMART}-specific parameters");
      System.out.println("\t[ -tree <t> ]\t\tNumber of trees (default=" + LambdaMART.nTrees + ")");
      System.out.println(
          "\t[ -leaf <l> ]\t\tNumber of leaves for each tree (default="
              + LambdaMART.nTreeLeaves
              + ")");
      System.out.println(
          "\t[ -shrinkage <factor> ]\tShrinkage, or learning rate (default="
              + LambdaMART.learningRate
              + ")");
      System.out.println(
          "\t[ -tc <k> ]\t\tNumber of threshold candidates for tree spliting. -1 to use all feature values (default="
              + LambdaMART.nThreshold
              + ")");
      System.out.println(
          "\t[ -mls <n> ]\t\tMin leaf support -- minimum #samples each leaf has to contain (default="
              + LambdaMART.minLeafSupport
              + ")");
      System.out.println(
          "\t[ -estop <e> ]\t\tStop early when no improvement is observed on validaton data in e consecutive rounds (default="
              + LambdaMART.nRoundToStopEarly
              + ")");

      System.out.println("");
      System.out.println("    [-] ListNet-specific parameters");
      System.out.println(
          "\t[ -epoch <T> ]\t\tThe number of epochs to train (default=" + ListNet.nIteration + ")");
      System.out.println(
          "\t[ -lr <rate> ]\t\tLearning rate (default="
              + (new DecimalFormat("###.########")).format(ListNet.learningRate)
              + ")");

      System.out.println("");
      System.out.println("    [-] Random Forests-specific parameters");
      System.out.println("\t[ -bag <r> ]\t\tNumber of bags (default=" + RFRanker.nBag + ")");
      System.out.println(
          "\t[ -srate <r> ]\t\tSub-sampling rate (default=" + RFRanker.subSamplingRate + ")");
      System.out.println(
          "\t[ -frate <r> ]\t\tFeature sampling rate (default="
              + RFRanker.featureSamplingRate
              + ")");
      int type = (RFRanker.rType.ordinal() - RANKER_TYPE.MART.ordinal());
      System.out.println(
          "\t[ -rtype <type> ]\tRanker to bag (default=" + type + ", i.e. " + rType[type] + ")");
      System.out.println(
          "\t[ -tree <t> ]\t\tNumber of trees in each bag (default=" + RFRanker.nTrees + ")");
      System.out.println(
          "\t[ -leaf <l> ]\t\tNumber of leaves for each tree (default="
              + RFRanker.nTreeLeaves
              + ")");
      System.out.println(
          "\t[ -shrinkage <factor> ]\tShrinkage, or learning rate (default="
              + RFRanker.learningRate
              + ")");
      System.out.println(
          "\t[ -tc <k> ]\t\tNumber of threshold candidates for tree spliting. -1 to use all feature values (default="
              + RFRanker.nThreshold
              + ")");
      System.out.println(
          "\t[ -mls <n> ]\t\tMin leaf support -- minimum #samples each leaf has to contain (default="
              + RFRanker.minLeafSupport
              + ")");

      System.out.println("");
      System.out.println("  [+] Testing previously saved models");
      System.out.println("\t-load <model>\t\tThe model to load");
      System.out.println(
          "\t-test <file>\t\tTest data to evaluate the model (specify either this or -rank but not both)");
      System.out.println(
          "\t-rank <file>\t\tRank the samples in the specified file (specify either this or -test but not both)");
      System.out.println(
          "\t[ -metric2T <metric> ]\tMetric to evaluate on the test data (default="
              + trainMetric
              + ")");
      System.out.println(
          "\t[ -gmax <label> ]\tHighest judged relevance label. It affects the calculation of ERR (default="
              + (int) SimpleMath.logBase2(ERRScorer.MAX)
              + ", i.e. 5-point scale {0,1,2,3,4})");
      System.out.println(
          "\t[ -score <file>]\tStore ranker's score for each object being ranked (has to be used with -rank)");
      // System.out.println("\t[ -qrel <file> ]\tTREC-style relevance judgment file. It only affects
      // MAP and NDCG (default=unspecified)");
      System.out.println(
          "\t[ -idv ]\t\tPrint model performance (in test metric) on individual ranked lists (has to be used with -test)");
      System.out.println(
          "\t[ -norm ]\t\tNormalize feature vectors (similar to -norm for training/tuning)");

      /*			System.out.println("");
      			System.out.println("  +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
      			System.out.println("  + NOTE: ALWAYS include -letor if you're doing experiments on LETOR 4.0 dataset.       +");
      			System.out.println("  +       The reason is a relevance degree of 2 in the dataset is actually counted as 3 +");
      			System.out.println("  +       (this is based on the evaluation script they provided). To be consistent      +");
      			System.out.println("  +       with their numbers, this program will change 2 to 3 when it loads the data    +");
      			System.out.println("  +       into memory if the -letor flag is specified.                                  +");
      			System.out.println("  +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
      */
      System.out.println("");
      return;
    }

    MyThreadPool.init(Runtime.getRuntime().availableProcessors());
    // MyThreadPool.init(2);

    for (int i = 0; i < args.length; i++) {
      if (args[i].compareTo("-train") == 0) trainFile = args[++i];
      else if (args[i].compareTo("-ranker") == 0) rankerType = Integer.parseInt(args[++i]);
      else if (args[i].compareTo("-feature") == 0) featureDescriptionFile = args[++i];
      else if (args[i].compareTo("-metric2t") == 0) trainMetric = args[++i];
      else if (args[i].compareTo("-metric2T") == 0) testMetric = args[++i];
      else if (args[i].compareTo("-gmax") == 0)
        ERRScorer.MAX = Math.pow(2, Double.parseDouble(args[++i]));
      else if (args[i].compareTo("-qrel") == 0) qrelFile = args[++i];
      else if (args[i].compareTo("-tts") == 0) ttSplit = Double.parseDouble(args[++i]);
      else if (args[i].compareTo("-tvs") == 0) tvSplit = Double.parseDouble(args[++i]);
      else if (args[i].compareTo("-kcv") == 0) foldCV = Integer.parseInt(args[++i]);
      else if (args[i].compareTo("-validate") == 0) validationFile = args[++i];
      else if (args[i].compareTo("-test") == 0) testFile = args[++i];
      else if (args[i].compareTo("-norm") == 0) {
        Evaluator.normalize = true;
        String n = args[++i];
        if (n.compareTo("sum") == 0) Evaluator.nml = new SumNormalizor();
        else if (n.compareTo("zscore") == 0) Evaluator.nml = new ZScoreNormalizor();
        else {
          System.out.println("Unknown normalizor: " + n);
          System.out.println("System will now exit.");
          System.exit(1);
        }
      } else if (args[i].compareTo("-save") == 0) Evaluator.modelFile = args[++i];
      else if (args[i].compareTo("-silent") == 0) Ranker.verbose = false;
      else if (args[i].compareTo("-load") == 0) {
        savedModelFile = args[++i];
        modelToLoad = args[i];
      } else if (args[i].compareTo("-idv") == 0) printIndividual = true;
      else if (args[i].compareTo("-rank") == 0) rankFile = args[++i];
      else if (args[i].compareTo("-score") == 0) scoreFile = args[++i];

      // Ranker-specific parameters
      // RankNet
      else if (args[i].compareTo("-epoch") == 0) {
        RankNet.nIteration = Integer.parseInt(args[++i]);
        ListNet.nIteration = Integer.parseInt(args[i]);
      } else if (args[i].compareTo("-layer") == 0)
        RankNet.nHiddenLayer = Integer.parseInt(args[++i]);
      else if (args[i].compareTo("-node") == 0)
        RankNet.nHiddenNodePerLayer = Integer.parseInt(args[++i]);
      else if (args[i].compareTo("-lr") == 0) {
        RankNet.learningRate = Double.parseDouble(args[++i]);
        ListNet.learningRate = Neuron.learningRate;
      }

      // RankBoost
      else if (args[i].compareTo("-tc") == 0) {
        RankBoost.nThreshold = Integer.parseInt(args[++i]);
        LambdaMART.nThreshold = Integer.parseInt(args[i]);
      }

      // AdaRank
      else if (args[i].compareTo("-noeq") == 0) AdaRank.trainWithEnqueue = false;
      else if (args[i].compareTo("-max") == 0) AdaRank.maxSelCount = Integer.parseInt(args[++i]);

      // COORDINATE ASCENT
      else if (args[i].compareTo("-r") == 0) CoorAscent.nRestart = Integer.parseInt(args[++i]);
      else if (args[i].compareTo("-i") == 0) CoorAscent.nMaxIteration = Integer.parseInt(args[++i]);

      // ranker-shared parameters
      else if (args[i].compareTo("-round") == 0) {
        RankBoost.nIteration = Integer.parseInt(args[++i]);
        AdaRank.nIteration = Integer.parseInt(args[i]);
      } else if (args[i].compareTo("-reg") == 0) {
        CoorAscent.slack = Double.parseDouble(args[++i]);
        CoorAscent.regularized = true;
      } else if (args[i].compareTo("-tolerance") == 0) {
        AdaRank.tolerance = Double.parseDouble(args[++i]);
        CoorAscent.tolerance = Double.parseDouble(args[i]);
      }

      // MART / LambdaMART / Random forest
      else if (args[i].compareTo("-tree") == 0) {
        LambdaMART.nTrees = Integer.parseInt(args[++i]);
        RFRanker.nTrees = Integer.parseInt(args[i]);
      } else if (args[i].compareTo("-leaf") == 0) {
        LambdaMART.nTreeLeaves = Integer.parseInt(args[++i]);
        RFRanker.nTreeLeaves = Integer.parseInt(args[i]);
      } else if (args[i].compareTo("-shrinkage") == 0) {
        LambdaMART.learningRate = Float.parseFloat(args[++i]);
        RFRanker.learningRate = Float.parseFloat(args[i]);
      } else if (args[i].compareTo("-mls") == 0) {
        LambdaMART.minLeafSupport = Integer.parseInt(args[++i]);
        RFRanker.minLeafSupport = Integer.parseInt(args[i]);
      } else if (args[i].compareTo("-estop") == 0)
        LambdaMART.nRoundToStopEarly = Integer.parseInt(args[++i]);

      // Random forest
      else if (args[i].compareTo("-bag") == 0) RFRanker.nBag = Integer.parseInt(args[++i]);
      else if (args[i].compareTo("-srate") == 0)
        RFRanker.subSamplingRate = Float.parseFloat(args[++i]);
      else if (args[i].compareTo("-frate") == 0)
        RFRanker.featureSamplingRate = Float.parseFloat(args[++i]);
      else if (args[i].compareTo("-letor") == 0) letor = true;

      /////////////////////////////////////////////////////
      // These parameters are *ONLY* for my personal use
      /////////////////////////////////////////////////////
      else if (args[i].compareTo("-nf") == 0) newFeatureFile = args[++i];
      else if (args[i].compareTo("-keep") == 0) keepOrigFeatures = true;
      else if (args[i].compareTo("-t") == 0) topNew = Integer.parseInt(args[++i]);
      else if (args[i].compareTo("-indri") == 0) indriRankingFile = args[++i];
      else if (args[i].compareTo("-hr") == 0) mustHaveRelDoc = true;
      else {
        System.out.println("Unknown command-line parameter: " + args[i]);
        System.out.println("System will now exit.");
        System.exit(1);
      }
    }

    if (testMetric.compareTo("") == 0) testMetric = trainMetric;

    System.out.println("");
    // System.out.println((keepOrigFeatures)?"Keep orig. features":"Discard orig. features");
    System.out.println("[+] General Parameters:");
    System.out.println("LETOR 4.0 dataset: " + (letor ? "Yes" : "No"));
    Evaluator e = new Evaluator(rType2[rankerType], trainMetric, testMetric);
    if (trainFile.compareTo("") != 0) {
      System.out.println("Training data:\t" + trainFile);

      if (foldCV != -1) {
        System.out.println("Cross validation: " + foldCV + " folds.");
      } else {
        if (testFile.compareTo("") != 0) System.out.println("Test data:\t" + testFile);
        else if (ttSplit > 0.0) // choose to split training data into train and test
        System.out.println("Train-Test split: " + ttSplit);

        if (validationFile.compareTo("") != 0) // the user has specified the validation set
        System.out.println("Validation data:\t" + validationFile);
        else if (ttSplit <= 0.0 && tvSplit > 0.0)
          System.out.println("Train-Validation split: " + tvSplit);
      }
      System.out.println("Ranking method:\t" + rType[rankerType]);
      if (featureDescriptionFile.compareTo("") != 0)
        System.out.println("Feature description file:\t" + featureDescriptionFile);
      else System.out.println("Feature description file:\tUnspecified. All features will be used.");
      System.out.println("Train metric:\t" + trainMetric);
      System.out.println("Test metric:\t" + testMetric);
      if (trainMetric.toUpperCase().startsWith("ERR") || testMetric.toUpperCase().startsWith("ERR"))
        System.out.println(
            "Highest relevance label (to compute ERR): "
                + (int) SimpleMath.logBase2(ERRScorer.MAX));
      if (qrelFile.compareTo("") != 0)
        System.out.println(
            "TREC-format relevance judgment (only affects MAP and NDCG scores): " + qrelFile);
      System.out.println(
          "Feature normalization: " + ((Evaluator.normalize) ? Evaluator.nml.name() : "No"));
      if (modelFile.compareTo("") != 0) System.out.println("Model file: " + modelFile);

      System.out.println("");
      System.out.println("[+] " + rType[rankerType] + "'s Parameters:");
      RankerFactory rf = new RankerFactory();

      rf.createRanker(rType2[rankerType]).printParameters();
      System.out.println("");

      // starting to do some work
      if (foldCV != -1) e.evaluate(trainFile, featureDescriptionFile, foldCV);
      else {
        if (ttSplit > 0.0) // we should use a held-out portion of the training data for testing?
        e.evaluate(trainFile, validationFile, featureDescriptionFile, ttSplit);
        else if (tvSplit > 0.0) // should we use a portion of the training data for validation?
        e.evaluate(trainFile, tvSplit, testFile, featureDescriptionFile);
        else e.evaluate(trainFile, validationFile, testFile, featureDescriptionFile);
      }
    } else // scenario: test a saved model
    {
      System.out.println("Model file:\t" + savedModelFile);
      System.out.println(
          "Feature normalization: " + ((Evaluator.normalize) ? Evaluator.nml.name() : "No"));
      if (rankFile.compareTo("") != 0) {
        if (scoreFile.compareTo("") != 0) e.score(savedModelFile, rankFile, scoreFile);
        else if (indriRankingFile.compareTo("") != 0)
          e.rank(savedModelFile, rankFile, indriRankingFile);
        else e.rank(savedModelFile, rankFile);
      } else {
        System.out.println("Test metric:\t" + testMetric);
        if (testMetric.startsWith("ERR"))
          System.out.println(
              "Highest relevance label (to compute ERR): "
                  + (int) SimpleMath.logBase2(ERRScorer.MAX));
        if (savedModelFile.compareTo("") != 0) e.test(savedModelFile, testFile, printIndividual);
        // This is *ONLY* for my personal use. It is *NOT* exposed via cmd-line
        // It will evaluate the input ranking (without being reranked by any model) using any
        // measure specified via metric2T
        else e.test(testFile);
      }
    }
    MyThreadPool.getInstance().shutdown();
  }