private int[] applyNewFeatures(List<RankList> samples, int[] features) { int totalFeatureCount = samples.get(0).get(0).getFeatureCount(); int[] newFeatures = new int[features.length + lcList.size()]; System.arraycopy(features, 0, newFeatures, 0, features.length); // for(int i=0;i<features.length;i++) // newFeatures[i] = features[i]; for (int k = 0; k < lcList.size(); k++) newFeatures[features.length + k] = totalFeatureCount + k + 1; float[] addedFeatures = new float[lcList.size()]; for (int i = 0; i < samples.size(); i++) { RankList rl = samples.get(i); for (int j = 0; j < rl.size(); j++) { DataPoint p = rl.get(j); for (int k = 0; k < lcList.size(); k++) addedFeatures[k] = lcList.get(k).compute(p.getExternalFeatureVector()); p.addFeatures(addedFeatures); } } int[] newFeatures2 = new int[lcList.size()]; for (int i = 0; i < lcList.size(); i++) newFeatures2[i] = newFeatures[i + features.length]; if (keepOrigFeatures) return newFeatures; return newFeatures2; }
/**
 * Command-line entry point: prints usage when fewer than two arguments are given,
 * parses all flags (mutating the relevant rankers' static parameters), then either
 * trains/tunes/evaluates a ranker from training data or loads a previously saved
 * model for testing/ranking.
 *
 * @param args command-line parameters; see the usage text below
 */
public static void main(String[] args) {
  // Human-readable ranker names, indexed by the -ranker <type> number.
  String[] rType = new String[] { "MART", "RankNet", "RankBoost", "AdaRank",
      "Coordinate Ascent", "LambdaRank", "LambdaMART", "ListNet", "Random Forests" };
  // Enum constants in the same order as rType, used to instantiate the chosen ranker.
  RANKER_TYPE[] rType2 = new RANKER_TYPE[] { RANKER_TYPE.MART, RANKER_TYPE.RANKNET,
      RANKER_TYPE.RANKBOOST, RANKER_TYPE.ADARANK, RANKER_TYPE.COOR_ASCENT,
      RANKER_TYPE.LAMBDARANK, RANKER_TYPE.LAMBDAMART, RANKER_TYPE.LISTNET,
      RANKER_TYPE.RANDOM_FOREST };
  String trainFile = "";
  String featureDescriptionFile = "";
  double ttSplit = 0.0; // train-test split
  double tvSplit = 0.0; // train-validation split
  int foldCV = -1; // -1 means no cross validation
  String validationFile = "";
  String testFile = "";
  int rankerType = 4; // default ranker: Coordinate Ascent
  String trainMetric = "ERR@10";
  String testMetric = ""; // empty: fall back to trainMetric after parsing
  Evaluator.normalize = false;
  String savedModelFile = "";
  String rankFile = "";
  boolean printIndividual = false;
  // for my personal use
  String indriRankingFile = "";
  String scoreFile = "";

  // Not enough arguments: print the full usage text and exit.
  if (args.length < 2) {
    System.out.println("Usage: java -jar RankLib.jar <Params>");
    System.out.println("Params:");
    System.out.println(" [+] Training (+ tuning and evaluation)");
    System.out.println("\t-train <file>\t\tTraining data");
    System.out.println("\t-ranker <type>\t\tSpecify which ranking algorithm to use");
    System.out.println("\t\t\t\t0: MART (gradient boosted regression tree)");
    System.out.println("\t\t\t\t1: RankNet");
    System.out.println("\t\t\t\t2: RankBoost");
    System.out.println("\t\t\t\t3: AdaRank");
    System.out.println("\t\t\t\t4: Coordinate Ascent");
    System.out.println("\t\t\t\t6: LambdaMART");
    System.out.println("\t\t\t\t7: ListNet");
    System.out.println("\t\t\t\t8: Random Forests");
    System.out.println(
        "\t[ -feature <file> ]\tFeature description file: list features to be considered by the learner, each on a separate line");
    System.out.println("\t\t\t\tIf not specified, all features will be used.");
    // System.out.println("\t[ -metric2t <metric> ]\tMetric to optimize on the training data. Supported: MAP, NDCG@k, DCG@k, P@k, RR@k, BEST@k, ERR@k (default=" + trainMetric + ")");
    System.out.println(
        "\t[ -metric2t <metric> ]\tMetric to optimize on the training data. Supported: MAP, NDCG@k, DCG@k, P@k, RR@k, ERR@k (default=" + trainMetric + ")");
    System.out.println(
        "\t[ -metric2T <metric> ]\tMetric to evaluate on the test data (default to the same as specified for -metric2t)");
    System.out.println(
        "\t[ -gmax <label> ]\tHighest judged relevance label. It affects the calculation of ERR (default=" + (int) SimpleMath.logBase2(ERRScorer.MAX) + ", i.e. 5-point scale {0,1,2,3,4})");
    // System.out.println("\t[ -qrel <file> ]\tTREC-style relevance judgment file. It only affects MAP and NDCG (default=unspecified)");
    System.out.println(
        "\t[ -test <file> ]\tSpecify if you want to evaluate the trained model on this data (default=unspecified)");
    System.out.println(
        "\t[ -validate <file> ]\tSpecify if you want to tune your system on the validation data (default=unspecified)");
    System.out.println(
        "\t\t\t\tIf specified, the final model will be the one that performs best on the validation data");
    System.out.println("\t[ -tvs <x \\in [0..1]> ]\tSet train-validation split to be (x)(1.0-x)");
    System.out.println(
        "\t[ -tts <x \\in [0..1]> ]\tSet train-test split to be (x)(1.0-x). -tts will override -tvs");
    System.out.println(
        "\t[ -kcv <k> ]\t\tSpecify if you want to perform k-fold cross validation using ONLY the specified training data (default=NoCV)");
    System.out.println(
        "\t[ -norm <method>]\tNormalize feature vectors (default=no-normalization). Method can be:");
    System.out.println("\t\t\t\tsum: normalize each feature by the sum of all its values");
    System.out.println("\t\t\t\tzscore: normalize each feature by its mean/standard deviation");
    System.out.println(
        "\t[ -save <model> ]\tSave the learned model to the specified file (default=not-save)");
    System.out.println(
        "\t[ -silent ]\t\tDo not print progress messages (which are printed by default)");
    System.out.println("");
    System.out.println(" [-] RankNet-specific parameters");
    System.out.println(
        "\t[ -epoch <T> ]\t\tThe number of epochs to train (default=" + RankNet.nIteration + ")");
    System.out.println(
        "\t[ -layer <layer> ]\tThe number of hidden layers (default=" + RankNet.nHiddenLayer + ")");
    System.out.println(
        "\t[ -node <node> ]\tThe number of hidden nodes per layer (default=" + RankNet.nHiddenNodePerLayer + ")");
    System.out.println(
        "\t[ -lr <rate> ]\t\tLearning rate (default=" + (new DecimalFormat("###.########")).format(RankNet.learningRate) + ")");
    System.out.println("");
    System.out.println(" [-] RankBoost-specific parameters");
    System.out.println(
        "\t[ -round <T> ]\t\tThe number of rounds to train (default=" + RankBoost.nIteration + ")");
    System.out.println(
        "\t[ -tc <k> ]\t\tNumber of threshold candidates to search. -1 to use all feature values (default=" + RankBoost.nThreshold + ")");
    System.out.println("");
    System.out.println(" [-] AdaRank-specific parameters");
    System.out.println(
        "\t[ -round <T> ]\t\tThe number of rounds to train (default=" + AdaRank.nIteration + ")");
    System.out.println(
        "\t[ -noeq ]\t\tTrain without enqueuing too-strong features (default=unspecified)");
    System.out.println(
        "\t[ -tolerance <t> ]\tTolerance between two consecutive rounds of learning (default=" + AdaRank.tolerance + ")");
    System.out.println(
        "\t[ -max <times> ]\tThe maximum number of times can a feature be consecutively selected without changing performance (default=" + AdaRank.maxSelCount + ")");
    System.out.println("");
    System.out.println(" [-] Coordinate Ascent-specific parameters");
    System.out.println(
        "\t[ -r <k> ]\t\tThe number of random restarts (default=" + CoorAscent.nRestart + ")");
    System.out.println(
        "\t[ -i <iteration> ]\tThe number of iterations to search in each dimension (default=" + CoorAscent.nMaxIteration + ")");
    System.out.println(
        "\t[ -tolerance <t> ]\tPerformance tolerance between two solutions (default=" + CoorAscent.tolerance + ")");
    System.out.println(
        "\t[ -reg <slack> ]\tRegularization parameter (default=no-regularization)");
    System.out.println("");
    System.out.println(" [-] {MART, LambdaMART}-specific parameters");
    System.out.println("\t[ -tree <t> ]\t\tNumber of trees (default=" + LambdaMART.nTrees + ")");
    System.out.println(
        "\t[ -leaf <l> ]\t\tNumber of leaves for each tree (default=" + LambdaMART.nTreeLeaves + ")");
    System.out.println(
        "\t[ -shrinkage <factor> ]\tShrinkage, or learning rate (default=" + LambdaMART.learningRate + ")");
    System.out.println(
        "\t[ -tc <k> ]\t\tNumber of threshold candidates for tree spliting. -1 to use all feature values (default=" + LambdaMART.nThreshold + ")");
    System.out.println(
        "\t[ -mls <n> ]\t\tMin leaf support -- minimum #samples each leaf has to contain (default=" + LambdaMART.minLeafSupport + ")");
    System.out.println(
        "\t[ -estop <e> ]\t\tStop early when no improvement is observed on validaton data in e consecutive rounds (default=" + LambdaMART.nRoundToStopEarly + ")");
    System.out.println("");
    System.out.println(" [-] ListNet-specific parameters");
    System.out.println(
        "\t[ -epoch <T> ]\t\tThe number of epochs to train (default=" + ListNet.nIteration + ")");
    System.out.println(
        "\t[ -lr <rate> ]\t\tLearning rate (default=" + (new DecimalFormat("###.########")).format(ListNet.learningRate) + ")");
    System.out.println("");
    System.out.println(" [-] Random Forests-specific parameters");
    System.out.println("\t[ -bag <r> ]\t\tNumber of bags (default=" + RFRanker.nBag + ")");
    System.out.println(
        "\t[ -srate <r> ]\t\tSub-sampling rate (default=" + RFRanker.subSamplingRate + ")");
    System.out.println(
        "\t[ -frate <r> ]\t\tFeature sampling rate (default=" + RFRanker.featureSamplingRate + ")");
    // Map the bagged ranker's enum back to its -ranker index for display.
    int type = (RFRanker.rType.ordinal() - RANKER_TYPE.MART.ordinal());
    System.out.println(
        "\t[ -rtype <type> ]\tRanker to bag (default=" + type + ", i.e. " + rType[type] + ")");
    System.out.println(
        "\t[ -tree <t> ]\t\tNumber of trees in each bag (default=" + RFRanker.nTrees + ")");
    System.out.println(
        "\t[ -leaf <l> ]\t\tNumber of leaves for each tree (default=" + RFRanker.nTreeLeaves + ")");
    System.out.println(
        "\t[ -shrinkage <factor> ]\tShrinkage, or learning rate (default=" + RFRanker.learningRate + ")");
    System.out.println(
        "\t[ -tc <k> ]\t\tNumber of threshold candidates for tree spliting. -1 to use all feature values (default=" + RFRanker.nThreshold + ")");
    System.out.println(
        "\t[ -mls <n> ]\t\tMin leaf support -- minimum #samples each leaf has to contain (default=" + RFRanker.minLeafSupport + ")");
    System.out.println("");
    System.out.println(" [+] Testing previously saved models");
    System.out.println("\t-load <model>\t\tThe model to load");
    System.out.println(
        "\t-test <file>\t\tTest data to evaluate the model (specify either this or -rank but not both)");
    System.out.println(
        "\t-rank <file>\t\tRank the samples in the specified file (specify either this or -test but not both)");
    System.out.println(
        "\t[ -metric2T <metric> ]\tMetric to evaluate on the test data (default=" + trainMetric + ")");
    System.out.println(
        "\t[ -gmax <label> ]\tHighest judged relevance label. It affects the calculation of ERR (default=" + (int) SimpleMath.logBase2(ERRScorer.MAX) + ", i.e. 5-point scale {0,1,2,3,4})");
    System.out.println(
        "\t[ -score <file>]\tStore ranker's score for each object being ranked (has to be used with -rank)");
    // System.out.println("\t[ -qrel <file> ]\tTREC-style relevance judgment file. It only affects MAP and NDCG (default=unspecified)");
    System.out.println(
        "\t[ -idv ]\t\tPrint model performance (in test metric) on individual ranked lists (has to be used with -test)");
    System.out.println(
        "\t[ -norm ]\t\tNormalize feature vectors (similar to -norm for training/tuning)");
    /*
    System.out.println("");
    System.out.println(" +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
    System.out.println(" + NOTE: ALWAYS include -letor if you're doing experiments on LETOR 4.0 dataset. +");
    System.out.println(" + The reason is a relevance degree of 2 in the dataset is actually counted as 3 +");
    System.out.println(" + (this is based on the evaluation script they provided). To be consistent +");
    System.out.println(" + with their numbers, this program will change 2 to 3 when it loads the data +");
    System.out.println(" + into memory if the -letor flag is specified. +");
    System.out.println(" +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
    */
    System.out.println("");
    return;
  }

  // One worker thread per available processor.
  MyThreadPool.init(Runtime.getRuntime().availableProcessors());
  // MyThreadPool.init(2);

  // Argument parsing. Flags that take a value consume it with args[++i]; several
  // flags intentionally set the same parameter on more than one ranker class.
  for (int i = 0; i < args.length; i++) {
    if (args[i].compareTo("-train") == 0)
      trainFile = args[++i];
    else if (args[i].compareTo("-ranker") == 0)
      rankerType = Integer.parseInt(args[++i]);
    else if (args[i].compareTo("-feature") == 0)
      featureDescriptionFile = args[++i];
    else if (args[i].compareTo("-metric2t") == 0)
      trainMetric = args[++i];
    else if (args[i].compareTo("-metric2T") == 0)
      testMetric = args[++i];
    else if (args[i].compareTo("-gmax") == 0)
      // ERR uses (2^label - 1)/2^max; MAX is stored as 2^(highest label).
      ERRScorer.MAX = Math.pow(2, Double.parseDouble(args[++i]));
    else if (args[i].compareTo("-qrel") == 0)
      qrelFile = args[++i];
    else if (args[i].compareTo("-tts") == 0)
      ttSplit = Double.parseDouble(args[++i]);
    else if (args[i].compareTo("-tvs") == 0)
      tvSplit = Double.parseDouble(args[++i]);
    else if (args[i].compareTo("-kcv") == 0)
      foldCV = Integer.parseInt(args[++i]);
    else if (args[i].compareTo("-validate") == 0)
      validationFile = args[++i];
    else if (args[i].compareTo("-test") == 0)
      testFile = args[++i];
    else if (args[i].compareTo("-norm") == 0) {
      Evaluator.normalize = true;
      String n = args[++i];
      if (n.compareTo("sum") == 0)
        Evaluator.nml = new SumNormalizor();
      else if (n.compareTo("zscore") == 0)
        Evaluator.nml = new ZScoreNormalizor();
      else {
        System.out.println("Unknown normalizor: " + n);
        System.out.println("System will now exit.");
        System.exit(1);
      }
    } else if (args[i].compareTo("-save") == 0)
      Evaluator.modelFile = args[++i];
    else if (args[i].compareTo("-silent") == 0)
      Ranker.verbose = false;
    else if (args[i].compareTo("-load") == 0) {
      savedModelFile = args[++i];
      modelToLoad = args[i];
    } else if (args[i].compareTo("-idv") == 0)
      printIndividual = true;
    else if (args[i].compareTo("-rank") == 0)
      rankFile = args[++i];
    else if (args[i].compareTo("-score") == 0)
      scoreFile = args[++i];
    // Ranker-specific parameters
    // RankNet
    else if (args[i].compareTo("-epoch") == 0) {
      RankNet.nIteration = Integer.parseInt(args[++i]);
      ListNet.nIteration = Integer.parseInt(args[i]);
    } else if (args[i].compareTo("-layer") == 0)
      RankNet.nHiddenLayer = Integer.parseInt(args[++i]);
    else if (args[i].compareTo("-node") == 0)
      RankNet.nHiddenNodePerLayer = Integer.parseInt(args[++i]);
    else if (args[i].compareTo("-lr") == 0) {
      RankNet.learningRate = Double.parseDouble(args[++i]);
      // NOTE(review): ListNet is given Neuron.learningRate here rather than the parsed
      // value — presumably Neuron.learningRate mirrors RankNet's; confirm against the
      // RankNet/Neuron classes before changing.
      ListNet.learningRate = Neuron.learningRate;
    }
    // RankBoost
    else if (args[i].compareTo("-tc") == 0) {
      RankBoost.nThreshold = Integer.parseInt(args[++i]);
      LambdaMART.nThreshold = Integer.parseInt(args[i]);
    }
    // AdaRank
    else if (args[i].compareTo("-noeq") == 0)
      AdaRank.trainWithEnqueue = false;
    else if (args[i].compareTo("-max") == 0)
      AdaRank.maxSelCount = Integer.parseInt(args[++i]);
    // COORDINATE ASCENT
    else if (args[i].compareTo("-r") == 0)
      CoorAscent.nRestart = Integer.parseInt(args[++i]);
    else if (args[i].compareTo("-i") == 0)
      CoorAscent.nMaxIteration = Integer.parseInt(args[++i]);
    // ranker-shared parameters
    else if (args[i].compareTo("-round") == 0) {
      RankBoost.nIteration = Integer.parseInt(args[++i]);
      AdaRank.nIteration = Integer.parseInt(args[i]);
    } else if (args[i].compareTo("-reg") == 0) {
      CoorAscent.slack = Double.parseDouble(args[++i]);
      CoorAscent.regularized = true;
    } else if (args[i].compareTo("-tolerance") == 0) {
      AdaRank.tolerance = Double.parseDouble(args[++i]);
      CoorAscent.tolerance = Double.parseDouble(args[i]);
    }
    // MART / LambdaMART / Random forest
    else if (args[i].compareTo("-tree") == 0) {
      LambdaMART.nTrees = Integer.parseInt(args[++i]);
      RFRanker.nTrees = Integer.parseInt(args[i]);
    } else if (args[i].compareTo("-leaf") == 0) {
      LambdaMART.nTreeLeaves = Integer.parseInt(args[++i]);
      RFRanker.nTreeLeaves = Integer.parseInt(args[i]);
    } else if (args[i].compareTo("-shrinkage") == 0) {
      LambdaMART.learningRate = Float.parseFloat(args[++i]);
      RFRanker.learningRate = Float.parseFloat(args[i]);
    } else if (args[i].compareTo("-mls") == 0) {
      LambdaMART.minLeafSupport = Integer.parseInt(args[++i]);
      RFRanker.minLeafSupport = Integer.parseInt(args[i]);
    } else if (args[i].compareTo("-estop") == 0)
      LambdaMART.nRoundToStopEarly = Integer.parseInt(args[++i]);
    // Random forest
    else if (args[i].compareTo("-bag") == 0)
      RFRanker.nBag = Integer.parseInt(args[++i]);
    else if (args[i].compareTo("-srate") == 0)
      RFRanker.subSamplingRate = Float.parseFloat(args[++i]);
    else if (args[i].compareTo("-frate") == 0)
      RFRanker.featureSamplingRate = Float.parseFloat(args[++i]);
    else if (args[i].compareTo("-letor") == 0)
      letor = true;
    /////////////////////////////////////////////////////
    // These parameters are *ONLY* for my personal use
    /////////////////////////////////////////////////////
    else if (args[i].compareTo("-nf") == 0)
      newFeatureFile = args[++i];
    else if (args[i].compareTo("-keep") == 0)
      keepOrigFeatures = true;
    else if (args[i].compareTo("-t") == 0)
      topNew = Integer.parseInt(args[++i]);
    else if (args[i].compareTo("-indri") == 0)
      indriRankingFile = args[++i];
    else if (args[i].compareTo("-hr") == 0)
      mustHaveRelDoc = true;
    else {
      System.out.println("Unknown command-line parameter: " + args[i]);
      System.out.println("System will now exit.");
      System.exit(1);
    }
  }

  // When no test metric is given, evaluate with the training metric.
  if (testMetric.compareTo("") == 0)
    testMetric = trainMetric;

  System.out.println("");
  // System.out.println((keepOrigFeatures)?"Keep orig. features":"Discard orig. features");
  System.out.println("[+] General Parameters:");
  System.out.println("LETOR 4.0 dataset: " + (letor ? "Yes" : "No"));
  Evaluator e = new Evaluator(rType2[rankerType], trainMetric, testMetric);

  // Scenario 1: training data given — train (optionally tune) and evaluate.
  if (trainFile.compareTo("") != 0) {
    System.out.println("Training data:\t" + trainFile);
    if (foldCV != -1) {
      System.out.println("Cross validation: " + foldCV + " folds.");
    } else {
      if (testFile.compareTo("") != 0)
        System.out.println("Test data:\t" + testFile);
      else if (ttSplit > 0.0) // choose to split training data into train and test
        System.out.println("Train-Test split: " + ttSplit);
      if (validationFile.compareTo("") != 0) // the user has specified the validation set
        System.out.println("Validation data:\t" + validationFile);
      else if (ttSplit <= 0.0 && tvSplit > 0.0)
        System.out.println("Train-Validation split: " + tvSplit);
    }
    System.out.println("Ranking method:\t" + rType[rankerType]);
    if (featureDescriptionFile.compareTo("") != 0)
      System.out.println("Feature description file:\t" + featureDescriptionFile);
    else
      System.out.println("Feature description file:\tUnspecified. All features will be used.");
    System.out.println("Train metric:\t" + trainMetric);
    System.out.println("Test metric:\t" + testMetric);
    if (trainMetric.toUpperCase().startsWith("ERR") || testMetric.toUpperCase().startsWith("ERR"))
      System.out.println(
          "Highest relevance label (to compute ERR): " + (int) SimpleMath.logBase2(ERRScorer.MAX));
    if (qrelFile.compareTo("") != 0)
      System.out.println(
          "TREC-format relevance judgment (only affects MAP and NDCG scores): " + qrelFile);
    System.out.println(
        "Feature normalization: " + ((Evaluator.normalize) ? Evaluator.nml.name() : "No"));
    if (modelFile.compareTo("") != 0)
      System.out.println("Model file: " + modelFile);
    System.out.println("");
    System.out.println("[+] " + rType[rankerType] + "'s Parameters:");
    RankerFactory rf = new RankerFactory();
    rf.createRanker(rType2[rankerType]).printParameters();
    System.out.println("");

    // starting to do some work
    if (foldCV != -1)
      e.evaluate(trainFile, featureDescriptionFile, foldCV);
    else {
      if (ttSplit > 0.0) // we should use a held-out portion of the training data for testing?
        e.evaluate(trainFile, validationFile, featureDescriptionFile, ttSplit);
      else if (tvSplit > 0.0) // should we use a portion of the training data for validation?
        e.evaluate(trainFile, tvSplit, testFile, featureDescriptionFile);
      else
        e.evaluate(trainFile, validationFile, testFile, featureDescriptionFile);
    }
  } else // scenario: test a saved model
  {
    System.out.println("Model file:\t" + savedModelFile);
    System.out.println(
        "Feature normalization: " + ((Evaluator.normalize) ? Evaluator.nml.name() : "No"));
    if (rankFile.compareTo("") != 0) {
      // -rank mode: re-rank (or score) the given samples.
      if (scoreFile.compareTo("") != 0)
        e.score(savedModelFile, rankFile, scoreFile);
      else if (indriRankingFile.compareTo("") != 0)
        e.rank(savedModelFile, rankFile, indriRankingFile);
      else
        e.rank(savedModelFile, rankFile);
    } else {
      // -test mode: evaluate the model (or the raw input ranking) with the test metric.
      System.out.println("Test metric:\t" + testMetric);
      if (testMetric.startsWith("ERR"))
        System.out.println(
            "Highest relevance label (to compute ERR): " + (int) SimpleMath.logBase2(ERRScorer.MAX));
      if (savedModelFile.compareTo("") != 0)
        e.test(savedModelFile, testFile, printIndividual);
      // This is *ONLY* for my personal use. It is *NOT* exposed via cmd-line
      // It will evaluate the input ranking (without being reranked by any model) using any
      // measure specified via metric2T
      else
        e.test(testFile);
    }
  }
  MyThreadPool.getInstance().shutdown();
}