Пример #1
0
  /**
   * Loads the samples, resolves the feature set, and splits the samples sequentially into a
   * training part and a test part.
   *
   * <p>The first {@code (int) (n * percentTrain)} rank lists (where {@code n} is the number of
   * loaded rank lists) are added to {@code trainingData}; all remaining ones are added to
   * {@code testData}. Each entry is a new {@code RankList} constructed from the loaded one.
   *
   * @param sampleFile passed to {@code readInput} to load the rank lists
   * @param featureDefFile passed to {@code readFeature}; when that returns {@code null}, the
   *     feature set is obtained from the loaded samples instead
   * @param percentTrain multiplier applied to the number of loaded rank lists to obtain the
   *     training-set size (the product is truncated to an {@code int})
   * @param normalize whether {@code normalize} is invoked on the loaded samples
   * @param trainingData output list that receives the training rank lists
   * @param testData output list that receives the test rank lists
   * @return the feature ids in use, as returned by {@code applyNewFeatures} when a new-feature
   *     description file is configured
   */
  private int[] prepareSplit(
      String sampleFile,
      String featureDefFile,
      double percentTrain,
      boolean normalize,
      List<RankList> trainingData,
      List<RankList> testData) {
    final List<RankList> samples = readInput(sampleFile);

    int[] featureIds = readFeature(featureDefFile);
    if (featureIds == null) {
      // No feature definition available: fall back to every feature found in the samples.
      featureIds = getFeatureFromSampleVector(samples);
    }

    if (normalize) {
      normalize(samples, featureIds);
    }

    if (!newFeatureFile.isEmpty()) {
      System.out.print("Loading new feature description file... ");
      for (String description : FileUtils.readLine(newFeatureFile, "ASCII")) {
        // Lines starting with "##" are skipped.
        if (description.startsWith("##")) {
          continue;
        }
        LinearComputer computer = new LinearComputer("", description);
        // When the original features are kept, a size-1 linear computer is discarded.
        if (keepOrigFeatures && computer.size() <= 1) {
          continue;
        }
        lcList.add(computer);
      }
      featureIds = applyNewFeatures(samples, featureIds);
      System.out.println("[Done]");
    }

    final int trainCount = (int) (samples.size() * percentTrain);

    // Leading trainCount entries go to the training part ...
    for (int idx = 0; idx < trainCount; idx++) {
      trainingData.add(new RankList(samples.get(idx)));
    }
    // ... and everything from trainCount onward goes to the test part.
    for (int idx = trainCount; idx < samples.size(); idx++) {
      testData.add(new RankList(samples.get(idx)));
    }

    return featureIds;
  }
Пример #2
0
  /**
   * Loads the samples, resolves the feature set, and partitions the samples into {@code nFold}
   * train/test pairs for cross-validation.
   *
   * <p>Folds are contiguous index ranges over the loaded samples, in load order. Each fold holds
   * {@code n / nFold} samples (integer division, {@code n} being the number of loaded rank
   * lists); the last fold additionally takes whatever remains. For every fold, the samples inside
   * its range form the test set and all other samples form the training set. Every entry is a
   * new {@code RankList} constructed from the loaded one.
   *
   * @param sampleFile passed to {@code readInput} to load the rank lists
   * @param featureDefFile passed to {@code readFeature}; when that returns {@code null}, the
   *     feature set is obtained from the loaded samples instead
   * @param nFold number of folds; must be positive
   * @param normalize whether {@code normalize} is invoked on the loaded samples
   * @param trainingData output list; receives one training list per fold, in fold order
   * @param testData output list; receives one test list per fold, in fold order
   * @return the feature ids in use, as returned by {@code applyNewFeatures} when a new-feature
   *     description file is configured
   * @throws IllegalArgumentException if {@code nFold} is not positive
   */
  private int[] prepareCV(
      String sampleFile,
      String featureDefFile,
      int nFold,
      boolean normalize,
      List<List<RankList>> trainingData,
      List<List<RankList>> testData) {
    // Fail early with a clear message instead of a division by zero / index error further down.
    if (nFold <= 0) {
      throw new IllegalArgumentException("nFold must be positive, got " + nFold);
    }

    List<RankList> data = readInput(sampleFile); // read input
    int[] features = readFeature(featureDefFile); // read features
    if (features == null) {
      // no features specified ==> use all features in the training file
      features = getFeatureFromSampleVector(data);
    }

    if (normalize) {
      normalize(data, features);
    }
    if (!newFeatureFile.isEmpty()) {
      System.out.print("Loading new feature description file... ");
      List<String> descriptions = FileUtils.readLine(newFeatureFile, "ASCII");
      for (String description : descriptions) {
        // Lines starting with "##" are skipped.
        if (description.startsWith("##")) {
          continue;
        }
        LinearComputer lc = new LinearComputer("", description);
        // if we keep the orig. features ==> discard size-1 linear computer
        if (!keepOrigFeatures || lc.size() > 1) {
          lcList.add(lc);
        }
      }
      features = applyNewFeatures(data, features);
      System.out.println("[Done]");
    }

    final int sampleCount = data.size();
    final int foldSize = sampleCount / nFold;

    for (int fold = 0; fold < nFold; fold++) {
      // Test range of this fold is [testFrom, testTo); the last fold absorbs the remainder
      // left over by the integer division.
      final int testFrom = fold * foldSize;
      final int testTo = (fold == nFold - 1) ? sampleCount : testFrom + foldSize;

      List<RankList> train = new ArrayList<RankList>();
      List<RankList> test = new ArrayList<RankList>();

      // Folds are contiguous, so membership is a plain range comparison rather than a
      // linear search through a list of boxed indices.
      for (int j = 0; j < sampleCount; j++) {
        if (j >= testFrom && j < testTo) {
          test.add(new RankList(data.get(j)));
        } else {
          train.add(new RankList(data.get(j)));
        }
      }

      trainingData.add(train);
      testData.add(test);
    }

    return features;
  }