private int[] prepareSplit( String sampleFile, String featureDefFile, double percentTrain, boolean normalize, List<RankList> trainingData, List<RankList> testData) { List<RankList> data = readInput(sampleFile); // read input int[] features = readFeature(featureDefFile); // read features if (features == null) // no features specified ==> use all features in the training file features = getFeatureFromSampleVector(data); if (normalize) normalize(data, features); if (newFeatureFile.compareTo("") != 0) { System.out.print("Loading new feature description file... "); List<String> descriptions = FileUtils.readLine(newFeatureFile, "ASCII"); for (int i = 0; i < descriptions.size(); i++) { if (descriptions.get(i).indexOf("##") == 0) continue; LinearComputer lc = new LinearComputer("", descriptions.get(i)); // if we keep the orig. features ==> discard size-1 linear computer if (!keepOrigFeatures || lc.size() > 1) lcList.add(lc); } features = applyNewFeatures(data, features); System.out.println("[Done]"); } int size = (int) (data.size() * percentTrain); for (int i = 0; i < size; i++) trainingData.add(new RankList(data.get(i))); for (int i = size; i < data.size(); i++) testData.add(new RankList(data.get(i))); return features; }
private int[] prepareCV( String sampleFile, String featureDefFile, int nFold, boolean normalize, List<List<RankList>> trainingData, List<List<RankList>> testData) { List<RankList> data = readInput(sampleFile); // read input int[] features = readFeature(featureDefFile); // read features if (features == null) // no features specified ==> use all features in the training file features = getFeatureFromSampleVector(data); if (normalize) normalize(data, features); if (newFeatureFile.compareTo("") != 0) { System.out.print("Loading new feature description file... "); List<String> descriptions = FileUtils.readLine(newFeatureFile, "ASCII"); for (int i = 0; i < descriptions.size(); i++) { if (descriptions.get(i).indexOf("##") == 0) continue; LinearComputer lc = new LinearComputer("", descriptions.get(i)); // if we keep the orig. features ==> discard size-1 linear computer if (!keepOrigFeatures || lc.size() > 1) lcList.add(lc); } features = applyNewFeatures(data, features); System.out.println("[Done]"); } List<List<Integer>> trainSamplesIdx = new ArrayList<List<Integer>>(); int size = data.size() / nFold; int start = 0; int total = 0; for (int f = 0; f < nFold; f++) { List<Integer> t = new ArrayList<Integer>(); for (int i = 0; i < size && start + i < data.size(); i++) t.add(start + i); trainSamplesIdx.add(t); total += t.size(); start += size; } for (; total < data.size(); total++) trainSamplesIdx.get(trainSamplesIdx.size() - 1).add(total); for (int i = 0; i < trainSamplesIdx.size(); i++) { List<RankList> train = new ArrayList<RankList>(); List<RankList> test = new ArrayList<RankList>(); List<Integer> t = trainSamplesIdx.get(i); for (int j = 0; j < data.size(); j++) { if (t.contains(j)) test.add(new RankList(data.get(j))); else train.add(new RankList(data.get(j))); } trainingData.add(train); testData.add(test); } return features; }