public svm_problem do_sample(svm_problem svmProblem) { logger.debug("Creating " + sample + " sample"); Map<Double, List<Integer>> label2index = new HashMap<Double, List<Integer>>(); for (int i = 0; i < svmProblem.l; ++i) { double label = svmProblem.y[i]; if (label2index.containsKey(label)) { label2index.get(label).add(i); } else { List<Integer> indeces = new LinkedList<Integer>(); indeces.add(i); label2index.put(label, indeces); } } for (List<Integer> indeces : label2index.values()) { Collections.shuffle(indeces); } int newSize = (int) (svmProblem.l * sample); logger.debug("Original size: " + svmProblem.l); logger.debug("Sample size: " + newSize); double[] newlabels = new double[newSize]; svm_node[][] newdata = new svm_node[newSize][]; int i = 0; for (List<Integer> indeces : label2index.values()) { int catSize = (int) (indeces.size() * sample); for (int j = 0; j < catSize; ++j) { int index = indeces.remove(0); newlabels[i] = svmProblem.y[index]; newdata[i] = svmProblem.x[index]; if (++i >= newSize) { break; } } if (i >= newSize) { break; } } // fill any remaining empty items caused due to rounding if (i < newSize) { for (List<Integer> indeces : label2index.values()) { if (indeces.isEmpty()) { continue; } int index = indeces.remove(0); newlabels[i] = svmProblem.y[index]; newdata[i] = svmProblem.x[index]; if (++i >= newSize) { break; } } } svm_problem newProblem = new svm_problem(); newProblem.l = newSize; newProblem.x = newdata; newProblem.y = newlabels; return newProblem; }
public svm_problem loadData(CollectionReader cr) { logger.debug("Loading " + cr.getSize() + " documents."); List<svm_node[]> data = new LinkedList<svm_node[]>(); List<Set<String>> labels = new LinkedList<Set<String>>(); Set<String> allLabels = new HashSet<String>(); for (Analyzable a : cr) { CategorizedTextContent ctc = (CategorizedTextContent) a; Set<String> categories; if (targetClass == null) { categories = ctc.getCategories(); } else { categories = new HashSet<String>(); if (ctc.getCategories().contains(targetClass)) { // Not adding any other categories if item belongs to targetClass as well as some other // category. categories.add(targetClass); } else { categories.add("other"); } } labels.add(categories); allLabels.addAll(categories); data.add(representer.represent(ctc.getText()).toSvmNodes()); } representer.shutdown(); labelList = new ArrayList<String>(allLabels); Collections.sort(labelList); logger.debug("Total labels: " + labelList.size()); logger.trace("Labels: " + labelList); int numberOfExpandedData = 0; for (Set<String> ls : labels) { numberOfExpandedData += ls.size(); } // Scale data logger.debug("Scaling data."); scaler = new ValueScaler(0, 1, data.toArray(new svm_node[0][])); svm_node[][] scaledData = scaler.getScaledData(); // For every label, generate a copy of the data item. logger.debug("Generating " + numberOfExpandedData + " records."); svm_node[][] expandedData = new svm_node[numberOfExpandedData][]; double[] expandedLabels = new double[numberOfExpandedData]; int i = 0; int xi = 0; for (svm_node[] dataItem : scaledData) { Set<String> trueLabels = labels.get(i++); for (String label : trueLabels) { double labelIndex = labelList.indexOf(label); expandedData[xi] = dataItem.clone(); expandedLabels[xi++] = labelIndex; } } svm_problem svmProblem = new svm_problem(); svmProblem.l = numberOfExpandedData; svmProblem.x = expandedData; svmProblem.y = expandedLabels; return svmProblem; }