public svm_problem loadData(CollectionReader cr) { logger.debug("Loading " + cr.getSize() + " documents."); List<svm_node[]> data = new LinkedList<svm_node[]>(); List<Set<String>> labels = new LinkedList<Set<String>>(); Set<String> allLabels = new HashSet<String>(); for (Analyzable a : cr) { CategorizedTextContent ctc = (CategorizedTextContent) a; Set<String> categories; if (targetClass == null) { categories = ctc.getCategories(); } else { categories = new HashSet<String>(); if (ctc.getCategories().contains(targetClass)) { // Not adding any other categories if item belongs to targetClass as well as some other // category. categories.add(targetClass); } else { categories.add("other"); } } labels.add(categories); allLabels.addAll(categories); data.add(representer.represent(ctc.getText()).toSvmNodes()); } representer.shutdown(); labelList = new ArrayList<String>(allLabels); Collections.sort(labelList); logger.debug("Total labels: " + labelList.size()); logger.trace("Labels: " + labelList); int numberOfExpandedData = 0; for (Set<String> ls : labels) { numberOfExpandedData += ls.size(); } // Scale data logger.debug("Scaling data."); scaler = new ValueScaler(0, 1, data.toArray(new svm_node[0][])); svm_node[][] scaledData = scaler.getScaledData(); // For every label, generate a copy of the data item. logger.debug("Generating " + numberOfExpandedData + " records."); svm_node[][] expandedData = new svm_node[numberOfExpandedData][]; double[] expandedLabels = new double[numberOfExpandedData]; int i = 0; int xi = 0; for (svm_node[] dataItem : scaledData) { Set<String> trueLabels = labels.get(i++); for (String label : trueLabels) { double labelIndex = labelList.indexOf(label); expandedData[xi] = dataItem.clone(); expandedLabels[xi++] = labelIndex; } } svm_problem svmProblem = new svm_problem(); svmProblem.l = numberOfExpandedData; svmProblem.x = expandedData; svmProblem.y = expandedLabels; return svmProblem; }