public svm_problem loadData(CollectionReader cr) { logger.debug("Loading " + cr.getSize() + " documents."); List<svm_node[]> data = new LinkedList<svm_node[]>(); List<Set<String>> labels = new LinkedList<Set<String>>(); Set<String> allLabels = new HashSet<String>(); for (Analyzable a : cr) { CategorizedTextContent ctc = (CategorizedTextContent) a; Set<String> categories; if (targetClass == null) { categories = ctc.getCategories(); } else { categories = new HashSet<String>(); if (ctc.getCategories().contains(targetClass)) { // Not adding any other categories if item belongs to targetClass as well as some other // category. categories.add(targetClass); } else { categories.add("other"); } } labels.add(categories); allLabels.addAll(categories); data.add(representer.represent(ctc.getText()).toSvmNodes()); } representer.shutdown(); labelList = new ArrayList<String>(allLabels); Collections.sort(labelList); logger.debug("Total labels: " + labelList.size()); logger.trace("Labels: " + labelList); int numberOfExpandedData = 0; for (Set<String> ls : labels) { numberOfExpandedData += ls.size(); } // Scale data logger.debug("Scaling data."); scaler = new ValueScaler(0, 1, data.toArray(new svm_node[0][])); svm_node[][] scaledData = scaler.getScaledData(); // For every label, generate a copy of the data item. logger.debug("Generating " + numberOfExpandedData + " records."); svm_node[][] expandedData = new svm_node[numberOfExpandedData][]; double[] expandedLabels = new double[numberOfExpandedData]; int i = 0; int xi = 0; for (svm_node[] dataItem : scaledData) { Set<String> trueLabels = labels.get(i++); for (String label : trueLabels) { double labelIndex = labelList.indexOf(label); expandedData[xi] = dataItem.clone(); expandedLabels[xi++] = labelIndex; } } svm_problem svmProblem = new svm_problem(); svmProblem.l = numberOfExpandedData; svmProblem.x = expandedData; svmProblem.y = expandedLabels; return svmProblem; }
public svm_problem do_sample(svm_problem svmProblem) { logger.debug("Creating " + sample + " sample"); Map<Double, List<Integer>> label2index = new HashMap<Double, List<Integer>>(); for (int i = 0; i < svmProblem.l; ++i) { double label = svmProblem.y[i]; if (label2index.containsKey(label)) { label2index.get(label).add(i); } else { List<Integer> indeces = new LinkedList<Integer>(); indeces.add(i); label2index.put(label, indeces); } } for (List<Integer> indeces : label2index.values()) { Collections.shuffle(indeces); } int newSize = (int) (svmProblem.l * sample); logger.debug("Original size: " + svmProblem.l); logger.debug("Sample size: " + newSize); double[] newlabels = new double[newSize]; svm_node[][] newdata = new svm_node[newSize][]; int i = 0; for (List<Integer> indeces : label2index.values()) { int catSize = (int) (indeces.size() * sample); for (int j = 0; j < catSize; ++j) { int index = indeces.remove(0); newlabels[i] = svmProblem.y[index]; newdata[i] = svmProblem.x[index]; if (++i >= newSize) { break; } } if (i >= newSize) { break; } } // fill any remaining empty items caused due to rounding if (i < newSize) { for (List<Integer> indeces : label2index.values()) { if (indeces.isEmpty()) { continue; } int index = indeces.remove(0); newlabels[i] = svmProblem.y[index]; newdata[i] = svmProblem.x[index]; if (++i >= newSize) { break; } } } svm_problem newProblem = new svm_problem(); newProblem.l = newSize; newProblem.x = newdata; newProblem.y = newlabels; return newProblem; }
private void do_find_best_parameters(svm_problem svmProblem) { svm_parameter svmParam = getDefaultSvmParameters(); setWeights(svmParam); int maxIter = ((int) Math.ceil(Math.abs((log2cEnd - log2cBegin) / log2cStep)) + 1) * ((int) Math.ceil(Math.abs((log2gEnd - log2gBegin) / log2gStep)) + 1); // Run the grid search in separate CV threads ExecutorService executorService = Executors.newFixedThreadPool(numberOfThreads); List<CvParams> cvParamsList = new ArrayList<CvParams>(); for (double log2c = log2cBegin; (log2cBegin < log2cEnd && log2c <= log2cEnd) || (log2cBegin >= log2cEnd && log2c >= log2cEnd); log2c += log2cStep) { double c1 = Math.pow(2, log2c); for (double log2g = log2gBegin; (log2gBegin < log2gEnd && log2g <= log2gEnd) || (log2gBegin >= log2gEnd && log2g >= log2gEnd); log2g += log2gStep) { double gamma1 = Math.pow(2, log2g); svm_parameter svmParam1 = (svm_parameter) svmParam.clone(); svmParam1.C = c1; svmParam1.gamma = gamma1; executorService.execute( new RunnableSvmCrossValidator(svmProblem, svmParam1, nrFold, cvParamsList)); } } // now wait for all threads to complete by calling shutdown // note that this will NOT terminate the currently running threads, it just signals the thread // pool to closeWriter // once all work is completed executorService.shutdown(); while (!executorService.isTerminated()) { try { Thread.sleep(1000); } catch (InterruptedException e) { // don't care if we get interrupted } // every second, report statistics logger.debug( String.format("%% complete: %5.2f", cvParamsList.size() / (double) maxIter * 100)); CvParams best = getBestCvParams(cvParamsList); CvParams worst = getWorstcvParams(cvParamsList); if (best != null) { logger.debug("Best accuracy: " + best.accuracy); logger.debug("Best C: " + best.c); logger.debug("Best Gamma: " + best.gamma); } if (worst != null) { logger.debug("Worst accuracy: " + worst.accuracy); } } CvParams best = getBestCvParams(cvParamsList); CvParams worst = getWorstcvParams(cvParamsList); if (best != null) { logger.debug("Best accuracy: " + best.accuracy); logger.debug("Best C: " + best.c); logger.debug("Best Gamma: " + best.gamma); c = best.c; gamma = best.gamma; } else { logger.error("Best CV parameters is null."); } if (worst != null) { logger.debug("Worst accuracy: " + worst.accuracy); } }