@Override public void initialize(Parameters params) throws ResourceInitializationException { super.initialize(params); if (params.contains(Constants.REPRESENTER)) { representer = (TextRepresenter) params.get(Constants.REPRESENTER); } else { throw new ResourceInitializationException("No text representer specified."); } if (params.contains(Constants.FIND_BEST_PARAMETERS)) { findBestParameters = params.getBoolean(Constants.FIND_BEST_PARAMETERS); } else { gamma = 1.0 / (double) representer.getFeatures().size(); } if (params.contains(Constants.C)) { if (findBestParameters) { logger.warn("Finding best parameters. The specified C value will be ignored."); } else { c = params.getDouble(Constants.C); } } if (params.contains(Constants.WEIGHTS)) { // Parse weight specs weights = new TreeMap<Integer, Double>(); for (String piece : params.getString(Constants.WEIGHTS).split(",\\s*")) { String[] pair = piece.split(":"); if (pair.length != 2) { throw new ResourceInitializationException( "Invalid weight specs in " + params.getString(Constants.WEIGHTS)); } try { int i = Integer.parseInt(pair[0]); double w = Double.parseDouble(pair[1]); weights.put(i, w); } catch (Exception e) { throw new ResourceInitializationException( "Invalid weight specs in " + params.getString(Constants.WEIGHTS), e); } } } if (params.contains(Constants.GAMMA)) { if (findBestParameters) { logger.warn("Finding best parameters. The specified gamma value will be ignored."); } else { gamma = params.getDouble(Constants.GAMMA); } } if (params.contains(Constants.SAMPLE)) { if (findBestParameters) { sample = params.getDouble(Constants.SAMPLE); if (sample <= 0 || sample > 1) { throw new ResourceInitializationException("Invalid sample value " + sample); } } else { logger.warn( "Sample value valid for parameter optimization only. Provided sample value will be ignored."); } } if (params.contains(Constants.NUMBER_OF_THREADS)) { numberOfThreads = params.getInt(Constants.NUMBER_OF_THREADS); if (numberOfThreads < 1) { throw new ResourceInitializationException(Constants.NUMBER_OF_THREADS + " must be >= 1"); } } if (params.contains(Constants.TARGET_CLASS)) { targetClass = params.getString(Constants.TARGET_CLASS); logger.debug("Target Class: " + targetClass); } }
public svm_problem loadData(CollectionReader cr) { logger.debug("Loading " + cr.getSize() + " documents."); List<svm_node[]> data = new LinkedList<svm_node[]>(); List<Set<String>> labels = new LinkedList<Set<String>>(); Set<String> allLabels = new HashSet<String>(); for (Analyzable a : cr) { CategorizedTextContent ctc = (CategorizedTextContent) a; Set<String> categories; if (targetClass == null) { categories = ctc.getCategories(); } else { categories = new HashSet<String>(); if (ctc.getCategories().contains(targetClass)) { // Not adding any other categories if item belongs to targetClass as well as some other // category. categories.add(targetClass); } else { categories.add("other"); } } labels.add(categories); allLabels.addAll(categories); data.add(representer.represent(ctc.getText()).toSvmNodes()); } representer.shutdown(); labelList = new ArrayList<String>(allLabels); Collections.sort(labelList); logger.debug("Total labels: " + labelList.size()); logger.trace("Labels: " + labelList); int numberOfExpandedData = 0; for (Set<String> ls : labels) { numberOfExpandedData += ls.size(); } // Scale data logger.debug("Scaling data."); scaler = new ValueScaler(0, 1, data.toArray(new svm_node[0][])); svm_node[][] scaledData = scaler.getScaledData(); // For every label, generate a copy of the data item. logger.debug("Generating " + numberOfExpandedData + " records."); svm_node[][] expandedData = new svm_node[numberOfExpandedData][]; double[] expandedLabels = new double[numberOfExpandedData]; int i = 0; int xi = 0; for (svm_node[] dataItem : scaledData) { Set<String> trueLabels = labels.get(i++); for (String label : trueLabels) { double labelIndex = labelList.indexOf(label); expandedData[xi] = dataItem.clone(); expandedLabels[xi++] = labelIndex; } } svm_problem svmProblem = new svm_problem(); svmProblem.l = numberOfExpandedData; svmProblem.x = expandedData; svmProblem.y = expandedLabels; return svmProblem; }