Пример #1
0
  public svm_problem loadData(CollectionReader cr) {
    logger.debug("Loading " + cr.getSize() + " documents.");
    List<svm_node[]> data = new LinkedList<svm_node[]>();
    List<Set<String>> labels = new LinkedList<Set<String>>();
    Set<String> allLabels = new HashSet<String>();

    for (Analyzable a : cr) {
      CategorizedTextContent ctc = (CategorizedTextContent) a;

      Set<String> categories;
      if (targetClass == null) {
        categories = ctc.getCategories();
      } else {
        categories = new HashSet<String>();
        if (ctc.getCategories().contains(targetClass)) {
          // Not adding any other categories if item belongs to targetClass as well as some other
          // category.
          categories.add(targetClass);
        } else {
          categories.add("other");
        }
      }

      labels.add(categories);
      allLabels.addAll(categories);
      data.add(representer.represent(ctc.getText()).toSvmNodes());
    }
    representer.shutdown();

    labelList = new ArrayList<String>(allLabels);
    Collections.sort(labelList);

    logger.debug("Total labels: " + labelList.size());
    logger.trace("Labels: " + labelList);

    int numberOfExpandedData = 0;
    for (Set<String> ls : labels) {
      numberOfExpandedData += ls.size();
    }

    // Scale data
    logger.debug("Scaling data.");
    scaler = new ValueScaler(0, 1, data.toArray(new svm_node[0][]));
    svm_node[][] scaledData = scaler.getScaledData();

    // For every label, generate a copy of the data item.
    logger.debug("Generating " + numberOfExpandedData + " records.");
    svm_node[][] expandedData = new svm_node[numberOfExpandedData][];
    double[] expandedLabels = new double[numberOfExpandedData];
    int i = 0;
    int xi = 0;
    for (svm_node[] dataItem : scaledData) {
      Set<String> trueLabels = labels.get(i++);
      for (String label : trueLabels) {
        double labelIndex = labelList.indexOf(label);
        expandedData[xi] = dataItem.clone();
        expandedLabels[xi++] = labelIndex;
      }
    }

    svm_problem svmProblem = new svm_problem();
    svmProblem.l = numberOfExpandedData;
    svmProblem.x = expandedData;
    svmProblem.y = expandedLabels;

    return svmProblem;
  }