Пример #1
0
  public svm_problem do_sample(svm_problem svmProblem) {
    logger.debug("Creating " + sample + " sample");
    Map<Double, List<Integer>> label2index = new HashMap<Double, List<Integer>>();
    for (int i = 0; i < svmProblem.l; ++i) {
      double label = svmProblem.y[i];
      if (label2index.containsKey(label)) {
        label2index.get(label).add(i);
      } else {
        List<Integer> indeces = new LinkedList<Integer>();
        indeces.add(i);
        label2index.put(label, indeces);
      }
    }

    for (List<Integer> indeces : label2index.values()) {
      Collections.shuffle(indeces);
    }

    int newSize = (int) (svmProblem.l * sample);
    logger.debug("Original size: " + svmProblem.l);
    logger.debug("Sample size: " + newSize);
    double[] newlabels = new double[newSize];
    svm_node[][] newdata = new svm_node[newSize][];

    int i = 0;
    for (List<Integer> indeces : label2index.values()) {
      int catSize = (int) (indeces.size() * sample);
      for (int j = 0; j < catSize; ++j) {
        int index = indeces.remove(0);
        newlabels[i] = svmProblem.y[index];
        newdata[i] = svmProblem.x[index];
        if (++i >= newSize) {
          break;
        }
      }
      if (i >= newSize) {
        break;
      }
    }

    // fill any remaining empty items caused due to rounding
    if (i < newSize) {
      for (List<Integer> indeces : label2index.values()) {
        if (indeces.isEmpty()) {
          continue;
        }
        int index = indeces.remove(0);
        newlabels[i] = svmProblem.y[index];
        newdata[i] = svmProblem.x[index];
        if (++i >= newSize) {
          break;
        }
      }
    }

    svm_problem newProblem = new svm_problem();
    newProblem.l = newSize;
    newProblem.x = newdata;
    newProblem.y = newlabels;

    return newProblem;
  }
Пример #2
0
  public svm_problem loadData(CollectionReader cr) {
    logger.debug("Loading " + cr.getSize() + " documents.");
    List<svm_node[]> data = new LinkedList<svm_node[]>();
    List<Set<String>> labels = new LinkedList<Set<String>>();
    Set<String> allLabels = new HashSet<String>();

    for (Analyzable a : cr) {
      CategorizedTextContent ctc = (CategorizedTextContent) a;

      Set<String> categories;
      if (targetClass == null) {
        categories = ctc.getCategories();
      } else {
        categories = new HashSet<String>();
        if (ctc.getCategories().contains(targetClass)) {
          // Not adding any other categories if item belongs to targetClass as well as some other
          // category.
          categories.add(targetClass);
        } else {
          categories.add("other");
        }
      }

      labels.add(categories);
      allLabels.addAll(categories);
      data.add(representer.represent(ctc.getText()).toSvmNodes());
    }
    representer.shutdown();

    labelList = new ArrayList<String>(allLabels);
    Collections.sort(labelList);

    logger.debug("Total labels: " + labelList.size());
    logger.trace("Labels: " + labelList);

    int numberOfExpandedData = 0;
    for (Set<String> ls : labels) {
      numberOfExpandedData += ls.size();
    }

    // Scale data
    logger.debug("Scaling data.");
    scaler = new ValueScaler(0, 1, data.toArray(new svm_node[0][]));
    svm_node[][] scaledData = scaler.getScaledData();

    // For every label, generate a copy of the data item.
    logger.debug("Generating " + numberOfExpandedData + " records.");
    svm_node[][] expandedData = new svm_node[numberOfExpandedData][];
    double[] expandedLabels = new double[numberOfExpandedData];
    int i = 0;
    int xi = 0;
    for (svm_node[] dataItem : scaledData) {
      Set<String> trueLabels = labels.get(i++);
      for (String label : trueLabels) {
        double labelIndex = labelList.indexOf(label);
        expandedData[xi] = dataItem.clone();
        expandedLabels[xi++] = labelIndex;
      }
    }

    svm_problem svmProblem = new svm_problem();
    svmProblem.l = numberOfExpandedData;
    svmProblem.x = expandedData;
    svmProblem.y = expandedLabels;

    return svmProblem;
  }