예제 #1
0
  public svm_problem loadData(CollectionReader cr) {
    logger.debug("Loading " + cr.getSize() + " documents.");
    List<svm_node[]> data = new LinkedList<svm_node[]>();
    List<Set<String>> labels = new LinkedList<Set<String>>();
    Set<String> allLabels = new HashSet<String>();

    for (Analyzable a : cr) {
      CategorizedTextContent ctc = (CategorizedTextContent) a;

      Set<String> categories;
      if (targetClass == null) {
        categories = ctc.getCategories();
      } else {
        categories = new HashSet<String>();
        if (ctc.getCategories().contains(targetClass)) {
          // Not adding any other categories if item belongs to targetClass as well as some other
          // category.
          categories.add(targetClass);
        } else {
          categories.add("other");
        }
      }

      labels.add(categories);
      allLabels.addAll(categories);
      data.add(representer.represent(ctc.getText()).toSvmNodes());
    }
    representer.shutdown();

    labelList = new ArrayList<String>(allLabels);
    Collections.sort(labelList);

    logger.debug("Total labels: " + labelList.size());
    logger.trace("Labels: " + labelList);

    int numberOfExpandedData = 0;
    for (Set<String> ls : labels) {
      numberOfExpandedData += ls.size();
    }

    // Scale data
    logger.debug("Scaling data.");
    scaler = new ValueScaler(0, 1, data.toArray(new svm_node[0][]));
    svm_node[][] scaledData = scaler.getScaledData();

    // For every label, generate a copy of the data item.
    logger.debug("Generating " + numberOfExpandedData + " records.");
    svm_node[][] expandedData = new svm_node[numberOfExpandedData][];
    double[] expandedLabels = new double[numberOfExpandedData];
    int i = 0;
    int xi = 0;
    for (svm_node[] dataItem : scaledData) {
      Set<String> trueLabels = labels.get(i++);
      for (String label : trueLabels) {
        double labelIndex = labelList.indexOf(label);
        expandedData[xi] = dataItem.clone();
        expandedLabels[xi++] = labelIndex;
      }
    }

    svm_problem svmProblem = new svm_problem();
    svmProblem.l = numberOfExpandedData;
    svmProblem.x = expandedData;
    svmProblem.y = expandedLabels;

    return svmProblem;
  }
예제 #2
0
  public svm_problem do_sample(svm_problem svmProblem) {
    logger.debug("Creating " + sample + " sample");
    Map<Double, List<Integer>> label2index = new HashMap<Double, List<Integer>>();
    for (int i = 0; i < svmProblem.l; ++i) {
      double label = svmProblem.y[i];
      if (label2index.containsKey(label)) {
        label2index.get(label).add(i);
      } else {
        List<Integer> indeces = new LinkedList<Integer>();
        indeces.add(i);
        label2index.put(label, indeces);
      }
    }

    for (List<Integer> indeces : label2index.values()) {
      Collections.shuffle(indeces);
    }

    int newSize = (int) (svmProblem.l * sample);
    logger.debug("Original size: " + svmProblem.l);
    logger.debug("Sample size: " + newSize);
    double[] newlabels = new double[newSize];
    svm_node[][] newdata = new svm_node[newSize][];

    int i = 0;
    for (List<Integer> indeces : label2index.values()) {
      int catSize = (int) (indeces.size() * sample);
      for (int j = 0; j < catSize; ++j) {
        int index = indeces.remove(0);
        newlabels[i] = svmProblem.y[index];
        newdata[i] = svmProblem.x[index];
        if (++i >= newSize) {
          break;
        }
      }
      if (i >= newSize) {
        break;
      }
    }

    // fill any remaining empty items caused due to rounding
    if (i < newSize) {
      for (List<Integer> indeces : label2index.values()) {
        if (indeces.isEmpty()) {
          continue;
        }
        int index = indeces.remove(0);
        newlabels[i] = svmProblem.y[index];
        newdata[i] = svmProblem.x[index];
        if (++i >= newSize) {
          break;
        }
      }
    }

    svm_problem newProblem = new svm_problem();
    newProblem.l = newSize;
    newProblem.x = newdata;
    newProblem.y = newlabels;

    return newProblem;
  }
예제 #3
0
  private void do_find_best_parameters(svm_problem svmProblem) {
    svm_parameter svmParam = getDefaultSvmParameters();
    setWeights(svmParam);

    int maxIter =
        ((int) Math.ceil(Math.abs((log2cEnd - log2cBegin) / log2cStep)) + 1)
            * ((int) Math.ceil(Math.abs((log2gEnd - log2gBegin) / log2gStep)) + 1);

    // Run the grid search in separate CV threads
    ExecutorService executorService = Executors.newFixedThreadPool(numberOfThreads);

    List<CvParams> cvParamsList = new ArrayList<CvParams>();

    for (double log2c = log2cBegin;
        (log2cBegin < log2cEnd && log2c <= log2cEnd)
            || (log2cBegin >= log2cEnd && log2c >= log2cEnd);
        log2c += log2cStep) {

      double c1 = Math.pow(2, log2c);

      for (double log2g = log2gBegin;
          (log2gBegin < log2gEnd && log2g <= log2gEnd)
              || (log2gBegin >= log2gEnd && log2g >= log2gEnd);
          log2g += log2gStep) {

        double gamma1 = Math.pow(2, log2g);

        svm_parameter svmParam1 = (svm_parameter) svmParam.clone();
        svmParam1.C = c1;
        svmParam1.gamma = gamma1;

        executorService.execute(
            new RunnableSvmCrossValidator(svmProblem, svmParam1, nrFold, cvParamsList));
      }
    }

    // now wait for all threads to complete by calling shutdown
    // note that this will NOT terminate the currently running threads, it just signals the thread
    // pool to closeWriter
    // once all work is completed
    executorService.shutdown();

    while (!executorService.isTerminated()) {
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        // don't care if we get interrupted
      }

      // every second, report statistics
      logger.debug(
          String.format("%% complete: %5.2f", cvParamsList.size() / (double) maxIter * 100));
      CvParams best = getBestCvParams(cvParamsList);
      CvParams worst = getWorstcvParams(cvParamsList);
      if (best != null) {
        logger.debug("Best accuracy: " + best.accuracy);
        logger.debug("Best C:        " + best.c);
        logger.debug("Best Gamma:    " + best.gamma);
      }
      if (worst != null) {
        logger.debug("Worst accuracy: " + worst.accuracy);
      }
    }

    CvParams best = getBestCvParams(cvParamsList);
    CvParams worst = getWorstcvParams(cvParamsList);
    if (best != null) {
      logger.debug("Best accuracy: " + best.accuracy);
      logger.debug("Best C:        " + best.c);
      logger.debug("Best Gamma:    " + best.gamma);

      c = best.c;
      gamma = best.gamma;
    } else {
      logger.error("Best CV parameters is null.");
    }
    if (worst != null) {
      logger.debug("Worst accuracy: " + worst.accuracy);
    }
  }