示例#1
0
  /**
   * Configures this instance from the supplied parameters.
   *
   * <p>Settings read, in order: the text representer (required), the find-best-parameters flag,
   * C, per-label weights, gamma, the sample ratio, the number of threads and the target class.
   * When the parameter search is enabled, explicitly supplied C and gamma values are ignored
   * with a warning; when it is disabled, a supplied sample value is ignored with a warning.
   *
   * <p>Unless the parameter search is enabled, gamma defaults to
   * {@code 1 / (number of representer features)} and may then be overridden by an explicit
   * gamma parameter.
   *
   * @param params the configuration values
   * @throws ResourceInitializationException if no representer is given, the weight specs are
   *     not a comma separated list of {@code int:double} pairs, the sample value is outside
   *     {@code (0, 1]}, or the number of threads is less than 1
   */
  @Override
  public void initialize(Parameters params) throws ResourceInitializationException {
    super.initialize(params);

    if (!params.contains(Constants.REPRESENTER)) {
      throw new ResourceInitializationException("No text representer specified.");
    }
    representer = (TextRepresenter) params.get(Constants.REPRESENTER);

    boolean searchFlagGiven = params.contains(Constants.FIND_BEST_PARAMETERS);
    if (searchFlagGiven) {
      findBestParameters = params.getBoolean(Constants.FIND_BEST_PARAMETERS);
    }
    // The default gamma must also apply when the flag is present but false; previously it was
    // only set when the flag was missing, so an explicit "false" left gamma unset.
    if (!searchFlagGiven || !findBestParameters) {
      gamma = 1.0 / (double) representer.getFeatures().size();
    }

    if (params.contains(Constants.C)) {
      if (findBestParameters) {
        logger.warn("Finding best parameters. The specified C value will be ignored.");
      } else {
        c = params.getDouble(Constants.C);
      }
    }

    if (params.contains(Constants.WEIGHTS)) {
      // Weight specs look like "0:1.5, 1:0.5": comma separated index:weight pairs.
      String weightSpecs = params.getString(Constants.WEIGHTS);
      weights = new TreeMap<Integer, Double>();
      for (String piece : weightSpecs.split(",\\s*")) {
        String[] pair = piece.split(":");
        if (pair.length != 2) {
          throw new ResourceInitializationException("Invalid weight specs in " + weightSpecs);
        }
        try {
          int index = Integer.parseInt(pair[0]);
          double weight = Double.parseDouble(pair[1]);
          weights.put(index, weight);
        } catch (NumberFormatException e) {
          throw new ResourceInitializationException("Invalid weight specs in " + weightSpecs, e);
        }
      }
    }

    if (params.contains(Constants.GAMMA)) {
      if (findBestParameters) {
        logger.warn("Finding best parameters. The specified gamma value will be ignored.");
      } else {
        gamma = params.getDouble(Constants.GAMMA);
      }
    }

    if (params.contains(Constants.SAMPLE)) {
      if (findBestParameters) {
        sample = params.getDouble(Constants.SAMPLE);
        if (sample <= 0 || sample > 1) {
          throw new ResourceInitializationException("Invalid sample value " + sample);
        }
      } else {
        logger.warn(
            "Sample value valid for parameter optimization only. Provided sample value will be ignored.");
      }
    }

    if (params.contains(Constants.NUMBER_OF_THREADS)) {
      numberOfThreads = params.getInt(Constants.NUMBER_OF_THREADS);
      if (numberOfThreads < 1) {
        throw new ResourceInitializationException(Constants.NUMBER_OF_THREADS + " must be >= 1");
      }
    }

    if (params.contains(Constants.TARGET_CLASS)) {
      targetClass = params.getString(Constants.TARGET_CLASS);
      logger.debug("Target Class: " + targetClass);
    }
  }
示例#2
0
  /**
   * Builds an {@code svm_problem} from the documents supplied by the given reader.
   *
   * <p>Every item of the reader is cast to {@code CategorizedTextContent}. If a target class is
   * configured, each document is labelled either with that class or with {@code "other"};
   * otherwise the document's own categories are used. The texts are turned into feature vectors
   * by the representer and passed through a {@code ValueScaler} constructed with bounds 0 and 1.
   * Finally each document is emitted once per label it carries, so a document with several
   * labels produces several records and a document with no labels produces none.
   *
   * <p>Side effects: the representer is shut down once all documents are read, and the
   * {@code labelList} and {@code scaler} fields are replaced. The numeric label of a record is
   * the position of its label in the alphabetically sorted {@code labelList}.
   *
   * @param cr the reader providing the documents
   * @return the problem holding the expanded records and their numeric labels
   */
  public svm_problem loadData(CollectionReader cr) {
    logger.debug("Loading " + cr.getSize() + " documents.");
    // Array-backed lists: the label sets are fetched by position further down, which would be a
    // linear walk per lookup on a linked list.
    List<svm_node[]> data = new ArrayList<svm_node[]>();
    List<Set<String>> labels = new ArrayList<Set<String>>();
    Set<String> allLabels = new HashSet<String>();

    for (Analyzable a : cr) {
      CategorizedTextContent ctc = (CategorizedTextContent) a;

      Set<String> categories;
      if (targetClass == null) {
        categories = ctc.getCategories();
      } else {
        categories = new HashSet<String>();
        if (ctc.getCategories().contains(targetClass)) {
          // Not adding any other categories if item belongs to targetClass as well as some other
          // category.
          categories.add(targetClass);
        } else {
          categories.add("other");
        }
      }

      labels.add(categories);
      allLabels.addAll(categories);
      data.add(representer.represent(ctc.getText()).toSvmNodes());
    }
    representer.shutdown();

    labelList = new ArrayList<String>(allLabels);
    Collections.sort(labelList);

    logger.debug("Total labels: " + labelList.size());
    logger.trace("Labels: " + labelList);

    // One output record per (document, label) pair.
    int numberOfExpandedData = 0;
    for (Set<String> ls : labels) {
      numberOfExpandedData += ls.size();
    }

    // Scale data
    logger.debug("Scaling data.");
    scaler = new ValueScaler(0, 1, data.toArray(new svm_node[0][]));
    svm_node[][] scaledData = scaler.getScaledData();

    // For every label, generate a copy of the data item.
    logger.debug("Generating " + numberOfExpandedData + " records.");
    svm_node[][] expandedData = new svm_node[numberOfExpandedData][];
    double[] expandedLabels = new double[numberOfExpandedData];
    int xi = 0;
    for (int item = 0; item < scaledData.length; item++) {
      svm_node[] dataItem = scaledData[item];
      for (String label : labels.get(item)) {
        double labelIndex = labelList.indexOf(label);
        expandedData[xi] = dataItem.clone();
        expandedLabels[xi++] = labelIndex;
      }
    }

    svm_problem svmProblem = new svm_problem();
    svmProblem.l = numberOfExpandedData;
    svmProblem.x = expandedData;
    svmProblem.y = expandedLabels;

    return svmProblem;
  }