Example 1
  /**
   * Populate the FeatureVector with Bag of Words.
   *
   * @param c
   * @param fv
   */
  protected void populateFV(String text, FeatureVector<E> fv) {
    List<String> unnormalized = tokenizer.tokenize(text);

    Multiset<String> terms = HashMultiset.create();
    for (String token : unnormalized) {
      String norm = Util.normalize(token);
      if (!norm.isEmpty()) {
        terms.add(norm);
      }
    }

    // sparse representation... no need to put in 0's
    for (String term : terms.elementSet()) {
      // rare words don't get included, so check first
      if (!integerFeatureNames && train.getMetadata().containsKey(term)) {
        DoubleFeature bagFeat = new DoubleFeature(term, (double) terms.count(term));
        fv.put(term, bagFeat);
      } else if (integerFeatureNames
          && train.getMetadata().containsKey(String.valueOf(wordIndexMap.get(term)))) {
        String featureName = String.valueOf(wordIndexMap.get(term));
        DoubleFeature bagFeat = new DoubleFeature(featureName, (double) terms.count(term));
        fv.put(featureName, bagFeat);
      }
    }
  }
Example 2
  /**
   * Get the training data.
   *
   * @return
   */
  public TrainRelation<E> getTrainingData() {
    TrainRelation<E> copy =
        new TrainRelation<E>(train.getName(), (Metadata) train.getMetadata().clone());
    copy.addAll(train);

    return copy;
  }
Example 3
  /**
   * Get the training data.
   *
   * @return
   */
  public TrainRelation<E> getTrainingData(E clazz) {
    TrainRelation<E> subRelation =
        new TrainRelation<E>("sub-relation", (Metadata) train.getMetadata().clone());
    for (LabeledFeatureVector<E> lfv : train) {
      if (lfv.getLabel().equals(clazz)) {
        subRelation.add(lfv);
      }
    }

    return subRelation;
  }
Example 4
  /**
   * Create the Bag of Words features.
   *
   * @param citations
   */
  public void createFeatures(Collection<String> documents) {
    Multiset<String> terms = HashMultiset.create();

    for (String s : documents) {
      List<String> unnormalized = tokenizer.tokenize(s);

      // normalize them
      for (int i = 0; i < unnormalized.size(); i++) {
        String u = unnormalized.get(i);
        String norm = Util.normalize(u);
        if (!norm.isEmpty()) {
          terms.add(norm);
        }

        if (bigrams && (i < unnormalized.size() - 1)) {
          String second = unnormalized.get(i + 1);
          String normSecond = Util.normalize(second);
          if (!normSecond.isEmpty()) {
            terms.add(norm + "_" + normSecond);
          }
        }
      }
    }

    int i = 0;
    for (String term : terms.elementSet()) {
      if (terms.count(term) >= minOccurs // don't count infreq. words
          && term.length() >= minLength) { // or super short words
        if (!integerFeatureNames) {
          train.getMetadata().put(term, "boolean");
        } else {
          wordIndexMap.put(term, i++);
          train.getMetadata().put(String.valueOf(i), "boolean");
        }
      }
    }
  }