Exemplo n.º 1
0
  /**
   * Fills the given feature vector with bag-of-words counts for a piece of text.
   *
   * <p>The text is tokenized, each token is passed through {@code Util.normalize}, and tokens
   * whose normalized form is empty are dropped. Each remaining distinct term yields one
   * {@code DoubleFeature} whose value is the number of times the term occurred, but only if the
   * feature name is a key of {@code train.getMetadata()}. The feature name is the term itself,
   * or, when {@code integerFeatureNames} is set, the string form of the term's entry in
   * {@code wordIndexMap}.
   *
   * @param text the raw text to tokenize and count
   * @param fv the feature vector that receives one feature per accepted term
   */
  protected void populateFV(String text, FeatureVector<E> fv) {
    List<String> rawTokens = tokenizer.tokenize(text);

    // Count occurrences of every non-empty normalized token.
    Multiset<String> counts = HashMultiset.create();
    for (String rawToken : rawTokens) {
      String normalized = Util.normalize(rawToken);
      if (normalized.isEmpty()) {
        continue;
      }
      counts.add(normalized);
    }

    // Sparse representation: only terms that actually occur are written, never zeros.
    for (String term : counts.elementSet()) {
      String featureName =
          integerFeatureNames ? String.valueOf(wordIndexMap.get(term)) : term;

      // Names missing from the training metadata (e.g. rare words) are skipped.
      if (!train.getMetadata().containsKey(featureName)) {
        continue;
      }

      fv.put(featureName, new DoubleFeature(featureName, (double) counts.count(term)));
    }
  }