예제 #1
0
파일: BagOfWords.java 프로젝트: khmoran/ml
  /**
   * Fills the given feature vector with bag-of-words term counts for a text.
   *
   * <p>The text is tokenized, each token is normalized, and empty results are dropped. For every
   * distinct remaining term a {@code DoubleFeature} holding its occurrence count is stored in
   * {@code fv}, but only when the feature name is a key of the training relation's metadata.
   * With {@code integerFeatureNames} set, the feature name is the string form of the term's
   * entry in {@code wordIndexMap}; otherwise it is the term itself.
   *
   * <p>NOTE(review): only single tokens are counted here, while {@code createFeatures} can also
   * register bigram terms when {@code bigrams} is set -- confirm whether bigram features are
   * meant to be populated as well.
   *
   * @param text the raw text to turn into features
   * @param fv the feature vector that receives the term-count features
   */
  protected void populateFV(String text, FeatureVector<E> fv) {
    // Count occurrences of every non-empty normalized token.
    Multiset<String> counts = HashMultiset.create();
    for (String rawToken : tokenizer.tokenize(text)) {
      String normalized = Util.normalize(rawToken);
      if (normalized.isEmpty()) {
        continue;
      }
      counts.add(normalized);
    }

    // Sparse representation: terms that do not occur are simply left out.
    for (String term : counts.elementSet()) {
      String featureName =
          integerFeatureNames ? String.valueOf(wordIndexMap.get(term)) : term;

      // Terms that were not registered in the metadata (e.g. rare words) are skipped.
      if (train.getMetadata().containsKey(featureName)) {
        fv.put(featureName, new DoubleFeature(featureName, (double) counts.count(term)));
      }
    }
  }
예제 #2
0
파일: BagOfWords.java 프로젝트: khmoran/ml
  /**
   * Returns a copy of the training data.
   *
   * @return a new {@code TrainRelation} that carries the training relation's name, a clone of
   *     its metadata, and all of its entries
   */
  public TrainRelation<E> getTrainingData() {
    // Clone the metadata so the returned relation does not share it with the original.
    Object clonedMetadata = train.getMetadata().clone();

    TrainRelation<E> result = new TrainRelation<E>(train.getName(), (Metadata) clonedMetadata);
    result.addAll(train);
    return result;
  }
예제 #3
0
파일: BagOfWords.java 프로젝트: khmoran/ml
  /**
   * Returns the part of the training data whose label equals the given class.
   *
   * @param clazz the label to select; compared via {@code lfv.getLabel().equals(clazz)}
   * @return a new {@code TrainRelation} named {@code "sub-relation"} with a clone of the
   *     training metadata and only the matching labeled feature vectors
   */
  public TrainRelation<E> getTrainingData(E clazz) {
    Object clonedMetadata = train.getMetadata().clone();
    TrainRelation<E> matching = new TrainRelation<E>("sub-relation", (Metadata) clonedMetadata);

    for (LabeledFeatureVector<E> candidate : train) {
      if (!candidate.getLabel().equals(clazz)) {
        continue; // different label: not part of the requested subset
      }
      matching.add(candidate);
    }

    return matching;
  }
예제 #4
0
파일: BagOfWords.java 프로젝트: khmoran/ml
 /**
  * Adds one labeled text instance to the training set.
  *
  * <p>An id that has already been recorded in {@code trained} is ignored, so the same instance
  * is never added twice.
  *
  * @param id identifier of the instance
  * @param text raw text of the instance
  * @param clazz label to attach to the instance
  */
 public void train(String id, String text, E clazz) {
   if (trained.contains(id)) {
     return; // already part of the training set
   }

   train.add(createLabeledFV(id, text, clazz));
   trained.add(id);
 }
예제 #5
0
파일: BagOfWords.java 프로젝트: khmoran/ml
 /**
  * Adds an already-built feature vector to the training set under the given label.
  *
  * <p>A vector whose id has already been recorded in {@code trained} is ignored. Otherwise a
  * new {@code LabeledFeatureVector} is created with the same id, and the features, qid and
  * rank of {@code fv} are copied into it.
  *
  * @param fv the feature vector to learn from
  * @param clazz label to attach to the instance
  */
 public void train(FeatureVector<E> fv, E clazz) {
   if (trained.contains(fv.getId())) {
     return; // already part of the training set
   }

   // Build the labeled counterpart of fv.
   LabeledFeatureVector<E> labeled = new LabeledFeatureVector<E>(clazz, fv.getId());
   labeled.putAll(fv);
   labeled.setQid(fv.getQid());
   labeled.setRank(fv.getRank());

   train.add(labeled);
   trained.add(fv.getId());
 }
예제 #6
0
파일: BagOfWords.java 프로젝트: khmoran/ml
  /**
   * Creates the Bag of Words feature set from a collection of documents.
   *
   * <p>Every document is tokenized and each token normalized. Non-empty normalized tokens are
   * counted as terms. When {@code bigrams} is set, each non-empty token is additionally joined
   * with the token that follows it (if that one is non-empty too) as {@code first_second}.
   *
   * <p>Terms occurring at least {@code minOccurs} times and at least {@code minLength}
   * characters long are registered in the training relation's metadata with the type
   * {@code "boolean"}. With {@code integerFeatureNames} set, each such term is assigned a
   * 1-based index that is stored in {@code wordIndexMap} and used, in string form, as the
   * metadata key; otherwise the term itself is the key.
   *
   * @param documents the raw document texts to derive the vocabulary from
   */
  public void createFeatures(Collection<String> documents) {
    Multiset<String> terms = HashMultiset.create();

    for (String document : documents) {
      List<String> tokens = tokenizer.tokenize(document);

      for (int pos = 0; pos < tokens.size(); pos++) {
        String norm = Util.normalize(tokens.get(pos));
        if (norm.isEmpty()) {
          // Nothing to count, and no bigram either: an empty first half would otherwise
          // produce a spurious "_second" term.
          continue;
        }
        terms.add(norm);

        if (bigrams && pos < tokens.size() - 1) {
          String normNext = Util.normalize(tokens.get(pos + 1));
          if (!normNext.isEmpty()) {
            terms.add(norm + "_" + normNext);
          }
        }
      }
    }

    int lastIndex = 0;
    for (String term : terms.elementSet()) {
      // Skip infrequent words and very short words.
      if (terms.count(term) < minOccurs || term.length() < minLength) {
        continue;
      }

      if (!integerFeatureNames) {
        train.getMetadata().put(term, "boolean");
      } else {
        // The index stored for the term and the metadata key must be the same number,
        // because lookups translate a term through wordIndexMap and then query the metadata
        // with that value. The metadata keys stay 1-based, as before.
        int index = ++lastIndex;
        wordIndexMap.put(term, index);
        train.getMetadata().put(String.valueOf(index), "boolean");
      }
    }
  }
예제 #7
0
파일: BagOfWords.java 프로젝트: khmoran/ml
 /**
  * Empties the training set.
  *
  * <p>Clears both the record of already-trained ids and the training relation itself, so
  * previously seen ids can be trained again.
  */
 public void clearTrainingSet() {
   trained.clear();
   train.clear();
 }