Example #1
0
  /**
   * Populates the given feature vector with bag-of-words counts for {@code text}.
   *
   * <p>The text is split with {@code tokenizer}, every token is passed through
   * {@code Util.normalize}, and tokens that normalize to the empty string are dropped. For each
   * distinct remaining term a {@code DoubleFeature} holding the term's occurrence count is put
   * into {@code fv}, but only when its feature name is a key of {@code train.getMetadata()}.
   * The feature name is the term itself, or the string form of {@code wordIndexMap.get(term)}
   * when {@code integerFeatureNames} is set.
   *
   * @param text the raw text to tokenize and count
   * @param fv the feature vector that receives the count features; modified in place
   */
  protected void populateFV(String text, FeatureVector<E> fv) {
    // Count occurrences of each normalized, non-empty token.
    Multiset<String> terms = HashMultiset.create();
    for (String token : tokenizer.tokenize(text)) {
      String norm = Util.normalize(token);
      if (!norm.isEmpty()) {
        terms.add(norm);
      }
    }

    // Sparse representation: only terms that actually occur are visited, so no zeros are stored.
    for (String term : terms.elementSet()) {
      // Resolve the feature name once. If the index lookup returns null, String.valueOf yields
      // the literal "null", which is then kept only if the metadata happens to contain that key.
      String featureName = integerFeatureNames ? String.valueOf(wordIndexMap.get(term)) : term;

      // Names absent from the training metadata are skipped (the original comment attributes
      // this to rare words not being included).
      if (train.getMetadata().containsKey(featureName)) {
        fv.put(featureName, new DoubleFeature(featureName, (double) terms.count(term)));
      }
    }
  }
Example #2
0
  /**
   * Converts a whole relation to the Mallet format.
   *
   * <p>Each feature vector is serialized as one line of the form
   * {@code id<TAB>label<TAB>name=value name=value ... <NEWLINE>} and the concatenated text is
   * handed to {@code loadInstances}. When {@code train} is a {@code TrainRelation}, every vector
   * is cast to {@code LabeledFeatureVector} and its label is written; otherwise the placeholder
   * label {@code X} is written. Features are emitted in the iteration order of
   * {@code train.getMetadata().keySet()}, and only those present in the vector are written.
   *
   * @param train the relation to convert
   * @return the instance list produced by {@code loadInstances} from the serialized text
   */
  public static InstanceList convert(Relation<?> train) {
    boolean isTrain = train instanceof TrainRelation;

    // Method-local buffer: StringBuilder avoids StringBuffer's per-call synchronization.
    StringBuilder sb = new StringBuilder();
    for (FeatureVector<?> fv : train) {
      sb.append(fv.getId()).append('\t');

      if (isTrain) {
        // Explicit toString() keeps the original failure mode (NPE) for a null label.
        sb.append(((LabeledFeatureVector<?>) fv).getLabel().toString()).append('\t');
      } else {
        sb.append("X\t");
      }

      for (String feature : train.getMetadata().keySet()) {
        if (fv.containsKey(feature)) {
          sb.append(feature).append('=').append(fv.get(feature).getValue()).append(' ');
        }
      }
      sb.append('\n');
    }

    return loadInstances(sb.toString());
  }
Example #3
0
 /**
  * Records one labeled example for training, at most once per feature-vector id.
  *
  * <p>If the id of {@code fv} is already contained in {@code trained}, the call does nothing.
  * Otherwise a {@code LabeledFeatureVector} is created with the given label and the same id,
  * filled with the entries, qid and rank of {@code fv}, added to {@code train}, and the id is
  * added to {@code trained}.
  *
  * @param fv the feature vector whose contents, qid and rank are copied
  * @param clazz the label to attach to the copy
  */
 public void train(FeatureVector<E> fv, E clazz) {
   if (trained.contains(fv.getId())) {
     // This id has been recorded before; leave the training data untouched.
     return;
   }

   // Build a labeled copy carrying the same id, features, qid and rank.
   LabeledFeatureVector<E> labeledCopy = new LabeledFeatureVector<E>(clazz, fv.getId());
   labeledCopy.putAll(fv);
   labeledCopy.setQid(fv.getQid());
   labeledCopy.setRank(fv.getRank());

   train.add(labeledCopy);
   // Mark the id only after the copy has been added.
   trained.add(fv.getId());
 }
Example #4
0
  /**
   * Converts a single feature vector to the Mallet format.
   *
   * <p>The vector is serialized as {@code id<TAB>label<TAB>name=value name=value ... } (no
   * trailing newline) and passed to {@code loadInstances}; the first resulting instance is
   * returned. When {@code fv} is a {@code LabeledFeatureVector} its label is written; otherwise
   * the placeholder label {@code 1} is written. Features are emitted in the iteration order of
   * {@code m.keySet()}, and only those present in the vector are written.
   *
   * @param fv the feature vector to convert
   * @param m the metadata whose keys select and order the features to write
   * @return the first instance of the list produced by {@code loadInstances}
   */
  public static Instance convert(FeatureVector<?> fv, Metadata m) {
    boolean isTrain = fv instanceof LabeledFeatureVector;

    // Method-local buffer: StringBuilder avoids StringBuffer's per-call synchronization.
    StringBuilder sb = new StringBuilder();
    sb.append(fv.getId()).append('\t');

    if (isTrain) {
      // Explicit toString() keeps the original failure mode (NPE) for a null label.
      sb.append(((LabeledFeatureVector<?>) fv).getLabel().toString()).append('\t');
    } else {
      sb.append("1\t");
    }

    for (String feature : m.keySet()) {
      if (fv.containsKey(feature)) {
        sb.append(feature).append('=').append(fv.get(feature).getValue()).append(' ');
      }
    }

    InstanceList il = loadInstances(sb.toString());

    return il.get(0);
  }