예제 #1
0
  /**
   * Computes tf-idf scores for every n-gram in {@code orderedGrams}, in place.
   *
   * <p>The idf table is obtained from {@code ic.getIdfCollection()}. Two modes
   * exist, selected by the presence of the {@code "extractSize"} property:
   * <ul>
   *   <li>property present: the map key of the n-gram is looked up as a whole
   *       and the result is stored through {@code setSingleTfidf};</li>
   *   <li>property absent: each element of {@code gram} is looked up on its
   *       own and the product is added to the matching slot of
   *       {@code tfidf}.</li>
   * </ul>
   * A term missing from the idf table is given a fallback weight of 0.1.
   *
   * @param orderedGrams n-grams keyed by their string form; the values are
   *        mutated
   */
  public void loadtfidf(LinkedHashMap<String, Ngram> orderedGrams) {
    HashMap<String, Double> idfByTerm = ic.getIdfCollection();

    for (String key : orderedGrams.keySet()) {
      Ngram ngram = orderedGrams.get(key);

      // A configured n-gram size means the idf table holds whole n-grams,
      // so the complete key is scored in one step.
      if (config.prop.containsKey("extractSize")) {
        double weight = idfByTerm.containsKey(key) ? idfByTerm.get(key) : 0.1;
        ngram.setSingleTfidf(weight * ngram.frequency);
        continue;
      }

      // No size configured: score every word of the n-gram separately.
      for (int pos = 0; pos < ngram.gram.length; pos++) {
        double weight =
            idfByTerm.containsKey(ngram.gram[pos]) ? idfByTerm.get(ngram.gram[pos]) : 0.1;
        ngram.tfidf[pos] += weight * ngram.frequency;
      }
    }
  }
예제 #2
0
 /**
  * Marks every n-gram whose lower-cased string form equals a named entity.
  *
  * <p>Nothing happens unless the {@code "ner"} property is set. For each
  * n-gram, {@code toString().toLowerCase()} is compared against the
  * {@code namedEntity} field of every entity returned by
  * {@code nerInterface.getEntities()}; on the first match
  * {@code matchesNER} is set to {@code true}. N-grams without a match are
  * left untouched (the flag is never reset to {@code false} here).
  *
  * <p>The entity collection is fetched once and the lower-cased n-gram text
  * is computed once per n-gram, instead of once per (n-gram, entity) pair.
  *
  * @param orderedGrams n-grams to flag; the values are mutated
  * @param nerInterface source of the named entities; not touched when the
  *        {@code "ner"} property is missing or {@code orderedGrams} is empty
  */
 public void loadNER(LinkedHashMap<String, Ngram> orderedGrams, NerInterface nerInterface) {
   // The empty check keeps nerInterface from being touched when there is nothing to flag.
   if (!config.prop.containsKey("ner") || orderedGrams.isEmpty()) {
     return;
   }

   // Loop-invariant: fetch the entities a single time.
   Iterable<? extends NamedEntity> entities = nerInterface.getEntities().values();

   for (Ngram ngram : orderedGrams.values()) {
     // Default-locale lower-casing is kept on purpose so matching stays
     // identical to the previous behaviour.
     // NOTE(review): presumably entity names are stored lower-cased the same way - confirm.
     String loweredGram = ngram.toString().toLowerCase();
     for (NamedEntity entity : entities) {
       if (loweredGram.equals(entity.namedEntity)) {
         ngram.matchesNER = true;
         break; // further matches could only set the same flag again
       }
     }
   }
 }
예제 #3
0
  /**
   * Scores and sorts the n-grams of every group in {@code linkedHashMap}.
   *
   * <p>For each key (presumably the n-gram size - confirm against the caller)
   * the method:
   * <ol>
   *   <li>divides every n-gram frequency by
   *       {@code wi.getNumberOfFilesInCluster()} when that count is greater
   *       than one, i.e. when several files are processed together;</li>
   *   <li>calls {@link #loadtfidf} on the group;</li>
   *   <li>sorts the group: when the {@code "ngramSort"} property equals
   *       {@code "ner"} the group is first passed to {@link #loadNER} and
   *       then ordered with {@code orderNgramsByNer}, otherwise with
   *       {@code orderNgramsByDefault}.</li>
   * </ol>
   * The {@code Ngram} objects inside the input map are modified in place.
   *
   * @param linkedHashMap groups of n-grams, each keyed by the n-gram's string
   *        form
   * @return a new map with the same keys, in the same iteration order, each
   *         mapped to its sorted vector of n-grams
   */
  public LinkedHashMap<Integer, Vector<Ngram>> getNgramTfidf(
      LinkedHashMap<Integer, LinkedHashMap<String, Ngram>> linkedHashMap) {

    LinkedHashMap<Integer, Vector<Ngram>> sortedByGroup =
        new LinkedHashMap<Integer, Vector<Ngram>>();

    for (int groupKey : linkedHashMap.keySet()) {
      LinkedHashMap<String, Ngram> group = linkedHashMap.get(groupKey);

      // Scaling is only needed for combined centroids, i.e. more than one
      // file handled at the same time.
      double filesInCluster = wi.getNumberOfFilesInCluster();
      if (filesInCluster > 1) {
        for (Ngram ngram : group.values()) {
          ngram.frequency = ngram.frequency / filesInCluster;
        }
      }

      loadtfidf(group);

      boolean sortByNer = config.prop.containsKey("ngramSort")
          && config.prop.getProperty("ngramSort").equals("ner");

      Vector<Ngram> sorted;
      if (!sortByNer) {
        sorted = SortingFactory.getSorter().orderNgramsByDefault(group);
      } else {
        // NER flags must be set before the NER-aware ordering runs.
        loadNER(group, NerFactory.getNer(config));
        sorted = SortingFactory.getSorter().orderNgramsByNer(group);
      }
      sortedByGroup.put(groupKey, sorted);
    }

    return sortedByGroup;
  }