public void loadtfidf(LinkedHashMap<String, Ngram> orderedGrams) { HashMap<String, Double> idf = ic.getIdfCollection(); for (String s : orderedGrams.keySet()) { // defining size of ngrams to be counted, used with idfs for special // ngram size Ngram temp = orderedGrams.get(s); if (config.prop.containsKey("extractSize")) { if (idf.containsKey(s)) { temp.setSingleTfidf(idf.get(s) * temp.frequency); } else { temp.setSingleTfidf(0.1 * temp.frequency); } } else { // when size of ngram not defined we count each word // separately for (int i = 0; i < temp.gram.length; i++) { if (idf.containsKey(temp.gram[i])) { temp.tfidf[i] += idf.get(temp.gram[i]) * temp.frequency; } else { temp.tfidf[i] += 0.1 * temp.frequency; } } } } }
/**
 * Marks every n-gram whose lower-cased text equals the text of a named
 * entity by setting {@code matchesNER} to {@code true}.
 *
 * <p>Nothing happens unless {@code config.prop} contains the key
 * {@code "ner"}. N-grams that do not match are left untouched (their
 * {@code matchesNER} flag is never reset here).
 *
 * <p>The comparison lower-cases {@code ngram.toString()} with the JVM default
 * locale and compares it against {@code namedEntity.namedEntity} as-is.
 * NOTE(review): this presumably expects the entity text to be lower-cased
 * already - confirm against the NER implementation.
 *
 * @param orderedGrams n-grams to flag; must not be {@code null}
 * @param nerInterface source of the named entities; only dereferenced when
 *                     NER is enabled and there is at least one n-gram
 */
public void loadNER(LinkedHashMap<String, Ngram> orderedGrams, NerInterface nerInterface) {
    if (!config.prop.containsKey("ner") || orderedGrams.isEmpty()) {
        return;
    }

    // Fetch the entities once; they were previously re-fetched for every
    // n-gram. NOTE(review): assumes getEntities() returns the same content on
    // repeated calls - confirm.
    Iterable<? extends NamedEntity> entities = nerInterface.getEntities().values();

    for (Ngram ngram : orderedGrams.values()) {
        // Lower-case once per n-gram rather than once per (n-gram, entity) pair.
        String loweredGram = ngram.toString().toLowerCase();

        for (NamedEntity entity : entities) {
            if (loweredGram.equals(entity.namedEntity)) {
                ngram.matchesNER = true;
                // One match is enough; further entities cannot change the flag.
                break;
            }
        }
    }
}
public LinkedHashMap<Integer, Vector<Ngram>> getNgramTfidf( LinkedHashMap<Integer, LinkedHashMap<String, Ngram>> linkedHashMap) { LinkedHashMap<Integer, Vector<Ngram>> freq = new LinkedHashMap<Integer, Vector<Ngram>>(); for (int gram : linkedHashMap.keySet()) { LinkedHashMap<String, Ngram> orderedGrams = linkedHashMap.get(gram); // LinkedHashMap<String, Ngram> ngram = linkedHashMap.get(gram); // for (Ngram grams : ngram) { // String key = grams.toString(); // if (orderedGrams.containsKey(key)) { // Ngram temp = orderedGrams.get(key); // temp.frequency += 1; // orderedGrams.put(key, temp); // } else { // orderedGrams.put(key, grams); // } // } // Only used of we are running combined centroids (= more than one // file at the time) double freqDivideBy = wi.getNumberOfFilesInCluster(); if (freqDivideBy > 1) { for (Ngram grams : orderedGrams.values()) { double number = grams.frequency; number = (number / freqDivideBy); grams.frequency = number; } } loadtfidf(orderedGrams); if (config.prop.containsKey("ngramSort") && config.prop.getProperty("ngramSort").equals("ner")) { loadNER(orderedGrams, NerFactory.getNer(config)); Vector<Ngram> ordered = SortingFactory.getSorter().orderNgramsByNer(orderedGrams); freq.put(gram, ordered); } else { Vector<Ngram> ordered = SortingFactory.getSorter().orderNgramsByDefault(orderedGrams); freq.put(gram, ordered); } } return freq; }