public void loadtfidf(LinkedHashMap<String, Ngram> orderedGrams) { HashMap<String, Double> idf = ic.getIdfCollection(); for (String s : orderedGrams.keySet()) { // defining size of ngrams to be counted, used with idfs for special // ngram size Ngram temp = orderedGrams.get(s); if (config.prop.containsKey("extractSize")) { if (idf.containsKey(s)) { temp.setSingleTfidf(idf.get(s) * temp.frequency); } else { temp.setSingleTfidf(0.1 * temp.frequency); } } else { // when size of ngram not defined we count each word // separately for (int i = 0; i < temp.gram.length; i++) { if (idf.containsKey(temp.gram[i])) { temp.tfidf[i] += idf.get(temp.gram[i]) * temp.frequency; } else { temp.tfidf[i] += 0.1 * temp.frequency; } } } } }
// TODO: I think I have modified this method public void getHashTagTfidf() { HashMap<String, Double> idf = ic.getIdfCollection(); if (config.prop.get("contentType").equals("twitter")) { if (config.prop.getProperty("ignoreCase").equals("yes")) { hashtags = WeightFactory.getParser(config).calculateHashtags(); for (String word : hashtags.keySet()) { HashTag hash = hashtags.get(word); if (idf.containsKey(word)) { double value = hash.frequency * idf.get(word); if (value > 0) { hash.tfidf = value; hashtags.put(word, hash); } else { hash.tfidf = 0.0; hashtags.put(word, hash); } } else { hash.tfidf = hash.frequency * 0.1; hashtags.put(word, hash); } } // hashentries = SortingFactory.getSorter().sortHashtagsByValue( // hashtags); Vector<MediaInformation> mediaInfo = ContentFactory.getParser(config).getMedia(); HashSet<String> stopwords = ic.getStorwords(); for (MediaInformation m : mediaInfo) { for (int i = 1; i <= m.mediaFreq.size(); i++) { HashMap<String, Centroid> mtfidf = new HashMap<String, Centroid>(); for (String tag : m.mediaFreq.get(i).keySet()) { StringTokenizer st = new StringTokenizer(tag); double value = 0; while (st.hasMoreTokens()) { String token = st.nextToken(); if (idf.containsKey(token)) { if (stopwords.contains(token)) { } else { value += idf.get(token); } } else { if (stopwords.contains(token)) { } else { value += 0.1; } } } String id = ContentFactory.getID(config); Centroid c = new Centroid(id + tag, tag, value); if (m.photoUID != null) { c.setPhotoUID(m.photoUID); } else { c.setPhotoUID("null"); System.out.println("photouid is null for centroid " + tag); } mtfidf.put(tag, c); } m.mediaTfidf.put(i, mtfidf); } } } } }
/** * OLD DEPRECATED METHOD Method for calculating TDIDF, method split up in three parts depending on * if you are using facebook, twitter or plain text extraction */ public void calculateTfidf() { this.tf = new TermFrequency(config.prop.getProperty("combine"), config); tf.calculateTermFrequency(); HashMap<String, Double> freq = tf.getFrequency(); HashMap<String, Double> mediaFreq = tf.getOtherFrequency(); HashMap<String, Double> idf = ic.getIdfCollection(); User userFeedback = null; ContentInterface ci = ContentFactory.getParser(config); if (config.prop.containsKey("feedback") && config.prop.get("contentType").equals("facebook")) { FeedbackInterface fi = DataFactory.getFeedback(config); userFeedback = fi.readContent(ci.getID()); Set<String> keyset = userFeedback.negativeWords.keySet(); if (config.prop.getProperty("ignoreCase").equals("yes")) { for (String word : freq.keySet()) { if (keyset.contains(word)) { tfidf.put(word, 0.0); } else { if (idf.containsKey(word)) { double value = freq.get(word) * idf.get(word); if (value > 0) { tfidf.put(word, value); } else { } } else { double value = freq.get(word) * 0.1; if (value > 0) { tfidf.put(word, value); } else { } } } } } else { for (String word : freq.keySet()) { if (keyset.contains(word)) { tfidf.put(word, 0.0); } else { if (idf.containsKey(word)) { double value = freq.get(word) * idf.get(word); tfidf.put(word, value); } else { double value = freq.get(word) * 0.1; tfidf.put(word, value); } } } } } if (config.prop.get("contentType").equals("twitter")) { if (config.prop.getProperty("ignoreCase").equals("yes")) { for (String word : freq.keySet()) { if (idf.containsKey(word)) { double value = freq.get(word) * idf.get(word); if (value > 0) { tfidf.put(word.toLowerCase(), value); } else { } } } for (String word : mediaFreq.keySet()) { if (idf.containsKey(word)) { double value = mediaFreq.get(word) * (idf.get(word)); if (value > 0) { mediaTfidf.put(word.toLowerCase(), value); } else { } } else { double value = mediaFreq.get(word) * 0.1; tfidf.put(word, value); } } } else { for (String word : freq.keySet()) { if (idf.containsKey(word)) { double value = freq.get(word) * idf.get(word); tfidf.put(word, value); } else { double value = freq.get(word) * 0.1; tfidf.put(word, value); } } for (String word : mediaFreq.keySet()) { if (idf.containsKey(word)) { double value = mediaFreq.get(word) * (idf.get(word)); mediaTfidf.put(word, value); } else { double value = freq.get(word) * 0.1; mediaTfidf.put(word, value); } } } hashtags = WeightFactory.getParser(config).calculateHashtags(); for (String word : hashtags.keySet()) { HashTag hash = hashtags.get(word); if (idf.containsKey(word)) { double value = hash.frequency * idf.get(word); if (value > 0) { hash.tfidf = value; hashtags.put(word, hash); } else { } } else { hash.tfidf = hash.frequency * 0.1; hashtags.put(word, hash); } } // hashentries = SortingFactory.getSorter().sortHashtagsByValue( // hashtags); Vector<MediaInformation> mediaInfo = ContentFactory.getParser(config).getMedia(); HashSet<String> stopwords = ic.getStorwords(); for (MediaInformation m : mediaInfo) { for (int i = 1; i <= m.mediaFreq.size(); i++) { HashMap<String, Centroid> mtfidf = new HashMap<String, Centroid>(); for (String tag : m.mediaFreq.get(i).keySet()) { StringTokenizer st = new StringTokenizer(tag); double value = 0; while (st.hasMoreTokens()) { String token = st.nextToken(); if (idf.containsKey(token)) { if (stopwords.contains(token)) { } else { value += idf.get(token); } } else { if (stopwords.contains(token)) { } else { value += 0.1; } } } String id = ContentFactory.getID(config); Centroid c = new Centroid(id + tag, tag, value); if (m.photoUID != null) { c.setPhotoUID(m.photoUID); } else { c.setPhotoUID("null"); } mtfidf.put(tag, c); } m.mediaTfidf.put(i, mtfidf); } } } else { if (config.prop.getProperty("ignoreCase").equals("yes")) { for (String word : freq.keySet()) { if (idf.containsKey(word)) { double value = freq.get(word) * idf.get(word); if (value > 0) { tfidf.put(word, value); } else { } } else { double value = freq.get(word) * 0.1; if (value > 0) { tfidf.put(word, value); } else { } } } } else { for (String word : freq.keySet()) { if (idf.containsKey(word)) { double value = freq.get(word) * idf.get(word); tfidf.put(word, value); } else { double value = freq.get(word) * 0.1; tfidf.put(word, value); } } } } }