/**
 * Re-weights the current TF-IDF map with significance factors supplied by the
 * configured {@code WeightInterface}.
 *
 * <p>Two independent boosts are applied, each gated by a config property:
 * <ul>
 *   <li>{@code dateLimit} set (and {@code cutoff} equals {@code "linear"}):
 *       date-significance words</li>
 *   <li>{@code counterLimit} set: counter-significance words</li>
 * </ul>
 * Each word present in both the significance map and {@code tfidf} has its
 * weight multiplied by {@code (significance + 1)}.
 */
@Override
public void modifyWeight() {
    WeightInterface cI = WeightFactory.getParser(config);
    String key = (String) config.prop.get("dateLimit");
    // Constant-first equals is null-safe: the original called equals() on the
    // possibly-absent "cutoff" property, which would throw an NPE.
    if (key != null && !key.isEmpty() && "linear".equals(config.prop.get("cutoff"))) {
        applySignificance(cI.findDateSignificanceWords());
    }
    key = (String) config.prop.get("counterLimit");
    if (key != null && !key.isEmpty()) {
        // Bug fix: the original iterated this result without the null/empty
        // check the date branch has, and would NPE when the parser returns null.
        applySignificance(cI.findCounterSignificanceWords());
    }
}

/**
 * Multiplies each {@code tfidf} entry that also appears in {@code significant}
 * by {@code (factor + 1)}. Null or empty maps are ignored.
 */
private void applySignificance(HashMap<String, Double> significant) {
    if (significant == null || significant.isEmpty()) {
        return;
    }
    for (String word : significant.keySet()) {
        if (tfidf.containsKey(word)) {
            tfidf.put(word, tfidf.get(word) * (significant.get(word) + 1));
        }
    }
}
/**
 * Builds the calculator: resolves the weight parser and the IDF collection for
 * the given configuration, then immediately computes all TF-IDF values.
 *
 * @param config application configuration, kept for use by the other methods
 */
public TFIDFCalculator(Config config) {
    wi = WeightFactory.getParser(config);
    this.config = config;
    this.ic = TFIDFFactory.getIc(config);
    // calculateTfidf(); -- old deprecated pipeline, intentionally disabled in
    // favor of calculateAllTfidf() below.
    calculateAllTfidf();
}
// TODO: I think I have modified this method public void getHashTagTfidf() { HashMap<String, Double> idf = ic.getIdfCollection(); if (config.prop.get("contentType").equals("twitter")) { if (config.prop.getProperty("ignoreCase").equals("yes")) { hashtags = WeightFactory.getParser(config).calculateHashtags(); for (String word : hashtags.keySet()) { HashTag hash = hashtags.get(word); if (idf.containsKey(word)) { double value = hash.frequency * idf.get(word); if (value > 0) { hash.tfidf = value; hashtags.put(word, hash); } else { hash.tfidf = 0.0; hashtags.put(word, hash); } } else { hash.tfidf = hash.frequency * 0.1; hashtags.put(word, hash); } } // hashentries = SortingFactory.getSorter().sortHashtagsByValue( // hashtags); Vector<MediaInformation> mediaInfo = ContentFactory.getParser(config).getMedia(); HashSet<String> stopwords = ic.getStorwords(); for (MediaInformation m : mediaInfo) { for (int i = 1; i <= m.mediaFreq.size(); i++) { HashMap<String, Centroid> mtfidf = new HashMap<String, Centroid>(); for (String tag : m.mediaFreq.get(i).keySet()) { StringTokenizer st = new StringTokenizer(tag); double value = 0; while (st.hasMoreTokens()) { String token = st.nextToken(); if (idf.containsKey(token)) { if (stopwords.contains(token)) { } else { value += idf.get(token); } } else { if (stopwords.contains(token)) { } else { value += 0.1; } } } String id = ContentFactory.getID(config); Centroid c = new Centroid(id + tag, tag, value); if (m.photoUID != null) { c.setPhotoUID(m.photoUID); } else { c.setPhotoUID("null"); System.out.println("photouid is null for centroid " + tag); } mtfidf.put(tag, c); } m.mediaTfidf.put(i, mtfidf); } } } } }
/**
 * OLD DEPRECATED METHOD — superseded by {@code calculateAllTfidf()}; kept for
 * reference only.
 *
 * <p>Calculates TF-IDF, split into three parts depending on whether the
 * content source is Facebook, Twitter, or plain-text extraction. Words without
 * a known idf use a 0.1 fallback idf; the Facebook path additionally zeroes
 * out user-flagged negative words.
 *
 * @deprecated use the new calculation pipeline instead of this method.
 */
public void calculateTfidf() {
    this.tf = new TermFrequency(config.prop.getProperty("combine"), config);
    tf.calculateTermFrequency();
    HashMap<String, Double> freq = tf.getFrequency();
    HashMap<String, Double> mediaFreq = tf.getOtherFrequency();
    HashMap<String, Double> idf = ic.getIdfCollection();
    User userFeedback = null;
    ContentInterface ci = ContentFactory.getParser(config);
    // --- Facebook path: apply user feedback, zeroing negative words. ---
    if (config.prop.containsKey("feedback") && config.prop.get("contentType").equals("facebook")) {
        FeedbackInterface fi = DataFactory.getFeedback(config);
        userFeedback = fi.readContent(ci.getID());
        Set<String> keyset = userFeedback.negativeWords.keySet();
        if (config.prop.getProperty("ignoreCase").equals("yes")) {
            for (String word : freq.keySet()) {
                if (keyset.contains(word)) {
                    // Negatively-rated word: force its weight to zero.
                    tfidf.put(word, 0.0);
                } else {
                    if (idf.containsKey(word)) {
                        double value = freq.get(word) * idf.get(word);
                        if (value > 0) {
                            tfidf.put(word, value);
                        } else {
                            // Non-positive values are deliberately dropped.
                        }
                    } else {
                        double value = freq.get(word) * 0.1;
                        if (value > 0) {
                            tfidf.put(word, value);
                        } else {
                        }
                    }
                }
            }
        } else {
            for (String word : freq.keySet()) {
                if (keyset.contains(word)) {
                    tfidf.put(word, 0.0);
                } else {
                    if (idf.containsKey(word)) {
                        double value = freq.get(word) * idf.get(word);
                        tfidf.put(word, value);
                    } else {
                        double value = freq.get(word) * 0.1;
                        tfidf.put(word, value);
                    }
                }
            }
        }
    }
    // --- Twitter path (else: plain text / other sources below). ---
    if (config.prop.get("contentType").equals("twitter")) {
        if (config.prop.getProperty("ignoreCase").equals("yes")) {
            // NOTE(review): unlike the other branches, words missing from idf
            // get NO 0.1 fallback here — they are silently skipped. Confirm
            // whether that asymmetry is intentional.
            for (String word : freq.keySet()) {
                if (idf.containsKey(word)) {
                    double value = freq.get(word) * idf.get(word);
                    if (value > 0) {
                        tfidf.put(word.toLowerCase(), value);
                    } else {
                    }
                }
            }
            for (String word : mediaFreq.keySet()) {
                if (idf.containsKey(word)) {
                    double value = mediaFreq.get(word) * (idf.get(word));
                    if (value > 0) {
                        mediaTfidf.put(word.toLowerCase(), value);
                    } else {
                    }
                } else {
                    double value = 
mediaFreq.get(word) * 0.1;
                    // NOTE(review): writes to tfidf here but mediaTfidf in the
                    // sibling branches — looks like a copy/paste bug; method is
                    // deprecated, so flagged rather than changed.
                    tfidf.put(word, value);
                }
            }
        } else {
            for (String word : freq.keySet()) {
                if (idf.containsKey(word)) {
                    double value = freq.get(word) * idf.get(word);
                    tfidf.put(word, value);
                } else {
                    double value = freq.get(word) * 0.1;
                    tfidf.put(word, value);
                }
            }
            for (String word : mediaFreq.keySet()) {
                if (idf.containsKey(word)) {
                    double value = mediaFreq.get(word) * (idf.get(word));
                    mediaTfidf.put(word, value);
                } else {
                    // NOTE(review): multiplies freq, not mediaFreq — likely a
                    // bug mirroring the one above; confirm before reusing.
                    double value = freq.get(word) * 0.1;
                    mediaTfidf.put(word, value);
                }
            }
        }
        // Hashtag TF-IDF: frequency * idf, with 0.1 fallback idf.
        hashtags = WeightFactory.getParser(config).calculateHashtags();
        for (String word : hashtags.keySet()) {
            HashTag hash = hashtags.get(word);
            if (idf.containsKey(word)) {
                double value = hash.frequency * idf.get(word);
                if (value > 0) {
                    hash.tfidf = value;
                    hashtags.put(word, hash);
                } else {
                }
            } else {
                hash.tfidf = hash.frequency * 0.1;
                hashtags.put(word, hash);
            }
        }
        // hashentries = SortingFactory.getSorter().sortHashtagsByValue(
        // hashtags);
        // Per-media centroids: one Centroid per tag, scored by summed token idf
        // (0.1 fallback) over non-stopword tokens.
        Vector<MediaInformation> mediaInfo = ContentFactory.getParser(config).getMedia();
        HashSet<String> stopwords = ic.getStorwords();
        for (MediaInformation m : mediaInfo) {
            // mediaFreq is keyed 1..size (1-based slots).
            for (int i = 1; i <= m.mediaFreq.size(); i++) {
                HashMap<String, Centroid> mtfidf = new HashMap<String, Centroid>();
                for (String tag : m.mediaFreq.get(i).keySet()) {
                    StringTokenizer st = new StringTokenizer(tag);
                    double value = 0;
                    while (st.hasMoreTokens()) {
                        String token = st.nextToken();
                        if (idf.containsKey(token)) {
                            if (stopwords.contains(token)) {
                                // Stopwords contribute nothing.
                            } else {
                                value += idf.get(token);
                            }
                        } else {
                            if (stopwords.contains(token)) {
                            } else {
                                value += 0.1;
                            }
                        }
                    }
                    String id = ContentFactory.getID(config);
                    Centroid c = new Centroid(id + tag, tag, value);
                    if (m.photoUID != null) {
                        c.setPhotoUID(m.photoUID);
                    } else {
                        c.setPhotoUID("null");
                    }
                    mtfidf.put(tag, c);
                }
                m.mediaTfidf.put(i, mtfidf);
            }
        }
    } else {
        // --- Plain-text / other sources. ---
        if (config.prop.getProperty("ignoreCase").equals("yes")) {
            for (String word : freq.keySet()) {
                if (idf.containsKey(word)) {
                    double value = freq.get(word) * idf.get(word);
                    if 
(value > 0) {
                        tfidf.put(word, value);
                    } else {
                    }
                } else {
                    double value = freq.get(word) * 0.1;
                    if (value > 0) {
                        tfidf.put(word, value);
                    } else {
                    }
                }
            }
        } else {
            for (String word : freq.keySet()) {
                if (idf.containsKey(word)) {
                    double value = freq.get(word) * idf.get(word);
                    tfidf.put(word, value);
                } else {
                    double value = freq.get(word) * 0.1;
                    tfidf.put(word, value);
                }
            }
        }
    }
}