예제 #1
0
  /**
   * Assigns TF-IDF weights to the hashtags and to the media tags of the current content.
   *
   * <p>After fetching the IDF collection, the method returns without further work unless the
   * configuration property {@code contentType} equals {@code "twitter"} and {@code ignoreCase}
   * equals {@code "yes"}.
   *
   * <p>Each hashtag receives {@code frequency * idf} (clamped to {@code 0.0} when that product is
   * not positive), or {@code frequency * 0.1} when the IDF collection has no entry for it. Each
   * media tag is split into tokens; the IDF weights of its non-stop-word tokens are summed (0.1
   * per token unknown to the IDF collection) and stored as a {@code Centroid} in the media item's
   * {@code mediaTfidf} map. Despite the "get" prefix, nothing is returned; results are written to
   * {@code hashtags} and to each media item.
   */
  public void getHashTagTfidf() {
    // TODO: the original author suspected this method had been modified; verify against callers.
    HashMap<String, Double> idfByTerm = ic.getIdfCollection();

    boolean twitterContent = config.prop.get("contentType").equals("twitter");
    if (!twitterContent) {
      return;
    }
    if (!config.prop.getProperty("ignoreCase").equals("yes")) {
      return;
    }

    // Hashtag weights.
    hashtags = WeightFactory.getParser(config).calculateHashtags();
    for (String term : hashtags.keySet()) {
      HashTag tagEntry = hashtags.get(term);
      if (!idfByTerm.containsKey(term)) {
        // Term unknown to the IDF collection: fall back to a fixed factor of 0.1.
        tagEntry.tfidf = tagEntry.frequency * 0.1;
      } else {
        double weighted = tagEntry.frequency * idfByTerm.get(term);
        tagEntry.tfidf = weighted > 0 ? weighted : 0.0;
      }
      hashtags.put(term, tagEntry);
    }
    // hashentries = SortingFactory.getSorter().sortHashtagsByValue(
    // hashtags);

    // Media tag weights, one centroid per tag.
    Vector<MediaInformation> mediaItems = ContentFactory.getParser(config).getMedia();
    HashSet<String> stopwords = ic.getStorwords();
    for (MediaInformation media : mediaItems) {
      // mediaFreq is read with the keys 1..size().
      for (int slot = 1; slot <= media.mediaFreq.size(); slot++) {
        HashMap<String, Centroid> centroidsByTag = new HashMap<String, Centroid>();
        for (String tag : media.mediaFreq.get(slot).keySet()) {
          double score = 0;
          StringTokenizer words = new StringTokenizer(tag);
          while (words.hasMoreTokens()) {
            String token = words.nextToken();
            if (stopwords.contains(token)) {
              // Stop words contribute nothing to the tag's score.
              continue;
            }
            score += idfByTerm.containsKey(token) ? idfByTerm.get(token) : 0.1;
          }

          Centroid centroid = new Centroid(ContentFactory.getID(config) + tag, tag, score);
          if (media.photoUID == null) {
            centroid.setPhotoUID("null");
            System.out.println("photouid is null for centroid " + tag);
          } else {
            centroid.setPhotoUID(media.photoUID);
          }
          centroidsByTag.put(tag, centroid);
        }
        media.mediaTfidf.put(slot, centroidsByTag);
      }
    }
  }
예제 #2
0
  /**
   * Old, deprecated method for calculating TF-IDF weights. Exactly one of three parts runs,
   * depending on the configuration:
   *
   * <ul>
   *   <li><b>facebook with feedback</b> ({@code feedback} property present and {@code contentType}
   *       equal to {@code "facebook"}): words found in the user's negative feedback get weight
   *       {@code 0.0}; every other word gets {@code frequency * idf}.
   *   <li><b>twitter</b>: weights terms and media terms, then hashtags and per-media centroids.
   *   <li><b>anything else</b> (plain text): every word gets {@code frequency * idf}.
   * </ul>
   *
   * <p>A term missing from the IDF collection is weighted with a fallback factor of 0.1. When
   * {@code ignoreCase} is {@code "yes"}, weights that are not positive are not stored. A missing
   * {@code contentType} or {@code ignoreCase} property selects the plain-text / case-sensitive
   * path instead of raising a {@code NullPointerException}.
   *
   * <p>Results are written to the {@code tfidf}, {@code mediaTfidf} and {@code hashtags} fields
   * and to the {@code mediaTfidf} map of every media item; nothing is returned.
   */
  public void calculateTfidf() {
    this.tf = new TermFrequency(config.prop.getProperty("combine"), config);
    tf.calculateTermFrequency();
    HashMap<String, Double> freq = tf.getFrequency();
    HashMap<String, Double> mediaFreq = tf.getOtherFrequency();
    HashMap<String, Double> idf = ic.getIdfCollection();
    ContentInterface ci = ContentFactory.getParser(config);

    // Constant-first equals keeps a missing property from throwing.
    Object contentType = config.prop.get("contentType");
    boolean ignoreCase = "yes".equals(config.prop.getProperty("ignoreCase"));

    if (config.prop.containsKey("feedback") && "facebook".equals(contentType)) {
      FeedbackInterface fi = DataFactory.getFeedback(config);
      User userFeedback = fi.readContent(ci.getID());
      Set<String> negativeWords = userFeedback.negativeWords.keySet();
      for (String word : freq.keySet()) {
        if (negativeWords.contains(word)) {
          tfidf.put(word, 0.0);
          continue;
        }
        double value = freq.get(word) * idfOrFallback(idf, word);
        if (!ignoreCase || value > 0) {
          tfidf.put(word, value);
        }
      }
    } else if ("twitter".equals(contentType)) {
      // "else if": the three parts are alternatives, so the plain-text pass below must not run
      // after the facebook pass and overwrite the zeroed negative-feedback words.
      if (ignoreCase) {
        for (String word : freq.keySet()) {
          // NOTE(review): only here are terms missing from the IDF collection skipped instead of
          // given the 0.1 fallback; kept as found - confirm whether that is intended.
          if (idf.containsKey(word)) {
            double value = freq.get(word) * idf.get(word);
            if (value > 0) {
              tfidf.put(word.toLowerCase(), value);
            }
          }
        }
        for (String word : mediaFreq.keySet()) {
          double value = mediaFreq.get(word) * idfOrFallback(idf, word);
          if (idf.containsKey(word)) {
            if (value > 0) {
              mediaTfidf.put(word.toLowerCase(), value);
            }
          } else {
            // Media weights belong in mediaTfidf, keyed like the other entries of this loop.
            mediaTfidf.put(word.toLowerCase(), value);
          }
        }
      } else {
        for (String word : freq.keySet()) {
          tfidf.put(word, freq.get(word) * idfOrFallback(idf, word));
        }
        for (String word : mediaFreq.keySet()) {
          // The frequency comes from mediaFreq, the map being iterated; the word is not
          // guaranteed to exist in freq.
          mediaTfidf.put(word, mediaFreq.get(word) * idfOrFallback(idf, word));
        }
      }
      scoreHashtags(idf);
      // hashentries = SortingFactory.getSorter().sortHashtagsByValue(
      // hashtags);
      scoreMediaCentroids(idf);
    } else {
      for (String word : freq.keySet()) {
        double value = freq.get(word) * idfOrFallback(idf, word);
        if (!ignoreCase || value > 0) {
          tfidf.put(word, value);
        }
      }
    }
  }

  /** Returns the IDF weight stored for {@code term}, or 0.1 when the collection has no entry. */
  private double idfOrFallback(HashMap<String, Double> idf, String term) {
    return idf.containsKey(term) ? idf.get(term) : 0.1;
  }

  /** Recomputes {@code hashtags} and stores a TF-IDF weight on each hashtag. */
  private void scoreHashtags(HashMap<String, Double> idf) {
    hashtags = WeightFactory.getParser(config).calculateHashtags();
    for (String word : hashtags.keySet()) {
      HashTag hash = hashtags.get(word);
      double value = hash.frequency * idfOrFallback(idf, word);
      // A hashtag known to the IDF collection whose weight is not positive keeps its old tfidf.
      if (!idf.containsKey(word) || value > 0) {
        hash.tfidf = value;
        hashtags.put(word, hash);
      }
    }
  }

  /** Builds one {@code Centroid} per media tag, scored by the summed IDF of its non-stop-word tokens. */
  private void scoreMediaCentroids(HashMap<String, Double> idf) {
    Vector<MediaInformation> mediaInfo = ContentFactory.getParser(config).getMedia();
    HashSet<String> stopwords = ic.getStorwords();
    for (MediaInformation m : mediaInfo) {
      // mediaFreq is read with the keys 1..size().
      for (int i = 1; i <= m.mediaFreq.size(); i++) {
        HashMap<String, Centroid> mtfidf = new HashMap<String, Centroid>();
        for (String tag : m.mediaFreq.get(i).keySet()) {
          StringTokenizer st = new StringTokenizer(tag);
          double value = 0;
          while (st.hasMoreTokens()) {
            String token = st.nextToken();
            if (!stopwords.contains(token)) {
              value += idfOrFallback(idf, token);
            }
          }

          String id = ContentFactory.getID(config);
          Centroid c = new Centroid(id + tag, tag, value);
          if (m.photoUID != null) {
            c.setPhotoUID(m.photoUID);
          } else {
            c.setPhotoUID("null");
          }
          mtfidf.put(tag, c);
        }
        m.mediaTfidf.put(i, mtfidf);
      }
    }
  }