Example #1
0
  /**
   * Prepares every document for TF-IDF ranking. See http://www.tfidf.com/ for background.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>When {@code isSemanticSearch} is set, removes {@code stopwords} from each document's
   *       word list.
   *   <li>Passes a fresh copy of every document word to {@code addWordIntoWords}, which maintains
   *       the shared {@code words} list.
   *   <li>For each document word, sets its IDF to {@code ln(documents.size() / tf)}, where
   *       {@code tf} is read from the matching entry in {@code words}; sets its processed TF to
   *       the word's own {@code tf} divided by the number of words in the document; and stores
   *       the same IDF on the entry in {@code words}.
   *   <li>Sorts each document's words by their natural ordering.
   * </ol>
   *
   * <p>Both ratios are computed in floating point so that they are not truncated by integer
   * division.
   */
  public void processDocumentsAndCalculateTFIDF() {
    // Stop words are only stripped in semantic-search mode.
    if (isSemanticSearch) {
      for (Document document : documents) {
        document.getWords().removeAll(stopwords);
      }
    }

    // Register every document word in the shared word list.
    // NOTE(review): presumably addWordIntoWords accumulates a per-word count that getTf() later
    // exposes -- confirm against its implementation.
    for (Document document : documents) {
      for (Word word : document.getWords()) {
        addWordIntoWords(new Word(0L, word.getWord(), word.getTypeWord(), 0L, 1D));
      }
    }

    // Widen to double up front: dividing two integral values would truncate the ratio before
    // Math.log / setProcessTF ever see it.
    final double totalDocuments = documents.size();

    for (Document document : documents) {
      // Invariant for the inner loop; the list is only reordered after the loop finishes.
      final double wordsInDocument = document.getWords().size();

      for (Word word : document.getWords()) {
        // Assumes the word was registered by the loop above, so indexOf does not return -1.
        Word sharedWord = words.get(words.indexOf(word));
        double sharedTf = sharedWord.getTf();
        double idfWord = Math.log(totalDocuments / sharedTf);

        word.setIdf(idfWord);
        double termTf = word.getTf();
        word.setProcessTF(termTf / wordsInDocument);
        sharedWord.setIdf(idfWord);
      }

      Collections.sort(document.getWords());
    }
  }