Example no. 1
0
  /**
   * Removes stop words from every document (when semantic search is enabled), accumulates the
   * global word list, and computes IDF and normalized TF for each word in each document.
   * See http://www.tfidf.com/ for the TF-IDF definitions used here.
   *
   * <p>Side effects: mutates each document's word list and the shared {@code words} list.
   */
  public void processDocumentsAndCalculateTFIDF() {
    // Strip stop words first so they never enter the global word list.
    if (isSemanticSearch) {
      for (Document document : documents) {
        document.getWords().removeAll(stopwords);
      }
    }

    // Register every word into the global `words` list; addWordIntoWords presumably
    // accumulates the document frequency there — TODO confirm against its implementation.
    for (Document document : documents) {
      for (Word word : document.getWords()) {
        Word w = new Word(0L, word.getWord(), word.getTypeWord(), 0L, 1D);
        addWordIntoWords(w);
      }
    }

    // Compute IDF and normalized TF for each word in each document.
    for (Document document : documents) {
      for (Word word : document.getWords()) {
        int wordId = words.indexOf(word);
        // idf = log(N / df), where df is read from the global word's tf field.
        double idfWord = Math.log(documents.size() / words.get(wordId).getTf());

        word.setIdf(idfWord);
        // Normalize raw term frequency by the document's word count.
        word.setProcessTF(word.getTf() / document.getWords().size());
        words.get(wordId).setIdf(idfWord);
      }
      // Order the document's words per Word.compareTo (relevance ranking).
      Collections.sort(document.getWords());
    }
  }
Example no. 2
0
  /**
   * Ranks every document against the current query using the vector space model: computes the
   * query's TF-IDF weights, then the cosine similarity between the query and each document, and
   * returns clones of all documents whose similarity is strictly positive.
   *
   * @return relevant documents (possibly empty), each with its cosine similarity set
   */
  public List<Document> proccessCalculateVectorSpaceModel() {
    List<Document> documentRetrievals = new ArrayList<>();
    processQuery();
    List<Word> queryWords = query.getWords();
    if (queryWords == null || queryWords.isEmpty()) {
      return documentRetrievals;
    }

    // After sorting, the first entry carries the maximum raw term frequency
    // (per Word.compareTo — TODO confirm it orders by descending tf).
    Collections.sort(queryWords);
    double maximumFrequency = queryWords.get(0).getTf();

    // Assign TF-IDF weights to every query word that also occurs in the corpus;
    // words unknown to the corpus keep their defaults and contribute nothing below.
    for (Word wordQ : queryWords) {
      int index = words.indexOf(wordQ);
      if (index > -1) {
        wordQ.setIdf(words.get(index).getIdf());
        wordQ.setProcessTF(wordQ.getTf() / maximumFrequency);
      }
    }

    System.out.println("Query:" + queryWords);
    for (Document document : documents) {
      // Dot product of the query and document TF-IDF vectors over their shared words.
      double cosinQD = 0D;
      for (Word wordInQuery : query.getWords()) {
        int idWordInD = document.getWords().indexOf(wordInQuery);
        if (idWordInD > -1) {
          cosinQD += wordInQuery.getTfidf() * document.getWords().get(idWordInD).getTfidf();
        }
      }
      // Guard against zero-length vectors: the unguarded division yielded NaN, which
      // silently failed the > 0 test below; storing an explicit 0 is equivalent and clear.
      double lengthProduct = document.getDocumentLeng() * query.getDocumentLeng();
      document.setCosinWithQuery(lengthProduct == 0D ? 0D : cosinQD / lengthProduct);
      if (document.getCosinWithQuery() > 0) {
        documentRetrievals.add(document.clone());
      }
    }

    return documentRetrievals;
  }