/**
   * Passes every instance of every given interview to {@code tfidf.update}, then asks
   * {@code tfidf} to save its configuration to the given path.
   *
   * @param interviews interviews whose instances are fed to {@code tfidf.update}
   * @param file_path path handed unchanged to {@code tfidf.saveConfiguration}
   */
  public void createConfiguration(ArrayList<Interview> interviews, String file_path) {
    for (Interview current : interviews) {
      for (InterviewInstance sample : current.getInstances()) {
        tfidf.update(sample);
      }
    }

    // persist only after all instances have been processed
    tfidf.saveConfiguration(file_path);
  }
Ejemplo n.º 2
0
 /**
  * Counts how many entries of {@code words} are equal to the term held by {@code input}
  * and stores that count on {@code input}, together with {@code totalTermsInDocument}.
  *
  * @param input holder whose term is read via {@code getTerms()} and whose
  *     occurrence count and total-terms value are set
  */
 public void countTfVariable(TFIDF input) {
   final String target = input.getTerms();
   int matches = 0; // tf: number of exact matches
   for (String candidate : words) {
     if (!candidate.equals(target)) {
       continue;
     }
     matches++;
   }
   input.setTotalOccurences(matches);
   input.setTotalTermsInDocument(totalTermsInDocument);
 }
Ejemplo n.º 3
0
 /**
  * Counts how many entries of {@code documents} contain the term held by {@code input}
  * (substring match via {@code String.contains}) and stores that count on {@code input},
  * together with {@code totalDocument}.
  *
  * @param input holder whose term is read via {@code getTerms()} and whose
  *     document counters are set
  */
 public void countIDfVariable(TFIDF input) {
   final String target = input.getTerms();
   int hits = 0;
   for (String candidate : documents) {
     if (!candidate.contains(target)) {
       continue;
     }
     hits++;
   }
   input.setTotalDocument(totalDocument);
   input.setNumberDocumentWithTerms(hits);
 }
Ejemplo n.º 4
0
  /**
   * Indexes the topic file at the given path via {@code indexFile()}, replaces every value
   * in {@code topicWords} with {@code log(1 + value)}, and passes the rescaled map to
   * {@code tfidf.createIFIDFForTopic} under the file's name.
   *
   * <p>NOTE(review): the earlier comment also mentioned {@code indexConcatLines()}; that
   * call is not visible in this method — presumably it happens inside {@code indexFile()},
   * TODO confirm.
   *
   * @param pathTopicFile path of the topic file; the text after the last {@code '/'} is
   *     used as the topic name (the whole string when it contains no {@code '/'})
   * @throws IOException declared by this method; presumably raised by {@code indexFile()}
   *     — TODO confirm
   */
  public void searchTopicFile(String pathTopicFile) throws IOException {
    String topicName = pathTopicFile.substring(pathTopicFile.lastIndexOf('/') + 1);
    File topic = new File(pathTopicFile);
    indexFile(topic, false);

    // use logarithmic occurrences; the value is rewritten in place on each map entry
    for (Map.Entry<String, Double> entry : topicWords.entrySet()) {
      entry.setValue(Math.log(1d + entry.getValue()));
    }

    tfidf.createIFIDFForTopic(topicWords, topicName);
  }
 /**
  * Delegates to {@code tfidf.loadConfiguration} with the given path.
  *
  * @param file_path path forwarded unchanged to {@code tfidf.loadConfiguration}
  * @throws java.io.IOException declared by this method; presumably raised by the delegate
  *     when the configuration cannot be read — TODO confirm
  */
 public void loadConfiguration(String file_path) throws java.io.IOException {
   tfidf.loadConfiguration(file_path);
 }
Ejemplo n.º 6
0
  /**
   * Produces the highest-ranked terms for a page.
   *
   * <p>Steps, each timed and reported on standard output: extract text with
   * {@code extractor}, clean it with {@code cleaner}, obtain candidate terms from
   * {@code keyword}, fill the IDF and TF counters of one {@code TFIDF} per term through
   * {@code counter}, drop entries whose value is NaN, sort the rest by descending value,
   * and collect the terms of whatever {@code getTopTerm} selects.
   *
   * @param page input handed to {@code extractor.extract}
   * @return the terms of the entries returned by {@code getTopTerm}, in that order
   * @throws Exception declared by this method; presumably propagated from the
   *     extractor/cleaner/keyword collaborators — TODO confirm
   */
  public List<String> summarize(String page) throws Exception {
    long x = System.currentTimeMillis();
    // extract the text from the web page
    String text = extractor.extract(page);
    System.out.println("Retrieve Text :" + (System.currentTimeMillis() - x) + " milisecond");

    long y = System.currentTimeMillis();
    // clean text
    text = cleaner.clean(text);
    System.out.println("Clean Text :" + (System.currentTimeMillis() - y) + " milisecond");

    // build the list of candidate terms
    long z = System.currentTimeMillis();
    List<String> term = keyword.get(text);
    System.out.println(
        "Generate Terms :"
            + (System.currentTimeMillis() - z)
            + " milisecond ("
            + term.size()
            + " term)");

    // initialise the TFIDF entries
    long q = System.currentTimeMillis();
    List<TFIDF> idfList = new ArrayList<TFIDF>();
    // fill in each term and compute its TF/IDF counters
    counter.setText(text);
    for (String word : term) {
      TFIDF scored = new TFIDF();
      scored.setTerms(word);
      counter.countIDfVariable(scored);
      counter.countTfVariable(scored);
      idfList.add(scored);
    }
    System.out.println("Count IDF :" + (System.currentTimeMillis() - q) + " milisecond");

    long v = System.currentTimeMillis();

    // Drop NaN values before sorting: Collections.sort is stable, so the order of the
    // remaining entries is the same as sorting first and filtering afterwards, but fewer
    // elements have to be compared.
    for (Iterator<TFIDF> itr = idfList.iterator(); itr.hasNext(); ) {
      TFIDF element = itr.next();
      if (element.getValue().isNaN()) {
        itr.remove();
      }
    }
    // sort by TFIDF value, largest first
    Collections.sort(
        idfList,
        new Comparator<TFIDF>() {
          @Override
          public int compare(TFIDF o1, TFIDF o2) {
            return o2.getValue().compareTo(o1.getValue());
          }
        });
    // take the top entries (the original note says the top 30; the actual limit is
    // decided inside getTopTerm)
    List<String> result = new ArrayList<String>();
    for (TFIDF top : getTopTerm(idfList)) {
      result.add(top.getTerms());
    }
    System.out.println("Sorting dan seleksi :" + (System.currentTimeMillis() - v) + " milisecond");

    System.out.println(
        "\n Waktu proses total :" + (System.currentTimeMillis() - x) + " milisecond");

    return result;
  }