/**
 * Fills in the document-frequency inputs of the given TFIDF entry.
 *
 * <p>Walks every entry of the {@code documents} field and counts those that contain the entry's
 * term. The match is {@link String#contains}, i.e. a case-sensitive substring test, so a term
 * also matches when it appears inside a longer word.
 *
 * @param input entry whose term is read via {@code getTerms()} and which receives the
 *     {@code totalDocument} field value and the number of matching documents
 */
public void countIDfVariable(TFIDF input) {
  String term = input.getTerms();

  int matchingDocuments = 0;
  for (String candidate : documents) {
    matchingDocuments += candidate.contains(term) ? 1 : 0;
  }

  input.setTotalDocument(totalDocument);
  input.setNumberDocumentWithTerms(matchingDocuments);
}
public void countTfVariable(TFIDF input) { int totalOccurences = 0; // tf String terms = input.getTerms(); for (String word : words) { if (word.equals(terms)) { totalOccurences++; } } input.setTotalOccurences(totalOccurences); input.setTotalTermsInDocument(totalTermsInDocument); }
public List<String> summarize(String page) throws Exception { long x = System.currentTimeMillis(); // ekstrak teks dari web page String text = extractor.extract(page); System.out.println("Retrieve Text :" + (System.currentTimeMillis() - x) + " milisecond"); long y = System.currentTimeMillis(); // clean text text = cleaner.clean(text); System.out.println("Clean Text :" + (System.currentTimeMillis() - y) + " milisecond"); // buat daftar nama terms long z = System.currentTimeMillis(); List<String> term = keyword.get(text); System.out.println( "Generate Terms :" + (System.currentTimeMillis() - z) + " milisecond (" + term.size() + " term)"); // inisialisasi TFIDF long q = System.currentTimeMillis(); List<TFIDF> idfList = new ArrayList<TFIDF>(); int i = 0; // Isi kan tiap tiap Term dan hitung TFIDFnya counter.setText(text); for (String word : term) { TFIDF tfidf = new TFIDF(); tfidf.setTerms(word); counter.countIDfVariable(tfidf); counter.countTfVariable(tfidf); idfList.add(tfidf); } System.out.println("Count IDF :" + (System.currentTimeMillis() - q) + " milisecond"); long v = System.currentTimeMillis(); // urutkan berdasarkan size TFIDF Collections.sort( idfList, new Comparator<TFIDF>() { @Override public int compare(TFIDF o1, TFIDF o2) { return o2.getValue().compareTo(o1.getValue()); } }); // delete NaN Value for (Iterator<TFIDF> itr = idfList.iterator(); itr.hasNext(); ) { TFIDF element = itr.next(); if (element.getValue().isNaN()) { itr.remove(); } } // ekstrak 30 teratas List<String> result = new ArrayList<String>(); for (TFIDF tfidf : getTopTerm(idfList)) { result.add(tfidf.getTerms()); } // return System.out.println("Sorting dan seleksi :" + (System.currentTimeMillis() - v) + " milisecond"); System.out.println( "\n Waktu proses total :" + (System.currentTimeMillis() - x) + " milisecond"); return result; }