private void jMenuItemDumpChunksActionPerformed( java.awt.event.ActionEvent evt) { // GEN-FIRST:event_jMenuItemDumpChunksActionPerformed if (currentResults != null) { DocumentClassDialog d = DocumentClassDialog.showClassDialog(this, this.theProject); DocumentClass selectedClass = d.getSelectedDocumentClass(); currentResults.dumpChunksBelongingToDocumentsWithClass(selectedClass.getId()); } } // GEN-LAST:event_jMenuItemDumpChunksActionPerformed
/**
 * Trains the classifier with one bag of words for the document class of the given name.
 *
 * <p>The bag is merged into {@code termCorpus} and added to the matching {@link DocumentClass}
 * (which is created and registered in {@code docClasses} on first use). The terms that are new
 * for that class are reported to {@code docClassFrequencyPerTerm}. Afterwards the classifier is
 * flagged as not finished with training.
 *
 * @param bag the bag of words to learn from
 * @param docClassName name of the document class the bag belongs to
 */
public void train(BagOfWords bag, String docClassName) {
  termCorpus.merge(bag);

  DocumentClass targetClass = docClasses.get(docClassName);
  if (targetClass != null) {
    // Known class: report only the bag's terms the class does not contain yet. This is evaluated
    // before the bag is added - presumably because afterwards none of them would be "new".
    docClassFrequencyPerTerm.addTerms(bag.termsNotIn(targetClass.getTerms()));
    targetClass.add(bag);
    docClasses.put(docClassName, targetClass);
  } else {
    // First bag for this class: create it, then report all of its terms.
    targetClass = new DocumentClass(docClassName, keepDetailsOfDocumentsInClasses);
    targetClass.add(bag);
    docClasses.put(docClassName, targetClass);
    docClassFrequencyPerTerm.addTerms(targetClass.getTerms());
  }

  trainingFinished = false;
}
private Double calculateCosinSimilairty(Map<String, Double> filteredBag, DocumentClass docClass) { Double cosinSimilarity = null; try { cosinSimilarity = VectorMath.cosineSimilarityEuclideanNorm( // VectorMath.normlizeVectorEuclideanNorm(queryBag.getFrequencies()), // Without stopWordFilter much better precision ~+25% better VectorMath.normlizeVectorEuclideanNorm(filteredBag), // Only used by BiggerExample - better result without StopWordFilter! // Result better about 20% false // // VectorMath.normlizeVectorEuclideanNorm(docClass.getTermFrequencies().getFrequencies()) // Used currently with best results // Result better about 10-20% false. Only with stopWordFilter and on real trainings // data?! // VectorMath.normlizeVectorEuclideanNorm(docClass.getWeightedFrequencies()) // Trial VectorMath.normlizeVectorEuclideanNorm( docClass.getWeightedFrequencies(getIDFTermFilter()))); } catch (InvalidObjectException e) { e.printStackTrace(); } return cosinSimilarity; }
// Calculate all data for performance classification. This again adds a state what is not really // good! public void finishTraining() { if (docClasses.isEmpty()) { throw new IllegalStateException( "Can not finish training! The classifier does not contain any document classes yet."); } calculateDocClassTermIDFs(); analyzeDocClassTermIDFs(); // Just to improve the performance of the later probability calculation! Pre-Initialize the // docClasses weighted frequencies! for (DocumentClass docClass : docClasses.values()) { docClass.getWeightedFrequencies(); } maxNumberAllowedTerms = new Integer((int) (getCorpusSize() * percentageOfCorpusForAllowedTerms)); trainingFinished = true; }
public List<Entry<String, Double>> getClassificationProbabilities(BagOfWords queryBag) { checkFinishedTraining(); if (bagNotAlreadyAnalyzed(queryBag)) { List<Entry<String, Double>> classifications = new ArrayList<>(docClasses.size()); // Test only if the distances differ from the cosine similarity?! No they don't! List<Entry<String, Double>> distances = new ArrayList<>(docClasses.size()); Map<String, Double> preparedQueryBag = prepareQueryBagOfWords(queryBag); for (DocumentClass docClass : docClasses.values()) { Double cosinSimilarity = calculateCosinSimilairty(preparedQueryBag, docClass); classifications.add(new SimpleEntry<>(docClass.getName(), cosinSimilarity)); Double distance = calculateDistances(preparedQueryBag, docClass); distances.add(new SimpleEntry<>(docClass.getName(), distance)); } lastClassifiedBag = queryBag.hashCode(); lastClassificationResult = new ArrayList<>(classifications); // Order results. Not in own method while not clear if mor than one analyze method will be // used (ie. KNN plus cosineSimilarity) Collections.sort( lastClassificationResult, new Comparator<Entry<String, Double>>() { @Override public int compare(Entry<String, Double> o1, Entry<String, Double> o2) { return o2.getValue().compareTo(o1.getValue()); } }); // Ascending because the smallest is the most important one! Collections.sort( distances, new Comparator<Entry<String, Double>>() { @Override public int compare(Entry<String, Double> o1, Entry<String, Double> o2) { return o1.getValue().compareTo(o2.getValue()); } }); // System.out.println(lastClassificationResult); // System.out.println(distances); } return Collections.unmodifiableList(lastClassificationResult); }
/**
 * Computes {@code VectorMath.distanceEuclideanNorm} between the prepared query bag and a document
 * class, both normalized with {@code VectorMath.normlizeVectorEuclideanNorm}. The class vector is
 * {@code docClass.getWeightedFrequencies(getIDFTermFilter())}.
 *
 * @param filteredBag term frequencies of the query
 * @param docClass the document class to compare against
 * @return the distance, or {@code null} if any exception occurred (its stack trace is printed)
 */
private Double calculateDistances(Map<String, Double> filteredBag, DocumentClass docClass) {
  try {
    return VectorMath.distanceEuclideanNorm(
        VectorMath.normlizeVectorEuclideanNorm(filteredBag),
        VectorMath.normlizeVectorEuclideanNorm(
            docClass.getWeightedFrequencies(getIDFTermFilter())));
  } catch (Exception ex) {
    ex.printStackTrace();
    return null;
  }
}