コード例 #1
0
  /**
   * Handles the "dump chunks" menu action: shows the document class dialog and dumps the chunks of
   * the current results that belong to documents with the chosen class.
   *
   * <p>Does nothing when there are no current results, or when the dialog yields no selected
   * class.
   *
   * @param evt the action event that triggered this handler (not used)
   */
  private void jMenuItemDumpChunksActionPerformed(
      java.awt.event.ActionEvent evt) { // GEN-FIRST:event_jMenuItemDumpChunksActionPerformed
    if (currentResults != null) {
      DocumentClassDialog d = DocumentClassDialog.showClassDialog(this, this.theProject);
      // NOTE(review): presumably no class is returned when the dialog is dismissed without a
      // selection - confirm against DocumentClassDialog. Guarding here avoids an NPE either way.
      DocumentClass selectedClass = (d != null) ? d.getSelectedDocumentClass() : null;

      if (selectedClass != null) {
        currentResults.dumpChunksBelongingToDocumentsWithClass(selectedClass.getId());
      }
    }
  } // GEN-LAST:event_jMenuItemDumpChunksActionPerformed
コード例 #2
0
 /**
  * Trains the classifier with one bag of words for the named document class.
  *
  * <p>The bag is merged into {@code termCorpus} and added to the document class stored under
  * {@code docClassName}; a new class is created when none is registered yet. Terms are passed on
  * to {@code docClassFrequencyPerTerm}. Finally {@code trainingFinished} is reset to {@code false}.
  *
  * @param bag the bag of words to train with
  * @param docClassName name of the document class the bag belongs to
  */
 public void train(BagOfWords bag, String docClassName) {
   termCorpus.merge(bag);

   DocumentClass trainedClass = docClasses.get(docClassName);
   final boolean classAlreadyKnown = trainedClass != null;

   if (classAlreadyKnown) {
     // The terms not yet in the class are determined BEFORE the bag is added to the class.
     docClassFrequencyPerTerm.addTerms(bag.termsNotIn(trainedClass.getTerms()));
     trainedClass.add(bag);
     docClasses.put(docClassName, trainedClass);
   } else {
     // First bag for this class: create the class, then hand over all of its terms.
     trainedClass = new DocumentClass(docClassName, keepDetailsOfDocumentsInClasses);
     trainedClass.add(bag);
     docClasses.put(docClassName, trainedClass);
     docClassFrequencyPerTerm.addTerms(trainedClass.getTerms());
   }

   trainingFinished = false;
 }
コード例 #3
0
  /**
   * Computes the cosine similarity between the prepared query bag and a document class.
   *
   * <p>Both vectors are normalized with {@code VectorMath.normlizeVectorEuclideanNorm} first. The
   * class vector is {@code docClass.getWeightedFrequencies(getIDFTermFilter())} (marked "Trial" by
   * the original author).
   *
   * <p>Notes carried over from the original author's experiments (not re-verified here):
   *
   * <ul>
   *   <li>Query side: normalizing {@code queryBag.getFrequencies()} directly, i.e. without the
   *       stop word filter, was noted as giving roughly 25% better precision.
   *   <li>Class side: {@code docClass.getTermFrequencies().getFrequencies()} was only used by
   *       BiggerExample and was noted as about 20% false, better without the stop word filter.
   *   <li>Class side: {@code docClass.getWeightedFrequencies()} without a filter was noted as
   *       giving the best results (about 10-20% false), possibly only with the stop word filter
   *       and on real training data.
   * </ul>
   *
   * @param filteredBag the prepared query vector (term to value)
   * @param docClass the document class to compare against
   * @return the cosine similarity, or {@code null} when an {@code InvalidObjectException} occurred
   *     (its stack trace is printed)
   */
  private Double calculateCosinSimilairty(Map<String, Double> filteredBag, DocumentClass docClass) {
    try {
      return VectorMath.cosineSimilarityEuclideanNorm(
          VectorMath.normlizeVectorEuclideanNorm(filteredBag),
          VectorMath.normlizeVectorEuclideanNorm(
              docClass.getWeightedFrequencies(getIDFTermFilter())));
    } catch (InvalidObjectException invalidVector) {
      invalidVector.printStackTrace();
      return null;
    }
  }
コード例 #4
0
 /**
  * Finishes training by calculating the data needed for classification.
  *
  * <p>Runs {@code calculateDocClassTermIDFs()} and {@code analyzeDocClassTermIDFs()}, requests the
  * weighted frequencies of every document class once, derives {@code maxNumberAllowedTerms} from
  * the corpus size and {@code percentageOfCorpusForAllowedTerms}, and sets {@code
  * trainingFinished}.
  *
  * <p>Design note from the original author: this adds further state to the classifier, which is
  * not really desirable.
  *
  * @throws IllegalStateException if the classifier does not contain any document classes yet
  */
 public void finishTraining() {
   if (docClasses.isEmpty()) {
     throw new IllegalStateException(
         "Can not finish training! The classifier does not contain any document classes yet.");
   }
   calculateDocClassTermIDFs();
   analyzeDocClassTermIDFs();
   // Per the original author: pre-initialize each class's weighted frequencies now so the later
   // probability calculation is faster. The returned value is intentionally ignored.
   for (DocumentClass docClass : docClasses.values()) {
     docClass.getWeightedFrequencies();
   }
   // The product is truncated to int. Integer.valueOf replaces the deprecated Integer(int)
   // constructor.
   maxNumberAllowedTerms =
       Integer.valueOf((int) (getCorpusSize() * percentageOfCorpusForAllowedTerms));
   trainingFinished = true;
 }
コード例 #5
0
  /**
   * Returns, for every document class, its cosine similarity to the given query bag, ordered from
   * the highest to the lowest similarity.
   *
   * <p>The result is only recomputed when {@code bagNotAlreadyAnalyzed(queryBag)} reports so;
   * otherwise the previously stored {@code lastClassificationResult} is returned. The stored
   * result and {@code lastClassifiedBag} are updated only after the new result has been fully
   * computed and sorted, so a failure while sorting cannot leave a half-finished result behind.
   *
   * <p>Note: {@code calculateCosinSimilairty} returns {@code null} when it hits an {@code
   * InvalidObjectException}; with more than one class such a value makes the sort throw a {@code
   * NullPointerException}.
   *
   * @param queryBag the bag of words to classify
   * @return an unmodifiable list of (class name, similarity) entries, best match first
   */
  public List<Entry<String, Double>> getClassificationProbabilities(BagOfWords queryBag) {
    checkFinishedTraining();
    if (bagNotAlreadyAnalyzed(queryBag)) {
      List<Entry<String, Double>> classifications = new ArrayList<>(docClasses.size());

      Map<String, Double> preparedQueryBag = prepareQueryBagOfWords(queryBag);

      // The Euclidean distances formerly computed alongside were never read (the original author
      // noted they do not differ from the cosine ranking), so only the similarity is calculated.
      for (DocumentClass docClass : docClasses.values()) {
        Double cosinSimilarity = calculateCosinSimilairty(preparedQueryBag, docClass);
        classifications.add(new SimpleEntry<>(docClass.getName(), cosinSimilarity));
      }

      // Order results, highest similarity first. Kept inline because it is not yet clear whether
      // more than one analysis method will be used (e.g. KNN plus cosine similarity).
      Collections.sort(
          classifications,
          new Comparator<Entry<String, Double>>() {
            @Override
            public int compare(Entry<String, Double> o1, Entry<String, Double> o2) {
              return o2.getValue().compareTo(o1.getValue());
            }
          });

      // Publish the cached state only once the result is complete.
      lastClassifiedBag = queryBag.hashCode();
      lastClassificationResult = new ArrayList<>(classifications);
    }
    return Collections.unmodifiableList(lastClassificationResult);
  }
コード例 #6
0
 /**
  * Computes {@code VectorMath.distanceEuclideanNorm} between the prepared query bag and a
  * document class.
  *
  * <p>Both vectors are normalized with {@code VectorMath.normlizeVectorEuclideanNorm} first; the
  * class vector is {@code docClass.getWeightedFrequencies(getIDFTermFilter())}.
  *
  * @param filteredBag the prepared query vector (term to value)
  * @param docClass the document class to compare against
  * @return the distance, or {@code null} when any exception occurred (its stack trace is printed)
  */
 private Double calculateDistances(Map<String, Double> filteredBag, DocumentClass docClass) {
   try {
     return VectorMath.distanceEuclideanNorm(
         VectorMath.normlizeVectorEuclideanNorm(filteredBag),
         VectorMath.normlizeVectorEuclideanNorm(
             docClass.getWeightedFrequencies(getIDFTermFilter())));
   } catch (Exception failure) {
     failure.printStackTrace();
     return null;
   }
 }