コード例 #1
0
 /**
  * Rebuilds {@code docClassTermIDFs} from scratch.
  *
  * <p>For every term reported by {@code docClassFrequencyPerTerm}, the stored value is
  * {@code log10(numberOfDocClasses / (1 + frequencyOfTerm))}. Because of the {@code + 1} in the
  * denominator, a frequency of zero cannot cause a division by zero; it also means the value is
  * negative whenever the frequency equals the number of document classes.
  */
 private void calculateDocClassTermIDFs() {
   docClassTermIDFs = new HashMap<>();
   for (String currentTerm : docClassFrequencyPerTerm.getTerms()) {
     Integer termFrequency = docClassFrequencyPerTerm.getFrequency(currentTerm);
     int classCount = docClasses.size();
     double smoothedFrequency = 1 + termFrequency.doubleValue();
     // int / double: the division is carried out in floating point.
     double inverseFraction = classCount / smoothedFrequency;
     docClassTermIDFs.put(currentTerm, Math.log10(inverseFraction));
   }
 }
コード例 #2
0
 /**
  * Weights the term frequencies of a query bag with the IDF values held in
  * {@code docClassTermIDFs}.
  *
  * <p>Each term of the bag is mapped to {@code idf * frequency}. A term that has no entry in
  * {@code docClassTermIDFs} is mapped to {@code 0}.
  *
  * @param queryBag the bag whose term frequencies are weighted
  * @return a newly created map from term to weighted frequency
  */
 private Map<String, Double> weightBagOfWordsWithIDF(BagOfWords queryBag) {
   Map<String, Double> weightsByTerm = new HashMap<>();
   for (Entry<String, Integer> termCount : queryBag.getFrequencies().entrySet()) {
     String term = termCount.getKey();
     Double idf = docClassTermIDFs.get(term);
     double weight;
     if (idf == null) {
       // Term has no IDF entry: it contributes nothing.
       weight = 0d;
     } else {
       weight = idf * queryBag.getFrequency(term);
     }
     weightsByTerm.put(term, weight);
   }
   return weightsByTerm;
 }
コード例 #3
0
 /**
  * Adds one training bag to the document class with the given name, creating the class if it
  * does not exist yet.
  *
  * <p>The bag is merged into {@code termCorpus}, added to the class, and
  * {@code docClassFrequencyPerTerm} is given the terms that are new for that class. Afterwards
  * {@code trainingFinished} is reset to {@code false}.
  *
  * @param bag the bag of words to learn from
  * @param docClassName name of the document class the bag belongs to
  */
 public void train(BagOfWords bag, String docClassName) {
   termCorpus.merge(bag);
   DocumentClass knownClass = docClasses.get(docClassName);
   if (knownClass != null) {
     // Diff against the class's terms as they are before this bag is added to it.
     docClassFrequencyPerTerm.addTerms(bag.termsNotIn(knownClass.getTerms()));
     knownClass.add(bag);
     docClasses.put(docClassName, knownClass);
   } else {
     DocumentClass createdClass =
         new DocumentClass(docClassName, keepDetailsOfDocumentsInClasses);
     createdClass.add(bag);
     docClasses.put(docClassName, createdClass);
     // A brand-new class: all of its terms are registered.
     docClassFrequencyPerTerm.addTerms(createdClass.getTerms());
   }
   trainingFinished = false;
 }
コード例 #4
0
  /**
   * Scores the query bag against every trained document class and returns the scores, highest
   * first.
   *
   * <p>The result of the most recent call is cached in {@code lastClassificationResult}, keyed by
   * the hash code of the bag stored in {@code lastClassifiedBag}. When the same hash is seen
   * again, the cached list is returned without recomputation. Note that the cache key is the hash
   * code only, so two different bags with equal hash codes share a cache entry.
   *
   * @param queryBag the bag of words to classify
   * @return an unmodifiable list of (document class name, cosine similarity) entries sorted by
   *     descending similarity
   */
  public List<Entry<String, Double>> getClassificationProbabilities(BagOfWords queryBag) {
    // NOTE(review): defined elsewhere; presumably finalizes or verifies training - confirm.
    checkFinishedTraining();
    // The null check covers a first query whose hash equals the initial cache key; without it
    // Collections.unmodifiableList(null) below would throw a NullPointerException.
    if (null == lastClassificationResult || bagNotAlreadyAnalyzed(queryBag)) {
      Map<String, Double> preparedQueryBag = prepareQueryBagOfWords(queryBag);

      List<Entry<String, Double>> classifications = new ArrayList<>(docClasses.size());
      for (DocumentClass docClass : docClasses.values()) {
        Double cosinSimilarity = calculateCosinSimilairty(preparedQueryBag, docClass);
        classifications.add(new SimpleEntry<>(docClass.getName(), cosinSimilarity));
      }

      // Descending: the most similar document class comes first.
      Collections.sort(
          classifications,
          new Comparator<Entry<String, Double>>() {
            @Override
            public int compare(Entry<String, Double> o1, Entry<String, Double> o2) {
              return o2.getValue().compareTo(o1.getValue());
            }
          });

      // Update the cache only once the result is complete, so a failure above cannot leave the
      // new hash paired with a stale or unsorted list.
      lastClassifiedBag = queryBag.hashCode();
      lastClassificationResult = new ArrayList<>(classifications);
    }
    return Collections.unmodifiableList(lastClassificationResult);
  }
コード例 #5
0
 /**
  * Returns the number of terms reported by {@code termCorpus}.
  *
  * @return the value of {@code termCorpus.getNumberOfTerms()}
  */
 public Integer getCorpusSize() {
   final Integer numberOfTerms = termCorpus.getNumberOfTerms();
   return numberOfTerms;
 }
コード例 #6
0
 /**
  * Tells whether the given bag differs from the one that was classified last.
  *
  * <p>Only hash codes are compared: the bag's {@code hashCode()} against the value stored in
  * {@code lastClassifiedBag}.
  *
  * @param bag the bag about to be classified
  * @return {@code true} if the bag's hash code is not the stored one
  */
 private boolean bagNotAlreadyAnalyzed(BagOfWords bag) {
   final boolean sameHashAsLast = lastClassifiedBag == bag.hashCode();
   return !sameHashAsLast;
 }