/**
 * Rebuilds {@code docClassTermIDFs} from scratch.
 *
 * <p>For every term known to {@code docClassFrequencyPerTerm} the stored value is
 * {@code log10(numberOfDocClasses / (1 + classFrequencyOfTerm))}. The {@code 1 +} in the
 * denominator keeps the division defined for a frequency of zero; it also means the result
 * is negative whenever the frequency equals the number of document classes.
 */
private void calculateDocClassTermIDFs() {
  docClassTermIDFs = new HashMap<>();
  for (String term : docClassFrequencyPerTerm.getTerms()) {
    Integer classesWithTerm = docClassFrequencyPerTerm.getFrequency(term);
    // int / double -> floating-point division, no integer truncation.
    double inverseFraction = docClasses.size() / (1 + classesWithTerm.doubleValue());
    double idf = Math.log10(inverseFraction);
    docClassTermIDFs.put(term, idf);
  }
}
/**
 * Weights each term of the query bag by the IDF value stored for it in
 * {@code docClassTermIDFs}.
 *
 * @param queryBag the bag whose terms and frequencies are weighted
 * @return a new map from term to {@code idf * frequencyInQueryBag}; terms that have no entry
 *     in {@code docClassTermIDFs} are mapped to {@code 0}
 */
private Map<String, Double> weightBagOfWordsWithIDF(BagOfWords queryBag) {
  Map<String, Double> weightedTerms = new HashMap<>();
  for (String term : queryBag.getFrequencies().keySet()) {
    Double idf = docClassTermIDFs.get(term);
    double weight;
    if (idf == null) {
      // No IDF is stored for this term, so it contributes nothing.
      weight = 0d;
    } else {
      weight = idf * queryBag.getFrequency(term);
    }
    weightedTerms.put(term, weight);
  }
  return weightedTerms;
}
/**
 * Adds a training bag to the document class with the given name, creating the class if it
 * does not exist yet.
 *
 * <p>The bag is merged into {@code termCorpus}, {@code docClassFrequencyPerTerm} is updated
 * with the terms that are new to the class, and {@code trainingFinished} is reset to
 * {@code false}.
 *
 * @param bag the bag of words to train with
 * @param docClassName the name of the document class the bag belongs to
 */
public void train(BagOfWords bag, String docClassName) {
  termCorpus.merge(bag);

  DocumentClass knownClass = docClasses.get(docClassName);
  if (knownClass != null) {
    // Register only the terms the class does not contain yet; this has to happen
    // before the bag is added to the class.
    docClassFrequencyPerTerm.addTerms(bag.termsNotIn(knownClass.getTerms()));
    knownClass.add(bag);
    docClasses.put(docClassName, knownClass);
  } else {
    DocumentClass createdClass =
        new DocumentClass(docClassName, keepDetailsOfDocumentsInClasses);
    createdClass.add(bag);
    docClasses.put(docClassName, createdClass);
    // The class is new, so all of its terms are registered.
    docClassFrequencyPerTerm.addTerms(createdClass.getTerms());
  }

  trainingFinished = false;
}
public List<Entry<String, Double>> getClassificationProbabilities(BagOfWords queryBag) { checkFinishedTraining(); if (bagNotAlreadyAnalyzed(queryBag)) { List<Entry<String, Double>> classifications = new ArrayList<>(docClasses.size()); // Test only if the distances differ from the cosine similarity?! No they don't! List<Entry<String, Double>> distances = new ArrayList<>(docClasses.size()); Map<String, Double> preparedQueryBag = prepareQueryBagOfWords(queryBag); for (DocumentClass docClass : docClasses.values()) { Double cosinSimilarity = calculateCosinSimilairty(preparedQueryBag, docClass); classifications.add(new SimpleEntry<>(docClass.getName(), cosinSimilarity)); Double distance = calculateDistances(preparedQueryBag, docClass); distances.add(new SimpleEntry<>(docClass.getName(), distance)); } lastClassifiedBag = queryBag.hashCode(); lastClassificationResult = new ArrayList<>(classifications); // Order results. Not in own method while not clear if mor than one analyze method will be // used (ie. KNN plus cosineSimilarity) Collections.sort( lastClassificationResult, new Comparator<Entry<String, Double>>() { @Override public int compare(Entry<String, Double> o1, Entry<String, Double> o2) { return o2.getValue().compareTo(o1.getValue()); } }); // Ascending because the smallest is the most important one! Collections.sort( distances, new Comparator<Entry<String, Double>>() { @Override public int compare(Entry<String, Double> o1, Entry<String, Double> o2) { return o1.getValue().compareTo(o2.getValue()); } }); // System.out.println(lastClassificationResult); // System.out.println(distances); } return Collections.unmodifiableList(lastClassificationResult); }
/**
 * Returns the size of the term corpus.
 *
 * @return the value reported by {@code termCorpus.getNumberOfTerms()}
 */
public Integer getCorpusSize() {
  final Integer numberOfTerms = termCorpus.getNumberOfTerms();
  return numberOfTerms;
}
/**
 * Tells whether the given bag differs from the one recorded in {@code lastClassifiedBag}.
 *
 * <p>Only hash codes are compared, so a different bag with the same hash code is treated as
 * already analyzed.
 *
 * @param bag the bag to check
 * @return {@code true} if the hash code of {@code bag} is not equal to
 *     {@code lastClassifiedBag}
 */
private boolean bagNotAlreadyAnalyzed(BagOfWords bag) {
  boolean sameHashAsLastBag = (lastClassifiedBag == bag.hashCode());
  return !sameHashAsLastBag;
}