/**
 * Logs every entry of a category distribution at debug level and renders the scores as a single
 * string of the form {@code scores<-c(v1, v2, ...)}.
 *
 * <p>For each entry the debug line holds the category name (looked up through {@code treeCache},
 * with the id's string form passed as the second argument), the score, and the depth reported by
 * {@code treeCache}, separated by tabs. Scores are emitted in the iteration order of the map.
 *
 * @param categoryDistribution category id to score
 * @return the scores wrapped in {@code scores<-c(} ... {@code )}, comma-separated
 */
public String printCategoryDistribution(SortedMap<Integer, Double> categoryDistribution) {
  LOG.debug("output category distribution:");

  StringBuilder rendered = new StringBuilder("scores<-c(");
  // Empty before the first value, ", " afterwards.
  String separator = "";
  for (Map.Entry<Integer, Double> entry : categoryDistribution.entrySet()) {
    Integer categoryId = entry.getKey();
    Double value = entry.getValue();

    String name = treeCache.getNameById(categoryId, String.valueOf(categoryId));
    LOG.debug(name + "\t\t" + value + "\t" + treeCache.getDepth(categoryId));

    rendered.append(separator).append(value);
    separator = ", ";
  }
  return rendered.append(")").toString();
}
/** * Given concept vector which represents a text, calculate the category probabilities. * * @param cv * @param conceptLimit only conceptLimit concepts are used to calculate the category path * distribution * @return category id -> scores */ public SortedMap<Integer, Double> getCategoryDistribution(ConceptVector cv, int conceptLimit) throws WikitException { ConceptIterator conceptIterator = cv.orderedIterator(); // Category ID --> Category Map<Integer, Category> bags = new HashMap<>(); int count = 0; while (conceptIterator.next() && count++ < conceptLimit) { int conceptId = conceptIterator.getId(); ConceptItem conceptItem = new ConceptItem(conceptId, conceptIterator.getValue()); Set<Integer> catIds = treeCache.getCategoryIdsByConceptId(conceptId); conceptItem.catIds = catIds; for (int catId : catIds) { if (bags.containsKey(catId)) { bags.get(catId).addItem(conceptItem); } else { Category category = new Category(catId); category.addItem(conceptItem); bags.put(catId, category); } } } double totalScore = 0; SortedMap<Integer, Double> sortedScores = null; // category id -> score Map<Integer, Double> scores = new HashMap<>(); // LOG.info("Method 1:"); // for (Map.Entry<Integer, Category> entry : bags.entrySet()) { // double categoryScore = 0; // Category category = entry.getValue(); // for (ConceptItem item : category.concepts()) { // categoryScore += item.value; // } // double normalizedScore = categoryScore/entry.getValue().size(); // scores.put(entry.getKey(), normalizedScore); // totalScore += normalizedScore; // } // // sortedScores = new TreeMap<Integer,Double>(new ValueComparator(scores)); // sortedScores.putAll(scores); // Method 2, take link into account" double LAMBDA = 0.6; totalScore = 0; scores = new HashMap<>(); for (Map.Entry<Integer, Category> entry : bags.entrySet()) { double categoryScore = 0; Category category = entry.getValue(); for (ConceptItem conceptItem : category.concepts()) { double v1 = conceptItem.value; double v2 = 0.0; Set<Integer> 
inlinkIds = conceptCache.getInlinkIds(conceptItem.id); for (int inlinkId : inlinkIds) { if (category.hasConcept(inlinkId)) { v2 += category.getConcept(inlinkId).value; // System.out.println(inlink + "==>" + item.id + "\t" + item.title); } } if (inlinkIds.size() > 0) { v2 = v2 / inlinkIds.size(); // normalize } // if item connected with double v = LAMBDA * v1 + (1 - LAMBDA) * v2; categoryScore += v; } double normalizedScore = categoryScore / category.size(); scores.put(category.id, normalizedScore); totalScore += normalizedScore; } sortedScores = new TreeMap<Integer, Double>(new ValueComparator<>(scores)); boolean normalize = true; if (normalize) { // normalized the value for (Map.Entry<Integer, Double> entry : scores.entrySet()) { sortedScores.put(entry.getKey(), entry.getValue() / totalScore); } } else { sortedScores.putAll(scores); } return sortedScores; }