/**
 * Given a concept vector which represents a text, calculate the category probabilities.
 *
 * @param cv the concept vector representing the text
 * @param conceptLimit only the top conceptLimit concepts are used to calculate the category
 *     distribution
 * @return category id -> score (normalized so the scores sum to 1)
 */
public SortedMap<Integer, Double> getCategoryDistribution(ConceptVector cv, int conceptLimit)
    throws WikitException {
  ConceptIterator conceptIterator = cv.orderedIterator();

  // Category ID --> Category
  Map<Integer, Category> bags = new HashMap<>();
  int count = 0;
  while (conceptIterator.next() && count++ < conceptLimit) {
    int conceptId = conceptIterator.getId();
    ConceptItem conceptItem = new ConceptItem(conceptId, conceptIterator.getValue());
    Set<Integer> catIds = treeCache.getCategoryIdsByConceptId(conceptId);
    conceptItem.catIds = catIds;
    for (int catId : catIds) {
      if (bags.containsKey(catId)) {
        bags.get(catId).addItem(conceptItem);
      } else {
        Category category = new Category(catId);
        category.addItem(conceptItem);
        bags.put(catId, category);
      }
    }
  }

  double totalScore = 0;
  SortedMap<Integer, Double> sortedScores = null;
  // category id -> score
  Map<Integer, Double> scores = new HashMap<>();

  // Method 1 (disabled): score a category by the plain average of its concept values.
  // for (Map.Entry<Integer, Category> entry : bags.entrySet()) {
  //   double categoryScore = 0;
  //   Category category = entry.getValue();
  //   for (ConceptItem item : category.concepts()) {
  //     categoryScore += item.value;
  //   }
  //   double normalizedScore = categoryScore / entry.getValue().size();
  //   scores.put(entry.getKey(), normalizedScore);
  //   totalScore += normalizedScore;
  // }
  // sortedScores = new TreeMap<Integer, Double>(new ValueComparator(scores));
  // sortedScores.putAll(scores);

  // Method 2: also take inbound links between concepts into account.
  double LAMBDA = 0.6;
  totalScore = 0;
  scores = new HashMap<>();
  for (Map.Entry<Integer, Category> entry : bags.entrySet()) {
    double categoryScore = 0;
    Category category = entry.getValue();
    for (ConceptItem conceptItem : category.concepts()) {
      double v1 = conceptItem.value;
      double v2 = 0.0;
      Set<Integer> inlinkIds = conceptCache.getInlinkIds(conceptItem.id);
      for (int inlinkId : inlinkIds) {
        if (category.hasConcept(inlinkId)) {
          v2 += category.getConcept(inlinkId).value;
        }
      }
      if (!inlinkIds.isEmpty()) {
        v2 = v2 / inlinkIds.size(); // normalize by the number of in-links
      }
      // Blend the concept's own weight with the averaged weight of the concepts
      // linking to it within the same category.
      double v = LAMBDA * v1 + (1 - LAMBDA) * v2;
      categoryScore += v;
    }
    double normalizedScore = categoryScore / category.size();
    scores.put(category.id, normalizedScore);
    totalScore += normalizedScore;
  }

  sortedScores = new TreeMap<Integer, Double>(new ValueComparator<>(scores));
  boolean normalize = true;
  if (normalize) {
    // Normalize the scores so they sum to 1.
    for (Map.Entry<Integer, Double> entry : scores.entrySet()) {
      sortedScores.put(entry.getKey(), entry.getValue() / totalScore);
    }
  } else {
    sortedScores.putAll(scores);
  }
  return sortedScores;
}
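// NOTE: the TreeMap above is ordered by a ValueComparator that is not part of this excerpt.
// A minimal sketch of what such a comparator might look like, assuming it sorts keys by
// descending score in the backing map and breaks ties on the key itself so that categories
// with equal scores are not dropped by the TreeMap; the generic signature is an assumption
// (requires java.util.Comparator and java.util.Map).
class ValueComparator<K extends Comparable<K>, V extends Comparable<V>> implements Comparator<K> {
  private final Map<K, V> base;

  ValueComparator(Map<K, V> base) {
    this.base = base;
  }

  @Override
  public int compare(K a, K b) {
    int byValue = base.get(b).compareTo(base.get(a)); // higher score first
    return byValue != 0 ? byValue : a.compareTo(b);   // tie-break on the key
  }
}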
@SuppressWarnings("unchecked")
private void compareFile(
    Cursor cur,
    ConcurrentHashMap<String, Category> categoryMap,
    ArrayList<String> deletedFileList,
    int type) {
  ArrayList<FileInfo> filesRemoved = new ArrayList<FileInfo>();
  ArrayList<FileInfo> filesAdded = new ArrayList<FileInfo>();

  // Copy the cached categories into tmp so they can be compared against the database.
  HashMap<String, Category> tmp = new HashMap<String, Category>();
  Iterator<Category> values = categoryMap.values().iterator();
  while (values.hasNext()) {
    Category cat = values.next();
    Category newCat = new Category();
    newCat.filePath = cat.filePath;
    newCat.files = (ArrayList<FileInfo>) cat.files.clone();
    newCat.size = cat.size;
    tmp.put(cat.filePath, newCat);
  }

  HashSet<String> filesInDB = new HashSet<String>();
  // The cursor is expected to be positioned on its first row before this loop runs.
  do {
    FileInfo file = extractFileInfo(cur, type);
    if (null == file) {
      continue;
    }
    // Check whether this folder is already in the snapshot.
    if (tmp.containsKey(getCategoryKey(file, type))) {
      Category category = tmp.get(getCategoryKey(file, type));
      int size = category.files.size();
      boolean bFind = false;
      // If the file is found, remove it from tmp: it already existed in the cache.
      for (int i = size - 1; i >= 0; --i) {
        FileInfo info = category.files.get(i);
        if (0 == info.fullFilePath.compareTo(file.fullFilePath)) {
          bFind = true;
          category.files.remove(i);
          break;
        }
      }
      if (!bFind) {
        // Not found: it is a new file, add it to the cache.
        if (!deletedFileList.contains(file.fullFilePath)) {
          if (categoryMap.get(getCategoryKey(file, type)).addFile(file)) {
            filesAdded.add(file);
          }
        }
      }
    } else {
      if (categoryMap.containsKey(getCategoryKey(file, type))) {
        if (!deletedFileList.contains(file.fullFilePath)) {
          if (categoryMap.get(getCategoryKey(file, type)).addFile(file)) {
            filesAdded.add(file);
          }
        }
      } else {
        if (!deletedFileList.contains(file.fullFilePath)) {
          // The folder is not cached yet: create a new category and add it to the cache.
          Category category = makeCategory(file, type);
          filesAdded.add(file);
          categoryMap.put(category.filePath, category);
        }
      }
    }
    filesInDB.add(file.fullFilePath);
  } while (cur.moveToNext());

  if (mFileObserver != null && !filesAdded.isEmpty()) {
    mFileObserver.filesAdded(filesAdded, type);
  }

  // Whatever is still left in tmp no longer exists in the database, i.e. it was deleted.
  for (Category category : tmp.values()) {
    filesRemoved.addAll(category.files);
  }

  // Remove the deleted files from the cache.
  for (FileInfo info : filesRemoved) {
    Category category = categoryMap.get(getCategoryKey(info, type));
    if (null != category) {
      category.deleteFile(info.fullFilePath);
      if (category.files.isEmpty()) {
        categoryMap.remove(category.filePath);
      }
    }
  }

  // Drop entries from deletedFileList that are no longer present in the database.
  Iterator<String> it = deletedFileList.iterator();
  while (it.hasNext()) {
    if (!filesInDB.contains(it.next())) {
      it.remove();
    }
  }

  if (mFileObserver != null && !filesRemoved.isEmpty()) {
    mFileObserver.filesRemoved(filesRemoved, type);
  }
}
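// NOTE (assumption): compareFile requires the cursor to already be positioned on its first
// row, and the original debug tags suggest it is driven from a ContentObserver. A hypothetical
// sketch of such a caller; the fields (mContext, mCategoryMap, mDeletedFileList) and the
// constant TYPE_ALL are illustrative names, not part of the original code.
private final ContentObserver mMediaObserver = new ContentObserver(new Handler()) {
  @Override
  public void onChange(boolean selfChange) {
    Cursor cur = mContext.getContentResolver().query(
        MediaStore.Files.getContentUri("external"), null, null, null, null);
    if (cur == null) {
      return;
    }
    try {
      if (cur.moveToFirst()) {
        // Diff the current database contents against the in-memory category cache.
        compareFile(cur, mCategoryMap, mDeletedFileList, TYPE_ALL);
      }
    } finally {
      cur.close();
    }
  }
};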