/**
   * Computes a normalized score per category for the text represented by a concept vector.
   *
   * <p>The first {@code conceptLimit} concepts yielded by {@code cv.orderedIterator()} are grouped
   * by the category ids that {@code treeCache} returns for them. Inside a category every concept
   * contributes {@code 0.6 * value + 0.4 * linkValue}, where {@code linkValue} is the sum of the
   * values of those inlinks of the concept that the category itself contains, divided by the
   * concept's total number of inlinks. A category's score is the mean contribution of its
   * concepts; each score is finally divided by the sum of all category scores.
   *
   * @param cv concept vector representing the text
   * @param conceptLimit maximum number of concepts taken from the vector's ordered iterator; a
   *     value {@code <= 0} produces an empty result
   * @return category id -> score divided by the sum of all category scores (the undivided scores
   *     if that sum is zero), held in a map ordered by a {@code ValueComparator} built over the
   *     undivided scores
   * @throws WikitException declared by this method; which of the lookups raises it is not visible
   *     here
   */
  public SortedMap<Integer, Double> getCategoryDistribution(ConceptVector cv, int conceptLimit)
      throws WikitException {
    // Share of a concept's own value in its contribution; the rest comes from its inlinks.
    final double lambda = 0.6;

    // Category id -> category collecting the concepts assigned to it.
    Map<Integer, Category> bags = new HashMap<>();
    ConceptIterator conceptIterator = cv.orderedIterator();
    // The limit is tested first so the iterator is never advanced past the last concept used.
    for (int used = 0; used < conceptLimit && conceptIterator.next(); used++) {
      int conceptId = conceptIterator.getId();
      ConceptItem conceptItem = new ConceptItem(conceptId, conceptIterator.getValue());
      Set<Integer> catIds = treeCache.getCategoryIdsByConceptId(conceptId);
      conceptItem.catIds = catIds;

      for (int catId : catIds) {
        Category category = bags.get(catId);
        if (category == null) {
          category = new Category(catId);
          bags.put(catId, category);
        }
        category.addItem(conceptItem);
      }
    }

    // Category id -> mean contribution of the category's concepts (not yet normalized).
    Map<Integer, Double> scores = new HashMap<>();
    double totalScore = 0;
    for (Category category : bags.values()) {
      double categoryScore = 0;
      for (ConceptItem conceptItem : category.concepts()) {
        Set<Integer> inlinkIds = conceptCache.getInlinkIds(conceptItem.id);

        // Sum the values of the inlinks that are themselves part of this category.
        double linkValue = 0.0;
        for (int inlinkId : inlinkIds) {
          if (category.hasConcept(inlinkId)) {
            linkValue += category.getConcept(inlinkId).value;
          }
        }
        // Normalize by ALL inlinks of the concept, not only those found in the category.
        if (!inlinkIds.isEmpty()) {
          linkValue = linkValue / inlinkIds.size();
        }

        categoryScore += lambda * conceptItem.value + (1 - lambda) * linkValue;
      }

      // Every category in bags received at least one concept when it was created above.
      double normalizedScore = categoryScore / category.size();
      scores.put(category.id, normalizedScore);
      totalScore += normalizedScore;
    }

    // NOTE(review): ordering (and key identity) is delegated to ValueComparator, which is not
    // visible here. If it returns 0 for two ids with equal scores, one of them would be dropped
    // from the TreeMap -- confirm against its implementation.
    SortedMap<Integer, Double> sortedScores =
        new TreeMap<Integer, Double>(new ValueComparator<>(scores));

    // Dividing by a zero total would turn every entry into NaN; keep the raw scores in that case.
    boolean canNormalize = totalScore != 0.0;
    for (Map.Entry<Integer, Double> entry : scores.entrySet()) {
      double raw = entry.getValue();
      sortedScores.put(entry.getKey(), canNormalize ? raw / totalScore : raw);
    }
    return sortedScores;
  }
  // Example no. 2
  // 0
  /**
   * Reconciles the cached categories in {@code categoryMap} with the rows of {@code cur}.
   *
   * <p>Rows whose file is missing from the cache are added (unless the file is listed in {@code
   * deletedFileList}); cached files that no row mentions are removed, and categories left empty
   * by that removal are dropped from {@code categoryMap}. {@code mFileObserver}, when set, is
   * told about the added files and about the removed files, each only if the list is non-empty.
   * Finally {@code deletedFileList} is pruned to the paths that were seen in the cursor.
   *
   * <p>NOTE(review): the do/while reads the current row before the first {@code moveToNext()},
   * so the caller presumably positions {@code cur} on a valid first row -- confirm.
   *
   * @param cur cursor over the file rows; read via {@code extractFileInfo(cur, type)}
   * @param categoryMap live cache of categories, updated in place
   * @param deletedFileList full paths that must not be re-added; pruned in place
   * @param type passed through to {@code extractFileInfo}, {@code getCategoryKey}, {@code
   *     makeCategory} and the observer callbacks
   */
  private void compareFile(
      Cursor cur,
      ConcurrentHashMap<String, Category> categoryMap,
      ArrayList<String> deletedFileList,
      int type) {
    ArrayList<FileInfo> filesAdded = new ArrayList<FileInfo>();
    ArrayList<FileInfo> filesRemoved = new ArrayList<FileInfo>();

    // Copy the cache into tmp so it can be compared against the cursor. Every file matched by a
    // row is struck from the copy, so whatever is left afterwards no longer exists.
    HashMap<String, Category> tmp = new HashMap<String, Category>();
    for (Category cat : categoryMap.values()) {
      Category copy = new Category();
      copy.filePath = cat.filePath;
      // Shallow copy; the copy constructor avoids the unchecked cast that clone() needs.
      copy.files = new ArrayList<FileInfo>(cat.files);
      copy.size = cat.size;
      tmp.put(cat.filePath, copy);
    }

    HashSet<String> filesInDB = new HashSet<String>();
    do {
      FileInfo file = extractFileInfo(cur, type);
      if (null == file) {
        continue;
      }
      String key = getCategoryKey(file, type);

      // Look for the file in the snapshot of its folder; if it is there, strike it out, which
      // means the cache already had it.
      boolean alreadyCached = false;
      Category snapshot = tmp.get(key);
      if (snapshot != null) {
        for (int i = snapshot.files.size() - 1; i >= 0; --i) {
          if (snapshot.files.get(i).fullFilePath.equals(file.fullFilePath)) {
            snapshot.files.remove(i);
            alreadyCached = true;
            break;
          }
        }
      }

      // Not cached yet: add it, unless it is on the deleted list.
      if (!alreadyCached && !deletedFileList.contains(file.fullFilePath)) {
        // Single get() instead of containsKey()+get(): no window in which the entry can vanish.
        Category live = categoryMap.get(key);
        if (live == null) {
          // No such folder in the cache: create one and add it.
          // Presumably makeCategory() already places the file in the new category -- confirm.
          Category created = makeCategory(file, type);
          filesAdded.add(file);
          categoryMap.put(created.filePath, created);
        } else if (live.addFile(file)) {
          filesAdded.add(file);
        }
      }
      filesInDB.add(file.fullFilePath);
    } while (cur.moveToNext());

    if (mFileObserver != null && !filesAdded.isEmpty()) {
      mFileObserver.filesAdded(filesAdded, type);
    }

    // Whatever is still in tmp was not matched by any row, i.e. it has been deleted.
    for (Category leftover : tmp.values()) {
      filesRemoved.addAll(leftover.files);
    }

    // Remove the deleted files from the cache, dropping categories that become empty.
    for (FileInfo info : filesRemoved) {
      Category category = categoryMap.get(getCategoryKey(info, type));
      if (null != category) {
        category.deleteFile(info.fullFilePath);
        if (category.files.isEmpty()) {
          categoryMap.remove(category.filePath);
        }
      }
    }

    // Keep only the deleted-list entries that still show up in the cursor.
    deletedFileList.retainAll(filesInDB);

    if (mFileObserver != null && !filesRemoved.isEmpty()) {
      mFileObserver.filesRemoved(filesRemoved, type);
    }
  }