Esempio n. 1
0
  /**
   * Manual smoke test: runs one hard-coded query through a fresh {@code QueryProcessor} and prints
   * the returned groups of pages to standard output.
   *
   * <p>The elapsed time of the query is written to standard error in whole seconds (integer
   * division by 1000, so anything under one second prints as 0).
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    QueryProcessor processor = new QueryProcessor();

    long begin = System.currentTimeMillis();
    // Passed to doQuery as a second argument; presumably filled with a hit count -- confirm.
    VarInteger counter = new VarInteger();
    // Other sample queries that were tried here: 中, 詹姆斯
    List<List<PageInfo>> groups = processor.doQuery("黄征", counter);
    long elapsedSeconds = (System.currentTimeMillis() - begin) / 1000;
    System.err.println("===Time cost for doing query: " + elapsedSeconds + " ===");

    if (groups != null) {
      // Each inner list is printed under its own heading, one line pair per page.
      for (List<PageInfo> group : groups) {
        System.out.println("\n以下新闻为一类:");
        for (PageInfo info : group) {
          String line = "docID:" + info.getDocID() + "\n标题:" + info.getTitle();
          System.out.println(line);
        }
      }
    } else {
      // doQuery signals "nothing found" with null.
      System.out.println("sorry, 找不到相关页面");
    }
  }
Esempio n. 2
0
  /**
   * Fetches the documents relevant to the query, loads their page info, scores them and returns
   * the pages ordered by descending weight.
   *
   * <p>For every relevant document the method:
   *
   * <ol>
   *   <li>builds a {@code PageInfo} and loads it through {@code dbOperator}; documents whose info
   *       cannot be loaded are reported and left out of the result;
   *   <li>for each query term that has statistics for the document, adds the term's tf-idf value
   *       plus the value returned by {@code countInTitleDesc} to the document weight; terms
   *       without statistics for the document are reported and contribute nothing;
   *   <li>adds the publication-time weight returned by {@code countPubTimeWeight}.
   * </ol>
   *
   * <p>Timing and progress information is printed to standard output along the way.
   *
   * @param queryIDs ids of the query terms; each id is looked up in {@code invIdxMap} and
   *     {@code queryTerms}
   * @return the successfully loaded pages sorted by descending weight (never containing
   *     {@code null} elements), or {@code null} when no relevant document was found
   */
  private List<PageInfo> getResultPages(List<Long> queryIDs) {
    long start = System.currentTimeMillis();
    List<Map.Entry<Long, TermStat>> relevantDocs = getRelevantDocs(queryIDs);
    if (relevantDocs == null || relevantDocs.isEmpty()) {
      System.out.println("no pages retrived");
      // null rather than an empty list is kept for backward compatibility with callers.
      return null;
    }
    long end = System.currentTimeMillis();
    System.out.println("===== 相关文档已找到并合并,用时:" + (end - start) + "毫秒 =====");

    // Pages that were loaded successfully, keyed by document id.
    Map<Long, PageInfo> loadedPages = new HashMap<>();

    // Accumulate the similarity weight of every document and load its PageInfo on the way.
    for (Map.Entry<Long, TermStat> doc : relevantDocs) {
      Long docId = doc.getKey();
      TermStat docStat = doc.getValue();

      // Load the page info from the database.
      long t1 = System.currentTimeMillis();
      PageInfo page = new PageInfo(docId);
      if (!page.loadInfo(dbOperator)) {
        System.out.println("no page info of " + docId);
        continue;
      }
      long t2 = System.currentTimeMillis();
      System.out.println("-- 从数据库获取网页信息,用时:" + (t2 - t1) + "毫秒  --");
      loadedPages.put(docId, page);

      // weight = sum over query terms of (tf-idf of the term in this doc + title/description bonus)
      for (long term : queryIDs) {
        TermStat stat = invIdxMap.get(term).getStatsMap().get(docId);
        if (stat == null) {
          // The document has no statistics for this term; skip it instead of dereferencing
          // null below.
          System.out.println("can not find doc stat in term: " + term);
          continue;
        }
        // Bonus for occurrences of the term in title + description
        // (presumably 10 per occurrence -- confirm in PageInfo.countInTitleDesc).
        long titWeight = page.countInTitleDesc(queryTerms.get(term));
        docStat.addWeight(stat.getTfIdf() + titWeight);
      }
      // Add the publication-date weight.
      docStat.addWeight(page.countPubTimeWeight());

      System.out.println(
          "doc:" + docId + "; weight[" + docId + "]=" + docStat.getWeight());
      t1 = System.currentTimeMillis();
      System.out.println("--权重计算完成,用时:" + (t1 - t2) + "毫秒 --");
    }
    start = System.currentTimeMillis();
    System.out.println("===== 相似度计算完成,用时:" + (start - end) + "毫秒 =====");

    // Sort by weight, highest first. Double.compare gives a consistent total order even for
    // NaN weights (assumes getWeight() is numeric and convertible to double).
    Collections.sort(
        relevantDocs,
        new Comparator<Map.Entry<Long, TermStat>>() {
          @Override
          public int compare(Map.Entry<Long, TermStat> o1, Map.Entry<Long, TermStat> o2) {
            return Double.compare(o2.getValue().getWeight(), o1.getValue().getWeight());
          }
        });

    // Emit the pages in sorted order, leaving out documents whose page info failed to load
    // so that the returned list never contains null elements.
    List<PageInfo> resultPages = new ArrayList<>(loadedPages.size());
    for (Map.Entry<Long, TermStat> doc : relevantDocs) {
      PageInfo page = loadedPages.get(doc.getKey());
      if (page == null) {
        continue;
      }
      resultPages.add(page);
      System.out.println("\nretrived page: " + doc.getKey());
    }
    System.out.println("===== 根据相似度排序完成,用时:" + (System.currentTimeMillis() - start) + "毫秒 =====");

    return resultPages;
  }