/* just for test */ public static void main(String[] args) { QueryProcessor queryProc = new QueryProcessor(); long start = System.currentTimeMillis(); VarInteger cnt = new VarInteger(); List<List<PageInfo>> result = queryProc.doQuery("黄征", cnt); // 中 詹姆斯 System.err.println( "===Time cost for doing query: " + (System.currentTimeMillis() - start) / 1000 + " ==="); if (result == null) { System.out.println("sorry, 找不到相关页面"); return; } for (List<PageInfo> set : result) { System.out.println("\n以下新闻为一类:"); for (PageInfo page : set) { System.out.println("docID:" + page.getDocID() + "\n标题:" + page.getTitle()); } } }
/* 获取相关网页,并从数据库PagesIndex获取网页信息 */ private List<PageInfo> getResultPages(List<Long> queryIDs) { List<PageInfo> resultPages = new ArrayList<>(); Map<Long, PageInfo> tmpPages = new HashMap<>(); long start = System.currentTimeMillis(); List<Map.Entry<Long, TermStat>> relevantDocs = getRelevantDocs(queryIDs); if (relevantDocs == null || relevantDocs.isEmpty()) { System.out.println("no pages retrived"); return null; } long end = System.currentTimeMillis(); System.out.println("===== 相关文档已找到并合并,用时:" + (end - start) + "毫秒 ====="); // 计算相似度权重nnn.ntn, 顺便从PagesIndex获取PageInfo PageInfo page; for (Map.Entry<Long, TermStat> doc : relevantDocs) { // 获取pageinfo long t1 = System.currentTimeMillis(); page = new PageInfo(doc.getKey()); if (!page.loadInfo(dbOperator)) { System.out.println("no page info of " + doc.getKey()); continue; } long t2 = System.currentTimeMillis(); System.out.println("-- 从数据库获取网页信息,用时:" + (t2 - t1) + "毫秒 --"); // 计算关键词高亮位置 // page.highlight(queryTerms);//弃用 tmpPages.put(doc.getKey(), page); // 计算权重 for (long term : queryIDs) { TermStat stat = invIdxMap.get(term).getStatsMap().get(doc.getKey()); if (stat == null) { System.out.println("can not find doc stat in term: " + term); } // 计算“标题+描述”中搜索词出现次数,1次weight+10,以及时间权重 long titWeight = page.countInTitleDesc(queryTerms.get(term)); // 累计相似度结果: // weight = Σ{检索词项权重(1)*该文档权重(tf-idf)+标题中搜索词出现次数*10} doc.getValue().addWeight(stat.getTfIdf() + titWeight); } // 计算并加入日期权重 doc.getValue().addWeight(page.countPubTimeWeight()); System.out.println( "doc:" + doc.getKey() + "; weight[" + doc.getKey() + "]=" + doc.getValue().getWeight()); // t1 = System.currentTimeMillis(); System.out.println("--权重计算完成,用时:" + (t1 - t2) + "毫秒 --"); } start = System.currentTimeMillis(); System.out.println("===== 相似度计算完成,用时:" + (start - end) + "毫秒 ====="); // relevantDocs根据相似度权重降序排列 Collections.sort( relevantDocs, new Comparator<Map.Entry<Long, TermStat>>() { public int compare(Map.Entry<Long, TermStat> o1, Map.Entry<Long, TermStat> o2) { if (o2.getValue().getWeight() > o1.getValue().getWeight()) { return 1; } else if (o2.getValue().getWeight() >= o1.getValue().getWeight()) { return 0; } else { return -1; } } }); // 返回排好序的结果信息 for (Map.Entry<Long, TermStat> doc : relevantDocs) { resultPages.add(tmpPages.get(doc.getKey())); System.out.println("\nretrived page: " + doc.getKey()); } System.out.println("===== 根据相似度排序完成,用时:" + (System.currentTimeMillis() - start) + "毫秒 ====="); return resultPages; }