public TIntDoubleHashMap getConceptVector(String phrase, TIntSet validIds) throws IOException { synchronized (phraseCache) { if (phraseCache.containsKey(phrase)) { return phraseCache.get(phrase); } } QueryParser parser = new QueryParser(Version.LUCENE_42, "text", analyzer); TopDocs docs = null; try { docs = searcher.search(parser.parse(phrase), esaHelper.getWpIdFilter(validIds), 5000); } catch (org.apache.lucene.queryparser.classic.ParseException e) { LOG.log(Level.WARNING, "parsing of phrase " + phrase + " failed", e); return null; } pruneSimilar(docs); TIntDoubleHashMap result = expandScores(docs.scoreDocs); synchronized (phraseCache) { phraseCache.put(phrase, result); } return result; // System.out.println("top docs for " + phrase + " are:"); // for (int i = 0; i < 50 && i < docs.scoreDocs.length; i++) { // ScoreDoc sd = docs.scoreDocs[i]; // Document d = reader.document(sd.doc); // // System.out.println("\t" + sd.score + ": " + // d.get("title") + ", " + d.get("text").split("\\s+").length + // ", " + d.get("inlinks")); // } }
/**
 * Finds the pages most similar to the given page using a "more like this" query.
 *
 * <p>If a cached answer exists for {@code wpId} it is returned directly. Otherwise the query is
 * built from the page's document in the ESA index; when the page is not in that index but is
 * present in the optional text index, the query is built from the page's stored text instead.
 * The hits are filtered by {@code validIds}, passed through {@code pruneSimilar}, mapped back
 * to page ids and normalized.
 *
 * @param wpId       id of the page to find neighbours for
 * @param maxResults maximum number of hits requested from the searcher
 * @param validIds   ids used to build the search filter via {@code esaHelper.getWpIdFilter}
 * @return the normalized similarity list, or {@code null} when no query can be built for the
 *         page (it is in neither index, or the text index holds no text for it)
 * @throws IOException if reading the index fails
 */
@Override
public DocScoreList mostSimilar(int wpId, int maxResults, TIntSet validIds) throws IOException {
    if (hasCachedMostSimilar(wpId)) {
        return getCachedMostSimilar(wpId, maxResults, validIds);
    }

    MoreLikeThis mlt = getMoreLikeThis();
    int luceneId = esaHelper.wpIdToLuceneId(wpId);
    Query query;
    if (luceneId >= 0) {
        query = mlt.like(luceneId);
    } else if (textHelper != null && textHelper.wpIdToLuceneId(wpId) >= 0) {
        Document d = textHelper.wpIdToLuceneDoc(wpId);
        // Document.get returns null when the field is absent, and new StringReader(null)
        // throws NullPointerException; treat missing text like a page that is not indexed.
        String text = (d == null) ? null : d.get(Page.FIELD_TEXT);
        if (text == null) {
            return null;
        }
        query = mlt.like(new StringReader(text), Page.FIELD_TEXT);
    } else {
        return null;
    }

    TopDocs similarDocs = searcher.search(query, esaHelper.getWpIdFilter(validIds), maxResults);
    pruneSimilar(similarDocs);

    // Read the hit array only after pruning, in case pruneSimilar replaces it.
    ScoreDoc[] hits = similarDocs.scoreDocs;
    DocScoreList scores = new DocScoreList(hits.length);
    for (int i = 0; i < hits.length; i++) {
        ScoreDoc sd = hits[i];
        scores.set(i, esaHelper.luceneIdToWpId(sd.doc), sd.score);
    }
    return normalize(scores);
}