/**
 * Finds the pages most similar to the page with the given Wikipedia id.
 *
 * <p>A cached answer is returned when {@code hasCachedMostSimilar(wpId)} is true. Otherwise a
 * MoreLikeThis query is built, either from the page's document in the ESA index or, when the
 * page is absent there, from the page's text stored in the text index. The query is run against
 * the ESA searcher, restricted by the filter derived from {@code validIds}.
 *
 * @param wpId       Wikipedia id of the query page
 * @param maxResults maximum number of hits requested from the searcher
 * @param validIds   ids handed to {@code esaHelper.getWpIdFilter} to restrict the hits
 * @return the normalized scores, or {@code null} if the page is found in neither index
 * @throws IOException if reading the index or searching fails
 */
@Override
public DocScoreList mostSimilar(int wpId, int maxResults, TIntSet validIds) throws IOException {
    if (hasCachedMostSimilar(wpId)) {
        return getCachedMostSimilar(wpId, maxResults, validIds);
    }

    MoreLikeThis mlt = getMoreLikeThis();
    int luceneId = esaHelper.wpIdToLuceneId(wpId);

    Query likeQuery;
    if (luceneId >= 0) {
        // The page is in the ESA index: query directly by its Lucene document.
        likeQuery = mlt.like(luceneId);
    } else {
        // Fall back to the raw page text, if a text index is available and knows the page.
        if (textHelper == null || textHelper.wpIdToLuceneId(wpId) < 0) {
            return null;
        }
        Document pageDoc = textHelper.wpIdToLuceneDoc(wpId);
        String pageText = pageDoc.get(Page.FIELD_TEXT);
        likeQuery = mlt.like(new StringReader(pageText), Page.FIELD_TEXT);
    }

    TopDocs hits = searcher.search(likeQuery, esaHelper.getWpIdFilter(validIds), maxResults);
    pruneSimilar(hits);

    // Read the hit array only after pruning, since pruning operates on the TopDocs object.
    ScoreDoc[] ranked = hits.scoreDocs;
    DocScoreList result = new DocScoreList(ranked.length);
    int slot = 0;
    for (ScoreDoc hit : ranked) {
        result.set(slot, esaHelper.luceneIdToWpId(hit.doc), hit.score);
        slot++;
    }
    return normalize(result);
}
/**
 * Returns the index helper registered under {@code name}, loading and registering it first if
 * the environment does not have it yet.
 *
 * <p>The index directory is {@code name} inside the "outputDir" directory of the indexes
 * configuration. If the index's own configuration has a "similarity" or "analyzer" key, the
 * only accepted value is "ESA", which installs the ESA similarity on the helper's searcher or
 * the ESA analyzer on the helper respectively.
 *
 * @param name the name of the index to load
 * @return the already-registered or newly created helper for the index
 * @throws IOException            declared for failures while opening the index
 * @throws ConfigurationException if the configured similarity or analyzer type is not "ESA"
 */
private IndexHelper loadIndex(String name) throws IOException, ConfigurationException {
    if (env.hasIndex(name)) {
        return env.getIndex(name);
    }

    info("loading index " + name);
    JSONObject indexConfig = configuration.getIndex(name);
    File indexDir = new File(requireDirectory(configuration.getIndexes(), "outputDir"), name);
    IndexHelper helper = new IndexHelper(indexDir, true);

    if (indexConfig.containsKey("similarity")) {
        String similarityType = requireString(indexConfig, "similarity");
        if (!similarityType.equals("ESA")) {
            throw new ConfigurationException("unknown similarity type: " + similarityType);
        }
        helper.getSearcher().setSimilarity(new ESASimilarity.LuceneSimilarity());
    }

    if (indexConfig.containsKey("analyzer")) {
        String analyzerType = requireString(indexConfig, "analyzer");
        if (!analyzerType.equals("ESA")) {
            throw new ConfigurationException("unknown analyzer type: " + analyzerType);
        }
        helper.setAnalyzer(new ESAAnalyzer());
    }

    // Register only after the helper is fully configured.
    env.addIndex(name, helper);
    return helper;
}
/**
 * Creates an ESA similarity metric backed by the given index.
 *
 * <p>Note that this replaces the similarity of the searcher obtained from {@code helper} with
 * a new {@code LuceneSimilarity}, so the helper's searcher is modified as a side effect.
 *
 * @param mapper concept mapper passed through to the superclass
 * @param helper index helper that supplies the reader and searcher used by this metric
 */
public ESASimilarity(ConceptMapper mapper, IndexHelper helper) {
    super(mapper, helper);
    esaHelper = helper;
    reader = helper.getReader();
    searcher = helper.getSearcher();
    searcher.setSimilarity(new LuceneSimilarity());
    setName("esa-similarity");
}
public TIntDoubleHashMap getConceptVector(String phrase, TIntSet validIds) throws IOException { synchronized (phraseCache) { if (phraseCache.containsKey(phrase)) { return phraseCache.get(phrase); } } QueryParser parser = new QueryParser(Version.LUCENE_42, "text", analyzer); TopDocs docs = null; try { docs = searcher.search(parser.parse(phrase), esaHelper.getWpIdFilter(validIds), 5000); } catch (org.apache.lucene.queryparser.classic.ParseException e) { LOG.log(Level.WARNING, "parsing of phrase " + phrase + " failed", e); return null; } pruneSimilar(docs); TIntDoubleHashMap result = expandScores(docs.scoreDocs); synchronized (phraseCache) { phraseCache.put(phrase, result); } return result; // System.out.println("top docs for " + phrase + " are:"); // for (int i = 0; i < 50 && i < docs.scoreDocs.length; i++) { // ScoreDoc sd = docs.scoreDocs[i]; // Document d = reader.document(sd.doc); // // System.out.println("\t" + sd.score + ": " + // d.get("title") + ", " + d.get("text").split("\\s+").length + // ", " + d.get("inlinks")); // } }
public static void main(String args[]) throws IOException, InterruptedException, CompressorException { if (args.length != 4 && args.length != 5) { System.err.println( "usage: java " + TextSimilarity.class.getName() + " field lucene-text-index-dir output-file num-results [num-threads]"); } IndexHelper helper = new IndexHelper(new File(args[1]), true); ESASimilarity sim = new ESASimilarity(null, helper); if (args[0].equals("links")) { sim.setMinTermFreq(1); // HACK! } int cores = (args.length == 5) ? Integer.valueOf(args[4]) : Runtime.getRuntime().availableProcessors(); PairwiseSimilarityWriter writer = new PairwiseSimilarityWriter(sim, new File(args[2])); writer.writeSims(helper.getWpIds(), cores, Integer.valueOf(args[3])); }
/**
 * Finds the pages most similar to a free-text phrase.
 *
 * <p>The phrase's concept vector is obtained from {@code getConceptVector}; its keys are
 * treated as Lucene document ids, ordered by descending score, and the first
 * {@code maxResults} are translated to Wikipedia ids.
 *
 * @param phrase     the phrase to look up
 * @param maxResults maximum number of entries in the returned list
 * @param validIds   ids passed through to {@code getConceptVector} to restrict the candidates
 * @return the normalized scores, or {@code null} if no concept vector could be built for the
 *         phrase (for example when it cannot be parsed as a query)
 * @throws IOException if the underlying search fails
 */
@Override
public DocScoreList mostSimilar(String phrase, int maxResults, TIntSet validIds) throws IOException {
    final TIntDoubleHashMap scores = getConceptVector(phrase, validIds);
    if (scores == null) {
        // getConceptVector signals an unparseable phrase with null; report "no answer" the same
        // way the id-based overload does instead of failing with a NullPointerException.
        return null;
    }

    Integer luceneIds[] = ArrayUtils.toObject(scores.keys());
    Arrays.sort(
            luceneIds,
            new Comparator<Integer>() {
                @Override
                public int compare(Integer id1, Integer id2) {
                    // Arguments are swapped to sort by descending score.
                    return Double.compare(scores.get(id2), scores.get(id1));
                }
            });

    DocScoreList result = new DocScoreList(Math.min(luceneIds.length, maxResults));
    for (int i = 0; i < result.numDocs(); i++) {
        result.set(i, esaHelper.luceneIdToWpId(luceneIds[i]), scores.get(luceneIds[i]));
    }
    return normalize(result);
}
/**
 * Computes the similarity between two pages identified by their Wikipedia ids.
 *
 * <p>A MoreLikeThis query built from the first page's document is run with a filter on the
 * "id" field that admits only the second page, so at most one hit comes back and its score is
 * the raw similarity.
 *
 * @param wpId1 Wikipedia id of the page the query is built from
 * @param wpId2 Wikipedia id of the page that is scored against the query
 * @return the normalized score of the hit, or the normalized value of zero when either page is
 *         missing from the index or the second page does not match the query
 * @throws IOException if reading the index or searching fails
 */
@Override
public double similarity(int wpId1, int wpId2) throws IOException {
    int queryDoc = esaHelper.wpIdToLuceneId(wpId1);
    int targetDoc = esaHelper.wpIdToLuceneId(wpId2);
    if (queryDoc < 0 || targetDoc < 0) {
        return normalize(0.0);
    }

    TopDocs hits = searcher.search(
            getMoreLikeThis().like(queryDoc),
            new FieldCacheTermsFilter("id", String.valueOf(wpId2)),
            1);
    ScoreDoc[] matches = hits.scoreDocs;
    if (matches.length == 0) {
        return normalize(0);
    }

    // The filter admits a single id, so the only possible hit is the target document.
    assert (matches.length == 1);
    assert (matches[0].doc == targetDoc);
    return normalize(matches[0].score);
}