@Override
 /**
  * Finds the pages most similar to the page identified by {@code wpId}.
  *
  * <p>A cached result is returned when one exists. Otherwise a "more like this" query is built
  * from the page's document in the ESA index or, if the page is absent there, from the page
  * text stored in the optional text index. The hits are pruned, mapped back to page ids and
  * normalized.
  *
  * @param wpId id of the page to find neighbours for
  * @param maxResults maximum number of hits requested from the searcher
  * @param validIds passed to {@code esaHelper.getWpIdFilter} to restrict the candidate pages
  * @return the normalized score list, or {@code null} if the page is in neither index
  * @throws IOException if the underlying index access fails
  */
 public DocScoreList mostSimilar(int wpId, int maxResults, TIntSet validIds) throws IOException {
   if (hasCachedMostSimilar(wpId)) {
     return getCachedMostSimilar(wpId, maxResults, validIds);
   }

   final MoreLikeThis moreLikeThis = getMoreLikeThis();
   final int esaDocId = esaHelper.wpIdToLuceneId(wpId);
   final Query likeQuery;
   if (esaDocId >= 0) {
     // The page is indexed in the ESA index: query directly by its Lucene document.
     likeQuery = moreLikeThis.like(esaDocId);
   } else {
     boolean inTextIndex = textHelper != null && textHelper.wpIdToLuceneId(wpId) >= 0;
     if (!inTextIndex) {
       return null;
     }
     // Fall back to the raw page text held by the text index.
     Document page = textHelper.wpIdToLuceneDoc(wpId);
     String pageText = page.get(Page.FIELD_TEXT);
     likeQuery = moreLikeThis.like(new StringReader(pageText), Page.FIELD_TEXT);
   }

   TopDocs hits = searcher.search(likeQuery, esaHelper.getWpIdFilter(validIds), maxResults);
   pruneSimilar(hits);

   // Read the hit array only after pruning, in case pruning replaces it.
   ScoreDoc[] hitDocs = hits.scoreDocs;
   DocScoreList scores = new DocScoreList(hitDocs.length);
   int rank = 0;
   for (ScoreDoc hit : hitDocs) {
     scores.set(rank++, esaHelper.luceneIdToWpId(hit.doc), hit.score);
   }
   return normalize(scores);
 }
 /**
  * Returns the index registered under {@code name}, loading and registering it first if the
  * environment does not already hold it.
  *
  * <p>The index directory is {@code name} inside the "outputDir" directory of the indexes
  * configuration. Optional "similarity" and "analyzer" settings of the index configuration are
  * applied to the new helper; the only value recognized for either is "ESA".
  *
  * @param name the name of the index to load
  * @return the helper for the named index
  * @throws IOException if the index cannot be opened
  * @throws ConfigurationException if the configured similarity or analyzer type is unknown
  *     (the configuration lookups may presumably raise it as well)
  */
 private IndexHelper loadIndex(String name) throws IOException, ConfigurationException {
   // Already loaded: hand back the registered helper.
   if (env.hasIndex(name)) {
     return env.getIndex(name);
   }

   info("loading index " + name);
   JSONObject indexConfig = configuration.getIndex(name);
   File outputDir = requireDirectory(configuration.getIndexes(), "outputDir");
   File indexDir = new File(outputDir, name);
   IndexHelper loaded = new IndexHelper(indexDir, true);

   if (indexConfig.containsKey("similarity")) {
     String similarityType = requireString(indexConfig, "similarity");
     if (!similarityType.equals("ESA")) {
       throw new ConfigurationException("unknown similarity type: " + similarityType);
     }
     loaded.getSearcher().setSimilarity(new ESASimilarity.LuceneSimilarity());
   }

   if (indexConfig.containsKey("analyzer")) {
     String analyzerType = requireString(indexConfig, "analyzer");
     if (!analyzerType.equals("ESA")) {
       throw new ConfigurationException("unknown analyzer type: " + analyzerType);
     }
     loaded.setAnalyzer(new ESAAnalyzer());
   }

   env.addIndex(name, loaded);
   return loaded;
 }
 /**
  * Creates an ESA similarity metric on top of the given index.
  *
  * <p>The helper's reader and searcher are kept for later queries, the searcher is switched to
  * {@link LuceneSimilarity} scoring, and the metric is named "esa-similarity".
  *
  * @param mapper concept mapper forwarded to the superclass
  * @param helper index helper providing the reader and searcher
  */
 public ESASimilarity(ConceptMapper mapper, IndexHelper helper) {
   super(mapper, helper);
   esaHelper = helper;
   reader = helper.getReader();
   searcher = helper.getSearcher();
   // Note: this changes the similarity on the searcher owned by the helper.
   searcher.setSimilarity(new LuceneSimilarity());
   setName("esa-similarity");
 }
 /**
  * Builds (or fetches from the phrase cache) the concept vector for a phrase.
  *
  * <p>The phrase is parsed as a query against the "text" field, up to 5000 hits are retrieved,
  * pruned and expanded into a score map, and the map is cached under the phrase.
  *
  * @param phrase the phrase to look up; it is handed to the query parser as-is
  * @param validIds passed to {@code esaHelper.getWpIdFilter} to restrict the candidate pages
  * @return the concept vector, or {@code null} if the phrase cannot be parsed as a query
  * @throws IOException if the index search fails
  */
 public TIntDoubleHashMap getConceptVector(String phrase, TIntSet validIds) throws IOException {
   // NOTE(review): the cache key is the phrase alone, while the search below is filtered by
   // validIds; presumably callers use a consistent validIds per instance -- confirm.
   synchronized (phraseCache) {
     if (phraseCache.containsKey(phrase)) {
       return phraseCache.get(phrase);
     }
   }

   QueryParser queryParser = new QueryParser(Version.LUCENE_42, "text", analyzer);
   final TopDocs hits;
   try {
     hits = searcher.search(queryParser.parse(phrase), esaHelper.getWpIdFilter(validIds), 5000);
   } catch (org.apache.lucene.queryparser.classic.ParseException e) {
     LOG.log(Level.WARNING, "parsing of phrase " + phrase + " failed", e);
     return null;
   }

   pruneSimilar(hits);
   TIntDoubleHashMap vector = expandScores(hits.scoreDocs);
   synchronized (phraseCache) {
     phraseCache.put(phrase, vector);
   }
   return vector;
 }
 /**
  * Command-line entry point: writes pairwise ESA similarities for every page in an index.
  *
  * <p>Arguments: {@code field lucene-text-index-dir output-file num-results [num-threads]}.
  * When {@code field} is "links" the minimum term frequency is lowered to 1. If the thread
  * count is omitted, the number of available processors is used.
  *
  * @param args the command-line arguments described above
  * @throws IOException if the index or output file cannot be accessed
  * @throws InterruptedException declared for the similarity writer
  * @throws CompressorException declared for the similarity writer
  */
 public static void main(String args[])
     throws IOException, InterruptedException, CompressorException {
   if (args.length != 4 && args.length != 5) {
     System.err.println(
         "usage: java "
             + ESASimilarity.class.getName()
             + " field lucene-text-index-dir output-file num-results [num-threads]");
     // Stop here: continuing would fail on the missing arguments below.
     System.exit(1);
   }
   IndexHelper helper = new IndexHelper(new File(args[1]), true);
   ESASimilarity sim = new ESASimilarity(null, helper);
   if (args[0].equals("links")) {
     sim.setMinTermFreq(1); // HACK!
   }
   int cores =
       (args.length == 5) ? Integer.parseInt(args[4]) : Runtime.getRuntime().availableProcessors();
   int numResults = Integer.parseInt(args[3]);
   PairwiseSimilarityWriter writer = new PairwiseSimilarityWriter(sim, new File(args[2]));
   writer.writeSims(helper.getWpIds(), cores, numResults);
 }
 /**
  * Finds the pages most similar to a free-text phrase.
  *
  * <p>The phrase's concept vector is sorted by descending score and the top
  * {@code maxResults} entries are mapped to page ids and normalized.
  *
  * @param phrase the phrase to compare against the index
  * @param maxResults maximum number of entries in the returned list
  * @param validIds forwarded to {@link #getConceptVector(String, TIntSet)}
  * @return the normalized score list, or {@code null} if no concept vector could be built for
  *     the phrase (for example because it could not be parsed as a query)
  * @throws IOException if the index search fails
  */
 @Override
 public DocScoreList mostSimilar(String phrase, int maxResults, TIntSet validIds)
     throws IOException {
   final TIntDoubleHashMap scores = getConceptVector(phrase, validIds);
   if (scores == null) {
     // getConceptVector reports an unparseable phrase as null; propagate that instead of
     // failing with a NullPointerException.
     return null;
   }
   Integer luceneIds[] = ArrayUtils.toObject(scores.keys());
   Arrays.sort(
       luceneIds,
       new Comparator<Integer>() {
         @Override
         public int compare(Integer id1, Integer id2) {
           // Arguments are swapped to sort by descending score.
           return Double.compare(scores.get(id2), scores.get(id1));
         }
       });
   DocScoreList result = new DocScoreList(Math.min(luceneIds.length, maxResults));
   for (int i = 0; i < result.numDocs(); i++) {
     result.set(i, esaHelper.luceneIdToWpId(luceneIds[i]), scores.get(luceneIds[i]));
   }
   return normalize(result);
 }
  /**
   * Scores how similar the second page is to the first.
   *
   * <p>A "more like this" query is built from the first page's document and run with a filter
   * on the "id" field set to the second page's id, asking for a single hit. The hit's score
   * (or zero when there is no hit or either page is not in the index) is passed through
   * {@code normalize}.
   *
   * @param wpId1 id of the page the query is built from
   * @param wpId2 id of the page whose score is looked up
   * @return the normalized similarity score
   * @throws IOException if the index search fails
   */
  @Override
  public double similarity(int wpId1, int wpId2) throws IOException {
    final int luceneId1 = esaHelper.wpIdToLuceneId(wpId1);
    final int luceneId2 = esaHelper.wpIdToLuceneId(wpId2);

    // Either page missing from the index: treat as no similarity.
    if (luceneId1 < 0 || luceneId2 < 0) {
      return normalize(0.0);
    }

    MoreLikeThis moreLikeThis = getMoreLikeThis();
    TopDocs matches =
        searcher.search(
            moreLikeThis.like(luceneId1),
            new FieldCacheTermsFilter("id", String.valueOf(wpId2)),
            1);

    if (matches.scoreDocs.length != 0) {
      // The filter admits only the second page, so the single hit must be that document.
      assert (matches.scoreDocs.length == 1);
      assert (matches.scoreDocs[0].doc == luceneId2);
      return normalize(matches.scoreDocs[0].score);
    }
    return normalize(0);
  }