@Override
 public DocScoreList mostSimilar(int wpId, int maxResults, TIntSet validIds) throws IOException {
   if (hasCachedMostSimilar(wpId)) {
     return getCachedMostSimilar(wpId, maxResults, validIds);
   }
   MoreLikeThis mlt = getMoreLikeThis();
   int luceneId = esaHelper.wpIdToLuceneId(wpId);
   Query query;
   if (luceneId >= 0) {
     query = mlt.like(luceneId);
   } else if (textHelper != null && textHelper.wpIdToLuceneId(wpId) >= 0) {
     Document d = textHelper.wpIdToLuceneDoc(wpId);
     String text = d.get(Page.FIELD_TEXT);
     query = mlt.like(new StringReader(text), Page.FIELD_TEXT);
   } else {
     return null;
   }
   TopDocs similarDocs = searcher.search(query, esaHelper.getWpIdFilter(validIds), maxResults);
   pruneSimilar(similarDocs);
   DocScoreList scores = new DocScoreList(similarDocs.scoreDocs.length);
   for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
     ScoreDoc sd = similarDocs.scoreDocs[i];
     scores.set(i, esaHelper.luceneIdToWpId(sd.doc), similarDocs.scoreDocs[i].score);
   }
   return normalize(scores);
 }
Пример #2
0
  // LUCENE-5725
  public void testMultiValues() throws Exception {
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});

    BooleanQuery query =
        (BooleanQuery)
            mlt.like(
                "text",
                new StringReader("lucene"),
                new StringReader("lucene release"),
                new StringReader("apache"),
                new StringReader("apache lucene"));
    Collection<BooleanClause> clauses = query.clauses();
    assertEquals("Expected 2 clauses only!", 2, clauses.size());
    for (BooleanClause clause : clauses) {
      Term term = ((TermQuery) clause.getQuery()).getTerm();
      assertTrue(
          Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term));
    }
    analyzer.close();
  }
  @Override
  public double similarity(int wpId1, int wpId2) throws IOException {
    int doc1 = esaHelper.wpIdToLuceneId(wpId1);
    int doc2 = esaHelper.wpIdToLuceneId(wpId2);

    if (doc1 < 0 || doc2 < 0) {
      return normalize(0.0);
    }

    MoreLikeThis mlt = getMoreLikeThis();
    TopDocs similarDocs =
        searcher.search(mlt.like(doc1), new FieldCacheTermsFilter("id", "" + wpId2), 1);
    if (similarDocs.scoreDocs.length == 0) {
      return normalize(0);
    } else {
      assert (similarDocs.scoreDocs.length == 1);
      assert (similarDocs.scoreDocs[0].doc == doc2);
      return normalize(similarDocs.scoreDocs[0].score);
    }
  }
 private MoreLikeThis getMoreLikeThis() {
   MoreLikeThis mlt = new MoreLikeThis(reader); // Pass the reader reader
   mlt.setMaxDocFreqPct(maxPercentage);
   mlt.setMaxQueryTerms(maxQueryTerms);
   mlt.setMinDocFreq(minDocFreq);
   mlt.setMinTermFreq(minTermFreq);
   mlt.setAnalyzer(analyzer);
   mlt.setFieldNames(new String[] {"text"}); // specify the fields for similiarity
   return mlt;
 }
Пример #5
0
  public void testTopN() throws Exception {
    int numDocs = 100;
    int topN = 25;

    // add series of docs with terms of decreasing df
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, generateStrSeq(0, i + 1));
    }
    IndexReader reader = writer.getReader();
    writer.close();

    // setup MLT query
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMaxQueryTerms(topN);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});

    // perform MLT query
    String likeText = "";
    for (String text : generateStrSeq(0, numDocs)) {
      likeText += text + " ";
    }
    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));

    // check best terms are topN of highest idf
    Collection<BooleanClause> clauses = query.clauses();
    assertEquals("Expected" + topN + "clauses only!", topN, clauses.size());

    Term[] expectedTerms = new Term[topN];
    int idx = 0;
    for (String text : generateStrSeq(numDocs - topN, topN)) {
      expectedTerms[idx++] = new Term("text", text);
    }
    for (BooleanClause clause : clauses) {
      Term term = ((TermQuery) clause.getQuery()).getTerm();
      assertTrue(Arrays.asList(expectedTerms).contains(term));
    }

    // clean up
    reader.close();
    dir.close();
    analyzer.close();
  }
Пример #6
0
 // LUCENE-3326
 public void testMultiFields() throws Exception {
   MoreLikeThis mlt = new MoreLikeThis(reader);
   Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
   mlt.setAnalyzer(analyzer);
   mlt.setMinDocFreq(1);
   mlt.setMinTermFreq(1);
   mlt.setMinWordLen(1);
   mlt.setFieldNames(new String[] {"text", "foobar"});
   mlt.like("foobar", new StringReader("this is a test"));
   analyzer.close();
 }
Пример #7
0
  public void testBoostFactor() throws Throwable {
    Map<String, Float> originalValues = getOriginalValues();

    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});
    mlt.setBoost(true);

    // this mean that every term boost factor will be multiplied by this
    // number
    float boostFactor = 5;
    mlt.setBoostFactor(boostFactor);

    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene release"));
    Collection<BooleanClause> clauses = query.clauses();

    assertEquals(
        "Expected " + originalValues.size() + " clauses.", originalValues.size(), clauses.size());

    for (BooleanClause clause : clauses) {
      BoostQuery bq = (BoostQuery) clause.getQuery();
      TermQuery tq = (TermQuery) bq.getQuery();
      Float termBoost = originalValues.get(tq.getTerm().text());
      assertNotNull("Expected term " + tq.getTerm().text(), termBoost);

      float totalBoost = termBoost * boostFactor;
      assertEquals(
          "Expected boost of "
              + totalBoost
              + " for term '"
              + tq.getTerm().text()
              + "' got "
              + bq.getBoost(),
          totalBoost,
          bq.getBoost(),
          0.0001);
    }
    analyzer.close();
  }
Пример #8
0
  private Map<String, Float> getOriginalValues() throws IOException {
    Map<String, Float> originalValues = new HashMap<>();
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});
    mlt.setBoost(true);
    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene release"));
    Collection<BooleanClause> clauses = query.clauses();

    for (BooleanClause clause : clauses) {
      BoostQuery bq = (BoostQuery) clause.getQuery();
      TermQuery tq = (TermQuery) bq.getQuery();
      originalValues.put(tq.getTerm().text(), bq.getBoost());
    }
    analyzer.close();
    return originalValues;
  }
Пример #9
0
  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-7161")
  public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception {
    IndexReader reader = null;
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    try {
      int maxQueryTerms = 25;

      String[] itShopItemForSale =
          new String[] {
            "watch",
            "ipod",
            "asrock",
            "imac",
            "macbookpro",
            "monitor",
            "keyboard",
            "mouse",
            "speakers"
          };
      String[] itShopItemNotForSale = new String[] {"tie", "trousers", "shoes", "skirt", "hat"};

      String[] clothesShopItemForSale = new String[] {"tie", "trousers", "shoes", "skirt", "hat"};
      String[] clothesShopItemNotForSale =
          new String[] {
            "watch",
            "ipod",
            "asrock",
            "imac",
            "macbookpro",
            "monitor",
            "keyboard",
            "mouse",
            "speakers"
          };

      // add series of shop docs
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
      for (int i = 0; i < 300; i++) {
        addShopDoc(writer, "it", itShopItemForSale, itShopItemNotForSale);
      }
      for (int i = 0; i < 300; i++) {
        addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
      }
      // Input Document is a clothes shop
      int inputDocId =
          addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
      reader = writer.getReader();
      writer.close();

      // setup MLT query
      MoreLikeThis mlt = new MoreLikeThis(reader);

      mlt.setAnalyzer(analyzer);
      mlt.setMaxQueryTerms(maxQueryTerms);
      mlt.setMinDocFreq(1);
      mlt.setMinTermFreq(1);
      mlt.setMinWordLen(1);
      mlt.setFieldNames(new String[] {FOR_SALE, NOT_FOR_SALE});

      // perform MLT query
      BooleanQuery query = (BooleanQuery) mlt.like(inputDocId);
      Collection<BooleanClause> clauses = query.clauses();

      Collection<BooleanClause> expectedClothesShopClauses = new ArrayList<BooleanClause>();
      for (String itemForSale : clothesShopItemForSale) {
        BooleanClause booleanClause =
            new BooleanClause(
                new TermQuery(new Term(FOR_SALE, itemForSale)), BooleanClause.Occur.SHOULD);
        expectedClothesShopClauses.add(booleanClause);
      }
      for (String itemNotForSale : clothesShopItemNotForSale) {
        BooleanClause booleanClause =
            new BooleanClause(
                new TermQuery(new Term(NOT_FOR_SALE, itemNotForSale)), BooleanClause.Occur.SHOULD);
        expectedClothesShopClauses.add(booleanClause);
      }

      for (BooleanClause expectedClause : expectedClothesShopClauses) {
        assertTrue(clauses.contains(expectedClause));
      }
    } finally {
      // clean up
      if (reader != null) {
        reader.close();
      }
      dir.close();
      analyzer.close();
    }
  }