/**
 * Finds the pages most similar to {@code wpId} using a Lucene MoreLikeThis query.
 *
 * <p>A cached answer is returned when {@code hasCachedMostSimilar} reports one. Otherwise the
 * query is seeded from the page's document in the ESA index or, when the page is not there,
 * from the {@code Page.FIELD_TEXT} value of its document in the text index.
 *
 * @param wpId page id to find neighbours for
 * @param maxResults maximum number of hits requested from the searcher
 * @param validIds ids used to build the search filter via {@code esaHelper.getWpIdFilter}
 * @return the normalized score list, or {@code null} when the page is found in neither index
 * @throws IOException if the underlying index access fails
 */
@Override
public DocScoreList mostSimilar(int wpId, int maxResults, TIntSet validIds) throws IOException {
  if (hasCachedMostSimilar(wpId)) {
    return getCachedMostSimilar(wpId, maxResults, validIds);
  }

  MoreLikeThis moreLikeThis = getMoreLikeThis();
  int esaDocId = esaHelper.wpIdToLuceneId(wpId);

  Query likeQuery;
  if (esaDocId >= 0) {
    // Page is in the ESA index: seed the query straight from that document.
    likeQuery = moreLikeThis.like(esaDocId);
  } else if (textHelper == null || textHelper.wpIdToLuceneId(wpId) < 0) {
    // Not resolvable through either helper.
    return null;
  } else {
    // Fall back to the stored text of the page in the text index.
    Document textDoc = textHelper.wpIdToLuceneDoc(wpId);
    String pageText = textDoc.get(Page.FIELD_TEXT);
    likeQuery = moreLikeThis.like(new StringReader(pageText), Page.FIELD_TEXT);
  }

  TopDocs similarDocs = searcher.search(likeQuery, esaHelper.getWpIdFilter(validIds), maxResults);
  pruneSimilar(similarDocs);

  // Read the hit array only after pruning, since pruning operates on similarDocs.
  ScoreDoc[] hits = similarDocs.scoreDocs;
  DocScoreList scores = new DocScoreList(hits.length);
  int slot = 0;
  for (ScoreDoc hit : hits) {
    scores.set(slot, esaHelper.luceneIdToWpId(hit.doc), hit.score);
    slot++;
  }
  return normalize(scores);
}
// LUCENE-5725 public void testMultiValues() throws Exception { MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); mlt.setAnalyzer(analyzer); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text"}); BooleanQuery query = (BooleanQuery) mlt.like( "text", new StringReader("lucene"), new StringReader("lucene release"), new StringReader("apache"), new StringReader("apache lucene")); Collection<BooleanClause> clauses = query.clauses(); assertEquals("Expected 2 clauses only!", 2, clauses.size()); for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); assertTrue( Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term)); } analyzer.close(); }
/**
 * Scores page {@code wpId2} against a MoreLikeThis query seeded from page {@code wpId1}.
 *
 * <p>The search is restricted to documents whose {@code "id"} field equals {@code wpId2} and
 * asks for a single hit, so the result is either that document's score or no hit at all.
 *
 * @param wpId1 page the query is built from
 * @param wpId2 page whose score is looked up
 * @return the normalized score; the normalized zero when either page is missing from the ESA
 *     index or the filtered search yields no hit
 * @throws IOException if the underlying index access fails
 */
@Override
public double similarity(int wpId1, int wpId2) throws IOException {
  int sourceDoc = esaHelper.wpIdToLuceneId(wpId1);
  int targetDoc = esaHelper.wpIdToLuceneId(wpId2);
  boolean bothIndexed = sourceDoc >= 0 && targetDoc >= 0;
  if (!bothIndexed) {
    return normalize(0.0);
  }

  MoreLikeThis moreLikeThis = getMoreLikeThis();
  TopDocs hits =
      searcher.search(
          moreLikeThis.like(sourceDoc),
          new FieldCacheTermsFilter("id", Integer.toString(wpId2)),
          1);

  if (hits.scoreDocs.length == 0) {
    return normalize(0);
  }
  // At most one hit was requested and the filter admits only the target page.
  assert (hits.scoreDocs.length == 1);
  assert (hits.scoreDocs[0].doc == targetDoc);
  return normalize(hits.scoreDocs[0].score);
}
private MoreLikeThis getMoreLikeThis() { MoreLikeThis mlt = new MoreLikeThis(reader); // Pass the reader reader mlt.setMaxDocFreqPct(maxPercentage); mlt.setMaxQueryTerms(maxQueryTerms); mlt.setMinDocFreq(minDocFreq); mlt.setMinTermFreq(minTermFreq); mlt.setAnalyzer(analyzer); mlt.setFieldNames(new String[] {"text"}); // specify the fields for similiarity return mlt; }
public void testTopN() throws Exception { int numDocs = 100; int topN = 25; // add series of docs with terms of decreasing df Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), dir); for (int i = 0; i < numDocs; i++) { addDoc(writer, generateStrSeq(0, i + 1)); } IndexReader reader = writer.getReader(); writer.close(); // setup MLT query MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); mlt.setAnalyzer(analyzer); mlt.setMaxQueryTerms(topN); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text"}); // perform MLT query String likeText = ""; for (String text : generateStrSeq(0, numDocs)) { likeText += text + " "; } BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText)); // check best terms are topN of highest idf Collection<BooleanClause> clauses = query.clauses(); assertEquals("Expected" + topN + "clauses only!", topN, clauses.size()); Term[] expectedTerms = new Term[topN]; int idx = 0; for (String text : generateStrSeq(numDocs - topN, topN)) { expectedTerms[idx++] = new Term("text", text); } for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); assertTrue(Arrays.asList(expectedTerms).contains(term)); } // clean up reader.close(); dir.close(); analyzer.close(); }
// LUCENE-3326 public void testMultiFields() throws Exception { MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); mlt.setAnalyzer(analyzer); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text", "foobar"}); mlt.like("foobar", new StringReader("this is a test")); analyzer.close(); }
public void testBoostFactor() throws Throwable { Map<String, Float> originalValues = getOriginalValues(); MoreLikeThis mlt = new MoreLikeThis(reader); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); mlt.setAnalyzer(analyzer); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {"text"}); mlt.setBoost(true); // this mean that every term boost factor will be multiplied by this // number float boostFactor = 5; mlt.setBoostFactor(boostFactor); BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene release")); Collection<BooleanClause> clauses = query.clauses(); assertEquals( "Expected " + originalValues.size() + " clauses.", originalValues.size(), clauses.size()); for (BooleanClause clause : clauses) { BoostQuery bq = (BoostQuery) clause.getQuery(); TermQuery tq = (TermQuery) bq.getQuery(); Float termBoost = originalValues.get(tq.getTerm().text()); assertNotNull("Expected term " + tq.getTerm().text(), termBoost); float totalBoost = termBoost * boostFactor; assertEquals( "Expected boost of " + totalBoost + " for term '" + tq.getTerm().text() + "' got " + bq.getBoost(), totalBoost, bq.getBoost(), 0.0001); } analyzer.close(); }
/**
 * Builds the boosted MoreLikeThis query for {@code "lucene release"} on the {@code "text"}
 * field without setting a boost factor, and records each term's boost.
 *
 * @return a map from term text to the boost of its {@link BoostQuery} clause
 * @throws IOException if building the query fails
 */
private Map<String, Float> getOriginalValues() throws IOException {
  Map<String, Float> originalValues = new HashMap<>();
  MoreLikeThis mlt = new MoreLikeThis(reader);
  // try-with-resources: the analyzer is released even when like(...) or a cast below throws.
  try (Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)) {
    mlt.setAnalyzer(analyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});
    mlt.setBoost(true);

    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene release"));
    for (BooleanClause clause : query.clauses()) {
      BoostQuery bq = (BoostQuery) clause.getQuery();
      TermQuery tq = (TermQuery) bq.getQuery();
      originalValues.put(tq.getTerm().text(), bq.getBoost());
    }
    return originalValues;
  }
}
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-7161") public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception { IndexReader reader = null; Directory dir = newDirectory(); Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); try { int maxQueryTerms = 25; String[] itShopItemForSale = new String[] { "watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers" }; String[] itShopItemNotForSale = new String[] {"tie", "trousers", "shoes", "skirt", "hat"}; String[] clothesShopItemForSale = new String[] {"tie", "trousers", "shoes", "skirt", "hat"}; String[] clothesShopItemNotForSale = new String[] { "watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers" }; // add series of shop docs RandomIndexWriter writer = new RandomIndexWriter(random(), dir); for (int i = 0; i < 300; i++) { addShopDoc(writer, "it", itShopItemForSale, itShopItemNotForSale); } for (int i = 0; i < 300; i++) { addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale); } // Input Document is a clothes shop int inputDocId = addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale); reader = writer.getReader(); writer.close(); // setup MLT query MoreLikeThis mlt = new MoreLikeThis(reader); mlt.setAnalyzer(analyzer); mlt.setMaxQueryTerms(maxQueryTerms); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] {FOR_SALE, NOT_FOR_SALE}); // perform MLT query BooleanQuery query = (BooleanQuery) mlt.like(inputDocId); Collection<BooleanClause> clauses = query.clauses(); Collection<BooleanClause> expectedClothesShopClauses = new ArrayList<BooleanClause>(); for (String itemForSale : clothesShopItemForSale) { BooleanClause booleanClause = new BooleanClause( new TermQuery(new Term(FOR_SALE, itemForSale)), BooleanClause.Occur.SHOULD); expectedClothesShopClauses.add(booleanClause); } for (String 
itemNotForSale : clothesShopItemNotForSale) { BooleanClause booleanClause = new BooleanClause( new TermQuery(new Term(NOT_FOR_SALE, itemNotForSale)), BooleanClause.Occur.SHOULD); expectedClothesShopClauses.add(booleanClause); } for (BooleanClause expectedClause : expectedClothesShopClauses) { assertTrue(clauses.contains(expectedClause)); } } finally { // clean up if (reader != null) { reader.close(); } dir.close(); analyzer.close(); } }