@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
  if (!fieldType().indexed()) {
    return null;
  }

  final NumericType numericType = fieldType().numericType();
  if (numericType != null) {
    if (!(reuse instanceof NumericTokenStream
        && ((NumericTokenStream) reuse).getPrecisionStep() == type.numericPrecisionStep())) {
      // lazy init the TokenStream as it is heavy to instantiate
      // (attributes,...) if not needed (stored field loading)
      reuse = new NumericTokenStream(type.numericPrecisionStep());
    }
    final NumericTokenStream nts = (NumericTokenStream) reuse;
    // initialize value in TokenStream
    final Number val = (Number) fieldsData;
    switch (numericType) {
      case INT:
        nts.setIntValue(val.intValue());
        break;
      case LONG:
        nts.setLongValue(val.longValue());
        break;
      case FLOAT:
        nts.setFloatValue(val.floatValue());
        break;
      case DOUBLE:
        nts.setDoubleValue(val.doubleValue());
        break;
      default:
        throw new AssertionError("Should never get here");
    }
    return reuse;
  }

  if (!fieldType().tokenized()) {
    if (stringValue() == null) {
      throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
    }
    if (!(reuse instanceof StringTokenStream)) {
      // lazy init the TokenStream as it is heavy to instantiate
      // (attributes,...) if not needed (stored field loading)
      reuse = new StringTokenStream();
    }
    ((StringTokenStream) reuse).setValue(stringValue());
    return reuse;
  }

  if (tokenStream != null) {
    return tokenStream;
  } else if (readerValue() != null) {
    return analyzer.tokenStream(name(), readerValue());
  } else if (stringValue() != null) {
    return analyzer.tokenStream(name(), stringValue());
  }

  throw new IllegalArgumentException(
      "Field must have either TokenStream, String, Reader or Number value; got " + this);
}
public void testWithStemExclusionSet() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("پیاوە");
  Analyzer a = new SoraniAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "پیاوە", new String[] {"پیاوە"});
  a.close();
}
public void test() throws Exception {
  final CharArraySet cas = new CharArraySet(3, false);
  cas.add("jjp");
  cas.add("wlmwoknt");
  cas.add("tcgyreo");

  final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("mtqlpi", "");
  builder.add("mwoknt", "jjp");
  builder.add("tcgyreo", "zpfpajyws");
  final NormalizeCharMap map = builder.build();

  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
          TokenFilter f = new CommonGramsFilter(t, cas);
          return new TokenStreamComponents(t, f);
        }

        @Override
        protected Reader initReader(String fieldName, Reader reader) {
          reader = new MockCharFilter(reader, 0);
          reader = new MappingCharFilter(map, reader);
          reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
          return reader;
        }
      };
  checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
  a.close();
}
@Test
public void simpleTest() throws IOException {
  Analyzer analyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer t = new WhitespaceTokenizer(Lucene.VERSION, reader);
          return new TokenStreamComponents(t, new TruncateTokenFilter(t, 3));
        }
      };

  TokenStream test = analyzer.tokenStream("test", "a bb ccc dddd eeeee");
  CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
  test.reset();

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("a"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("bb"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("ccc"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("ddd"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("eee"));

  assertThat(test.incrementToken(), equalTo(false));

  // release the stream once it is fully consumed
  test.end();
  test.close();
}
@Override
public void close() {
  if (stopped.compareAndSet(false, true)) { // make sure we only stop once
    try {
      worker.close();
    } catch (Exception e) {
      log.workerException(e);
    }
    this.allIndexesManager.stop();
    this.timingSource.stop();
    serviceManager.releaseAllServices();
    for (Analyzer an : this.analyzers.values()) {
      an.close();
    }
    for (AbstractDocumentBuilder documentBuilder : this.documentBuildersContainedEntities.values()) {
      documentBuilder.close();
    }
    for (EntityIndexBinding entityIndexBinding : this.indexBindingForEntities.values()) {
      entityIndexBinding.getDocumentBuilder().close();
    }
    // unregister statistic mbean
    if (statisticsMBeanName != null) {
      JMXRegistrar.unRegisterMBean(statisticsMBeanName);
    }
  }
}
// LUCENE-5725
public void testMultiValues() throws Exception {
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[] {"text"});

  BooleanQuery query =
      (BooleanQuery)
          mlt.like(
              "text",
              new StringReader("lucene"),
              new StringReader("lucene release"),
              new StringReader("apache"),
              new StringReader("apache lucene"));
  Collection<BooleanClause> clauses = query.clauses();
  assertEquals("Expected 2 clauses only!", 2, clauses.size());
  for (BooleanClause clause : clauses) {
    Term term = ((TermQuery) clause.getQuery()).getTerm();
    assertTrue(
        Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term));
  }
  analyzer.close();
}
/** test use of exclusion set */ public void testExclude() throws IOException { CharArraySet exclusionSet = new CharArraySet(asSet("llengües"), false); Analyzer a = new CatalanAnalyzer(CatalanAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTerm(a, "llengües", "llengües"); checkOneTerm(a, "llengua", "llengu"); a.close(); }
@Test
public void testCase2() throws Exception {
  StringReader reader = new StringReader("고속도로");

  nouns.add(getToken("고속도로", 0, 4));
  nouns.add(getToken("고속도", 0, 3));
  nouns.add(getToken("고속", 0, 2));
  nouns.add(getToken("속도", 1, 3));
  nouns.add(getToken("고", 0, 1));

  Analyzer analyzer = new KoreanAnalyzer();
  TokenStream stream = analyzer.reusableTokenStream("dummy", reader);
  CharTermAttribute charTermAtt = stream.getAttribute(CharTermAttribute.class);
  OffsetAttribute offSetAtt = stream.getAttribute(OffsetAttribute.class);

  while (stream.incrementToken()) {
    TestToken t =
        getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
    System.out.println("termAtt.term() : " + charTermAtt.toString());
    System.out.println("offSetAtt : " + offSetAtt.startOffset());
    System.out.println("offSetAtt : " + offSetAtt.endOffset());
    Assert.assertTrue(nouns.contains(t));
  }
}
/** test use of exclusion set */ public void testExclude() throws IOException { CharArraySet exclusionSet = new CharArraySet(asSet("chicano"), false); Analyzer a = new SpanishAnalyzer(SpanishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTerm(a, "chicana", "chican"); checkOneTerm(a, "chicano", "chicano"); a.close(); }
/** blast some random strings through the analyzer */
@Test
public void testRandomStrings() throws Exception {
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer =
              new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
          return new TokenStreamComponents(tokenizer);
        }
      };
  checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
  a.close();

  Analyzer b =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer =
              new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
          return new TokenStreamComponents(tokenizer);
        }
      };
  checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
  b.close();
}
public void testCuriousWikipediaString() throws Exception {
  final CharArraySet protWords =
      new CharArraySet(
          new HashSet<>(
              Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")),
          false);
  final byte table[] =
      new byte[] {
        -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63, 5,
        28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106, -22, -51,
        65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71, -98, 44, 33,
        86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104, -24, 106, -16,
        126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64, -119, 0, 92, 94,
        -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20
      };
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new WikipediaTokenizer();
          TokenStream stream = new SopTokenFilter(tokenizer);
          stream = new WordDelimiterFilter(stream, table, -50, protWords);
          stream = new SopTokenFilter(stream);
          return new TokenStreamComponents(tokenizer, stream);
        }
      };
  checkAnalysisConsistency(random(), a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
  a.close();
}
/** test use of elision filter */
public void testContractions() throws IOException {
  Analyzer a = new CatalanAnalyzer();
  assertAnalyzesTo(
      a,
      "Diccionari de l'Institut d'Estudis Catalans",
      new String[] {"diccion", "inst", "estud", "catalan"});
  a.close();
}
private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
  if (Resources.debug) {
    Resources.LOGGER.debug("TokenParser:" + input);
    Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
  }

  TokenStream tokenStream = analyzer.tokenStream("input", input);
  OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute positionIncrementAttribute =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
  tokenStream.reset();

  int position = 0;
  List<TermInfo> infos = new ArrayList<TermInfo>();
  while (tokenStream.incrementToken()) {
    int increment = positionIncrementAttribute.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
      if (Resources.debug) {
        Resources.LOGGER.debug(position + ":");
      }
    }
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
    TermInfo info = new TermInfo();
    info.setStart(startOffset);
    info.setEnd(endOffset);
    infos.add(info);
    if (Resources.debug) {
      Resources.LOGGER.debug(
          "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
    }
  }
  tokenStream.end();
  tokenStream.close();

  Stack<TermInfo> tiStack = groupTokenInfos(infos);
  List<String> terms = new ArrayList<String>();
  while (!tiStack.isEmpty()) {
    TermInfo termInfo = tiStack.pop();
    if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
      String term = input.substring(termInfo.getStart(), termInfo.getEnd());
      terms.add(term);
    }
  }
  return terms.toArray(new String[] {});
}
public void cleanText(String... inboundTexts) {
  try {
    final List<String> fields = Lists.newArrayList();
    for (String raw : inboundTexts) {
      // Tidy t = new Tidy();
      // t.setErrout(new PrintWriter(new ByteArrayOutputStream()));
      // StringWriter out = new StringWriter();
      // t.parse(new StringReader(raw), out);
      // String tidied = out.getBuffer().toString();
      // logger.debug("{}",tidied);
      // AutoDetectParser p = new AutoDetectParser();
      // p.parse(new ByteArrayInputStream(raw.getBytes()),
      //     new TextContentHandler(new DefaultHandler() {
      //       @Override
      //       public void characters(char[] ch, int start, int length) throws SAXException {
      //         CharBuffer buf = CharBuffer.wrap(ch, start, length);
      //         String s = buf.toString();
      //         logger.debug("{}",s);
      //         fields.add(s);
      //       }
      //     }), new Metadata());
    }

    Analyzer analyzer = new StandardAnalyzer();
    // String joinedFields = Joiner.on(" ").join(fields).replaceAll("\\s+", " ");
    String joinedFields = Joiner.on(" ").join(inboundTexts).replaceAll("\\s+", " ");
    logger.debug("{}", joinedFields);

    StringReader in = new StringReader(joinedFields);
    TokenStream ts = analyzer.tokenStream("content", in);
    // wrap with the LowerCaseFilter before reset() so the whole chain is reset together
    ts = new LowerCaseFilter(ts);
    ts.reset();
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    List<String> words = Lists.newArrayList();
    while (ts.incrementToken()) {
      char[] termBuffer = termAtt.buffer();
      int termLen = termAtt.length();
      String w = new String(termBuffer, 0, termLen);
      words.add(w);
    }
    ts.end();
    ts.close();
    analyzer.close();

    scrubbedWords = new ArrayList<String>();
    for (String word : words) {
      if (word.length() >= MINWORDLEN && !stopwords.contains(word)) {
        scrubbedWords.add(word);
      } else {
        logger.debug("Ignoring word: {}", word);
      }
    }
    // this.scrubbedWords = words;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
// Handle additional arguments...
protected void setArgs(IndexSchema schema, Map<String, String> args) {
  // default to STORED, INDEXED, OMIT_TF_POSITIONS and MULTIVALUED depending on schema version
  properties = (STORED | INDEXED);
  float schemaVersion = schema.getVersion();
  if (schemaVersion < 1.1f) properties |= MULTIVALUED;
  if (schemaVersion > 1.1f) properties |= OMIT_TF_POSITIONS;
  if (schemaVersion < 1.3) {
    args.remove("compressThreshold");
  }
  if (schemaVersion >= 1.6f) properties |= USE_DOCVALUES_AS_STORED;

  this.args = Collections.unmodifiableMap(args);
  Map<String, String> initArgs = new HashMap<>(args);
  initArgs.remove(CLASS_NAME); // consume the class arg

  trueProperties = FieldProperties.parseProperties(initArgs, true, false);
  falseProperties = FieldProperties.parseProperties(initArgs, false, false);

  properties &= ~falseProperties;
  properties |= trueProperties;

  for (String prop : FieldProperties.propertyNames) initArgs.remove(prop);

  init(schema, initArgs);

  String positionInc = initArgs.get(POSITION_INCREMENT_GAP);
  if (positionInc != null) {
    Analyzer analyzer = getIndexAnalyzer();
    if (analyzer instanceof SolrAnalyzer) {
      ((SolrAnalyzer) analyzer).setPositionIncrementGap(Integer.parseInt(positionInc));
    } else {
      throw new RuntimeException(
          "Can't set " + POSITION_INCREMENT_GAP + " on custom analyzer " + analyzer.getClass());
    }
    analyzer = getQueryAnalyzer();
    if (analyzer instanceof SolrAnalyzer) {
      ((SolrAnalyzer) analyzer).setPositionIncrementGap(Integer.parseInt(positionInc));
    } else {
      throw new RuntimeException(
          "Can't set " + POSITION_INCREMENT_GAP + " on custom analyzer " + analyzer.getClass());
    }
    initArgs.remove(POSITION_INCREMENT_GAP);
  }

  this.postingsFormat = initArgs.remove(POSTINGS_FORMAT);
  this.docValuesFormat = initArgs.remove(DOC_VALUES_FORMAT);

  if (initArgs.size() > 0) {
    throw new RuntimeException(
        "schema fieldtype "
            + typeName
            + "("
            + this.getClass().getName()
            + ")"
            + " invalid arguments:"
            + initArgs);
  }
}
/** test stopwords and stemming */
public void testBasics() throws IOException {
  Analyzer a = new SpanishAnalyzer();
  // stemming
  checkOneTerm(a, "chicana", "chican");
  checkOneTerm(a, "chicano", "chican");
  // stopword
  assertAnalyzesTo(a, "los", new String[] {});
  a.close();
}
@Test @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-3869") public void testRandomStrings() throws Exception { Analyzer analyzer = new UIMABaseAnalyzer( "/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", null); checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER); analyzer.close(); }
/** test stopwords and stemming */
public void testBasics() throws IOException {
  Analyzer a = new CatalanAnalyzer();
  // stemming
  checkOneTerm(a, "llengües", "llengu");
  checkOneTerm(a, "llengua", "llengu");
  // stopword
  assertAnalyzesTo(a, "un", new String[] {});
  a.close();
}
public static void main(String[] args) {
  server = new HttpSolrServer(DEFAULT_URL);
  Random random = new Random(100);
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
  readFileByLines(filename, analyzer, random);
  analyzer.close();
  server = null;
  System.runFinalization();
  System.gc();
}
@Test @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-3869") public void testRandomStringsWithConfigurationParameters() throws Exception { Map<String, Object> cp = new HashMap<>(); cp.put("line-end", "\r"); Analyzer analyzer = new UIMABaseAnalyzer( "/uima/TestWSTokenizerAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", cp); checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER); analyzer.close(); }
/**
 * Announcements adapter: builds a highlighted search result for a single announce.
 *
 * @param hg highlighter used to extract the best fragments
 * @param a analyzer used to tokenize the announce fields
 * @param ann the announce to highlight
 * @return the highlighted search result wrapper
 * @throws Exception if highlighting fails
 */
public SearchResultWH makeHW(Highlighter hg, Analyzer a, Announce ann) throws Exception {
  String s = "";
  {
    String text = ann.getITopDescription() + "";
    TokenStream tokenStream = a.tokenStream("topdescription", new StringReader(text));
    // "Совпадения в заголовке объявления" = matches in the announce title
    s +=
        cP(
            "Совпадения в заголовке объявления",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
  }
  {
    String text = ann.getIDescription() + "";
    TokenStream tokenStream = a.tokenStream("description", new StringReader(text));
    // "Совпадения в тексте объявления" = matches in the announce body text
    s +=
        cP(
            "Совпадения в тексте объявления",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
  }

  String metatexts = "";
  {
    // "Совпадения в ..." = matches in the corresponding meta field
    String text = ann.getMeta_keywords() + "";
    TokenStream tokenStream = a.tokenStream("meta_keywords", new StringReader(text));
    metatexts +=
        cPmeta(
            "Совпадения в keywords",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

    text = ann.getMeta_description() + "";
    tokenStream = a.tokenStream("meta_description", new StringReader(text));
    metatexts +=
        cPmeta(
            "Совпадения в description",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

    text = ann.getMeta_subject() + "";
    tokenStream = a.tokenStream("meta_subject", new StringReader(text));
    metatexts +=
        cPmeta(
            "Совпадения в subject",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
  }

  SearchResultWH swh = new SearchResultWH(ann, "Announce", s, metatexts);
  return swh;
}
// LUCENE-3326
public void testMultiFields() throws Exception {
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[] {"text", "foobar"});
  mlt.like("foobar", new StringReader("this is a test"));
  analyzer.close();
}
public static void main(String[] args) throws IOException {
  Analyzer analyzer = new JavaSrcAnalyzer();
  TokenStream stream =
      analyzer.tokenStream("test", new StringReader("package java.util.ArrayList"));
  stream.reset(); // the TokenStream contract requires reset() before incrementToken()
  while (stream.incrementToken()) {
    String[] parts = stream.reflectAsString(false).split("#");
    for (String s : parts) {
      System.out.println(s);
    }
    System.out.println();
  }
  stream.end();
  stream.close();
  analyzer.close();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          return new TokenStreamComponents(tokenizer, new ReverseStringFilter(tokenizer));
        }
      };
  checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
  a.close();
}
public void testEmptyTerm() throws IOException {
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new KeywordTokenizer();
          return new TokenStreamComponents(tokenizer, new KStemFilter(tokenizer));
        }
      };
  checkOneTerm(a, "", "");
  a.close();
}
/**
 * Analyzes the given value using the given Analyzer.
 *
 * @param value Value to analyze
 * @param context The {@link AnalysisContext analysis context}.
 * @return NamedList containing the tokens produced by analyzing the given value
 */
protected NamedList<? extends Object> analyzeValue(String value, AnalysisContext context) {

  Analyzer analyzer = context.getAnalyzer();

  if (!TokenizerChain.class.isInstance(analyzer)) {

    TokenStream tokenStream = null;
    try {
      tokenStream = analyzer.reusableTokenStream(context.getFieldName(), new StringReader(value));
      tokenStream.reset();
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
    }
    NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
    namedList.add(
        tokenStream.getClass().getName(),
        convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
    return namedList;
  }

  TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
  CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
  TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
  TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();

  NamedList<Object> namedList = new NamedList<Object>();

  if (cfiltfacs != null) {
    String source = value;
    for (CharFilterFactory cfiltfac : cfiltfacs) {
      CharStream reader = CharReader.get(new StringReader(source));
      reader = cfiltfac.create(reader);
      source = writeCharStream(namedList, reader);
    }
  }

  TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value)));
  List<AttributeSource> tokens = analyzeTokenStream(tokenStream);

  namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));

  ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);

  for (TokenFilterFactory tokenFilterFactory : filtfacs) {
    tokenStream = tokenFilterFactory.create(listBasedTokenStream);
    List<AttributeSource> tokenList = analyzeTokenStream(tokenStream);
    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokenList, context));
    listBasedTokenStream = new ListBasedTokenStream(tokenList);
  }

  return namedList;
}
@Override
public int doLogic() throws Exception {
  try {
    Locale locale = getRunData().getLocale();
    if (locale == null) {
      throw new RuntimeException("Locale must be set with the NewLocale task!");
    }
    Analyzer analyzer = createAnalyzer(locale, impl);
    getRunData().setAnalyzer(analyzer);
    System.out.println(
        "Changed Analyzer to: " + analyzer.getClass().getName() + "(" + locale + ")");
  } catch (Exception e) {
    throw new RuntimeException("Error creating Analyzer: impl=" + impl, e);
  }
  return 1;
}
// Adds random graph after:
public void testRandomHugeStringsGraphAfter() throws Exception {
  Random random = random();
  Analyzer analyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
          TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
          return new TokenStreamComponents(tokenizer, tokenStream);
        }
      };
  checkRandomData(random, analyzer, 100 * RANDOM_MULTIPLIER, 8192);
  analyzer.close();
}
public void testTopN() throws Exception {
  int numDocs = 100;
  int topN = 25;

  // add series of docs with terms of decreasing df
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  for (int i = 0; i < numDocs; i++) {
    addDoc(writer, generateStrSeq(0, i + 1));
  }
  IndexReader reader = writer.getReader();
  writer.close();

  // setup MLT query
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMaxQueryTerms(topN);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[] {"text"});

  // perform MLT query
  String likeText = "";
  for (String text : generateStrSeq(0, numDocs)) {
    likeText += text + " ";
  }
  BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));

  // check best terms are topN of highest idf
  Collection<BooleanClause> clauses = query.clauses();
  assertEquals("Expected " + topN + " clauses only!", topN, clauses.size());

  Term[] expectedTerms = new Term[topN];
  int idx = 0;
  for (String text : generateStrSeq(numDocs - topN, topN)) {
    expectedTerms[idx++] = new Term("text", text);
  }
  for (BooleanClause clause : clauses) {
    Term term = ((TermQuery) clause.getQuery()).getTerm();
    assertTrue(Arrays.asList(expectedTerms).contains(term));
  }

  // clean up
  reader.close();
  dir.close();
  analyzer.close();
}
public String tokens(String field) {
  try {
    Field f = doc.getField(field);
    if (f == null) fail("No such field " + field);
    if (!f.isTokenized()) {
      String val = value(field);
      Token t = new Token(val, 0, val.length());
      return t.getPositionIncrement() + " [" + t.termText() + "]";
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts == null && f.stringValue() != null) ts = analyzer.tokenStream(field, f.stringValue());
    if (ts == null && f.readerValue() != null) ts = analyzer.tokenStream(field, f.readerValue());
    if (ts == null) fail("No token stream for field " + field);
    Token t = null;
    StringBuilder sb = new StringBuilder();
    while ((t = ts.next()) != null) {
      sb.append(t.getPositionIncrement() + " [" + t.termText() + "] ");
    }
    return sb.toString().trim();
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
    return null;
  }
}