/** blast some random strings through the analyzer */
@Test
public void testRandomStrings() throws Exception {
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer =
              new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
          return new TokenStreamComponents(tokenizer);
        }
      };
  checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
  a.close();

  Analyzer b =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer =
              new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
          return new TokenStreamComponents(tokenizer);
        }
      };
  checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
  b.close();
}
public void test() throws Exception {
  final CharArraySet cas = new CharArraySet(3, false);
  cas.add("jjp");
  cas.add("wlmwoknt");
  cas.add("tcgyreo");
  final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("mtqlpi", "");
  builder.add("mwoknt", "jjp");
  builder.add("tcgyreo", "zpfpajyws");
  final NormalizeCharMap map = builder.build();
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
          TokenFilter f = new CommonGramsFilter(t, cas);
          return new TokenStreamComponents(t, f);
        }

        @Override
        protected Reader initReader(String fieldName, Reader reader) {
          reader = new MockCharFilter(reader, 0);
          reader = new MappingCharFilter(map, reader);
          reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
          return reader;
        }
      };
  checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
  a.close();
}
// LUCENE-5725
public void testMultiValues() throws Exception {
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[] {"text"});

  BooleanQuery query =
      (BooleanQuery)
          mlt.like(
              "text",
              new StringReader("lucene"),
              new StringReader("lucene release"),
              new StringReader("apache"),
              new StringReader("apache lucene"));
  Collection<BooleanClause> clauses = query.clauses();
  assertEquals("Expected 2 clauses only!", 2, clauses.size());
  for (BooleanClause clause : clauses) {
    Term term = ((TermQuery) clause.getQuery()).getTerm();
    assertTrue(
        Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term));
  }
  analyzer.close();
}
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet(asSet("llengües"), false);
  Analyzer a = new CatalanAnalyzer(CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "llengües", "llengües");
  checkOneTerm(a, "llengua", "llengu");
  a.close();
}
@Override
public void close() {
  if (stopped.compareAndSet(false, true)) { // make sure we only stop once
    try {
      worker.close();
    } catch (Exception e) {
      log.workerException(e);
    }
    this.allIndexesManager.stop();
    this.timingSource.stop();
    serviceManager.releaseAllServices();
    for (Analyzer an : this.analyzers.values()) {
      an.close();
    }
    for (AbstractDocumentBuilder documentBuilder : this.documentBuildersContainedEntities.values()) {
      documentBuilder.close();
    }
    for (EntityIndexBinding entityIndexBinding : this.indexBindingForEntities.values()) {
      entityIndexBinding.getDocumentBuilder().close();
    }
    // unregister statistic mbean
    if (statisticsMBeanName != null) {
      JMXRegistrar.unRegisterMBean(statisticsMBeanName);
    }
  }
}
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet(asSet("chicano"), false);
  Analyzer a = new SpanishAnalyzer(SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "chicana", "chican");
  checkOneTerm(a, "chicano", "chicano");
  a.close();
}
public void testCuriousWikipediaString() throws Exception {
  final CharArraySet protWords =
      new CharArraySet(
          new HashSet<>(
              Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")),
          false);
  final byte table[] =
      new byte[] {
        -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63, 5,
        28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106, -22, -51,
        65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71, -98, 44, 33,
        86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104, -24, 106, -16,
        126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64, -119, 0, 92, 94,
        -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20
      };
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new WikipediaTokenizer();
          TokenStream stream = new SopTokenFilter(tokenizer);
          stream = new WordDelimiterFilter(stream, table, -50, protWords);
          stream = new SopTokenFilter(stream);
          return new TokenStreamComponents(tokenizer, stream);
        }
      };
  checkAnalysisConsistency(random(), a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
  a.close();
}
public void testWithStemExclusionSet() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("پیاوە");
  Analyzer a = new SoraniAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "پیاوە", new String[] {"پیاوە"});
  a.close();
}
/** test use of elisionfilter */
public void testContractions() throws IOException {
  Analyzer a = new CatalanAnalyzer();
  assertAnalyzesTo(
      a,
      "Diccionari de l'Institut d'Estudis Catalans",
      new String[] {"diccion", "inst", "estud", "catalan"});
  a.close();
}
public void cleanText(String... inboundTexts) {
  try {
    final List<String> fields = Lists.newArrayList();
    for (String raw : inboundTexts) {
      // Tidy t = new Tidy();
      // t.setErrout(new PrintWriter(new ByteArrayOutputStream()));
      // StringWriter out = new StringWriter();
      // t.parse(new StringReader(raw), out);
      // String tidied = out.getBuffer().toString();
      // logger.debug("{}",tidied);
      // AutoDetectParser p = new AutoDetectParser();
      // p.parse(new ByteArrayInputStream(raw.getBytes()),
      //     new TextContentHandler(new DefaultHandler() {
      //       @Override
      //       public void characters(char[] ch, int start, int length) throws SAXException {
      //         CharBuffer buf = CharBuffer.wrap(ch, start, length);
      //         String s = buf.toString();
      //         logger.debug("{}",s);
      //         fields.add(s);
      //       }
      //     }), new Metadata());
    }
    Analyzer analyzer = new StandardAnalyzer();
    // String joinedFields = Joiner.on(" ").join(fields).replaceAll("\\s+", " ");
    String joinedFields = Joiner.on(" ").join(inboundTexts).replaceAll("\\s+", " ");
    logger.debug("{}", joinedFields);
    StringReader in = new StringReader(joinedFields);
    TokenStream ts = analyzer.tokenStream("content", in);
    ts = new LowerCaseFilter(ts);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the outermost stream once the full chain is assembled
    List<String> words = Lists.newArrayList();
    while (ts.incrementToken()) {
      char[] termBuffer = termAtt.buffer();
      int termLen = termAtt.length();
      String w = new String(termBuffer, 0, termLen);
      words.add(w);
    }
    ts.end();
    ts.close();
    analyzer.close();
    scrubbedWords = new ArrayList<String>();
    for (String word : words) {
      if (word.length() >= MINWORDLEN && !stopwords.contains(word)) {
        scrubbedWords.add(word);
      } else {
        logger.debug("Ignoring word: {}", word);
      }
    }
    // this.scrubbedWords = words;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
/** test stopwords and stemming */
public void testBasics() throws IOException {
  Analyzer a = new CatalanAnalyzer();
  // stemming
  checkOneTerm(a, "llengües", "llengu");
  checkOneTerm(a, "llengua", "llengu");
  // stopword
  assertAnalyzesTo(a, "un", new String[] {});
  a.close();
}
@Test
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-3869")
public void testRandomStrings() throws Exception {
  Analyzer analyzer =
      new UIMABaseAnalyzer(
          "/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", null);
  checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER);
  analyzer.close();
}
/** test stopwords and stemming */
public void testBasics() throws IOException {
  Analyzer a = new SpanishAnalyzer();
  // stemming
  checkOneTerm(a, "chicana", "chican");
  checkOneTerm(a, "chicano", "chican");
  // stopword
  assertAnalyzesTo(a, "los", new String[] {});
  a.close();
}
@AfterClass
public static void afterClass() throws Exception {
  reader.close();
  directory.close();
  analyzer.close();
  reader = null;
  directory = null;
  analyzer = null;
  s1 = s2 = null;
}
public static void main(String[] args) {
  server = new HttpSolrServer(DEFAULT_URL);
  Random random = new Random(100);
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
  readFileByLines(filename, analyzer, random);
  analyzer.close();
  server = null;
  System.runFinalization();
  System.gc();
}
// LUCENE-3326
public void testMultiFields() throws Exception {
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[] {"text", "foobar"});
  mlt.like("foobar", new StringReader("this is a test"));
  analyzer.close();
}
@Test
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-3869")
public void testRandomStringsWithConfigurationParameters() throws Exception {
  Map<String, Object> cp = new HashMap<>();
  cp.put("line-end", "\r");
  Analyzer analyzer =
      new UIMABaseAnalyzer(
          "/uima/TestWSTokenizerAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", cp);
  checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER);
  analyzer.close();
}
public void testEmptyTerm() throws IOException {
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new KeywordTokenizer();
          return new TokenStreamComponents(tokenizer, new KStemFilter(tokenizer));
        }
      };
  checkOneTerm(a, "", "");
  a.close();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          return new TokenStreamComponents(tokenizer, new ReverseStringFilter(tokenizer));
        }
      };
  checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
  a.close();
}
// Adds random graph after:
public void testRandomHugeStringsGraphAfter() throws Exception {
  Random random = random();
  Analyzer analyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
          TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
          return new TokenStreamComponents(tokenizer, tokenStream);
        }
      };
  checkRandomData(random, analyzer, 100 * RANDOM_MULTIPLIER, 8192);
  analyzer.close();
}
public void testTopN() throws Exception {
  int numDocs = 100;
  int topN = 25;

  // add series of docs with terms of decreasing df
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  for (int i = 0; i < numDocs; i++) {
    addDoc(writer, generateStrSeq(0, i + 1));
  }
  IndexReader reader = writer.getReader();
  writer.close();

  // setup MLT query
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMaxQueryTerms(topN);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[] {"text"});

  // perform MLT query
  String likeText = "";
  for (String text : generateStrSeq(0, numDocs)) {
    likeText += text + " ";
  }
  BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));

  // check best terms are topN of highest idf
  Collection<BooleanClause> clauses = query.clauses();
  assertEquals("Expected " + topN + " clauses only!", topN, clauses.size());

  Term[] expectedTerms = new Term[topN];
  int idx = 0;
  for (String text : generateStrSeq(numDocs - topN, topN)) {
    expectedTerms[idx++] = new Term("text", text);
  }
  for (BooleanClause clause : clauses) {
    Term term = ((TermQuery) clause.getQuery()).getTerm();
    assertTrue(Arrays.asList(expectedTerms).contains(term));
  }

  // clean up
  reader.close();
  dir.close();
  analyzer.close();
}
/**
 * Saves input documents in a format that can be read by the LDA model
 *
 * @param hits
 * @throws IOException
 */
public void saveDocumentsToFile(ScoreDoc[] hits) throws IOException {
  String index = "Index_TREC";
  String field = "contents";
  IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
  IndexSearcher searcher = new IndexSearcher(reader);
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_48, IndexerTREC.stopWordsSet);

  List<String> stopwords = new ArrayList<String>();
  for (int i = 0; i < IndexerTREC.STOP_WORDS.length; i++) {
    stopwords.add(IndexerTREC.STOP_WORDS[i]);
  }

  String docFileName = "model/newdocs.dat";
  File file = new File(docFileName);
  FileWriter fileWriterDocs = new FileWriter(file.getAbsoluteFile(), false);
  BufferedWriter bufferedWriterDocs = new BufferedWriter(fileWriterDocs);
  PrintWriter out = new PrintWriter(bufferedWriterDocs);

  Document doc;
  for (int i = 0; i < hits.length; i++) {
    String docContent = "";
    doc = searcher.doc(hits[i].doc);
    String content = doc.get("contents").toLowerCase();
    TokenStream tokenStream = analyzer.tokenStream("contents", doc.get("contents"));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String term = charTermAttribute.toString();
      if (!stopwords.contains(term.toLowerCase())) {
        docContent = docContent + " " + term;
      }
    }
    tokenStream.end();
    tokenStream.close();
    bufferedWriterDocs.write(docContent + "\n");
  }
  analyzer.close();
  bufferedWriterDocs.close();

  RandomAccessFile f = new RandomAccessFile(new File(docFileName), "rw");
  f.seek(0); // to the beginning
  String documentCount = Integer.toString(hits.length) + "\n";
  f.write(documentCount.getBytes());
  f.close();
}
// LUCENE-5269
@Slow
public void testUnicodeShinglesAndNgrams() throws Exception {
  Analyzer analyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new EdgeNGramTokenizer(2, 94);
          // TokenStream stream = new SopTokenFilter(tokenizer);
          TokenStream stream = new ShingleFilter(tokenizer, 5);
          // stream = new SopTokenFilter(stream);
          stream = new NGramTokenFilter(stream, 55, 83);
          // stream = new SopTokenFilter(stream);
          return new TokenStreamComponents(tokenizer, stream);
        }
      };
  checkRandomData(random(), analyzer, 2000);
  analyzer.close();
}
public void testBoostFactor() throws Throwable {
  Map<String, Float> originalValues = getOriginalValues();
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[] {"text"});
  mlt.setBoost(true);

  // this means that every term's boost factor will be multiplied by this number
  float boostFactor = 5;
  mlt.setBoostFactor(boostFactor);

  BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene release"));
  Collection<BooleanClause> clauses = query.clauses();

  assertEquals(
      "Expected " + originalValues.size() + " clauses.", originalValues.size(), clauses.size());

  for (BooleanClause clause : clauses) {
    BoostQuery bq = (BoostQuery) clause.getQuery();
    TermQuery tq = (TermQuery) bq.getQuery();
    Float termBoost = originalValues.get(tq.getTerm().text());
    assertNotNull("Expected term " + tq.getTerm().text(), termBoost);

    float totalBoost = termBoost * boostFactor;
    assertEquals(
        "Expected boost of "
            + totalBoost
            + " for term '"
            + tq.getTerm().text()
            + "' got "
            + bq.getBoost(),
        totalBoost,
        bq.getBoost(),
        0.0001);
  }
  analyzer.close();
}
private Map<String, Float> getOriginalValues() throws IOException {
  Map<String, Float> originalValues = new HashMap<>();
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[] {"text"});
  mlt.setBoost(true);
  BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene release"));
  Collection<BooleanClause> clauses = query.clauses();
  for (BooleanClause clause : clauses) {
    BoostQuery bq = (BoostQuery) clause.getQuery();
    TermQuery tq = (TermQuery) bq.getQuery();
    originalValues.put(tq.getTerm().text(), bq.getBoost());
  }
  analyzer.close();
  return originalValues;
}
public void testIgnoreCaseNoSideEffects() throws Exception {
  final Dictionary d;
  // no multiple try-with to workaround bogus VerifyError
  InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff");
  InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic");
  try {
    d = new Dictionary(affixStream, Collections.singletonList(dictStream), true);
  } finally {
    IOUtils.closeWhileHandlingException(affixStream, dictStream);
  }
  Analyzer a =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new KeywordTokenizer();
          return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, d));
        }
      };
  checkOneTerm(a, "NoChAnGy", "NoChAnGy");
  a.close();
}
public void testReusableTokenStream() throws Exception {
  Analyzer a = new GreekAnalyzer();
  // Verify the correct analysis of capitals and small accented letters, and stemming
  assertAnalyzesTo(
      a,
      "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
      new String[] {"μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ", "ελληνικ", "γλωσσ"});
  // Verify the correct analysis of small letters with diaeresis and the elimination
  // of punctuation marks
  assertAnalyzesTo(
      a,
      "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
      new String[] {"προιοντ", "πολλαπλ", "αναγκ"});
  // Verify the correct analysis of capital accented letters and capital letters with diaeresis,
  // as well as the elimination of stop words
  assertAnalyzesTo(
      a,
      "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
      new String[] {"προυποθεσ", "αψογ", "μεστ", "αλλ"});
  a.close();
}
@Override
public void tearDown() throws Exception {
  analyzer.close();
  super.tearDown();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer a = new GreekAnalyzer();
  checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
  a.close();
}
@Override
public void close() {
  fakeAnalyzer.close();
  super.close();
}