@Test
public void simpleTest() throws IOException {
  Analyzer analyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer t = new WhitespaceTokenizer(Lucene.VERSION, reader);
          return new TokenStreamComponents(t, new TruncateTokenFilter(t, 3));
        }
      };

  TokenStream test = analyzer.tokenStream("test", "a bb ccc dddd eeeee");
  test.reset();
  CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("a"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("bb"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("ccc"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("ddd"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("eee"));

  assertThat(test.incrementToken(), equalTo(false));
}
public void refineDocument(Document doc) {
  TokenStream tokenStream =
      new StandardTokenizer(Version.LUCENE_36, new StringReader(doc.getContent()));
  tokenStream = new StopFilter(Version.LUCENE_36, tokenStream, stopWords);
  tokenStream = new PorterStemFilter(tokenStream);

  StringBuilder sb = new StringBuilder();
  CharTermAttribute charTermAttr = tokenStream.getAttribute(CharTermAttribute.class);
  List<String> words = new ArrayList<String>();
  Set<String> uniqueWords = new HashSet<String>();
  try {
    while (tokenStream.incrementToken()) {
      String word = charTermAttr.toString();
      // int wordVal = textToInt(charTermAttr.toString());
      words.add(word);
      uniqueWords.add(word);
      dictionary.add(word);
      if (sb.length() > 0) {
        sb.append(" ");
      }
      sb.append(charTermAttr.toString());
    }
  } catch (IOException e) {
    System.out.println(e.getMessage());
  }
  doc.setRefinedContent(sb.toString());
  doc.setWords(words);
  doc.setUniqueWords(uniqueWords);
}
@Test
public void testCase2() throws Exception {
  StringReader reader = new StringReader("고속도로");

  nouns.add(getToken("고속도로", 0, 4));
  nouns.add(getToken("고속도", 0, 3));
  nouns.add(getToken("고속", 0, 2));
  nouns.add(getToken("속도", 1, 3));
  nouns.add(getToken("고", 0, 1));

  Analyzer analyzer = new KoreanAnalyzer();
  TokenStream stream = analyzer.reusableTokenStream("dummy", reader);
  CharTermAttribute charTermAtt = stream.getAttribute(CharTermAttribute.class);
  OffsetAttribute offSetAtt = stream.getAttribute(OffsetAttribute.class);
  while (stream.incrementToken()) {
    TestToken t =
        getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
    System.out.println("termAtt.term() : " + charTermAtt.toString());
    System.out.println("offSetAtt : " + offSetAtt.startOffset());
    System.out.println("offSetAtt : " + offSetAtt.endOffset());
    Assert.assertTrue(nouns.contains(t));
  }
}
public static List<String> analyze(String content) {
  List<String> resultList = null;
  try {
    // Create the analyzer and seed the result list with the original content
    resultList = new ArrayList<String>(1);
    resultList.add(content);
    IKAnalyzer analyzer = new IKAnalyzer(true);
    analyzer.setUseSmart(true);
    StringReader reader = new StringReader(content);
    // Tokenize
    TokenStream tokenStream = analyzer.tokenStream("", reader);
    CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
    // Iterate over the tokens
    while (tokenStream.incrementToken()) {
      if (!term.toString().isEmpty()) {
        resultList.add(term.toString());
      }
    }
    reader.close();
  } catch (IOException ex) {
    logger.error("Tokenization failed", ex);
  }
  return resultList;
}
public void testConfigureCamelCaseTokenFilter() throws IOException {
  Settings settings =
      Settings.builder()
          .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
          .build();
  Settings indexSettings =
      Settings.builder()
          .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
          .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
          .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
          .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
          .putArray(
              "index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
          .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
          .putArray(
              "index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
          .build();
  IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
  IndexAnalyzers indexAnalyzers =
      new AnalysisModule(new Environment(settings), emptyList())
          .getAnalysisRegistry()
          .build(idxSettings);

  try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
    assertNotNull(custom_analyser);
    TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> token = new ArrayList<>();
    while (tokenStream.incrementToken()) {
      token.add(charTermAttribute.toString());
    }
    assertEquals(token.toString(), 2, token.size());
    assertEquals("j2se", token.get(0));
    assertEquals("j2ee", token.get(1));
  }

  try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
    assertNotNull(custom_analyser);
    TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> token = new ArrayList<>();
    while (tokenStream.incrementToken()) {
      token.add(charTermAttribute.toString());
    }
    assertEquals(token.toString(), 6, token.size());
    assertEquals("j", token.get(0));
    assertEquals("2", token.get(1));
    assertEquals("se", token.get(2));
    assertEquals("j", token.get(3));
    assertEquals("2", token.get(4));
    assertEquals("ee", token.get(5));
  }
}
public static void displayTokensWithFullDetails(Analyzer analyzer, String text)
    throws IOException {
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
  TypeAttribute type = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

  int position = 0;
  while (stream.incrementToken()) {
    int increment = posIncr.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
      System.out.println();
      System.out.print(position + ":");
    }

    BytesRef pl = payload.getPayload();
    if (pl != null) {
      System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->"
          + offset.endOffset() + ":" + type.type() + ":" + new String(pl.bytes) + "] ");
    } else {
      System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->"
          + offset.endOffset() + ":" + type.type() + "] ");
    }
  }
  System.out.println();
}
@Test
public void test() throws IOException {
  TokenStream input = new WhitespaceTokenizer(new StringReader("abcde"));
  EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3);
  CharTermAttribute termAtt = tokenizer.getAttribute(CharTermAttribute.class);
  tokenizer.reset();

  Assert.assertTrue(tokenizer.incrementToken());
  Assert.assertEquals("a", termAtt.toString());
  Assert.assertTrue(tokenizer.incrementToken());
  Assert.assertEquals("ab", termAtt.toString());
  Assert.assertTrue(tokenizer.incrementToken());
  Assert.assertEquals("abc", termAtt.toString());
  Assert.assertFalse(tokenizer.incrementToken());
  tokenizer.close();
}
public boolean incrementToken() throws IOException {
  if (!morphQueue.isEmpty()) {
    restoreState(currentState);
    setAttributesFromQueue(false);
    return true;
  }

  while (input.incrementToken()) {
    final String type = typeAtt.type();
    if (KOREAN_TYPE.equals(type)) {
      try {
        analysisKorean(termAtt.toString());
      } catch (MorphException e) {
        throw new RuntimeException(e);
      }
    } else {
      return true; // pass anything else thru
    }

    if (!morphQueue.isEmpty()) {
      setAttributesFromQueue(true);
      return true;
    }
  }

  return false;
}
public Map<String, Double> search(String text) {
  Map<String, Double> similar = new HashMap<String, Double>();
  try {
    TokenStream tokenStream = analyzer.tokenStream("text", text);
    CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();

    BooleanQuery bQuery = new BooleanQuery();
    while (tokenStream.incrementToken()) {
      String token = charTermAtt.toString();
      TermQuery tq = new TermQuery(new Term("text", token));
      tq.setBoost(2f);
      bQuery.add(tq, Occur.MUST);
    }
    tokenStream.close();

    TopDocs results = searcher.search(bQuery, 100000);
    ScoreDoc[] hits = results.scoreDocs;
    for (ScoreDoc hit : hits) {
      Document doc = searcher.doc(hit.doc);
      similar.put(doc.get("id"), new Double(hit.score));
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return similar;
}
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
    throws IOException {
  if (analyzer == null) {
    throw new UnsupportedOperationException(
        "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
  }
  try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
    int tokenCount = 0;
    // for every token
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      tokenCount++;
      if (tokenCount > maxNumTokensParsed) {
        break;
      }
      if (isNoiseWord(word)) {
        continue;
      }
      // increment frequency
      Int cnt = termFreqMap.get(word);
      if (cnt == null) {
        termFreqMap.put(word, new Int());
      } else {
        cnt.x++;
      }
    }
    ts.end();
  }
}
@Override
protected boolean accept() {
  // return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
  // System.out.format("%s=%s", termAtt.toString(), bloom.contains(termAtt.toString()));
  return bloom.contains(termAtt.toString());
  // return false;
}
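// Context sketch (an assumption, not part of the original source): the accept() override above
// would typically live in a FilteringTokenFilter subclass, roughly as below. "BloomFilter" is a
// hypothetical membership structure exposing contains(String); substitute whatever the real
// project uses. On Lucene 4.x the FilteringTokenFilter constructor also takes a Version argument.
public final class BloomTermFilter extends FilteringTokenFilter {

  private final BloomFilter bloom; // hypothetical set-membership helper
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public BloomTermFilter(TokenStream input, BloomFilter bloom) {
    super(input);
    this.bloom = bloom;
  }

  @Override
  protected boolean accept() {
    // keep only tokens whose term text is (probably) present in the bloom filter
    return bloom.contains(termAtt.toString());
  }
}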
public List<Document> searchDocuments(String text) {
  List<Document> documents = new ArrayList<Document>();
  try {
    TokenStream tokenStream = analyzer.tokenStream("text", text);
    CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();

    BooleanQuery bQuery = new BooleanQuery();
    while (tokenStream.incrementToken()) {
      String token = charTermAtt.toString();
      TermQuery tq = new TermQuery(new Term("text", token));
      tq.setBoost(2f);
      bQuery.add(tq, Occur.MUST);
    }
    tokenStream.close();

    TopDocs results = searcher.search(bQuery, 100000);
    ScoreDoc[] hits = results.scoreDocs;
    for (ScoreDoc hit : hits) {
      Document doc = searcher.doc(hit.doc);
      doc.add(new FloatField("score", hit.score, FloatField.TYPE_STORED));
      documents.add(doc);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return documents;
}
public static void displayTokens(TokenStream stream) throws IOException {
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  while (stream.incrementToken()) {
    System.out.println("[" + term.toString() + "] ");
  }
}
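// Usage sketch (an assumption, not part of the original source): displayTokens(TokenStream)
// above neither resets nor closes the stream, so on Lucene 4.x+ a caller would do that itself.
public static void displayTokens(Analyzer analyzer, String text) throws IOException {
  try (TokenStream stream = analyzer.tokenStream("contents", new StringReader(text))) {
    stream.reset(); // required before the first incrementToken() on modern Lucene
    displayTokens(stream); // prints each term in brackets
    stream.end();
  }
}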
@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    if (input.incrementToken()) {
      tokStart = offsetAtt.startOffset();
      tokEnd = offsetAtt.endOffset();
      hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
      tokenBuffer = wordSegmenter.getTokendWords(termAtt.toString());
      tokenIter = tokenBuffer.iterator();
      if (!tokenIter.hasNext()) return false;
    } else {
      return false;
    }
  }
  clearAttributes();
  TokendWords nextWord = tokenIter.next();
  // read the term buffer once so the copied characters and the length refer to the same word
  char[] nextTerm = nextWord.next();
  termAtt.copyBuffer(nextTerm, 0, nextTerm.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(nextWord.start, nextWord.end);
  }
  typeAtt.setType("word");
  return true;
}
public TokenIndex getTokenIndex(String str) {
  TokenIndex ret = new TokenIndex();
  try {
    Tokenizer tokenizer =
        new JapaneseTokenizer(
            new StringReader(str),
            null,
            true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    // stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    // stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    // stream = new LowerCaseFilter(matchVersion, stream);

    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

    while (stream.incrementToken()) {
      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String token = charTermAttribute.toString();
      ret.add(startOffset, endOffset);
      // System.out.println(token.str + " \t\tS=" + token.start + " E=" + token.end);
    }
  } catch (java.io.IOException e) {
    System.err.println(e);
  }
  return ret;
}
@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    // there are no remaining tokens from the current sentence... are there more sentences?
    if (input.incrementToken()) {
      tokStart = offsetAtt.startOffset();
      tokEnd = offsetAtt.endOffset();
      // if length by start + end offsets doesn't match the term text then assume
      // this is a synonym and don't adjust the offsets.
      hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
      // a new sentence is available: process it.
      tokenBuffer = splitIntoTokens(termAtt.toString(), offsetAtt.startOffset());
      tokenIter = tokenBuffer.iterator();
      // it should not be possible to have a sentence with 0 words, check just in case.
      // returning EOS isn't the best either, but it's the behavior of the original code.
      if (!tokenIter.hasNext()) return false;
    } else {
      return false; // no more sentences, end of stream!
    }
  }
  // WordTokenFilter must clear attributes, as it is creating new tokens.
  clearAttributes();
  // There are remaining tokens from the current sentence, return the next one.
  SegToken nextWord = tokenIter.next();
  termAtt.append(nextWord.term);
  // termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(nextWord.start, nextWord.end);
  }
  typeAtt.setType("word");
  return true;
}
/**
 * Uses the solr.ASCIIFoldingFilter to convert a string to its ASCII equivalent. See solr
 * documentation for full details.
 *
 * <p>When doing the conversion, this method mirrors GBIF's registry-solr schema configuration
 * for the fieldType "text_auto_ngram". For example, it uses the KeywordTokenizer that treats
 * the entire string as a single token, regardless of its content. See the solr documentation
 * for more details.
 *
 * <p>This method is needed when checking if the query string matches the dataset title. For
 * example, if the query string is "straße", it won't match the dataset title "Schulhof
 * Gymnasium Hürth Bonnstrasse" unless "straße" gets converted to its ASCII equivalent
 * "strasse".
 *
 * @param q query string
 * @return query string converted to ASCII equivalent
 * @see org.gbif.portal.action.dataset.SearchAction#addMissingHighlighting(String, String)
 * @see org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
 * @see org.apache.lucene.analysis.core.KeywordTokenizer
 */
protected static String foldToAscii(String q) {
  if (!Strings.isNullOrEmpty(q)) {
    ASCIIFoldingFilter filter = null;
    try {
      StringReader reader = new StringReader(q);
      TokenStream stream = new KeywordTokenizer(reader);
      filter = new ASCIIFoldingFilter(stream);
      CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
      filter.reset();
      filter.incrementToken();
      // return q converted to its ASCII equivalent
      return termAtt.toString();
    } catch (IOException e) {
      // swallow
    } finally {
      if (filter != null) {
        try {
          filter.end();
          filter.close();
        } catch (IOException e) {
          // swallow
        }
      }
    }
  }
  return q;
}
private String getAnalyzerResult(String suggestion) {
  TokenStream ts = null;
  try {
    Reader reader = new StringReader(suggestion);
    ts = this.suggestionAnalyzer.tokenStream("", reader);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      if (word != null && word.length() > 0) {
        return word;
      }
    }
  } catch (Exception ex) {
    // use %s placeholders: String.format does not understand MessageFormat-style {0} tokens
    if (this.field != null) {
      LOG.error(
          String.format(
              "Error executing analyzer for field: %s in DiceSuggester on suggestion: %s",
              this.field, suggestion),
          ex);
    } else if (this.fieldTypeName != null) {
      LOG.error(
          String.format(
              "Error executing analyzer for field type: %s in DiceSuggester on suggestion: %s",
              this.fieldTypeName, suggestion),
          ex);
    }
  } finally {
    if (ts != null) {
      IOUtils.closeWhileHandlingException(ts);
    }
  }
  return null;
}
public void processInput() {
  ClassLoader classLoader = getClass().getClassLoader();
  File englishStopWords =
      new File(classLoader.getResource("DEFAULT_ENGLISH_STOP_WORDS").getFile());
  processedInput = new ArrayList<String>();
  lemmatizedInput = new ArrayList<String>();

  // tokenization and stop word removal
  initStopWordList(englishStopWords);
  CharArraySet stopwords = new CharArraySet(stopWordPool, true);
  StandardAnalyzer analyzer = new StandardAnalyzer(stopwords);
  TokenStream stream;
  try {
    stream = analyzer.tokenStream(null, new StringReader(input));
    CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      if (!processedInput.contains(cattr.toString())) processedInput.add(cattr.toString());
    }
    stream.end();
    stream.close();
    // System.out.println("In input processing " + " " + processedInput);
    setProcessedInput(processedInput);

    // for lemmatization, concatenate the input strings and send them to the Stanford
    // NLP processor
    for (int i = 0; i < processedInput.size(); i++) {
      lemmatizedInput.addAll(new StanfordLemmatizer().lemmatize(processedInput.get(i)));
    }
    // System.out.println("In input processing " + " " + lemmatizedInput);
    setLemmatizedInput(lemmatizedInput);
  } catch (IOException e) {
    e.printStackTrace();
  }
}
public void testPerField() throws Exception {
  String text = "Qwerty";
  PerFieldAnalyzerWrapper analyzer =
      new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
  analyzer.addAnalyzer("special", new SimpleAnalyzer(TEST_VERSION_CURRENT));

  TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
  CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
  assertTrue(tokenStream.incrementToken());
  assertEquals("WhitespaceAnalyzer does not lowercase", "Qwerty", termAtt.toString());

  tokenStream = analyzer.tokenStream("special", new StringReader(text));
  termAtt = tokenStream.getAttribute(CharTermAttribute.class);
  assertTrue(tokenStream.incrementToken());
  assertEquals("SimpleAnalyzer lowercases", "qwerty", termAtt.toString());
}
private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
  if (Resources.debug) {
    Resources.LOGGER.debug("TokenParser:" + input);
    Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
  }
  TokenStream tokenStream = analyzer.tokenStream("input", input);
  OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute positionIncrementAttribute =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
  tokenStream.reset();

  int position = 0;
  List<TermInfo> infos = new ArrayList<TermInfo>();
  while (tokenStream.incrementToken()) {
    int increment = positionIncrementAttribute.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
      if (Resources.debug) {
        Resources.LOGGER.debug(position + ":");
      }
    }
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
    TermInfo info = new TermInfo();
    info.setStart(startOffset);
    info.setEnd(endOffset);
    infos.add(info);
    if (Resources.debug) {
      Resources.LOGGER.debug(
          "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):"
              + typeAttribute.type());
    }
  }
  tokenStream.end();
  tokenStream.close();

  Stack<TermInfo> tiStack = groupTokenInfos(infos);
  List<String> terms = new ArrayList<String>();
  while (!tiStack.isEmpty()) {
    TermInfo termInfo = tiStack.pop();
    if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
      String term = input.substring(termInfo.getStart(), termInfo.getEnd());
      terms.add(term);
    }
  }
  return terms.toArray(new String[] {});
}
/* (non-Javadoc)
 * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
 */
@Override
public Query getQuery(Element e) throws ParserException {
  String fieldsList = e.getAttribute("fieldNames"); // a comma-delimited list of fields
  String fields[] = defaultFieldNames;
  if ((fieldsList != null) && (fieldsList.trim().length() > 0)) {
    fields = fieldsList.trim().split(",");
    // trim the fieldnames
    for (int i = 0; i < fields.length; i++) {
      fields[i] = fields[i].trim();
    }
  }

  // Parse any "stopWords" attribute
  // TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
  // I use all analyzers/fields to generate multi-field compatible stop list
  String stopWords = e.getAttribute("stopWords");
  Set<String> stopWordsSet = null;
  if ((stopWords != null) && (fields != null)) {
    stopWordsSet = new HashSet<String>();
    for (String field : fields) {
      try (TokenStream ts = analyzer.tokenStream(field, stopWords)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          stopWordsSet.add(termAtt.toString());
        }
        ts.end();
        ts.close();
      } catch (IOException ioe) {
        throw new ParserException(
            "IoException parsing stop words list in "
                + getClass().getName()
                + ":"
                + ioe.getLocalizedMessage());
      }
    }
  }

  MoreLikeThisQuery mlt =
      new MoreLikeThisQuery(DOMUtils.getText(e), fields, analyzer, fields[0]);
  mlt.setMaxQueryTerms(DOMUtils.getAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS));
  mlt.setMinTermFrequency(
      DOMUtils.getAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY));
  mlt.setPercentTermsToMatch(
      DOMUtils.getAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100);
  mlt.setStopWords(stopWordsSet);

  int minDocFreq = DOMUtils.getAttribute(e, "minDocFreq", -1);
  if (minDocFreq >= 0) {
    mlt.setMinDocFreq(minDocFreq);
  }

  mlt.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
  return mlt;
}
private List<String> filter(TokenFilter filter) throws IOException {
  List<String> tas = new ArrayList<>();
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  filter.reset();
  while (filter.incrementToken()) {
    tas.add(termAtt.toString());
  }
  filter.end();
  filter.close();
  return tas;
}
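// Usage sketch (an assumption, not from the original source; presumes Lucene 5.x+ where
// WhitespaceTokenizer and LowerCaseFilter take no Version argument). The filter(...) helper
// above resets, drains, ends and closes the stream itself, so the caller only builds the chain.
private List<String> lowercaseTokens(String text) throws IOException {
  Tokenizer source = new WhitespaceTokenizer();
  source.setReader(new StringReader(text));
  return filter(new LowerCaseFilter(source)); // e.g. "Foo BAR" -> [foo, bar]
}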
@Override
public boolean incrementToken() throws IOException {
  if (inPhrase) {
    inPhrase = false;
    clearAttributes();
    termAtt.setEmpty().append("phrase2");
    offsetAtt.setOffset(savedStart, savedEnd);
    return true;
  } else {
    while (input.incrementToken()) {
      if (termAtt.toString().equals("phrase")) {
        inPhrase = true;
        savedStart = offsetAtt.startOffset();
        savedEnd = offsetAtt.endOffset();
        termAtt.setEmpty().append("phrase1");
        offsetAtt.setOffset(savedStart, savedEnd);
        return true;
      } else if (!termAtt.toString().equals("stop")) {
        return true;
      }
    }
  }
  return false;
}
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output)
    throws Exception {
  TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
  stream.reset();
  CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
  for (String expected : output) {
    Assert.assertTrue(stream.incrementToken());
    Assert.assertEquals(expected, termAttr.toString());
  }
  Assert.assertFalse(stream.incrementToken());
  stream.close();
}
protected Term getAnalyzedTerm(TokenType tokenType, String termString) throws IOException {
  Term term = getTerm(termString, tokenType); // first ensure that we've stripped any prefixes
  TokenStream tokenStream = analyzer.tokenStream(term.field(), new StringReader(term.text()));
  tokenStream.reset();
  CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
  StringBuilder sb = new StringBuilder();
  while (tokenStream.incrementToken()) {
    sb.append(termAtt.toString());
  }
  tokenStream.end();
  tokenStream.close();
  return new Term(term.field(), sb.toString());
}
/**
 * Convert tokenStream object into a string.
 *
 * @param tokenStream object returned by Lucene tokenizer
 * @return String corresponding to the tokens output by tokenStream
 */
protected static String streamToString(TokenStream tokenStream) {
  CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
  tokenStream.clearAttributes();
  StringBuilder tokenized = new StringBuilder();
  try {
    while (tokenStream.incrementToken()) {
      tokenized.append(termAtt.toString() + " ");
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return tokenized.toString().trim();
}
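// Usage sketch (an assumption, not from the original source): streamToString(...) above does
// not reset or close the stream, so the caller prepares and cleans it up.
protected static String analyzeToString(Analyzer analyzer, String text) throws IOException {
  try (TokenStream stream = analyzer.tokenStream("contents", text)) {
    stream.reset();
    String tokenized = streamToString(stream); // e.g. "Hello, World!" -> "hello world"
    stream.end();
    return tokenized;
  }
}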
public static void assertSimpleTSOutput(TokenStream stream, String[] expected)
    throws IOException {
  stream.reset();
  CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
  assertThat(termAttr, notNullValue());
  int i = 0;
  while (stream.incrementToken()) {
    assertThat(expected.length, greaterThan(i));
    assertThat(
        "expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
  }
  assertThat("not all tokens produced", i, equalTo(expected.length));
}
/**
 * Tokenizes the given text and rejoins the tokens with single spaces.
 *
 * @param text the text to analyze
 * @return the analyzed tokens separated by spaces
 * @throws IOException if the token stream cannot be read
 */
private String tokenize(String text) throws IOException {
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  StringBuilder stBld = new StringBuilder();

  CharTermAttribute termAttribute =
      (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
  while (stream.incrementToken()) {
    String token = termAttribute.toString();
    stBld.append(token).append(" ");
  }
  return stBld.toString();
}
public void analyze(String text) throws IOException {
  List<String> searchlst = new ArrayList<String>();
  proposalController.getProposalList().clear();
  String query = "";
  System.out.println("Analyzing \"" + text + "\"");

  Analyzer analyzer = new RussianAnalyzer(Version.LUCENE_31);
  System.out.println("\t" + analyzer.getClass().getName() + ":");
  System.out.print("\t\t");
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  while (true) {
    if (!stream.incrementToken()) break;
    AttributeSource token = stream.cloneAttributes();
    CharTermAttribute term = (CharTermAttribute) token.addAttribute(CharTermAttribute.class);
    System.out.print("[" + term.toString() + "] "); // 2
    searchlst.add(term.toString());
  }

  int i = 0;
  for (String param : searchlst) {
    if (i < searchlst.size() - 1) {
      query += param + " AND ";
    } else {
      query += param;
    }
    i++;
  }
  _log.info("Search query: " + query);
  startSearch(query);
  System.out.println("\n");
}