@Test
public void simpleTest() throws IOException {
  Analyzer analyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer t = new WhitespaceTokenizer(Lucene.VERSION, reader);
          return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
      };

  TokenStream test = analyzer.tokenStream("test", "this test with test");
  test.reset();
  CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("this"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("test"));

  assertThat(test.incrementToken(), equalTo(true));
  assertThat(termAttribute.toString(), equalTo("with"));

  assertThat(test.incrementToken(), equalTo(false));
}
private boolean getNextToken(final int pos) throws IOException {
  assert pos >= 0;
  final boolean ret;
  if (pos == ngramSize) {
    ret = true;
  } else {
    final int ich = input.read();
    if (ich == -1) {
      termAtt.setEmpty();
      ret = false;
    } else {
      final char ch = (char) ich;
      if (ch == ' ') {
        // skip the space without consuming a position in the n-gram
        ret = getNextToken(pos);
      } else {
        termAtt.append(ch);
        ret = getNextToken(pos + 1);
      }
    }
  }
  return ret;
}
@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    // there are no remaining tokens from the current sentence... are there more sentences?
    if (input.incrementToken()) {
      tokStart = offsetAtt.startOffset();
      tokEnd = offsetAtt.endOffset();
      // if length by start + end offsets doesn't match the term text then assume
      // this is a synonym and don't adjust the offsets.
      hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
      // a new sentence is available: process it.
      tokenBuffer = splitIntoTokens(termAtt.toString(), offsetAtt.startOffset());
      tokenIter = tokenBuffer.iterator();
      // it should not be possible to have a sentence with 0 words, check just in case.
      // returning EOS isn't the best either, but it's the behavior of the original code.
      if (!tokenIter.hasNext()) return false;
    } else {
      return false; // no more sentences, end of stream!
    }
  }
  // WordTokenFilter must clear attributes, as it is creating new tokens.
  clearAttributes();
  // There are remaining tokens from the current sentence, return the next one.
  SegToken nextWord = tokenIter.next();
  termAtt.append(nextWord.term);
  // termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(nextWord.start, nextWord.end);
  }
  typeAtt.setType("word");
  return true;
}
public TokenIndex getTokenIndex(String str) {
  TokenIndex ret = new TokenIndex();
  try {
    Tokenizer tokenizer =
        new JapaneseTokenizer(
            new StringReader(str),
            null,
            true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    // stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    // stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    // stream = new LowerCaseFilter(matchVersion, stream);

    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

    // the stream must be reset before consuming tokens
    stream.reset();
    while (stream.incrementToken()) {
      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String token = charTermAttribute.toString();
      ret.add(startOffset, endOffset);
      // System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
    }
    stream.end();
    stream.close();
  } catch (java.io.IOException e) {
    System.err.println(e);
  }
  return ret;
}
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
    throws IOException {
  if (analyzer == null) {
    throw new UnsupportedOperationException(
        "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
  }
  try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
    int tokenCount = 0;
    // for every token
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      tokenCount++;
      if (tokenCount > maxNumTokensParsed) {
        break;
      }
      if (isNoiseWord(word)) {
        continue;
      }
      // increment frequency
      Int cnt = termFreqMap.get(word);
      if (cnt == null) {
        termFreqMap.put(word, new Int());
      } else {
        cnt.x++;
      }
    }
    ts.end();
  }
}
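// Hedged usage sketch (not part of the original snippet): the surrounding MoreLikeThis-style
// class, its "Int" counter type, and the field name "body" are assumptions for illustration.
Map<String, Int> termFreqMap = new HashMap<>();
addTermFrequencies(new StringReader("lucene makes search fast, lucene is flexible"), termFreqMap, "body");
// Each non-noise token now maps to a counter; repeated tokens such as "lucene" have their
// counter incremented once per additional occurrence.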
/**
 * Sugar: analyzes the text with the analyzer and separates by {@link
 * SynonymMap#WORD_SEPARATOR}. reuse and its chars must not be null.
 */
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.clear();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException(
            "term: " + text + " analyzed to a token with posinc != 1");
      }
      reuse.grow(reuse.length() + length + 1); /* current + word + separator */
      int end = reuse.length();
      if (reuse.length() > 0) {
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
      }
      System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
      reuse.setLength(reuse.length() + length);
    }
    ts.end();
  }
  if (reuse.length() == 0) {
    throw new IllegalArgumentException(
        "term: " + text + " was completely eliminated by analyzer");
  }
  return reuse.get();
}
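// Hedged usage sketch (illustrative, not from the original source): analyzing a multi-word
// synonym input before registering it in a SynonymMap.Builder-style structure.
CharsRefBuilder scratch = new CharsRefBuilder();
CharsRef analyzed = analyze("big apple", scratch);
// With a whitespace-style analyzer this yields "big" + SynonymMap.WORD_SEPARATOR + "apple"
// as a single CharsRef, ready to be used as a key or value in a synonym map.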
@Override
public final boolean incrementToken() throws IOException {
  while (true) {
    if (curTermBuffer == null) {
      if (!input.incrementToken()) {
        return false;
      } else {
        curTermBuffer = termAtt.buffer().clone();
        curTermLength = termAtt.length();
        curGramSize = minGram;
        tokStart = offsetAtt.startOffset();
      }
    }
    if (curGramSize <= maxGram) {
      if (!(curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
          || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit
        // grab gramSize chars from front or back
        int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
        int end = start + curGramSize;
        clearAttributes();
        offsetAtt.setOffset(tokStart + start, tokStart + end);
        termAtt.copyBuffer(curTermBuffer, start, curGramSize);
        curGramSize++;
        return true;
      }
    }
    curTermBuffer = null;
  }
}
/** @param analyzer the analyzer used to split {@code inputs} into terms */
private static void cutWordsForSingle(Analyzer analyzer, String inputs, Random random) {
  try {
    TokenStream ts = analyzer.tokenStream("text", new StringReader(inputs));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      if (termAtt.length() > 0) {
        word += ts.getAttribute(CharTermAttribute.class).toString();
        word += "|" + (random.nextFloat() + 10) + " "; // concatenate as "word|23.3 word1|43.4"
        i++;
        // add to the index
        if (i >= throftdocnum) { // flush one document every throftdocnum (100) terms
          i = 0;
          docnum++;
          // System.out.println(word);
          addDoc(docnum + patch, word);
          word = "";
        }
        j++;
      }
    }
    ts.end();
    ts.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    if (input.incrementToken()) {
      tokStart = offsetAtt.startOffset();
      tokEnd = offsetAtt.endOffset();
      hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
      tokenBuffer = wordSegmenter.getTokendWords(termAtt.toString());
      tokenIter = tokenBuffer.iterator();
      if (!tokenIter.hasNext()) return false;
    } else {
      return false;
    }
  }
  clearAttributes();
  TokendWords nextWord = tokenIter.next();
  // cache the buffer so next() is only invoked once
  char[] nextBuffer = nextWord.next();
  termAtt.copyBuffer(nextBuffer, 0, nextBuffer.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(nextWord.start, nextWord.end);
  }
  typeAtt.setType("word");
  return true;
}
public static List<String> analyze(String content) {
  List<String> resultList = null;
  try {
    // create the analyzer
    resultList = new ArrayList<String>(1);
    resultList.add(content);
    IKAnalyzer analyzer = new IKAnalyzer(true);
    analyzer.setUseSmart(true);
    StringReader reader = new StringReader(content);
    // tokenize
    TokenStream tokenStream = analyzer.tokenStream("", reader);
    CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    // iterate over the tokens
    while (tokenStream.incrementToken()) {
      if (!term.toString().isEmpty()) {
        resultList.add(term.toString());
      }
    }
    reader.close();
  } catch (IOException ex) {
    logger.error("tokenization failed", ex);
  }
  return resultList;
}
@Override
public boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    char[] buffer = termAtt.buffer();
    int length = termAtt.length();
    for (int i = 0; i < length; i++) {
      int ch = Character.codePointAt(buffer, i, length);
      // look for digits outside of basic latin
      if (ch > 0x7F && Character.isDigit(ch)) {
        // replace with equivalent basic latin digit
        buffer[i] = (char) ('0' + Character.getNumericValue(ch));
        // if the original was supplementary, shrink the string
        if (ch > 0xFFFF) {
          length = StemmerUtil.delete(buffer, i + 1, length);
          termAtt.setLength(length);
        }
      }
    }
    return true;
  } else {
    return false;
  }
}
/** Rigorous test :-) */
public void testBasicKoreanAnalyzer() throws IOException {
  // text to tokenize
  final String text = "그러면 조개가 쏘옥 올라온다";
  // String text = "※ 청년,창직,창업 인턴제";
  // String text = "저는 대학생이구요. 소프트웨어 관련학과 입니다. DB는 수업을 한번 들은 적이 있으며, 수학은 대학에서 통계학, 선형대수학, 이산수학, 대학수학 등을 배웠지만... 자주 사용을 안하다보니 모두 까먹은 상태입니다.";

  BasicKoreanAnalyzer analyzer = new BasicKoreanAnalyzer();
  TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

  // get the term and offset attributes from the TokenStream
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);

  stream.reset();

  int[] lengths = {3, 2, 2, 3};
  int i = 0;
  // check all tokens until the stream is exhausted
  while (stream.incrementToken()) {
    // System.out.println(termAtt.toString() + ": " + termAtt.length() + " ("
    //     + offsetAtt.startOffset() + ":" + offsetAtt.endOffset() + ")");
    Assert.assertEquals(lengths[i], termAtt.length());
    i++;
  }
  stream.end();
  stream.close();
}
@Test
public void testCase2() throws Exception {
  StringReader reader = new StringReader("고속도로");

  nouns.add(getToken("고속도로", 0, 4));
  nouns.add(getToken("고속도", 0, 3));
  nouns.add(getToken("고속", 0, 2));
  nouns.add(getToken("속도", 1, 3));
  nouns.add(getToken("고", 0, 1));

  Analyzer analyzer = new KoreanAnalyzer();
  TokenStream stream = analyzer.reusableTokenStream("dummy", reader);
  CharTermAttribute charTermAtt = stream.getAttribute(CharTermAttribute.class);
  OffsetAttribute offSetAtt = stream.getAttribute(OffsetAttribute.class);

  while (stream.incrementToken()) {
    TestToken t =
        getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
    System.out.println("termAtt.term() : " + charTermAtt.toString());
    System.out.println("offSetAtt : " + offSetAtt.startOffset());
    System.out.println("offSetAtt : " + offSetAtt.endOffset());
    Assert.assertTrue(nouns.contains(t));
  }
}
@Override
public final boolean incrementToken() throws IOException {
  if (isMailto) {
    termAtt.setEmpty();
    // return the scheme + the mail part
    isMailto = false;
    posIncrAtt.setPositionIncrement(0);
    termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position());
    return true;
  }

  if (input.incrementToken()) {
    final String type = typeAtt.type();
    if (type.equals(TupleTokenizer.getTokenTypes()[TupleTokenizer.URI])
        && this.isMailtoScheme()) {
      this.updateBuffer();
      termBuffer.put(termAtt.buffer(), 0, termAtt.length());
      // return only the mail part
      posIncrAtt.setPositionIncrement(1);
      termAtt.copyBuffer(termBuffer.array(), 7, termBuffer.position() - 7);
    }
    return true;
  }
  return false;
}
/** {@inheritDoc} */
@Override
public boolean incrementToken() throws IOException {
  while (input.incrementToken()) {
    char[] term = termAttribute.buffer();
    int termLength = termAttribute.length();

    if (termLength > 0 && term[termLength - 1] == '-') {
      // a hyphenated word
      // capture the state of the first token only
      if (savedState == null) {
        savedState = captureState();
      }
      hyphenated.append(term, 0, termLength - 1);
    } else if (savedState == null) {
      // not part of a hyphenated word.
      return true;
    } else {
      // the final portion of a hyphenated word
      hyphenated.append(term, 0, termLength);
      unhyphenate();
      return true;
    }
  }

  if (savedState != null) {
    // the final term ends with a hyphen
    // add back the hyphen, for backwards compatibility.
    hyphenated.append('-');
    unhyphenate();
    return true;
  }

  return false;
}
private String getAnalyzerResult(String suggestion) {
  TokenStream ts = null;
  try {
    Reader reader = new StringReader(suggestion);
    ts = this.suggestionAnalyzer.tokenStream("", reader);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      if (word != null && word.length() > 0) {
        return word;
      }
    }
  } catch (Exception ex) {
    // use %s placeholders: String.format does not understand MessageFormat-style {0} tokens
    if (this.field != null) {
      LOG.error(
          String.format(
              "Error executing analyzer for field: %s in DiceSuggester on suggestion: %s",
              this.field, suggestion),
          ex);
    } else if (this.fieldTypeName != null) {
      LOG.error(
          String.format(
              "Error executing analyzer for field type: %s in DiceSuggester on suggestion: %s",
              this.fieldTypeName, suggestion),
          ex);
    }
  } finally {
    if (ts != null) {
      IOUtils.closeWhileHandlingException(ts);
    }
  }
  return null;
}
@Override
public final boolean incrementToken() throws IOException {
  if (!tokens.isEmpty()) {
    assert current != null;
    CompoundToken token = tokens.removeFirst();
    restoreState(current); // keep all other attributes untouched
    termAtt.setEmpty().append(token.txt);
    offsetAtt.setOffset(token.startOffset, token.endOffset);
    posIncAtt.setPositionIncrement(0);
    return true;
  }

  current = null; // not really needed, but for safety
  if (input.incrementToken()) {
    // Only words longer than minWordSize get processed
    if (termAtt.length() >= this.minWordSize) {
      decompose();
      // only capture the state if we really need it for producing new tokens
      if (!tokens.isEmpty()) {
        current = captureState();
      }
    }
    // return original token:
    return true;
  } else {
    return false;
  }
}
private String tokenizerToString(Tokenizer tokenizer) throws Exception {
  OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncrAtt =
      tokenizer.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
  SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
  PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

  StringBuilder result = new StringBuilder();
  tokenizer.reset();
  while (tokenizer.incrementToken()) {
    result.append(new String(term.buffer(), 0, term.length())).append(":");
    result.append(type.type()).append(":");
    result.append(pos.partOfSpeech()).append(":");
    result.append(semanticClass.semanticClass()).append(":");
    result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":");
    result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":");
    result.append(String.valueOf(extOffset.startOffset())).append(":");
    result.append(String.valueOf(extOffset.endOffset()));
    result.append(",");
  }
  tokenizer.end();
  return result.toString();
}
public List<Document> searchDocuments(String text) {
  List<Document> documents = new ArrayList<Document>();
  try {
    TokenStream tokenStream = analyzer.tokenStream("text", text);
    CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();

    BooleanQuery bQuery = new BooleanQuery();
    while (tokenStream.incrementToken()) {
      String token = charTermAtt.toString();
      TermQuery tq = new TermQuery(new Term("text", token));
      tq.setBoost(2f);
      bQuery.add(tq, Occur.MUST);
    }
    tokenStream.close();

    TopDocs results = searcher.search(bQuery, 100000);
    ScoreDoc[] hits = results.scoreDocs;
    for (ScoreDoc hit : hits) {
      Document doc = searcher.doc(hit.doc);
      doc.add(new FloatField("score", hit.score, FloatField.TYPE_STORED));
      documents.add(doc);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return documents;
}
public static void displayTokens(TokenStream stream) throws IOException {
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  stream.reset(); // the stream must be reset before the first incrementToken() call
  while (stream.incrementToken()) {
    System.out.println("[" + term.toString() + "] ");
  }
  stream.end();
}
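// Hedged usage sketch (illustrative, not from the original source): the analyzer choice and
// field name are assumptions; the helper above handles reset() and end() internally.
try (Analyzer analyzer = new StandardAnalyzer();
     TokenStream stream = analyzer.tokenStream("content", "The quick brown fox")) {
  displayTokens(stream); // prints one bracketed term per line, e.g. [quick]
}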
/* (non-Javadoc)
 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
 */
@Override
public boolean incrementToken() throws IOException {
  // clear all token attributes
  clearAttributes();
  skippedPositions = 0;

  Lexeme nextLexeme = _IKImplement.next();
  if (nextLexeme != null) {
    posIncrAtt.setPositionIncrement(skippedPositions + 1);
    // copy the Lexeme into the attributes
    // set the term text
    termAtt.append(nextLexeme.getLexemeText());
    // set the term length
    termAtt.setLength(nextLexeme.getLength());
    // set the term offsets
    offsetAtt.setOffset(
        correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
    // record the end position of this segmentation step
    endPosition = nextLexeme.getEndPosition();
    // record the lexeme type
    typeAtt.setType(nextLexeme.getLexemeTypeString());
    // return true to signal that another token is available
    return true;
  }
  // return false to signal that no tokens are left
  return false;
}
public Map<String, Double> search(String text) {
  Map<String, Double> similar = new HashMap<String, Double>();
  try {
    TokenStream tokenStream = analyzer.tokenStream("text", text);
    CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();

    BooleanQuery bQuery = new BooleanQuery();
    while (tokenStream.incrementToken()) {
      String token = charTermAtt.toString();
      TermQuery tq = new TermQuery(new Term("text", token));
      tq.setBoost(2f);
      bQuery.add(tq, Occur.MUST);
    }
    tokenStream.close();

    TopDocs results = searcher.search(bQuery, 100000);
    ScoreDoc[] hits = results.scoreDocs;
    for (ScoreDoc hit : hits) {
      Document doc = searcher.doc(hit.doc);
      similar.put(doc.get("id"), Double.valueOf(hit.score));
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return similar;
}
@Override
public boolean incrementToken() throws IOException {
  if (!terms.isEmpty()) {
    char[] buffer = terms.poll();
    termAttribute.setEmpty();
    termAttribute.copyBuffer(buffer, 0, buffer.length);
    posIncAttr.setPositionIncrement(1);
    return true;
  }

  if (!input.incrementToken()) {
    return false;
  } else {
    final char[] term = termAttribute.buffer();
    final int length = termAttribute.length();

    int k = 0;
    for (; k < length; k++) {
      if (term[k] == tokenDelimiter) {
        break;
      }
    }

    LinkedList<CharBuffer> buffers = permuteTerms(term, 0, length);
    for (CharBuffer cb : buffers) {
      terms.add(cb.array());
    }

    // we return true and leave the original token unchanged
    return true;
  }
}
public void refineDocument(Document doc) {
  TokenStream tokenStream =
      new StandardTokenizer(Version.LUCENE_36, new StringReader(doc.getContent()));
  tokenStream = new StopFilter(Version.LUCENE_36, tokenStream, stopWords);
  tokenStream = new PorterStemFilter(tokenStream);

  StringBuilder sb = new StringBuilder();
  CharTermAttribute charTermAttr = tokenStream.getAttribute(CharTermAttribute.class);
  List<String> words = new ArrayList<String>();
  Set<String> uniqueWords = new HashSet<String>();
  try {
    while (tokenStream.incrementToken()) {
      String word = charTermAttr.toString();
      // int wordVal = textToInt(charTermAttr.toString());
      words.add(word);
      uniqueWords.add(word);
      dictionary.add(word);
      if (sb.length() > 0) {
        sb.append(" ");
      }
      sb.append(word);
    }
  } catch (IOException e) {
    System.out.println(e.getMessage());
  }
  doc.setRefinedContent(sb.toString());
  doc.setWords(words);
  doc.setUniqueWords(uniqueWords);
}
/**
 * Uses the solr.ASCIIFoldingFilter to convert a string to its ASCII equivalent. See solr
 * documentation for full details.
 *
 * <p>When doing the conversion, this method mirrors GBIF's registry-solr schema configuration for
 * {@code <fieldType name="text_auto_ngram">}. For example, it uses the KeywordTokenizer that
 * treats the entire string as a single token, regardless of its content. See the solr
 * documentation for more details.
 *
 * <p>This method is needed when checking if the query string matches the dataset title. For
 * example, if the query string is "straße", it won't match the dataset title "Schulhof Gymnasium
 * Hürth Bonnstrasse" unless "straße" gets converted to its ASCII equivalent "strasse".
 *
 * @param q query string
 * @return query string converted to ASCII equivalent
 * @see org.gbif.portal.action.dataset.SearchAction#addMissingHighlighting(String, String)
 * @see org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
 * @see org.apache.lucene.analysis.core.KeywordTokenizer
 */
protected static String foldToAscii(String q) {
  if (!Strings.isNullOrEmpty(q)) {
    ASCIIFoldingFilter filter = null;
    try {
      StringReader reader = new StringReader(q);
      TokenStream stream = new KeywordTokenizer(reader);
      filter = new ASCIIFoldingFilter(stream);
      CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
      filter.reset();
      filter.incrementToken();
      // convert q to its ASCII equivalent and return it
      return termAtt.toString();
    } catch (IOException e) {
      // swallow
    } finally {
      if (filter != null) {
        try {
          filter.end();
          filter.close();
        } catch (IOException e) {
          // swallow
        }
      }
    }
  }
  return q;
}
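// Hedged usage sketch (illustrative, not from the original source):
String folded = foldToAscii("straße");
// With the ASCII folding filter this yields "strasse", matching the dataset-title example in
// the Javadoc; null or empty input is returned unchanged.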
/*
 * (non-Javadoc)
 *
 * @see org.apache.lucene.analysis.TokenStream#next()
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  skippedPositions = 0;

  while (true) {
    int tokenType = scanner.getNextToken();

    if (tokenType == StandardTokenizerInterface.YYEOF) {
      return false;
    }

    if (scanner.yylength() <= maxTokenLength) {
      posIncrAtt.setPositionIncrement(skippedPositions + 1);
      scanner.getText(termAtt);
      final int start = scanner.yychar();
      offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
      // This 'if' should be removed in the next release. For now, it converts
      // invalid acronyms to HOST. When removed, only the 'else' part should
      // remain.
      if (tokenType == StandardTokenizer.ACRONYM_DEP) {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
        termAtt.setLength(termAtt.length() - 1); // remove extra '.'
      } else {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
      }
      return true;
    } else {
      // When we skip a too-long term, we still increment the position increment
      skippedPositions++;
    }
  }
}
public void cleanText(String... inboundTexts) {
  try {
    final List<String> fields = Lists.newArrayList();
    for (String raw : inboundTexts) {
      // Tidy t = new Tidy();
      // t.setErrout(new PrintWriter(new ByteArrayOutputStream()));
      // StringWriter out = new StringWriter();
      // t.parse(new StringReader(raw), out);
      // String tidied = out.getBuffer().toString();
      // logger.debug("{}",tidied);
      // AutoDetectParser p = new AutoDetectParser();
      // p.parse(new ByteArrayInputStream(raw.getBytes()),
      //     new TextContentHandler(new DefaultHandler() {
      //       @Override
      //       public void characters(char[] ch, int start, int length) throws SAXException {
      //         CharBuffer buf = CharBuffer.wrap(ch, start, length);
      //         String s = buf.toString();
      //         logger.debug("{}",s);
      //         fields.add(s);
      //       }
      //     }), new Metadata());
    }
    Analyzer analyzer = new StandardAnalyzer();
    // String joinedFields = Joiner.on(" ").join(fields).replaceAll("\\s+", " ");
    String joinedFields = Joiner.on(" ").join(inboundTexts).replaceAll("\\s+", " ");
    logger.debug("{}", joinedFields);

    StringReader in = new StringReader(joinedFields);
    TokenStream ts = analyzer.tokenStream("content", in);
    ts = new LowerCaseFilter(ts);
    // reset once the full filter chain is assembled
    ts.reset();

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    List<String> words = Lists.newArrayList();
    while (ts.incrementToken()) {
      char[] termBuffer = termAtt.buffer();
      int termLen = termAtt.length();
      String w = new String(termBuffer, 0, termLen);
      words.add(w);
    }
    ts.end();
    ts.close();
    analyzer.close();

    scrubbedWords = new ArrayList<String>();
    for (String word : words) {
      if (word.length() >= MINWORDLEN && !stopwords.contains(word)) {
        scrubbedWords.add(word);
      } else {
        logger.debug("Ignoring word: {}", word);
      }
    }
    // this.scrubbedWords = words;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
  if (Resources.debug) {
    Resources.LOGGER.debug("TokenParser:" + input);
    Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
  }
  TokenStream tokenStream = analyzer.tokenStream("input", input);
  OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute positionIncrementAttribute =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
  tokenStream.reset();

  int position = 0;
  List<TermInfo> infos = new ArrayList<TermInfo>();
  while (tokenStream.incrementToken()) {
    int increment = positionIncrementAttribute.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
      if (Resources.debug) {
        Resources.LOGGER.debug(position + ":");
      }
    }
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
    TermInfo info = new TermInfo();
    info.setStart(startOffset);
    info.setEnd(endOffset);
    infos.add(info);
    if (Resources.debug) {
      Resources.LOGGER.debug(
          "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
    }
  }
  tokenStream.end();
  tokenStream.close();

  Stack<TermInfo> tiStack = groupTokenInfos(infos);
  List<String> terms = new ArrayList<String>();
  while (!tiStack.isEmpty()) {
    TermInfo termInfo = tiStack.pop();
    if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
      String term = input.substring(termInfo.getStart(), termInfo.getEnd());
      terms.add(term);
    }
  }
  return terms.toArray(new String[0]);
}
public void testConfigureCamelCaseTokenFilter() throws IOException {
  Settings settings =
      Settings.builder()
          .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
          .build();
  Settings indexSettings =
      Settings.builder()
          .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
          .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
          .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
          .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
          .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
          .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
          .putArray(
              "index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
          .build();

  IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
  IndexAnalyzers indexAnalyzers =
      new AnalysisModule(new Environment(settings), emptyList())
          .getAnalysisRegistry()
          .build(idxSettings);

  try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
    assertNotNull(custom_analyser);
    TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> token = new ArrayList<>();
    while (tokenStream.incrementToken()) {
      token.add(charTermAttribute.toString());
    }
    assertEquals(token.toString(), 2, token.size());
    assertEquals("j2se", token.get(0));
    assertEquals("j2ee", token.get(1));
  }

  try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
    assertNotNull(custom_analyser);
    TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> token = new ArrayList<>();
    while (tokenStream.incrementToken()) {
      token.add(charTermAttribute.toString());
    }
    assertEquals(token.toString(), 6, token.size());
    assertEquals("j", token.get(0));
    assertEquals("2", token.get(1));
    assertEquals("se", token.get(2));
    assertEquals("j", token.get(3));
    assertEquals("2", token.get(4));
    assertEquals("ee", token.get(5));
  }
}
@Override
public boolean incrementToken() throws IOException {
  while (input.incrementToken()) {
    String term = new String(termAttribute.buffer(), 0, termAttribute.length());
    if (StringUtils.isAlphanumeric(term)) {
      return true;
    }
  }
  return false;
}