@Test
public void testCase2() throws Exception {
  StringReader reader = new StringReader("고속도로");
  nouns.add(getToken("고속도로", 0, 4));
  nouns.add(getToken("고속도", 0, 3));
  nouns.add(getToken("고속", 0, 2));
  nouns.add(getToken("속도", 1, 3));
  nouns.add(getToken("고", 0, 1));
  Analyzer analyzer = new KoreanAnalyzer();
  TokenStream stream = analyzer.reusableTokenStream("dummy", reader);
  CharTermAttribute charTermAtt = stream.getAttribute(CharTermAttribute.class);
  OffsetAttribute offSetAtt = stream.getAttribute(OffsetAttribute.class);
  stream.reset(); // the TokenStream contract requires reset() before incrementToken()
  while (stream.incrementToken()) {
    TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
    System.out.println("termAtt.term() : " + charTermAtt.toString());
    System.out.println("offSetAtt : " + offSetAtt.startOffset());
    System.out.println("offSetAtt : " + offSetAtt.endOffset());
    Assert.assertTrue(nouns.contains(t));
  }
}
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
  TypeAttribute type = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
  stream.reset(); // required before the first incrementToken() call
  int position = 0;
  while (stream.incrementToken()) {
    int increment = posIncr.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
      System.out.println();
      System.out.print(position + ":");
    }
    BytesRef pl = payload.getPayload();
    if (pl != null) {
      System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
          + ":" + type.type() + ":" + new String(pl.bytes) + "] ");
    } else {
      System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
          + ":" + type.type() + "] ");
    }
  }
  stream.end();   // complete the stream contract (records final offset state)
  stream.close(); // release the underlying reader
  System.out.println();
}
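A minimal sketch of how this utility might be invoked; the analyzer choice and sample text are illustrative and not part of the snippet above (StandardAnalyzer's constructor arguments vary across Lucene versions):

// Hypothetical driver for displayTokensWithFullDetails; analyzer and text are assumptions.
public static void main(String[] args) throws IOException {
  Analyzer analyzer = new StandardAnalyzer();
  displayTokensWithFullDetails(analyzer, "the quick brown fox");
}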
@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    // there are no remaining tokens from the current sentence... are there more sentences?
    if (input.incrementToken()) {
      tokStart = offsetAtt.startOffset();
      tokEnd = offsetAtt.endOffset();
      // if length by start + end offsets doesn't match the term text then assume
      // this is a synonym and don't adjust the offsets.
      hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
      // a new sentence is available: process it.
      tokenBuffer = splitIntoTokens(termAtt.toString(), offsetAtt.startOffset());
      tokenIter = tokenBuffer.iterator();
      // it should not be possible to have a sentence with 0 words, check just in case.
      // returning EOS isn't the best either, but it's the behavior of the original code.
      if (!tokenIter.hasNext()) return false;
    } else {
      return false; // no more sentences, end of stream!
    }
  }
  // WordTokenFilter must clear attributes, as it is creating new tokens.
  clearAttributes();
  // There are remaining tokens from the current sentence, return the next one.
  SegToken nextWord = tokenIter.next();
  termAtt.append(nextWord.term);
  // termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(nextWord.start, nextWord.end);
  }
  typeAtt.setType("word");
  return true;
}
// reconstruct the unused tokens from the phrase (since it didn't match)
// need to recompute the token positions based on the length of the currentPhrase,
// the current ending position and the length of each token.
private void discardCharTokens(StringBuffer phrase, ArrayList<Token> tokenList) {
  Log.debug("discardCharTokens: '" + phrase.toString() + "'");
  OffsetAttribute offAttr = getOffsetAttribute();
  int endPos = offAttr.endOffset();
  int startPos = endPos - phrase.length();
  int lastSp = 0;
  for (int i = 0; i < phrase.length(); i++) {
    char chAt = phrase.charAt(i);
    if (isSpaceChar(chAt) && i > lastSp) {
      char[] tok = new char[i - lastSp];
      phrase.getChars(lastSp, i, tok, 0);
      if (lastEmitted == null || !endsWith(lastEmitted, tok)) {
        Token token = new Token();
        token.tok = tok;
        token.startPos = startPos + lastSp;
        token.endPos = token.startPos + tok.length;
        Log.debug("discard " + new String(tok) + ": " + token.startPos + ", " + token.endPos);
        tokenList.add(token);
      }
      lastSp = i + 1;
    }
  }
  char[] tok = new char[phrase.length() - lastSp];
  phrase.getChars(lastSp, phrase.length(), tok, 0);
  Token token = new Token();
  token.tok = tok;
  token.endPos = endPos;
  token.startPos = endPos - tok.length;
  tokenList.add(token);
}
public TokenIndex getTokenIndex(String str) {
  TokenIndex ret = new TokenIndex();
  try {
    Tokenizer tokenizer =
        new JapaneseTokenizer(
            new StringReader(str),
            null,
            true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    // stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    // stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    // stream = new LowerCaseFilter(matchVersion, stream);
    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // required before consuming the stream
    while (stream.incrementToken()) {
      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String token = charTermAttribute.toString();
      ret.add(startOffset, endOffset);
      // System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
    }
    stream.end();
    stream.close();
  } catch (java.io.IOException e) {
    System.err.println(e);
  }
  return ret;
}
private String tokenizerToString(Tokenizer tokenizer) throws Exception { OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class); CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class); TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class); SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class); PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class); StringBuilder result = new StringBuilder(); tokenizer.reset(); while (tokenizer.incrementToken() == true) { result.append(new String(term.buffer(), 0, term.length())).append(":"); result.append(type.type()).append(":"); result.append(pos.partOfSpeech()).append(":"); result.append(semanticClass.semanticClass()).append(":"); result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":"); result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":"); result.append(String.valueOf(extOffset.startOffset())).append(":"); result.append(String.valueOf(extOffset.endOffset())); result.append(","); } tokenizer.end(); return result.toString(); }
@Override public boolean incrementToken() throws IOException { if (tokenIter == null || !tokenIter.hasNext()) { if (input.incrementToken()) { tokStart = offsetAtt.startOffset(); tokEnd = offsetAtt.endOffset(); hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd; tokenBuffer = wordSegmenter.getTokendWords(termAtt.toString()); tokenIter = tokenBuffer.iterator(); if (!tokenIter.hasNext()) return false; } else { return false; } } clearAttributes(); TokendWords nextWord = tokenIter.next(); termAtt.copyBuffer(nextWord.next(), 0, nextWord.next().length); if (hasIllegalOffsets) { offsetAtt.setOffset(tokStart, tokEnd); } else { offsetAtt.setOffset(nextWord.start, nextWord.end); } typeAtt.setType("word"); return true; }
private static String[] groupTokens(Analyzer analyzer, String input) throws IOException { if (Resources.debug) { Resources.LOGGER.debug("TokenParser:" + input); Resources.LOGGER.debug("Analyzer:" + analyzer.getClass()); } TokenStream tokenStream = analyzer.tokenStream("input", input); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class); tokenStream.reset(); int position = 0; List<TermInfo> infos = new ArrayList<TermInfo>(); while (tokenStream.incrementToken()) { int increment = positionIncrementAttribute.getPositionIncrement(); if (increment > 0) { position = position + increment; if (Resources.debug) { Resources.LOGGER.debug(position + ":"); } } int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); TermInfo info = new TermInfo(); info.setStart(startOffset); info.setEnd(endOffset); infos.add(info); if (Resources.debug) { Resources.LOGGER.debug( "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type()); } } tokenStream.end(); tokenStream.close(); Stack<TermInfo> tiStack = groupTokenInfos(infos); List<String> terms = new ArrayList<String>(); while (!tiStack.isEmpty()) { TermInfo termInfo = tiStack.pop(); if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) { String term = input.substring(termInfo.getStart(), termInfo.getEnd()); terms.add(term); } } return terms.toArray(new String[] {}); }
@Override public boolean incrementToken() throws IOException { if (offsetCount < offsetLimit && input.incrementToken()) { int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset(); offsetCount += offsetLength; return true; } return false; }
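This incrementToken() resembles the offset-limiting filter used by Lucene's highlighter; a minimal sketch of the surrounding class it assumes is shown below (only the fields the method references; names mirror the snippet and are otherwise assumptions):

// Assumed class skeleton for the offset-limiting incrementToken() above.
public final class OffsetLimitTokenFilter extends TokenFilter {
  private final int offsetLimit;  // stop passing tokens once this many characters have been consumed
  private int offsetCount;        // characters consumed so far
  private final OffsetAttribute offsetAttrib = addAttribute(OffsetAttribute.class);

  public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
    super(input);
    this.offsetLimit = offsetLimit;
  }

  // incrementToken() as shown above
}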
/* (non-Javadoc)
 * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
 */
public boolean isNewFragment() {
  boolean isNewFrag = false;
  int minFragLen = (int) ((1.0f - slop) * targetFragChars);
  int endOffset = offsetAtt.endOffset();
  // ** determine isNewFrag
  if (posIncAtt.getPositionIncrement() > incrementGapThreshold) {
    // large position gaps always imply new fragments
    isNewFrag = true;
  } else if (endOffset - currentOffset < minFragLen) {
    // we're not in our range of flexibility
    isNewFrag = false;
  } else if (targetOffset > 0) {
    // we've already decided on a target
    isNewFrag = endOffset > targetOffset;
  } else {
    // we might be able to do something
    int minOffset = currentOffset + minFragLen;
    int maxOffset = (int) (currentOffset + (1.0f + slop) * targetFragChars);
    int hotIndex;
    // look for a close hotspot
    hotIndex = Arrays.binarySearch(hotspots, endOffset);
    if (hotIndex < 0) hotIndex = -hotIndex;
    if (hotIndex >= hotspots.length) {
      // no more hotspots in this input stream
      targetOffset = currentOffset + targetFragChars;
    } else if (hotspots[hotIndex] > maxOffset) {
      // no hotspots within slop
      targetOffset = currentOffset + targetFragChars;
    } else {
      // try to find hotspot in slop
      int goal = hotspots[hotIndex];
      while (goal < minOffset && hotIndex < hotspots.length) {
        hotIndex++;
        goal = hotspots[hotIndex];
      }
      targetOffset = goal <= maxOffset ? goal : currentOffset + targetFragChars;
    }
    isNewFrag = endOffset > targetOffset;
  }
  // ** operate on isNewFrag
  if (isNewFrag) {
    currentNumFrags++;
    currentOffset = endOffset;
    targetOffset = -1;
  }
  return isNewFrag;
}
/* (non-Javadoc) * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token) */ @Override public boolean isNewFragment() { int endOffset = offsetAtt.endOffset(); boolean isNewFrag = endOffset >= fragOffset + getFragmentSize() || posIncAtt.getPositionIncrement() > INCREMENT_THRESHOLD; if (isNewFrag) { fragOffset = endOffset; } return isNewFrag; }
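Fragmenter implementations like the two above plug into Lucene's Highlighter. A minimal, assumed wiring sketch follows; the query, field name, fragment size, and input text are illustrative, and exception handling is omitted:

// Hypothetical wiring of a fragmenter into the highlighter; names are assumptions.
Query query = new TermQuery(new Term("contents", "lucene"));
Highlighter highlighter = new Highlighter(new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(100)); // or a custom Fragmenter such as the ones above
String[] fragments = highlighter.getBestFragments(analyzer, "contents", text, 3);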
private void emit(char[] token) { Log.debug("emit: " + new String(token)); if (replaceWhitespaceWith != null) { token = replaceWhiteSpace(token); } CharTermAttribute termAttr = getTermAttribute(); termAttr.setEmpty(); termAttr.append(new StringBuilder().append(token)); OffsetAttribute offAttr = getOffsetAttribute(); if (offAttr != null && offAttr.endOffset() >= token.length) { int start = offAttr.endOffset() - token.length; offAttr.setOffset(start, offAttr.endOffset()); } PositionIncrementAttribute pia = getPositionIncrementAttribute(); if (pia != null) { pia.setPositionIncrement(++positionIncr); } lastEmitted = token; }
/**
 * Override this method to customize the Object representing a single highlighted suggestion; the
 * result is set on each {@link LookupResult#highlightKey} member.
 */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
  try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    StringBuilder sb = new StringBuilder();
    int upto = 0;
    while (ts.incrementToken()) {
      String token = termAtt.toString();
      int startOffset = offsetAtt.startOffset();
      int endOffset = offsetAtt.endOffset();
      if (upto < startOffset) {
        addNonMatch(sb, text.substring(upto, startOffset));
        upto = startOffset;
      } else if (upto > startOffset) {
        continue;
      }
      if (matchedTokens.contains(token)) {
        // Token matches.
        addWholeMatch(sb, text.substring(startOffset, endOffset), token);
        upto = endOffset;
      } else if (prefixToken != null && token.startsWith(prefixToken)) {
        addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
        upto = endOffset;
      }
    }
    ts.end();
    int endOffset = offsetAtt.endOffset();
    if (upto < endOffset) {
      addNonMatch(sb, text.substring(upto));
    }
    return sb.toString();
  }
}
public void printAnalyzerWords(Analyzer analyzer, String field) {
  // Obtain the Lucene TokenStream object
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream(field, this.content);
    // Get the token offset attribute
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    // Get the token term text attribute
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    // Get the token type attribute
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    // Reset the TokenStream (resets the StringReader)
    ts.reset();
    // Iterate over the analysis results
    while (ts.incrementToken()) {
      System.out.println("documents[" + this.id + "]");
      System.out.println(
          offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
    }
    // End the TokenStream (closes the StringReader)
    ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    // Release all resources held by the TokenStream
    if (ts != null) {
      try {
        ts.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
/** Writes the joined unhyphenated term */ private void unhyphenate() { int endOffset = offsetAttribute.endOffset(); restoreState(savedState); savedState = null; char term[] = termAttribute.buffer(); int length = hyphenated.length(); if (length > termAttribute.length()) { term = termAttribute.resizeBuffer(length); } hyphenated.getChars(0, length, term, 0); termAttribute.setLength(length); offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset); hyphenated.setLength(0); }
private static String[] mmsegTokens(Analyzer analyzer, String input) throws IOException { if (Resources.debug) { Resources.LOGGER.debug("TokenParser:" + input); Resources.LOGGER.debug("Analyzer:" + analyzer.getClass()); } TokenStream tokenStream = analyzer.tokenStream("input", input); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class); tokenStream.reset(); int position = 0; List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { int increment = positionIncrementAttribute.getPositionIncrement(); if (increment > 0) { position = position + increment; if (Resources.debug) { Resources.LOGGER.debug(position + ":"); } } int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); tokens.add(term); if (Resources.debug) { Resources.LOGGER.debug( "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type()); } } tokenStream.end(); tokenStream.close(); return tokens.toArray(new String[] {}); }
public static void main(String[] args) {
  // Build the IK analyzer in smart segmentation mode
  Analyzer analyzer = new IKSynonymAnalyzer(Version.LUCENE_4_9, true);
  // Obtain the Lucene TokenStream object
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream("myfield", new StringReader("物理老师数学数学老师"));
    // Get the token offset attribute
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    // Get the token term text attribute
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    // Get the token type attribute
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    // Reset the TokenStream (resets the StringReader)
    ts.reset();
    // Iterate over the analysis results
    while (ts.incrementToken()) {
      System.out.println(
          offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
    }
    // End the TokenStream (closes the StringReader)
    ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    // Release all resources held by the TokenStream
    if (ts != null) {
      try {
        ts.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
/** Saves the existing attribute states */
private void saveState() {
  // otherwise, we have delimiters, save state
  savedStartOffset = offsetAttribute.startOffset();
  savedEndOffset = offsetAttribute.endOffset();
  // if length by start + end offsets doesn't match the term text then assume this is a synonym
  // and don't adjust the offsets.
  hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
  savedType = typeAttribute.type();
  if (savedBuffer.length < termAttribute.length()) {
    savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)];
  }
  System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
  iterator.text = savedBuffer;
  hasSavedState = true;
}
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
  TokenStream stream = analyzer.tokenStream("contents",                    // #A
      new StringReader(text));
  stream.reset();
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);   // #B
  PositionIncrementAttribute posIncr =                                     // #B
      stream.addAttribute(PositionIncrementAttribute.class);               // #B
  OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);     // #B
  TypeAttribute type = stream.addAttribute(TypeAttribute.class);           // #B
  int position = 0;
  while (stream.incrementToken()) {                                        // #C
    int increment = posIncr.getPositionIncrement();                        // #D
    if (increment > 0) {                                                   // #D
      position = position + increment;                                     // #D
      System.out.println();                                                // #D
      System.out.print(position + ": ");                                   // #D
    }
    System.out.print("[" +                                                 // #E
        term + ":" +                                                       // #E
        offset.startOffset() + "->" +                                      // #E
        offset.endOffset() + ":" +                                         // #E
        type.type() + "] ");                                               // #E
  }
  stream.close();
  System.out.println();
}
@Override
public final boolean incrementToken() throws IOException {
  while (true) {
    if (curTermBuffer == null) {
      if (!input.incrementToken()) {
        return false;
      } else {
        curTermBuffer = termAtt.buffer().clone();
        curTermLength = termAtt.length();
        curCodePointCount = charUtils.codePointCount(termAtt);
        curGramSize = minGram;
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        savePosIncr += posIncrAtt.getPositionIncrement();
        savePosLen = posLenAtt.getPositionLength();
      }
    }
    if (curGramSize <= maxGram) {             // if we have hit the end of our n-gram size range, quit
      if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
        // grab gramSize chars from front or back
        clearAttributes();
        offsetAtt.setOffset(tokStart, tokEnd);
        // first ngram gets increment, others don't
        if (curGramSize == minGram) {
          posIncrAtt.setPositionIncrement(savePosIncr);
          savePosIncr = 0;
        } else {
          posIncrAtt.setPositionIncrement(0);
        }
        posLenAtt.setPositionLength(savePosLen);
        final int charLength =
            charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
        termAtt.copyBuffer(curTermBuffer, 0, charLength);
        curGramSize++;
        return true;
      }
    }
    curTermBuffer = null;
  }
}
public boolean incrementToken() throws IOException { if (inPhrase) { inPhrase = false; clearAttributes(); termAtt.setTermBuffer("phrase2"); offsetAtt.setOffset(savedStart, savedEnd); return true; } else while (input.incrementToken()) { if (termAtt.term().equals("phrase")) { inPhrase = true; savedStart = offsetAtt.startOffset(); savedEnd = offsetAtt.endOffset(); termAtt.setTermBuffer("phrase1"); offsetAtt.setOffset(savedStart, savedEnd); return true; } else if (!termAtt.term().equals("stop")) return true; } return false; }
public static List<String> tokenizeString(String textFile) throws IOException {
  EnglishAnalyzer ena = new EnglishAnalyzer(Version.LUCENE_4_10_4);
  TokenStream tokenStream = ena.tokenStream(textFile.trim(), new StringReader(textFile.trim()));
  // StringBuilder sb = new StringBuilder();
  OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
  CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
  tokenStream.reset();
  while (tokenStream.incrementToken()) {
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
    tokens.add(term);
    // sb.append(term + " ");
  }
  tokenStream.end();   // added: complete the stream contract
  tokenStream.close(); // added: release analyzer resources
  return tokens;
}
@Override void newTerm(final int termID) { assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start"); TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray; postings.freqs[termID] = 1; if (doVectorOffsets) { int startOffset = fieldState.offset + offsetAttribute.startOffset(); int endOffset = fieldState.offset + offsetAttribute.endOffset(); termsHashPerField.writeVInt(1, startOffset); termsHashPerField.writeVInt(1, endOffset - startOffset); postings.lastOffsets[termID] = endOffset; } if (doVectorPositions) { termsHashPerField.writeVInt(0, fieldState.position); postings.lastPositions[termID] = fieldState.position; } }
public static void main(String[] args) throws IOException { EdgeNGramAnalyzerWrapper analyzerWrapper = new EdgeNGramAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY); StringReader reader = new StringReader("hello world"); TokenStream ts = analyzerWrapper.tokenStream("gramtext", reader); CharTermAttribute charAtt = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); ts.reset(); while (ts.incrementToken()) { System.out.println( charAtt.toString() + " , " + "start : " + offsetAtt.startOffset() + " , " + "end : " + offsetAtt.endOffset()); } }
public void testSupplementaryCharacters() throws IOException { final String s = _TestUtil.randomUnicodeString(random(), 10); final int codePointCount = s.codePointCount(0, s.length()); final int minGram = _TestUtil.nextInt(random(), 1, 3); final int maxGram = _TestUtil.nextInt(random(), minGram, 10); TokenStream tk = new KeywordTokenizer(new StringReader(s)); tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram); final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); tk.reset(); for (int start = 0; start < codePointCount; ++start) { for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { assertTrue(tk.incrementToken()); assertEquals(0, offsetAtt.startOffset()); assertEquals(s.length(), offsetAtt.endOffset()); final int startIndex = Character.offsetByCodePoints(s, 0, start); final int endIndex = Character.offsetByCodePoints(s, 0, end); assertEquals(s.substring(startIndex, endIndex), termAtt.toString()); } } assertFalse(tk.incrementToken()); }
@Override public boolean incrementToken() throws IOException { while (true) { if (bufferedToken == null) { if (!bufferedTokenStream.incrementToken()) return false; bufferedToken = bufferedTokenStream.captureState(); bufferedStartOffset = bufferedOffsetAtt.startOffset(); bufferedEndOffset = bufferedOffsetAtt.endOffset(); } if (startOffset <= bufferedStartOffset && bufferedEndOffset <= endOffset) { restoreState(bufferedToken); bufferedToken = null; offsetAtt.setOffset( offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset); return true; } else if (bufferedEndOffset > endOffset) { startOffset += length + 1; return false; } bufferedToken = null; } }
/**
 * @param input
 * @param reusableToken if null, a new Token is created automatically.
 * @return null if there is no next token or the input is null.
 * @throws IOException
 */
public static Token nextToken(TokenStream input, Token reusableToken) throws IOException {
  if (input == null) {
    return null;
  }
  if (!input.incrementToken()) {
    return null;
  }
  CharTermAttribute termAtt = (CharTermAttribute) input.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = (OffsetAttribute) input.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = (TypeAttribute) input.getAttribute(TypeAttribute.class);
  if (reusableToken == null) {
    reusableToken = new Token();
  }
  reusableToken.clear();
  if (termAtt != null) {
    // lucene 3.0
    // reusableToken.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
    // lucene 3.1
    reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
  }
  if (offsetAtt != null) {
    // lucene 3.1
    // reusableToken.setStartOffset(offsetAtt.startOffset());
    // reusableToken.setEndOffset(offsetAtt.endOffset());
    // lucene 4.0
    reusableToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
  }
  if (typeAtt != null) {
    reusableToken.setType(typeAtt.type());
  }
  return reusableToken;
}
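A minimal, assumed driver for the helper above; the analyzer, field name, and input text are illustrative, and the returned Token is reused on each call as the javadoc allows:

// Hypothetical usage of nextToken(); not part of the original snippet.
TokenStream input = analyzer.tokenStream("f", new StringReader(text));
input.reset();
Token tok = null;
while ((tok = nextToken(input, tok)) != null) {
  System.out.println(tok + " [" + tok.startOffset() + ", " + tok.endOffset() + ") " + tok.type());
}
input.end();
input.close();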
/**
 * Displays the details of the tokens produced by an analyzer.
 *
 * @param analyzer
 * @param text
 * @throws IOException
 */
public static void displayTokens(Analyzer analyzer, String text) throws IOException {
  // Token stream
  TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(text));
  // Get the term attribute of the tokens
  TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
  // Position increment; matters for phrase queries and synonym queries
  PositionIncrementAttribute positionIncrementAttribute =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  // Offsets; useful for highlighting query matches
  OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
  // Token type; usually "word", but also "email" and others
  TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
  int position = 0;
  while (tokenStream.incrementToken()) {
    // Compute position information
    int increment = positionIncrementAttribute.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
    }
    // Print the details of every token
    System.out.println(
        "position : " + position + " [" + termAttribute.term() + ":" + offsetAttribute.startOffset()
            + "->" + offsetAttribute.endOffset() + ":" + typeAttribute.type() + "]");
  }
}
public void tokenise() throws IOException {
  String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}"; // delete full stops and dashes (typically not used).
  if (ignoredElements != null && ignoredElements.length() > 0)
    ignregexp =
        ignregexp
            + "|< *" + ignoredElements + "[^>]*?/>"
            + "|< *" + ignoredElements + ".*?>.*?</" + ignoredElements + " *>";
  if (!tagIndexing) ignregexp = ignregexp + "|<.*?>";
  // ignregexp = ignregexp+"|\\W\\W+";
  Pattern p = Pattern.compile(ignregexp);
  Matcher igns = p.matcher(originalText);
  StringBuffer tx = new StringBuffer(originalText);
  int ct = 1;
  while (igns.find()) {
    int s = igns.start();
    int e = igns.end();
    if (verbose) PrintUtil.printNoMove("Processing exclusions ...", ct++);
    // System.err.println("replacing\n-----------"+originalText.substring(s,e)+"\n--------------");
    char sp[] = new char[e - s];
    for (int j = 0; j < sp.length; j++) {
      sp[j] = ' ';
    }
    tx.replace(s, e, new String(sp));
  }
  if (verbose) PrintUtil.donePrinting();
  ct = 1;
  // verbose = false;
  String text = new String(tx);
  // System.out.println("-->"+text+"<--");
  Tokenizer tokenizer =
      new JapaneseTokenizer(
          new StringReader(text),
          null,
          true,
          org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
  TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
  // stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
  stream = new CJKWidthFilter(stream);
  // stream = new StopFilter(matchVersion, stream, stopwords);
  stream = new JapaneseKatakanaStemFilter(stream);
  // stream = new LowerCaseFilter(matchVersion, stream);
  OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
  CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
  stream.reset(); // required before consuming the stream
  while (stream.incrementToken()) {
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String token = charTermAttribute.toString();
    tokenMap.putPos(token, startOffset);
    // System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
  }
  if (verbose) PrintUtil.donePrinting();
  ct = 1;
}
/** * Iterates over the given token stream and adds the resulting terms to the index; Equivalent to * adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link * org.apache.lucene.document.Field}. Finally closes the token stream. Note that untokenized * keywords can be added with this method via {@link #keywordTokenStream(Collection)}, the Lucene * <code>KeywordTokenizer</code> or similar utilities. * * @param fieldName a name to be associated with the text * @param tokenStream the token stream to retrieve tokens from. It's guaranteed to be closed no * matter what. * @param boost the boost factor for hits for this field * @param positionIncrementGap the position increment gap if fields with the same name are added * more than once * @param offsetGap the offset gap if fields with the same name are added more than once * @see org.apache.lucene.document.Field#setBoost(float) */ public void addField( String fieldName, TokenStream tokenStream, float boost, int positionIncrementGap, int offsetGap) { try (TokenStream stream = tokenStream) { if (frozen) throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen"); if (fieldName == null) throw new IllegalArgumentException("fieldName must not be null"); if (stream == null) throw new IllegalArgumentException("token stream must not be null"); if (boost <= 0.0f) throw new IllegalArgumentException("boost factor must be greater than 0.0"); int numTokens = 0; int numOverlapTokens = 0; int pos = -1; final BytesRefHash terms; final SliceByteStartArray sliceArray; Info info; long sumTotalTermFreq = 0; int offset = 0; FieldInfo fieldInfo; if ((info = fields.get(fieldName)) != null) { fieldInfo = info.fieldInfo; numTokens = info.numTokens; numOverlapTokens = info.numOverlapTokens; pos = info.lastPosition + positionIncrementGap; offset = info.lastOffset + offsetGap; terms = info.terms; boost *= info.boost; sliceArray = info.sliceArray; sumTotalTermFreq = info.sumTotalTermFreq; } else { fieldInfo = new FieldInfo( fieldName, fields.size(), true, false, this.storePayloads, this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, DocValuesType.NONE, -1, Collections.<String, String>emptyMap()); sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY); terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray); } TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class); OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); PayloadAttribute payloadAtt = storePayloads ? 
        stream.addAttribute(PayloadAttribute.class) : null;
    BytesRef ref = termAtt.getBytesRef();
    stream.reset();
    while (stream.incrementToken()) {
      termAtt.fillBytesRef();
      // if (DEBUG) System.err.println("token='" + term + "'");
      numTokens++;
      final int posIncr = posIncrAttribute.getPositionIncrement();
      if (posIncr == 0) numOverlapTokens++;
      pos += posIncr;
      int ord = terms.add(ref);
      if (ord < 0) {
        ord = (-ord) - 1;
        postingsWriter.reset(sliceArray.end[ord]);
      } else {
        sliceArray.start[ord] = postingsWriter.startNewSlice();
      }
      sliceArray.freq[ord]++;
      sumTotalTermFreq++;
      postingsWriter.writeInt(pos);
      if (storeOffsets) {
        postingsWriter.writeInt(offsetAtt.startOffset() + offset);
        postingsWriter.writeInt(offsetAtt.endOffset() + offset);
      }
      if (storePayloads) {
        final BytesRef payload = payloadAtt.getPayload();
        final int pIndex;
        if (payload == null || payload.length == 0) {
          pIndex = -1;
        } else {
          pIndex = payloadsBytesRefs.append(payload);
        }
        postingsWriter.writeInt(pIndex);
      }
      sliceArray.end[ord] = postingsWriter.getCurrentOffset();
    }
    stream.end();
    // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
    if (numTokens > 0) {
      fields.put(
          fieldName,
          new Info(
              fieldInfo,
              terms,
              sliceArray,
              numTokens,
              numOverlapTokens,
              boost,
              pos,
              offsetAtt.endOffset() + offset,
              sumTotalTermFreq));
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
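For context, a minimal sketch of how MemoryIndex is typically used from client code; the field name, text, query, and analyzer are illustrative (StandardAnalyzer's constructor varies across Lucene versions), and the convenience addField(String, String, Analyzer) overload ultimately delegates to a token-stream variant like the method above:

// Hypothetical usage sketch of MemoryIndex; not part of the method above.
MemoryIndex index = new MemoryIndex();
index.addField("content", "readings about salmon and other fish", new StandardAnalyzer());
float score = index.search(new TermQuery(new Term("content", "fish")));
System.out.println(score > 0.0f ? "match" : "no match");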