@Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    if (delimitersCount == -1) {
      int length = 0;
      delimiterPositions.add(0);
      while (true) {
        int c = input.read();
        if (c < 0) {
          break;
        }
        length++;
        if (c == delimiter) {
          delimiterPositions.add(length);
          resultToken.append(replacement);
        } else {
          resultToken.append((char) c);
        }
      }
      delimitersCount = delimiterPositions.size();
      if (delimiterPositions.get(delimitersCount - 1) < length) {
        delimiterPositions.add(length);
        delimitersCount++;
      }
      if (resultTokenBuffer.length < resultToken.length()) {
        resultTokenBuffer = new char[resultToken.length()];
      }
      resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
      resultToken.setLength(0);
      int idx = delimitersCount - 1 - skip;
      if (idx >= 0) {
        // otherwise it's OK, because we will skip and return false
        endPosition = delimiterPositions.get(idx);
      }
      finalOffset = correctOffset(length);
      posAtt.setPositionIncrement(1);
    } else {
      posAtt.setPositionIncrement(0);
    }

    if (skipped < delimitersCount - skip - 1) {
      int start = delimiterPositions.get(skipped);
      termAtt.copyBuffer(resultTokenBuffer, start, endPosition - start);
      offsetAtt.setOffset(correctOffset(start), correctOffset(endPosition));
      skipped++;
      return true;
    }

    return false;
  }
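Note: this method matches the shape of Lucene's ReversePathHierarchyTokenizer, which emits progressively shorter suffix paths of a delimited input. A minimal, hypothetical consumption sketch, assuming a recent Lucene where the no-arg constructor and Tokenizer#setReader are available (StringReader and the attribute imports are assumed):

  // Hypothetical usage sketch; class name and defaults taken from recent Lucene.
  try (Tokenizer t = new ReversePathHierarchyTokenizer()) {
    t.setReader(new StringReader("/a/b/c"));
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      System.out.println(term); // expected: "/a/b/c", "a/b/c", "b/c", "c"
    }
    t.end();
  }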
  private String tokenizerToString(Tokenizer tokenizer) throws Exception {
    OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        tokenizer.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
    PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

    StringBuilder result = new StringBuilder();
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      result.append(new String(term.buffer(), 0, term.length())).append(":");
      result.append(type.type()).append(":");
      result.append(pos.partOfSpeech()).append(":");
      result.append(semanticClass.semanticClass()).append(":");
      result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":");
      result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":");
      result.append(String.valueOf(extOffset.startOffset())).append(":");
      result.append(String.valueOf(extOffset.endOffset()));
      result.append(",");
    }
    tokenizer.end();
    return result.toString();
  }
 /**
  * Sugar: analyzes the text with the analyzer and separates tokens with {@link
  * SynonymMap#WORD_SEPARATOR}. {@code reuse} and its chars must not be null.
  */
 public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
   try (TokenStream ts = analyzer.tokenStream("", text)) {
     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
     ts.reset();
     reuse.clear();
     while (ts.incrementToken()) {
       int length = termAtt.length();
       if (length == 0) {
         throw new IllegalArgumentException(
             "term: " + text + " analyzed to a zero-length token");
       }
       if (posIncAtt.getPositionIncrement() != 1) {
         throw new IllegalArgumentException(
             "term: " + text + " analyzed to a token with posinc != 1");
       }
       reuse.grow(reuse.length() + length + 1); /* current + word + separator */
       int end = reuse.length();
       if (reuse.length() > 0) {
         reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
         reuse.setLength(reuse.length() + 1);
       }
       System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
       reuse.setLength(reuse.length() + length);
     }
     ts.end();
   }
   if (reuse.length() == 0) {
     throw new IllegalArgumentException(
         "term: " + text + " was completely eliminated by analyzer");
   }
   return reuse.get();
 }
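A possible caller for the method above, sketched under the assumption that it lives in a SynonymMap.Parser-style class whose analyzer tokenizes on whitespace (the input string and variable names are illustrative; CharsRef, CharsRefBuilder, and SynonymMap imports are assumed):

  // Hypothetical usage sketch: analyzed words come back joined by
  // SynonymMap.WORD_SEPARATOR ('\u0000').
  CharsRefBuilder scratch = new CharsRefBuilder();
  CharsRef analyzed = analyze("wi fi", scratch);
  int words = 1;
  for (int i = 0; i < analyzed.length; i++) {
    if (analyzed.chars[analyzed.offset + i] == SynonymMap.WORD_SEPARATOR) {
      words++; // count separators to recover the word count
    }
  }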
Example No. 4
  @Override
  public final boolean incrementToken() throws IOException {
    if (isMailto) {
      termAtt.setEmpty();
      // return the scheme + the mail part
      isMailto = false;
      posIncrAtt.setPositionIncrement(0);
      termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position());
      return true;
    }

    if (input.incrementToken()) {
      final String type = typeAtt.type();
      if (type.equals(TupleTokenizer.getTokenTypes()[TupleTokenizer.URI])
          && this.isMailtoScheme()) {
        this.updateBuffer();
        termBuffer.put(termAtt.buffer(), 0, termAtt.length());
        // return only the mail part: skip the 7-char "mailto:" scheme prefix
        posIncrAtt.setPositionIncrement(1);
        termAtt.copyBuffer(termBuffer.array(), 7, termBuffer.position() - 7);
      }
      return true;
    }
    return false;
  }
 @Override
 public final void end() throws IOException {
   super.end();
   // set final offset
   int finalOffset = correctOffset(this.endPosition);
   offsetAtt.setOffset(finalOffset, finalOffset);
   posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
 }
Example No. 6
  private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
    tokenStream.reset();
    int position = 0;

    List<TermInfo> infos = new ArrayList<TermInfo>();
    while (tokenStream.incrementToken()) {
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        if (Resources.debug) {
          Resources.LOGGER.debug(position + ":");
        }
      }

      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String term = charTermAttribute.toString();
      TermInfo info = new TermInfo();
      info.setStart(startOffset);
      info.setEnd(endOffset);
      infos.add(info);
      if (Resources.debug) {
        Resources.LOGGER.debug(
            "["
                + term
                + "]"
                + ":("
                + startOffset
                + "-->"
                + endOffset
                + "):"
                + typeAttribute.type());
      }
    }
    tokenStream.end();
    tokenStream.close();

    Stack<TermInfo> tiStack = groupTokenInfos(infos);
    List<String> terms = new ArrayList<String>();
    while (!tiStack.isEmpty()) {
      TermInfo termInfo = tiStack.pop();
      // note: terms starting at offset 0 are excluded here
      if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
        String term = input.substring(termInfo.getStart(), termInfo.getEnd());
        terms.add(term);
      }
    }
    return terms.toArray(new String[] {});
  }
Example No. 7
 @Override
 public final void end() throws IOException {
   super.end();
   // set final offset
   int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
   offsetAtt.setOffset(finalOffset, finalOffset);
   // adjust any skipped tokens
   posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
 }
Example No. 8
 public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
   TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
   while (stream.incrementToken()) {
     System.out.println("posIncr=" + posIncr.getPositionIncrement());
   }
    stream.end();
    stream.close();
 }
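A hypothetical driver for the helper above, assuming a stock analyzer such as WhitespaceAnalyzer is on the classpath and the caller declares throws IOException:

  try (Analyzer analyzer = new WhitespaceAnalyzer()) {
    // prints "posIncr=1" once per whitespace-separated token
    displayPositionIncrements(analyzer, "the quick brown fox");
  }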
Example No. 9
  public TermSubQueryFactory termToFactory(String fieldname, Term sourceTerm, FieldBoost boost)
      throws IOException {

    CacheKey cacheKey = null;

    if (termQueryCache != null) {

      cacheKey = new CacheKey(fieldname, sourceTerm);

      TermQueryCacheValue cacheValue = termQueryCache.get(cacheKey);
      if (cacheValue != null) {
        // The cache references factories with pre-analyzed terms, or cache entries without a
        // query factory if the term does not exist in the index; cacheValue.hasQuery() returns
        // true or false accordingly.
        // Cache entries don't have a boost factor; it is only added later via the queryFactory.
        return (cacheValue.hasQuery()) ? new TermSubQueryFactory(cacheValue, boost) : null;
      }
    }

    LuceneQueryFactoryAndPRMSQuery root = null;
    TokenStream ts = null;
    try {

      ts = analyzer.tokenStream(fieldname, new CharSequenceReader(sourceTerm));
      CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAttr = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();

      PositionSequence<org.apache.lucene.index.Term> sequence = new PositionSequence<>();
      while (ts.incrementToken()) {

        int inc = posIncAttr.getPositionIncrement();
        if (inc > 0 || sequence.isEmpty()) {
          sequence.nextPosition();
        }

        sequence.addElement(new org.apache.lucene.index.Term(fieldname, new BytesRef(termAttr)));
      }

      root = positionSequenceToQueryFactoryAndPRMS(sequence);

    } finally {
      if (ts != null) {
        try {
          ts.close();
        } catch (IOException e) {
          // ignore failures on close(); there is nothing left to do with the stream
        }
      }
    }

    putQueryFactoryAndPRMSQueryIntoCache(cacheKey, root);

    return root == null ? null : new TermSubQueryFactory(root, boost);
  }
Example No. 10
  public static void displayTokensWithFullDetails(Analyzer analyzer, String text)
      throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    stream.reset();
    int position = 0;
    while (stream.incrementToken()) {

      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        System.out.println();
        System.out.print(position + ":");
      }

      BytesRef pl = payload.getPayload();

      if (pl != null) {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + ":"
                + new String(pl.bytes, pl.offset, pl.length)
                + "] ");

      } else {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + "] ");
      }
    }
    stream.end();
    stream.close();
    System.out.println();
  }
Example No. 11
  @Override
  public final boolean incrementToken() throws IOException {
    int increment = 0;
    while (input.incrementToken()) {
      if (!stopWords.contains(termAttr.termBuffer(), 0, termAttr.termLength())) {
        posIncrAttr.setPositionIncrement(posIncrAttr.getPositionIncrement() + increment);
        return true;
      }

      increment += posIncrAttr.getPositionIncrement();
    }

    return false;
  }
 /**
  * Count position increments in a token stream. Package private for testing.
  *
  * @param analyzer analyzer to create token stream
  * @param fieldName field name to pass to analyzer
  * @param fieldValue field value to pass to analyzer
  * @return number of position increments in a token stream
  * @throws IOException if tokenStream throws it
  */
 static int countPositions(Analyzer analyzer, String fieldName, String fieldValue)
     throws IOException {
   try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
     int count = 0;
     PositionIncrementAttribute position =
         tokenStream.addAttribute(PositionIncrementAttribute.class);
     tokenStream.reset();
     while (tokenStream.incrementToken()) {
       count += position.getPositionIncrement();
     }
     tokenStream.end();
     count += position.getPositionIncrement();
     return count;
   }
 }
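For illustration, a hedged usage sketch from a hypothetical test in the same package (WhitespaceAnalyzer assumed; every token contributes an increment of 1, and the increment read after end() adds any trailing holes, 0 here):

  try (Analyzer analyzer = new WhitespaceAnalyzer()) {
    int positions = countPositions(analyzer, "body", "one two three");
    System.out.println(positions); // expected: 3
  }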
Example No. 13
 @Override
 public boolean incrementToken() throws IOException {
   while (true) {
     final boolean gotOne = input.incrementToken();
     if (!gotOne) {
       return false;
     } else if (termAtt.toString().equals("a")) {
       pendingPosInc += posIncAtt.getPositionIncrement();
     } else {
       posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
       pendingPosInc = 0;
       return true;
     }
   }
 }
Example No. 14
 @Override
 public boolean incrementToken() throws IOException {
   // return the first non-stop word found
   int skippedPositions = 0;
   while (input.incrementToken()) {
     if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
       if (enablePositionIncrements) {
         posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
       }
       return true;
     }
     skippedPositions += posIncrAtt.getPositionIncrement();
   }
   // reached EOS -- return false
   return false;
 }
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;

    while (true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerInterface.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == StandardTokenizer.ACRONYM_DEP) {
          typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
          termAtt.setLength(termAtt.length() - 1); // remove extra '.'
        } else {
          typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
        }
        return true;
      } else {
        // When we skip a too-long term, we still increment the
        // position increment.
        skippedPositions++;
      }
    }
  }
Example No. 16
  @Override
  public boolean incrementToken() throws IOException {
    if (!terms.isEmpty()) {
      char[] buffer = terms.poll();
      termAttribute.setEmpty();
      termAttribute.copyBuffer(buffer, 0, buffer.length);
      posIncAttr.setPositionIncrement(1);
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    } else {
      final char[] term = termAttribute.buffer();
      final int length = termAttribute.length();

      // locate the first delimiter (index k is not used further below)
      int k = 0;
      for (; k < length; k++) {
        if (term[k] == tokenDelimiter) {
          break;
        }
      }

      LinkedList<CharBuffer> buffers = permuteTerms(term, 0, length);

      for (CharBuffer cb : buffers) {
        terms.add(cb.array());
      }

      // we return true and leave the original token unchanged
      return true;
    }
  }
Example No. 17
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;

    while (true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerImpl.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
        return true;
      } else {
        // When we skip a too-long term, we still increment the
        // position increment.
        skippedPositions++;
      }
    }
  }
  @Override
  public final boolean incrementToken() throws IOException {
    if (!tokens.isEmpty()) {
      assert current != null;
      CompoundToken token = tokens.removeFirst();
      restoreState(current); // keep all other attributes untouched
      termAtt.setEmpty().append(token.txt);
      offsetAtt.setOffset(token.startOffset, token.endOffset);
      posIncAtt.setPositionIncrement(0);
      return true;
    }

    current = null; // not really needed, but for safety
    if (input.incrementToken()) {
      // Only words longer than minWordSize get processed
      if (termAtt.length() >= this.minWordSize) {
        decompose();
        // only capture the state if we really need it for producing new tokens
        if (!tokens.isEmpty()) {
          current = captureState();
        }
      }
      // return original token:
      return true;
    } else {
      return false;
    }
  }
  /* (non-Javadoc)
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public boolean incrementToken() throws IOException {
    // clear all token attributes
    clearAttributes();
    skippedPositions = 0;

    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
      posIncrAtt.setPositionIncrement(skippedPositions + 1);

      // convert the Lexeme into token attributes
      // set the token text
      termAtt.append(nextLexeme.getLexemeText());
      // set the token length
      termAtt.setLength(nextLexeme.getLength());
      // set the token offsets
      offsetAtt.setOffset(
          correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));

      // record the last position of the segmentation
      endPosition = nextLexeme.getEndPosition();
      // record the token type
      typeAtt.setType(nextLexeme.getLexemeTypeString());
      // return true to signal that another token is available
      return true;
    }
    // return false to signal that token output is complete
    return false;
  }
Example No. 20
  private static String[] mmsegTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
    tokenStream.reset();
    int position = 0;

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        if (Resources.debug) {
          Resources.LOGGER.debug(position + ":");
        }
      }

      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String term = charTermAttribute.toString();
      tokens.add(term);
      if (Resources.debug) {
        Resources.LOGGER.debug(
            "["
                + term
                + "]"
                + ":("
                + startOffset
                + "-->"
                + endOffset
                + "):"
                + typeAttribute.type());
      }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens.toArray(new String[] {});
  }
Example No. 21
  @Override
  public boolean incrementToken() throws IOException {

    // parse() is able to return w/o producing any new
    // tokens, when the tokens it had produced were entirely
    // punctuation.  So we loop here until we get a real
    // token or we end:
    while (pending.size() == 0) {
      if (end) {
        return false;
      }

      // Push Viterbi forward some more:
      parse();
    }

    final Token token = pending.remove(pending.size() - 1);

    int position = token.getPosition();
    int length = token.getLength();
    clearAttributes();
    assert length > 0;
    // System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
    // token.getSurfaceForm().length);
    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
    offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
    basicFormAtt.setToken(token);
    posAtt.setToken(token);
    readingAtt.setToken(token);
    inflectionAtt.setToken(token);
    if (token.getPosition() == lastTokenPos) {
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(token.getPositionLength());
    } else {
      assert token.getPosition() > lastTokenPos;
      posIncAtt.setPositionIncrement(1);
      posLengthAtt.setPositionLength(1);
    }
    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ":    incToken: return token=" + token);
    }
    lastTokenPos = token.getPosition();
    return true;
  }
Example No. 22
  /* (non-Javadoc)
   * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment()
   */
  public boolean isNewFragment() {
    boolean isNewFrag = false;
    int minFragLen = (int) ((1.0f - slop) * targetFragChars);
    int endOffset = offsetAtt.endOffset();

    // ** determine isNewFrag
    if (posIncAtt.getPositionIncrement() > incrementGapThreshold) {
      // large position gaps always imply new fragments
      isNewFrag = true;

    } else if (endOffset - currentOffset < minFragLen) {
      // we're not in our range of flexibility
      isNewFrag = false;

    } else if (targetOffset > 0) {
      // we've already decided on a target
      isNewFrag = endOffset > targetOffset;

    } else {
      // we might be able to do something
      int minOffset = currentOffset + minFragLen;
      int maxOffset = (int) (currentOffset + (1.0f + slop) * targetFragChars);
      int hotIndex;

      // look for a close hotspot
      hotIndex = Arrays.binarySearch(hotspots, endOffset);
      // Arrays.binarySearch returns -(insertionPoint) - 1 when the key is absent
      if (hotIndex < 0) hotIndex = -hotIndex - 1;
      if (hotIndex >= hotspots.length) {
        // no more hotspots in this input stream
        targetOffset = currentOffset + targetFragChars;

      } else if (hotspots[hotIndex] > maxOffset) {
        // no hotspots within slop
        targetOffset = currentOffset + targetFragChars;

      } else {
        // try to find hotspot in slop
        int goal = hotspots[hotIndex];
        while (goal < minOffset && hotIndex < hotspots.length - 1) {
          hotIndex++;
          goal = hotspots[hotIndex];
        }
        targetOffset = goal <= maxOffset ? goal : currentOffset + targetFragChars;
      }

      isNewFrag = endOffset > targetOffset;
    }

    // ** operate on isNewFrag
    if (isNewFrag) {
      currentNumFrags++;
      currentOffset = endOffset;
      targetOffset = -1;
    }
    return isNewFrag;
  }
Example No. 23
 @Override
 public boolean incrementToken() throws IOException {
   if (currentPrefix != null) {
     if (!currentPrefix.hasNext()) {
       return input.incrementToken();
     } else {
       posAttr.setPositionIncrement(0);
     }
   } else {
     currentPrefix = prefixes.iterator();
     termAttr.setEmpty();
     posAttr.setPositionIncrement(1);
     assert (currentPrefix.hasNext()) : "one or more prefixes needed";
   }
   termAttr.setEmpty();
   termAttr.append(currentPrefix.next());
   termAttr.append(separator);
   return true;
 }
Example No. 24
  public static void displayTokensWithFullDetails(Analyzer analyzer, String text)
      throws IOException {

    TokenStream stream =
        analyzer.tokenStream(
            "contents", // #A
            new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // #B
    PositionIncrementAttribute posIncr = // #B
        stream.addAttribute(PositionIncrementAttribute.class); // #B
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B
    TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B

    stream.reset();
    int position = 0;
    while (stream.incrementToken()) { // #C

      int increment = posIncr.getPositionIncrement(); // #D
      if (increment > 0) { // #D
        position = position + increment; // #D
        System.out.println(); // #D
        System.out.print(position + ": "); // #D
      }

      System.out.print(
          "["
              + // #E
              term
              + ":"
              + // #E
              offset.startOffset()
              + "->"
              + // #E
              offset.endOffset()
              + ":"
              + // #E
              type.type()
              + "] "); // #E
    }
    stream.end();
    stream.close();
    System.out.println();
  }
Example No. 25
 @Override
 public final boolean incrementToken() throws IOException {
   while (true) {
     if (curTermBuffer == null) {
       if (!input.incrementToken()) {
         return false;
       } else {
         curTermBuffer = termAtt.buffer().clone();
         curTermLength = termAtt.length();
         curCodePointCount = charUtils.codePointCount(termAtt);
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
         tokEnd = offsetAtt.endOffset();
         savePosIncr += posIncrAtt.getPositionIncrement();
         savePosLen = posLenAtt.getPositionLength();
       }
     }
      if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
        // if the remaining input is too short, we can't generate any n-grams
        if (curGramSize <= curCodePointCount) {
          // grab gramSize chars from front or back
         clearAttributes();
         offsetAtt.setOffset(tokStart, tokEnd);
         // first ngram gets increment, others don't
         if (curGramSize == minGram) {
           posIncrAtt.setPositionIncrement(savePosIncr);
           savePosIncr = 0;
         } else {
           posIncrAtt.setPositionIncrement(0);
         }
         posLenAtt.setPositionLength(savePosLen);
         final int charLength =
             charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
         termAtt.copyBuffer(curTermBuffer, 0, charLength);
         curGramSize++;
         return true;
       }
     }
     curTermBuffer = null;
   }
 }
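This matches the shape of Lucene's EdgeNGramTokenFilter (front grams with position lengths). A hypothetical sketch of the expected behavior, assuming the four-argument constructor available in recent Lucene versions:

  // Hypothetical usage sketch: 2- to 4-char front grams of each token.
  Tokenizer src = new WhitespaceTokenizer();
  src.setReader(new StringReader("hello"));
  try (TokenStream grams = new EdgeNGramTokenFilter(src, 2, 4, false)) {
    CharTermAttribute term = grams.addAttribute(CharTermAttribute.class);
    grams.reset();
    while (grams.incrementToken()) {
      System.out.println(term); // expected: "he", "hel", "hell"
    }
    grams.end();
  }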
Example No. 26
 /* (non-Javadoc)
  * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment()
  */
 @Override
 public boolean isNewFragment() {
   int endOffset = offsetAtt.endOffset();
   boolean isNewFrag =
       endOffset >= fragOffset + getFragmentSize()
           || posIncAtt.getPositionIncrement() > INCREMENT_THRESHOLD;
   if (isNewFrag) {
     fragOffset = endOffset;
   }
   return isNewFrag;
 }
Example No. 27
  public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    int position = 0;
    while (stream.incrementToken()) {

      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        System.out.println();
        System.out.print(position + ":");
      }

      System.out.print("[" + term.toString() + "] ");
    }
    stream.end();
    stream.close();
    System.out.println();
  }
Example No. 28
 private void buffer() {
   if (bufferedLen == buffered.length) {
     int newSize = ArrayUtil.oversize(bufferedLen + 1, 8);
     buffered = Arrays.copyOf(buffered, newSize);
     startOff = Arrays.copyOf(startOff, newSize);
     posInc = Arrays.copyOf(posInc, newSize);
   }
   startOff[bufferedLen] = offsetAttribute.startOffset();
   posInc[bufferedLen] = posIncAttribute.getPositionIncrement();
   buffered[bufferedLen] = captureState();
   bufferedLen++;
 }
 @Override
 public boolean incrementToken() {
   clearAttributes();
   if (upto == 4) {
     return false;
   }
    if (upto == 0) {
      posIncr.setPositionIncrement(1);
      term.setEmpty().append("a");
    } else if (upto == 1) {
      posIncr.setPositionIncrement(1);
      term.setEmpty().append("b");
    } else if (upto == 2) {
      // increment 0 stacks "c" on the same position as "b"
      posIncr.setPositionIncrement(0);
      term.setEmpty().append("c");
    } else {
      // increment 0 stacks "d" there as well
      posIncr.setPositionIncrement(0);
      term.setEmpty().append("d");
    }
   upto++;
   return true;
 }
  private void emit(char[] token) {
    Log.debug("emit: " + new String(token));
    if (replaceWhitespaceWith != null) {
      token = replaceWhiteSpace(token);
    }
    CharTermAttribute termAttr = getTermAttribute();
    termAttr.setEmpty();
    termAttr.append(new String(token));

    OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null && offAttr.endOffset() >= token.length) {
      int start = offAttr.endOffset() - token.length;
      offAttr.setOffset(start, offAttr.endOffset());
    }

    PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
      pia.setPositionIncrement(++positionIncr);
    }

    lastEmitted = token;
  }