Example #1
  @Test
  public void testCase2() throws Exception {
    StringReader reader = new StringReader("고속도로");

    nouns.add(getToken("고속도로", 0, 4));
    nouns.add(getToken("고속도", 0, 3));
    nouns.add(getToken("고속", 0, 2));
    nouns.add(getToken("속도", 1, 3));
    nouns.add(getToken("고", 0, 1));

    Analyzer analyzer = new KoreanAnalyzer();
    TokenStream stream = analyzer.reusableTokenStream("dummy", reader);

    CharTermAttribute charTermAtt = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offSetAtt = stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {
      TestToken t =
          getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
      System.out.println("termAtt.term() : " + charTermAtt.toString());
      System.out.println("offSetAtt : " + offSetAtt.startOffset());
      System.out.println("offSetAtt : " + offSetAtt.endOffset());

      Assert.assertTrue(nouns.contains(t));
    }
  }
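
The test above assumes a nouns collection and getToken/TestToken helpers defined elsewhere in the test class. A minimal sketch of what they might look like (names and fields are assumptions inferred from the usage, not from the original source):

  private final Set<TestToken> nouns = new HashSet<TestToken>();

  private TestToken getToken(String term, int start, int end) {
    TestToken t = new TestToken();
    t.term = term;
    t.start = start;
    t.end = end;
    return t;
  }

  private static class TestToken {
    String term;
    int start, end;

    @Override
    public boolean equals(Object other) {
      if (!(other instanceof TestToken)) return false;
      TestToken t = (TestToken) other;
      return t.start == start && t.end == end && t.term.equals(term);
    }

    @Override
    public int hashCode() {
      return term.hashCode() + 31 * start + 131 * end;
    }
  }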
  public static void displayTokensWithFullDetails(Analyzer analyzer, String text)
      throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    int position = 0;
    stream.reset(); // mandatory before the first incrementToken() in Lucene 4+
    while (stream.incrementToken()) {

      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        System.out.println();
        System.out.print(position + ":");
      }

      BytesRef pl = payload.getPayload();

      if (pl != null) {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + ":"
                + new String(pl.bytes, pl.offset, pl.length)
                + "] ");

      } else {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + "] ");
      }
    }
    stream.end();
    stream.close();
    System.out.println();
  }
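
A minimal way to drive the helper above (a sketch; the analyzer choice is illustrative, and WhitespaceAnalyzer's no-arg constructor assumes Lucene 4.4+):

  public static void main(String[] args) throws IOException {
    // WhitespaceAnalyzer keeps the output easy to read: one token per word.
    displayTokensWithFullDetails(new WhitespaceAnalyzer(), "the quick brown fox");
  }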
Example #3
  @Override
  public boolean incrementToken() throws IOException {
    if (tokenIter == null || !tokenIter.hasNext()) {
      // there are no remaining tokens from the current sentence... are there more sentences?
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        // if length by start + end offsets doesn't match the term text then assume
        // this is a synonym and don't adjust the offsets.
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        // a new sentence is available: process it.
        tokenBuffer = splitIntoTokens(termAtt.toString(), offsetAtt.startOffset());
        tokenIter = tokenBuffer.iterator();

        // it should not be possible to have a sentence with 0 words, check just in case.
        // returning EOS isn't the best either, but it's the behavior of the original code.
        if (!tokenIter.hasNext()) return false;
      } else {
        return false; // no more sentences, end of stream!
      }
    }
    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = tokenIter.next();
    termAtt.append(nextWord.term);
    // termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.start, nextWord.end);
    }
    typeAtt.setType("word");
    return true;
  }
  // reconstruct the unused tokens from the phrase (since it didn't match)
  // need to recompute the token positions based on the length of the currentPhrase,
  // the current ending position and the length of each token.
  private void discardCharTokens(StringBuffer phrase, ArrayList<Token> tokenList) {
    Log.debug("discardCharTokens: '" + phrase.toString() + "'");
    OffsetAttribute offAttr = getOffsetAttribute();
    int endPos = offAttr.endOffset();
    int startPos = endPos - phrase.length();

    int lastSp = 0;
    for (int i = 0; i < phrase.length(); i++) {
      char chAt = phrase.charAt(i);
      if (isSpaceChar(chAt) && i > lastSp) {
        char[] tok = new char[i - lastSp];
        phrase.getChars(lastSp, i, tok, 0);
        if (lastEmitted == null || !endsWith(lastEmitted, tok)) {
          Token token = new Token();
          token.tok = tok;

          token.startPos = startPos + lastSp;
          token.endPos = token.startPos + tok.length;
          Log.debug("discard " + new String(tok) + ": " + token.startPos + ", " + token.endPos);
          tokenList.add(token);
        }
        lastSp = i + 1;
      }
    }
    char[] tok = new char[phrase.length() - lastSp];
    phrase.getChars(lastSp, phrase.length(), tok, 0);

    Token token = new Token();
    token.tok = tok;
    token.endPos = endPos;
    token.startPos = endPos - tok.length;
    tokenList.add(token);
  }
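
discardCharTokens fills a simple token holder; a plausible shape for it, inferred from the fields used above (this class is an assumption, not from the original source):

  private static class Token {
    char[] tok;   // the token's characters
    int startPos; // start offset in the original input
    int endPos;   // end offset in the original input
  }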
  public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    try {
      Tokenizer tokenizer =
          new JapaneseTokenizer(
              new StringReader(str),
              null,
              true,
              org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
      TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
      // stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
      stream = new CJKWidthFilter(stream);
      // stream = new StopFilter(matchVersion, stream, stopwords);
      stream = new JapaneseKatakanaStemFilter(stream);
      // stream = new LowerCaseFilter(matchVersion, stream);

      OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
      CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

      stream.reset(); // required before the first incrementToken()
      while (stream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String token = charTermAttribute.toString();
        ret.add(startOffset, endOffset);
        // System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
      }
      stream.end();
      stream.close();
    } catch (java.io.IOException e) {
      System.err.println(e);
    }
    return ret;
  }
  private String tokenizerToString(Tokenizer tokenizer) throws Exception {
    OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        tokenizer.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
    PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

    StringBuilder result = new StringBuilder();
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      result.append(new String(term.buffer(), 0, term.length())).append(":");
      result.append(type.type()).append(":");
      result.append(pos.partOfSpeech()).append(":");
      result.append(semanticClass.semanticClass()).append(":");
      result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":");
      result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":");
      result.append(String.valueOf(extOffset.startOffset())).append(":");
      result.append(String.valueOf(extOffset.endOffset()));
      result.append(",");
    }
    tokenizer.end();
    return result.toString();
  }
  @Override
  public boolean incrementToken() throws IOException {

    if (tokenIter == null || !tokenIter.hasNext()) {
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        tokenBuffer = wordSegmenter.getTokendWords(termAtt.toString());
        tokenIter = tokenBuffer.iterator();
        if (!tokenIter.hasNext()) return false;

      } else {
        return false;
      }
    }

    clearAttributes();

    TokendWords nextWord = tokenIter.next();

    char[] word = nextWord.next(); // call next() once; calling it twice could advance past data
    termAtt.copyBuffer(word, 0, word.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.start, nextWord.end);
    }
    typeAtt.setType("word");
    return true;
  }
Example #8
  private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
    tokenStream.reset();
    int position = 0;

    List<TermInfo> infos = new ArrayList<TermInfo>();
    while (tokenStream.incrementToken()) {
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        if (Resources.debug) {
          Resources.LOGGER.debug(position + ":");
        }
      }

      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String term = charTermAttribute.toString();
      TermInfo info = new TermInfo();
      info.setStart(startOffset);
      info.setEnd(endOffset);
      infos.add(info);
      if (Resources.debug) {
        Resources.LOGGER.debug(
            "["
                + term
                + "]"
                + ":("
                + startOffset
                + "-->"
                + endOffset
                + "):"
                + typeAttribute.type());
      }
    }
    tokenStream.end();
    tokenStream.close();

    Stack<TermInfo> tiStack = groupTokenInfos(infos);
    List<String> terms = new ArrayList<String>();
    while (!tiStack.isEmpty()) {
      TermInfo termInfo = tiStack.pop();
      if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
        String term = input.substring(termInfo.getStart(), termInfo.getEnd());
        terms.add(term);
      }
    }
    return terms.toArray(new String[] {});
  }
 @Override
 public boolean incrementToken() throws IOException {
   if (offsetCount < offsetLimit && input.incrementToken()) {
     int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset();
     offsetCount += offsetLength;
     return true;
   }
   return false;
 }
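
This incrementToken() only makes sense inside a TokenFilter that tracks a running character budget; a minimal sketch of such a wrapper (class name, constructor, and fields are assumptions inferred from the usage above):

  public final class LimitTokenOffsetFilter extends TokenFilter {
    private final OffsetAttribute offsetAttrib = addAttribute(OffsetAttribute.class);
    private final int offsetLimit; // stop emitting once this many characters are consumed
    private int offsetCount;

    public LimitTokenOffsetFilter(TokenStream input, int offsetLimit) {
      super(input);
      this.offsetLimit = offsetLimit;
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (offsetCount < offsetLimit && input.incrementToken()) {
        offsetCount += offsetAttrib.endOffset() - offsetAttrib.startOffset();
        return true;
      }
      return false;
    }
  }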
  /* (non-Javadoc)
   * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
   */
  public boolean isNewFragment() {
    boolean isNewFrag = false;
    int minFragLen = (int) ((1.0f - slop) * targetFragChars);
    int endOffset = offsetAtt.endOffset();

    // ** determine isNewFrag
    if (posIncAtt.getPositionIncrement() > incrementGapThreshold) {
      // large position gaps always imply new fragments
      isNewFrag = true;

    } else if (endOffset - currentOffset < minFragLen) {
      // we're not in our range of flexibility
      isNewFrag = false;

    } else if (targetOffset > 0) {
      // we've already decided on a target
      isNewFrag = endOffset > targetOffset;

    } else {
      // we might be able to do something
      int minOffset = currentOffset + minFragLen;
      int maxOffset = (int) (currentOffset + (1.0f + slop) * targetFragChars);
      int hotIndex;

      // look for a close hotspot
      hotIndex = Arrays.binarySearch(hotspots, endOffset);
      if (hotIndex < 0) hotIndex = -hotIndex - 1; // binarySearch returns -(insertionPoint) - 1
      if (hotIndex >= hotspots.length) {
        // no more hotspots in this input stream
        targetOffset = currentOffset + targetFragChars;

      } else if (hotspots[hotIndex] > maxOffset) {
        // no hotspots within slop
        targetOffset = currentOffset + targetFragChars;

      } else {
        // try to find hotspot in slop
        int goal = hotspots[hotIndex];
        while (goal < minOffset && hotIndex < hotspots.length) {
          hotIndex++;
          goal = hotspots[hotIndex];
        }
        targetOffset = goal <= maxOffset ? goal : currentOffset + targetFragChars;
      }

      isNewFrag = endOffset > targetOffset;
    }

    // ** operate on isNewFrag
    if (isNewFrag) {
      currentNumFrags++;
      currentOffset = endOffset;
      targetOffset = -1;
    }
    return isNewFrag;
  }
Example #11
 /* (non-Javadoc)
  * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
  */
 @Override
 public boolean isNewFragment() {
   int endOffset = offsetAtt.endOffset();
   boolean isNewFrag =
       endOffset >= fragOffset + getFragmentSize()
           || posIncAtt.getPositionIncrement() > INCREMENT_THRESHOLD;
   if (isNewFrag) {
     fragOffset = endOffset;
   }
   return isNewFrag;
 }
  private void emit(char[] token) {
    Log.debug("emit: " + new String(token));
    if (replaceWhitespaceWith != null) {
      token = replaceWhiteSpace(token);
    }
    CharTermAttribute termAttr = getTermAttribute();
    termAttr.setEmpty();
    termAttr.append(new String(token));

    OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null && offAttr.endOffset() >= token.length) {
      int start = offAttr.endOffset() - token.length;
      offAttr.setOffset(start, offAttr.endOffset());
    }

    PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
      pia.setPositionIncrement(++positionIncr);
    }

    lastEmitted = token;
  }
  /**
   * Override this method to customize the Object representing a single highlighted suggestion; the
   * result is set on each {@link LookupResult#highlightKey} member.
   */
  protected Object highlight(String text, Set<String> matchedTokens, String prefixToken)
      throws IOException {
    try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      StringBuilder sb = new StringBuilder();
      int upto = 0;
      while (ts.incrementToken()) {
        String token = termAtt.toString();
        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();
        if (upto < startOffset) {
          addNonMatch(sb, text.substring(upto, startOffset));
          upto = startOffset;
        } else if (upto > startOffset) {
          continue;
        }

        if (matchedTokens.contains(token)) {
          // Token matches.
          addWholeMatch(sb, text.substring(startOffset, endOffset), token);
          upto = endOffset;
        } else if (prefixToken != null && token.startsWith(prefixToken)) {
          addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
          upto = endOffset;
        }
      }
      ts.end();
      int endOffset = offsetAtt.endOffset();
      if (upto < endOffset) {
        addNonMatch(sb, text.substring(upto));
      }
      return sb.toString();
    }
  }
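
For context, the add* hooks called above are simple string-building callbacks; a sketch consistent with that usage (the <b> markup mirrors Lucene's AnalyzingInfixSuggester defaults, but treat these bodies as assumptions):

  protected void addNonMatch(StringBuilder sb, String text) {
    sb.append(text); // pass non-matching text through unchanged
  }

  protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
    sb.append("<b>").append(surface).append("</b>"); // wrap a fully matched token
  }

  protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
    // Highlight only the prefix portion of the surface form.
    if (prefixToken.length() >= surface.length()) {
      addWholeMatch(sb, surface, analyzed);
      return;
    }
    sb.append("<b>").append(surface, 0, prefixToken.length()).append("</b>");
    sb.append(surface.substring(prefixToken.length()));
  }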
  public void printAnalyzerWords(Analyzer analyzer, String field) {

    // Obtain the Lucene TokenStream
    TokenStream ts = null;
    try {
      ts = analyzer.tokenStream(field, this.content);
      // Get the token offset attribute
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      // Get the token term attribute
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      // Get the token type attribute
      TypeAttribute type = ts.addAttribute(TypeAttribute.class);

      // Reset the TokenStream (resets the underlying StringReader)
      ts.reset();
      // Iterate over the analysis results
      while (ts.incrementToken()) {
        System.out.println("documents[" + this.id + "]");
        System.out.println(
            offset.startOffset()
                + " - "
                + offset.endOffset()
                + " : "
                + term.toString()
                + " | "
                + type.type());
      }
      // End the TokenStream (closes the underlying StringReader)
      ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
      // CorruptIndexException and LockObtainFailedException extend IOException
      // and are never thrown during analysis, so one catch suffices.
      e.printStackTrace();
    } finally {
      // Release all resources held by the TokenStream
      if (ts != null) {
        try {
          ts.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }
  /** Writes the joined unhyphenated term */
  private void unhyphenate() {
    int endOffset = offsetAttribute.endOffset();

    restoreState(savedState);
    savedState = null;

    char term[] = termAttribute.buffer();
    int length = hyphenated.length();
    if (length > termAttribute.length()) {
      term = termAttribute.resizeBuffer(length);
    }

    hyphenated.getChars(0, length, term, 0);
    termAttribute.setLength(length);
    offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
    hyphenated.setLength(0);
  }
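
In Lucene's HyphenatedWordsFilter, which this method resembles, the counterpart save step simply buffers the term (minus its trailing hyphen) and captures the attribute state so unhyphenate() can restore and extend it later. A rough sketch of that counterpart (signature and fields are assumptions):

  private void saveTermState(char[] term, int length) {
    hyphenated.append(term, 0, length - 1); // drop the trailing hyphen
    if (savedState == null) {
      savedState = captureState(); // unhyphenate() restores this later
    }
  }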
Example #16
  private static String[] mmsegTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
    tokenStream.reset();
    int position = 0;

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        if (Resources.debug) {
          Resources.LOGGER.debug(position + ":");
        }
      }

      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String term = charTermAttribute.toString();
      tokens.add(term);
      if (Resources.debug) {
        Resources.LOGGER.debug(
            "["
                + term
                + "]"
                + ":("
                + startOffset
                + "-->"
                + endOffset
                + "):"
                + typeAttribute.type());
      }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens.toArray(new String[] {});
  }
  public static void main(String[] args) {
    // Build the IK analyzer, using smart segmentation mode
    Analyzer analyzer = new IKSynonymAnalyzer(Version.LUCENE_4_9, true);

    // Obtain the Lucene TokenStream
    TokenStream ts = null;
    try {
      ts = analyzer.tokenStream("myfield", new StringReader("物理老师数学数学老师"));
      // Get the token offset attribute
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      // Get the token term attribute
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      // Get the token type attribute
      TypeAttribute type = ts.addAttribute(TypeAttribute.class);

      // Reset the TokenStream (resets the underlying StringReader)
      ts.reset();
      // Iterate over the analysis results
      while (ts.incrementToken()) {
        System.out.println(
            offset.startOffset()
                + " - "
                + offset.endOffset()
                + " : "
                + term.toString()
                + " | "
                + type.type());
      }
      // End the TokenStream (closes the underlying StringReader)
      ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Release all resources held by the TokenStream
      if (ts != null) {
        try {
          ts.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }
  /** Saves the existing attribute states */
  private void saveState() {
    // otherwise, we have delimiters, save state
    savedStartOffset = offsetAttribute.startOffset();
    savedEndOffset = offsetAttribute.endOffset();
    // if length by start + end offsets doesn't match the term text then assume this is a synonym
    // and don't adjust the offsets.
    hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
    savedType = typeAttribute.type();

    if (savedBuffer.length < termAttribute.length()) {
      savedBuffer =
          new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)];
    }

    System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
    iterator.text = savedBuffer;

    hasSavedState = true;
  }
  public static void displayTokensWithFullDetails(Analyzer analyzer, String text)
      throws IOException {

    TokenStream stream =
        analyzer.tokenStream(
            "contents", // #A
            new StringReader(text));
    stream.reset();

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // #B
    PositionIncrementAttribute posIncr = // #B
        stream.addAttribute(PositionIncrementAttribute.class); // #B
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); // #B
    TypeAttribute type = stream.addAttribute(TypeAttribute.class); // #B

    int position = 0;
    while (stream.incrementToken()) { // #C

      int increment = posIncr.getPositionIncrement(); // #D
      if (increment > 0) { // #D
        position = position + increment; // #D
        System.out.println(); // #D
        System.out.print(position + ": "); // #D
      }

      System.out.print(
          "["
              + // #E
              term
              + ":"
              + // #E
              offset.startOffset()
              + "->"
              + // #E
              offset.endOffset()
              + ":"
              + // #E
              type.type()
              + "] "); // #E
    }
    stream.end();
    stream.close();
    System.out.println();
  }
 @Override
 public final boolean incrementToken() throws IOException {
   while (true) {
     if (curTermBuffer == null) {
       if (!input.incrementToken()) {
         return false;
       } else {
         curTermBuffer = termAtt.buffer().clone();
         curTermLength = termAtt.length();
         curCodePointCount = charUtils.codePointCount(termAtt);
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
         tokEnd = offsetAtt.endOffset();
         savePosIncr += posIncrAtt.getPositionIncrement();
         savePosLen = posLenAtt.getPositionLength();
       }
     }
      if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
        // if the remaining input is too short, we can't generate any n-grams
        if (curGramSize <= curCodePointCount) {
         // grab gramSize chars from front or back
         clearAttributes();
         offsetAtt.setOffset(tokStart, tokEnd);
         // first ngram gets increment, others don't
         if (curGramSize == minGram) {
           posIncrAtt.setPositionIncrement(savePosIncr);
           savePosIncr = 0;
         } else {
           posIncrAtt.setPositionIncrement(0);
         }
         posLenAtt.setPositionLength(savePosLen);
         final int charLength =
             charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
         termAtt.copyBuffer(curTermBuffer, 0, charLength);
         curGramSize++;
         return true;
       }
     }
     curTermBuffer = null;
   }
 }
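
A quick way to see what an edge n-gram filter like the one above emits (a sketch assuming Lucene 4.x-style constructors; EdgeNGramTokenFilter is named here only as a stand-in for the filter shown):

  // For input "abc" with minGram=1 and maxGram=2, this prints: a, ab
  TokenStream ts = new EdgeNGramTokenFilter(
      Version.LUCENE_4_9, new KeywordTokenizer(new StringReader("abc")), 1, 2);
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term.toString());
  }
  ts.end();
  ts.close();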
 public boolean incrementToken() throws IOException {
   if (inPhrase) {
     inPhrase = false;
     clearAttributes();
     termAtt.setEmpty().append("phrase2");
     offsetAtt.setOffset(savedStart, savedEnd);
     return true;
   } else
     while (input.incrementToken()) {
       if (termAtt.toString().equals("phrase")) {
         inPhrase = true;
         savedStart = offsetAtt.startOffset();
         savedEnd = offsetAtt.endOffset();
         termAtt.setEmpty().append("phrase1");
         offsetAtt.setOffset(savedStart, savedEnd);
         return true;
       } else if (!termAtt.toString().equals("stop")) return true;
     }
   return false;
 }
Example #22
  public static List<String> tokenizeString(String textFile) throws IOException {

    EnglishAnalyzer ena = new EnglishAnalyzer(Version.LUCENE_4_10_4);

    TokenStream tokenStream = ena.tokenStream(textFile.trim(), new StringReader(textFile.trim()));

    //        StringBuilder sb = new StringBuilder();
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    List<String> tokens = new ArrayList<String>(); // collects the emitted terms
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String term = charTermAttribute.toString();
      tokens.add(term);
      //            sb.append(term + " ");
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
  }
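
Calling the helper above is straightforward (a sketch; note that EnglishAnalyzer stems terms and drops stopwords, so the output differs from the raw words):

  public static void main(String[] args) throws IOException {
    for (String token : tokenizeString("The quick brown foxes jumped")) {
      System.out.println(token); // e.g. "quick", "brown", "fox", "jump"
    }
  }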
  @Override
  void newTerm(final int termID) {
    assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
    TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;

    postings.freqs[termID] = 1;

    if (doVectorOffsets) {
      int startOffset = fieldState.offset + offsetAttribute.startOffset();
      int endOffset = fieldState.offset + offsetAttribute.endOffset();

      termsHashPerField.writeVInt(1, startOffset);
      termsHashPerField.writeVInt(1, endOffset - startOffset);
      postings.lastOffsets[termID] = endOffset;
    }

    if (doVectorPositions) {
      termsHashPerField.writeVInt(0, fieldState.position);
      postings.lastPositions[termID] = fieldState.position;
    }
  }
  public static void main(String[] args) throws IOException {
    EdgeNGramAnalyzerWrapper analyzerWrapper =
        new EdgeNGramAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY);

    StringReader reader = new StringReader("hello world");
    TokenStream ts = analyzerWrapper.tokenStream("gramtext", reader);

    CharTermAttribute charAtt = ts.addAttribute(CharTermAttribute.class);

    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(
          charAtt.toString()
              + " , "
              + "start : "
              + offsetAtt.startOffset()
              + " , "
              + "end : "
              + offsetAtt.endOffset());
    }
    ts.end();
    ts.close();
  }
 public void testSupplementaryCharacters() throws IOException {
   final String s = _TestUtil.randomUnicodeString(random(), 10);
   final int codePointCount = s.codePointCount(0, s.length());
   final int minGram = _TestUtil.nextInt(random(), 1, 3);
   final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
   TokenStream tk = new KeywordTokenizer(new StringReader(s));
   tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
   final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
   final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
   tk.reset();
   for (int start = 0; start < codePointCount; ++start) {
     for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
       assertTrue(tk.incrementToken());
       assertEquals(0, offsetAtt.startOffset());
       assertEquals(s.length(), offsetAtt.endOffset());
       final int startIndex = Character.offsetByCodePoints(s, 0, start);
       final int endIndex = Character.offsetByCodePoints(s, 0, end);
       assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
     }
   }
   assertFalse(tk.incrementToken());
 }
    @Override
    public boolean incrementToken() throws IOException {
      while (true) {
        if (bufferedToken == null) {
          if (!bufferedTokenStream.incrementToken()) return false;
          bufferedToken = bufferedTokenStream.captureState();
          bufferedStartOffset = bufferedOffsetAtt.startOffset();
          bufferedEndOffset = bufferedOffsetAtt.endOffset();
        }

        if (startOffset <= bufferedStartOffset && bufferedEndOffset <= endOffset) {
          restoreState(bufferedToken);
          bufferedToken = null;
          offsetAtt.setOffset(
              offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset);
          return true;
        } else if (bufferedEndOffset > endOffset) {
          startOffset += length + 1;
          return false;
        }
        bufferedToken = null;
      }
    }
Example #27
  /**
   * @param input the token stream to consume
   * @param reusableToken if null, a new Token is created automatically
   * @return null if there is no next token, or if input is null
   * @throws IOException
   */
  public static Token nextToken(TokenStream input, Token reusableToken) throws IOException {
    if (input == null) {
      return null;
    }
    if (!input.incrementToken()) {
      return null;
    }

    CharTermAttribute termAtt = input.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = input.getAttribute(TypeAttribute.class);

    if (reusableToken == null) {
      reusableToken = new Token();
    }

    reusableToken.clear();
    if (termAtt != null) {
      // lucene 3.0
      // reusableToken.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
      // lucene 3.1
      reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    }
    if (offsetAtt != null) {
      // lucene 3.1
      // reusableToken.setStartOffset(offsetAtt.startOffset());
      // reusableToken.setEndOffset(offsetAtt.endOffset());
      // lucene 4.0
      reusableToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    }

    if (typeAtt != null) {
      reusableToken.setType(typeAtt.type());
    }

    return reusableToken;
  }
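
A typical consumption loop for nextToken (a sketch; the analyzer, field name, and input are arbitrary, and passing the returned Token back in reuses one instance across calls):

  TokenStream input = analyzer.tokenStream("f", new StringReader("some example text"));
  input.reset();
  Token tok = null;
  while ((tok = nextToken(input, tok)) != null) {
    System.out.println(tok + " [" + tok.startOffset() + "-" + tok.endOffset() + "]");
  }
  input.end();
  input.close();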
Example #28
  /**
   * Display the details of the tokens produced by the analyzer
   *
   * @param analyzer
   * @param text
   * @throws IOException
   */
  public static void displayTokens(Analyzer analyzer, String text) throws IOException {
    // Token stream
    TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(text));

    // Get the token's term attribute
    TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
    // Position increment; plays a role in phrase queries and synonym queries
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    // Offsets; useful for highlighting query matches
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    // Token type; usually "word", but can also be "email" etc.
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

    int position = 0;
    while (tokenStream.incrementToken()) {
      // Compute position information
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
      }
      // Print the full details of every token
      System.out.println(
          "position : "
              + position
              + " ["
              + termAttribute.term()
              + ":"
              + offsetAttribute.startOffset()
              + "->"
              + offsetAttribute.endOffset()
              + ":"
              + typeAttribute.type()
              + "]");
    }
  }
  public void tokenise() throws IOException {
    String ignregexp =
        "--+|\\.\\.+|\\.+\\p{Space}"; // delete full stops and dashes (typically not used).
    if (ignoredElements != null && ignoredElements.length() > 0)
      ignregexp =
          ignregexp
              + "|< *"
              + ignoredElements
              + "[^>]*?/>"
              + "|< *"
              + ignoredElements
              + ".*?>.*?</"
              + ignoredElements
              + " *>";
    if (!tagIndexing) ignregexp = ignregexp + "|<.*?>";
    // ignregexp = ignregexp+"|\\W\\W+";

    Pattern p = Pattern.compile(ignregexp);
    Matcher igns = p.matcher(originalText);

    StringBuffer tx = new StringBuffer(originalText);
    int ct = 1;
    while (igns.find()) {
      int s = igns.start();
      int e = igns.end();
      if (verbose) PrintUtil.printNoMove("Processing exclusions ...", ct++);
      // System.err.println("replacing\n-----------"+originalText.substring(s,e)+"\n--------------");
      char sp[] = new char[e - s];
      for (int j = 0; j < sp.length; j++) {
        sp[j] = ' ';
      }
      tx.replace(s, e, new String(sp));
    }
    if (verbose) PrintUtil.donePrinting();
    ct = 1;
    // verbose = false;
    String text = new String(tx);
    // System.out.println("-->"+text+"<--");
    Tokenizer tokenizer =
        new JapaneseTokenizer(
            new StringReader(text),
            null,
            true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    // stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    // stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    // stream = new LowerCaseFilter(matchVersion, stream);

    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

    stream.reset(); // required before the first incrementToken()
    while (stream.incrementToken()) {
      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String token = charTermAttribute.toString();
      tokenMap.putPos(token, startOffset);
      // System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
    }
    stream.end();
    stream.close();
    if (verbose) PrintUtil.donePrinting();
    ct = 1;
  }
Example #30
  /**
   * Iterates over the given token stream and adds the resulting terms to the index; Equivalent to
   * adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link
   * org.apache.lucene.document.Field}. Finally closes the token stream. Note that untokenized
   * keywords can be added with this method via {@link #keywordTokenStream(Collection)}, the Lucene
   * <code>KeywordTokenizer</code> or similar utilities.
   *
   * @param fieldName a name to be associated with the text
   * @param tokenStream the token stream to retrieve tokens from. It's guaranteed to be closed no
   *     matter what.
   * @param boost the boost factor for hits for this field
   * @param positionIncrementGap the position increment gap if fields with the same name are added
   *     more than once
   * @param offsetGap the offset gap if fields with the same name are added more than once
   * @see org.apache.lucene.document.Field#setBoost(float)
   */
  public void addField(
      String fieldName,
      TokenStream tokenStream,
      float boost,
      int positionIncrementGap,
      int offsetGap) {
    try (TokenStream stream = tokenStream) {
      if (frozen)
        throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
      if (fieldName == null) throw new IllegalArgumentException("fieldName must not be null");
      if (stream == null) throw new IllegalArgumentException("token stream must not be null");
      if (boost <= 0.0f)
        throw new IllegalArgumentException("boost factor must be greater than 0.0");
      int numTokens = 0;
      int numOverlapTokens = 0;
      int pos = -1;
      final BytesRefHash terms;
      final SliceByteStartArray sliceArray;
      Info info;
      long sumTotalTermFreq = 0;
      int offset = 0;
      FieldInfo fieldInfo;
      if ((info = fields.get(fieldName)) != null) {
        fieldInfo = info.fieldInfo;
        numTokens = info.numTokens;
        numOverlapTokens = info.numOverlapTokens;
        pos = info.lastPosition + positionIncrementGap;
        offset = info.lastOffset + offsetGap;
        terms = info.terms;
        boost *= info.boost;
        sliceArray = info.sliceArray;
        sumTotalTermFreq = info.sumTotalTermFreq;
      } else {
        fieldInfo =
            new FieldInfo(
                fieldName,
                fields.size(),
                true,
                false,
                this.storePayloads,
                this.storeOffsets
                    ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                    : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                DocValuesType.NONE,
                -1,
                Collections.<String, String>emptyMap());
        sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
        terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
      }

      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute =
          stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      PayloadAttribute payloadAtt =
          storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();

      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        //        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0) numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        postingsWriter.writeInt(pos);
        if (storeOffsets) {
          postingsWriter.writeInt(offsetAtt.startOffset() + offset);
          postingsWriter.writeInt(offsetAtt.endOffset() + offset);
        }
        if (storePayloads) {
          final BytesRef payload = payloadAtt.getPayload();
          final int pIndex;
          if (payload == null || payload.length == 0) {
            pIndex = -1;
          } else {
            pIndex = payloadsBytesRefs.append(payload);
          }
          postingsWriter.writeInt(pIndex);
        }
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();

      // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
      if (numTokens > 0) {
        fields.put(
            fieldName,
            new Info(
                fieldInfo,
                terms,
                sliceArray,
                numTokens,
                numOverlapTokens,
                boost,
                pos,
                offsetAtt.endOffset() + offset,
                sumTotalTermFreq));
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
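
A sketch of driving addField from user code, assuming the Lucene 5.x MemoryIndex API this snippet comes from (field name, analyzer, and text are arbitrary):

  MemoryIndex index = new MemoryIndex(true /* store offsets */, true /* store payloads */);
  Analyzer analyzer = new StandardAnalyzer();
  index.addField("body", analyzer.tokenStream("body", "some interesting text"), 1.0f, 0, 1);
  // MemoryIndex holds a single document and can score queries against it directly.
  float score = index.search(new TermQuery(new Term("body", "interesting")));
  System.out.println(score > 0.0f ? "match" : "no match");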