private String tokenizerToString(Tokenizer tokenizer) throws Exception {
    OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        tokenizer.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
    PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

    StringBuilder result = new StringBuilder();
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      result.append(new String(term.buffer(), 0, term.length())).append(":");
      result.append(type.type()).append(":");
      result.append(pos.partOfSpeech()).append(":");
      result.append(semanticClass.semanticClass()).append(":");
      result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":");
      result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":");
      result.append(String.valueOf(extOffset.startOffset())).append(":");
      result.append(String.valueOf(extOffset.endOffset()));
      result.append(",");
    }
    tokenizer.end();
    return result.toString();
  }
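For reference, a caller typically drives this helper as in the short sketch below; createTokenizer() is a hypothetical factory standing in for whichever Tokenizer the test exercises (it must expose the part-of-speech and semantic-class attributes requested above).

  Tokenizer tokenizer = createTokenizer();   // hypothetical factory, not part of the example
  tokenizer.setReader(new java.io.StringReader("hello world"));
  // Produces one "term:type:pos:semClass:posInc:posLen:startOffset:endOffset," entry per token.
  String dump = tokenizerToString(tokenizer);
  tokenizer.close();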
Example #2
  @Override
  public boolean incrementToken() throws IOException {

    // parse() is able to return w/o producing any new
    // tokens, when the tokens it had produced were entirely
    // punctuation.  So we loop here until we get a real
    // token or we end:
    while (pending.size() == 0) {
      if (end) {
        return false;
      }

      // Push Viterbi forward some more:
      parse();
    }

    final Token token = pending.remove(pending.size() - 1);

    int position = token.getPosition();
    int length = token.getLength();
    clearAttributes();
    assert length > 0;
    // System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
    // token.getSurfaceForm().length);
    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
    offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
    basicFormAtt.setToken(token);
    posAtt.setToken(token);
    readingAtt.setToken(token);
    inflectionAtt.setToken(token);
    if (token.getPosition() == lastTokenPos) {
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(token.getPositionLength());
    } else {
      assert token.getPosition() > lastTokenPos;
      posIncAtt.setPositionIncrement(1);
      posLengthAtt.setPositionLength(1);
    }
    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ":    incToken: return token=" + token);
    }
    lastTokenPos = token.getPosition();
    return true;
  }
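The increment/length pair written above is what lets a consumer rebuild the token graph: tokens emitted at the same position carry an increment of 0, and a compound token reports how many positions it spans via PositionLengthAttribute. A consumer loop following the standard TokenStream contract (a sketch, assuming stream is a TokenStream built on this tokenizer) reads it back like this:

  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);
  stream.reset();
  int position = -1;
  while (stream.incrementToken()) {
    position += posIncAtt.getPositionIncrement();   // 0 keeps us on the previous position
    int spans = posLenAtt.getPositionLength();      // >1 marks a compound covering several positions
    System.out.println(termAtt + " @" + position + " len=" + spans);
  }
  stream.end();
  stream.close();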
Example #3
 @Override
 public final boolean incrementToken() throws IOException {
   while (true) {
     if (curTermBuffer == null) {
       if (!input.incrementToken()) {
         return false;
       } else {
         curTermBuffer = termAtt.buffer().clone();
         curTermLength = termAtt.length();
         curCodePointCount = charUtils.codePointCount(termAtt);
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
         tokEnd = offsetAtt.endOffset();
         savePosIncr += posIncrAtt.getPositionIncrement();
         savePosLen = posLenAtt.getPositionLength();
       }
     }
     if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
       if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
         // grab gramSize chars from front or back
         clearAttributes();
         offsetAtt.setOffset(tokStart, tokEnd);
         // first ngram gets increment, others don't
         if (curGramSize == minGram) {
           posIncrAtt.setPositionIncrement(savePosIncr);
           savePosIncr = 0;
         } else {
           posIncrAtt.setPositionIncrement(0);
         }
         posLenAtt.setPositionLength(savePosLen);
         final int charLength =
             charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
         termAtt.copyBuffer(curTermBuffer, 0, charLength);
         curGramSize++;
         return true;
       }
     }
     curTermBuffer = null;
   }
 }
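Wiring a filter like this into an analysis chain looks roughly like the sketch below (assuming Lucene's WhitespaceTokenizer and the EdgeNGramTokenFilter(input, minGram, maxGram) constructor; exact signatures vary across Lucene versions):

  Tokenizer source = new WhitespaceTokenizer();
  source.setReader(new java.io.StringReader("lucene"));
  TokenStream grams = new EdgeNGramTokenFilter(source, 1, 3);  // emits "l", "lu", "luc"
  // Only the first gram of each input token gets the saved position increment;
  // the remaining grams stay at the same position, as the loop above shows.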
 @Override
 public boolean incrementToken() throws IOException {
   boolean tokenAvailable = false;
   int builtGramSize = 0;
   if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
     shiftInputWindow();
     gramBuilder.setLength(0);
   } else {
     builtGramSize = gramSize.getPreviousValue();
   }
   if (inputWindow.size() >= gramSize.getValue()) {
     boolean isAllFiller = true;
     InputWindowToken nextToken = null;
     Iterator<InputWindowToken> iter = inputWindow.iterator();
     for (int gramNum = 1; iter.hasNext() && builtGramSize < gramSize.getValue(); ++gramNum) {
       nextToken = iter.next();
       if (builtGramSize < gramNum) {
         if (builtGramSize > 0) {
           gramBuilder.append(tokenSeparator);
         }
         gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
         ++builtGramSize;
       }
       if (isAllFiller && nextToken.isFiller) {
         if (gramNum == gramSize.getValue()) {
           gramSize.advance();
         }
       } else {
         isAllFiller = false;
       }
     }
     if (!isAllFiller && builtGramSize == gramSize.getValue()) {
       inputWindow.getFirst().attSource.copyTo(this);
       posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
       termAtt.setEmpty().append(gramBuilder);
       if (gramSize.getValue() > 1) {
         typeAtt.setType(tokenType);
         noShingleOutput = false;
       }
       offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
       posLenAtt.setPositionLength(builtGramSize);
       isOutputHere = true;
       gramSize.advance();
       tokenAvailable = true;
     }
   }
   return tokenAvailable;
 }
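The shingle bookkeeping above is normally reached through the public filter; a minimal sketch, assuming Lucene's ShingleFilter(input, minShingleSize, maxShingleSize) constructor:

  Tokenizer source = new WhitespaceTokenizer();
  source.setReader(new java.io.StringReader("please divide this sentence"));
  // Emits bigrams such as "please divide" and "divide this" (unigrams are also emitted by default);
  // each shingle reports its word count through PositionLengthAttribute.
  TokenStream shingles = new ShingleFilter(source, 2, 2);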
Example #5
 @Override
 public boolean incrementToken() {
   if (upto < tokens.length) {
     final Token token = tokens[upto++];
     // TODO: can we just capture/restoreState so
     // we get all attrs...?
     clearAttributes();
     termAtt.setEmpty();
     termAtt.append(token.toString());
     posIncrAtt.setPositionIncrement(token.getPositionIncrement());
     posLengthAtt.setPositionLength(token.getPositionLength());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     payloadAtt.setPayload(token.getPayload());
     return true;
   } else {
     return false;
   }
 }
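The tokens array replayed above can be hand-built in tests; a sketch, assuming Lucene's org.apache.lucene.analysis.Token class (constructor details differ slightly across versions):

  // "wi fi" seen both as the compound "wifi" (spanning two positions) and as its parts.
  Token wifi = new Token("wifi", 0, 5);
  wifi.setPositionLength(2);          // covers the positions of "wi" and "fi"
  Token wi = new Token("wi", 0, 2);
  wi.setPositionIncrement(0);         // stacked on the same position as "wifi"
  Token fi = new Token("fi", 3, 5);   // default increment of 1 advances to the next position
  Token[] tokens = new Token[] {wifi, wi, fi};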
Example #6
 @Override
 public boolean incrementToken() throws IOException {
   if (tokens == null) {
     fillTokens();
   }
   // System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
   if (upto == tokens.size()) {
     // System.out.println("  END @ " + tokens.size());
     return false;
   }
   final Token t = tokens.get(upto++);
   // System.out.println("  return token=" + t);
   clearAttributes();
   termAtt.append(t.toString());
   offsetAtt.setOffset(t.startOffset(), t.endOffset());
   posIncrAtt.setPositionIncrement(t.getPositionIncrement());
   posLengthAtt.setPositionLength(t.getPositionLength());
   return true;
 }
Example #7
  @Override
  public boolean incrementToken() throws IOException {

    // System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" +
    // nextRead + " nextWrite=" + nextWrite);

    while (true) {

      // First play back any buffered future inputs/outputs
      // w/o running parsing again:
      while (inputSkipCount != 0) {

        // At each position, we first output the original
        // token

        // TODO: maybe just a PendingState class, holding
        // both input & outputs?
        final PendingInput input = futureInputs[nextRead];
        final PendingOutputs outputs = futureOutputs[nextRead];

        // System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + "
        // inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + "
        // input.consumed=" + input.consumed + " input.state=" + input.state);

        if (!input.consumed && (input.keepOrig || !input.matched)) {
          if (input.state != null) {
            // Return a previously saved token (because we
            // had to lookahead):
            restoreState(input.state);
          } else {
            // Pass-through case: return token we just pulled
            // but didn't capture:
            assert inputSkipCount == 1
                : "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
          }
          input.reset();
          if (outputs.count > 0) {
            outputs.posIncr = 0;
          } else {
            nextRead = rollIncr(nextRead);
            inputSkipCount--;
          }
          // System.out.println("  return token=" + termAtt.toString());
          return true;
        } else if (outputs.upto < outputs.count) {
          // Still have pending outputs to replay at this
          // position
          input.reset();
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          clearAttributes();
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          int endOffset = outputs.getLastEndOffset();
          if (endOffset == -1) {
            endOffset = input.endOffset;
          }
          offsetAtt.setOffset(input.startOffset, endOffset);
          posIncrAtt.setPositionIncrement(posIncr);
          posLenAtt.setPositionLength(outputs.getLastPosLength());
          if (outputs.count == 0) {
            // Done with the buffered input and all outputs at
            // this position
            nextRead = rollIncr(nextRead);
            inputSkipCount--;
          }
          // System.out.println("  return token=" + termAtt.toString());
          return true;
        } else {
          // Done with the buffered input and all outputs at
          // this position
          input.reset();
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
      }

      if (finished && nextRead == nextWrite) {
        // End case: if any output syns went beyond end of
        // input stream, enumerate them now:
        final PendingOutputs outputs = futureOutputs[nextRead];
        if (outputs.upto < outputs.count) {
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          futureInputs[nextRead].reset();
          if (outputs.count == 0) {
            nextWrite = nextRead = rollIncr(nextRead);
          }
          clearAttributes();
          // Keep offset from last input token:
          offsetAtt.setOffset(lastStartOffset, lastEndOffset);
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          // System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
          posIncrAtt.setPositionIncrement(posIncr);
          // System.out.println("  return token=" + termAtt.toString());
          return true;
        } else {
          return false;
        }
      }

      // Find new synonym matches:
      parse();
    }
  }
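To reach this code, the filter is built over a SynonymMap; a minimal sketch, assuming Lucene's SynonymMap.Builder, CharsRefBuilder, and the SynonymFilter(input, map, ignoreCase) constructor from org.apache.lucene.analysis.synonym:

  SynonymMap.Builder builder = new SynonymMap.Builder(true);   // true = dedup entries
  builder.add(
      new CharsRef("dns"),
      SynonymMap.Builder.join(new String[] {"domain", "name", "system"}, new CharsRefBuilder()),
      true);                                                    // true = keep the original token
  SynonymMap map = builder.build();

  Tokenizer source = new WhitespaceTokenizer();
  source.setReader(new java.io.StringReader("dns lookup"));
  TokenStream syns = new SynonymFilter(source, map, true);
  // The injected tokens are typed TYPE_SYNONYM and start at the original token's position,
  // which is exactly the replay logic incrementToken() implements above.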