Example #1
  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StreamLemmasFilter src =
        new StreamLemmasFilter(reader, hebMorphLemmatizer, commonWords, lemmaFilter);
    src.setAlwaysSaveMarkedOriginal(alwaysSaveMarkedOriginal);
    src.setSuffixForExactMatch(suffixForExactMatch);

    // Build the filter chain on top of the lemmatizing source: merge acronym
    // variants via the synonym map, optionally insert common-grams, then append
    // the '$' marker used for exact-match suffixes.
    TokenStream tok = new SynonymFilter(src, acronymMergingMap, false);
    if (commonWords != null && !commonWords.isEmpty()) {
      tok = new CommonGramsFilter(matchVersion, tok, commonWords, false);
    }
    tok = new SuffixKeywordFilter(tok, '$');
    // The lemmatizing filter is the source; the fully chained stream is the sink.
    return new TokenStreamComponents(src, tok);
  }
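
As a minimal sketch, assuming an Analyzer subclass built around the createComponents(...) above, the usual Lucene consumption loop that drives this chain looks like the following; the field name, sample text, and demo class name are placeholders, not part of the example.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class AnalyzerDemo {
  // Prints every term the analyzer produces for the given text.
  public static void printTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("content", new StringReader(text))) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                    // required before the first incrementToken() call
      while (ts.incrementToken()) {  // each call advances the whole filter chain by one token
        System.out.println(term.toString());
      }
      ts.end();                      // records the final offset/position state
    }
  }
}

With the default reuse strategy, Analyzer caches the TokenStreamComponents per thread, so createComponents(...) typically runs once and later calls only set a new reader.
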
  @Override
  public boolean incrementToken() throws IOException {
    if (!emitSingleTokens && unusedTokens.size() > 0) {
      Log.debug("emitting unused phrases");
      // emit these until the queue is empty before emitting any new stuff
      Token aToken = unusedTokens.remove(0);
      emit(aToken);
      return true;
    }

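    // in single-token mode, a token held over from the previous call goes out before reading new input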
    if (lastToken != null) {
      emit(lastToken);
      lastToken = null;
      return true;
    }

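    // fetch the next incoming token; null means no more input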
    char[] nextToken = nextToken();
    // if (nextToken != null) System.out.println( "nextToken: " + new String( nextToken ));
    if (nextToken == null) {
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

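      // no more input: flush any partially matched phrase state before ending the stream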
      if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        char[] phrase = getFirst(currentSetToCheck);
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (phrase != null && endsWith(lastTok, phrase)) {
          currentSetToCheck = remove(currentSetToCheck, phrase);
          emit(phrase);
          return true;
        }
      } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        if (lastEmitted != null
            && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }

      if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (currentSetToCheck != null && currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
          emit(lastTok);
          currentPhrase.setLength(0);
          return true;
        } else if (!emitSingleTokens) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          currentPhrase.setLength(0);
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }
      return false;
    }

    // in single-token mode, remember nextToken so it is also emitted on its own on the next call
    if (emitSingleTokens) {
      lastToken = nextToken;
    }

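    // no phrase match in progress: check whether this token starts a new phrase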
    if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
      Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token and make it the currentSetToCheck
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) {
          currentPhrase = new StringBuffer();
        } else {
          currentPhrase.setLength(0);
        }
        currentPhrase.append(nextToken);
        return incrementToken();
      } else {
        emit(nextToken);
        // clear lastToken
        lastToken = null;
        return true;
      }
    } else {
      // add token to the current string buffer.
      char[] currentBuffer = getCurrentBuffer(nextToken);

      if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
        // If this is the only valid phrase, emit it now; if a longer candidate is
        // still possible, hold it (lastValid) and wait to see whether the longer
        // phrase completes. If the longer phrase breaks on the next token, the
        // held phrase is emitted then.
        currentSetToCheck = remove(currentSetToCheck, currentBuffer);

        if (currentSetToCheck.size() == 0) {
          emit(currentBuffer);
          lastValid = null;
          --positionIncr;
        } else {
          if (emitSingleTokens) {
            lastToken = currentBuffer;
            return true;
          }
          lastValid = currentBuffer;
        }

        if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
          // this token also starts a new phrase: make its phrase set the new currentSetToCheck
          currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
          if (currentPhrase == null) {
            currentPhrase = new StringBuffer();
          } else {
            currentPhrase.setLength(0);
          }
          currentPhrase.append(nextToken);
        }

        return (lastValid != null) ? incrementToken() : true;
      }

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // this token also starts new phrases of its own: copy every entry of its
        // phrase set into currentSetToCheck (the loop below walks the whole set)
        CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
        Iterator<Object> phraseIt = newSet.iterator();
        while (phraseIt != null && phraseIt.hasNext()) {
          char[] phrase = (char[]) phraseIt.next();
          currentSetToCheck.add(phrase);
        }
      }

      // if the current buffer is a prefix of any remaining candidate phrase,
      // recurse to pull the next token and keep extending the match
      Iterator<Object> phraseIt = currentSetToCheck.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();

        if (startsWith(phrase, currentBuffer)) {
          return incrementToken();
        }
      }

      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (!emitSingleTokens) {
        // current phrase didn't match fully: put the tokens back
        // into the unusedTokens list
        discardCharTokens(currentPhrase, unusedTokens);
        currentPhrase.setLength(0);
        currentSetToCheck = null;

        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
      currentSetToCheck = null;

      Log.debug("returning at end.");
      return incrementToken();
    }
  }
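
The incrementToken() logic above relies on an emit(...) helper that is not shown in this example. Below is a minimal sketch of what such a helper typically does in a custom TokenFilter, assuming termAttr and posIncrAttr attribute fields and deliberately simplified position handling; these names and details are assumptions, not the actual implementation. It also records lastEmitted, since the code above compares against it.

  // Assumes the filter declared:
  //   private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
  //   private final PositionIncrementAttribute posIncrAttr = addAttribute(PositionIncrementAttribute.class);
  private void emit(char[] tokenChars) {
    clearAttributes();                                      // reset attribute state for the new token
    termAttr.copyBuffer(tokenChars, 0, tokenChars.length);  // expose the characters as the term text
    posIncrAttr.setPositionIncrement(1);                    // simplified; the real filter adjusts this via positionIncr
    lastEmitted = tokenChars;                               // later calls compare against the last emitted phrase
  }

  private void emit(Token token) {
    // Token keeps a possibly over-sized internal buffer; copy only the valid region.
    emit(java.util.Arrays.copyOf(token.buffer(), token.length()));
  }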