コード例 #1
0
  /**
   * Builds a new set from the phrases in {@code fromSet} that pass the retention test against
   * {@code charArray}. The input set is not modified.
   *
   * <p>A phrase is retained when either
   *
   * <ul>
   *   <li>{@code !equals(phrase, charArray) && startsWith(phrase, charArray)} holds, or
   *   <li>{@code endsWith(charArray, phrase)} holds.
   * </ul>
   *
   * <p>NOTE(review): {@code &&} binds tighter than {@code ||}, so a phrase for which {@code
   * endsWith(charArray, phrase)} is true is retained even when it equals {@code charArray}. An
   * earlier variant of this condition grouped it as {@code !equals && (startsWith || endsWith)};
   * confirm that the current grouping is the intended one.
   *
   * @param fromSet the set of candidate phrases to filter; its elements are expected to be {@code
   *     char[]}
   * @param charArray the character sequence each phrase is compared against
   * @return a newly created set holding the retained phrases
   */
  private CharArraySet remove(CharArraySet fromSet, char[] charArray) {
    CharArraySet newSet = new CharArraySet(5, false);

    // Iterate the set that was passed in rather than reaching for instance state, so the result
    // always reflects the argument the caller supplied.
    for (Object entry : fromSet) {
      char[] phrase = (char[]) entry;

      boolean differentButStartsWith = !equals(phrase, charArray) && startsWith(phrase, charArray);
      if (differentButStartsWith || endsWith(charArray, phrase)) {
        newSet.add(phrase);
      }
    }

    return newSet;
  }
コード例 #2
0
  /**
   * Groups the phrases of {@code phraseSet} by the value {@code getFirstTerm} returns for each
   * phrase.
   *
   * <p>Every phrase is added to the {@link CharArraySet} stored under its first term; that set is
   * created on first use. Each phrase and its first term are written to the debug log.
   *
   * @param phraseSet the phrases to group; its elements are expected to be {@code char[]}
   * @return a map from first term to the set of phrases that share that first term
   */
  private CharArrayMap<CharArraySet> convertPhraseSet(CharArraySet phraseSet) {
    CharArrayMap<CharArraySet> phrasesByFirstTerm = new CharArrayMap<>(100, false);

    for (Object entry : phraseSet) {
      char[] phrase = (char[]) entry;
      Log.debug("'" + new String(phrase) + "'");

      char[] firstTerm = getFirstTerm(phrase);
      Log.debug("'" + new String(firstTerm) + "'");

      CharArraySet itsPhrases = phrasesByFirstTerm.get(firstTerm, 0, firstTerm.length);
      if (itsPhrases == null) {
        // First phrase seen for this term: register an empty bucket under a String copy of the key.
        itsPhrases = new CharArraySet(5, false);
        phrasesByFirstTerm.put(new String(firstTerm), itsPhrases);
      }

      itsPhrases.add(phrase);
    }

    return phrasesByFirstTerm;
  }
コード例 #3
0
  /**
   * Produces the next output token, tracking multi-token phrase candidates across calls.
   *
   * <p>The method works on instance state: {@code unusedTokens} (put-back queue), {@code
   * lastToken} and {@code lastValid} (tokens held for a later call), {@code currentSetToCheck}
   * (phrases still being matched), {@code currentPhrase} (text accumulated for the current
   * candidate) and {@code phraseMap} (phrase sets looked up by token). Several branches call this
   * method recursively after updating that state.
   *
   * <p>Order of work per call:
   *
   * <ol>
   *   <li>When {@code emitSingleTokens} is false, drain {@code unusedTokens} one token per call.
   *   <li>Emit a held {@code lastToken}, if any.
   *   <li>Fetch {@code nextToken()}; when it is null, flush whatever is still pending and return
   *       false once nothing is left.
   *   <li>Otherwise either start a new phrase candidate, extend/complete the current one, or give
   *       up on the candidate and fall back to the tokens collected so far.
   * </ol>
   *
   * @return {@code true} when a token was handed to {@code emit(...)} (see the inline review note
   *     for one path that returns {@code true} without calling it here); {@code false} when
   *     {@code nextToken()} returned null and no pending state produced a token
   * @throws IOException declared by the overridden method; presumably propagated from the
   *     upstream token source — confirm against {@code nextToken()}
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!emitSingleTokens && unusedTokens.size() > 0) {
      Log.debug("emitting unused phrases");
      // emit these until the queue is empty before emitting any new stuff
      Token aToken = unusedTokens.remove(0);
      emit(aToken);
      return true;
    }

    // A token held back by a previous call goes out before anything new is read.
    if (lastToken != null) {
      emit(lastToken);
      lastToken = null;
      return true;
    }

    // presumably null means the underlying input is exhausted — confirm in nextToken()
    char[] nextToken = nextToken();
    // if (nextToken != null) System.out.println( "nextToken: " + new String( nextToken ));
    if (nextToken == null) {
      // No more input: flush pending state, at most one token per call.
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        // Single-token mode: take the first remaining candidate and emit it if the buffered
        // text passes endsWith(lastTok, phrase); the candidate set is replaced by remove(...).
        char[] phrase = getFirst(currentSetToCheck);
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (phrase != null && endsWith(lastTok, phrase)) {
          currentSetToCheck = remove(currentSetToCheck, phrase);
          emit(phrase);
          return true;
        }
      } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        // Phrase-only mode: if the buffered text differs from the (whitespace-fixed) last emitted
        // value, hand the buffered phrase to discardCharTokens and emit the first put-back token.
        if (lastEmitted != null
            && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }

      // Nothing has been emitted yet but text is still buffered.
      if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
        char[] lastTok = getCurrentBuffer(new char[0]);
        // NOTE(review): currentSetToCheck is dereferenced here without a null check, while other
        // paths in this method assign null to it without clearing currentPhrase — confirm this
        // branch cannot be reached with a null set.
        if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
          emit(lastTok);
          currentPhrase.setLength(0);
          return true;
        } else if (!emitSingleTokens) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          currentPhrase.setLength(0);
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }
      return false;
    }

    // if emitSingleToken, set lastToken = nextToken
    if (emitSingleTokens) {
      lastToken = nextToken;
    }

    if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
      // No candidate phrases are being tracked: see whether this token is a key in phraseMap.
      Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentSetTocheck
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) currentPhrase = new StringBuffer();
        else currentPhrase.setLength(0);
        currentPhrase.append(nextToken);
        // Recurse; if lastToken was set above, the recursive call emits it first.
        return incrementToken();
      } else {
        // Not a phrase start: pass the token through unchanged.
        emit(nextToken);
        // clear lastToken
        lastToken = null;
        return true;
      }
    } else {
      // add token to the current string buffer.
      char[] currentBuffer = getCurrentBuffer(nextToken);

      if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
        // if its the only one valid, emit it
        // if there is a longer one, wait to see if it will be matched
        // if the longer one breaks on the next token, emit this one...
        // emit the current phrase
        currentSetToCheck = remove(currentSetToCheck, currentBuffer);

        if (currentSetToCheck.size() == 0) {
          // No other candidate remains: emit the matched phrase now.
          emit(currentBuffer);
          lastValid = null;
          --positionIncr;
        } else {
          if (emitSingleTokens) {
            // NOTE(review): this path returns true without calling emit(...) in this invocation
            // and overwrites the lastToken assigned above; the phrase is only emitted by the next
            // call — confirm that this is intended.
            lastToken = currentBuffer;
            return true;
          }
          // Other candidates remain: remember this match in case they fail later.
          lastValid = currentBuffer;
        }

        if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
          // get the phrase set for this token, add it to currentPhrasesTocheck
          currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
          if (currentPhrase == null) currentPhrase = new StringBuffer();
          else currentPhrase.setLength(0);
          currentPhrase.append(nextToken);
        }

        // If a match is only being held (lastValid), keep reading; otherwise a token was emitted.
        return (lastValid != null) ? incrementToken() : true;
      }

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentPhrasesTocheck
        // System.out.println( "starting new phrase with " + new String( nextToken ) );
        // does this add all of the set? if not need iterator loop
        // NOTE(review): currentSetToCheck may be the very set instance obtained from
        // phraseMap.get(...) earlier, in which case these add() calls modify a set stored in
        // phraseMap — confirm whether that is intended.
        CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
        Iterator<Object> phraseIt = newSet.iterator();
        while (phraseIt != null && phraseIt.hasNext()) {
          char[] phrase = (char[]) phraseIt.next();
          currentSetToCheck.add(phrase);
        }
      }

      // for each phrase in currentSetToCheck -
      // if there is a phrase prefix match, get the next token recursively
      Iterator<Object> phraseIt = currentSetToCheck.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();

        if (startsWith(phrase, currentBuffer)) {
          return incrementToken();
        }
      }

      // No candidate passes startsWith(phrase, currentBuffer): fall back to the held match.
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (!emitSingleTokens) {
        // current phrase didn't match fully: put the tokens back
        // into the unusedTokens list
        discardCharTokens(currentPhrase, unusedTokens);
        currentPhrase.setLength(0);
        currentSetToCheck = null;

        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
      // Drop the candidate set in both modes before trying again.
      currentSetToCheck = null;

      Log.debug("returning at end.");
      return incrementToken();
    }
  }
コード例 #4
0
 /**
  * Returns the first element produced by the set's iterator, or {@code null} for an empty set.
  *
  * @param charSet the set to read from; must not be null, and its elements are expected to be
  *     {@code char[]}
  * @return the first iterated phrase, or {@code null} when {@code charSet} is empty
  */
 private char[] getFirst(CharArraySet charSet) {
   return charSet.isEmpty() ? null : (char[]) charSet.iterator().next();
 }