예제 #1
0
 /**
  * Looks up the current contents of {@code word} in {@code dict_ht} and stores whatever the
  * dictionary returns (possibly {@code null}) in {@code matchedEntry}, replacing any previous
  * value.
  *
  * <p>Debug-only bookkeeping that reported repeated lookups of the same word used to live here.
  * It was disabled because occasional extra lookups aren't necessarily errors: different
  * manipulations can arrive at the same word.
  *
  * @return {@code true} when the dictionary returned an entry, {@code false} otherwise
  */
 private boolean lookup() {
   matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
   final boolean found = matchedEntry != null;
   return found;
 }
예제 #2
0
 /**
  * Returns the dictionary entry for the current contents of {@code word}, reusing
  * {@code matchedEntry} when one has already been recorded.
  *
  * <p>On a fresh lookup the entry is cached in {@code matchedEntry} only if it is not flagged as
  * an exception; an exception entry is still returned to the caller but is not cached.
  *
  * @return the cached or freshly looked-up entry, or {@code null} if the dictionary has none
  */
 private DictEntry wordInDict() {
   if (matchedEntry == null) {
     final DictEntry found = dict_ht.get(word.getArray(), 0, word.length());
     final boolean cacheable = found != null && !found.exception;
     if (cacheable) {
       // Exception entries are deliberately left out of the cache.
       matchedEntry = found;
     }
     return found;
   }
   // A previous lookup already produced a cacheable entry; trust it without re-checking.
   return matchedEntry;
 }
  /**
   * Groups the phrases of {@code phraseSet} by the value {@code getFirstTerm} returns for each
   * phrase, so that all phrases sharing a first term can be fetched with one map lookup.
   *
   * <p>Both the map and the per-term sets are created with {@code false} as their second
   * constructor argument (the {@code ignoreCase} flag of Lucene's {@code CharArrayMap} /
   * {@code CharArraySet}), i.e. matching is case-sensitive.
   *
   * @param phraseSet the phrases to index, each stored as a {@code char[]}; {@code null} is
   *     treated as an empty set
   * @return a map from first term to the set of phrases that start with it; never {@code null}
   */
  private CharArrayMap convertPhraseSet(CharArraySet phraseSet) {
    CharArrayMap<CharArraySet> phraseMap = new CharArrayMap<>(100, false);
    if (phraseSet == null) {
      // The original guarded the iterator (which is never null) instead of the set itself.
      return phraseMap;
    }

    for (Object element : phraseSet) {
      char[] phrase = (char[]) element;

      Log.debug("'" + new String(phrase) + "'");

      char[] firstTerm = getFirstTerm(phrase);
      Log.debug("'" + new String(firstTerm) + "'");

      CharArraySet itsPhrases = phraseMap.get(firstTerm, 0, firstTerm.length);
      if (itsPhrases == null) {
        // First phrase seen for this term: start a new bucket.
        itsPhrases = new CharArraySet(5, false);
        // Keyed through a String copy so the map never holds on to the firstTerm array itself.
        phraseMap.put(new String(firstTerm), itsPhrases);
      }

      itsPhrases.add(phrase);
    }

    return phraseMap;
  }
  /**
   * Produces the next output token: either a single input token passed through unchanged or a
   * phrase from the candidate set selected through {@code phraseMap}.
   *
   * <p>Each call works through its sources in this order: tokens waiting in {@code unusedTokens}
   * (only when {@code emitSingleTokens} is false), a pending {@code lastToken}, and finally a
   * fresh token from {@code nextToken()}. Whenever a token is consumed without anything being
   * emitted, the method calls itself recursively to keep going.
   *
   * <p>State carried between calls lives in fields: {@code currentSetToCheck} (phrases still
   * being considered), {@code currentPhrase} (text buffered for the candidate phrase),
   * {@code lastValid} (a complete phrase held back while a longer one may still match),
   * {@code lastToken}, {@code lastEmitted} and {@code unusedTokens}.
   *
   * @return {@code true} if a token was emitted; {@code false} once {@code nextToken()} returns
   *     {@code null} and none of the end-of-input branches had anything left to emit
   * @throws IOException declared by the overridden method; presumably raised while reading the
   *     underlying input (not visible in this method)
   */
  @Override
  public boolean incrementToken() throws IOException {
    // Tokens put back after a failed phrase match are drained before anything new is read.
    if (!emitSingleTokens && unusedTokens.size() > 0) {
      Log.debug("emitting unused phrases");
      // emit these until the queue is empty before emitting any new stuff
      Token aToken = unusedTokens.remove(0);
      emit(aToken);
      return true;
    }

    // A token stashed by an earlier call (only assigned when emitSingleTokens is set) goes next.
    if (lastToken != null) {
      emit(lastToken);
      lastToken = null;
      return true;
    }

    char[] nextToken = nextToken();
    // if (nextToken != null) System.out.println( "nextToken: " + new String( nextToken ));
    // A null token is handled as "no more input": flush whatever is still buffered.
    if (nextToken == null) {
      // A complete phrase that was held back waiting for a longer match is emitted first.
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        // Single-token mode: emit one remaining candidate if the buffered text ends with it.
        char[] phrase = getFirst(currentSetToCheck);
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (phrase != null && endsWith(lastTok, phrase)) {
          currentSetToCheck = remove(currentSetToCheck, phrase);
          emit(phrase);
          return true;
        }
      } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        // Phrase-only mode: if the buffered text differs from what was last emitted, the
        // partial phrase is abandoned and its tokens are handed back via unusedTokens.
        if (lastEmitted != null
            && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }

      // Nothing has been emitted yet (lastEmitted is null) but text is still buffered.
      if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
        char[] lastTok = getCurrentBuffer(new char[0]);
        // NOTE(review): currentSetToCheck is dereferenced here without a null check, although
        // several branches of this method assign null to it while leaving currentPhrase
        // non-empty (e.g. the final fall-through when emitSingleTokens is true). Confirm this
        // cannot be reached with currentSetToCheck == null, otherwise this throws a
        // NullPointerException at end of input.
        if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
          emit(lastTok);
          currentPhrase.setLength(0);
          return true;
        } else if (!emitSingleTokens) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          currentPhrase.setLength(0);
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }
      return false;
    }

    // if emitSingleToken, set lastToken = nextToken
    if (emitSingleTokens) {
      lastToken = nextToken;
    }

    if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
      // No phrase in progress: either start one on this token or pass the token straight through.
      Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentSetTocheck
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) currentPhrase = new StringBuffer();
        else currentPhrase.setLength(0);
        currentPhrase.append(nextToken);
        // Nothing emitted yet; recurse to pull the next token (or the stashed lastToken).
        return incrementToken();
      } else {
        emit(nextToken);
        // clear lastToken
        lastToken = null;
        return true;
      }
    } else {
      // add token to the current string buffer.
      char[] currentBuffer = getCurrentBuffer(nextToken);

      if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
        // if its the only one valid, emit it
        // if there is a longer one, wait to see if it will be matched
        // if the longer one breaks on the next token, emit this one...
        // emit the current phrase
        currentSetToCheck = remove(currentSetToCheck, currentBuffer);

        if (currentSetToCheck.size() == 0) {
          // No other candidates remain, so the matched phrase can be emitted right away.
          emit(currentBuffer);
          lastValid = null;
          // NOTE(review): positionIncr is only adjusted on this branch; presumably this
          // compensates the position increment for the phrase just emitted — confirm against
          // emit().
          --positionIncr;
        } else {
          if (emitSingleTokens) {
            // Queue the matched phrase behind the single token; it is emitted on a later call
            // through the lastToken branch at the top of this method.
            lastToken = currentBuffer;
            return true;
          }
          // Other candidates remain: hold this match back until they match or fail.
          lastValid = currentBuffer;
        }

        // The token that completed this phrase may itself begin another phrase.
        if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
          // get the phrase set for this token, add it to currentPhrasesTocheck
          currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
          if (currentPhrase == null) currentPhrase = new StringBuffer();
          else currentPhrase.setLength(0);
          currentPhrase.append(nextToken);
        }

        // If a match is being held back nothing was emitted, so keep consuming input.
        return (lastValid != null) ? incrementToken() : true;
      }

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentPhrasesTocheck
        // System.out.println( "starting new phrase with " + new String( nextToken ) );
        // does this add all of the set? if not need iterator loop
        // NOTE(review): this adds into currentSetToCheck in place; if that reference is a set
        // obtained directly from phraseMap, the shared map entry is mutated — confirm whether
        // remove(...) above always hands back a private copy.
        CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
        Iterator<Object> phraseIt = newSet.iterator();
        while (phraseIt != null && phraseIt.hasNext()) {
          char[] phrase = (char[]) phraseIt.next();
          currentSetToCheck.add(phrase);
        }
      }

      // for each phrase in currentSetToCheck -
      // if there is a phrase prefix match, get the next token recursively
      Iterator<Object> phraseIt = currentSetToCheck.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();

        if (startsWith(phrase, currentBuffer)) {
          return incrementToken();
        }
      }

      // No candidate can still match: fall back to the held-back complete phrase, if any.
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (!emitSingleTokens) {
        // current phrase didn't match fully: put the tokens back
        // into the unusedTokens list
        discardCharTokens(currentPhrase, unusedTokens);
        currentPhrase.setLength(0);
        currentSetToCheck = null;

        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
      // Reset the candidate set; note that currentPhrase is not cleared on this path when
      // emitSingleTokens is true.
      currentSetToCheck = null;

      Log.debug("returning at end.");
      return incrementToken();
    }
  }
예제 #5
0
  /**
   * Stems the first {@code len} characters of {@code term}.
   *
   * <p>The outcome is reported through fields rather than the return value alone: {@code result}
   * is cleared on entry and, when a dictionary entry decides the outcome, set to that entry's
   * root. When this method returns {@code true} with {@code result == null}, the stem is the
   * current content of {@code word}.
   *
   * <p>The input is not lowercased here; it is a requirement that a lowercase filter runs before
   * this stemmer.
   *
   * @param term buffer holding the characters to stem
   * @param len number of characters of {@code term} to use
   * @return {@code false} when the term is left alone (its length is outside the accepted range,
   *     it contains a character rejected by {@code isAlpha}, or it has a direct dictionary entry
   *     without a root); {@code true} in every other case, including when no entry matched
   */
  boolean stem(char[] term, int len) {
    result = null;

    k = len - 1;
    if (k <= 1 || k >= MaxWordLen - 1) {
      // Outside the supported length range: don't stem.
      return false;
    }

    // Consult the stemmer dictionaries first; a direct hit settles the outcome immediately.
    final DictEntry direct = dict_ht.get(term, 0, len);
    if (direct != null) {
      if (direct.root == null) {
        return false;
      }
      result = direct.root;
      return true;
    }

    // A per-term cache lookup used to sit here; it stays disabled because running without the
    // cache is normally faster.

    // Copy the term into the working buffer, over-reserving so no expansion is ever needed.
    word.reset();
    word.reserve(len + 10);
    for (int pos = 0; pos < len; pos++) {
      final char c = term[pos];
      if (!isAlpha(c)) {
        // Non-alphabetic character: don't stem.
        return false;
      }
      word.unsafeWrite(c);
    }

    matchedEntry = null;

    // Single-pass block: the do/while(false) exists only so that "break" can leave the rule
    // chain as soon as one rule has recognised the word.
    // YCS: extra lookup()s were inserted in the rules so no extra wordInDict() is needed after
    // each of them.
    do {
      plural();
      if (matched()) {
        break;
      }
      pastTense();
      if (matched()) {
        break;
      }
      aspect();
      if (matched()) {
        break;
      }
      ityEndings();
      if (matched()) {
        break;
      }
      nessEndings();
      if (matched()) {
        break;
      }
      ionEndings();
      if (matched()) {
        break;
      }
      erAndOrEndings();
      if (matched()) {
        break;
      }
      lyEndings();
      if (matched()) {
        break;
      }
      alEndings();
      if (matched()) {
        break;
      }
      // Called for its effect on matchedEntry; the returned entry itself is not needed because
      // matchedEntry is what gets consulted after the rule chain.
      wordInDict();
      iveEndings();
      if (matched()) {
        break;
      }
      izeEndings();
      if (matched()) {
        break;
      }
      mentEndings();
      if (matched()) {
        break;
      }
      bleEndings();
      if (matched()) {
        break;
      }
      ismEndings();
      if (matched()) {
        break;
      }
      icEndings();
      if (matched()) {
        break;
      }
      ncyEndings();
      if (matched()) {
        break;
      }
      nceEndings();
      matched();
    } while (false);

    // Try for a direct mapping (allows for cases like `Italian'->`Italy' and
    // `Italians'->`Italy').
    if (matchedEntry != null) {
      // The root may be null, which means that "word" is the stem.
      result = matchedEntry.root;
    }

    // Storing the outcome in the (disabled) cache would happen here.

    // No entry matched means the result is "word".
    return true;
  }