Beispiel #1
0
 /**
  * Creates an independent {@link CharArrayMap} holding the same entries as the given map.
  *
  * <p>When the argument is itself a {@link CharArrayMap}, its internal key and value arrays are
  * duplicated directly instead of re-inserting every entry, and the new instance is constructed
  * from the source map so that its settings (such as the ignoreCase property) carry over. Any
  * other map is handed to the {@code (map, false)} constructor.
  *
  * @param map a map to copy
  * @return a copy of the given map as a {@link CharArrayMap}, or the result of {@code emptyMap()}
  *     when the argument is the {@code EMPTY_MAP} instance
  */
 @SuppressWarnings("unchecked")
 public static <V> CharArrayMap<V> copy(final Map<?, ? extends V> map) {
   if (map == EMPTY_MAP) {
     return emptyMap();
   }
   if (!(map instanceof CharArrayMap)) {
     // In jdk-9b54 or later, a plain diamond causes compile error with "-source 1.7":
     return new CharArrayMap<V>(map, false);
   }

   // Fast path: duplicating the backing arrays is much cheaper than iterating all entries,
   // even for very small maps.
   final CharArrayMap<V> source = (CharArrayMap<V>) map;
   final char[][] keysCopy = source.keys.clone();
   final V[] valuesCopy = (V[]) new Object[source.values.length];
   System.arraycopy(source.values, 0, valuesCopy, 0, valuesCopy.length);

   final CharArrayMap<V> duplicate = new CharArrayMap<>(source);
   duplicate.keys = keysCopy;
   duplicate.values = valuesCopy;
   return duplicate;
 }
Beispiel #2
0
 /**
  * Looks up the current contents of {@code word} in {@code dict_ht} and stores the outcome in
  * {@code matchedEntry}.
  *
  * @return {@code true} if the dictionary returned an entry for the current word
  */
 private boolean lookup() {
   /*
    * Disabled debugging code, kept for reference:
    *
    *   String thisLookup = word.toString();
    *   boolean added = lookups.add(thisLookup);
    *   if (!added) {
    *     System.out.println("######extra lookup:" + thisLookup);
    *     // occasional extra lookups aren't necessarily errors... could happen by diff
    *     // manipulations
    *     // throw new RuntimeException("######extra lookup:" + thisLookup);
    *   } else {
    *     // System.out.println("new lookup:" + thisLookup);
    *   }
    */
   final char[] buffer = word.getArray();
   final int length = word.size();
   matchedEntry = dict_ht.get(buffer, 0, length);
   return null != matchedEntry;
 }
  /**
   * Groups the phrases of the given set by their first term.
   *
   * <p>For every phrase, {@code getFirstTerm} is used to obtain the key; all phrases sharing a key
   * are collected into one {@link CharArraySet} stored under that key.
   *
   * @param phraseSet the phrases to index; each element is expected to be a {@code char[]}
   * @return a map from first term to the set of phrases starting with that term
   */
  private CharArrayMap<CharArraySet> convertPhraseSet(CharArraySet phraseSet) {
    // Fully parameterized (no raw types) so the compiler checks what goes into the map.
    CharArrayMap<CharArraySet> phrasesByFirstTerm = new CharArrayMap<CharArraySet>(100, false);
    for (Object element : phraseSet) {
      char[] phrase = (char[]) element;

      Log.debug("'" + new String(phrase) + "'");

      char[] firstTerm = getFirstTerm(phrase);
      Log.debug("'" + new String(firstTerm) + "'");

      CharArraySet itsPhrases = phrasesByFirstTerm.get(firstTerm, 0, firstTerm.length);
      if (itsPhrases == null) {
        // First phrase seen for this term: start a new bucket. The key is passed as a String
        // built from the term, exactly as the original code did.
        itsPhrases = new CharArraySet(5, false);
        phrasesByFirstTerm.put(new String(firstTerm), itsPhrases);
      }

      itsPhrases.add(phrase);
    }

    return phrasesByFirstTerm;
  }
Beispiel #4
0
 /**
  * Returns the dictionary entry for the current contents of {@code word}.
  *
  * <p>If {@code matchedEntry} is already set it is returned as is. Otherwise {@code dict_ht} is
  * queried, and the hit is cached in {@code matchedEntry} unless it is flagged as an exception.
  *
  * @return the cached or freshly looked-up entry, or {@code null} if the dictionary has none
  */
 private DictEntry wordInDict() {
   /*
    * Disabled consistency check, kept for reference:
    *
    *   if (matchedEntry != null) {
    *     if (dict_ht.get(word.getArray(), 0, word.size()) != matchedEntry) {
    *       System.out.println("Uh oh... cached entry doesn't match");
    *     }
    *     return matchedEntry;
    *   }
    */
   if (matchedEntry == null) {
     final DictEntry found = dict_ht.get(word.getArray(), 0, word.length());
     final boolean cacheable = found != null && !found.exception;
     if (cacheable) {
       // only cache if it's not an exception.
       matchedEntry = found;
     }
     // lookups.add(word.toString());
     return found;
   }
   return matchedEntry;
 }
  /**
   * Advances to the next output token, combining consecutive input tokens into phrases when they
   * match an entry reachable through {@code phraseMap}.
   *
   * <p>State used across calls (all fields of the enclosing class):
   *
   * <ul>
   *   <li>{@code unusedTokens} - tokens put back after a phrase candidate failed; drained first
   *       when {@code emitSingleTokens} is off.
   *   <li>{@code lastToken} - a pending token that is emitted on the next call.
   *   <li>{@code lastValid} - the longest phrase matched so far while a longer candidate is still
   *       being tried.
   *   <li>{@code currentSetToCheck} - the candidate phrases still under consideration.
   *   <li>{@code currentPhrase} - buffer of the tokens collected for the current candidate.
   * </ul>
   *
   * <p>The method calls itself recursively whenever it has consumed an input token without
   * emitting anything yet.
   *
   * @return {@code true} if a token was emitted, {@code false} once {@code nextToken()} returned
   *     {@code null} and nothing buffered was left to emit
   * @throws IOException declared by the signature; presumably propagated from reading the
   *     underlying input - confirm against {@code nextToken()}
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!emitSingleTokens && unusedTokens.size() > 0) {
      Log.debug("emitting unused phrases");
      // emit these until the queue is empty before emitting any new stuff
      Token aToken = unusedTokens.remove(0);
      emit(aToken);
      return true;
    }

    // A token parked by a previous call takes priority over reading new input.
    if (lastToken != null) {
      emit(lastToken);
      lastToken = null;
      return true;
    }

    char[] nextToken = nextToken();
    // if (nextToken != null) System.out.println( "nextToken: " + new String( nextToken ));
    if (nextToken == null) {
      // No more input: flush whatever is still buffered, one item per call.
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        // Emit a remaining candidate phrase only if the buffered text ends with it.
        char[] phrase = getFirst(currentSetToCheck);
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (phrase != null && endsWith(lastTok, phrase)) {
          currentSetToCheck = remove(currentSetToCheck, phrase);
          emit(phrase);
          return true;
        }
      } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        // The buffered text differs from what was last emitted: give up on the candidate and
        // replay its tokens individually.
        if (lastEmitted != null
            && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }

      if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
        char[] lastTok = getCurrentBuffer(new char[0]);
        // NOTE(review): currentSetToCheck is dereferenced here without a null check. Near the end
        // of this method it is set to null without clearing currentPhrase when emitSingleTokens
        // is on - confirm that state cannot reach this line.
        if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
          emit(lastTok);
          currentPhrase.setLength(0);
          return true;
        } else if (!emitSingleTokens) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          currentPhrase.setLength(0);
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }
      return false;
    }

    // if emitSingleToken, set lastToken = nextToken
    if (emitSingleTokens) {
      lastToken = nextToken;
    }

    if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
      // Not inside a phrase candidate: either start one on this token or pass the token through.
      Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentSetTocheck
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) currentPhrase = new StringBuffer();
        else currentPhrase.setLength(0);
        currentPhrase.append(nextToken);
        return incrementToken();
      } else {
        emit(nextToken);
        // clear lastToken
        lastToken = null;
        return true;
      }
    } else {
      // add token to the current string buffer.
      char[] currentBuffer = getCurrentBuffer(nextToken);

      if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
        // if its the only one valid, emit it
        // if there is a longer one, wait to see if it will be matched
        // if the longer one breaks on the next token, emit this one...
        // emit the current phrase
        currentSetToCheck = remove(currentSetToCheck, currentBuffer);

        if (currentSetToCheck.size() == 0) {
          // No longer candidate remains, so the matched phrase can be emitted right away.
          emit(currentBuffer);
          lastValid = null;
          --positionIncr;
        } else {
          if (emitSingleTokens) {
            // Park the phrase; it is emitted by the lastToken check on the next call.
            lastToken = currentBuffer;
            return true;
          }
          lastValid = currentBuffer;
        }

        if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
          // get the phrase set for this token, add it to currentPhrasesTocheck
          currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
          if (currentPhrase == null) currentPhrase = new StringBuffer();
          else currentPhrase.setLength(0);
          currentPhrase.append(nextToken);
        }

        // With a pending lastValid nothing has been emitted yet, so keep consuming input.
        return (lastValid != null) ? incrementToken() : true;
      }

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentPhrasesTocheck
        // System.out.println( "starting new phrase with " + new String( nextToken ) );
        // does this add all of the set? if not need iterator loop
        CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
        Iterator<Object> phraseIt = newSet.iterator();
        while (phraseIt != null && phraseIt.hasNext()) {
          char[] phrase = (char[]) phraseIt.next();
          currentSetToCheck.add(phrase);
        }
      }

      // for each phrase in currentSetToCheck -
      // if there is a phrase prefix match, get the next token recursively
      Iterator<Object> phraseIt = currentSetToCheck.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();

        if (startsWith(phrase, currentBuffer)) {
          return incrementToken();
        }
      }

      // No candidate can still match: fall back to the longest phrase matched so far, if any.
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (!emitSingleTokens) {
        // current phrase didn't match fully: put the tokens back
        // into the unusedTokens list
        discardCharTokens(currentPhrase, unusedTokens);
        currentPhrase.setLength(0);
        currentSetToCheck = null;

        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
      currentSetToCheck = null;

      Log.debug("returning at end.");
      return incrementToken();
    }
  }
 /**
  * Tells whether the given characters form a key of {@code phraseMap}.
  *
  * @param phrase the characters to look up
  * @return {@code true} if {@code phraseMap} is set and contains {@code phrase} as a key
  */
 private boolean isPhrase(char[] phrase) {
   if (phraseMap == null) {
     return false;
   }
   return phraseMap.containsKey(phrase, 0, phrase.length);
 }
Beispiel #7
0
 /**
  * Returns an unmodifiable {@link CharArrayMap}. This makes it possible to hand out read-only
  * views of an internal map.
  *
  * <p>Empty maps are replaced by the result of {@code emptyMap()}, and a map that is already an
  * {@code UnmodifiableCharArrayMap} is returned unchanged.
  *
  * @param map a map for which the unmodifiable map is returned.
  * @return an unmodifiable {@link CharArrayMap}.
  * @throws NullPointerException if the given map is <code>null</code>.
  */
 public static <V> CharArrayMap<V> unmodifiableMap(CharArrayMap<V> map) {
   if (map == null) {
     throw new NullPointerException("Given map is null");
   }
   if (map == emptyMap() || map.isEmpty()) {
     return emptyMap();
   }
   if (map instanceof UnmodifiableCharArrayMap) {
     // Already read-only, no need to wrap it a second time.
     return map;
   }
   return new UnmodifiableCharArrayMap<>(map);
 }
Beispiel #8
0
  /**
   * Builds the stemmer dictionary from the static word lists.
   *
   * <p>Entries are added in this order, and a key that is already present is never overwritten (a
   * warning is printed to standard output instead):
   *
   * <ol>
   *   <li>{@code exceptionWords} - each word maps to {@code new DictEntry(word, true)};
   *   <li>{@code directConflations} and {@code countryNationality} - element {@code [0]} of each
   *       pair is the key, element {@code [1]} is passed to {@code new DictEntry(..., false)};
   *   <li>{@code KStemData1} .. {@code KStemData8}, {@code supplementDict} and {@code
   *       properNouns} - every word maps to one shared {@code new DictEntry(null, false)}.
   * </ol>
   *
   * @return the populated dictionary
   */
  private static CharArrayMap<DictEntry> initializeDictHash() {
    // One allocation is enough; previously the map was created twice and the first was discarded.
    CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(Version.LUCENE_31, 1000, false);

    for (int i = 0; i < exceptionWords.length; i++) {
      String word = exceptionWords[i];
      if (!d.containsKey(word)) {
        d.put(word, new DictEntry(word, true));
      } else {
        warnDuplicateEntry(word, 1);
      }
    }

    addConflations(d, directConflations, 2);
    addConflations(d, countryNationality, 3);

    // Shared by every word that needs no entry of its own.
    DictEntry defaultEntry = new DictEntry(null, false);

    // All eight KStemData lists report duplicates under the same dictionary number.
    addDefaultEntries(d, KStemData1.data, defaultEntry, 4);
    addDefaultEntries(d, KStemData2.data, defaultEntry, 4);
    addDefaultEntries(d, KStemData3.data, defaultEntry, 4);
    addDefaultEntries(d, KStemData4.data, defaultEntry, 4);
    addDefaultEntries(d, KStemData5.data, defaultEntry, 4);
    addDefaultEntries(d, KStemData6.data, defaultEntry, 4);
    addDefaultEntries(d, KStemData7.data, defaultEntry, 4);
    addDefaultEntries(d, KStemData8.data, defaultEntry, 4);

    addDefaultEntries(d, supplementDict, defaultEntry, 5);
    addDefaultEntries(d, properNouns, defaultEntry, 6);

    return d;
  }

  /** Adds each {@code pair[0] -> new DictEntry(pair[1], false)} mapping whose key is not yet present. */
  private static void addConflations(
      CharArrayMap<DictEntry> dict, String[][] pairs, int dictionaryNumber) {
    for (int i = 0; i < pairs.length; i++) {
      String key = pairs[i][0];
      if (!dict.containsKey(key)) {
        dict.put(key, new DictEntry(pairs[i][1], false));
      } else {
        warnDuplicateEntry(key, dictionaryNumber);
      }
    }
  }

  /** Maps every word that is not yet present to the given shared entry. */
  private static void addDefaultEntries(
      CharArrayMap<DictEntry> dict, String[] words, DictEntry sharedEntry, int dictionaryNumber) {
    for (int i = 0; i < words.length; i++) {
      String word = words[i];
      if (!dict.containsKey(word)) {
        dict.put(word, sharedEntry);
      } else {
        warnDuplicateEntry(word, dictionaryNumber);
      }
    }
  }

  /** Prints the duplicate-key warning for the given key and dictionary number. */
  private static void warnDuplicateEntry(String key, int dictionaryNumber) {
    System.out.println("Warning: Entry [" + key + "] already in dictionary " + dictionaryNumber);
  }
Beispiel #9
0
  /**
   * Stems the first {@code len} characters of {@code term}.
   *
   * <p>The outcome is left in the {@code result} field: after a {@code true} return, a non-null
   * {@code result} is the root taken from a dictionary entry, while a null {@code result} means
   * the stem is the current contents of {@code word}.
   *
   * @param term buffer holding the term; expected to be lowercased already (see the comment in the
   *     copy loop)
   * @param len number of characters of {@code term} to use
   * @return {@code false} if the term is not stemmed (length out of range, a dictionary entry
   *     without a root, or a character rejected by {@code isAlpha}); {@code true} otherwise
   */
  boolean stem(char[] term, int len) {

    result = null;

    // k is the index of the last character; terms of length <= 2 or >= MaxWordLen are left alone.
    k = len - 1;
    if ((k <= 1) || (k >= MaxWordLen - 1)) {
      return false; // don't stem
    }

    // first check the stemmer dictionaries, and avoid using the
    // cache if it's in there.
    DictEntry entry = dict_ht.get(term, 0, len);
    if (entry != null) {
      if (entry.root != null) {
        result = entry.root;
        return true;
      }
      // In the dictionary without a root: reported as "not changed".
      return false;
    }

    /*
     * Disabled: caching off is normally faster.
     *
     *   if (cache == null) initializeStemHash();
     *
     *   // now check the cache, before we copy chars to "word"
     *   if (cache != null) {
     *     String val = cache.get(term, 0, len);
     *     if (val != null) {
     *       if (val != SAME) { result = val; return true; }
     *       return false;
     *     }
     *   }
     */
    word.reset();
    // allocate enough space so that an expansion is never needed
    word.reserve(len + 10);
    for (int i = 0; i < len; i++) {
      char ch = term[i];
      if (!isAlpha(ch)) return false; // don't stem
      // don't lowercase... it's a requirement that lowercase filter be
      // used before this stemmer.
      word.unsafeWrite(ch);
    }

    matchedEntry = null;
    /* Disabled: lookups.clear(); lookups.add(word.toString()); */

    /*
     * This while loop will never be executed more than one time; it is here
     * only to allow the break statement to be used to escape as soon as a word
     * is recognized
     */
    while (true) {
      // YCS: extra lookup()s were inserted so we don't need to
      // do an extra wordInDict() here.
      plural();
      if (matched()) break;
      pastTense();
      if (matched()) break;
      aspect();
      if (matched()) break;
      ityEndings();
      if (matched()) break;
      nessEndings();
      if (matched()) break;
      ionEndings();
      if (matched()) break;
      erAndOrEndings();
      if (matched()) break;
      lyEndings();
      if (matched()) break;
      alEndings();
      if (matched()) break;
      // NOTE(review): the value stored in entry here is overwritten after the loop before it is
      // ever read; only the side effects of wordInDict() can matter - confirm this is intended.
      entry = wordInDict();
      iveEndings();
      if (matched()) break;
      izeEndings();
      if (matched()) break;
      mentEndings();
      if (matched()) break;
      bleEndings();
      if (matched()) break;
      ismEndings();
      if (matched()) break;
      icEndings();
      if (matched()) break;
      ncyEndings();
      if (matched()) break;
      nceEndings();
      matched();
      break;
    }

    /*
     * try for a direct mapping (allows for cases like `Italian'->`Italy' and
     * `Italians'->`Italy')
     */
    entry = matchedEntry;
    if (entry != null) {
      result = entry.root; // may be null, which means that "word" is the stem
    }

    /*
     * Disabled: caching off is normally faster.
     *
     *   if (cache != null && cache.size() < maxCacheSize) {
     *     char[] key = new char[len];
     *     System.arraycopy(term, 0, key, 0, len);
     *     if (result != null) { cache.put(key, result); }
     *     else { cache.put(key, word.toString()); }
     *   }
     */

    /*
     * Disabled debugging output:
     *
     *   if (entry == null) {
     *     if (!word.toString().equals(new String(term,0,len))) {
     *       System.out.println("CASE:" + word.toString() + "," + new String(term,0,len));
     *     }
     *   }
     */

    // no entry matched means result is "word"
    return true;
  }