Java DictionaryMetadata примеры использования

Язык программирования: Java

Пространство имен/Пакет: morfologik.stemming

Класс/Тип: DictionaryMetadata

Примеров на hotexamples.com: 12

Java DictionaryMetadata — найдено 12 примеров. Это лучшие примеры Java-кода для morfologik.stemming.DictionaryMetadata, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

isConvertingCase(4)

getInputConversionPairs(3)

getLocale(3)

getSeparator(3)

getSeparatorAsChar(2)

getOutputConversionPairs(2)

isIgnoringCamelCase(1)

isIgnoringPunctuation(1)

isIgnoringNumbers(1)

isIgnoringDiacritics(1)

getDecoder(1)

isIgnoringAllUppercase(1)

isFrequencyIncluded(1)

getEncoder(1)

getReplacementPairs(1)

getEquivalentChars(1)

isSupportingRunOnWords(1)

Пример #1

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

  /**
   * Looks up the frequency code stored for a word form, using the first dictionary entry found
   * for that form.
   *
   * @param word the word form to look up
   * @return frequency value in range: 0..FREQ_RANGE-1 (0: less frequent); also 0 when the
   *     dictionary carries no frequency data, the word cannot be encoded, or no entry is found
   */
  public int getFrequency(final CharSequence word) {
    if (!dictionaryMetadata.isFrequencyIncluded()) {
      return 0;
    }

    final byte separatorByte = dictionaryMetadata.getSeparator();
    try {
      byteBuffer = charSequenceToBytes(word);
    } catch (UnmappableInputException ignored) {
      // A word that cannot be encoded cannot be looked up, so report the lowest frequency.
      return 0;
    }

    final MatchResult lookup =
        matcher.match(matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);
    if (lookup.kind != SEQUENCE_IS_A_PREFIX) {
      return 0;
    }

    // The word must be followed by a separator arc that leads further into the automaton.
    final int separatorArc = fsa.getArc(lookup.node, separatorByte);
    if (separatorArc == 0 || fsa.isArcFinal(separatorArc)) {
      return 0;
    }

    finalStatesIterator.restartFrom(fsa.getEndNode(separatorArc));
    if (!finalStatesIterator.hasNext()) {
      return 0;
    }

    // Only the first sequence after the separator is consulted; its last byte is the
    // frequency code.
    final ByteBuffer firstEntry = finalStatesIterator.next();
    final byte[] entryBytes = firstEntry.array();
    final int lastIndex = firstEntry.remaining() - 1;
    return entryBytes[lastIndex] - FIRST_RANGE_CODE;
  }

Пример #2

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

 /**
  * Decides whether two characters count as equal for this dictionary. (By Jaume Ortola.)
  *
  * <p>Characters are equal when they are identical, or when the dictionary metadata lists
  * {@code y} among the equivalents of {@code x}. If the dictionary ignores diacritics, the
  * first characters of the NFD-normalized forms are compared instead, and, if the dictionary
  * also converts case, that comparison is repeated on lowercase versions when exactly one of
  * the two is lowercase.
  *
  * @param x the first character
  * @param y the second character
  * @return true if the characters are considered equal
  */
 private boolean areEqual(final char x, final char y) {
   if (x == y) {
     return true;
   }

   final boolean listedAsEquivalent =
       dictionaryMetadata.getEquivalentChars() != null
           && dictionaryMetadata.getEquivalentChars().containsKey(x)
           && dictionaryMetadata.getEquivalentChars().get(x).contains(y);
   if (listedAsEquivalent) {
     return true;
   }

   if (!dictionaryMetadata.isIgnoringDiacritics()) {
     return false;
   }

   // Compare the first characters of the canonical decompositions.
   final char baseX = Normalizer.normalize(Character.toString(x), Form.NFD).charAt(0);
   final char baseY = Normalizer.normalize(Character.toString(y), Form.NFD).charAt(0);
   if (baseX == baseY) { // avoid case conversion, if possible
     return true;
   }

   // Single characters only, so the per-character case methods are a cheap substitute for
   // String lowercasing; they are used only when the two characters differ in case.
   if (dictionaryMetadata.isConvertingCase()
       && Character.isLetter(baseX)
       && Character.isLowerCase(baseX) != Character.isLowerCase(baseY)) {
     return Character.toLowerCase(baseX) == Character.toLowerCase(baseY);
   }

   // The base characters are already known to differ at this point.
   return false;
 }

Пример #3

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

  /**
   * Creates a speller backed by the given dictionary.
   *
   * <p>NOTE(review): {@code dictionary} is dereferenced right away, so a {@code null} argument
   * fails with a {@code NullPointerException} before any of the explicit checks below run.
   *
   * @param dictionary the dictionary whose automaton ({@code fsa}) and {@code metadata} are used
   * @param editDistance the edit distance stored in this speller and passed to {@code HMatrix}
   *     (presumably the maximum distance allowed for suggestions -- TODO confirm)
   * @throws IllegalArgumentException if the automaton's root node is 0 or the dictionary has no
   *     metadata
   */
  public Speller(final Dictionary dictionary, final int editDistance) {
    this.editDistance = editDistance;
    this.hMatrix = new HMatrix(editDistance, MAX_WORD_LENGTH);

    // Cache the dictionary parts; the traversal helper and the iterator share the same automaton.
    this.dictionaryMetadata = dictionary.metadata;
    this.rootNode = dictionary.fsa.getRootNode();
    this.fsa = dictionary.fsa;
    this.matcher = new FSATraversal(fsa);
    this.finalStatesIterator = new ByteSequenceIterator(fsa, rootNode);

    // Validation happens after the fields above are assigned.
    if (rootNode == 0) {
      throw new IllegalArgumentException("Dictionary must have at least the root node.");
    }

    if (dictionaryMetadata == null) {
      throw new IllegalArgumentException("Dictionary metadata must not be null.");
    }

    encoder = dictionaryMetadata.getEncoder();
    decoder = dictionaryMetadata.getDecoder();

    // Multibyte separator will result in an exception here.
    // The returned value is discarded; the call is made only for that check.
    dictionaryMetadata.getSeparatorAsChar();

    this.createReplacementsMaps();
  }

Пример #4

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

 /**
  * Tells whether the node reached through {@code arc} has an outgoing, non-terminal arc
  * labeled with the dictionary separator. Always false while {@code containsSeparators} is
  * not set.
  *
  * @param arc the arc whose end node is inspected
  * @return true if a non-terminal separator arc leaves the end node of {@code arc}
  */
 private boolean isBeforeSeparator(final int arc) {
   if (!containsSeparators) {
     return false;
   }
   final int targetNode = fsa.getEndNode(arc);
   final int separatorArc = fsa.getArc(targetNode, dictionaryMetadata.getSeparator());
   return separatorArc != 0 && !fsa.isArcTerminal(separatorArc);
 }

Пример #5

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

 /**
  * Propose suggestions for misspelled run-on words. This algorithm is inspired by spell.cc in
  * s_fsa package by Jan Daciuk.
  *
  * <p>The word is split at every inner position; a split is suggested when both halves are
  * found in the dictionary. Suggestions are only produced when the (input-converted) word is
  * not itself in the dictionary and the dictionary supports run-on words. Output conversion
  * pairs, when the dictionary defines any, are applied to each suggestion.
  *
  * @param original The original misspelled word.
  * @return The list of suggested pairs, as space-concatenated strings; empty if there are none.
  */
 public List<String> replaceRunOnWords(final String original) {
   final List<String> candidates = new ArrayList<String>();
   if (!isInDictionary(
           DictionaryLookup.applyReplacements(
               original, dictionaryMetadata.getInputConversionPairs()))
       && dictionaryMetadata.isSupportingRunOnWords()) {
     for (int i = 1; i < original.length(); i++) {
       // chop from left to right
       final CharSequence firstCh = original.subSequence(0, i);
       final CharSequence rest = original.subSequence(i, original.length());
       if (isInDictionary(firstCh) && isInDictionary(rest)) {
         final String suggestion = firstCh + " " + rest;
         // Convert the suggestion only when there are output conversion pairs to apply.
         // (The previous condition was inverted and skipped the conversion exactly when
         // pairs were defined.)
         if (dictionaryMetadata.getOutputConversionPairs().isEmpty()) {
           candidates.add(suggestion);
         } else {
           candidates.add(
               DictionaryLookup.applyReplacements(
                       suggestion, dictionaryMetadata.getOutputConversionPairs())
                   .toString());
         }
       }
     }
   }
   return candidates;
 }

Пример #6

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

 /**
  * Decides whether a word is misspelled, running a sequence of checks driven by the
  * dictionary's properties. Input conversion pairs, if any, are applied to the word first.
  *
  * <p>An empty word is never reported as misspelled.
  *
  * <p>If the flag <code>fsa.dict.speller.ignore-punctuation</code> is set, a single
  * non-alphabetic character is considered to be correctly spelled.
  *
  * <p>If the flag <code>fsa.dict.speller.ignore-numbers</code> is set, words containing decimal
  * digits are considered to be correctly spelled.
  *
  * <p>If the flag <code>fsa.dict.speller.ignore-camel-case</code> is set, CamelCase words are
  * considered to be correctly spelled.
  *
  * <p>If the flag <code>fsa.dict.speller.ignore-all-uppercase</code> is set, alphabetic words
  * composed of only uppercase characters are considered to be correctly spelled.
  *
  * <p>Otherwise the word is looked up in the dictionary. If it is not found and the dictionary
  * performs no case conversions (<code>fsa.dict.speller.convert-case</code>), the word is
  * misspelled. With case conversions, a non-mixed-case word is accepted when its lowercase
  * version is in the dictionary, and an all-uppercase word is also accepted when the form
  * produced by {@code initialUppercase} is in the dictionary.
  *
  * @param word - the word to be checked
  * @return true if the word is misspelled
  */
 public boolean isMisspelled(final String word) {
   // dictionaries usually do not contain punctuation
   String wordToCheck = word;
   if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
     wordToCheck =
         DictionaryLookup.applyReplacements(word, dictionaryMetadata.getInputConversionPairs());
   }

   // Only a one-character word can fail this test; longer words are treated as alphabetic.
   final boolean alphabetic = wordToCheck.length() != 1 || isAlphabetic(wordToCheck.charAt(0));

   if (wordToCheck.length() == 0) {
     return false;
   }
   if (dictionaryMetadata.isIgnoringPunctuation() && !alphabetic) {
     return false;
   }
   if (dictionaryMetadata.isIgnoringNumbers() && !containsNoDigit(wordToCheck)) {
     return false;
   }
   if (dictionaryMetadata.isIgnoringCamelCase() && isCamelCase(wordToCheck)) {
     return false;
   }
   if (dictionaryMetadata.isIgnoringAllUppercase() && alphabetic && isAllUppercase(wordToCheck)) {
     return false;
   }
   if (isInDictionary(wordToCheck)) {
     return false;
   }

   // Not found as written: without case conversion that is final.
   if (!dictionaryMetadata.isConvertingCase()) {
     return true;
   }
   // Mixed-case words get no second chance through case conversion.
   if (isMixedCase(wordToCheck)) {
     return true;
   }
   if (isInDictionary(wordToCheck.toLowerCase(dictionaryMetadata.getLocale()))) {
     return false;
   }
   final boolean foundWithInitialUppercase =
       isAllUppercase(wordToCheck) && isInDictionary(initialUppercase(wordToCheck));
   return !foundWithInitialUppercase;
 }

Пример #7

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

  /**
   * Checks whether a word is present in the dictionary.
   *
   * <p>An exact match counts as found and resets the {@code containsSeparators} flag. While
   * that flag is set, a non-empty word that is a prefix of a dictionary sequence and is
   * followed by a separator arc also counts as found.
   *
   * @param word the word to be tested
   * @return True if it is found; false otherwise, including when the word cannot be encoded.
   */
  public boolean isInDictionary(final CharSequence word) {
    try {
      byteBuffer = charSequenceToBytes(word);
    } catch (UnmappableInputException ignored) {
      // A word that cannot be encoded cannot be in the dictionary.
      return false;
    }

    // Walk the automaton as far as the encoded word allows.
    final MatchResult lookup =
        matcher.match(matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);

    if (lookup.kind == EXACT_MATCH) {
      containsSeparators = false;
      return true;
    }

    if (!containsSeparators
        || lookup.kind != SEQUENCE_IS_A_PREFIX
        || byteBuffer.remaining() <= 0) {
      return false;
    }
    return fsa.getArc(lookup.node, dictionaryMetadata.getSeparator()) != 0;
  }

Пример #8

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

 /**
  * Distributes the dictionary's replacement pairs over three lookup maps, depending on the
  * length of the replacement target:
  *
  * <ul>
  *   <li>one-character targets go to {@code replacementsAnyToOne}, keyed by that character;
  *   <li>two-character targets go to {@code replacementsAnyToTwo}, keyed by the target string;
  *   <li>all other targets go to {@code replacementsTheRest}, keyed by the pair's source.
  * </ul>
  */
 private void createReplacementsMaps() {
   for (Map.Entry<String, List<String>> pair :
       dictionaryMetadata.getReplacementPairs().entrySet()) {
     final String source = pair.getKey();
     for (String target : pair.getValue()) {
       switch (target.length()) {
         case 1: {
           // any-to-one: indexed by the single target character
           final char key = target.charAt(0);
           if (replacementsAnyToOne.containsKey(key)) {
             replacementsAnyToOne.get(key).add(source.toCharArray());
           } else {
             List<char[]> sources = new ArrayList<char[]>();
             sources.add(source.toCharArray());
             replacementsAnyToOne.put(key, sources);
           }
           break;
         }
         case 2: {
           // any-to-two: indexed by the two-character target
           if (replacementsAnyToTwo.containsKey(target)) {
             replacementsAnyToTwo.get(target).add(source.toCharArray());
           } else {
             List<char[]> sources = new ArrayList<char[]>();
             sources.add(source.toCharArray());
             replacementsAnyToTwo.put(target, sources);
           }
           break;
         }
         default: {
           // everything else: indexed by the source, listing its targets
           if (replacementsTheRest.containsKey(source)) {
             replacementsTheRest.get(source).add(target);
           } else {
             List<String> targets = new ArrayList<String>();
             targets.add(target);
             replacementsTheRest.put(source, targets);
           }
           break;
         }
       }
     }
   }
 }

Пример #9

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

 /**
  * Reports whether the underlying dictionary is configured to convert case, as stated by its
  * metadata.
  *
  * @return true if the dictionary metadata says case conversion is enabled
  * @since 1.9
  */
 public boolean convertsCase() {
   final boolean caseConversionEnabled = dictionaryMetadata.isConvertingCase();
   return caseConversionEnabled;
 }

Пример #10

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

 /**
  * Tells whether {@code arc} is non-terminal and, while {@code containsSeparators} is set,
  * the candidate character at {@code candIndex} is not the dictionary separator.
  *
  * @param arc the arc to test
  * @param candIndex index into the {@code candidate} buffer of the character to compare
  * @return true if the arc is not terminal and the candidate character is not a separator
  */
 private boolean isArcNotTerminal(final int arc, final int candIndex) {
   if (fsa.isArcTerminal(arc)) {
     return false;
   }
   if (!containsSeparators) {
     return true;
   }
   return candidate[candIndex] != dictionaryMetadata.getSeparatorAsChar();
 }

Пример #11

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

  /**
   * Find suggestions by using K. Oflazer's algorithm. See Jan Daciuk's s_fsa package, spell.cc for
   * further explanation.
   *
   * <p>The word is first passed through the dictionary's input conversion pairs (if any).
   * Suggestions are searched only for a non-empty word shorter than {@code MAX_WORD_LENGTH} that
   * is not already in the dictionary. Variants produced by {@code getAllReplacements} are tried
   * first; the {@code findRepl} search runs only when none of them is a dictionary word. The
   * result is sorted, de-duplicated and passed through the output conversion pairs.
   *
   * <p>NOTE(review): this method rewrites several instance fields ({@code candidates},
   * {@code wordProcessed}, {@code candidate}, the buffers, ...), so it presumably must not be
   * called concurrently on one instance -- confirm.
   *
   * @param w The original misspelled word.
   * @return A list of suggested replacements.
   */
  public List<String> findReplacements(final String w) {
    String word = w;
    if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
      word = DictionaryLookup.applyReplacements(w, dictionaryMetadata.getInputConversionPairs());
    }
    // Start from a clean candidate list on every call.
    candidates.clear();
    if (word.length() > 0 && word.length() < MAX_WORD_LENGTH && !isInDictionary(word)) {
      // Variants that still need the edit-distance search below.
      List<String> wordsToCheck = new ArrayList<String>();
      if (replacementsTheRest != null && word.length() > MIN_WORD_LENGTH) {
        for (final String wordChecked : getAllReplacements(word, 0, 0)) {
          boolean found = false;
          if (isInDictionary(wordChecked)) {
            // Direct hit: recorded with the value 0 (presumably its distance -- confirm).
            candidates.add(new CandidateData(wordChecked, 0));
            found = true;
          } else if (dictionaryMetadata.isConvertingCase()) {
            // Try the lowercase, uppercase and first-letter-uppercase forms of the variant.
            String lowerWord = wordChecked.toLowerCase(dictionaryMetadata.getLocale());
            String upperWord = wordChecked.toUpperCase(dictionaryMetadata.getLocale());
            if (isInDictionary(lowerWord)) {
              // add the word as it is in the dictionary, not mixed-case versions of it
              candidates.add(new CandidateData(lowerWord, 0));
              found = true;
            }
            if (isInDictionary(upperWord)) {
              candidates.add(new CandidateData(upperWord, 0));
              found = true;
            }
            if (lowerWord.length() > 1) {
              String firstupperWord =
                  Character.toUpperCase(lowerWord.charAt(0)) + lowerWord.substring(1);
              if (isInDictionary(firstupperWord)) {
                candidates.add(new CandidateData(firstupperWord, 0));
                found = true;
              }
            }
          }
          if (!found) {
            wordsToCheck.add(wordChecked);
          }
        }
      } else {
        // No replacement variants are generated: search on the word itself.
        wordsToCheck.add(word);
      }

      // If at least one candidate was found with the replacement pairs (which are usual errors),
      // probably there is no need for more candidates
      if (candidates.isEmpty()) {
        // The counter is incremented before it is tested, so it is 2 for the first word.
        int i = 1;
        for (final String wordChecked : wordsToCheck) {
          i++;
          if (i > UPPER_SEARCH_LIMIT) { // for performance reasons, do not search too deeply
            break;
          }
          wordProcessed = wordChecked.toCharArray();
          wordLen = wordProcessed.length;
          // Short words are searched only when they come first in the list.
          if (wordLen < MIN_WORD_LENGTH
              && i > 2) { // three-letter replacements make little sense anyway
            break;
          }
          // Reset the per-word search state before calling findRepl.
          candidate = new char[MAX_WORD_LENGTH];
          candLen = candidate.length;
          // For words no longer than editDistance, cap the distance at wordLen - 1.
          effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance;
          charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, MAX_WORD_LENGTH);
          byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, MAX_WORD_LENGTH);
          final byte[] prevBytes = new byte[0];
          // Presumably findRepl adds what it finds to `candidates` -- its body is not shown here.
          findRepl(0, fsa.getRootNode(), prevBytes, 0, 0);
        }
      }
    }

    Collections.sort(candidates);

    // Use a linked set to avoid duplicates and preserve the ordering of candidates.
    final Set<String> candStringSet = new LinkedHashSet<String>();
    for (final CandidateData cd : candidates) {
      candStringSet.add(
          DictionaryLookup.applyReplacements(
                  cd.getWord(), dictionaryMetadata.getOutputConversionPairs())
              .toString());
    }
    // Copy the unique suggestions into a list for the caller.
    final List<String> candStringList = new ArrayList<String>(candStringSet.size());
    candStringList.addAll(candStringSet);
    return candStringList;
  }

Пример #12

0

Показать файл

Файл: Speller.java Проект: jaumeortola/morfologik-stemming

 /**
  * Builds a form of the word that keeps its first character as it is and lowercases the rest
  * using the dictionary's locale. The first character is not changed, so the result has an
  * initial uppercase letter only if the input already starts with one.
  *
  * @param wordToCheck the word to convert; must not be empty
  * @return the first character followed by the lowercased remainder
  */
 private CharSequence initialUppercase(final String wordToCheck) {
   final String head = wordToCheck.substring(0, 1);
   final String tail = wordToCheck.substring(1);
   return head + tail.toLowerCase(dictionaryMetadata.getLocale());
 }