Exemplo n.º 1
0
  /**
   * Registers a phrase with this gazetteer by threading it, character by character, through the
   * finite state machine rooted at {@code initialState} and attaching {@code lookup} to the state
   * reached at the end of the phrase.
   *
   * <p>Every character for which {@link Character#isSpaceChar(char)} or
   * {@link Character#isWhitespace(char)} holds is stored as a plain {@code ' '}. All other
   * characters are upper-cased unless {@code caseSensitive} is true.
   *
   * @param text the phrase to be added
   * @param lookup the description of the annotation to be added when this phrase is recognised
   */
  public void addLookup(String text, Lookup lookup) {
    FSMState state = initialState;

    for (int pos = 0; pos < text.length(); pos++) {
      char symbol = text.charAt(pos);
      final boolean whitespace = Character.isSpaceChar(symbol) || Character.isWhitespace(symbol);

      if (whitespace) {
        symbol = ' ';
      } else if (!caseSensitive.booleanValue()) {
        symbol = Character.toUpperCase(symbol);
      }

      FSMState target = state.next(symbol);
      if (target == null) {
        // No transition for this symbol yet: grow the automaton by one state.
        target = new FSMState(this);
        state.put(symbol, target);
        if (whitespace) {
          // Self-loop on ' ' so that a run of consecutive spaces stays in the same state.
          target.put(' ', target);
        }
      }
      state = target;
    }

    state.addLookup(lookup);
  } // addLookup
Exemplo n.º 2
0
  /**
   * Removes one phrase from the list of phrases recognised by this gazetteer.
   *
   * <p>The phrase is normalised the same way {@code addLookup} normalises it before insertion:
   * every space/whitespace character becomes {@code ' '} and, unless {@code caseSensitive} is
   * true, every other character is upper-cased. Without that folding a phrase added in
   * case-insensitive mode could only be removed by passing its upper-cased form.
   *
   * <p>If the FSM has no path for the phrase the method returns without doing anything.
   *
   * @param text the phrase to be removed
   * @param lookup the description of the annotation associated to this phrase
   */
  public void removeLookup(String text, Lookup lookup) {
    FSMState currentState = initialState;

    for (int i = 0; i < text.length(); i++) {
      char currentChar = text.charAt(i);
      if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) {
        currentChar = ' ';
      } else if (!caseSensitive.booleanValue()) {
        // Mirror the case folding applied when the phrase was stored.
        currentChar = Character.toUpperCase(currentChar);
      }
      FSMState nextState = currentState.next(currentChar);
      if (nextState == null) {
        return; // nothing to remove
      }
      currentState = nextState;
    } // for(int i = 0; i< text.length(); i++)
    currentState.removeLookup(lookup);
  } // removeLookup
Exemplo n.º 3
0
  /**
   * Drops every lookup attached to the given phrase.
   *
   * <p>The phrase is normalised the same way {@code addLookup} normalises it before insertion:
   * every space/whitespace character becomes {@code ' '} and, unless {@code caseSensitive} is
   * true, every other character is upper-cased. The FSM states themselves are left in place; only
   * the lookup set of the state reached at the end of the phrase is replaced with an empty set.
   *
   * @param singleItem the phrase whose lookups should be discarded
   * @return {@code false} if the FSM has no path for the phrase; {@code true} otherwise, including
   *     when the reached state held no lookups to begin with
   */
  @Override
  public boolean remove(String singleItem) {
    FSMState currentState = initialState;

    for (int i = 0; i < singleItem.length(); i++) {
      char currentChar = singleItem.charAt(i);
      if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) {
        currentChar = ' ';
      } else if (!caseSensitive.booleanValue()) {
        // Mirror the case folding applied when the phrase was stored.
        currentChar = Character.toUpperCase(currentChar);
      }
      FSMState nextState = currentState.next(currentChar);
      if (nextState == null) {
        return false; // nothing to remove
      }
      currentState = nextState;
    } // for(int i = 0; i< singleItem.length(); i++)
    currentState.lookupSet = new HashSet();
    return true;
  }
Exemplo n.º 4
0
  /**
   * Looks up a single phrase in the gazetteer.
   *
   * <p>The phrase is normalised the same way {@code addLookup} normalises it before insertion:
   * every space/whitespace character becomes {@code ' '} and, unless {@code caseSensitive} is
   * true, every other character is upper-cased. Without that folding a case-insensitive gazetteer
   * would only answer queries that are already upper-cased.
   *
   * @param singleItem a single string to be looked up by the gazetteer
   * @return a new empty set if the FSM has no path for the phrase; otherwise whatever
   *     {@code getLookupSet()} returns for the state reached at the end of the phrase
   *     (NOTE(review): that looks like the state's own set rather than a copy, and may be
   *     {@code null} for states without lookups - confirm against {@code FSMState})
   */
  @Override
  public Set lookup(String singleItem) {
    FSMState currentState = initialState;

    for (int i = 0; i < singleItem.length(); i++) {
      char currentChar = singleItem.charAt(i);
      if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) {
        currentChar = ' ';
      } else if (!caseSensitive.booleanValue()) {
        // Mirror the case folding applied when the phrase was stored.
        currentChar = Character.toUpperCase(currentChar);
      }
      FSMState nextState = currentState.next(currentChar);
      if (nextState == null) {
        // The phrase is not in the automaton at all.
        return new HashSet();
      }
      currentState = nextState;
    } // for(int i = 0; i< singleItem.length(); i++)
    return currentState.getLookupSet();
  }
Exemplo n.º 5
0
  /**
   * Runs the gazetteer over the current document.
   *
   * <p>The document content is scanned with the finite state machine rooted at
   * {@code initialState}. A match attempt is started at every character offset in turn; each time
   * an attempt ends (no transition, or end of content) the pending match, if any, is handed to
   * {@code createLookups} and scanning restarts one character after the previous start offset.
   * Space/whitespace characters are treated as {@code ' '} and, unless {@code caseSensitive} is
   * true, all other characters are upper-cased before the transition is looked up.
   *
   * <p>Results go to the document's default annotation set when {@code annotationSetName} is
   * {@code null} or empty, and to the named set otherwise.
   *
   * @throws ExecutionException if no document has been set
   * @throws ExecutionInterruptedException if {@code isInterrupted()} reports true; this is polled
   *     only when a progress event is fired
   */
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    // check the input
    if (document == null) {
      throw new ExecutionException("No document to process!");
    }

    if (annotationSetName == null || annotationSetName.equals("")) {
      annotationSet = document.getAnnotations();
    } else {
      annotationSet = document.getAnnotations(annotationSetName);
    }

    fireStatusChanged("Performing look-up in " + document.getName() + "...");
    String content = document.getContent().toString();
    int length = content.length();
    char currentChar;
    FSMState currentState = initialState;
    FSMState nextState;
    // Most recent accepting state seen in the current attempt, or null if none yet.
    FSMState lastMatchingState = null;
    // Index of the last character of the pending match (inclusive).
    int matchedRegionEnd = 0;
    // Offset at which the current match attempt started.
    int matchedRegionStart = 0;
    int charIdx = 0;
    // Value of charIdx when the last progress event was fired.
    int oldCharIdx = 0;
    FeatureMap fm;
    Lookup currentLookup;

    while (charIdx < length) {
      currentChar = content.charAt(charIdx);

      // Normalise exactly as addLookup does so document text and stored phrases line up.
      if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) {
        currentChar = ' ';
      } else {
        currentChar =
            caseSensitive.booleanValue() ? currentChar : Character.toUpperCase(currentChar);
      }

      nextState = currentState.next(currentChar);
      if (nextState == null) {
        // the matching stopped
        // if we had a successful match then act on it;
        if (lastMatchingState != null) {
          createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
          lastMatchingState = null;
        }
        // reset the FSM (every start position is tried in turn: first offset 0, then 1, then 2,
        // ...)
        charIdx = matchedRegionStart + 1;
        matchedRegionStart = charIdx;

        currentState = initialState;
      } else { // go on with the matching
        currentState = nextState;
        // if we have a successful state then store it
        // With wholeWordsOnly set, the match only counts when neither the character just before
        // the region start nor the character just after the current one is word-internal.
        if (currentState.isFinal()
            && ((!wholeWordsOnly.booleanValue())
                || ((matchedRegionStart == 0
                        || !isWordInternal(content.charAt(matchedRegionStart - 1)))
                    && (charIdx + 1 >= content.length()
                        || !isWordInternal(content.charAt(charIdx + 1)))))) {
          // we have a new match
          // if we had an existing match and we need to annotate prefixes, then
          // apply it
          if (!longestMatchOnly && lastMatchingState != null) {
            createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
          }
          matchedRegionEnd = charIdx;
          lastMatchingState = currentState;
        }
        charIdx++;
        if (charIdx == content.length()) {
          // we can't go on, use the last matching state and restart matching
          // from the next char
          if (lastMatchingState != null) {
            // let's add the new annotation(s)
            createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
            lastMatchingState = null;
          }
          // reset the FSM
          charIdx = matchedRegionStart + 1;
          matchedRegionStart = charIdx;
          currentState = initialState;
        }
      }
      // fire the progress event
      if (charIdx - oldCharIdx > 256) {
        // Compute the percentage in long arithmetic: 100 * charIdx overflows int once the
        // document is longer than roughly 21 million characters.
        fireProgressChanged((int) ((100L * charIdx) / length));
        oldCharIdx = charIdx;
        if (isInterrupted())
          throw new ExecutionInterruptedException(
              "The execution of the " + getName() + " gazetteer has been abruptly interrupted!");
      }
    } // while(charIdx < length)
    // we've finished. If we had a stored match, then apply it.
    // Safety net: both reset paths inside the loop already flush and clear the pending match, so
    // this is not expected to fire in practice.
    if (lastMatchingState != null) {
      createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
    }
    fireProcessFinished();
    fireStatusChanged("Look-up complete!");
  } // execute