Exemplo n.º 1
0
  /**
   * Renders the deterministic FSM as a directed graph in GML.
   *
   * <p>Every state in {@code fsmStates} contributes one {@code node} entry whose id and label
   * are the state's index; a final state's label additionally carries {@code ,F} followed by
   * the state's lookup set. The edge text supplied by each state's {@code getEdgesGML()} is
   * collected separately and emitted after all node entries.
   *
   * @return the GML text describing the whole graph
   */
  public String getFSMgml() {
    StringBuilder nodeSection = new StringBuilder(gate.Gate.STRINGBUFFER_SIZE);
    StringBuilder edgeSection = new StringBuilder(gate.Gate.STRINGBUFFER_SIZE);

    for (Iterator stateIter = fsmStates.iterator(); stateIter.hasNext(); ) {
      FSMState state = (FSMState) stateIter.next();
      int id = state.getIndex();

      nodeSection.append("node[ id ").append(id).append(" label \"").append(id);
      if (state.isFinal()) {
        // ",F\\n" puts a literal backslash followed by 'n' into the label, not a line break
        nodeSection.append(",F\\n").append(state.getLookupSet());
      }
      nodeSection.append("\"  ]\n");

      edgeSection.append(state.getEdgesGML());
    }

    // header, then all nodes, then all edges, then the closing bracket
    return "graph[ \ndirected 1\n" + nodeSection + edgeSection + "]\n";
  } // getFSMgml
Exemplo n.º 2
0
  /**
   * Runs the gazetteer over the current document. It assumes that all the needed parameters are
   * set; if they are not, an exception will be fired.
   *
   * <p>The document content is fed to the FSM one character at a time, starting from {@code
   * initialState}. Characters for which {@code Character.isSpaceChar} or {@code
   * Character.isWhitespace} holds are replaced by a plain space; all other characters are
   * upper-cased unless {@code caseSensitive} is true. Whenever the FSM cannot advance, or the end
   * of the content is reached, the last recorded match (if any) is passed to {@code
   * createLookups} and matching restarts one character after the previous start position, so
   * every position in the content is tried as a match start.
   *
   * <p>When {@code wholeWordsOnly} is true, a final state only counts as a match if the
   * character before the match start and the character after the current index are both absent
   * or rejected by {@code isWordInternal}. When {@code longestMatchOnly} is false, a previously
   * recorded shorter match is also passed to {@code createLookups} each time a longer one is
   * found.
   *
   * <p>Results go to the annotation set named by {@code annotationSetName}, or to the document's
   * default annotation set when that name is null or empty.
   *
   * @throws ExecutionException if no document has been set
   * @throws ExecutionInterruptedException if {@code isInterrupted()} returns true at one of the
   *     periodic progress checks
   */
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    // check the input
    if (document == null) {
      throw new ExecutionException("No document to process!");
    }

    if (annotationSetName == null || annotationSetName.equals("")) {
      annotationSet = document.getAnnotations();
    } else {
      annotationSet = document.getAnnotations(annotationSetName);
    }

    fireStatusChanged("Performing look-up in " + document.getName() + "...");
    String content = document.getContent().toString();
    int length = content.length();
    char currentChar;
    FSMState currentState = initialState;
    FSMState nextState;
    FSMState lastMatchingState = null;
    int matchedRegionEnd = 0;
    int matchedRegionStart = 0;
    int charIdx = 0;
    int oldCharIdx = 0;

    while (charIdx < length) {
      currentChar = content.charAt(charIdx);

      // normalise the input: any kind of space becomes ' ', and case is folded when the
      // gazetteer is not case sensitive
      if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) {
        currentChar = ' ';
      } else {
        currentChar =
            caseSensitive.booleanValue() ? currentChar : Character.toUpperCase(currentChar);
      }

      nextState = currentState.next(currentChar);
      if (nextState == null) {
        // the matching stopped
        // if we had a successful match then act on it;
        if (lastMatchingState != null) {
          createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
          lastMatchingState = null;
        }
        // reset the FSM; every position is tried as a start in turn (first 0, then 1, then 2, ...)
        charIdx = matchedRegionStart + 1;
        matchedRegionStart = charIdx;

        currentState = initialState;
      } else { // go on with the matching
        currentState = nextState;
        // if we have a successful state then store it; in whole-words mode the match must not be
        // preceded or followed by a word-internal character
        if (currentState.isFinal()
            && ((!wholeWordsOnly.booleanValue())
                || ((matchedRegionStart == 0
                        || !isWordInternal(content.charAt(matchedRegionStart - 1)))
                    && (charIdx + 1 >= length
                        || !isWordInternal(content.charAt(charIdx + 1)))))) {
          // we have a new match
          // if we had an existing match and we need to annotate prefixes, then
          // apply it
          if (!longestMatchOnly && lastMatchingState != null) {
            createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
          }
          matchedRegionEnd = charIdx;
          lastMatchingState = currentState;
        }
        charIdx++;
        if (charIdx == length) {
          // we can't go on, use the last matching state and restart matching
          // from the next char
          if (lastMatchingState != null) {
            // let's add the new annotation(s)
            createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
            lastMatchingState = null;
          }
          // reset the FSM
          charIdx = matchedRegionStart + 1;
          matchedRegionStart = charIdx;
          currentState = initialState;
        }
      }
      // fire the progress event once the index has moved more than 256 characters past the last
      // reported position
      if (charIdx - oldCharIdx > 256) {
        // long arithmetic: 100 * charIdx overflows int for content longer than ~21 million chars
        fireProgressChanged((int) ((100L * charIdx) / length));
        oldCharIdx = charIdx;
        if (isInterrupted())
          throw new ExecutionInterruptedException(
              "The execution of the " + getName() + " gazetteer has been abruptly interrupted!");
      }
    } // while(charIdx < length)
    // we've finished. If we had a stored match, then apply it.
    // (defensive: both restart paths inside the loop already clear lastMatchingState)
    if (lastMatchingState != null) {
      createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
    }
    fireProcessFinished();
    fireStatusChanged("Look-up complete!");
  } // execute