예제 #1
0
  /**
   * Removes one phrase from the list of phrases recognised by this gazetteer.
   *
   * <p>The FSM is walked from the initial state, one character of {@code text} at a time. Any
   * character for which {@link Character#isSpaceChar(char)} or
   * {@link Character#isWhitespace(char)} is true is treated as a plain space. If the walk hits a
   * missing transition, the method returns without changing anything.
   *
   * @param text the phrase to be removed
   * @param lookup the description of the annotation associated to this phrase
   */
  public void removeLookup(String text, Lookup lookup) {
    FSMState currentState = initialState;

    for (int i = 0; i < text.length(); i++) {
      char currentChar = text.charAt(i);
      if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) {
        currentChar = ' ';
      }
      // NOTE(review): addLookup upper-cases non-space characters when caseSensitive is false,
      // but this walk uses the characters as given - confirm callers pass text in stored case.
      FSMState nextState = currentState.next(currentChar);
      if (nextState == null) {
        return; // nothing to remove
      }
      currentState = nextState;
    }
    currentState.removeLookup(lookup);
  } // removeLookup
예제 #2
0
  /**
   * Discards every lookup stored for a single item.
   *
   * <p>The FSM is walked from the initial state, one character of {@code singleItem} at a time,
   * with space/whitespace characters treated as a plain space. The state reached at the end of
   * the walk gets its lookup set replaced by a new, empty set.
   *
   * @param singleItem the item whose lookups are to be discarded
   * @return {@code false} if the walk hits a missing transition (nothing to remove);
   *     {@code true} once the reached state's lookup set has been replaced
   */
  @Override
  public boolean remove(String singleItem) {
    FSMState currentState = initialState;

    for (int i = 0; i < singleItem.length(); i++) {
      char currentChar = singleItem.charAt(i);
      if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) {
        currentChar = ' ';
      }
      FSMState nextState = currentState.next(currentChar);
      if (nextState == null) {
        return false; // nothing to remove
      }
      currentState = nextState;
    }
    // Replace the set rather than clearing it, exactly as the state's field is assigned here.
    currentState.lookupSet = new HashSet();
    return true;
  }
예제 #3
0
  /**
   * Looks up a single string in the gazetteer.
   *
   * <p>Starting at the initial state, each character of the item is followed through the FSM;
   * space/whitespace characters are all followed as a plain space.
   *
   * @param singleItem a single string to be looked up by the gazetteer
   * @return the lookup set of the state reached after the last character, or a new empty set
   *     when some character has no transition
   */
  @Override
  public Set lookup(String singleItem) {
    FSMState state = initialState;
    final int itemLength = singleItem.length();

    int pos = 0;
    while (pos < itemLength) {
      char ch = singleItem.charAt(pos);
      boolean blank = Character.isSpaceChar(ch) || Character.isWhitespace(ch);
      FSMState following = state.next(blank ? ' ' : ch);
      if (following == null) {
        // no path for this item: report "nothing found" as an empty set
        return new HashSet();
      }
      state = following;
      pos++;
    }
    return state.getLookupSet();
  }
예제 #4
0
  /**
   * Adds one phrase to the list of phrases recognised by this gazetteer.
   *
   * <p>The FSM is walked from the initial state, one character of {@code text} at a time, and
   * missing states are created on the way. Space/whitespace characters are stored as a plain
   * space; all other characters are upper-cased unless {@code caseSensitive} is true.
   *
   * @param text the phrase to be added
   * @param lookup the description of the annotation to be added when this phrase is recognised
   */
  public void addLookup(String text, Lookup lookup) {
    FSMState currentState = initialState;

    for (int i = 0; i < text.length(); i++) {
      char currentChar = text.charAt(i);
      boolean isSpace = Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar);
      if (isSpace) {
        currentChar = ' ';
      } else if (!caseSensitive.booleanValue()) {
        currentChar = Character.toUpperCase(currentChar);
      }
      FSMState nextState = currentState.next(currentChar);
      if (nextState == null) {
        nextState = new FSMState(this);
        currentState.put(currentChar, nextState);
        // a state newly created for a space also gets a transition to itself on ' '
        if (isSpace) {
          nextState.put(' ', nextState);
        }
      }
      currentState = nextState;
    }

    currentState.addLookup(lookup);
  } // addLookup
예제 #5
0
  /**
   * Returns a string representation of the deterministic FSM graph using GML.
   *
   * <p>One {@code node[...]} entry is written per state in {@code fsmStates}, labelled with the
   * state index (plus {@code ,F} and the lookup set for final states), followed by the edge
   * descriptions returned by each state.
   */
  public String getFSMgml() {
    StringBuilder nodeText = new StringBuilder(gate.Gate.STRINGBUFFER_SIZE);
    StringBuilder edgeText = new StringBuilder(gate.Gate.STRINGBUFFER_SIZE);

    for (Iterator it = fsmStates.iterator(); it.hasNext(); ) {
      FSMState state = (FSMState) it.next();
      int idx = state.getIndex();

      nodeText.append("node[ id ").append(idx).append(" label \"").append(idx);
      if (state.isFinal()) {
        nodeText.append(",F\\n");
        nodeText.append(state.getLookupSet());
      }
      nodeText.append("\"  ]\n");

      edgeText.append(state.getEdgesGML());
    }

    return "graph[ \ndirected 1\n" + nodeText.toString() + edgeText.toString() + "]\n";
  } // getFSMgml
예제 #6
0
  /**
   * Creates the Lookup annotations according to a gazetteer match.
   *
   * <p>One annotation is added per lookup in the matching state's lookup set. Its features are
   * the major type, then (only when both are non-null) the ontology class and ontology, then the
   * minor type and languages when non-null, and finally all entries of the lookup's own feature
   * map when present.
   *
   * @param matchingState the final FSMState that was reached while matching.
   * @param matchedRegionStart the start of the matched text region; used as the annotation start
   *     offset.
   * @param matchedRegionEnd the end of the matched text region; the annotation end offset is
   *     {@code matchedRegionEnd + 1}.
   * @param annotationSet the annotation set where the new annotations should be added.
   * @throws GateRuntimeException if the annotation set rejects the offsets
   */
  protected void createLookups(
      FSMState matchingState,
      long matchedRegionStart,
      long matchedRegionEnd,
      AnnotationSet annotationSet) {
    Iterator lookupIter = matchingState.getLookupSet().iterator();
    while (lookupIter.hasNext()) {
      Lookup currentLookup = (Lookup) lookupIter.next();
      FeatureMap fm = Factory.newFeatureMap();
      fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
      if (null != currentLookup.oClass && null != currentLookup.ontology) {
        fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
        fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
      }

      if (null != currentLookup.minorType) {
        fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
      }
      if (null != currentLookup.languages) {
        fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
      }
      if (null != currentLookup.features) {
        fm.putAll(currentLookup.features);
      }
      try {
        // Long.valueOf replaces the deprecated Long(long) constructor; the values are the same.
        annotationSet.add(
            Long.valueOf(matchedRegionStart),
            Long.valueOf(matchedRegionEnd + 1),
            // per the original author's note, annotationType carries Lookup as its default tag
            currentLookup.annotationType,
            fm);
      } catch (InvalidOffsetException ioe) {
        throw new GateRuntimeException(ioe.toString());
      }
    } // while(lookupIter.hasNext())
  }
예제 #7
0
  /**
   * This method runs the gazetteer. It assumes that all the needed parameters are set. If they are
   * not, an exception will be fired.
   *
   * <p>The document content is scanned with the FSM. Every character position is tried as the
   * start of a match: after a match attempt ends, scanning restarts one character after the
   * previous start position. Space/whitespace characters are followed as a plain space; other
   * characters are upper-cased unless {@code caseSensitive} is true. When {@code wholeWordsOnly}
   * is set, a final state only counts as a match if the characters immediately before the start
   * and after the current position are not word-internal (or lie outside the content).
   *
   * @throws ExecutionException if no document is set; an {@code ExecutionInterruptedException}
   *     is thrown when {@code isInterrupted()} reports true at a progress checkpoint
   */
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    // check the input
    if (document == null) {
      throw new ExecutionException("No document to process!");
    }

    // an unset or empty name selects the document's default annotation set
    if (annotationSetName == null || annotationSetName.equals("")) {
      annotationSet = document.getAnnotations();
    } else {
      annotationSet = document.getAnnotations(annotationSetName);
    }

    fireStatusChanged("Performing look-up in " + document.getName() + "...");
    String content = document.getContent().toString();
    int length = content.length();
    char currentChar;
    FSMState currentState = initialState;
    FSMState nextState;
    FSMState lastMatchingState = null;
    int matchedRegionEnd = 0;
    int matchedRegionStart = 0;
    int charIdx = 0;
    int oldCharIdx = 0;

    while (charIdx < length) {
      currentChar = content.charAt(charIdx);

      if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) {
        currentChar = ' ';
      } else {
        currentChar =
            caseSensitive.booleanValue() ? currentChar : Character.toUpperCase(currentChar);
      }

      nextState = currentState.next(currentChar);
      if (nextState == null) {
        // the matching stopped
        // if we had a successful match then act on it;
        if (lastMatchingState != null) {
          createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
          lastMatchingState = null;
        }
        // reset the FSM (every position is tried as a start: first 0, then 1, then 2, ...)
        charIdx = matchedRegionStart + 1;
        matchedRegionStart = charIdx;

        currentState = initialState;
      } else { // go on with the matching
        currentState = nextState;
        // if we have a successful state then store it
        if (currentState.isFinal()
            && ((!wholeWordsOnly.booleanValue())
                || ((matchedRegionStart == 0
                        || !isWordInternal(content.charAt(matchedRegionStart - 1)))
                    && (charIdx + 1 >= content.length()
                        || !isWordInternal(content.charAt(charIdx + 1)))))) {
          // we have a new match
          // if we had an existing match and we need to annotate prefixes, then
          // apply it
          if (!longestMatchOnly && lastMatchingState != null) {
            createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
          }
          matchedRegionEnd = charIdx;
          lastMatchingState = currentState;
        }
        charIdx++;
        if (charIdx == content.length()) {
          // we can't go on, use the last matching state and restart matching
          // from the next char
          if (lastMatchingState != null) {
            // let's add the new annotation(s)
            createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
            lastMatchingState = null;
          }
          // reset the FSM
          charIdx = matchedRegionStart + 1;
          matchedRegionStart = charIdx;
          currentState = initialState;
        }
      }
      // fire the progress event
      if (charIdx - oldCharIdx > 256) {
        // multiply in long: 100 * charIdx overflows int once charIdx exceeds ~21.4 million
        fireProgressChanged((int) ((100L * charIdx) / length));
        oldCharIdx = charIdx;
        if (isInterrupted())
          throw new ExecutionInterruptedException(
              "The execution of the " + getName() + " gazetteer has been abruptly interrupted!");
      }
    } // while(charIdx < length)
    // we've finished. If we had a stored match, then apply it.
    if (lastMatchingState != null) {
      createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet);
    }
    fireProcessFinished();
    fireStatusChanged("Look-up complete!");
  } // execute