/** * Removes one phrase to the list of phrases recognised by this gazetteer * * @param text the phrase to be removed * @param lookup the description of the annotation associated to this phrase */ public void removeLookup(String text, Lookup lookup) { char currentChar; FSMState currentState = initialState; FSMState nextState; Lookup oldLookup; for (int i = 0; i < text.length(); i++) { currentChar = text.charAt(i); if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) currentChar = ' '; nextState = currentState.next(currentChar); if (nextState == null) return; // nothing to remove currentState = nextState; } // for(int i = 0; i< text.length(); i++) currentState.removeLookup(lookup); } // removeLookup
@Override public boolean remove(String singleItem) { char currentChar; FSMState currentState = initialState; FSMState nextState; Lookup oldLookup; for (int i = 0; i < singleItem.length(); i++) { currentChar = singleItem.charAt(i); if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) currentChar = ' '; nextState = currentState.next(currentChar); if (nextState == null) { return false; } // nothing to remove currentState = nextState; } // for(int i = 0; i< text.length(); i++) currentState.lookupSet = new HashSet(); return true; }
/** * lookup <br> * * @param singleItem a single string to be looked up by the gazetteer * @return set of the Lookups associated with the parameter */ @Override public Set lookup(String singleItem) { char currentChar; Set set = new HashSet(); FSMState currentState = initialState; FSMState nextState; for (int i = 0; i < singleItem.length(); i++) { currentChar = singleItem.charAt(i); if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) currentChar = ' '; nextState = currentState.next(currentChar); if (nextState == null) { return set; } currentState = nextState; } // for(int i = 0; i< text.length(); i++) set = currentState.getLookupSet(); return set; }
/** * Adds one phrase to the list of phrases recognised by this gazetteer * * @param text the phrase to be added * @param lookup the description of the annotation to be added when this phrase is recognised */ public void addLookup(String text, Lookup lookup) { char currentChar; FSMState currentState = initialState; FSMState nextState; Lookup oldLookup; boolean isSpace; for (int i = 0; i < text.length(); i++) { currentChar = text.charAt(i); isSpace = Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar); if (isSpace) currentChar = ' '; else currentChar = (caseSensitive.booleanValue()) ? currentChar : Character.toUpperCase(currentChar); nextState = currentState.next(currentChar); if (nextState == null) { nextState = new FSMState(this); currentState.put(currentChar, nextState); if (isSpace) nextState.put(' ', nextState); } currentState = nextState; } // for(int i = 0; i< text.length(); i++) currentState.addLookup(lookup); // Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType); } // addLookup
/**
 * Renders the deterministic FSM graph as GML text.
 *
 * <p>Every state contributes one {@code node[...]} entry labelled with its index (final states
 * additionally carry {@code ,F} and their lookup set) plus whatever {@code getEdgesGML()}
 * returns for it. All node entries are emitted before all edge entries.
 *
 * @return the GML description of the FSM
 */
public String getFSMgml() {
  StringBuilder nodeSection = new StringBuilder(gate.Gate.STRINGBUFFER_SIZE);
  StringBuilder edgeSection = new StringBuilder(gate.Gate.STRINGBUFFER_SIZE);

  for (Iterator it = fsmStates.iterator(); it.hasNext(); ) {
    FSMState state = (FSMState) it.next();
    int id = state.getIndex();

    nodeSection.append("node[ id ").append(id).append(" label \"").append(id);
    if (state.isFinal()) {
      nodeSection.append(",F\\n").append(state.getLookupSet());
    }
    nodeSection.append("\" ]\n");

    edgeSection.append(state.getEdgesGML());
  }

  return "graph[ \ndirected 1\n" + nodeSection + edgeSection + "]\n";
} // getFSMgml
/** * Creates the Lookup annotations according to a gazetteer match. * * @param matchingState the final FSMState that was reached while matching. * @param matchedRegionStart the start of the matched text region. * @param matchedRegionEnd the end of the matched text region. * @param annotationSet the annotation set where the new annotations should be added. */ protected void createLookups( FSMState matchingState, long matchedRegionStart, long matchedRegionEnd, AnnotationSet annotationSet) { Iterator lookupIter = matchingState.getLookupSet().iterator(); while (lookupIter.hasNext()) { Lookup currentLookup = (Lookup) lookupIter.next(); FeatureMap fm = Factory.newFeatureMap(); fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType); if (null != currentLookup.oClass && null != currentLookup.ontology) { fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass); fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology); } if (null != currentLookup.minorType) fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType); if (null != currentLookup.languages) fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages); if (null != currentLookup.features) { fm.putAll(currentLookup.features); } try { // if(currentLookup.annotationType==null || "".equals(currentLookup.annotationType)){ // annotationSet.add(new Long(matchedRegionStart), // new Long(matchedRegionEnd + 1), // LOOKUP_ANNOTATION_TYPE, // fm); // }else{ annotationSet.add( new Long(matchedRegionStart), new Long(matchedRegionEnd + 1), currentLookup.annotationType, // this pojo attribute will have Lookup as a default tag. fm); // } } catch (InvalidOffsetException ioe) { throw new GateRuntimeException(ioe.toString()); } } // while(lookupIter.hasNext()) }
/** * This method runs the gazetteer. It assumes that all the needed parameters are set. If they are * not, an exception will be fired. */ @Override public void execute() throws ExecutionException { interrupted = false; AnnotationSet annotationSet; // check the input if (document == null) { throw new ExecutionException("No document to process!"); } if (annotationSetName == null || annotationSetName.equals("")) { annotationSet = document.getAnnotations(); } else { annotationSet = document.getAnnotations(annotationSetName); } fireStatusChanged("Performing look-up in " + document.getName() + "..."); String content = document.getContent().toString(); int length = content.length(); char currentChar; FSMState currentState = initialState; FSMState nextState; FSMState lastMatchingState = null; int matchedRegionEnd = 0; int matchedRegionStart = 0; int charIdx = 0; int oldCharIdx = 0; FeatureMap fm; Lookup currentLookup; while (charIdx < length) { currentChar = content.charAt(charIdx); if (Character.isSpaceChar(currentChar) || Character.isWhitespace(currentChar)) { currentChar = ' '; } else { currentChar = caseSensitive.booleanValue() ? currentChar : Character.toUpperCase(currentChar); } nextState = currentState.next(currentChar); if (nextState == null) { // the matching stopped // if we had a successful match then act on it; if (lastMatchingState != null) { createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet); lastMatchingState = null; } // reset the FSM (обходим каждую позицию т.е. 
сначала с 0, потом с 1, потом с 2) charIdx = matchedRegionStart + 1; matchedRegionStart = charIdx; currentState = initialState; } else { // go on with the matching currentState = nextState; // if we have a successful state then store it if (currentState.isFinal() && ((!wholeWordsOnly.booleanValue()) || ((matchedRegionStart == 0 || !isWordInternal(content.charAt(matchedRegionStart - 1))) && (charIdx + 1 >= content.length() || !isWordInternal(content.charAt(charIdx + 1)))))) { // we have a new match // if we had an existing match and we need to annotate prefixes, then // apply it if (!longestMatchOnly && lastMatchingState != null) { createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet); } matchedRegionEnd = charIdx; lastMatchingState = currentState; } charIdx++; if (charIdx == content.length()) { // we can't go on, use the last matching state and restart matching // from the next char if (lastMatchingState != null) { // let's add the new annotation(s) createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet); lastMatchingState = null; } // reset the FSM charIdx = matchedRegionStart + 1; matchedRegionStart = charIdx; currentState = initialState; } } // fire the progress event if (charIdx - oldCharIdx > 256) { fireProgressChanged((100 * charIdx) / length); oldCharIdx = charIdx; if (isInterrupted()) throw new ExecutionInterruptedException( "The execution of the " + getName() + " gazetteer has been abruptly interrupted!"); } } // while(charIdx < length) // we've finished. If we had a stored match, then apply it. if (lastMatchingState != null) { createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd, annotationSet); } fireProcessFinished(); fireStatusChanged("Look-up complete!"); } // execute