@Override public void setIndexDefinition(IndexDefinition definition) { if (definition != null) { this.getFeatures().put(GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY, definition); String className = definition.getIrEngineClassName(); try { // Class aClass = Class.forName(className); Class<?> aClass = Class.forName(className, true, Gate.getClassLoader()); IREngine engine = (IREngine) aClass.newInstance(); this.indexManager = engine.getIndexmanager(); this.indexManager.setIndexDefinition(definition); this.indexManager.setCorpus(this); } catch (Exception e) { e.printStackTrace(Err.getPrintWriter()); } // switch (definition.getIndexType()) { // case GateConstants.IR_LUCENE_INVFILE: // this.indexManager = new LuceneIndexManager(); // this.indexManager.setIndexDefinition(definition); // this.indexManager.setCorpus(this); // break; // } this.addedDocs = new Vector<Document>(); this.removedDocIDs = new Vector<String>(); this.changedDocs = new Vector<Document>(); } }
@Override public Document get(int index) { if (index >= docDataList.size()) return null; Document res = documents.get(index); if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res); // if the document is null, then I must get it from the DS if (res == null) { FeatureMap parameters = Factory.newFeatureMap(); parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore); try { parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID()); Document lr = (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters); if (DEBUG) Out.prln("Loaded document :" + lr.getName()); // change the result to the newly loaded doc res = lr; // finally replace the doc with the instantiated version documents.set(index, lr); } catch (ResourceInstantiationException ex) { Err.prln("Error reading document inside a serialised corpus."); throw new GateRuntimeException(ex); } } return res; }
@Override public boolean add(Document o) { if (o == null) return false; Document doc = o; // make it accept only docs from its own datastore if (doc.getDataStore() != null && !this.dataStore.equals(doc.getDataStore())) { Err.prln("Error: Persistent corpus can only accept documents " + "from its own datastore!"); return false; } // if // add the document with its index in the docDataList // in this case, since it's going to be added to the end // the index will be the size of the docDataList before // the addition DocumentData docData = new DocumentData(doc.getName(), doc.getLRPersistenceId(), doc.getClass().getName()); boolean result = docDataList.add(docData); documents.add(doc); documentAdded(doc); fireDocumentAdded( new CorpusEvent( SerialCorpusImpl.this, doc, docDataList.size() - 1, doc.getLRPersistenceId(), CorpusEvent.DOCUMENT_ADDED)); return result; }
/** * This method is called when the SAX parser encounts the end of the XML document. Here we set the * content of the gate Document to be the one generated inside this class (tmpDocContent). After * that we use the colector to generate all the annotation reffering this new gate document. */ @Override public void endDocument() throws org.xml.sax.SAXException { // replace the document content with the one without markups doc.setContent(new DocumentContentImpl(tmpDocContent.toString())); // fire the status listener fireStatusChangedEvent("Total elements: " + elements); // If basicAs is null then get the default AnnotationSet, // based on the gate document. if (basicAS == null) { basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); } // sort colector ascending on its id Collections.sort(colector); Set<Integer> testIdsSet = new HashSet<Integer>(); // create all the annotations (on this new document) from the collector while (!colector.isEmpty()) { CustomObject obj = colector.getFirst(); // Test to see if there are two annotation objects with the same id. if (testIdsSet.contains(obj.getId())) { throw new GateSaxException( "Found two annotations with the same Id(" + obj.getId() + ").The document is inconsistent."); } else { testIdsSet.add(obj.getId()); } // End iff // create a new annotation and add it to the annotation set try { // the annotation type will be conforming with markupElementsMap // add the annotation to the Annotation Set if (markupElementsMap == null) { basicAS.add(obj.getId(), obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM()); } else { // get the type of the annotation from Map String annotationType = markupElementsMap.get(obj.getElemName()); if (annotationType != null) { basicAS.add(obj.getId(), obj.getStart(), obj.getEnd(), annotationType, obj.getFM()); } } // End if } catch (gate.util.InvalidOffsetException e) { Err.prln( "InvalidOffsetException for annot :" + obj.getElemName() + " with Id =" + obj.getId() + ". 
Discarded..."); } // End try colector.remove(obj); } // End while } // endDocument();
/** * @param inputAS input annotation set * @param outputAS output annotation set * @param term String matched * @param startOffset match start offset * @param endOffset match end offset */ private void addLookup( AnnotationSet inputAS, AnnotationSet outputAS, String term, String outputASType, Long startOffset, Long endOffset, boolean useNounChunk) { if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) { AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset); if (!nounChunkAS.isEmpty()) { startOffset = nounChunkAS.firstNode().getOffset(); endOffset = nounChunkAS.lastNode().getOffset(); } } try { AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset); if (diseaseAS.isEmpty()) { FeatureMap fm = Factory.newFeatureMap(); fm.put("match", term); outputAS.add(startOffset, endOffset, outputASType, fm); } else { Annotation disease = diseaseAS.iterator().next(); FeatureMap fm = disease.getFeatures(); String meta = (String) fm.get("match"); if (meta != null) { meta = meta + " " + term; } fm.put("match", meta); } } catch (InvalidOffsetException ie) { // shouldn't happen gate.util.Err.println(ie); } }
/**
 * Reports {@code msg} through {@code gate.util.Err}, then calls {@code cleanup()} and
 * {@code fireProcessFinished()}, in that order.
 *
 * <p>NOTE(review): this method itself does not assign any flag; callers that want later runs
 * to bail out have to set the flag themselves. What {@code cleanup()} releases is not visible
 * here.
 *
 * @param msg the message to print
 */
private void gracefulExit(String msg) { gate.util.Err.println(msg); cleanup(); fireProcessFinished(); }
@Override public void execute() throws ExecutionException { interrupted = false; // quit if setup failed if (gracefulExit) { gracefulExit("Plugin was not initialised correctly. Exiting gracefully ... "); return; } AnnotationSet inputAS = (inputASName == null || inputASName.trim().length() == 0) ? document.getAnnotations() : document.getAnnotations(inputASName); AnnotationSet outputAS = (outputASName == null || outputASName.trim().length() == 0) ? document.getAnnotations() : document.getAnnotations(outputASName); AnnotationSet sentenceAS = null; if (sentenceType != null && !sentenceType.isEmpty()) { sentenceAS = inputAS.get(sentenceType); } // Document content String docContent = document.getContent().toString(); int docLen = docContent.length(); // For matching purposes replace all whitespace characters with a single space docContent = docContent.replaceAll("[\\s\\xA0\\u2007\\u202F]", " "); fireStatusChanged("Locating anatomy, disease and procedure mentions in " + document.getName()); fireProgressChanged(0); if (sentenceAS != null) { for (Annotation sentence : sentenceAS) { Long sentStartOffset = sentence.getStartNode().getOffset(); Long sentEndOffset = sentence.getEndNode().getOffset(); // Converting the sentence to lower case prevents the need to use case-insenstive regex // matching, which should give a small performance boost String sentenceContent = docContent .substring(sentStartOffset.intValue(), sentEndOffset.intValue()) .toLowerCase(Locale.ENGLISH); if (diseaseType != null && !diseaseType.isEmpty()) { doMatch( patternMap.get("disease_suffix"), sentenceContent, inputAS, outputAS, "suffDisease", sentStartOffset, docLen); doMatch( patternMap.get("disease_abbrevs"), sentenceContent, inputAS, outputAS, "preDisease", sentStartOffset, docLen); doMatch( patternMap.get("disease_named_syndrome"), sentenceContent, inputAS, outputAS, "namedDisease", sentStartOffset, docLen); doMatch( patternMap.get("disease_sense"), sentenceContent, inputAS, outputAS, 
"tmpDiseaseSense", sentStartOffset, docLen); doMatch( patternMap.get("disease_sense_context"), sentenceContent, inputAS, outputAS, "tmpDiseaseSenseContext", sentStartOffset, docLen); doMatch( patternMap.get("disease_generic_context"), sentenceContent, inputAS, outputAS, "poDisease", sentStartOffset, docLen); doMatch( patternMap.get("disease_anatomy_context"), sentenceContent, inputAS, outputAS, "tmpDisease", sentStartOffset, docLen); } if (procedureType != null && !procedureType.isEmpty()) { doMatch( patternMap.get("procedure_suffix"), sentenceContent, inputAS, outputAS, "poProcedure", sentStartOffset, docLen); doMatch( patternMap.get("procedure_key"), sentenceContent, inputAS, outputAS, "poProcedure", sentStartOffset, docLen); doMatch( patternMap.get("procedure_anatomy_context"), sentenceContent, inputAS, outputAS, "tmpProcedure", sentStartOffset, docLen); } if (symptomType != null && !symptomType.isEmpty()) { doMatch( patternMap.get("symptom_key"), sentenceContent, inputAS, outputAS, "poSymptom", sentStartOffset, docLen); } if (testType != null && !testType.isEmpty()) { doMatch( patternMap.get("test_key"), sentenceContent, inputAS, outputAS, "poTest", sentStartOffset, docLen); } if (anatomyType != null && !anatomyType.isEmpty()) { doMatch( patternMap.get("anatomy_suffix_adjective"), sentenceContent, inputAS, outputAS, "tmpAnatSuffAdj", sentStartOffset, docLen); doMatch( patternMap.get("anatomy_suffix"), sentenceContent, inputAS, outputAS, "tmpAnatSuff", sentStartOffset, docLen); doMatch( patternMap.get("anatomy_prefix"), sentenceContent, inputAS, outputAS, "tmpAnatPre", sentStartOffset, docLen); doMatch( patternMap.get("anatomy_position"), sentenceContent, inputAS, outputAS, "tmpAnatPos", sentStartOffset, docLen); doMatch( patternMap.get("anatomy_space_region_junction"), sentenceContent, inputAS, outputAS, "tmpAnatSpace", sentStartOffset, docLen); doMatch( patternMap.get("anatomy_part_adjective"), sentenceContent, inputAS, outputAS, "tmpAnatAdj", sentStartOffset, 
docLen); doMatch( patternMap.get("anatomy_latin_noun"), sentenceContent, inputAS, outputAS, "tmpAnatLatin", sentStartOffset, docLen); doMatch( patternMap.get("anatomy_muscle"), sentenceContent, inputAS, outputAS, "tmpAnatMuscle", sentStartOffset, docLen); doMatch( patternMap.get("anatomy_part"), sentenceContent, inputAS, outputAS, "tmpAnatPart", sentStartOffset, docLen); doMatch( patternMap.get("anatomy_fluid"), sentenceContent, inputAS, outputAS, "tmpAnatFluid", sentStartOffset, docLen); } } // Run JAPE transducer to clean up the output fireStatusChanged( "Processing anatomical, disease and procedure mentions in " + document.getName()); try { japeTransducer.setDocument(document); japeTransducer.setInputASName(inputASName); japeTransducer.setOutputASName(outputASName); japeTransducer.addProgressListener(this); japeTransducer.execute(); } catch (ExecutionException re) { gate.util.Err.println("Unable to run " + japeURL); gracefulExit = true; } finally { japeTransducer.setDocument(null); } // rename temporary annotations if (!debug) { renameAnnotations(outputAS, "tmpAnatomicalTerm", anatomyType); renameAnnotations(outputAS, "suffDisease", diseaseType); renameAnnotations(outputAS, "poDisease", diseaseType); renameAnnotations(outputAS, "preDisease", diseaseType); renameAnnotations(outputAS, "poProcedure", procedureType); renameAnnotations(outputAS, "poSymptom", symptomType); renameAnnotations(outputAS, "poTest", testType); } } else { gracefulExit("No sentences to process!"); } // want list of disease key words plus symptoms such as oedema? or just diseases fireProcessFinished(); } // end execute()
@Override public Resource init() throws ResourceInstantiationException { gracefulExit = false; if (configFileURL == null) { gracefulExit = true; gate.util.Err.println("No configuration file provided!"); } if (japeURL == null) { gracefulExit = true; gate.util.Err.println("No JAPE grammar file provided!"); } // create the init params for the JAPE transducer FeatureMap params = Factory.newFeatureMap(); params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, japeURL); // Code borrowed from Mark Greenwood's Measurements PR if (japeTransducer == null) { // if this is the first time we are running init then actually create a // new transducer as we don't already have one FeatureMap hidden = Factory.newFeatureMap(); Gate.setHiddenAttribute(hidden, true); japeTransducer = (Transducer) Factory.createResource("gate.creole.Transducer", params, hidden); } else { // we are being run through a call to reInit so simply re-init the // underlying JAPE transducer japeTransducer.setParameterValues(params); japeTransducer.reInit(); } ConfigReader config = new ConfigReader(configFileURL); gracefulExit = config.config(); try { HashMap<String, String> options = config.getOptions(); patternMap = new HashMap<String, Pattern>(); addSuffixPattern("disease_suffix", options); addWordPattern("disease_abbrevs", options); addWordPattern("disease_sense", options); addWordExtraPattern("disease_sense_context", options); addPossessiveWordPattern("disease_named_syndrome", options); addWordExtraPattern("disease_generic_context", options); addWordExtraPattern("disease_anatomy_context", options); addSuffixPluralPattern("procedure_suffix", options); addWordPluralPattern("procedure_key", options); addWordExtraPattern("procedure_anatomy_context", options); addWordPluralPattern("symptom_key", options); addWordPattern("test_key", options); addSuffixPattern("anatomy_suffix_adjective", options); addSuffixPattern("anatomy_suffix", options); addPrefixPattern("anatomy_prefix", options); 
addWordPattern("anatomy_position", options); addWordPluralPattern("anatomy_space_region_junction", options); addWordPattern("anatomy_part_adjective", options); addWordPattern("anatomy_latin_noun", options); addWordPattern("anatomy_muscle", options); addWordPluralPattern("anatomy_part", options); addWordPluralPattern("anatomy_fluid", options); } catch (NullPointerException ne) { gracefulExit = true; gate.util.Err.println( "Missing or unset configuration options. Please check configuration file."); } return this; } // end init()
/** * This method is called when the HTML parser encounts the end of a tag that means that the tag is * paired by a beginning tag */ @Override public void handleEndTag(HTML.Tag t, int pos) { // obj is for internal use CustomObject obj = null; // end of STYLE tag if (HTML.Tag.STYLE.equals(t)) { isInsideStyleTag = false; } // if // If the stack is not empty then we get the object from the stack if (!stack.isEmpty()) { obj = stack.pop(); // Before adding it to the colector, we need to check if is an // emptyAndSpan one. See CustomObject's isEmptyAndSpan field. if (obj.getStart().equals(obj.getEnd())) { // The element had an end tag and its start was equal to its end. Hence // it is anEmptyAndSpan one. obj.getFM().put("isEmptyAndSpan", "true"); } // End iff // we add it to the colector colector.add(obj); } // End if // If element has text between, then customize its apearance if (obj != null && obj.getStart().longValue() != obj.getEnd().longValue()) // Customize the appearance of the document customizeAppearanceOfDocumentWithEndTag(t); // if t is the </HTML> tag then we reached the end of theHTMLdocument if (t == HTML.Tag.HTML) { // replace the old content with the new one doc.setContent(new DocumentContentImpl(tmpDocContent.toString())); // If basicAs is null then get the default annotation // set from this gate document if (basicAS == null) basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); // sort colector ascending on its id Collections.sort(colector); // iterate through colector and construct annotations while (!colector.isEmpty()) { obj = colector.getFirst(); colector.remove(obj); // Construct an annotation from this obj try { if (markupElementsMap == null) { basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM()); } else { String annotationType = markupElementsMap.get(obj.getElemName()); if (annotationType != null) basicAS.add(obj.getStart(), obj.getEnd(), annotationType, obj.getFM()); } } catch (InvalidOffsetException 
e) { Err.prln("Error creating an annot :" + obj + " Discarded..."); } // end try // }// end if } // while // notify the listener about the total amount of elements that // has been processed fireStatusChangedEvent("Total elements : " + elements); } // else } // handleEndTag