@Override
public void execute() throws ExecutionException {
  initBeforeExecute();
  AnnotationSet tokensAndDependenciesAS = inputAS;
  TreeIndex index =
      new GateAwareTreeIndex(
          tokensAndDependenciesAS.get(null, Utils.setFromArray(new String[] {"args"})));
  QueryData data =
      new QueryData(index, new GateAnnotationsNodeAttributes(tokensAndDependenciesAS));
  Iterable<QueryMatch> results = queryObject.evaluate(data);
  int queryMatchOrd = 0;
  for (QueryMatch result : results) {
    queryMatchOrd++;
    for (NodeMatch match : result.getMatchingNodes()) {
      String name = match.getQueryNode().getName();
      if (name != null) {
        Annotation matchingAnnot = tokensAndDependenciesAS.get(match.getNodeId());
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("matchingNodeId", match.getNodeId());
        fm.put(
            "queryMatchId",
            String.format("%s_%03d", buildQueryStringHash(getQueryString()), queryMatchOrd));
        outputAS.add(matchingAnnot.getStartNode(), matchingAnnot.getEndNode(), name, fm);
      }
    }
  }
}
private void transferAnnotations(List<Annotation> toTransfer, AnnotationSet to, boolean newID)
    throws ExecutionException {
  for (Annotation annot : toTransfer) {
    Mapping m = mappings.get(annot.getType());
    String name = (m == null || m.newName == null ? annot.getType() : m.newName);
    try {
      FeatureMap params = Factory.newFeatureMap();
      params.putAll(annot.getFeatures());
      if (newID) {
        to.add(annot.getStartNode().getOffset(), annot.getEndNode().getOffset(), name, params);
      } else {
        to.add(
            annot.getId(),
            annot.getStartNode().getOffset(),
            annot.getEndNode().getOffset(),
            name,
            params);
      }
    } catch (InvalidOffsetException e) {
      throw new ExecutionException(e);
    }
  }
}
/** Annotation remove event */
public void annotationRemoved(AnnotationSetEvent ase) {
  if (!disableListener && ase.getSourceDocument() == this) {
    AnnotationSet as = (AnnotationSet) ase.getSource();
    Annotation annot = ase.getAnnotation();
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    boolean defaultAS = as.getName() == null;
    for (String docID : combinedDocumentIds) {
      Document aDoc = compoundDocument.getDocument(docID);
      // find out the details which refer to the deleted annotation
      OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
      if (od == null) continue;
      if (defaultAS) {
        aDoc.getAnnotations().remove(od.getOriginalAnnotation());
      } else {
        aDoc.getAnnotations(as.getName()).remove(od.getOriginalAnnotation());
      }
      removeOffsetDetails(docID, od);
      break;
    }
  }
}
@Override
public Document get(int index) {
  if (index >= docDataList.size()) return null;
  Document res = documents.get(index);
  if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + " result: " + res);
  // if the document is null, then I must get it from the DS
  if (res == null) {
    FeatureMap parameters = Factory.newFeatureMap();
    parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
    try {
      parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
      Document lr =
          (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
      if (DEBUG) Out.prln("Loaded document: " + lr.getName());
      // change the result to the newly loaded doc
      res = lr;
      // finally replace the doc with the instantiated version
      documents.set(index, lr);
    } catch (ResourceInstantiationException ex) {
      Err.prln("Error reading document inside a serialised corpus.");
      throw new GateRuntimeException(ex);
    }
  }
  return res;
}
@SuppressWarnings("unchecked") public static AnnotationDiffer computeDiffWithDocFeatures( Document document, List<String> featureNames, AnnotationSet responsesAnnotations) { FeatureMap doc_fm = document.getFeatures(); // Logger log = Logger.getLogger(DocumentFeaturesDiff.class); int correct = 0; int missing = 0; int spurious = 0; for (String feature_name : featureNames) { // int cur_correct = 0; List<String> f = (List<String>) doc_fm.get(feature_name); if (f == null) { f = (List<String>) doc_fm.get(feature_name + "s"); } AnnotationDiffer diff = computeDiffWithGoldStandardDataForSingleFeature( feature_name, Utils.setFromList(f), responsesAnnotations); spurious += diff.getSpurious(); correct += diff.getCorrectMatches(); missing += diff.getMissing(); } return new AnnotationDifferDocumentFeaturesImpl(correct, missing, spurious); }
public JSONObject persian_sentiment(String text) throws Exception {
  oncreate();
  File persianGapp =
      new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp");
  // initialise GATE - this must be done before calling any GATE APIs
  Gate.init();
  // load the saved application
  CorpusController application =
      (CorpusController) PersistenceManager.loadObjectFromFile(persianGapp);
  // Create a Corpus to use. We recycle the same Corpus object for each
  // iteration. The string parameter to newCorpus() is simply the
  // GATE-internal name to use for the corpus. It has no particular
  // significance.
  Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
  application.setCorpus(corpus);
  // create a document from the input text and put it in the corpus
  Document doc = Factory.newDocument(text);
  corpus.add(doc);
  // run the application
  application.execute();
  String featureName = "Doc_sentiment";
  FeatureMap features = doc.getFeatures();
  // remove the document from the corpus again and release it,
  // as it is no longer needed
  corpus.clear();
  Factory.deleteResource(doc);
  LinkedHashMap originalContent = (LinkedHashMap) features.get(featureName);
  String sentiment = (String) originalContent.get("sentiment");
  // create the JSON response for the caller
  JSONObject response = new JSONObject();
  response.put("sentiment", sentiment);
  return response;
}
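// Usage sketch for persian_sentiment. Hypothetical: the enclosing class name
// SentimentService is an assumption. Note that Gate.init() and the .xgapp
// load inside the method are expensive; in a long-running service they would
// normally be done once at startup rather than on every call.
public static void sentimentExample() throws Exception {
  SentimentService service = new SentimentService();
  JSONObject result = service.persian_sentiment("...input text...");
  System.out.println(result.get("sentiment"));
}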
// Generate annotations for n-grams over a larger span, e.g. all pairs inside
// a window of 5 tokens. This allows matching more variants, e.g. with
// adjectives in the middle. We do not generate intermediate annotations here;
// only bigrams for the moment.
private void generateNGramsOverWindow(List<Annotation> list, AnnotationSet outputAS)
    throws ExecutionException {
  List<List> boxes = generateBoxes(list, outputAS);
  try {
    for (int b = 0; b < boxes.size(); b++) {
      List<String> tempAnnotationsStartingHere = new ArrayList<String>();
      Long loStart = null;
      Long hiEnd = null;
      // create a temporary list containing all the annotations at position 0
      List<Annotation> headannots = boxes.get(b);
      for (Annotation newAnn : headannots) {
        // remember positions
        loStart = newAnn.getStartNode().getOffset();
        if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
        else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
          hiEnd = newAnn.getEndNode().getOffset();
        String string = (String) newAnn.getFeatures().get(inputAnnotationFeature);
        tempAnnotationsStartingHere.add(string);
        if (this.generateIntermediateAnnotations) {
          FeatureMap fm = Factory.newFeatureMap();
          fm.put(this.outputAnnotationFeature, string);
          outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
        }
      }
      for (int z = 1; z < window && (b + z < boxes.size()); z++) {
        // generate all possible bi-grams
        List<Annotation> current = boxes.get(b + z);
        for (Annotation newAnn : current) {
          // remember positions
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();
          String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          // combine what is in the buffer with the new string
          // and make a new annotation out of that
          for (String s : tempAnnotationsStartingHere) {
            String combination = s + getNgramSeparator() + newString;
            // create an annotation for the combination
            FeatureMap fm = Factory.newFeatureMap();
            fm.put(this.outputAnnotationFeature, combination);
            outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
          }
        }
      }
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
/**
 * Generates a GATE document from a Behemoth one.
 *
 * @param inputDoc the Behemoth document to convert
 * @return the corresponding GATE document
 * @throws ResourceInstantiationException
 * @throws InvalidOffsetException
 * @throws IOException
 */
public gate.Document generateGATEDoc(BehemothDocument inputDoc)
    throws ResourceInstantiationException, InvalidOffsetException, IOException {
  gate.Document gatedocument = null;
  // if no text is available (e.g. Tika has not extracted it)
  // let GATE do the parsing itself from the binary content
  if (inputDoc.getText() == null) {
    try {
      gatedocument = generateGATEDocFromLocalDump(inputDoc);
      // transfer the text from GATE to Behemoth
      String textContent = gatedocument.getContent().toString();
      inputDoc.setText(textContent);
      return gatedocument;
    } catch (Exception e) {
      LOG.error("Can't generate GATE doc from byte dump", e);
    }
  }
  // if the input document still has no text, create a doc with an empty text
  String text = inputDoc.getText();
  if (text == null) text = "";
  gatedocument = Factory.newDocument(text);
  // then the metadata as document features
  FeatureMap docFeatures = gatedocument.getFeatures();
  String docUrl = inputDoc.getUrl();
  if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
  if (inputDoc.getMetadata() != null) {
    Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator();
    while (iter.hasNext()) {
      Entry<Writable, Writable> entry = iter.next();
      String skey = entry.getKey().toString().trim();
      String svalue = null;
      if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
      docFeatures.put(skey, svalue);
    }
  }
  // finally the annotations as original markups
  // TODO change the name of the annotation set via config
  AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
  for (Annotation annot : inputDoc.getAnnotations()) {
    // add to outputAS as a GATE annotation
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
  }
  return gatedocument;
}
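// Conversion sketch for generateGATEDoc. Assumes it is called from the same
// class, on a BehemothDocument populated elsewhere (e.g. by Tika); the
// feature name matches the one written above.
public void conversionExample(BehemothDocument behemothDoc) throws Exception {
  gate.Document gateDoc = generateGATEDoc(behemothDoc);
  // the source URL, if any, survives as a document feature
  System.out.println(gateDoc.getFeatures().get("gate.SourceURL"));
}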
private void generateNGrams(List<Annotation> list, AnnotationSet outputAS)
    throws ExecutionException {
  List<List> boxes = generateBoxes(list, outputAS);
  try {
    // now do the actual n-grams
    for (int b = 0; b < boxes.size(); b++) {
      List<String> tempAnnotationsStartingHere = new ArrayList<String>();
      Long loStart = null;
      Long hiEnd = null;
      for (int z = 0; z < this.ngram.intValue() && (b + z < boxes.size()); z++) {
        // do the combination and dump what we've done at every step,
        // e.g. generate 1-grams as well as 2-grams
        List<Annotation> current = boxes.get(b + z);
        List<String> temptemp = new ArrayList<String>();
        for (Annotation newAnn : current) {
          // remember positions
          if (loStart == null) loStart = newAnn.getStartNode().getOffset();
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();
          String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          // TODO: what if there is no such value?
          if (tempAnnotationsStartingHere.size() == 0) {
            // create an annotation for the current annotation
            if (this.generateIntermediateAnnotations) {
              FeatureMap fm = Factory.newFeatureMap();
              fm.put(this.outputAnnotationFeature, newString);
              outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
            }
            // add it to the temp list
            temptemp.add(newString);
          } else {
            for (String existing : tempAnnotationsStartingHere) {
              String combination = existing + getNgramSeparator() + newString;
              temptemp.add(combination);
              // annotate intermediate n-grams only when requested,
              // and always annotate the full n-gram
              if (this.generateIntermediateAnnotations || z == this.ngram.intValue() - 1) {
                // create an annotation for the combination
                FeatureMap fm = Factory.newFeatureMap();
                fm.put(this.outputAnnotationFeature, combination);
                outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
              }
            }
          }
        }
        tempAnnotationsStartingHere = temptemp;
      }
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
public void tokenize() {
  AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization");
  AnnotationSet defaultAs = gateDocument.getAnnotations("");
  for (Iterator<Annotation> it = tokenizationAs.iterator(); it.hasNext(); ) {
    Annotation currentTokenAnnotation = it.next();
    FeatureMap tokenFeaturesMap = currentTokenAnnotation.getFeatures();
    FeatureMap curFeaturesMap = Factory.newFeatureMap();
    if ("Token".compareToIgnoreCase(currentTokenAnnotation.getType()) == 0) {
      curFeaturesMap.put("string", tokenFeaturesMap.get("string"));
      curFeaturesMap.put("root", tokenFeaturesMap.get("lemma"));
      curFeaturesMap.put("category", tokenFeaturesMap.get("POS"));
      // add the new Token to the default annotation set
      defaultAs.add(
          currentTokenAnnotation.getStartNode(),
          currentTokenAnnotation.getEndNode(),
          currentTokenAnnotation.getType(),
          curFeaturesMap);
    }
  }
  gateDocument.removeAnnotationSet("Tokenization");
}
/** Annotation added event */
public void annotationAdded(AnnotationSetEvent ase) {
  if (!disableListener && ase.getSourceDocument() == this) {
    AnnotationSet as = (AnnotationSet) ase.getSource();
    Annotation annot = ase.getAnnotation();
    annot.addAnnotationListener(this);
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    boolean defaultAS = as.getName() == null;
    for (String docID : combinedDocumentIds) {
      Document aDoc = compoundDocument.getDocument(docID);
      long stOffset =
          getOffsetInSrcDocument(docID, annot.getStartNode().getOffset().longValue());
      if (stOffset == -1) continue;
      long enOffset =
          getOffsetInSrcDocument(docID, annot.getEndNode().getOffset().longValue());
      if (enOffset == -1) continue;
      Annotation originalAnnot = null;
      try {
        Integer id = annot.getId();
        if (defaultAS) {
          aDoc.getAnnotations()
              .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
          originalAnnot = aDoc.getAnnotations().get(id);
        } else {
          aDoc.getAnnotations(as.getName())
              .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
          originalAnnot = aDoc.getAnnotations(as.getName()).get(id);
        }
      } catch (InvalidOffsetException ioe) {
        System.out.println(aDoc.getName() + "=" + stOffset + "=" + enOffset);
        throw new GateRuntimeException(ioe);
      }
      OffsetDetails od = new OffsetDetails();
      od.setOldStartOffset(stOffset);
      od.setOldEndOffset(enOffset);
      od.setNewStartOffset(annot.getStartNode().getOffset().longValue());
      od.setNewEndOffset(annot.getEndNode().getOffset().longValue());
      od.setOriginalAnnotation(originalAnnot);
      od.setNewAnnotation(annot);
      addNewOffsetDetails(docID, od);
      break;
    }
  }
}
/**
 * This method is called when the HTML parser encounters the beginning of a tag
 * that is paired with an end tag, i.e. not an empty one.
 */
@Override
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
  // fire the status listener if the number of processed elements exceeds the rate
  if (0 == (++elements % ELEMENTS_RATE))
    fireStatusChangedEvent("Processed elements : " + elements);
  // start of STYLE tag
  if (HTML.Tag.STYLE.equals(t)) {
    isInsideStyleTag = true;
  } // if
  // construct a feature map from the attributes list
  FeatureMap fm = Factory.newFeatureMap();
  // take all the attributes and put them into the feature map
  if (0 != a.getAttributeCount()) {
    Enumeration<?> enumeration = a.getAttributeNames();
    while (enumeration.hasMoreElements()) {
      Object attribute = enumeration.nextElement();
      fm.put(attribute.toString(), (a.getAttribute(attribute)).toString());
    } // while
  } // if
  // analyze the tag t and add some \n chars and spaces to tmpDocContent;
  // the reason is that we need a readable form of the final document
  customizeAppearanceOfDocumentWithStartTag(t);
  // if "tmpDocContent" so far ends with a NON-whitespace char, add a space
  // char before calculating the START index of this tag; this is done so
  // that the content of two separate tags is not concatenated into a
  // different NEW word
  int tmpDocContentSize = tmpDocContent.length();
  if (tmpDocContentSize != 0
      && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1)))
    tmpDocContent.append(" ");
  // create the start index of the annotation
  Long startIndex = new Long(tmpDocContent.length());
  // initially the start index is equal to the end index
  CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex);
  // put it onto the stack
  stack.push(obj);
} // handleStartTag
private gate.Document generateGATEDocFromLocalDump(BehemothDocument inputDoc)
    throws ResourceInstantiationException, IOException {
  // NOTE: an earlier attempt dumped the binary content to a temporary file
  // under hadoop.tmp.dir and loaded it via a URL, but that could not be made
  // to work; instead the content is passed to GATE as a string parameter
  FeatureMap params = Factory.newFeatureMap();
  params.put(
      Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, new String(inputDoc.getContent()));
  String ct = inputDoc.getContentType();
  if (ct != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, ct);
  return (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
}
@Override
public int hashCode() {
  final int prime = 31;
  int result = 1;
  result = prime * result + ((features == null) ? 0 : features.hashCode());
  result = prime * result + Arrays.hashCode(members);
  result = prime * result + ((type == null) ? 0 : type.hashCode());
  result = prime * result + ((userData == null) ? 0 : userData.hashCode());
  return result;
}
@Override
public Object getObject() throws IOException {
  ensureGateInit();
  FeatureMap fm = Factory.newFeatureMap();
  if (sourceMap != null) {
    for (Map.Entry<Object, Object> entry : sourceMap.entrySet()) {
      Object key = entry.getKey();
      Object value = entry.getValue();
      // convert Spring resources to URLs
      if (value instanceof Resource) {
        value = SpringFactory.resourceToUrl((Resource) value);
      }
      fm.put(key, value);
    }
  }
  return fm;
}
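// Direct (non-Spring) usage sketch for the factory above. Hypothetical: the
// class name FeatureMapFactoryBean and the setSourceMap setter are assumed
// from the sourceMap field used in getObject(); in a Spring context the bean
// would normally be wired from configuration instead.
public static void featureMapFactoryExample() throws IOException {
  Map<Object, Object> source = new HashMap<Object, Object>();
  source.put("encoding", "UTF-8");
  FeatureMapFactoryBean factory = new FeatureMapFactoryBean();
  factory.setSourceMap(source);
  FeatureMap fm = (FeatureMap) factory.getObject();
  System.out.println(fm.get("encoding"));
}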
/**
 * Loads the configuration file and corpus for testing, and applies the same
 * settings as in the GATE GUI.
 */
void loadSettings(String configFileName, String corpusDirName, String inputasN, String outputasN)
    throws GateException, IOException {
  LogService.minVerbosityLevel = 0;
  if (LogService.minVerbosityLevel > 0)
    System.out.println("Learning Home : " + learningHome.getAbsolutePath());
  FeatureMap parameters = Factory.newFeatureMap();
  URL configFileURL = new File(configFileName).toURI().toURL();
  parameters.put("configFileURL", configFileURL);
  learningApi =
      (LearningAPIMain) Factory.createResource("gate.learning.LearningAPIMain", parameters);
  // load the corpus, adding the XML documents in alphabetical order
  corpus = Factory.newCorpus("DataSet");
  ExtensionFileFilter fileFilter = new ExtensionFileFilter();
  fileFilter.addExtension("xml");
  File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
  Arrays.sort(
      xmlFiles,
      new Comparator<File>() {
        public int compare(File a, File b) {
          return a.getName().compareTo(b.getName());
        }
      });
  for (File f : xmlFiles) {
    if (!f.isDirectory()) {
      Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
      doc.setName(f.getName());
      corpus.add(doc);
    }
  }
  // set the input and output annotation sets
  learningApi.setInputASName(inputasN);
  learningApi.setOutputASName(outputasN);
  controller =
      (gate.creole.SerialAnalyserController)
          Factory.createResource("gate.creole.SerialAnalyserController");
  controller.setCorpus(corpus);
  controller.add(learningApi);
}
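// Test-driver sketch for loadSettings. Hypothetical: the config path, corpus
// directory and annotation set names are examples only; assumes this runs in
// the same class, so loadSettings and the controller field are in scope.
void runLearningExample() throws GateException, IOException {
  loadSettings("learning-config.xml", "corpus/train", "Key", "ML");
  controller.execute(); // runs the LearningAPIMain PR over the loaded corpus
}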
/**
 * Renames annotations of one type to another.
 *
 * @param outputAS output annotation set
 * @param oldType old annotation name
 * @param newType new annotation name
 */
private void renameAnnotations(AnnotationSet outputAS, String oldType, String newType) {
  AnnotationSet tmpAnatomyAS = outputAS.get(oldType);
  for (Annotation tmpAnn : tmpAnatomyAS) {
    Long startOffset = tmpAnn.getStartNode().getOffset();
    Long endOffset = tmpAnn.getEndNode().getOffset();
    AnnotationSet existingAS = outputAS.getCovering(newType, startOffset, endOffset);
    // if we've already got an annotation of the same name in the same place,
    // don't add a new one; just delete the old one
    if (existingAS.isEmpty()) {
      FeatureMap tmpFm = tmpAnn.getFeatures();
      FeatureMap fm = Factory.newFeatureMap();
      fm.putAll(tmpFm);
      try {
        outputAS.add(startOffset, endOffset, newType, fm);
        outputAS.remove(tmpAnn);
      } catch (InvalidOffsetException ie) {
        // shouldn't happen
      }
    } else {
      outputAS.remove(tmpAnn);
    }
  }
}
public void execute() throws ExecutionException {
  AnnotationSet outputAS = document.getAnnotations(annotationSetName);
  String text = document.getContent().toString();
  Span[] tokens = tokenizer.getTokens(text);
  try {
    for (Span token : tokens) {
      FeatureMap features = Factory.newFeatureMap();
      features.put(
          ANNIEConstants.TOKEN_STRING_FEATURE_NAME,
          text.substring(token.getStart(), token.getEnd()));
      outputAS.add(
          (long) token.getStart(),
          (long) token.getEnd(),
          ANNIEConstants.TOKEN_ANNOTATION_TYPE,
          features);
    }
  } catch (Exception e) {
    throw new ExecutionException("error running tokenizer", e);
  }
}
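// Result sketch: after the PR above has run over a document, each Token
// annotation carries the matched string. Assumes it is called from the same
// class, so the annotationSetName field is in scope.
void printTokensExample(Document doc) {
  for (Annotation tok :
      doc.getAnnotations(annotationSetName).get(ANNIEConstants.TOKEN_ANNOTATION_TYPE)) {
    System.out.println(tok.getFeatures().get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME));
  }
}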
/** This method is called when the HTML parser encounters an empty tag. */
@Override
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
  // fire the status listener if the number of processed elements exceeds the rate
  if ((++elements % ELEMENTS_RATE) == 0)
    fireStatusChangedEvent("Processed elements : " + elements);
  // construct a feature map from the attributes list (these are empty elements)
  FeatureMap fm = Factory.newFeatureMap();
  // take all the attributes and put them into the feature map
  if (0 != a.getAttributeCount()) {
    Enumeration<?> enumeration = a.getAttributeNames();
    while (enumeration.hasMoreElements()) {
      Object attribute = enumeration.nextElement();
      fm.put(attribute.toString(), (a.getAttribute(attribute)).toString());
    } // while
  } // if
  // create the start index of the annotation
  Long startIndex = new Long(tmpDocContent.length());
  // initially the start index is equal to the end index
  CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex);
  // add the object directly into the collector list, not onto the stack,
  // because this is an empty tag
  colector.add(obj);
  // analyze the tag t and add some \n chars and spaces to tmpDocContent;
  // the reason is that we need a readable form of the final document
  customizeAppearanceOfDocumentWithSimpleTag(t);
} // handleSimpleTag
/**
 * @param inputAS input annotation set
 * @param outputAS output annotation set
 * @param term String matched
 * @param outputASType annotation type to create
 * @param startOffset match start offset
 * @param endOffset match end offset
 * @param useNounChunk whether to expand the match to a covering noun chunk
 */
private void addLookup(
    AnnotationSet inputAS,
    AnnotationSet outputAS,
    String term,
    String outputASType,
    Long startOffset,
    Long endOffset,
    boolean useNounChunk) {
  if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) {
    AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset);
    if (!nounChunkAS.isEmpty()) {
      startOffset = nounChunkAS.firstNode().getOffset();
      endOffset = nounChunkAS.lastNode().getOffset();
    }
  }
  try {
    AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset);
    if (diseaseAS.isEmpty()) {
      FeatureMap fm = Factory.newFeatureMap();
      fm.put("match", term);
      outputAS.add(startOffset, endOffset, outputASType, fm);
    } else {
      // append the new term to the existing annotation's "match" feature
      Annotation disease = diseaseAS.iterator().next();
      FeatureMap fm = disease.getFeatures();
      String meta = (String) fm.get("match");
      if (meta != null) {
        meta = meta + " " + term;
      } else {
        meta = term; // avoid storing a null match value
      }
      fm.put("match", meta);
    }
  } catch (InvalidOffsetException ie) {
    // shouldn't happen
    gate.util.Err.println(ie);
  }
}
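// Illustrative behaviour of addLookup (hypothetical offsets, types and
// annotation sets): a second overlapping match appends to the existing
// "match" feature rather than creating a duplicate annotation.
//
//   addLookup(inAS, outAS, "aspirin", "Drug", 10L, 17L, false);
//     -> new Drug annotation [10,17) with match="aspirin"
//   addLookup(inAS, outAS, "ASA", "Drug", 10L, 17L, false);
//     -> existing annotation's match becomes "aspirin ASA"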
@Override
public boolean equals(Object obj) {
  if (this == obj) return true;
  if (obj == null) return false;
  if (getClass() != obj.getClass()) return false;
  SimpleRelation other = (SimpleRelation) obj;
  if (features == null) {
    if (other.features != null) return false;
  } else if (!features.equals(other.features)) return false;
  if (!Arrays.equals(members, other.members)) return false;
  if (type == null) {
    if (other.type != null) return false;
  } else if (!type.equals(other.type)) return false;
  if (userData == null) {
    if (other.userData != null) return false;
  } else if (!userData.equals(other.userData)) return false;
  return true;
}
/**
 * Populates this Persistence with the data that needs to be stored from the original source
 * object.
 */
@Override
public void extractDataFromSource(Object source) throws PersistenceException {
  if (!(source instanceof ProcessingResource)) {
    throw new UnsupportedOperationException(
        getClass().getName()
            + " can only be used for "
            + ProcessingResource.class.getName()
            + " objects!\n"
            + source.getClass().getName()
            + " is not a "
            + ProcessingResource.class.getName());
  }
  super.extractDataFromSource(source);
  Resource res = (Resource) source;
  ResourceData rData = Gate.getCreoleRegister().get(res.getClass().getName());
  if (rData == null)
    throw new PersistenceException("Could not find CREOLE data for " + res.getClass().getName());
  // now get the runtime params
  ParameterList params = rData.getParameterList();
  try {
    // get the values for the runtime parameters
    runtimeParams = Factory.newFeatureMap();
    // this is a list of lists (disjunctions of parameters)
    Iterator<List<Parameter>> parDisjIter = params.getRuntimeParameters().iterator();
    while (parDisjIter.hasNext()) {
      Iterator<Parameter> parIter = parDisjIter.next().iterator();
      while (parIter.hasNext()) {
        Parameter parameter = parIter.next();
        String parName = parameter.getName();
        Object parValue = res.getParameterValue(parName);
        ((FeatureMap) runtimeParams).put(parName, parValue);
      }
    }
    runtimeParams = PersistenceManager.getPersistentRepresentation(runtimeParams);
  } catch (ResourceInstantiationException rie) {
    throw new PersistenceException(rie);
  }
}
/*
 * (non-Javadoc)
 *
 * @see java.lang.Object#toString()
 */
@Override
public String toString() {
  StringBuilder str = new StringBuilder();
  str.append(id).append(": ");
  // escape literal parentheses in the type name
  String typeOut =
      type.replaceAll("\\(", Matcher.quoteReplacement("\\("))
          .replaceAll("\\)", Matcher.quoteReplacement("\\)"));
  str.append(typeOut).append("(");
  for (int i = 0; i < members.length; i++) {
    if (i > 0) str.append(", ");
    str.append(members[i]);
  }
  str.append(")");
  if (features != null) {
    str.append("#").append(features.toString());
  }
  if (userData != null) {
    str.append("#").append(userData.toString());
  }
  return str.toString();
}
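// Rendering sketch for toString() above (hypothetical values): a relation
// with id 3, type "coref" and members 12 and 45 would print as
//   3: coref(12, 45)
// followed by "#" + features and "#" + userData when those are non-null.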
/**
 * Creates the Lookup annotations according to a gazetteer match.
 *
 * @param matchingState the final FSMState that was reached while matching.
 * @param matchedRegionStart the start of the matched text region.
 * @param matchedRegionEnd the end of the matched text region.
 * @param annotationSet the annotation set where the new annotations should be added.
 */
protected void createLookups(
    FSMState matchingState,
    long matchedRegionStart,
    long matchedRegionEnd,
    AnnotationSet annotationSet) {
  Iterator lookupIter = matchingState.getLookupSet().iterator();
  while (lookupIter.hasNext()) {
    Lookup currentLookup = (Lookup) lookupIter.next();
    FeatureMap fm = Factory.newFeatureMap();
    fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
    if (null != currentLookup.oClass && null != currentLookup.ontology) {
      fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
      fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
    }
    if (null != currentLookup.minorType)
      fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
    if (null != currentLookup.languages)
      fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
    if (null != currentLookup.features) {
      fm.putAll(currentLookup.features);
    }
    try {
      annotationSet.add(
          new Long(matchedRegionStart),
          new Long(matchedRegionEnd + 1),
          currentLookup.annotationType, // defaults to Lookup unless overridden
          fm);
    } catch (InvalidOffsetException ioe) {
      throw new GateRuntimeException(ioe.toString());
    }
  } // while(lookupIter.hasNext())
}
@Test
public void testAddFeatureStemmingEnabled() {
  Annotation mockedAnnot1 = Mockito.mock(Annotation.class);
  Annotation mockedAnnot2 = Mockito.mock(Annotation.class);
  FeatureMap mockedMap1 = Mockito.mock(FeatureMap.class);
  FeatureMap mockedMap2 = Mockito.mock(FeatureMap.class);
  Node startNode = Mockito.mock(Node.class);
  Node endNode = Mockito.mock(Node.class);
  String wholeSentence = "First Second Third Fourth.";

  Mockito.when(startNode.getOffset()).thenReturn((long) 0);
  Mockito.when(endNode.getOffset()).thenReturn((long) 11);

  Mockito.when(mockedAnnot1.getFeatures()).thenReturn(mockedMap1);
  Mockito.when(mockedMap1.get("string")).thenReturn("First");
  Mockito.when(mockedMap1.get("stem")).thenReturn("stem1");
  Mockito.when(mockedAnnot1.getStartNode()).thenReturn(startNode);

  Mockito.when(mockedAnnot2.getFeatures()).thenReturn(mockedMap2);
  Mockito.when(mockedMap2.get("string")).thenReturn("Second");
  Mockito.when(mockedMap2.get("stem")).thenReturn("stem2");
  Mockito.when(mockedAnnot2.getEndNode()).thenReturn(endNode);

  Document gateDocument = Mockito.mock(Document.class);
  Mockito.when(gateDocument.getName()).thenReturn("doc1");

  ArrayList<Annotation> featureAnnots = new ArrayList<Annotation>();
  featureAnnots.add(mockedAnnot1);
  featureAnnots.add(mockedAnnot2);
  Mockito.when(options.isEnableStemming()).thenReturn(true);

  String featureString = "First Second";
  String featureStem = "stem1 stem2";
  featureContainer.addFeature(featureAnnots, wholeSentence, gateDocument, "content");

  Assert.assertTrue(featureContainer.getFeatureDictionary().get(featureString) != null);
  Assert.assertTrue(featureContainer.getFeatureStorage().get(featureStem) != null);
}
/**
 * Run from the command-line, with a list of URLs as argument.
 *
 * <p><B>NOTE:</B><br>
 * This code will run with all the documents in memory - if you want to unload each from memory
 * after use, add code to store the corpus in a DataStore.
 */
public static void main(String args[]) throws GateException, IOException {
  // initialise the GATE library
  Out.prln("Initialising GATE...");
  Gate.init();
  Out.prln("...GATE initialised");

  // initialise ANNIE (this may take several minutes)
  StandAloneAnnie annie = new StandAloneAnnie();
  annie.initAnnie();

  // create a GATE corpus and add a document for each command-line argument
  Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
  for (int i = 0; i < args.length; i++) {
    URL u = new URL(args[i]);
    FeatureMap params = Factory.newFeatureMap();
    params.put("sourceUrl", u);
    params.put("preserveOriginalContent", new Boolean(true));
    params.put("collectRepositioningInfo", new Boolean(true));
    Out.prln("Creating doc for " + u);
    Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
    corpus.add(doc);
  } // for each of args

  // tell the pipeline about the corpus and run it
  annie.setCorpus(corpus);
  annie.execute();

  // for each document, get an XML document with the person and location names added
  Iterator iter = corpus.iterator();
  int count = 0;
  String startTagPart_1 = "<span GateID=\"";
  String startTagPart_2 = "\" title=\"";
  String startTagPart_3 = "\" style=\"background:Red;\">";
  String endTag = "</span>";

  while (iter.hasNext()) {
    Document doc = (Document) iter.next();
    AnnotationSet defaultAnnotSet = doc.getAnnotations();
    Set annotTypesRequired = new HashSet();
    annotTypesRequired.add("Person");
    annotTypesRequired.add("Location");
    Set<Annotation> peopleAndPlaces =
        new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));
    FeatureMap features = doc.getFeatures();
    String originalContent =
        (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    RepositioningInfo info =
        (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
    ++count;
    File file = new File("StANNIE_" + count + ".HTML");
    Out.prln("File name: '" + file.getAbsolutePath() + "'");
    if (originalContent != null && info != null) {
      Out.prln("OrigContent and reposInfo existing. Generate file...");
      Iterator it = peopleAndPlaces.iterator();
      Annotation currAnnot;
      SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
      while (it.hasNext()) {
        currAnnot = (Annotation) it.next();
        sortedAnnotations.addSortedExclusive(currAnnot);
      } // while
      StringBuffer editableContent = new StringBuffer(originalContent);
      long insertPositionEnd;
      long insertPositionStart;
      // insert annotation tags backwards
      Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
      Out.prln("Sorted annotations count: " + sortedAnnotations.size());
      for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
        currAnnot = (Annotation) sortedAnnotations.get(i);
        insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
        insertPositionStart = info.getOriginalPos(insertPositionStart);
        insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
        insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
        if (insertPositionEnd != -1 && insertPositionStart != -1) {
          editableContent.insert((int) insertPositionEnd, endTag);
          editableContent.insert((int) insertPositionStart, startTagPart_3);
          editableContent.insert((int) insertPositionStart, currAnnot.getType());
          editableContent.insert((int) insertPositionStart, startTagPart_2);
          editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
          editableContent.insert((int) insertPositionStart, startTagPart_1);
        } // if
      } // for
      FileWriter writer = new FileWriter(file);
      writer.write(editableContent.toString());
      writer.close();
    } // if - should generate
    else if (originalContent != null) {
      Out.prln("OrigContent existing. Generate file...");
      Iterator it = peopleAndPlaces.iterator();
      Annotation currAnnot;
      SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
      while (it.hasNext()) {
        currAnnot = (Annotation) it.next();
        sortedAnnotations.addSortedExclusive(currAnnot);
      } // while
      StringBuffer editableContent = new StringBuffer(originalContent);
      long insertPositionEnd;
      long insertPositionStart;
      // insert annotation tags backwards
      Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
      Out.prln("Sorted annotations count: " + sortedAnnotations.size());
      for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
        currAnnot = (Annotation) sortedAnnotations.get(i);
        insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
        insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
        if (insertPositionEnd != -1 && insertPositionStart != -1) {
          editableContent.insert((int) insertPositionEnd, endTag);
          editableContent.insert((int) insertPositionStart, startTagPart_3);
          editableContent.insert((int) insertPositionStart, currAnnot.getType());
          editableContent.insert((int) insertPositionStart, startTagPart_2);
          editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
          editableContent.insert((int) insertPositionStart, startTagPart_1);
        } // if
      } // for
      FileWriter writer = new FileWriter(file);
      writer.write(editableContent.toString());
      writer.close();
    } else {
      Out.prln("Content : " + originalContent);
      Out.prln("Repositioning: " + info);
    }
    String xmlDocument = doc.toXml(peopleAndPlaces, false);
    String fileName = "StANNIE_toXML_" + count + ".HTML";
    FileWriter writer = new FileWriter(fileName);
    writer.write(xmlDocument);
    writer.close();
  } // for each doc
} // main
// carry out the actual annotations on the given span of text in the document
protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) {
  String text = "";
  try {
    text = doc.getContent().getContent(from, to).toString();
  } catch (InvalidOffsetException ex) {
    throw new GateRuntimeException("Unexpected offset exception, offsets are " + from + "/" + to);
  }
  // send the text to the service and get back the response
  // NOTE: there is a bug in the TagMe service which causes offset errors
  // if we use the tweet mode and there are certain patterns in the tweet.
  // The approach recommended by Francesco Piccinno is to replace those
  // patterns by spaces.
  if (getIsTweet()) {
    logger.debug("Text before cleaning: >>" + text + "<<");
    text = text.replaceAll(patternStringRT3, " ");
    text = text.replaceAll(patternStringRT2, " ");
    text = text.replaceAll(patternHashTag, " $1");
    // now replace the remaining patterns by spaces, preserving offsets
    StringBuilder sb = new StringBuilder(text);
    Matcher m = patternUrl.matcher(text);
    while (m.find()) {
      int start = m.start();
      int end = m.end();
      sb.replace(start, end, nSpaces(end - start));
    }
    m = patternUser.matcher(text);
    while (m.find()) {
      int start = m.start();
      int end = m.end();
      sb.replace(start, end, nSpaces(end - start));
    }
    text = sb.toString();
    logger.debug("Text after cleaning: >>" + text + "<<");
  }
  TagMeAnnotation[] tagmeAnnotations = getTagMeAnnotations(text);
  for (TagMeAnnotation tagmeAnn : tagmeAnnotations) {
    if (tagmeAnn.rho >= minrho) {
      FeatureMap fm = Factory.newFeatureMap();
      fm.put("tagMeId", tagmeAnn.id);
      fm.put("title", tagmeAnn.title);
      fm.put("rho", tagmeAnn.rho);
      fm.put("spot", tagmeAnn.spot);
      fm.put("link_probability", tagmeAnn.link_probability);
      if (tagmeAnn.title == null) {
        throw new GateRuntimeException("Odd: got a null title from the TagMe service" + tagmeAnn);
      } else {
        fm.put("inst", "http://dbpedia.org/resource/" + recodeForDbp38(tagmeAnn.title));
      }
      try {
        gate.Utils.addAnn(
            outputAS, from + tagmeAnn.start, from + tagmeAnn.end, getOutputAnnotationType(), fm);
      } catch (Exception ex) {
        System.err.println(
            "Got an exception in document " + doc.getName() + ": " + ex.getLocalizedMessage());
        ex.printStackTrace(System.err);
        System.err.println("from=" + from + ", to=" + to + " TagMeAnn=" + tagmeAnn);
      }
    }
  }
}
@Override
public Resource init() throws ResourceInstantiationException {
  gracefulExit = false;
  if (configFileURL == null) {
    gracefulExit = true;
    gate.util.Err.println("No configuration file provided!");
  }
  if (japeURL == null) {
    gracefulExit = true;
    gate.util.Err.println("No JAPE grammar file provided!");
  }
  // create the init params for the JAPE transducer
  FeatureMap params = Factory.newFeatureMap();
  params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, japeURL);
  // Code borrowed from Mark Greenwood's Measurements PR
  if (japeTransducer == null) {
    // if this is the first time we are running init then actually create a
    // new transducer as we don't already have one
    FeatureMap hidden = Factory.newFeatureMap();
    Gate.setHiddenAttribute(hidden, true);
    japeTransducer =
        (Transducer) Factory.createResource("gate.creole.Transducer", params, hidden);
  } else {
    // we are being run through a call to reInit so simply re-init the
    // underlying JAPE transducer
    japeTransducer.setParameterValues(params);
    japeTransducer.reInit();
  }
  ConfigReader config = new ConfigReader(configFileURL);
  gracefulExit = config.config();
  try {
    HashMap<String, String> options = config.getOptions();
    patternMap = new HashMap<String, Pattern>();
    addSuffixPattern("disease_suffix", options);
    addWordPattern("disease_abbrevs", options);
    addWordPattern("disease_sense", options);
    addWordExtraPattern("disease_sense_context", options);
    addPossessiveWordPattern("disease_named_syndrome", options);
    addWordExtraPattern("disease_generic_context", options);
    addWordExtraPattern("disease_anatomy_context", options);
    addSuffixPluralPattern("procedure_suffix", options);
    addWordPluralPattern("procedure_key", options);
    addWordExtraPattern("procedure_anatomy_context", options);
    addWordPluralPattern("symptom_key", options);
    addWordPattern("test_key", options);
    addSuffixPattern("anatomy_suffix_adjective", options);
    addSuffixPattern("anatomy_suffix", options);
    addPrefixPattern("anatomy_prefix", options);
    addWordPattern("anatomy_position", options);
    addWordPluralPattern("anatomy_space_region_junction", options);
    addWordPattern("anatomy_part_adjective", options);
    addWordPattern("anatomy_latin_noun", options);
    addWordPattern("anatomy_muscle", options);
    addWordPluralPattern("anatomy_part", options);
    addWordPluralPattern("anatomy_fluid", options);
  } catch (NullPointerException ne) {
    gracefulExit = true;
    gate.util.Err.println(
        "Missing or unset configuration options. Please check configuration file.");
  }
  return this;
} // end init()
/**
 * @param uri - namespace uri
 * @param localName - local, unprefixed element name
 * @param qName - fully qualified, prefixed element name
 * @param atts - the element's attributes
 * @throws SAXException
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes atts)
    throws SAXException {
  // flush any pending character data first
  if (readCharacterStatus) {
    readCharacterStatus = false;
    charactersAction(new String(contentBuffer).toCharArray(), 0, contentBuffer.length());
  }
  // inform the progress listener, firing only when the number of elements
  // processed so far is a multiple of ELEMENTS_RATE
  if ((++elements % ELEMENTS_RATE) == 0) {
    fireStatusChangedEvent("Processed elements : " + elements);
  }
  Integer customObjectId = null;
  // construct a SimpleFeatureMapImpl from the list of attributes
  FeatureMap fm = Factory.newFeatureMap();
  // use localName rather than qName, and add the namespace prefix and uri
  // as features, if the global flag is set
  String elemName = qName;
  boolean hasNSUri = (uri != null && !uri.isEmpty());
  if (deserializeNamespaceInfo && hasNSUri) {
    elemName = localName;
    StringTokenizer strToken = new StringTokenizer(qName, ":");
    if (strToken.countTokens() > 1) {
      String nsPrefix = strToken.nextToken();
      fm.put(namespaceURIFeature, uri);
      fm.put(namespacePrefixFeature, nsPrefix);
    }
  }
  // get the name and value of each attribute and add them to the feature map
  for (int i = 0; i < atts.getLength(); i++) {
    String attName = atts.getLocalName(i);
    String attValue = atts.getValue(i);
    String attUri = atts.getURI(i);
    if (attUri != null && Gate.URI.equals(attUri)) {
      if ("gateId".equals(attName)) {
        customObjectId = new Integer(attValue);
      } // End if
      if ("annotMaxId".equals(attName)) {
        customObjectsId = new Integer(attValue).intValue();
      } // End if
      if ("matches".equals(attName)) {
        StringTokenizer strTokenizer = new StringTokenizer(attValue, ";");
        List<Integer> list = new ArrayList<Integer>();
        // take all tokens, create Integers and add them to the list
        while (strTokenizer.hasMoreTokens()) {
          String token = strTokenizer.nextToken();
          list.add(new Integer(token));
        } // End while
        fm.put(attName, list);
      } // End if
    } else {
      fm.put(atts.getQName(i), attValue);
    } // End if
  } // End for
  // create the START index of the annotation
  Long startIndex = new Long(tmpDocContent.length());
  // initially the start index is equal to the end index
  CustomObject obj = new CustomObject(customObjectId, elemName, fm, startIndex, startIndex);
  // put this object onto the stack
  stack.push(obj);
} // startElement()