/**
 * Collects one string per annotation of the requested type found in the default annotation set
 * of a document.
 *
 * <p>The document is looked up through {@code getDocument(docID)} and each matching annotation is
 * converted with {@code stringFor(doc, annotation)}; the results are returned in the iteration
 * order of the annotation set.
 *
 * @param docID identifier handed to {@code getDocument} to obtain the document
 * @param type the annotation type to select from the document's default annotation set
 * @return a new list holding the string produced for every matching annotation
 * @throws ResourceInstantiationException declared by this method; presumably propagated from
 *     {@code getDocument} - confirm against that helper
 */
public ArrayList<String> getStringFromAnnotation(Object docID, String type)
    throws ResourceInstantiationException {
  final Document source = getDocument(docID);
  final ArrayList<String> texts = new ArrayList<String>();
  for (Annotation annotation : source.getAnnotations().get(type)) {
    texts.add(stringFor(source, annotation));
  }
  return texts;
}
/** * Checks two documents for equality. * * @param doc1 a document * @param doc2 another document * @return a boolean. */ public static boolean documentsEqual(Document doc1, Document doc2) { message = ""; if (doc1 == null ^ doc2 == null) { message = "Documents not equal: null<>non-null!"; return false; } if (doc1 == null) return true; if (!check(doc1.getContent(), doc2.getContent())) { message = "Document contents different!"; return false; } if (!check(doc1.getAnnotations(), doc2.getAnnotations())) { message = "Documents default AS not equal!"; return false; } if (doc1 instanceof TextualDocument) { if (doc2 instanceof TextualDocument) { if (!check( ((TextualDocument) doc1).getEncoding(), ((TextualDocument) doc2).getEncoding())) { message = "Textual documents with different encodings!"; return false; } } else { message = "Documents not equal: textual<>non-textual!"; return false; } } if (!check(doc1.getFeatures(), doc2.getFeatures())) { message = "Documents features not equal!"; return false; } // needs friend declaration :( // if(!markupAware.equals(doc.markupAware)) return false; if (!check(doc1.getNamedAnnotationSets(), doc2.getNamedAnnotationSets())) { message = "Documents named annots not equal!"; return false; } // if(doc1 instanceof DocumentImpl){ // if(doc2 instanceof DocumentImpl){ // if(! check(((DocumentImpl)doc1).getNextNodeId(), // ((DocumentImpl)doc2).getNextNodeId())){ // message = "Documents next nodeID not equal!"; // return false; // } // if(! 
check(((DocumentImpl)doc1).getNextAnnotationId(), // ((DocumentImpl)doc2).getNextAnnotationId())){ // message = "Documents next annotationIDs not equal!"; // return false; // } // }else{ // message = "Documents not equal: DocumentImpl<>non-DocumentImpl!"; // return false; // } // } if (!check(doc1.getSourceUrl(), doc2.getSourceUrl())) { message = "Documents sourceURLs not equal!"; return false; } if (!(check(doc1.getSourceUrlStartOffset(), doc2.getSourceUrlStartOffset()) && check(doc1.getSourceUrlEndOffset(), doc2.getSourceUrlEndOffset()))) { message = "Documents sourceURLOffsets not equal!"; return false; } return true; }
/** * Run from the command-line, with a list of URLs as argument. * * <p><B>NOTE:</B><br> * This code will run with all the documents in memory - if you want to unload each from memory * after use, add code to store the corpus in a DataStore. */ public static void main(String args[]) throws GateException, IOException { // initialise the GATE library Out.prln("Initialising GATE..."); Gate.init(); Out.prln("...GATE initialised"); // initialise ANNIE (this may take several minutes) StandAloneAnnie annie = new StandAloneAnnie(); annie.initAnnie(); // create a GATE corpus and add a document for each command-line // argument Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus"); for (int i = 0; i < args.length; i++) { URL u = new URL(args[i]); FeatureMap params = Factory.newFeatureMap(); params.put("sourceUrl", u); params.put("preserveOriginalContent", new Boolean(true)); params.put("collectRepositioningInfo", new Boolean(true)); Out.prln("Creating doc for " + u); Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params); corpus.add(doc); } // for each of args // tell the pipeline about the corpus and run it annie.setCorpus(corpus); annie.execute(); // for each document, get an XML document with the // person and location names added Iterator iter = corpus.iterator(); int count = 0; String startTagPart_1 = "<span GateID=\""; String startTagPart_2 = "\" title=\""; String startTagPart_3 = "\" style=\"background:Red;\">"; String endTag = "</span>"; while (iter.hasNext()) { Document doc = (Document) iter.next(); AnnotationSet defaultAnnotSet = doc.getAnnotations(); Set annotTypesRequired = new HashSet(); annotTypesRequired.add("Person"); annotTypesRequired.add("Location"); Set<Annotation> peopleAndPlaces = new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired)); FeatureMap features = doc.getFeatures(); String originalContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); RepositioningInfo info = 
(RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME); ++count; File file = new File("StANNIE_" + count + ".HTML"); Out.prln("File name: '" + file.getAbsolutePath() + "'"); if (originalContent != null && info != null) { Out.prln("OrigContent and reposInfo existing. Generate file..."); Iterator it = peopleAndPlaces.iterator(); Annotation currAnnot; SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); while (it.hasNext()) { currAnnot = (Annotation) it.next(); sortedAnnotations.addSortedExclusive(currAnnot); } // while StringBuffer editableContent = new StringBuffer(originalContent); long insertPositionEnd; long insertPositionStart; // insert anotation tags backward Out.prln("Unsorted annotations count: " + peopleAndPlaces.size()); Out.prln("Sorted annotations count: " + sortedAnnotations.size()); for (int i = sortedAnnotations.size() - 1; i >= 0; --i) { currAnnot = (Annotation) sortedAnnotations.get(i); insertPositionStart = currAnnot.getStartNode().getOffset().longValue(); insertPositionStart = info.getOriginalPos(insertPositionStart); insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); insertPositionEnd = info.getOriginalPos(insertPositionEnd, true); if (insertPositionEnd != -1 && insertPositionStart != -1) { editableContent.insert((int) insertPositionEnd, endTag); editableContent.insert((int) insertPositionStart, startTagPart_3); editableContent.insert((int) insertPositionStart, currAnnot.getType()); editableContent.insert((int) insertPositionStart, startTagPart_2); editableContent.insert((int) insertPositionStart, currAnnot.getId().toString()); editableContent.insert((int) insertPositionStart, startTagPart_1); } // if } // for FileWriter writer = new FileWriter(file); writer.write(editableContent.toString()); writer.close(); } // if - should generate else if (originalContent != null) { Out.prln("OrigContent existing. 
Generate file..."); Iterator it = peopleAndPlaces.iterator(); Annotation currAnnot; SortedAnnotationList sortedAnnotations = new SortedAnnotationList(); while (it.hasNext()) { currAnnot = (Annotation) it.next(); sortedAnnotations.addSortedExclusive(currAnnot); } // while StringBuffer editableContent = new StringBuffer(originalContent); long insertPositionEnd; long insertPositionStart; // insert anotation tags backward Out.prln("Unsorted annotations count: " + peopleAndPlaces.size()); Out.prln("Sorted annotations count: " + sortedAnnotations.size()); for (int i = sortedAnnotations.size() - 1; i >= 0; --i) { currAnnot = (Annotation) sortedAnnotations.get(i); insertPositionStart = currAnnot.getStartNode().getOffset().longValue(); insertPositionEnd = currAnnot.getEndNode().getOffset().longValue(); if (insertPositionEnd != -1 && insertPositionStart != -1) { editableContent.insert((int) insertPositionEnd, endTag); editableContent.insert((int) insertPositionStart, startTagPart_3); editableContent.insert((int) insertPositionStart, currAnnot.getType()); editableContent.insert((int) insertPositionStart, startTagPart_2); editableContent.insert((int) insertPositionStart, currAnnot.getId().toString()); editableContent.insert((int) insertPositionStart, startTagPart_1); } // if } // for FileWriter writer = new FileWriter(file); writer.write(editableContent.toString()); writer.close(); } else { Out.prln("Content : " + originalContent); Out.prln("Repositioning: " + info); } String xmlDocument = doc.toXml(peopleAndPlaces, false); String fileName = new String("StANNIE_toXML_" + count + ".HTML"); FileWriter writer = new FileWriter(fileName); writer.write(xmlDocument); writer.close(); } // for each doc } // main
/** * This method annotates paragraphs in a GATE document. The investigated text spans beetween start * and end offsets and the paragraph annotations are created in the annotSetName. If annotSetName * is null then they are creted in the default annotation set. * * @param aDoc is the gate document on which the paragraph detection would be performed.If it is * null or its content it's null then the method woul simply return doing nothing. * @param startOffset is the index form the document content from which the paragraph detection * will start * @param endOffset is the offset where the detection will end. * @param annotSetName is the name of the set in which paragraph annotation would be created.The * annotation type created will be "paragraph" */ public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName) throws DocumentFormatException { // Simply return if the document is null or its content if (aDoc == null || aDoc.getContent() == null) return; // Simply return if the start is > than the end if (startOffset > endOffset) return; // Decide where to put the newly detected annotations AnnotationSet annotSet = null; if (annotSetName == null) annotSet = aDoc.getAnnotations(); else annotSet = aDoc.getAnnotations(annotSetName); // Extract the document content String content = aDoc.getContent().toString(); // This is the offset marking the start of a para int startOffsetPara = startOffset; // This marks the ned of a para int endOffsetPara = endOffset; // The initial sate of the FSA int state = 1; // This field marks that a BR entity was read // A BR entity can be NL or NL CR, depending on the operating system (UNIX // or DOS) boolean readBR = false; int index = startOffset; while (index < endOffset) { // Read the current char char ch = content.charAt(index); // Test if a BR entity was read if (ch == '\n') { readBR = true; // If \n is followed by a \r then advance the index in order to read a // BR entity while ((index + 1 < 
endOffset) && (content.charAt(index + 1) == '\r')) index++; } // End if switch (state) { // It is the initial and also a final state // Stay in state 1 while it reads whitespaces case 1: { // If reads a non whitespace char then move to state 2 and record // the beggining of a paragraph if (!Character.isWhitespace(ch)) { state = 2; startOffsetPara = index; } // End if } break; // It can be also a final state. case 2: { // Stay in state 2 while reading chars != BR entities if (readBR) { // If you find a BR char go to state 3. The possible end of the para // can be index. This will be confirmed by state 3. So, this is why // the end of a para is recorded here. readBR = false; endOffsetPara = index; state = 3; } // End if } break; // It can be also a final state // From state 3 there are only 2 possible ways: (state 2 or state1) // In state 1 it needs to read a BR // For state 2 it nead to read something different then a BR case 3: { if (readBR) { // A BR was read. Go to state 1 readBR = false; state = 1; // Create an annotation type paragraph try { annotSet.add( new Long(startOffsetPara), new Long(endOffsetPara), "paragraph", Factory.newFeatureMap()); } catch (gate.util.InvalidOffsetException ioe) { throw new DocumentFormatException( "Coudn't create a paragraph" + " annotation", ioe); } // End try } else { // Go to state 2 an keep reading chars state = 2; } // End if } break; } // End switch // Prepare to read the next char. index++; } // End while endOffsetPara = index; // Investigate where the finite automata has stoped if (state == 2 || state == 3) { // Create an annotation type paragraph try { annotSet.add( new Long(startOffsetPara), // Create the final annotation using the endOffset new Long(endOffsetPara), "paragraph", Factory.newFeatureMap()); } catch (gate.util.InvalidOffsetException ioe) { throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe); } // End try } // End if } // End annotateParagraphs();