/**
 * Collects the text of every annotation of a given type found in a document.
 *
 * <p>The document is looked up through {@code getDocument(docID)}; the annotations are taken from
 * the set returned by the document's no-argument {@code getAnnotations()} and filtered by type.
 *
 * @param docID identifier handed to {@code getDocument} to obtain the document
 * @param type the annotation type to select
 * @return one entry per matching annotation, as produced by {@code stringFor}, in the iteration
 *     order of the selected annotations
 * @throws ResourceInstantiationException presumably raised by the document lookup (the helper is
 *     not visible here)
 */
public ArrayList<String> getStringFromAnnotation(Object docID, String type)
     throws ResourceInstantiationException {
   final Document document = getDocument(docID);
   final ArrayList<String> texts = new ArrayList<String>();
   for (Annotation annotation : document.getAnnotations().get(type)) {
     texts.add(stringFor(document, annotation));
   }
   return texts;
  }
Exemplo n.º 2
0
  /**
   * Checks two documents for equality.
   *
   * <p>The following properties are compared, in this order, using {@code check}: content, default
   * annotation set, textual-ness and (for textual documents) encoding, features, named annotation
   * sets, source URL, and source URL start/end offsets. The comparison stops at the first
   * mismatch, stores a description of it in {@code message} and returns {@code false}. {@code
   * message} is reset to the empty string at the start of every call.
   *
   * <p>Two {@code null} documents are considered equal; a {@code null} and a non-{@code null}
   * document are not.
   *
   * <p>The textual/non-textual comparison is symmetric: the documents differ whenever exactly one
   * of them is a {@link TextualDocument}, regardless of argument order.
   *
   * @param doc1 a document, may be {@code null}
   * @param doc2 another document, may be {@code null}
   * @return {@code true} if every compared property matches, {@code false} otherwise
   */
  public static boolean documentsEqual(Document doc1, Document doc2) {
    message = "";
    if (doc1 == null ^ doc2 == null) {
      message = "Documents not equal: null<>non-null!";
      return false;
    }
    // After the XOR test above, doc1 == null implies doc2 == null as well.
    if (doc1 == null) {
      return true;
    }
    if (!check(doc1.getContent(), doc2.getContent())) {
      message = "Document contents different!";
      return false;
    }

    if (!check(doc1.getAnnotations(), doc2.getAnnotations())) {
      message = "Documents default AS not equal!";
      return false;
    }

    // Evaluate both sides so that (textual, non-textual) and (non-textual, textual) are
    // both reported as different.
    boolean doc1Textual = doc1 instanceof TextualDocument;
    boolean doc2Textual = doc2 instanceof TextualDocument;
    if (doc1Textual != doc2Textual) {
      message = "Documents not equal: textual<>non-textual!";
      return false;
    }
    if (doc1Textual
        && !check(
            ((TextualDocument) doc1).getEncoding(), ((TextualDocument) doc2).getEncoding())) {
      message = "Textual documents with different encodings!";
      return false;
    }

    if (!check(doc1.getFeatures(), doc2.getFeatures())) {
      message = "Documents features not equal!";
      return false;
    }

    // NOTE: the markupAware flag is not compared (it is not accessible from here), and neither
    // are the DocumentImpl next-node-id / next-annotation-id counters.

    if (!check(doc1.getNamedAnnotationSets(), doc2.getNamedAnnotationSets())) {
      message = "Documents named annots not equal!";
      return false;
    }

    if (!check(doc1.getSourceUrl(), doc2.getSourceUrl())) {
      message = "Documents sourceURLs not equal!";
      return false;
    }
    if (!(check(doc1.getSourceUrlStartOffset(), doc2.getSourceUrlStartOffset())
        && check(doc1.getSourceUrlEndOffset(), doc2.getSourceUrlEndOffset()))) {
      message = "Documents sourceURLOffsets not equal!";
      return false;
    }
    return true;
  }
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p>Initialises GATE and ANNIE, creates a corpus holding one document per URL, runs the
   * pipeline over it, and then writes up to two files per document (relative file names, numbered
   * from 1 in corpus order):
   *
   * <ul>
   *   <li>{@code StANNIE_<n>.HTML}: the preserved original content with every "Person" and
   *       "Location" annotation wrapped in a {@code <span>} tag. Only written when the original
   *       content feature is present on the document.
   *   <li>{@code StANNIE_toXML_<n>.HTML}: the result of {@code doc.toXml(...)} for the same
   *       annotations. Always written.
   * </ul>
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   *
   * @param args the URLs of the documents to process
   * @throws GateException declared for the GATE initialisation, resource creation and pipeline
   *     execution calls
   * @throws IOException if a URL is malformed or an output file cannot be written
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", Boolean.TRUE);
      params.put("collectRepositioningInfo", Boolean.TRUE);
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the person and location names added
    Iterator<?> iter = corpus.iterator();
    int count = 0;

    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set<String> annotTypesRequired = new HashSet<String>();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null) {
        if (info != null) {
          Out.prln("OrigContent and reposInfo existing. Generate file...");
        } else {
          Out.prln("OrigContent existing. Generate file...");
        }
        // info may be null here; the helper then uses the annotation offsets unchanged
        writeHighlightedContent(file, originalContent, info, peopleAndPlaces);
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = "StANNIE_toXML_" + count + ".HTML";
      // NOTE(review): FileWriter uses the platform default charset
      FileWriter writer = new FileWriter(fileName);
      try {
        writer.write(xmlDocument);
      } finally {
        // close even when the write fails so the file handle is not leaked
        writer.close();
      }
    } // for each doc
  } // main

  /**
   * Writes {@code originalContent} to {@code file} with each of the given annotations wrapped in a
   * {@code <span GateID="<id>" title="<type>" style="background:Red;">...</span>} element.
   *
   * @param file the file to write
   * @param originalContent the text the tags are inserted into
   * @param info maps annotation offsets back to positions in {@code originalContent}; when {@code
   *     null} the annotation offsets are used as they are
   * @param annotations the annotations to highlight
   * @throws IOException if the file cannot be written
   */
  private static void writeHighlightedContent(
      File file, String originalContent, RepositioningInfo info, Set<Annotation> annotations)
      throws IOException {
    final String startTagPart1 = "<span GateID=\"";
    final String startTagPart2 = "\" title=\"";
    final String startTagPart3 = "\" style=\"background:Red;\">";
    final String endTag = "</span>";

    SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
    for (Annotation annot : annotations) {
      sortedAnnotations.addSortedExclusive(annot);
    }

    StringBuilder editableContent = new StringBuilder(originalContent);
    Out.prln("Unsorted annotations count: " + annotations.size());
    Out.prln("Sorted annotations count: " + sortedAnnotations.size());

    // Insert annotation tags backward (last list entry first). Presumably the list is ordered by
    // offset, so that text inserted later in the document does not shift positions still to be
    // processed - confirm against SortedAnnotationList.
    for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
      Annotation currAnnot = (Annotation) sortedAnnotations.get(i);
      long insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
      long insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
      if (info != null) {
        insertPositionStart = info.getOriginalPos(insertPositionStart);
        insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
      }
      // -1 is treated as "no position available"; such annotations are skipped
      if (insertPositionEnd != -1 && insertPositionStart != -1) {
        // the end tag goes in first so that the start position is still valid afterwards
        editableContent.insert((int) insertPositionEnd, endTag);
        editableContent.insert(
            (int) insertPositionStart,
            startTagPart1
                + currAnnot.getId().toString()
                + startTagPart2
                + currAnnot.getType()
                + startTagPart3);
      }
    }

    // NOTE(review): FileWriter uses the platform default charset
    FileWriter writer = new FileWriter(file);
    try {
      writer.write(editableContent.toString());
    } finally {
      // close even when the write fails so the file handle is not leaked
      writer.close();
    }
  }
 /**
  * Annotates paragraphs in a GATE document. The investigated text spans between the start and end
  * offsets, and the paragraph annotations are created in the annotation set named {@code
  * annotSetName}. If {@code annotSetName} is null they are created in the default annotation set.
  *
  * <p>Detection is a three-state automaton driven by "BR entities", i.e. a {@code '\n'} together
  * with any {@code '\r'} characters that directly follow it:
  *
  * <ul>
  *   <li>state 1: skipping whitespace; the first non-whitespace character starts a paragraph;
  *   <li>state 2: inside a paragraph; a BR records a candidate end offset;
  *   <li>state 3: one BR seen; a second BR right after it confirms the paragraph and an annotation
  *       is created, anything else resumes the paragraph.
  * </ul>
  *
  * <p>A paragraph that is still open when {@code endOffset} is reached is closed at the position
  * where scanning stopped.
  *
  * @param aDoc the GATE document on which the paragraph detection is performed. If it is null or
  *     its content is null the method simply returns, doing nothing.
  * @param startOffset the index in the document content from which the paragraph detection will
  *     start
  * @param endOffset the offset where the detection will end. Nothing is done if it is smaller than
  *     {@code startOffset}.
  * @param annotSetName the name of the set in which paragraph annotations are created. The
  *     annotation type created is "paragraph".
  * @throws DocumentFormatException if an annotation cannot be created because of invalid offsets
  */
 public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName)
     throws DocumentFormatException {
   // Simply return if the document is null or its content
   if (aDoc == null || aDoc.getContent() == null) {
     return;
   }
   // Simply return if the start is > than the end
   if (startOffset > endOffset) {
     return;
   }
   // Decide where to put the newly detected annotations
   AnnotationSet annotSet =
       (annotSetName == null) ? aDoc.getAnnotations() : aDoc.getAnnotations(annotSetName);
   // Extract the document content
   String content = aDoc.getContent().toString();
   // Offset marking the start of the current paragraph
   int startOffsetPara = startOffset;
   // Candidate end of the current paragraph
   int endOffsetPara = endOffset;
   // The initial state of the automaton
   int state = 1;
   int index = startOffset;
   while (index < endOffset) {
     // Read the current char
     char ch = content.charAt(index);
     // Whether THIS character is a BR entity. The flag is deliberately scoped to a single
     // iteration: a newline consumed as plain whitespace in state 1 must not be remembered,
     // otherwise it would be mistaken for a line break inside the paragraph that follows.
     boolean readBR = (ch == '\n');
     if (readBR) {
       // If \n is followed by \r characters, advance the index so that they are consumed as part
       // of the same BR entity
       while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) {
         index++;
       }
     }
     switch (state) {
       // Initial and also a final state: stay here while reading whitespace
       case 1:
         if (!Character.isWhitespace(ch)) {
           // First non-whitespace char: a paragraph begins here
           state = 2;
           startOffsetPara = index;
         }
         break;
       // Inside a paragraph; can also be a final state
       case 2:
         if (readBR) {
           // Possible end of the paragraph; state 3 confirms or rejects it, which is why the end
           // offset is recorded already here
           endOffsetPara = index;
           state = 3;
         }
         break;
       // One BR seen; can also be a final state
       case 3:
         if (readBR) {
           // A second BR: the paragraph is confirmed
           state = 1;
           addParagraphAnnotation(annotSet, startOffsetPara, endOffsetPara);
         } else {
           // Something other than a BR: the paragraph continues
           state = 2;
         }
         break;
     } // End switch
     // Prepare to read the next char.
     index++;
   } // End while
   // If the automaton stopped inside a paragraph, close it where scanning stopped
   if (state == 2 || state == 3) {
     addParagraphAnnotation(annotSet, startOffsetPara, index);
   }
 } // End annotateParagraphs();

 /**
  * Adds a "paragraph" annotation with an empty feature map to the given annotation set.
  *
  * @param annotSet the set receiving the annotation
  * @param start start offset of the paragraph
  * @param end end offset of the paragraph
  * @throws DocumentFormatException wrapping the {@code InvalidOffsetException} raised when the
  *     set rejects the offsets
  */
 private void addParagraphAnnotation(AnnotationSet annotSet, int start, int end)
     throws DocumentFormatException {
   try {
     annotSet.add(Long.valueOf(start), Long.valueOf(end), "paragraph", Factory.newFeatureMap());
   } catch (gate.util.InvalidOffsetException ioe) {
     throw new DocumentFormatException("Coudn't create a paragraph annotation", ioe);
   }
 }