예제 #1
0
  @Override
  public Document get(int index) {
    if (index >= docDataList.size()) return null;

    Document res = documents.get(index);

    if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    // if the document is null, then I must get it from the DS
    if (res == null) {
      FeatureMap parameters = Factory.newFeatureMap();
      parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
      try {
        parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
        Document lr =
            (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
        if (DEBUG) Out.prln("Loaded document :" + lr.getName());
        // change the result to the newly loaded doc
        res = lr;

        // finally replace the doc with the instantiated version
        documents.set(index, lr);
      } catch (ResourceInstantiationException ex) {
        Err.prln("Error reading document inside a serialised corpus.");
        throw new GateRuntimeException(ex);
      }
    }

    return res;
  }
예제 #2
0
  @Override
  public boolean add(Document o) {
    if (o == null) return false;
    Document doc = o;

    // make it accept only docs from its own datastore
    if (doc.getDataStore() != null && !this.dataStore.equals(doc.getDataStore())) {
      Err.prln("Error: Persistent corpus can only accept documents " + "from its own datastore!");
      return false;
    } // if

    // add the document with its index in the docDataList
    // in this case, since it's going to be added to the end
    // the index will be the size of the docDataList before
    // the addition
    DocumentData docData =
        new DocumentData(doc.getName(), doc.getLRPersistenceId(), doc.getClass().getName());
    boolean result = docDataList.add(docData);
    documents.add(doc);
    documentAdded(doc);
    fireDocumentAdded(
        new CorpusEvent(
            SerialCorpusImpl.this,
            doc,
            docDataList.size() - 1,
            doc.getLRPersistenceId(),
            CorpusEvent.DOCUMENT_ADDED));

    return result;
  }
예제 #3
0
  /**
   * This method is called when the SAX parser encounts the end of the XML document. Here we set the
   * content of the gate Document to be the one generated inside this class (tmpDocContent). After
   * that we use the colector to generate all the annotation reffering this new gate document.
   */
  @Override
  public void endDocument() throws org.xml.sax.SAXException {

    // replace the document content with the one without markups
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

    // fire the status listener
    fireStatusChangedEvent("Total elements: " + elements);

    // If basicAs is null then get the default AnnotationSet,
    // based on the gate document.
    if (basicAS == null) {
      basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    }

    // sort colector ascending on its id
    Collections.sort(colector);
    Set<Integer> testIdsSet = new HashSet<Integer>();
    // create all the annotations (on this new document) from the collector
    while (!colector.isEmpty()) {
      CustomObject obj = colector.getFirst();
      // Test to see if there are two annotation objects with the same id.
      if (testIdsSet.contains(obj.getId())) {
        throw new GateSaxException(
            "Found two annotations with the same Id("
                + obj.getId()
                + ").The document is inconsistent.");
      } else {
        testIdsSet.add(obj.getId());
      } // End iff
      // create a new annotation and add it to the annotation set
      try {
        // the annotation type will be conforming with markupElementsMap
        // add the annotation to the Annotation Set
        if (markupElementsMap == null) {
          basicAS.add(obj.getId(), obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM());
        } else {
          // get the type of the annotation from Map
          String annotationType = markupElementsMap.get(obj.getElemName());
          if (annotationType != null) {
            basicAS.add(obj.getId(), obj.getStart(), obj.getEnd(), annotationType, obj.getFM());
          }
        } // End if
      } catch (gate.util.InvalidOffsetException e) {
        Err.prln(
            "InvalidOffsetException for annot :"
                + obj.getElemName()
                + " with Id ="
                + obj.getId()
                + ". Discarded...");
      } // End try
      colector.remove(obj);
    } // End while
  } // endDocument();
예제 #4
0
  /**
   * This method is called when the HTML parser encounts the end of a tag that means that the tag is
   * paired by a beginning tag
   */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    // obj is for internal use
    CustomObject obj = null;

    // end of STYLE tag
    if (HTML.Tag.STYLE.equals(t)) {
      isInsideStyleTag = false;
    } // if

    // If the stack is not empty then we get the object from the stack
    if (!stack.isEmpty()) {
      obj = stack.pop();
      // Before adding it to the colector, we need to check if is an
      // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
      if (obj.getStart().equals(obj.getEnd())) {
        // The element had an end tag and its start was equal to its end. Hence
        // it is anEmptyAndSpan one.
        obj.getFM().put("isEmptyAndSpan", "true");
      } // End iff
      // we add it to the colector
      colector.add(obj);
    } // End if

    // If element has text between, then customize its apearance
    if (obj != null && obj.getStart().longValue() != obj.getEnd().longValue())
      // Customize the appearance of the document
      customizeAppearanceOfDocumentWithEndTag(t);

    // if t is the </HTML> tag then we reached the end of theHTMLdocument
    if (t == HTML.Tag.HTML) {
      // replace the old content with the new one
      doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

      // If basicAs is null then get the default annotation
      // set from this gate document
      if (basicAS == null)
        basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);

      // sort colector ascending on its id
      Collections.sort(colector);
      // iterate through colector and construct annotations
      while (!colector.isEmpty()) {
        obj = colector.getFirst();
        colector.remove(obj);
        // Construct an annotation from this obj
        try {
          if (markupElementsMap == null) {
            basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM());
          } else {
            String annotationType = markupElementsMap.get(obj.getElemName());
            if (annotationType != null)
              basicAS.add(obj.getStart(), obj.getEnd(), annotationType, obj.getFM());
          }
        } catch (InvalidOffsetException e) {
          Err.prln("Error creating an annot :" + obj + " Discarded...");
        } // end try
        //        }// end if
      } // while

      // notify the listener about the total amount of elements that
      // has been processed
      fireStatusChangedEvent("Total elements : " + elements);
    } // else
  } // handleEndTag