예제 #1
0
  /**
   * Called when the HTML parser encounters the start tag of a paired element, i.e. a tag that is
   * closed by a matching end tag and is not an empty one.
   *
   * <p>The method counts the element, records whether a STYLE element has been entered, copies the
   * tag's attributes into a new feature map, and pushes a {@code CustomObject} describing the
   * element onto {@code stack}. The object's start and end offsets are both set to the current
   * length of {@code tmpDocContent}.
   *
   * @param t the tag that has just been opened
   * @param a the attributes attached to the tag; each name/value pair is stored in the feature map
   *     using its {@code toString()} form
   * @param pos the position reported by the parser; not used by this method
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    // Fire the status listener every ELEMENTS_RATE processed elements
    if (0 == (++elements % ELEMENTS_RATE)) {
      fireStatusChangedEvent("Processed elements : " + elements);
    }

    // Start of STYLE tag
    if (HTML.Tag.STYLE.equals(t)) {
      isInsideStyleTag = true;
    }

    // Construct a feature map from the attributes list
    FeatureMap fm = Factory.newFeatureMap();

    // Take all the attributes and put them into the feature map
    if (0 != a.getAttributeCount()) {
      Enumeration<?> attributeNames = a.getAttributeNames();
      while (attributeNames.hasMoreElements()) {
        Object attribute = attributeNames.nextElement();
        fm.put(attribute.toString(), a.getAttribute(attribute).toString());
      }
    }

    // Let the tag contribute its own layout characters to tmpDocContent (per the original
    // comment: newlines and spaces, so that the final document has a readable form).
    customizeAppearanceOfDocumentWithStartTag(t);

    // If tmpDocContent currently ends with a NON whitespace char, append a space before
    // computing the START index of this tag. This prevents the content of two separate tags
    // from being concatenated into a different, NEW word.
    int tmpDocContentSize = tmpDocContent.length();
    if (tmpDocContentSize != 0
        && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) {
      tmpDocContent.append(" ");
    }

    // Create the start index of the annotation. Long.valueOf replaces the deprecated
    // Long(long) constructor.
    Long startIndex = Long.valueOf(tmpDocContent.length());

    // Initially the start index is equal to the end index
    CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex);

    // Put it onto the stack
    stack.push(obj);
  } // handleStartTag
예제 #2
0
  /**
   * Called when the HTML parser encounters an end tag, i.e. the closing half of a paired tag.
   *
   * <p>If {@code stack} is not empty, the most recently pushed {@code CustomObject} is popped,
   * marked with the feature {@code "isEmptyAndSpan"} when its start equals its end, and added to
   * {@code colector}. When the tag is {@code </HTML>}, the document content is replaced with the
   * accumulated {@code tmpDocContent} and every collected object is turned into an annotation in
   * {@code basicAS}.
   *
   * @param t the tag that has just been closed
   * @param pos the position reported by the parser; not used by this method
   */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    // obj is for internal use
    CustomObject obj = null;

    // End of STYLE tag
    if (HTML.Tag.STYLE.equals(t)) {
      isInsideStyleTag = false;
    }

    // If the stack is not empty then we get the object from the stack
    if (!stack.isEmpty()) {
      obj = stack.pop();
      // Before adding it to the colector, check whether it is an emptyAndSpan one.
      // See CustomObject's isEmptyAndSpan field.
      if (obj.getStart().equals(obj.getEnd())) {
        // The element had an end tag and its start was equal to its end, hence it is an
        // emptyAndSpan one.
        obj.getFM().put("isEmptyAndSpan", "true");
      }
      // Add it to the colector
      colector.add(obj);
    }

    // If the element has text between its tags, then customize its appearance
    if (obj != null && obj.getStart().longValue() != obj.getEnd().longValue()) {
      customizeAppearanceOfDocumentWithEndTag(t);
    }

    // If t is the </HTML> tag then we have reached the end of the HTML document
    if (HTML.Tag.HTML.equals(t)) {
      // Replace the old content with the new one
      doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

      // If basicAS is null then get the original-markups annotation set from this document
      if (basicAS == null) {
        basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
      }

      // Sort colector by CustomObject's natural ordering (ascending id, per the original comment)
      Collections.sort(colector);

      // Drain the colector from the front, constructing one annotation per object.
      // removeFirst() takes the head directly instead of looking it up again by equality.
      while (!colector.isEmpty()) {
        obj = colector.removeFirst();
        try {
          if (markupElementsMap == null) {
            // No mapping configured: the element name is used as the annotation type
            basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM());
          } else {
            // Only elements that have an entry in the map produce an annotation
            String annotationType = markupElementsMap.get(obj.getElemName());
            if (annotationType != null) {
              basicAS.add(obj.getStart(), obj.getEnd(), annotationType, obj.getFM());
            }
          }
        } catch (InvalidOffsetException e) {
          // The offending object is reported and skipped; processing continues with the rest
          Err.prln("Error creating an annot :" + obj + " Discarded...");
        }
      } // while

      // Notify the listener about the total number of elements that have been processed
      fireStatusChangedEvent("Total elements : " + elements);
    } // if </HTML>
  } // handleEndTag