/** * This method is called when the HTML parser encounts the beginning of a tag that means that the * tag is paired by an end tag and it's not an empty one. */ @Override public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { // Fire the status listener if the elements processed exceded the rate if (0 == (++elements % ELEMENTS_RATE)) fireStatusChangedEvent("Processed elements : " + elements); // Start of STYLE tag if (HTML.Tag.STYLE.equals(t)) { isInsideStyleTag = true; } // if // Construct a feature map from the attributes list FeatureMap fm = Factory.newFeatureMap(); // Take all the attributes an put them into the feature map if (0 != a.getAttributeCount()) { Enumeration<?> enumeration = a.getAttributeNames(); while (enumeration.hasMoreElements()) { Object attribute = enumeration.nextElement(); fm.put(attribute.toString(), (a.getAttribute(attribute)).toString()); } // while } // if // Just analize the tag t and add some\n chars and spaces to the // tmpDocContent.The reason behind is that we need to have a readable form // for the final document. customizeAppearanceOfDocumentWithStartTag(t); // If until here the "tmpDocContent" ends with a NON whitespace char, // then we add a space char before calculating the START index of this // tag. // This is done in order not to concatenate the content of two separate tags // and obtain a different NEW word. int tmpDocContentSize = tmpDocContent.length(); if (tmpDocContentSize != 0 && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) tmpDocContent.append(" "); // create the start index of the annotation Long startIndex = new Long(tmpDocContent.length()); // initialy the start index is equal with the End index CustomObject obj = new CustomObject(t.toString(), fm, startIndex, startIndex); // put it into the stack stack.push(obj); } // handleStartTag
/** * This method is called when the HTML parser encounts the end of a tag that means that the tag is * paired by a beginning tag */ @Override public void handleEndTag(HTML.Tag t, int pos) { // obj is for internal use CustomObject obj = null; // end of STYLE tag if (HTML.Tag.STYLE.equals(t)) { isInsideStyleTag = false; } // if // If the stack is not empty then we get the object from the stack if (!stack.isEmpty()) { obj = stack.pop(); // Before adding it to the colector, we need to check if is an // emptyAndSpan one. See CustomObject's isEmptyAndSpan field. if (obj.getStart().equals(obj.getEnd())) { // The element had an end tag and its start was equal to its end. Hence // it is anEmptyAndSpan one. obj.getFM().put("isEmptyAndSpan", "true"); } // End iff // we add it to the colector colector.add(obj); } // End if // If element has text between, then customize its apearance if (obj != null && obj.getStart().longValue() != obj.getEnd().longValue()) // Customize the appearance of the document customizeAppearanceOfDocumentWithEndTag(t); // if t is the </HTML> tag then we reached the end of theHTMLdocument if (t == HTML.Tag.HTML) { // replace the old content with the new one doc.setContent(new DocumentContentImpl(tmpDocContent.toString())); // If basicAs is null then get the default annotation // set from this gate document if (basicAS == null) basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); // sort colector ascending on its id Collections.sort(colector); // iterate through colector and construct annotations while (!colector.isEmpty()) { obj = colector.getFirst(); colector.remove(obj); // Construct an annotation from this obj try { if (markupElementsMap == null) { basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM()); } else { String annotationType = markupElementsMap.get(obj.getElemName()); if (annotationType != null) basicAS.add(obj.getStart(), obj.getEnd(), annotationType, obj.getFM()); } } catch (InvalidOffsetException e) { Err.prln("Error creating an annot :" + obj + " Discarded..."); } // end try // }// end if } // while // notify the listener about the total amount of elements that // has been processed fireStatusChangedEvent("Total elements : " + elements); } // else } // handleEndTag