/** This method is called when the SAX parser encounts the beginning of the XML document. */ @Override public void startDocument() throws org.xml.sax.SAXException { // init of variables in the parent super.startDocument(); /** * We will attempt to add namespace feature info to each namespaced element only if three * parameters are set in the global or local config file: ADD_NAMESPACE_FEATURES: boolean flag * ELEMENT_NAMESPACE_URI: feature name to use to hold namespace uri ELEMENT_NAMESPACE_PREFIX: * feature name to use to hold namespace prefix */ OptionsMap configData = Gate.getUserConfig(); boolean addNSFeature = Boolean.parseBoolean((String) configData.get(GateConstants.ADD_NAMESPACE_FEATURES)); namespaceURIFeature = (String) configData.get(GateConstants.ELEMENT_NAMESPACE_URI); namespacePrefixFeature = (String) configData.get(GateConstants.ELEMENT_NAMESPACE_PREFIX); deserializeNamespaceInfo = (addNSFeature && namespacePrefixFeature != null && !namespacePrefixFeature.isEmpty() && namespaceURIFeature != null && !namespaceURIFeature.isEmpty()); }
/** This method is called when all characters between specific tags have been read completely */ public void charactersAction(char[] text, int start, int length) throws SAXException { // correction of real offset. Didn't affect on other data. super.characters(text, start, length); // create a string object based on the reported text String content = new String(text, start, length); StringBuffer contentBuffer = new StringBuffer(""); int tmpDocContentSize = tmpDocContent.length(); boolean incrementStartIndex = false; boolean addExtraSpace = true; if (Gate.getUserConfig().get(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) != null) { addExtraSpace = Gate.getUserConfig() .getBoolean(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) .booleanValue(); } // If the first char of the text just read "text[0]" is NOT whitespace AND // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then // concatenation "tmpDocContent + content" will result into a new different // word... and we want to avoid that, because the tokenizer, gazetter and // Jape work on the raw text and concatenating tokens might be not good. if (tmpDocContentSize != 0 && content.length() != 0 && !Character.isWhitespace(content.charAt(0)) && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) { // If we are here it means that a concatenation between the last // token in the tmpDocContent and the content(which doesn't start // with a white space) will be performed. In order to prevent this, // we will add a " " space char in order to assure that the 2 tokens // stay apart. Howerver we will except from this rule the most known // internal entities like &, <, >, etc if (( // Testing the length against 1 makes it more likely that // an internal entity was called. characters() gets called for // each entity separately. (content.length() == 1) && (content.charAt(0) == '&' || content.charAt(0) == '<' || content.charAt(0) == '>' || content.charAt(0) == '"' || content.charAt(0) == '\'')) || (tmpDocContent.charAt(tmpDocContentSize - 1) == '&' || tmpDocContent.charAt(tmpDocContentSize - 1) == '<' || tmpDocContent.charAt(tmpDocContentSize - 1) == '>' || tmpDocContent.charAt(tmpDocContentSize - 1) == '"' || tmpDocContent.charAt(tmpDocContentSize - 1) == '\'')) { // do nothing. The content will be appended } else if (!addExtraSpace) { } else { // In all other cases append " " contentBuffer.append(" "); incrementStartIndex = true; } // End if } // End if // put the repositioning information if (reposInfo != null) { if (!(start == 0 && length == 1 && text.length <= 2)) { // normal piece of text reposInfo.addPositionInfo( getRealOffset(), content.length(), tmpDocContent.length() + contentBuffer.length(), content.length()); if (DEBUG) { Out.println("Info: " + getRealOffset() + ", " + content.length()); Out.println("Start: " + start + " len" + length); } // DEBUG } else { // unicode char or &xxx; coding // Reported from the parser offset is 0 // The real offset should be found in the ampCodingInfo structure. long lastPosition = 0; RepositioningInfo.PositionInfo pi; if (reposInfo.size() > 0) { pi = reposInfo.get(reposInfo.size() - 1); lastPosition = pi.getOriginalPosition(); } // if for (int i = 0; i < ampCodingInfo.size(); ++i) { pi = ampCodingInfo.get(i); if (pi.getOriginalPosition() > lastPosition) { // found reposInfo.addPositionInfo( pi.getOriginalPosition(), pi.getOriginalLength(), tmpDocContent.length() + contentBuffer.length(), content.length()); break; } // if } // for } // if } // if // update the document content contentBuffer.append(content); // calculate the End index for all the elements of the stack // the expression is : End index = Current doc length + text length Long end = new Long(tmpDocContent.length() + contentBuffer.length()); CustomObject obj = null; // Iterate through stack to modify the End index of the existing elements Iterator<CustomObject> anIterator = stack.iterator(); while (anIterator.hasNext()) { // get the object and move to the next one obj = anIterator.next(); if (incrementStartIndex && obj.getStart().equals(obj.getEnd())) { obj.setStart(new Long(obj.getStart().longValue() + 1)); } // End if // sets its End index obj.setEnd(end); } // End while tmpDocContent.append(contentBuffer.toString()); } // characters();