Java XmlPositionCorrectionHandler Examples

Programming Language: Java

Class/Type: XmlPositionCorrectionHandler

Examples at hotexamples.com: 2

Java XmlPositionCorrectionHandler - 2 examples found. These are the top-rated real-world Java examples of XmlPositionCorrectionHandler extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

characters(1)

startDocument(1)

Example #1

Show file

File: XmlDocumentHandler.java Project: nikolavp/gate

  /**
   * Called when the SAX parser encounters the beginning of the XML document.
   *
   * <p>Delegates to the parent handler first, then reads the namespace-related settings from the
   * GATE user configuration. Namespace information is deserialized only when all three settings
   * are usable:
   *
   * <ul>
   *   <li>{@code ADD_NAMESPACE_FEATURES}: boolean flag, must parse as {@code true};
   *   <li>{@code ELEMENT_NAMESPACE_URI}: feature name used to hold the namespace URI, must be
   *       non-null and non-empty;
   *   <li>{@code ELEMENT_NAMESPACE_PREFIX}: feature name used to hold the namespace prefix, must
   *       be non-null and non-empty.
   * </ul>
   *
   * @throws org.xml.sax.SAXException if the parent handler's {@code startDocument()} throws it
   */
  @Override
  public void startDocument() throws org.xml.sax.SAXException {
    // Let the parent initialise its own state before anything else.
    super.startDocument();

    // Pull the three namespace-related settings out of the user configuration.
    final OptionsMap userConfig = Gate.getUserConfig();
    final String addFeaturesSetting =
        (String) userConfig.get(GateConstants.ADD_NAMESPACE_FEATURES);
    namespaceURIFeature = (String) userConfig.get(GateConstants.ELEMENT_NAMESPACE_URI);
    namespacePrefixFeature = (String) userConfig.get(GateConstants.ELEMENT_NAMESPACE_PREFIX);

    // The flag alone is not enough: both feature names must also be present and non-empty.
    boolean namespaceInfoUsable = false;
    if (Boolean.parseBoolean(addFeaturesSetting)) {
      final boolean prefixNameSet =
          namespacePrefixFeature != null && !namespacePrefixFeature.isEmpty();
      final boolean uriNameSet = namespaceURIFeature != null && !namespaceURIFeature.isEmpty();
      namespaceInfoUsable = prefixNameSet && uriNameSet;
    }
    deserializeNamespaceInfo = namespaceInfoUsable;
  }

Example #2

Show file

File: XmlDocumentHandler.java Project: nikolavp/gate

  /**
   * Called when all characters between specific tags have been read completely.
   *
   * <p>The reported text is appended to {@code tmpDocContent}. When both the existing content and
   * the new text are non-empty and neither side of the join is whitespace, a single space is
   * inserted in between so that two tokens are not glued into one word (the tokenizer, gazetteer
   * and JAPE work on the raw text). No space is inserted when the join involves one of the
   * characters {@code & < > " '} (treated as a likely internal entity) or when the
   * {@code DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME} user option is set to {@code false}.
   *
   * <p>If {@code reposInfo} is non-null, a repositioning record is added for the text, and the end
   * index of every element on {@code stack} is moved to the new end of the content.
   *
   * @param text the character buffer reported by the parser
   * @param start index of the first character of the reported text within {@code text}
   * @param length number of characters of {@code text} that belong to the reported text
   * @throws SAXException declared for the delegated {@code super.characters} call
   */
  public void charactersAction(char[] text, int start, int length) throws SAXException {
    // Correction of the real offset; per the original note this does not affect other data.
    super.characters(text, start, length);

    // The text reported by the parser.
    String content = new String(text, start, length);
    // Collects an optional separating space followed by the content. It is method-local, so the
    // unsynchronized StringBuilder is sufficient.
    StringBuilder contentBuffer = new StringBuilder();
    int tmpDocContentSize = tmpDocContent.length();
    boolean incrementStartIndex = false;

    // Separator insertion is on by default; the user config may switch it off.
    boolean addExtraSpace = true;
    if (Gate.getUserConfig().get(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) != null) {
      addExtraSpace =
          Gate.getUserConfig()
              .getBoolean(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)
              .booleanValue();
    }

    // If the first char of the text just read is NOT whitespace AND the last char of
    // tmpDocContent is NOT whitespace, then "tmpDocContent + content" would produce a new,
    // different word. We want to avoid that by keeping the two tokens apart.
    if (tmpDocContentSize != 0
        && content.length() != 0
        && !Character.isWhitespace(content.charAt(0))
        && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) {

      // Exception to the rule: the best-known internal entities (&, <, >, ", ').
      // Testing the length against 1 makes it more likely that an internal entity was called,
      // because characters() gets called for each entity separately.
      boolean contentIsEntity =
          content.length() == 1 && isXmlPredefinedEntityChar(content.charAt(0));
      boolean previousIsEntity =
          isXmlPredefinedEntityChar(tmpDocContent.charAt(tmpDocContentSize - 1));

      if (!contentIsEntity && !previousIsEntity && addExtraSpace) {
        // In all other cases keep the two tokens apart with a single space.
        contentBuffer.append(" ");
        incrementStartIndex = true;
      }
    }

    // Put the repositioning information.
    if (reposInfo != null) {
      if (!(start == 0 && length == 1 && text.length <= 2)) {
        // Normal piece of text.
        // NOTE(review): arguments look like (original position, original length,
        // current position, current length) - confirm against RepositioningInfo.
        reposInfo.addPositionInfo(
            getRealOffset(),
            content.length(),
            tmpDocContent.length() + contentBuffer.length(),
            content.length());
        if (DEBUG) {
          Out.println("Info: " + getRealOffset() + ", " + content.length());
          Out.println("Start: " + start + " len" + length);
        }
      } else {
        // Unicode char or &xxx; coding: the offset reported by the parser is 0, so the real
        // offset has to be looked up in the ampCodingInfo structure.
        long lastPosition = 0;
        RepositioningInfo.PositionInfo pi;

        if (reposInfo.size() > 0) {
          pi = reposInfo.get(reposInfo.size() - 1);
          lastPosition = pi.getOriginalPosition();
        }

        // Use the first recorded coding that lies after the last repositioned piece.
        for (int i = 0; i < ampCodingInfo.size(); ++i) {
          pi = ampCodingInfo.get(i);
          if (pi.getOriginalPosition() > lastPosition) {
            reposInfo.addPositionInfo(
                pi.getOriginalPosition(),
                pi.getOriginalLength(),
                tmpDocContent.length() + contentBuffer.length(),
                content.length());
            break;
          }
        }
      }
    }

    // Update the pending document content.
    contentBuffer.append(content);

    // End index for all the elements of the stack:
    // End index = current doc length + (optional space + text) length.
    // Long.valueOf replaces the deprecated Long constructor.
    Long end = Long.valueOf(tmpDocContent.length() + contentBuffer.length());

    // Iterate through the stack to modify the End index of the existing elements.
    Iterator<CustomObject> anIterator = stack.iterator();
    while (anIterator.hasNext()) {
      CustomObject obj = anIterator.next();
      // An element that is still empty (start == end) must begin after the inserted space.
      if (incrementStartIndex && obj.getStart().equals(obj.getEnd())) {
        obj.setStart(Long.valueOf(obj.getStart().longValue() + 1));
      }
      obj.setEnd(end);
    }

    tmpDocContent.append(contentBuffer.toString());
  } // charactersAction()

  /** Returns true for the characters of the best-known internal entities: {@code & < > " '}. */
  private static boolean isXmlPredefinedEntityChar(char c) {
    return c == '&' || c == '<' || c == '>' || c == '"' || c == '\'';
  }