示例#1
0
  /** Invoked by the SAX parser when it encounters the beginning of the XML document. */
  @Override
  public void startDocument() throws org.xml.sax.SAXException {
    // let the parent handler initialise its own state first
    super.startDocument();

    // Namespace feature info is only meant to be attached to namespaced elements when all
    // three of the following options are set in the user configuration:
    //   ADD_NAMESPACE_FEATURES   - boolean switch
    //   ELEMENT_NAMESPACE_URI    - name of the feature that holds the namespace URI
    //   ELEMENT_NAMESPACE_PREFIX - name of the feature that holds the namespace prefix
    final OptionsMap userConfig = Gate.getUserConfig();

    final String enabledFlag = (String) userConfig.get(GateConstants.ADD_NAMESPACE_FEATURES);
    namespaceURIFeature = (String) userConfig.get(GateConstants.ELEMENT_NAMESPACE_URI);
    namespacePrefixFeature = (String) userConfig.get(GateConstants.ELEMENT_NAMESPACE_PREFIX);

    // a feature name counts as usable only when it is present and non-empty
    final boolean prefixUsable =
        namespacePrefixFeature != null && !namespacePrefixFeature.isEmpty();
    final boolean uriUsable = namespaceURIFeature != null && !namespaceURIFeature.isEmpty();

    deserializeNamespaceInfo = Boolean.parseBoolean(enabledFlag) && prefixUsable && uriUsable;
  }
示例#2
0
  /**
   * Handles one chunk of character data: appends it to the temporary document content, records
   * repositioning information for it and moves the end offset of every element currently on the
   * stack to the new end of the content.
   *
   * <p>If neither the last character accumulated so far nor the first character of the new chunk
   * is whitespace, appending them directly would fuse two tokens into one. In that case a single
   * space is inserted between them, unless the user configuration switches this off or one side
   * of the join is one of the characters {@code & < > " '} (most likely an internal entity).
   *
   * @param text the character buffer holding the chunk
   * @param start index in {@code text} of the first character of the chunk
   * @param length number of characters in the chunk
   * @throws SAXException propagated from the parent handler's {@code characters} call
   */
  public void charactersAction(char[] text, int start, int length) throws SAXException {
    // correction of the real offset; does not affect any other data
    super.characters(text, start, length);

    // the reported text as a string
    String content = new String(text, start, length);
    // local scratch buffer: optional separating space followed by the new content
    StringBuilder contentBuffer = new StringBuilder();
    int tmpDocContentSize = tmpDocContent.length();
    boolean incrementStartIndex = false;

    // inserting the separating space is on by default; the user config may override it
    boolean addExtraSpace = true;
    if (Gate.getUserConfig().get(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) != null) {
      addExtraSpace =
          Gate.getUserConfig()
              .getBoolean(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)
              .booleanValue();
    }

    // If the first char of the new text is NOT whitespace AND the last char of tmpDocContent is
    // NOT whitespace, then "tmpDocContent + content" would produce a new, different word. The
    // tokeniser, gazetteer and JAPE work on the raw text, so fused tokens must be avoided.
    if (tmpDocContentSize != 0
        && content.length() != 0
        && !Character.isWhitespace(content.charAt(0))
        && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) {

      // characters() gets called for each entity separately, so a chunk of length 1 holding one
      // of the well-known entity characters was most likely produced by an internal entity
      boolean contentIsEntity =
          content.length() == 1 && isPredefinedEntityChar(content.charAt(0));
      // the previous chunk is only inspected by its last character (no length check)
      boolean previousEndsWithEntity =
          isPredefinedEntityChar(tmpDocContent.charAt(tmpDocContentSize - 1));

      // entities are appended as they are; everything else is kept apart by a single space
      if (addExtraSpace && !contentIsEntity && !previousEndsWithEntity) {
        contentBuffer.append(' ');
        incrementStartIndex = true;
      }
    }

    // put the repositioning information
    if (reposInfo != null) {
      if (!(start == 0 && length == 1 && text.length <= 2)) {
        // normal piece of text
        reposInfo.addPositionInfo(
            getRealOffset(),
            content.length(),
            tmpDocContent.length() + contentBuffer.length(),
            content.length());
        if (DEBUG) {
          Out.println("Info: " + getRealOffset() + ", " + content.length());
          Out.println("Start: " + start + " len" + length);
        } // DEBUG
      } else {
        // unicode char or &xxx; coding
        // The offset reported by the parser is 0; the real offset has to be looked up in the
        // ampCodingInfo structure.
        long lastPosition = 0;
        RepositioningInfo.PositionInfo pi;

        if (reposInfo.size() > 0) {
          pi = reposInfo.get(reposInfo.size() - 1);
          lastPosition = pi.getOriginalPosition();
        }

        // take the first recorded coding that lies after the last position already registered
        for (int i = 0; i < ampCodingInfo.size(); ++i) {
          pi = ampCodingInfo.get(i);
          if (pi.getOriginalPosition() > lastPosition) {
            reposInfo.addPositionInfo(
                pi.getOriginalPosition(),
                pi.getOriginalLength(),
                tmpDocContent.length() + contentBuffer.length(),
                content.length());
            break;
          }
        }
      }
    }

    // update the document content
    contentBuffer.append(content);

    // End index for all the elements of the stack = current doc length + text length
    long endOffset = tmpDocContent.length() + contentBuffer.length();
    Long end = Long.valueOf(endOffset);

    // walk the stack and move the End index of every open element
    for (CustomObject obj : stack) {
      // an element that is still empty must start after the inserted space, not on it
      if (incrementStartIndex && obj.getStart().equals(obj.getEnd())) {
        obj.setStart(Long.valueOf(obj.getStart().longValue() + 1));
      }
      obj.setEnd(end);
    }

    tmpDocContent.append(contentBuffer.toString());
  } // charactersAction()

  /** Tells whether {@code c} is one of the characters {@code & < > " '}. */
  private boolean isPredefinedEntityChar(char c) {
    return c == '&' || c == '<' || c == '>' || c == '"' || c == '\'';
  }