Beispiel #1
0
  /**
   * Adds repositioning records for one piece of extracted text, splitting it around every entry of
   * {@code ampCodingInfo} that falls inside it.
   *
   * <p>Entries of {@code ampCodingInfo} are visited in list order. Each entry whose original
   * position lies within the current text yields up to two records in {@code reposInfo}: one for
   * the unchanged text in front of the entry (equal original and extracted length) and one for the
   * entry itself (original length versus current length). Whatever text is left after the last
   * matching entry is recorded as a final unchanged span.
   *
   * @param content the extracted text; only its length is used
   * @param pos position of the text as reported for the original document
   * @param extractedPos position of the text in the extracted content
   */
  public void addRepositioningInfo(String content, int pos, int extractedPos) {
    final int textLen = content.length();

    // Start of the part of the text that has not been recorded yet.
    long cursor = pos;
    // Accumulated difference between original and current lengths of the entries handled so far.
    long shrinkTotal = 0;

    for (int idx = 0; idx < ampCodingInfo.size(); idx++) {
      RepositioningInfo.PositionInfo shrunk = ampCodingInfo.get(idx);
      long shrunkAt = shrunk.getOriginalPosition();

      if (shrunkAt < cursor) {
        // entry lies before the part still to be recorded
        continue;
      }
      if (shrunkAt > pos + textLen + shrinkTotal) {
        // entry lies beyond the current text; stop scanning
        break;
      }

      // unchanged text between the cursor and the entry
      long plainLen = shrunkAt - (cursor + shrinkTotal);
      long extractedOffset = cursor - pos;
      if (plainLen > 0) {
        reposInfo.addPositionInfo(
            cursor + shrinkTotal, plainLen, extractedPos + extractedOffset, plainLen);
      }

      // the entry itself: original length maps onto the (different) current length
      reposInfo.addPositionInfo(
          shrunkAt,
          shrunk.getOriginalLength(),
          extractedPos + extractedOffset + plainLen,
          shrunk.getCurrentLength());

      cursor = cursor + plainLen + shrunk.getCurrentLength();
      shrinkTotal += shrunk.getOriginalLength() - shrunk.getCurrentLength();
    }

    // unchanged text left over after the last handled entry
    long tailOffset = cursor - pos;
    long tailLen = textLen - tailOffset;
    if (tailLen > 0) {
      reposInfo.addPositionInfo(cursor + shrinkTotal, tailLen, extractedPos + tailOffset, tailLen);
    }
  } // addRepositioningInfo
Beispiel #2
0
  /**
   * Processes one chunk of character data: appends it to {@code tmpDocContent}, records
   * repositioning information for it, and moves the end offset of every element on {@code stack}
   * to the new end of the content.
   *
   * <p>If both the last character already collected and the first character of the new chunk are
   * non-whitespace, a single space is inserted between them so that two tokens are not fused into
   * one word. The space is not inserted when
   *
   * <ul>
   *   <li>the new chunk is exactly one of the characters {@code & < > " '} (treated as a resolved
   *       internal entity),
   *   <li>the last collected character is one of those characters, or
   *   <li>the user option {@code GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME} is set
   *       to {@code false}.
   * </ul>
   *
   * @param text buffer holding the reported characters
   * @param start index in {@code text} of the first reported character
   * @param length number of reported characters
   * @throws SAXException declared by the signature; the only visible source is the call to
   *     {@code super.characters} (NOTE(review): confirm against the superclass)
   */
  public void charactersAction(char[] text, int start, int length) throws SAXException {
    // correction of real offset. Does not affect other data.
    super.characters(text, start, length);

    // create a string object based on the reported text
    String content = new String(text, start, length);
    // Collects the optional separating space followed by the new content. It is a purely local
    // buffer, so the unsynchronized StringBuilder is sufficient.
    StringBuilder contentBuffer = new StringBuilder();
    int tmpDocContentSize = tmpDocContent.length();
    boolean incrementStartIndex = false;

    // The separating space is added by default; the user configuration may switch it off.
    boolean addExtraSpace = true;
    if (Gate.getUserConfig().get(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) != null) {
      addExtraSpace =
          Gate.getUserConfig()
              .getBoolean(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)
              .booleanValue();
    }

    // If the first char of the new text is NOT whitespace AND the last char of tmpDocContent is
    // NOT whitespace, plain concatenation would create a new, different word. We want to avoid
    // that, because the tokenizer, gazetteer and Jape work on the raw text and concatenated
    // tokens might not be good.
    if (tmpDocContentSize != 0 && content.length() != 0) {
      final String entityChars = "&<>\"'";
      char firstNew = content.charAt(0);
      char lastOld = tmpDocContent.charAt(tmpDocContentSize - 1);

      boolean wouldFuseTokens =
          !Character.isWhitespace(firstNew) && !Character.isWhitespace(lastOld);
      // Testing the length against 1 makes it more likely that an internal entity was called:
      // characters() gets called for each entity separately. However, the most common internal
      // entities (&, <, >, etc.) are exempt from the extra space, on either side of the join.
      boolean besideEntity =
          (content.length() == 1 && entityChars.indexOf(firstNew) >= 0)
              || entityChars.indexOf(lastOld) >= 0;

      if (wouldFuseTokens && addExtraSpace && !besideEntity) {
        // keep the two tokens apart
        contentBuffer.append(" ");
        incrementStartIndex = true;
      }
    }

    // put the repositioning information
    if (reposInfo != null) {
      if (!(start == 0 && length == 1 && text.length <= 2)) {
        // normal piece of text
        reposInfo.addPositionInfo(
            getRealOffset(),
            content.length(),
            tmpDocContent.length() + contentBuffer.length(),
            content.length());
        if (DEBUG) {
          Out.println("Info: " + getRealOffset() + ", " + content.length());
          Out.println("Start: " + start + " len" + length);
        } // DEBUG
      } else {
        // unicode char or &xxx; coding
        // The offset reported by the parser is 0.
        // The real offset should be found in the ampCodingInfo structure.

        long lastPosition = 0;
        RepositioningInfo.PositionInfo pi;

        if (reposInfo.size() > 0) {
          pi = reposInfo.get(reposInfo.size() - 1);
          lastPosition = pi.getOriginalPosition();
        }

        // use the first ampCodingInfo entry located after the last recorded original position
        for (int i = 0; i < ampCodingInfo.size(); ++i) {
          pi = ampCodingInfo.get(i);
          if (pi.getOriginalPosition() > lastPosition) {
            // found
            reposInfo.addPositionInfo(
                pi.getOriginalPosition(),
                pi.getOriginalLength(),
                tmpDocContent.length() + contentBuffer.length(),
                content.length());
            break;
          }
        }
      }
    }

    // update the document content
    contentBuffer.append(content);
    // calculate the End index for all the elements of the stack
    // the expression is : End index = Current doc length + text length
    // (widened to long before adding; Long.valueOf replaces the deprecated Long constructor)
    Long end = Long.valueOf((long) tmpDocContent.length() + contentBuffer.length());

    // Iterate through the stack to modify the End index of the existing elements
    Iterator<CustomObject> anIterator = stack.iterator();
    while (anIterator.hasNext()) {
      CustomObject obj = anIterator.next();
      // An element whose start equals its end has its start moved one position forward when a
      // separating space was inserted in front of this chunk.
      if (incrementStartIndex && obj.getStart().equals(obj.getEnd())) {
        obj.setStart(Long.valueOf(obj.getStart().longValue() + 1));
      }
      // set its End index
      obj.setEnd(end);
    }

    tmpDocContent.append(contentBuffer.toString());
  } // characters();