/** * For given content the list with shrink position information is searched and on the * corresponding positions the correct repositioning information is calculated and generated. */ public void addRepositioningInfo(String content, int pos, int extractedPos) { int contentLength = content.length(); // wrong way (without correction and analysing) // reposInfo.addPositionInfo(pos, contentLength, extractedPos, contentLength); RepositioningInfo.PositionInfo pi = null; long startPos = pos; long correction = 0; long substituteStart; long remainingLen; long offsetInExtracted; for (int i = 0; i < ampCodingInfo.size(); ++i) { pi = ampCodingInfo.get(i); substituteStart = pi.getOriginalPosition(); if (substituteStart >= startPos) { if (substituteStart > pos + contentLength + correction) { break; // outside the current text } // if // should create two repositioning information records remainingLen = substituteStart - (startPos + correction); offsetInExtracted = startPos - pos; if (remainingLen > 0) { reposInfo.addPositionInfo( startPos + correction, remainingLen, extractedPos + offsetInExtracted, remainingLen); } // if // record for shrank text reposInfo.addPositionInfo( substituteStart, pi.getOriginalLength(), extractedPos + offsetInExtracted + remainingLen, pi.getCurrentLength()); startPos = startPos + remainingLen + pi.getCurrentLength(); correction += pi.getOriginalLength() - pi.getCurrentLength(); } // if } // for // there is some text remaining for repositioning offsetInExtracted = startPos - pos; remainingLen = contentLength - offsetInExtracted; if (remainingLen > 0) { reposInfo.addPositionInfo( startPos + correction, remainingLen, extractedPos + offsetInExtracted, remainingLen); } // if } // addRepositioningInfo
/** This method is called when all characters between specific tags have been read completely */ public void charactersAction(char[] text, int start, int length) throws SAXException { // correction of real offset. Didn't affect on other data. super.characters(text, start, length); // create a string object based on the reported text String content = new String(text, start, length); StringBuffer contentBuffer = new StringBuffer(""); int tmpDocContentSize = tmpDocContent.length(); boolean incrementStartIndex = false; boolean addExtraSpace = true; if (Gate.getUserConfig().get(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) != null) { addExtraSpace = Gate.getUserConfig() .getBoolean(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) .booleanValue(); } // If the first char of the text just read "text[0]" is NOT whitespace AND // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then // concatenation "tmpDocContent + content" will result into a new different // word... and we want to avoid that, because the tokenizer, gazetter and // Jape work on the raw text and concatenating tokens might be not good. if (tmpDocContentSize != 0 && content.length() != 0 && !Character.isWhitespace(content.charAt(0)) && !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))) { // If we are here it means that a concatenation between the last // token in the tmpDocContent and the content(which doesn't start // with a white space) will be performed. In order to prevent this, // we will add a " " space char in order to assure that the 2 tokens // stay apart. Howerver we will except from this rule the most known // internal entities like &, <, >, etc if (( // Testing the length against 1 makes it more likely that // an internal entity was called. characters() gets called for // each entity separately. (content.length() == 1) && (content.charAt(0) == '&' || content.charAt(0) == '<' || content.charAt(0) == '>' || content.charAt(0) == '"' || content.charAt(0) == '\'')) || (tmpDocContent.charAt(tmpDocContentSize - 1) == '&' || tmpDocContent.charAt(tmpDocContentSize - 1) == '<' || tmpDocContent.charAt(tmpDocContentSize - 1) == '>' || tmpDocContent.charAt(tmpDocContentSize - 1) == '"' || tmpDocContent.charAt(tmpDocContentSize - 1) == '\'')) { // do nothing. The content will be appended } else if (!addExtraSpace) { } else { // In all other cases append " " contentBuffer.append(" "); incrementStartIndex = true; } // End if } // End if // put the repositioning information if (reposInfo != null) { if (!(start == 0 && length == 1 && text.length <= 2)) { // normal piece of text reposInfo.addPositionInfo( getRealOffset(), content.length(), tmpDocContent.length() + contentBuffer.length(), content.length()); if (DEBUG) { Out.println("Info: " + getRealOffset() + ", " + content.length()); Out.println("Start: " + start + " len" + length); } // DEBUG } else { // unicode char or &xxx; coding // Reported from the parser offset is 0 // The real offset should be found in the ampCodingInfo structure. long lastPosition = 0; RepositioningInfo.PositionInfo pi; if (reposInfo.size() > 0) { pi = reposInfo.get(reposInfo.size() - 1); lastPosition = pi.getOriginalPosition(); } // if for (int i = 0; i < ampCodingInfo.size(); ++i) { pi = ampCodingInfo.get(i); if (pi.getOriginalPosition() > lastPosition) { // found reposInfo.addPositionInfo( pi.getOriginalPosition(), pi.getOriginalLength(), tmpDocContent.length() + contentBuffer.length(), content.length()); break; } // if } // for } // if } // if // update the document content contentBuffer.append(content); // calculate the End index for all the elements of the stack // the expression is : End index = Current doc length + text length Long end = new Long(tmpDocContent.length() + contentBuffer.length()); CustomObject obj = null; // Iterate through stack to modify the End index of the existing elements Iterator<CustomObject> anIterator = stack.iterator(); while (anIterator.hasNext()) { // get the object and move to the next one obj = anIterator.next(); if (incrementStartIndex && obj.getStart().equals(obj.getEnd())) { obj.setStart(new Long(obj.getStart().longValue() + 1)); } // End if // sets its End index obj.setEnd(end); } // End while tmpDocContent.append(contentBuffer.toString()); } // characters();