/**
 * Begins a new document: forwards the event to the superclass and to the delegate, then
 * resets the per-document bookkeeping.
 *
 * @throws SAXException if the superclass or the delegate fails to handle the event
 */
@Override
public void startDocument() throws SAXException {
    super.startDocument();
    delegate.startDocument();

    // Every document starts out in the header section with no header character runs seen.
    headerCharOffset = 0;
    inFooter = false;
    inHeader = true;

    // A fresh recording list is only allocated when markup is being included.
    if (includeMarkup) {
        elements = new ArrayList<RecordedElement>();
    }
}
@Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { super.startElement(uri, localName, qName, atts); if (inHeader) { delegate.startElement(uri, localName, qName, atts); } else if (inFooter) { // Do nothing } else if (includeMarkup) { elements.add(new RecordedElement(uri, localName, qName, atts)); } else { // This happens for the <body> element, if we're not doing markup. delegate.startElement(uri, localName, qName, atts); } };
@Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); if (inHeader) { delegate.endElement(uri, localName, qName); inHeader = !localName.equals("head"); } else if (inFooter) { // Do nothing } else if (localName.equals("body")) { inFooter = true; } else if (includeMarkup) { // Add the end element, and the continuation from the previous element elements.add(new RecordedElement(uri, localName, qName)); elements.add(new RecordedElement()); } };
@Override public void characters(char[] chars, int offset, int length) throws SAXException { super.characters(chars, offset, length); if (inHeader) { delegate.characters(chars, offset, length); headerCharOffset++; } else if (inFooter) { // Do nothing } else if (includeMarkup) { RecordedElement element = elements.get(elements.size() - 1); char[] characters = new char[length]; System.arraycopy(chars, offset, characters, 0, length); element.getCharacters().add(characters); } };
@Override public void endDocument() throws SAXException { super.endDocument(); TextDocument td = toTextDocument(); try { extractor.process(td); } catch (BoilerpipeProcessingException e) { throw new SAXException(e); } Attributes emptyAttrs = new AttributesImpl(); // At this point we have all the information we need to either emit N paragraphs // of plain text (if not including markup), or we have to replay our recorded elements // and only emit character runs that passed the boilerpipe filters. if (includeMarkup) { BitSet validCharacterRuns = new BitSet(); for (TextBlock block : td.getTextBlocks()) { if (block.isContent()) { BitSet bs = block.getContainedTextElements(); if (bs != null) { validCharacterRuns.or(bs); } } } // Now have bits set for all valid character runs. Replay our recorded elements, // but only emit character runs flagged as valid. int curCharsIndex = headerCharOffset; for (RecordedElement element : elements) { switch (element.getElementType()) { case START: delegate.startElement( element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs()); // Fall through case CONTINUE: // Now emit characters that are valid. Note that boilerpipe pre-increments the character // index, so // we have to follow suit. 
for (char[] chars : element.getCharacters()) { curCharsIndex++; if (validCharacterRuns.get(curCharsIndex)) { delegate.characters(chars, 0, chars.length); } } break; case END: delegate.endElement(element.getUri(), element.getLocalName(), element.getQName()); break; default: throw new RuntimeException("Unhandled element type: " + element.getElementType()); } } } else { for (TextBlock block : td.getTextBlocks()) { if (block.isContent()) { delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs); char[] chars = block.getText().toCharArray(); delegate.characters(chars, 0, chars.length); delegate.endElement(XHTMLContentHandler.XHTML, "p", "p"); delegate.ignorableWhitespace(NL, 0, NL.length); } } } delegate.endElement(XHTMLContentHandler.XHTML, "body", "body"); delegate.endElement(XHTMLContentHandler.XHTML, "html", "html"); // We defer ending any prefix mapping until here, which is why we don't pass this // through to the delegate in an overridden method. delegate.endPrefixMapping(""); delegate.endDocument(); }
/**
 * Opens a namespace prefix mapping on both the superclass and the delegate.
 *
 * <p>The matching end of the default ({@code ""}) mapping is issued to the delegate from
 * {@code endDocument()}.
 *
 * @param prefix namespace prefix being declared
 * @param uri namespace URI the prefix is mapped to
 * @throws SAXException if the superclass or the delegate fails to handle the event
 */
@Override
public void startPrefixMapping(String prefix, String uri) throws SAXException {
    super.startPrefixMapping(prefix, uri);

    // Passed straight through regardless of the current header/body/footer state.
    delegate.startPrefixMapping(prefix, uri);
}