예제 #1
0
  /**
   * Begins a new document by notifying the superclass and the delegate, then resetting the
   * per-document parsing state.
   *
   * <p>After this call the handler is in header mode (and not in footer mode), the header
   * character offset is zero and, when markup is being included, a fresh empty list of recorded
   * elements is installed.
   *
   * @throws SAXException if the superclass or the delegate fails to handle the event
   */
  @Override
  public void startDocument() throws SAXException {
    super.startDocument();
    delegate.startDocument();

    // Every document starts out inside the header, with nothing counted yet.
    headerCharOffset = 0;
    inFooter = false;
    inHeader = true;

    if (!includeMarkup) {
      return;
    }

    // Markup mode buffers elements for replay, so start from an empty buffer.
    elements = new ArrayList<RecordedElement>();
  }
예제 #2
0
  /**
   * Handles the start of an element, routing it according to the current parsing phase.
   *
   * <ul>
   *   <li>In the header: the event is passed straight to the delegate.
   *   <li>In the footer: the event is dropped.
   *   <li>Otherwise, with markup included: the element is recorded for later replay.
   *   <li>Otherwise, without markup: the event is passed to the delegate (this is the case for
   *       the {@code <body>} element).
   * </ul>
   *
   * @param uri namespace URI of the element
   * @param localName local name of the element
   * @param qName qualified name of the element
   * @param atts attributes attached to the element
   * @throws SAXException if the superclass or the delegate fails to handle the event
   */
  @Override
  public void startElement(String uri, String localName, String qName, Attributes atts)
      throws SAXException {
    super.startElement(uri, localName, qName, atts);

    if (!inHeader) {
      if (inFooter) {
        // Nothing after the body is forwarded from here.
        return;
      }
      if (includeMarkup) {
        // Buffer the element; it is emitted later, once filtering has been done.
        elements.add(new RecordedElement(uri, localName, qName, atts));
        return;
      }
    }

    // Header elements, and the <body> element when markup is not being recorded,
    // go directly to the delegate.
    delegate.startElement(uri, localName, qName, atts);
  }
예제 #3
0
  /**
   * Handles the end of an element, routing it according to the current parsing phase.
   *
   * <p>Header end tags are passed to the delegate, and the header phase finishes once the
   * {@code head} element closes. Footer end tags are dropped. The closing {@code body} tag
   * switches the handler into the footer phase without being forwarded or recorded. Any other
   * end tag is recorded when markup is included, followed by a continuation record that collects
   * the character runs appearing after it.
   *
   * @param uri namespace URI of the element
   * @param localName local name of the element
   * @param qName qualified name of the element
   * @throws SAXException if the superclass or the delegate fails to handle the event
   */
  @Override
  public void endElement(String uri, String localName, String qName) throws SAXException {
    super.endElement(uri, localName, qName);

    if (inHeader) {
      delegate.endElement(uri, localName, qName);
      if (localName.equals("head")) {
        // Closing <head> is what ends the header phase.
        inHeader = false;
      }
      return;
    }

    if (inFooter) {
      return;
    }

    if (localName.equals("body")) {
      // Everything after </body> is treated as footer.
      inFooter = true;
      return;
    }

    if (includeMarkup) {
      // Record the end tag, then a continuation slot for text that follows it.
      elements.add(new RecordedElement(uri, localName, qName));
      elements.add(new RecordedElement());
    }
  }
예제 #4
0
  /**
   * Handles a run of character data, routing it according to the current parsing phase.
   *
   * <p>Header text is passed to the delegate and counted in {@code headerCharOffset} (one per
   * call, not per character). Footer text is dropped. Otherwise, when markup is included, a
   * private copy of the run is appended to the most recently recorded element.
   *
   * @param chars buffer holding the characters
   * @param offset index of the first character of the run within {@code chars}
   * @param length number of characters in the run
   * @throws SAXException if the superclass or the delegate fails to handle the event
   */
  @Override
  public void characters(char[] chars, int offset, int length) throws SAXException {
    super.characters(chars, offset, length);

    if (inHeader) {
      delegate.characters(chars, offset, length);
      headerCharOffset++;
      return;
    }

    if (inFooter || !includeMarkup) {
      return;
    }

    RecordedElement latest = elements.get(elements.size() - 1);

    // Copy the run: only the [offset, offset + length) slice of the caller's buffer is kept.
    char[] run = new char[length];
    System.arraycopy(chars, offset, run, 0, length);
    latest.getCharacters().add(run);
  }
예제 #5
0
  /**
   * Finishes the document: runs the extractor over the collected text, emits the surviving
   * content to the delegate, and then closes the {@code body} element, the {@code html} element,
   * the default prefix mapping and the document itself.
   *
   * <p>Without markup, each content block is written as one {@code <p>} element followed by the
   * {@code NL} ignorable whitespace. With markup, the recorded elements are replayed in order,
   * and only the character runs whose index is flagged by some content block are forwarded.
   *
   * @throws SAXException if the superclass or the delegate fails, or as a wrapper around a
   *     {@link BoilerpipeProcessingException} thrown by the extractor
   */
  @Override
  public void endDocument() throws SAXException {
    super.endDocument();

    TextDocument textDocument = toTextDocument();
    try {
      extractor.process(textDocument);
    } catch (BoilerpipeProcessingException cause) {
      throw new SAXException(cause);
    }

    // Everything needed is now available: either write one paragraph per content block
    // (plain mode), or replay the recorded markup while filtering its character runs.
    if (!includeMarkup) {
      Attributes noAttributes = new AttributesImpl();

      for (TextBlock textBlock : textDocument.getTextBlocks()) {
        if (!textBlock.isContent()) {
          continue;
        }
        delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", noAttributes);
        char[] text = textBlock.getText().toCharArray();
        delegate.characters(text, 0, text.length);
        delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
        delegate.ignorableWhitespace(NL, 0, NL.length);
      }
    } else {
      // Union of the text-element indices contained in every content block.
      BitSet keptRuns = new BitSet();
      for (TextBlock textBlock : textDocument.getTextBlocks()) {
        if (!textBlock.isContent()) {
          continue;
        }
        BitSet contained = textBlock.getContainedTextElements();
        if (contained != null) {
          keptRuns.or(contained);
        }
      }

      // Replay the recording. Run numbering continues from the header's count, and the index is
      // advanced before it is tested because boilerpipe pre-increments its character index.
      int runIndex = headerCharOffset;
      for (RecordedElement recorded : elements) {
        switch (recorded.getElementType()) {
          case END:
            delegate.endElement(
                recorded.getUri(), recorded.getLocalName(), recorded.getQName());
            break;

          case START:
            delegate.startElement(
                recorded.getUri(),
                recorded.getLocalName(),
                recorded.getQName(),
                recorded.getAttrs());
            // Fall through: a start record also carries the character runs that follow it.

          case CONTINUE:
            for (char[] run : recorded.getCharacters()) {
              if (keptRuns.get(++runIndex)) {
                delegate.characters(run, 0, run.length);
              }
            }
            break;

          default:
            throw new RuntimeException("Unhandled element type: " + recorded.getElementType());
        }
      }
    }

    delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
    delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");

    // Ending the prefix mapping is deferred until here, which is why it is not passed
    // through to the delegate from an overridden method.
    delegate.endPrefixMapping("");

    delegate.endDocument();
  }
예제 #6
0
 /**
  * Passes the start of a namespace prefix mapping to the superclass and then, unchanged, to the
  * delegate.
  *
  * @param prefix the namespace prefix being declared
  * @param uri the namespace URI the prefix is mapped to
  * @throws SAXException if the superclass or the delegate fails to handle the event
  */
 @Override
 public void startPrefixMapping(String prefix, String uri) throws SAXException {
   super.startPrefixMapping(prefix, uri);

   // Forwarded immediately, regardless of the current parsing phase.
   delegate.startPrefixMapping(prefix, uri);
 }