Java BodyContentHandler Examples

Programming Language: Java

Namespace/Package Name: org.apache.tika.sax

Examples at hotexamples.com: 5

Java BodyContentHandler - 5 examples found. These are the top-rated real-world Java examples of org.apache.tika.sax.BodyContentHandler extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

characters(2)

endElement(1)

ignorableWhitespace(1)

startElement(1)

toString(1)

Example #1

Show file

File: PoiHssfContentTransformer.java Project: bulias/community-edition

    /**
     * Forwards character data to the parent handler, CSV-quoting it when it belongs to a table
     * cell.
     *
     * <p>Outside a cell the characters are passed through untouched. Inside a cell, text that
     * matches {@code all_nums} is passed through as-is; any other text is wrapped in double
     * quotes, with every embedded double quote doubled.
     *
     * @param ch buffer holding the characters
     * @param start index of the first character to read from {@code ch}
     * @param length number of characters to read from {@code ch}
     * @throws SAXException if the parent handler rejects the characters
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
      if (!inCell) {
        super.characters(ch, start, length);
        return;
      }

      String text = new String(ch, start, length);

      // Quote if not all numbers
      if (all_nums.matcher(text).matches()) {
        super.characters(ch, start, length);
        return;
      }

      // Build the quoted field in one forward pass. Every embedded quote is doubled, including
      // runs of adjacent quotes (the earlier backward-insert loop stepped over the character
      // preceding each quote it doubled, so "" was only partially escaped).
      StringBuilder quoted = new StringBuilder(text.length() + 2);
      quoted.append('"');
      for (int i = 0; i < text.length(); i++) {
        char current = text.charAt(i);
        if (current == '"') {
          // Double up double quotes
          quoted.append('"');
        }
        quoted.append(current);
      }
      quoted.append('"');

      char[] c = quoted.toString().toCharArray();
      super.characters(c, 0, c.length);
    }

Example #2

Show file

File: PoiHssfContentTransformer.java Project: bulias/community-edition

 /**
  * Handles the start of an element, treating {@code td} as the beginning of a CSV cell.
  *
  * <p>A {@code td} element is not forwarded to the parent handler. Instead the handler marks
  * itself as being inside a cell and, if a separator is pending ({@code needsComma}), emits the
  * first character of {@code comma} through the parent's {@code characters}. All other elements
  * are forwarded unchanged.
  *
  * @param uri namespace URI of the element
  * @param localName local name of the element
  * @param name qualified name of the element
  * @param atts attributes attached to the element
  * @throws SAXException if the parent handler fails
  */
 @Override
 public void startElement(String uri, String localName, String name, Attributes atts)
     throws SAXException {
   if (localName.equals("td")) {
     inCell = true;
     if (needsComma) {
       // needsComma is deliberately left set here: re-assigning true inside this branch was a
       // no-op, so the dead store has been dropped.
       super.characters(comma, 0, 1);
     }
   } else {
     super.startElement(uri, localName, name, atts);
   }
 }

Example #3

Show file

File: PoiHssfContentTransformer.java Project: bulias/community-edition

 /**
  * Forwards ignorable whitespace to the parent handler, except for a run that is exactly one
  * tab character, which is dropped.
  *
  * @param ch buffer holding the whitespace characters
  * @param start index of the first whitespace character in {@code ch}
  * @param length number of whitespace characters to read from {@code ch}
  * @throws SAXException if the parent handler fails
  */
 @Override
 public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
   // The reported run begins at ch[start]; inspecting ch[0] would look at unrelated buffer
   // content whenever start is non-zero.
   boolean singleTab = length == 1 && ch[start] == '\t';
   if (!singleTab) {
     // Ignore tabs, as they mess up the CSV output; everything else passes through.
     super.ignorableWhitespace(ch, start, length);
   }
 }

Example #4

Show file

File: PoiHssfContentTransformer.java Project: bulias/community-edition

 /**
  * Handles the end of an element, tracking cell and row boundaries for CSV output.
  *
  * <p>Closing a {@code td} leaves the cell and flags that a separator is needed before the next
  * one; the event itself is not forwarded. Any other element is forwarded to the parent handler,
  * and closing a {@code tr} first clears the pending-separator flag.
  *
  * @param uri namespace URI of the element
  * @param localName local name of the element
  * @param name qualified name of the element
  * @throws SAXException if the parent handler fails
  */
 @Override
 public void endElement(String uri, String localName, String name) throws SAXException {
   boolean closingCell = localName.equals("td");
   if (closingCell) {
     // Cell finished: the next cell in this row must be preceded by a separator.
     inCell = false;
     needsComma = true;
     return;
   }

   boolean closingRow = localName.equals("tr");
   if (closingRow) {
     // Row finished: the first cell of the next row takes no leading separator.
     needsComma = false;
   }
   super.endElement(uri, localName, name);
 }

Example #5

Show file

File: DefaultConverter.java Project: jgibson/Xponents

  /**
   * Shared conversion routine: parses the given stream and packages the extracted text and
   * selected metadata into a {@link ConvertedDocument}.
   *
   * <p>The input stream is always closed before this method returns or throws.
   *
   * @param input stream for the raw file; closed by this method
   * @param doc the raw file the stream was opened from
   * @return the converted document, carrying title, encoding, creation date, author and text
   * @throws IOException if parsing fails for any reason (the underlying cause is attached), if a
   *     class required by the parser cannot be loaded, or if closing the stream fails
   */
  @Override
  protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc)
      throws IOException {
    final Metadata metadata = new Metadata();
    final BodyContentHandler handler = new BodyContentHandler(maxBuffer);

    try {
      parser.parse(input, handler, metadata, ctx);
    } catch (NoClassDefFoundError classErr) {
      throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
    } catch (Exception xerr) {
      throw new IOException("Unable to parse content", xerr);
    } finally {
      input.close();
    }

    final ConvertedDocument converted = new ConvertedDocument(doc);

    // Copy over the metadata fields of interest.
    converted.addTitle(metadata.get(TikaCoreProperties.TITLE));
    converted.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
    converted.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED));
    converted.addAuthor(metadata.get(TikaCoreProperties.CREATOR));

    final String extracted = handler.toString();
    if (extracted == null) {
      return converted;
    }

    // Historical note (v1.5): the blank-line reducer was reported to cause a StackOverflow on
    // Java 6 for documents with hundreds of consecutive \n, e.g. a spreadsheet converted to text
    // with thousands of empty lines after the last data row. Spreadsheet text is therefore only
    // trimmed; all other text still goes through the line-break reducer.
    final boolean spreadsheet =
        converted.filename != null && FileUtility.isSpreadsheet(converted.filename);
    converted.setText(spreadsheet ? extracted.trim() : TextUtils.reduce_line_breaks(extracted));
    return converted;
  }