/**
 * Forwards character data, CSV-quoting text that appears inside a table cell.
 *
 * <p>Outside a cell the characters are passed to the superclass untouched. Inside a cell, text
 * that matches the {@code all_nums} pattern is passed through as-is; any other text is wrapped in
 * double quotes, with every embedded double quote doubled.
 *
 * @param ch buffer holding the characters
 * @param start index in {@code ch} of the first character to use
 * @param length number of characters to use
 * @throws SAXException propagated from the superclass handler
 */
@Override
    public void characters(char[] ch, int start, int length) throws SAXException {
      if (!inCell) {
        super.characters(ch, start, length);
        return;
      }

      String text = new String(ch, start, length);

      // Quote if not all numbers
      if (all_nums.matcher(text).matches()) {
        super.characters(ch, start, length);
        return;
      }

      // Double up every embedded double quote. A whole-string replace is used so that runs of
      // adjacent quotes are each doubled; the previous backward loop skipped every other one.
      String quoted = "\"" + text.replace("\"", "\"\"") + "\"";
      char[] c = quoted.toCharArray();
      super.characters(c, 0, c.length);
    }
 /**
  * Handles the start of an element, turning {@code td} elements into CSV cell boundaries.
  *
  * <p>A {@code td} start tag is not forwarded: it marks that a cell is open and, when
  * {@code needsComma} is set, emits the separator held in {@code comma} first. Every other
  * element is forwarded to the superclass unchanged.
  *
  * @param uri namespace URI of the element
  * @param localName local name of the element
  * @param name qualified name of the element
  * @param atts attributes attached to the element
  * @throws SAXException propagated from the superclass handler
  */
 @Override
 public void startElement(String uri, String localName, String name, Attributes atts)
     throws SAXException {
   // Literal-first comparison so a null localName is simply treated as "not a td".
   if (!"td".equals(localName)) {
     super.startElement(uri, localName, name, atts);
     return;
   }

   inCell = true;
   if (needsComma) {
     // The flag is already true on this path, so it is not reassigned here.
     super.characters(comma, 0, 1);
   }
 }
 /**
  * Forwards ignorable whitespace to the superclass, except for a run that is a single tab.
  *
  * @param ch buffer holding the whitespace characters
  * @param start index in {@code ch} of the first character of the run
  * @param length number of characters in the run
  * @throws SAXException propagated from the superclass handler
  */
 @Override
 public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
   // Ignore tabs, as they mess up the CSV output. The run begins at ch[start], not ch[0],
   // so the tab check must be made at that offset.
   boolean loneTab = length == 1 && ch[start] == '\t';
   if (!loneTab) {
     super.ignorableWhitespace(ch, start, length);
   }
 }
 /**
  * Handles the end of an element, tracking CSV cell and row state.
  *
  * <p>A closing {@code td} is not forwarded: it clears {@code inCell} and sets
  * {@code needsComma}. A closing {@code tr} clears {@code needsComma} and is then forwarded, as
  * is every other element.
  *
  * @param uri namespace URI of the element
  * @param localName local name of the element
  * @param name qualified name of the element
  * @throws SAXException propagated from the superclass handler
  */
 @Override
 public void endElement(String uri, String localName, String name) throws SAXException {
   boolean closingCell = localName.equals("td");
   if (!closingCell) {
     // A finished row resets the separator state before the tag is passed on.
     if (localName.equals("tr")) {
       needsComma = false;
     }
     super.endElement(uri, localName, name);
     return;
   }

   // Cell finished: the tag itself is swallowed.
   inCell = false;
   needsComma = true;
 }
Example #5
0
  /**
   * Common implementation: parses a raw file from an input stream and returns a ConvertedDocument
   * carrying the extracted text and selected metadata (title, encoding, creation date, author).
   *
   * <p>The input stream is closed in a {@code finally} block, whether parsing succeeds or fails.
   *
   * @param input stream for the raw file
   * @param doc the raw file
   * @return the converted document
   * @throws IOException if the underlying Tika parser/writer had an IO problem or a parser
   *     problem, or MAX_TEXT_SIZE is reached.
   */
  @Override
  protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc)
      throws IOException {
    Metadata meta = new Metadata();
    BodyContentHandler bodyHandler = new BodyContentHandler(maxBuffer);

    try {
      parser.parse(input, bodyHandler, meta, ctx);
    } catch (NoClassDefFoundError missingClass) {
      throw new IOException("Unable to parse content due to Tika misconfiguration", missingClass);
    } catch (Exception parseFailure) {
      throw new IOException("Unable to parse content", parseFailure);
    } finally {
      input.close();
    }

    ConvertedDocument converted = new ConvertedDocument(doc);
    converted.addTitle(meta.get(TikaCoreProperties.TITLE));
    converted.setEncoding(meta.get(Metadata.CONTENT_ENCODING));
    converted.addCreateDate(meta.getDate(TikaCoreProperties.CREATED));
    converted.addAuthor(meta.get(TikaCoreProperties.CREATOR));

    String extracted = bodyHandler.toString();
    if (extracted == null) {
      return converted;
    }

    // Historical note (v1.5): the blank-line reducer appeared to cause a StackOverflow under
    // Java 6 when a document contained hundreds of consecutive line breaks -- e.g. a spreadsheet
    // converted to text may have thousands of empty lines after the last data row. Spreadsheet
    // text is therefore only trimmed; all other text still goes through the reducer.
    boolean spreadsheet =
        converted.filename != null && FileUtility.isSpreadsheet(converted.filename);
    converted.setText(spreadsheet ? extracted.trim() : TextUtils.reduce_line_breaks(extracted));
    return converted;
  }