@Override public void characters(char[] ch, int start, int length) throws SAXException { if (inCell) { StringBuffer t = new StringBuffer(new String(ch, start, length)); // Quote if not all numbers if (all_nums.matcher(t).matches()) { super.characters(ch, start, length); } else { for (int i = t.length() - 1; i >= 0; i--) { if (t.charAt(i) == '\"') { // Double up double quotes t.insert(i, '\"'); i--; } } t.insert(0, '\"'); t.append('\"'); char[] c = t.toString().toCharArray(); super.characters(c, 0, c.length); } } else { super.characters(ch, start, length); } }
/**
 * Handles the start of an element. A {@code td} element is not forwarded: it marks the
 * handler as being inside a cell and, when a separator is pending, emits the first character
 * of {@code comma}. Every other element is forwarded to the superclass unchanged.
 *
 * @param uri namespace URI of the element
 * @param localName local name of the element
 * @param name qualified name of the element
 * @param atts attributes attached to the element
 * @throws SAXException if the downstream handler fails
 */
@Override
public void startElement(String uri, String localName, String name, Attributes atts)
        throws SAXException {
    boolean isCell = localName.equals("td");
    if (!isCell) {
        super.startElement(uri, localName, name, atts);
        return;
    }

    inCell = true;
    if (needsComma) {
        super.characters(comma, 0, 1);
        // NOTE(review): the flag is already true on this path, so this assignment has no
        // effect; kept to leave behavior untouched.
        needsComma = true;
    }
}
@Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { if (length == 1 && ch[0] == '\t') { // Ignore tabs, as they mess up the CSV output } else { super.ignorableWhitespace(ch, start, length); } }
/**
 * Handles the end of an element. Closing a {@code td} is not forwarded: it leaves cell mode
 * and records that a separator is needed before the next cell. Closing a {@code tr} clears
 * the pending separator. Every element other than {@code td} is forwarded to the superclass.
 *
 * @param uri namespace URI of the element
 * @param localName local name of the element
 * @param name qualified name of the element
 * @throws SAXException if the downstream handler fails
 */
@Override
public void endElement(String uri, String localName, String name) throws SAXException {
    if (localName.equals("td")) {
        inCell = false;
        needsComma = true;
        return;
    }

    if (localName.equals("tr")) {
        // A new row must not start with a separator.
        needsComma = false;
    }
    super.endElement(uri, localName, name);
}
/** * Common implementation -- take an input stream and return a ConvertedDoc; * * @param input stream for raw file * @param doc raw file * @return converted doc * @throws IOException if underlying Tika parser/writer had an IO problem, an parser problem, or * MAX_TEXT_SIZE is reached. */ @Override protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException { Metadata metadata = new Metadata(); BodyContentHandler handler = new BodyContentHandler(maxBuffer); try { parser.parse(input, handler, metadata, ctx); } catch (NoClassDefFoundError classErr) { throw new IOException("Unable to parse content due to Tika misconfiguration", classErr); } catch (Exception xerr) { throw new IOException("Unable to parse content", xerr); } finally { input.close(); } ConvertedDocument textdoc = new ConvertedDocument(doc); textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE)); textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING)); textdoc.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED)); textdoc.addAuthor(metadata.get(TikaCoreProperties.CREATOR)); // v1.5: until this version this blank line reducer was in place. // Using Java6 it appeared to cause StackOverflow when it encountered a document hundreds of // \n in a row. // Eg.., a Spreadsheet doc converted to text may have thousands of empty lines following the // last data row. // TextUtils.reduce_line_breaks(txt) String t = handler.toString(); if (t != null) { if (textdoc.filename != null && FileUtility.isSpreadsheet(textdoc.filename)) { textdoc.setText(t.trim()); } else { textdoc.setText(TextUtils.reduce_line_breaks(t)); } } return textdoc; }