/**
 * Writes a single DBF row to the handler as one table row.
 * <p>
 * A {@code tr} element is opened, every cell in {@code row.cells} is emitted
 * as a {@code td} element whose text is the cell's string value decoded with
 * {@code charset}, and the {@code tr} element is then closed.
 *
 * @param row     the row whose cells are written, in iteration order
 * @param charset the charset passed to each cell when obtaining its string value
 * @param xhtml   the handler receiving the elements and character data
 * @throws SAXException if the handler rejects any of the emitted events
 */
private void writeRow(DBFRow row, Charset charset, XHTMLContentHandler xhtml) throws SAXException {
    final String rowTag = "tr";
    final String cellTag = "td";

    xhtml.startElement(rowTag);
    for (DBFCell field : row.cells) {
        xhtml.startElement(cellTag);
        // Decode only after the cell element has been opened, one cell at a time.
        String text = field.getString(charset);
        xhtml.characters(text);
        xhtml.endElement(cellTag);
    }
    xhtml.endElement(rowTag);
}
private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header) throws IOException, TikaException { // TODO: potentially use codepage info in the header Charset charset = DEFAULT_CHARSET; ByteArrayOutputStream bos = new ByteArrayOutputStream(); for (DBFRow row : firstRows) { for (DBFCell cell : row.cells) { if (cell.getColType().equals(DBFColumnHeader.ColType.C)) { byte[] bytes = cell.getBytes(); bos.write(bytes); if (bos.size() > MAX_CHARS_FOR_CHARSET_DETECTION) { break; } } } } byte[] bytes = bos.toByteArray(); if (bytes.length > 20) { EncodingDetector detector = new Icu4jEncodingDetector(); detector.detect(TikaInputStream.get(bytes), new Metadata()); charset = detector.detect(new ByteArrayInputStream(bytes), new Metadata()); } return charset; }