Example #1
0
 /**
  * Writes a single DBF row to the XHTML output as one {@code tr} element,
  * with one {@code td} child per cell in {@code row.cells}.
  *
  * @param row     the row whose cells are written, in iteration order
  * @param charset the charset passed to each cell's {@code getString} call
  * @param xhtml   the handler that receives the element and character events
  * @throws SAXException if the handler rejects any of the emitted events
  */
 private void writeRow(DBFRow row, Charset charset, XHTMLContentHandler xhtml)
     throws SAXException {
   final String rowTag = "tr";
   final String cellTag = "td";

   xhtml.startElement(rowTag);
   for (DBFCell current : row.cells) {
     xhtml.startElement(cellTag);
     // Decode only after the cell element has been opened, as before.
     String text = current.getString(charset);
     xhtml.characters(text);
     xhtml.endElement(cellTag);
   }
   xhtml.endElement(rowTag);
 }
Example #2
0
 /**
  * Chooses the charset used to decode character columns by running an
  * encoding detector over a sample of raw bytes taken from the first rows.
  *
  * <p>Only cells whose column type is {@code DBFColumnHeader.ColType.C} are
  * sampled. Sampling stops entirely once the collected size exceeds
  * {@code MAX_CHARS_FOR_CHARSET_DETECTION}. If the sample is 20 bytes or
  * fewer, or the detector yields no result, {@code DEFAULT_CHARSET} is
  * returned.
  *
  * @param firstRows rows to sample bytes from
  * @param header    file header; currently unused (see TODO below)
  * @return the detected charset, or {@code DEFAULT_CHARSET} as a fallback;
  *         never {@code null}
  * @throws IOException   if buffering the sample or running detection fails
  * @throws TikaException declared for callers; kept for interface stability
  */
 private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header)
     throws IOException, TikaException {
   // TODO: potentially use codepage info in the header
   ByteArrayOutputStream bos = new ByteArrayOutputStream();
   sampling:
   for (DBFRow row : firstRows) {
     for (DBFCell cell : row.cells) {
       if (cell.getColType().equals(DBFColumnHeader.ColType.C)) {
         bos.write(cell.getBytes());
         if (bos.size() > MAX_CHARS_FOR_CHARSET_DETECTION) {
           // Leave BOTH loops: a plain break would only end this row and the
           // outer loop would keep appending bytes past the sample budget.
           break sampling;
         }
       }
     }
   }

   byte[] bytes = bos.toByteArray();
   if (bytes.length <= 20) {
     // Sample too small to attempt detection; keep the default.
     return DEFAULT_CHARSET;
   }

   // Run detection exactly once. The earlier extra pass discarded its result
   // and left a TikaInputStream unclosed.
   EncodingDetector detector = new Icu4jEncodingDetector();
   Charset detected = detector.detect(new ByteArrayInputStream(bytes), new Metadata());

   // The detector may report no match; never hand a null charset to callers.
   return detected != null ? detected : DEFAULT_CHARSET;
 }