protected String blobsToText(List<Blob> blobs, String docId) { List<String> strings = new LinkedList<String>(); for (Blob blob : blobs) { try { SimpleBlobHolder bh = new SimpleBlobHolder(blob); BlobHolder result = convert(bh); if (result == null) { continue; } blob = result.getBlob(); if (blob == null) { continue; } String string = new String(blob.getByteArray(), "UTF-8"); // strip '\0 chars from text if (string.indexOf('\0') >= 0) { string = string.replace("\0", " "); } strings.add(string); } catch (Exception e) { String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId + ": " + e; log.warn(msg); log.debug(msg, e); continue; } } return StringUtils.join(strings, " "); }
protected static String guessEncoding(Blob blob) throws IOException { // encoding already known? if (blob.getEncoding() != null) { return null; } // bad mime type? String mimeType = blob.getMimeType(); if (mimeType == null) { return null; } if (!mimeType.startsWith("text/") && !mimeType.startsWith("application/xhtml")) { // not a text file, we shouldn't be in the Note importer return null; } byte[] bytes = blob.getByteArray(); List<String> charsets = Arrays.asList("utf-8", "iso-8859-1"); // charset specified in MIME type? String CSEQ = "charset="; int i = mimeType.indexOf(CSEQ); if (i > 0) { String onlyMimeType = mimeType.substring(0, i).replace(";", "").trim(); blob.setMimeType(onlyMimeType); String charset = mimeType.substring(i + CSEQ.length()); i = charset.indexOf(";"); if (i > 0) { charset = charset.substring(0, i); } charset = charset.trim().replace("\"", ""); charsets = new ArrayList<String>(charsets); charsets.add(0, charset); } // resort to auto-detection for (String charset : charsets) { try { Charset cs = Charset.forName(charset); CharsetDecoder d = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); CharBuffer cb = d.decode(ByteBuffer.wrap(bytes)); return cb.toString(); } catch (IllegalArgumentException e) { // illegal charset } catch (CharacterCodingException e) { // could not decode } } // nothing worked, use platform return null; }
/**
 * Checks if the {@code inputBlob} string contains a {@code charset} meta tag matching the blob's
 * encoding. If not, returns a new blob with such a tag prepended.
 * <p>
 * The blob is returned untouched when it has no encoding, or when its content already contains
 * {@code content="text/html; charset=<encoding>"}. The encoding is matched literally and without
 * regard to case.
 *
 * @param inputBlob the input blob
 * @return {@code inputBlob} itself when nothing has to be added, otherwise a new
 *         {@code text/html} blob with the same encoding and filename whose content starts with
 *         the charset meta tag
 * @throws IOException Signals that an I/O exception has occurred.
 */
protected Blob checkCharsetMeta(Blob inputBlob) throws IOException {
    String charset = inputBlob.getEncoding();
    if (StringUtils.isEmpty(charset)) {
        return inputBlob;
    }

    // Quote the encoding so that it is matched literally: spliced in raw, a regex metacharacter
    // in it would change the pattern or make compilation fail. Matching ignores case so that
    // e.g. "charset=utf-8" is recognized for an encoding of "UTF-8" instead of getting a
    // redundant second tag.
    Pattern charsetMetaPattern = Pattern.compile(
            "content=\"text/html;\\s*charset=" + Pattern.quote(charset) + "\"", Pattern.CASE_INSENSITIVE);
    Matcher charsetMetaMatcher = charsetMetaPattern.matcher(inputBlob.getString());
    if (charsetMetaMatcher.find()) {
        return inputBlob;
    }

    String charsetMetaTag = String.format("<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">",
            charset);
    StringBuilder sb = new StringBuilder(charsetMetaTag);
    sb.append(new String(inputBlob.getByteArray(), charset));
    return Blobs.createBlob(sb.toString(), "text/html", charset, inputBlob.getFilename());
}