/**
 * Converts each blob to text and joins the extracted texts with single spaces.
 *
 * <p>Every blob is wrapped in a {@code SimpleBlobHolder} and handed to {@code convert}. The bytes
 * of the converted blob are decoded as UTF-8 and NUL characters are replaced by spaces. Blobs for
 * which the conversion returns no holder, or a holder without a blob, are skipped. Any exception
 * raised while processing a blob is logged and that blob is skipped; the remaining blobs are still
 * processed.
 *
 * @param blobs the blobs to extract text from
 * @param docId identifier of the document, used only in log messages
 * @return the extracted texts joined with {@code " "}
 */
protected String blobsToText(List<Blob> blobs, String docId) {
  List<String> strings = new LinkedList<String>();
  for (Blob blob : blobs) {
    try {
      BlobHolder result = convert(new SimpleBlobHolder(blob));
      if (result == null) {
        continue;
      }
      // Keep the converted blob in its own variable: the catch block below must report the
      // filename of the source blob, not of the conversion result.
      Blob converted = result.getBlob();
      if (converted == null) {
        continue;
      }
      String string = new String(converted.getByteArray(), "UTF-8");
      // replace '\0' chars in the text by spaces
      string = string.replace('\0', ' ');
      strings.add(string);
    } catch (Exception e) {
      // Best effort: a failure on one blob must not abort the extraction of the others.
      // Guard against a null list element so the handler itself cannot throw.
      String filename = blob == null ? null : blob.getFilename();
      String msg =
          "Could not extract fulltext of file '"
              + filename
              + "' for document: "
              + docId
              + ": "
              + e;
      // short message at warn level, full stack trace only at debug level
      log.warn(msg);
      log.debug(msg, e);
    }
  }
  return StringUtils.join(strings, " ");
}
  protected static String guessEncoding(Blob blob) throws IOException {
    // encoding already known?
    if (blob.getEncoding() != null) {
      return null;
    }

    // bad mime type?
    String mimeType = blob.getMimeType();
    if (mimeType == null) {
      return null;
    }
    if (!mimeType.startsWith("text/") && !mimeType.startsWith("application/xhtml")) {
      // not a text file, we shouldn't be in the Note importer
      return null;
    }

    byte[] bytes = blob.getByteArray();

    List<String> charsets = Arrays.asList("utf-8", "iso-8859-1");

    // charset specified in MIME type?
    String CSEQ = "charset=";
    int i = mimeType.indexOf(CSEQ);
    if (i > 0) {
      String onlyMimeType = mimeType.substring(0, i).replace(";", "").trim();
      blob.setMimeType(onlyMimeType);
      String charset = mimeType.substring(i + CSEQ.length());
      i = charset.indexOf(";");
      if (i > 0) {
        charset = charset.substring(0, i);
      }
      charset = charset.trim().replace("\"", "");
      charsets = new ArrayList<String>(charsets);
      charsets.add(0, charset);
    }

    // resort to auto-detection
    for (String charset : charsets) {
      try {
        Charset cs = Charset.forName(charset);
        CharsetDecoder d =
            cs.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        CharBuffer cb = d.decode(ByteBuffer.wrap(bytes));
        return cb.toString();
      } catch (IllegalArgumentException e) {
        // illegal charset
      } catch (CharacterCodingException e) {
        // could not decode
      }
    }
    // nothing worked, use platform
    return null;
  }
  // Example #3
  // 0
  /**
   * Checks if the content of {@code inputBlob} contains a {@code charset} meta tag matching the
   * blob's encoding. If not, adds it.
   *
   * <p>When the blob has no encoding, or its text already contains a
   * {@code content="text/html; charset=<encoding>"} attribute, the blob is returned unchanged.
   * Otherwise a new {@code text/html} blob is created whose content is a {@code Content-Type}
   * meta tag followed by the original bytes decoded with the blob's encoding.
   *
   * @param inputBlob the input blob
   * @return {@code inputBlob} itself, or a new blob whose content starts with the charset meta tag
   * @throws IOException Signals that an I/O exception has occurred.
   */
  protected Blob checkCharsetMeta(Blob inputBlob) throws IOException {
    String charset = inputBlob.getEncoding();
    if (StringUtils.isEmpty(charset)) {
      return inputBlob;
    }

    // The charset is quoted so that regex metacharacters in its name (e.g. the '.' in
    // "ANSI_X3.4-1968") are matched literally and cannot make the pattern invalid. Matching is
    // case-insensitive because HTML attribute names and charset labels are case-insensitive;
    // otherwise "charset=utf-8" would not be recognized for the encoding "UTF-8" and a duplicate
    // meta tag would be added.
    Pattern charsetMetaPattern =
        Pattern.compile(
            "content=\"text/html;\\s*charset=" + Pattern.quote(charset) + "\"",
            Pattern.CASE_INSENSITIVE);
    if (charsetMetaPattern.matcher(inputBlob.getString()).find()) {
      return inputBlob;
    }

    String charsetMetaTag =
        String.format(
            "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">", charset);
    StringBuilder sb = new StringBuilder(charsetMetaTag);
    sb.append(new String(inputBlob.getByteArray(), charset));
    return Blobs.createBlob(sb.toString(), "text/html", charset, inputBlob.getFilename());
  }