public void autoDetectClues(Content content, boolean filter) { byte[] data = content.getContent(); if (minConfidence >= 0 && DETECTABLES.contains(content.getContentType()) && data.length > MIN_LENGTH) { CharsetMatch[] matches = null; // do all these in a try/catch; setText and detect/detectAll // will sometimes throw exceptions try { detector.enableInputFilter(filter); if (data.length > MIN_LENGTH) { detector.setText(data); matches = detector.detectAll(); } } catch (Exception e) { LOG.debug("Exception from ICU4J (ignoring): "); e.printStackTrace(LogUtil.getDebugStream(LOG)); } if (matches != null) { for (CharsetMatch match : matches) { addClue(match.getName(), "detect", match.getConfidence()); } } } // add character encoding coming from HTTP response header addClue(parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)), "header"); }
private String getContentCharset(byte[] content, Metadata metadata) { String charset = null; // check if the server specified a charset String specifiedContentType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); try { if (specifiedContentType != null) { ContentType parsedContentType = ContentType.parse(specifiedContentType); charset = parsedContentType.getCharset().name(); } } catch (Exception e) { charset = null; } // filter HTML tags CharsetDetector detector = new CharsetDetector(); detector.enableInputFilter(true); // give it a hint detector.setDeclaredEncoding(charset); detector.setText(content); try { CharsetMatch charsetMatch = detector.detect(); if (charsetMatch != null) { charset = charsetMatch.getName(); } } catch (Exception e) { // ignore and leave the charset as-is } return charset; }