public void autoDetectClues(Content content, boolean filter) { byte[] data = content.getContent(); if (minConfidence >= 0 && DETECTABLES.contains(content.getContentType()) && data.length > MIN_LENGTH) { CharsetMatch[] matches = null; // do all these in a try/catch; setText and detect/detectAll // will sometimes throw exceptions try { detector.enableInputFilter(filter); if (data.length > MIN_LENGTH) { detector.setText(data); matches = detector.detectAll(); } } catch (Exception e) { LOG.debug("Exception from ICU4J (ignoring): "); e.printStackTrace(LogUtil.getDebugStream(LOG)); } if (matches != null) { for (CharsetMatch match : matches) { addClue(match.getName(), "detect", match.getConfidence()); } } } // add character encoding coming from HTTP response header addClue(parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)), "header"); }
/**
 * Initializes this instance from a job configuration.
 *
 * <p>The job is stored via {@code setConf}, the six {@code segment.reader.*} boolean
 * switches are read (each falls back to {@code true} when absent), and a
 * {@code FileSystem} is obtained for the stored configuration. If obtaining the file
 * system fails with an {@code IOException}, the stack trace is written to the warn log
 * stream and this method does not assign {@code fs}.
 *
 * @param job the job configuration to adopt
 */
public void configure(JobConf job) {
  setConf(job);

  // Every switch is enabled unless the configuration says otherwise.
  final boolean enabledByDefault = true;
  co = getConf().getBoolean("segment.reader.co", enabledByDefault);
  fe = getConf().getBoolean("segment.reader.fe", enabledByDefault);
  ge = getConf().getBoolean("segment.reader.ge", enabledByDefault);
  pa = getConf().getBoolean("segment.reader.pa", enabledByDefault);
  pd = getConf().getBoolean("segment.reader.pd", enabledByDefault);
  pt = getConf().getBoolean("segment.reader.pt", enabledByDefault);

  try {
    fs = FileSystem.get(getConf());
  } catch (IOException ioe) {
    // Best effort: report the failure on the warn stream and carry on.
    ioe.printStackTrace(LogUtil.getWarnStream(LOG));
  }
}
/**
 * Creates a reader with explicitly supplied switches instead of reading them from the
 * configuration.
 *
 * <p>The configuration is handed to the superclass constructor, a {@code FileSystem} is
 * obtained for it, and each boolean argument is stored in the field of the same name.
 * If obtaining the file system fails with an {@code IOException}, the stack trace is
 * written to the warn log stream and the constructor does not assign {@code fs}.
 *
 * @param conf configuration passed to the superclass
 * @param co   value stored in the {@code co} field
 * @param fe   value stored in the {@code fe} field
 * @param ge   value stored in the {@code ge} field
 * @param pa   value stored in the {@code pa} field
 * @param pd   value stored in the {@code pd} field
 * @param pt   value stored in the {@code pt} field
 */
public SegmentReader(
    Configuration conf, boolean co, boolean fe, boolean ge, boolean pa, boolean pd, boolean pt) {
  super(conf);

  try {
    fs = FileSystem.get(getConf());
  } catch (IOException ioe) {
    // Best effort: report the failure on the warn stream and carry on.
    ioe.printStackTrace(LogUtil.getWarnStream(LOG));
  }

  // The parameters shadow the fields, so the assignments must be qualified.
  this.pt = pt;
  this.pd = pd;
  this.pa = pa;
  this.ge = ge;
  this.fe = fe;
  this.co = co;
}