예제 #1
0
  /**
   * Gathers character-encoding clues for {@code content} from two sources:
   * the charset detector run over the raw bytes, and the Content-Type value
   * stored in the content's metadata.
   *
   * <p>The detector is consulted only when {@code minConfidence} is
   * non-negative, the content type is a member of {@code DETECTABLES}, and the
   * payload is longer than {@code MIN_LENGTH} bytes. The header clue is added
   * on every call, whether or not detection ran.
   *
   * @param content the content whose bytes and metadata are examined
   * @param filter forwarded to the detector's {@code enableInputFilter}
   */
  public void autoDetectClues(Content content, boolean filter) {
    final byte[] rawBytes = content.getContent();
    final boolean shouldDetect =
        minConfidence >= 0
            && DETECTABLES.contains(content.getContentType())
            && rawBytes.length > MIN_LENGTH;

    if (shouldDetect) {
      CharsetMatch[] candidates = null;

      // setText and detect/detectAll will sometimes throw, so the whole
      // detector interaction is guarded; on failure no "detect" clues are added.
      try {
        detector.enableInputFilter(filter);
        detector.setText(rawBytes);
        candidates = detector.detectAll();
      } catch (Exception e) {
        LOG.debug("Exception from ICU4J (ignoring): ");
        e.printStackTrace(LogUtil.getDebugStream(LOG));
      }

      if (candidates != null) {
        // Every candidate is recorded together with the detector's confidence.
        for (int i = 0; i < candidates.length; i++) {
          CharsetMatch candidate = candidates[i];
          addClue(candidate.getName(), "detect", candidate.getConfidence());
        }
      }
    }

    // Always offer the encoding parsed from the HTTP response header as a clue.
    addClue(parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)), "header");
  }
예제 #2
0
 /**
  * Initializes this reader from a job configuration.
  *
  * <p>Stores {@code job} as the active configuration, reads the six
  * {@code segment.reader.*} boolean switches (each defaults to {@code true}
  * when its key is absent), and then tries to obtain the file system for that
  * configuration.
  *
  * @param job the job configuration to read settings from
  */
 public void configure(JobConf job) {
   setConf(job);

   // Boolean switches; a missing key means "enabled".
   co = getConf().getBoolean("segment.reader.co", true);
   fe = getConf().getBoolean("segment.reader.fe", true);
   ge = getConf().getBoolean("segment.reader.ge", true);
   pa = getConf().getBoolean("segment.reader.pa", true);
   pd = getConf().getBoolean("segment.reader.pd", true);
   pt = getConf().getBoolean("segment.reader.pt", true);

   // Best effort: a failure is written to the warn log stream and fs is not
   // assigned by this call.
   try {
     fs = FileSystem.get(getConf());
   } catch (IOException ioe) {
     ioe.printStackTrace(LogUtil.getWarnStream(LOG));
   }
 }
예제 #3
0
 /**
  * Creates a reader with explicitly supplied switches instead of values read
  * from the configuration.
  *
  * <p>The six flags are copied verbatim into the fields of the same name.
  * NOTE(review): they presumably select which parts of a segment are
  * processed; confirm against the code that reads these fields.
  *
  * @param conf configuration handed to the superclass constructor
  * @param co value for the {@code co} field
  * @param fe value for the {@code fe} field
  * @param ge value for the {@code ge} field
  * @param pa value for the {@code pa} field
  * @param pd value for the {@code pd} field
  * @param pt value for the {@code pt} field
  */
 public SegmentReader(
     Configuration conf, boolean co, boolean fe, boolean ge, boolean pa, boolean pd, boolean pt) {
   super(conf);

   // Parameters shadow the fields, so the explicit "this." qualifier is required.
   this.co = co;
   this.fe = fe;
   this.ge = ge;

   this.pa = pa;
   this.pd = pd;
   this.pt = pt;

   // Best effort: a failure is written to the warn log stream and fs is not
   // assigned by this constructor.
   try {
     this.fs = FileSystem.get(getConf());
   } catch (IOException ioe) {
     ioe.printStackTrace(LogUtil.getWarnStream(LOG));
   }
 }