/**
  * Creates the {@link QseqRecordReader} that will read the given split.
  *
  * <p>The split's string form is published as the task status before the reader is built.
  *
  * @param genericSplit the split to read; it is cast to {@link FileSplit}
  * @param context the task attempt context supplying the configuration
  * @return a new reader over the split
  * @throws IOException declared by the signature; not raised directly by the visible code
  * @throws InterruptedException declared by the signature; not raised directly by the visible code
  */
 public RecordReader<Text, SequencedFragment> createRecordReader(
     InputSplit genericSplit, TaskAttemptContext context)
     throws IOException, InterruptedException {
   final String splitDescription = genericSplit.toString();
   context.setStatus(splitDescription);
   // The unchecked cast to FileSplit follows the example in TextInputFormat.
   return new QseqRecordReader(ContextUtil.getConfiguration(context), (FileSplit) genericSplit);
 }
Esempio n. 2
0
 /**
  * Picks the record reader according to the concrete class of the split.
  *
  * <p>Splits whose class is exactly {@code TableSplit} are handed to the table input format;
  * everything else goes to the file input format.
  *
  * @param split the split to read
  * @param context the task attempt context, passed through unchanged
  * @return the reader produced by the selected input format
  */
 @Override
 public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
     throws IOException, InterruptedException {
   // Exact class match on purpose: subclasses of TableSplit are treated as file splits.
   final boolean isTableSplit = split.getClass().equals(TableSplit.class);
   if (!isTableSplit) {
     return fileInputFormat.createRecordReader(split, context);
   }
   return tableInputFormat.createRecordReader(split, context);
 }
 /**
  * Casts an InputSplit into a HCatSplit, providing a useful error message if the cast fails.
  *
  * <p>A {@code null} split is reported through the same {@link IOException} as any other
  * non-HCatSplit value instead of surfacing as a {@link NullPointerException}.
  *
  * @param split the InputSplit
  * @return the HCatSplit
  * @throws IOException if {@code split} is {@code null} or is not an instance of HCatSplit
  */
 public static HCatSplit castToHCatSplit(InputSplit split) throws IOException {
   if (split instanceof HCatSplit) {
     return (HCatSplit) split;
   }
   // instanceof is false for null, so getClass() must be guarded to keep the error descriptive.
   final String actualType = (split == null) ? "null" : split.getClass().getName();
   throw new IOException(
       "Split must be " + HCatSplit.class.getName() + " but found " + actualType);
 }
Esempio n. 4
0
  /**
   * Returns the hosts that hold the most data of the wrapped splits.
   *
   * <p>On the first call the length of every wrapped split is added up per host it reports, the
   * hosts are ordered by that total (largest first), and the top five are cached in
   * {@code locations}. Later calls return the cached array.
   *
   * @return up to five host names, ordered by descending total split length
   * @throws IOException if a wrapped split fails to report its locations or length
   * @throws InterruptedException if a wrapped split is interrupted while doing so
   */
  @Override
  @SuppressWarnings("unchecked")
  public String[] getLocations() throws IOException, InterruptedException {
    if (locations == null) {
      HashMap<String, Long> locMap = new HashMap<String, Long>();
      for (InputSplit split : wrappedSplits) {
        String[] locs = split.getLocations();
        // The length is the same for every host of this split, so read it once.
        long splitLength = split.getLength();
        for (String loc : locs) {
          Long lenInMap = locMap.get(loc);
          locMap.put(loc, lenInMap == null ? splitLength : lenInMap + splitLength);
        }
      }
      Set<Map.Entry<String, Long>> entrySet = locMap.entrySet();
      // Generic array creation is not allowed, hence the raw array and the unchecked warning.
      Map.Entry<String, Long>[] hostSize = entrySet.toArray(new Map.Entry[entrySet.size()]);
      Arrays.sort(
          hostSize,
          new Comparator<Map.Entry<String, Long>>() {

            @Override
            public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
              // Descending by size. compareTo avoids the overflow that a subtraction-based
              // comparison is exposed to.
              return o2.getValue().compareTo(o1.getValue());
            }
          });
      // maximum 5 locations are in list: refer to PIG-1648 for more details
      int nHost = Math.min(hostSize.length, 5);
      locations = new String[nHost];
      for (int i = 0; i < nHost; ++i) {
        locations[i] = hostSize[i].getKey();
      }
    }
    return locations;
  }
    /**
     * Opens the sequence file behind the split and positions the reader at the split start.
     *
     * <p>The reader is synced forward only when the split begins past the reader's initial
     * position. {@code start} is then taken from the reader's actual position, and the reader is
     * flagged as {@code done} right away when that position is already at or beyond the split
     * end. The split is also recorded on the shared {@code InputInfo} instance and echoed to
     * standard error.
     *
     * @param split the split to read; must be a {@link FileSplit}
     * @param context the task attempt context supplying the configuration
     * @throws IOException if the file system or sequence file cannot be opened or synced
     * @throws InterruptedException declared by the signature
     */
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      final FileSplit fileSplit = (FileSplit) split;
      final Path splitPath = fileSplit.getPath();
      final Configuration configuration = context.getConfiguration();
      final FileSystem fileSystem = splitPath.getFileSystem(configuration);
      this.in = new SequenceFile.Reader(fileSystem, splitPath, configuration);

      this.end = fileSplit.getStart() + split.getLength();
      final boolean startsPastReader = fileSplit.getStart() > in.getPosition();
      if (startsPastReader) {
        in.sync(fileSplit.getStart()); // sync to start
      }
      this.start = in.getPosition();
      vbytes = in.createValueBytes();
      done = start >= end;

      info = InputInfo.getInstance();
      info.setSplit(fileSplit);

      System.err.println("input split = " + split);
    }
    /**
     * Bridges a new-API split and context to the wrapped old-API ({@code mapred}) input format.
     *
     * <p>The split is converted to an old-API split, the task context is exposed through an
     * old-API {@code Reporter}, and the real record reader is opened together with reusable key
     * and value objects.
     *
     * @param split either a plain new-API {@code FileSplit} or an {@code InputSplitWrapper}
     * @param context the task attempt context; counter and status calls are forwarded to it only
     *     when it is a {@code TaskInputOutputContext}
     * @throws IOException if the wrapped input format fails to open its reader
     * @throws InterruptedException declared by the signature
     */
    @Override
    public void initialize(InputSplit split, final TaskAttemptContext context)
        throws IOException, InterruptedException {

      final org.apache.hadoop.mapred.InputSplit legacySplit;

      // Exact class match: only a plain FileSplit is rebuilt; anything else must be a wrapper.
      if (split.getClass() != FileSplit.class) {
        legacySplit = ((InputSplitWrapper) split).realSplit;
      } else {
        final FileSplit fileSplit = (FileSplit) split;
        legacySplit =
            new org.apache.hadoop.mapred.FileSplit(
                fileSplit.getPath(),
                fileSplit.getStart(),
                fileSplit.getLength(),
                split.getLocations());
      }

      @SuppressWarnings("unchecked")
      Reporter reporter = new Reporter() { // old-API Reporter view of the new-API context

            // Null when the context cannot report counters/status; calls then become no-ops.
            final TaskInputOutputContext taskCtx =
                context instanceof TaskInputOutputContext ? (TaskInputOutputContext) context : null;

            public void progress() {
              HadoopCompat.progress(context);
            }

            // @Override
            public float getProgress() {
              if (taskCtx == null) {
                return 0;
              }
              return taskCtx.getProgress();
            }

            public void setStatus(String status) {
              if (taskCtx == null) {
                return;
              }
              HadoopCompat.setStatus(taskCtx, status);
            }

            public void incrCounter(String group, String counter, long amount) {
              if (taskCtx == null) {
                return;
              }
              HadoopCompat.incrementCounter(taskCtx.getCounter(group, counter), amount);
            }

            @SuppressWarnings("unchecked")
            public void incrCounter(Enum<?> key, long amount) {
              if (taskCtx == null) {
                return;
              }
              HadoopCompat.incrementCounter(taskCtx.getCounter(key), amount);
            }

            public org.apache.hadoop.mapred.InputSplit getInputSplit()
                throws UnsupportedOperationException {
              throw new UnsupportedOperationException();
            }

            public Counter getCounter(String group, String name) {
              if (taskCtx == null) {
                return null;
              }
              return (Counter) HadoopCompat.getCounter(taskCtx, group, name);
            }

            @SuppressWarnings("unchecked")
            public Counter getCounter(Enum<?> name) {
              if (taskCtx == null) {
                return null;
              }
              return (Counter) taskCtx.getCounter(name);
            }
          };

      final JobConf jobConf = (JobConf) HadoopCompat.getConfiguration(context);
      realReader = realInputFormat.getRecordReader(legacySplit, jobConf, reporter);

      keyObj = realReader.createKey();
      valueObj = realReader.createValue();
    }
Esempio n. 7
0
 /**
  * Returns the file name of an input split. By default the string form of an input split is
  * <tt>file + ":" + start + "+" + length</tt>, so everything from the first ":" onwards is
  * dropped.
  *
  * <p>If the name contains no ":" it is returned unchanged rather than failing with a
  * {@link StringIndexOutOfBoundsException}.
  *
  * @param inputSplit Input Split
  * @return the file name without the <tt>":" + start + "+" + length</tt> suffix
  */
 public static String getFilename(InputSplit inputSplit) {
   String filename = org.openflamingo.mapreduce.util.FileUtils.getFilename(inputSplit.toString());
   int separator = filename.indexOf(":");
   if (separator < 0) {
     // No offset suffix present: substring(0, -1) would throw, so keep the name as is.
     return filename;
   }
   return filename.substring(0, separator);
 }
 /**
  * New-API initialization hook: publishes the split as the task status and delegates to
  * {@code init}.
  *
  * @param split the split to read; must be an {@code ESInputSplit}
  * @param context the task attempt context supplying the configuration
  * @throws IOException if {@code init} fails
  */
 @Override
 public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
   final String splitDescription = split.toString();
   context.setStatus(splitDescription);

   final ESInputSplit esSplit = (ESInputSplit) split;
   init(esSplit, context.getConfiguration());
 }
Esempio n. 9
0
 /**
  * Returns the <code>RecordReader</code> for reading the arc file.
  *
  * <p>The split's string form is set as the task status first; the returned reader is created
  * without arguments.
  *
  * @param split the split about to be read
  * @param context the task attempt context whose status is updated
  * @return a new {@code ArcRecordReader}
  */
 public RecordReader<Text, ArcRecord> createRecordReader(
     InputSplit split, TaskAttemptContext context) throws IOException {
   final String splitDescription = split.toString();
   context.setStatus(splitDescription);
   final ArcRecordReader reader = new ArcRecordReader();
   return reader;
 }
Esempio n. 10
0
 /**
  * Reports the locations of the wrapped split.
  *
  * @return whatever the delegate returns from its own {@code getLocations()}
  */
 @Override
 public String[] getLocations() throws IOException, InterruptedException {
   final String[] delegateLocations = delegate.getLocations();
   return delegateLocations;
 }
Esempio n. 11
0
 /**
  * Reports the length of the wrapped split.
  *
  * @return whatever the delegate returns from its own {@code getLength()}
  */
 @Override
 public long getLength() throws IOException, InterruptedException {
   final long delegateLength = delegate.getLength();
   return delegateLength;
 }
    /**
     * Converts the single segment carried by the task's input split and pushes the result.
     *
     * <p>Steps, each reflected in the task status: download and unzip the segment into
     * {@code <tmp>/in}, convert it into {@code <tmp>/out}, optionally validate the two copies
     * against each other, then serialize the converted segment (version suffixed with
     * {@code "_converted"}) to the configured output path. The resulting segment descriptor is
     * stored as JSON in the job configuration and also written as the mapper output.
     *
     * @param key unused
     * @param value unused
     * @param context the mapper context; its split must be a {@code DatasourceInputSplit}
     *     holding exactly one segment
     * @throws IOException if the output directory cannot be created or any I/O step fails
     * @throws InterruptedException declared by the signature
     */
    @Override
    protected void map(String key, String value, final Context context)
        throws IOException, InterruptedException {
      final InputSplit split = context.getInputSplit();
      if (!(split instanceof DatasourceInputSplit)) {
        throw new IAE(
            "Unexpected split type. Expected [%s] was [%s]",
            DatasourceInputSplit.class.getCanonicalName(), split.getClass().getCanonicalName());
      }

      final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
      final File tmpDir = Paths.get(tmpDirLoc).toFile();

      // getOnlyElement enforces the one-segment-per-split expectation.
      final DataSegment segment =
          Iterables.getOnlyElement(((DatasourceInputSplit) split).getSegments()).getSegment();

      final HadoopDruidConverterConfig config =
          converterConfigFromConfiguration(context.getConfiguration());

      context.setStatus("DOWNLOADING");
      context.progress();
      final Path inPath = new Path(JobHelper.getURIFromSegment(segment));
      final File inDir = new File(tmpDir, "in");

      if (inDir.exists() && !inDir.delete()) {
        log.warn("Could not delete [%s]", inDir);
      }

      // Warn only when no usable directory is left behind: mkdir failed and the path is either
      // missing or occupied by something that is not a directory (same test as for outDir).
      if (!inDir.mkdir() && (!inDir.exists() || !inDir.isDirectory())) {
        log.warn("Unable to make directory");
      }

      final long inSize =
          JobHelper.unzipNoGuava(inPath, context.getConfiguration(), inDir, context);
      log.debug("Loaded %d bytes into [%s] for converting", inSize, inDir.getAbsolutePath());
      context.getCounter(COUNTER_GROUP, COUNTER_LOADED).increment(inSize);

      context.setStatus("CONVERTING");
      context.progress();
      final File outDir = new File(tmpDir, "out");
      if (!outDir.mkdir() && (!outDir.exists() || !outDir.isDirectory())) {
        throw new IOException(String.format("Could not create output directory [%s]", outDir));
      }
      HadoopDruidConverterConfig.INDEX_MERGER.convert(
          inDir, outDir, config.getIndexSpec(), JobHelper.progressIndicatorForContext(context));
      if (config.isValidate()) {
        context.setStatus("Validating");
        HadoopDruidConverterConfig.INDEX_IO.validateTwoSegments(inDir, outDir);
      }
      context.progress();
      context.setStatus("Starting PUSH");
      final Path baseOutputPath = new Path(config.getSegmentOutputPath());
      final FileSystem outputFS = baseOutputPath.getFileSystem(context.getConfiguration());
      // The converted segment is published under a distinct version.
      final DataSegment finalSegmentTemplate =
          segment.withVersion(segment.getVersion() + "_converted");
      final DataSegment finalSegment =
          JobHelper.serializeOutIndex(
              finalSegmentTemplate,
              context.getConfiguration(),
              context,
              context.getTaskAttemptID(),
              outDir,
              JobHelper.makeSegmentOutputPath(baseOutputPath, outputFS, finalSegmentTemplate));
      context.progress();
      context.setStatus("Finished PUSH");
      final String finalSegmentString =
          HadoopDruidConverterConfig.jsonMapper.writeValueAsString(finalSegment);
      context
          .getConfiguration()
          .set(ConvertingOutputFormat.PUBLISHED_SEGMENT_KEY, finalSegmentString);
      context.write(new Text("dataSegment"), new Text(finalSegmentString));

      context.getCounter(COUNTER_GROUP, COUNTER_WRITTEN).increment(finalSegment.getSize());
      context.progress();
      context.setStatus("Ready To Commit");
    }
  /**
   * Prepares the CSV parser for one split of a delimited text file.
   *
   * <p>Reads the file path and header fields from the {@code DelimitedSplit}, opens the file,
   * seeks to the split start, resolves which column (or generated id) supplies the URI, builds
   * the parser, and consumes the first line so that {@code start} and {@code pos} point at the
   * first full record of this split.
   *
   * @param inSplit the split to parse; cast to {@code DelimitedSplit} and to {@code FileSplit}
   * @throws IOException if {@code uriName} is set but matches no header field, or on I/O failure
   * @throws InterruptedException declared by the signature
   */
  @Override
  protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    file = ((DelimitedSplit) inSplit).getPath();
    configFileNameAsCollection(conf, file);

    // get header from the DelimitedSplit
    TextArrayWritable taw = ((DelimitedSplit) inSplit).getHeader();
    fields = taw.toStrings();

    fileIn = fs.open(file);
    lineSeparator = retrieveLineSeparator(fileIn);
    if (start != 0) {
      // in case the cut point is \n, back off 1 char to create a partial
      // line so that 1st line can be skipped
      start--;
    }

    fileIn.seek(start);

    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    // No explicit URI column configured: either generate ids or fall back to column index 0.
    if (uriName == null) {
      generateId = conf.getBoolean(CONF_DELIMITED_GENERATE_URI, false);
      if (generateId) {
        // Generator is seeded with the file path and the split start offset.
        idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
      } else {
        uriId = 0;
      }
    }

    // NOTE(review): when uriName is set, the header search below is skipped if uriId is already
    // 0 on entry; the initial value of uriId is not visible here, confirm this is intended.
    boolean found = generateId || uriId == 0;

    // Look up the configured URI column name among the header fields.
    for (int i = 0; i < fields.length && !found; i++) {
      if (fields[i].equals(uriName)) {
        uriId = i;
        found = true;
        break;
      }
    }
    if (found == false) {
      // idname doesn't match any columns
      if (LOG.isDebugEnabled()) {
        LOG.debug("Header: " + convertToLine(fields));
      }
      throw new IOException(
          "Delimited_uri_id " + uriName + " is not found in " + this.file.toUri().getPath());
    }

    // keep leading and trailing whitespaces to ensure accuracy of pos
    // do not skip empty line just in case the split boundary is \n
    parser =
        new CSVParser(
            instream,
            new CSVStrategy(
                delimiter,
                encapsulator,
                CSVStrategy.COMMENTS_DISABLED,
                CSVStrategy.ESCAPE_DISABLED,
                false,
                false,
                false,
                false));

    // skip first line:
    // 1st split, skip header; other splits, skip partial line
    String[] values = parser.getLine();
    // Advance start past the skipped line so pos begins at the first record of this split.
    start += getBytesCountFromLine(values);
    pos = start;
  }