Code Example #1
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {

    // Obtain the path to the input list of images and open an input stream
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fileSystem.open(path);

    // Note that the start and length fields of the FileSplit are used to
    // convey a range of lines in the input list of image URLs
    startLine = fileSplit.getStart();
    numLines = fileSplit.getLength();
    linesRead = 0; // total lines read by this particular record reader instance
    linesPerRecord = 100; // can be modified to change key/value pair size (may improve efficiency)

    // If it exists, get the relevant compression codec for the FileSplit
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration());
    CompressionCodec codec = codecFactory.getCodec(path);

    // If a codec was found, use it to create a decompressed input stream.
    // Otherwise, assume the input stream is not compressed
    if (codec != null) {
      reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fileIn)));
    } else {
      reader = new BufferedReader(new InputStreamReader(fileIn));
    }
  }
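
The comments above say that this split's start/length carry a range of lines in the list of image URLs, and that linesPerRecord batches several URLs into one key/value pair. A minimal sketch of a matching nextKeyValue(), assuming hypothetical currentKey/currentValue fields of type LongWritable and Text (those names and types are assumptions, not part of the original):

  @Override
  public boolean nextKeyValue() throws IOException {
    // Stop once this reader has consumed its assigned range of lines
    if (linesRead >= numLines) {
      return false;
    }
    StringBuilder batch = new StringBuilder();
    int linesInBatch = 0;
    String line;
    while (linesInBatch < linesPerRecord
        && linesRead < numLines
        && (line = reader.readLine()) != null) {
      batch.append(line).append('\n');
      linesInBatch++;
      linesRead++;
    }
    if (linesInBatch == 0) {
      return false;
    }
    // Key: line number of the first URL in the batch; value: the batch itself
    currentKey = new LongWritable(startLine + linesRead - linesInBatch);
    currentValue = new Text(batch.toString());
    return true;
  }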
Code Example #2
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    ObjectMapper jsonMapper = DruidInitialization.getInstance().getObjectMapper();
    SegmentLoadSpec spec = readSegmentJobSpec(context.getConfiguration(), jsonMapper);

    final List<String> dimensions = spec.getDimensions();
    final List<String> metrics = spec.getMetrics();
    final DimFilter filter = spec.getFilter();
    final Interval interval =
        new Interval(context.getConfiguration().get(DruidInputFormat.CONF_DRUID_INTERVAL));

    String hdfsPath = ((DruidInputSplit) split).getPath();
    logger.info("Reading segment from " + hdfsPath);

    segmentDir = Files.createTempDir();
    logger.info("segment dir: " + segmentDir);

    FileSystem fs = FileSystem.get(context.getConfiguration());
    getSegmentFiles(hdfsPath, segmentDir, fs);
    logger.info("finished getting segment files");

    QueryableIndex index = IndexIO.loadIndex(segmentDir);
    StorageAdapter adapter = new QueryableIndexStorageAdapter(index);
    List<StorageAdapter> adapters = Lists.newArrayList(adapter);
    rowYielder =
        new IngestSegmentFirehose(
            adapters, dimensions, metrics, filter, interval, QueryGranularity.NONE);
  }
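
IngestSegmentFirehose implements Druid's Firehose interface, so rows can be pulled out with hasMore()/nextRow(). A minimal sketch of how this reader could surface them, assuming hypothetical currentTimestamp/currentRow fields (the original's key/value types are not shown):

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!rowYielder.hasMore()) {
      return false;
    }
    // Firehose#nextRow() returns the next InputRow parsed from the segment
    InputRow row = rowYielder.nextRow();
    currentTimestamp = new LongWritable(row.getTimestampFromEpoch());
    currentRow = row;
    return true;
  }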
Code Example #3
  @Override
  public RecordWriter<IEtlKey, CamusWrapper> getDataRecordWriter(
      TaskAttemptContext context, String fileName, CamusWrapper data, FileOutputCommitter committer)
      throws IOException, InterruptedException {

    // If recordDelimiter hasn't been initialized, do so now
    if (recordDelimiter == null) {
      recordDelimiter =
          context.getConfiguration().get(ETL_OUTPUT_RECORD_DELIMITER, DEFAULT_RECORD_DELIMITER);
    }

    // Get the filename for this RecordWriter.
    Path path =
        new Path(
            committer.getWorkPath(),
            EtlMultiOutputFormat.getUniqueFile(context, fileName, getFilenameExtension()));

    FileSystem fs = path.getFileSystem(context.getConfiguration());
    if (isCompressed) {
      return new ByteRecordWriter(
          new DataOutputStream(codec.createOutputStream(fs.create(path, false))), recordDelimiter);
    } else {
      return new ByteRecordWriter(fs.create(path, false), recordDelimiter);
    }
  }
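
ByteRecordWriter is not shown above; a minimal sketch of what its write() presumably does, i.e. append the record payload followed by the configured delimiter, under the assumption that the Camus payload is a String (the cast and the null handling here are assumptions):

  public static class ByteRecordWriter extends RecordWriter<IEtlKey, CamusWrapper> {
    private final DataOutputStream out;
    private final byte[] delimiterBytes;

    public ByteRecordWriter(DataOutputStream out, String recordDelimiter) {
      this.out = out;
      this.delimiterBytes = recordDelimiter.getBytes();
    }

    @Override
    public void write(IEtlKey key, CamusWrapper value) throws IOException {
      if (value == null) {
        return; // nothing to emit for this key
      }
      // Payload first, then the record delimiter
      out.write(((String) value.getRecord()).getBytes());
      out.write(delimiterBytes);
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }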
Code Example #4
  /**
   * Instantiate a RecordWriter as required. This creates a RecordWriter from the internal
   * AccumuloOutputFormat.
   */
  @Override
  public RecordWriter getRecordWriter(TaskAttemptContext context)
      throws IOException, InterruptedException {

    if (zoomLevel == -1) {
      zoomLevel =
          Integer.parseInt(
              context.getConfiguration().get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOMLEVEL));
    }

    if (_innerFormat == null) {
      initialize(context);
    }

    if (_innerRecordWriter == null) {
      _innerRecordWriter = _innerFormat.getRecordWriter(context);
    }
    String pl = context.getConfiguration().get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ);
    if (colViz == null) {
      colViz = new ColumnVisibility(pl);
    }
    AccumuloMrGeoRecordWriter outRW =
        new AccumuloMrGeoRecordWriter(
            zoomLevel, table, _innerRecordWriter, new String(colViz.getExpression()));

    return outRW;
  } // end getRecordWriter
Code Example #5
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      // Initialization method
      FileSplit inputsplit = (FileSplit) split;
      Configuration conf = context.getConfiguration();
      LENGTH = conf.getLong("FileSplitLength", 0);
      SPILL = LENGTH / conf.getInt("SplitPerMap", 1);
      assert (LENGTH >= SPILL);
      String filename = inputsplit.getPath().toString();
      start = inputsplit.getStart(); // start position of this split
      shotNum += start * 8 / Float.SIZE;
      long offset = LENGTH >= inputsplit.getLength() ? inputsplit.getLength() : LENGTH;
      end = start + offset; // end position of this split
      final Path file = inputsplit.getPath();
      // Open the file
      FileSystem fs = file.getFileSystem(context.getConfiguration());
      fileIn = fs.open(file);

      // Key point 2: move the file pointer to the start of this split, because
      // a newly opened stream points at the beginning of the file
      fileIn.seek(start);

      // Key fix point 1 (kept from the original, commented out): if this is not
      // the first split, the bytes at the boundary may already have been read by
      // the previous split, so `start` would need to be advanced to avoid
      // re-reading data that was already consumed:
      // if (start != 0) {
      //   start += (end - pos + 1);
      // }
      pos = start;
    }
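
A minimal sketch of a nextKeyValue() that matches the fields initialized above, reading one float per record between pos and end; the currentKey/currentValue fields and the one-float-per-record granularity are assumptions, not part of the original:

    @Override
    public boolean nextKeyValue() throws IOException {
      // Stop at the end of this split's byte range
      if (pos >= end) {
        return false;
      }
      // Key: global float (shot sample) index; value: the sample itself
      currentKey = new LongWritable(shotNum + (pos - start) * 8 / Float.SIZE);
      currentValue = new FloatWritable(fileIn.readFloat());
      pos += Float.SIZE / 8;
      return true;
    }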
Code Example #6
    @Override
    @SuppressWarnings("unchecked")
    public void initialize(InputSplit split, TaskAttemptContext ctx)
        throws IOException, InterruptedException {
      // set up columns that needs to read from the RCFile.

      tDesc = TStructDescriptor.getInstance(typeRef.getRawClass());
      thriftWritable = ThriftWritable.newInstance((Class<TBase<?, ?>>) typeRef.getRawClass());
      final List<Field> tFields = tDesc.getFields();

      FileSplit fsplit = (FileSplit) split;
      Path file = fsplit.getPath();

      LOG.info(
          String.format(
              "reading %s from %s:%d:%d",
              typeRef.getRawClass().getName(),
              file.toString(),
              fsplit.getStart(),
              fsplit.getStart() + fsplit.getLength()));

      ColumnarMetadata storedInfo = RCFileUtil.readMetadata(ctx.getConfiguration(), file);

      // list of field numbers
      List<Integer> tFieldIds =
          Lists.transform(
              tFields,
              new Function<Field, Integer>() {
                public Integer apply(Field fd) {
                  return Integer.valueOf(fd.getFieldId());
                }
              });

      columnsBeingRead =
          RCFileUtil.findColumnsToRead(ctx.getConfiguration(), tFieldIds, storedInfo);

      for (int idx : columnsBeingRead) {
        int fid = storedInfo.getFieldId(idx);
        if (fid >= 0) {
          knownRequiredFields.add(tFields.get(tFieldIds.indexOf(fid)));
        } else {
          readUnknownsColumn = true;
        }
      }

      ColumnProjectionUtils.setReadColumnIDs(ctx.getConfiguration(), columnsBeingRead);

      // finally!
      super.initialize(split, ctx);
    }
Code Example #7
  @Override
  public RecordReader<Text, PairOfByteBuffers> createRecordReader(
      final InputSplit split, final TaskAttemptContext context) {
    String delimiter = context.getConfiguration().get("textinputformat.record.delimiter");
    byte[] recordDelimiterBytes = null;
    if (null != delimiter) {
      recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
    }
    LOG.info("recordDelimiterBytes = " + recordDelimiterBytes);

    int recordSize = context.getConfiguration().getInt(CONF_KEY_RECORD_SIZE, DEFAULT_RECORD_SIZE);
    LOG.info("recordSize = " + recordSize);
    return new ARFFManyLineRecordReader(recordDelimiterBytes, recordSize);
  }
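
A hypothetical driver-side setup for the reader above; only the two configuration keys come from the original, the job name, record count, and InputFormat class name are assumptions:

  // Illustrative driver-side configuration (job name and InputFormat class name are assumptions)
  public static Job configureJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "arff-ingest");
    job.getConfiguration().set("textinputformat.record.delimiter", "\n");
    // CONF_KEY_RECORD_SIZE comes from the format above; 50 lines per record is illustrative
    job.getConfiguration().setInt(CONF_KEY_RECORD_SIZE, 50);
    job.setInputFormatClass(ARFFManyLineInputFormat.class);
    return job;
  }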
Code Example #8
  @Override
  public void initialize(InputSplit input, TaskAttemptContext tac)
      throws IOException, InterruptedException {
    super.initialize(input, tac);
    skipNonArticles = tac.getConfiguration().getBoolean(SKIP_NON_ARTICLES, false);
    skipRedirect = tac.getConfiguration().getBoolean(SKIP_REDIRECT, false);

    LOG.info(
        "Splitting option: [skip non-article: "
            + skipNonArticles
            + ", skip redirect: "
            + skipRedirect
            + "]");
  }
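
A hypothetical driver-side snippet that sets the two splitting options read above; the SKIP_NON_ARTICLES and SKIP_REDIRECT keys come from the original, everything else is illustrative:

  // Illustrative driver-side configuration for the Wikipedia reader
  public static Job configureJob(Configuration conf) throws IOException {
    conf.setBoolean(SKIP_NON_ARTICLES, true); // drop talk/meta pages
    conf.setBoolean(SKIP_REDIRECT, true);     // drop redirect stubs
    return Job.getInstance(conf, "wikipedia-ingest");
  }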
Code Example #9
File: CrunchOutputs.java Project: apache/crunch
 public static OutputCommitter getOutputCommitter(TaskAttemptContext tac)
     throws IOException, InterruptedException {
   Map<String, OutputConfig> outputs = getNamedOutputs(tac.getConfiguration());
   Map<String, OutputCommitter> committers = Maps.newHashMap();
   for (Map.Entry<String, OutputConfig> e : outputs.entrySet()) {
     String namedOutput = e.getKey();
     Job job = getJob(tac.getJobID(), e.getKey(), tac.getConfiguration());
     OutputFormat fmt = getOutputFormat(namedOutput, job, e.getValue());
     TaskAttemptContext taskContext = getTaskContext(tac, job);
     OutputCommitter oc = fmt.getOutputCommitter(taskContext);
     committers.put(namedOutput, oc);
   }
   return new CompositeOutputCommitter(outputs, committers);
 }
Code Example #10
File: TFileRecordReader.java Project: ktzoumas/tez
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    LOG.info("Initializing TFileRecordReader : " + fileSplit.getPath().toString());
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
    splitPath = fileSplit.getPath();
    fin = fs.open(splitPath);
    reader =
        new TFile.Reader(fin, fs.getFileStatus(splitPath).getLen(), context.getConfiguration());
    scanner = reader.createScannerByByteRange(start, fileSplit.getLength());
  }
Code Example #11
File: HCatMapRedUtil.java Project: uclaabs/absHive
 public static TaskAttemptContext createTaskAttemptContext(
     org.apache.hadoop.mapreduce.TaskAttemptContext context) {
   return createTaskAttemptContext(
       new JobConf(context.getConfiguration()),
       org.apache.hadoop.mapred.TaskAttemptID.forName(context.getTaskAttemptID().toString()),
       Reporter.NULL);
 }
Code Example #12
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
      FileSplit split = (FileSplit) genericSplit;
      Configuration job = context.getConfiguration();
      m_Sb.setLength(0);
      m_Start = split.getStart();
      m_End = m_Start + split.getLength();
      final Path file = split.getPath();
      compressionCodecs = new CompressionCodecFactory(job);
      final CompressionCodec codec = compressionCodecs.getCodec(file);

      // open the file and seek to the m_Start of the split
      FileSystem fs = file.getFileSystem(job);
      long length = fs.getFileStatus(file).getLen();
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (m_Start > 0) fileIn.seek(m_Start);
      if (codec != null) {
        CompressionInputStream inputStream = codec.createInputStream(fileIn);
        m_Input = new BufferedReader(new InputStreamReader(inputStream));
        m_End = length;
      } else {
        m_Input = new BufferedReader(new InputStreamReader(fileIn));
      }
      m_Current = m_Start;
      m_Key = split.getPath().getName();
    }
Code Example #13
  @SuppressWarnings("unchecked")
  @Override
  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
      throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    String keyValueSeparator = conf.get(MAPRED_SEPARATOR, "\t");
    String extension = "";

    // Create the output streams
    Path[] files = getDefaultWorkFiles(context, extension);
    DataOutputStream[] outStreams = null;

    if (files != null && files.length != 0) {
      outStreams = new DataOutputStream[files.length];
      for (int i = 0; i < files.length; ++i) {
        outStreams[i] = files[i].getFileSystem(conf).create(files[i], false);
      }
    }

    // Create the record writer selector
    Class<? extends MultiTextRecordWriterSelector> selectorClass =
        getRecordWriterSelectorClass(context);
    MultiTextRecordWriterSelector<K, V> selector =
        (MultiTextRecordWriterSelector<K, V>) ReflectionUtils.newInstance(selectorClass, conf);

    return new MultiTextRecordWriter<K, V>(outStreams, keyValueSeparator, selector);
  }
Code Example #14
  @Override
  public RecordWriter<IntWritable, Double2DArrayWritable> getRecordWriter(
      TaskAttemptContext context) throws IOException, InterruptedException {

    // setup variables for image generation
    FileSystem fs = FileSystem.get(context.getConfiguration());
    Path picTempPath = FileOutputFormat.getOutputPath(context);
    fs.mkdirs(picTempPath);
    int k = context.getConfiguration().getInt("k", -1);

    Path imgPath = picTempPath.suffix("/points.png");

    if (k == -1) throw new RuntimeException("k is -1");

    return new PicRecordWriter(imgPath, k, context.getConfiguration());
  }
Code Example #15
 @Override
 public void open(TaskAttemptContext job) throws IOException {
   Configuration conf = job.getConfiguration();
   solr = SolrUtils.getCommonsHttpSolrServer(conf);
   commitSize = conf.getInt(SolrConstants.COMMIT_SIZE, 1000);
   solrMapping = SolrMappingReader.getInstance(conf);
 }
Code Example #16
File: CrunchOutputs.java Project: apache/crunch
    private TaskAttemptContext getContext(String namedOutput, TaskAttemptContext baseContext)
        throws IOException {
      Job job = getJob(baseContext.getJobID(), namedOutput, baseContext.getConfiguration());
      configureJob(namedOutput, job, outputs.get(namedOutput));

      return getTaskContext(baseContext, job);
    }
Code Example #17
 @Override
 public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job)
     throws IOException, InterruptedException {
   return new RedisHashRecordWriter(
       job.getConfiguration().get(REDIS_HASH_KEY_CONF),
       job.getConfiguration().get(REDIS_HOSTS_CONF));
 }
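
The two configuration keys above have to be populated by the driver; a hypothetical snippet (the hash key and host list values are purely illustrative):

  // Illustrative driver-side configuration for the Redis writer
  public static void configureRedisOutput(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set(REDIS_HASH_KEY_CONF, "user:lastSeen");            // hash key, illustrative
    conf.set(REDIS_HOSTS_CONF, "redis-01:6379,redis-02:6379"); // host list, illustrative
  }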
Code Example #18
    public RecordReader<IntWritable, IntWritable> createRecordReader(
        InputSplit ignored, TaskAttemptContext taskContext) throws IOException {
      Configuration conf = taskContext.getConfiguration();

      final int count = conf.getInt(MAP_SLEEP_COUNT, 1);
      if (count < 0) {
        throw new IOException("Invalid map count: " + count);
      }

      int totalIReduces = conf.getInt(IREDUCE_STAGES_COUNT, 1);

      int reduceTasks =
          totalIReduces == 0
              ? taskContext.getNumReduceTasks()
              : conf.getInt(IREDUCE_TASKS_COUNT, 1);
      int sleepCount =
          totalIReduces == 0
              ? conf.getInt(REDUCE_SLEEP_COUNT, 1)
              : conf.getInt(IREDUCE_SLEEP_COUNT, 1);
      final int emitPerMapTask = sleepCount * reduceTasks;

      return new RecordReader<IntWritable, IntWritable>() {
        private int records = 0;
        private int emitCount = 0;
        private IntWritable key = null;
        private IntWritable value = null;

        public void initialize(InputSplit split, TaskAttemptContext context) {}

        public boolean nextKeyValue() throws IOException {
          if (count == 0) {
            return false;
          }
          key = new IntWritable();
          key.set(emitCount);
          int emit = emitPerMapTask / count;
          if ((emitPerMapTask) % count > records) {
            ++emit;
          }
          emitCount += emit;
          value = new IntWritable();
          value.set(emit);
          return records++ < count;
        }

        public IntWritable getCurrentKey() {
          return key;
        }

        public IntWritable getCurrentValue() {
          return value;
        }

        public void close() throws IOException {}

        public float getProgress() throws IOException {
          // progress is expected to be in the range [0, 1]
          return count == 0 ? 1.0f : records / ((float) count);
        }
      };
    }
Code Example #19
  private DataOutputStream createRawOutputStream(TaskAttemptContext ctx) throws IOException {
    boolean isCompressed = getCompressOutput(ctx);

    if (!isCompressed) {
      Path file = getDefaultWorkFile(ctx, ".nt");
      FileSystem fs = file.getFileSystem(ctx.getConfiguration());
      return fs.create(file, false);
    } else {
      Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(ctx, GzipCodec.class);

      CompressionCodec codec = ReflectionUtils.newInstance(codecClass, ctx.getConfiguration());
      Path file = getDefaultWorkFile(ctx, ".nt" + codec.getDefaultExtension());
      FileSystem fs = file.getFileSystem(ctx.getConfiguration());
      FSDataOutputStream fileOut = fs.create(file, false);
      return new DataOutputStream(codec.createOutputStream(fileOut));
    }
  }
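
Which branch runs is controlled by the standard FileOutputFormat compression settings; a hypothetical driver-side snippet that would exercise the compressed branch with the default GzipCodec:

  // Illustrative job setup that enables the compressed branch above
  public static void enableCompression(Job job) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  }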
Code Example #20
 public WikipediaRecordReader(FileSplit split, TaskAttemptContext context) throws IOException {
   // open the file and seek to the start of the split
   start = split.getStart();
   end = start + split.getLength();
   Path file = split.getPath();
   FileSystem fs = file.getFileSystem(context.getConfiguration());
   fsin = fs.open(file);
   fsin.seek(start);
 }
Code Example #21
  /** {@inheritDoc} */
  @Override
  public RecordWriter<FixedByteRecord, NullWritable> getRecordWriter(final TaskAttemptContext arg0)
      throws IOException, InterruptedException {

    final Configuration conf = arg0.getConfiguration();
    final String outputPath = conf.get(DataGenerator.OUTPUT_PATH);

    return new GeneratorRecordWriter(new Path(outputPath), conf);
  }
Code Example #22
 public void initialize(InputSplit split, TaskAttemptContext context)
     throws IOException, InterruptedException {
   super.initialize(split, context);
   assert schemas.size() == 2;
   Configuration conf = context.getConfiguration();
   conf.set(props.getProperty("AVRO_INPUT"), Submitter.AvroIO.KV.name());
   conf.set(props.getProperty("AVRO_KEY_INPUT_SCHEMA"), schemas.get(0).toString());
   conf.set(props.getProperty("AVRO_VALUE_INPUT_SCHEMA"), schemas.get(1).toString());
 }
Code Example #23
  @Test
  public void testDeleteMissing() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext =
        new JobContextImpl(
            taskAttemptContext.getConfiguration(),
            taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();

    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
      fs = FileSystem.get(conf);
      sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      fs.rename(new Path(targetBaseAdd), new Path(targetBase));

      DistCpOptions options =
          new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
      options.setSyncFolder(true);
      options.setDeleteMissing(true);
      options.appendToConf(conf);

      CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
      Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
      listing.buildListing(listingFile, options);

      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
      conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);

      committer.commitJob(jobContext);
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
        Assert.fail("Source and target folders are not in sync");
      }

      // Test for idempotent commit
      committer.commitJob(jobContext);
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
    } catch (Throwable e) {
      LOG.error("Exception encountered while testing for delete missing", e);
      Assert.fail("Delete missing failure");
    } finally {
      TestDistCpUtils.delete(fs, "/tmp1");
      conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
    }
  }
Code Example #24
File: GridmixJob.java Project: JichengSong/hadoop-20
 @Override
 public void initialize(InputSplit genericSplit, TaskAttemptContext ctxt)
     throws IOException, InterruptedException {
   final GridmixSplit split = (GridmixSplit) genericSplit;
   final Configuration conf = ctxt.getConfiguration();
   factory =
       new ReadRecordFactory(
           split.getLength(), split.getInputRecords(), new FileQueue(split, conf), conf);
 }
Code Example #25
 /* (non-Javadoc)
  * @see org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#getRecordWriter(org.apache.hadoop.mapreduce.TaskAttemptContext)
  */
 @Override
 public RecordWriter<WritableComparable, Tuple> getRecordWriter(TaskAttemptContext job)
     throws IOException, InterruptedException {
   Configuration conf = job.getConfiguration();
   Path file = getDefaultWorkFile(job, "");
   FileSystem fs = file.getFileSystem(conf);
   FSDataOutputStream fileOut = fs.create(file, false);
   return new InterRecordWriter(fileOut);
 }
Code Example #26
 @Override
 public RecordReader<LongWritable, Writable> createRecordReader(
     InputSplit split, TaskAttemptContext taskAttempt) throws IOException, InterruptedException {
   if (typeRef == null) {
     typeRef =
         ThriftUtils.getTypeRef(taskAttempt.getConfiguration(), RCFileThriftInputFormat.class);
   }
   return new ThriftReader(createUnwrappedRecordReader(split, taskAttempt));
 }
Code Example #27
  @Override
  public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being.
    if (isInitialized) close();
    isInitialized = true;

    final Configuration conf = ctx.getConfiguration();

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);

    final SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
    codec = new BAMRecordCodec(header);

    in.seek(0);
    bci =
        new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
      final long recordStart = virtualStart & 0xffff;
      System.err.println(
          "XXX inizialized BAMRecordReader byte offset: "
              + fileStart
              + " record offset: "
              + recordStart);
    }

    keepReadPairsTogether =
        SortOrder.queryname.equals(header.getSortOrder())
            && conf.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false);
    readPair = false;
    lastOfPair = false;
    intervals = BAMInputFormat.getIntervals(conf);
    if (intervals != null) {
      overlapDetector = new OverlapDetector<>(0, 0);
      overlapDetector.addAll(intervals, intervals);
    }
  }
Code Example #28
  @Override
  public RecordReader<LongWritable, ThriftWritable<M>> createRecordReader(
      InputSplit split, TaskAttemptContext taskAttempt) throws IOException, InterruptedException {

    if (typeRef_ == null) {
      typeRef_ =
          ThriftUtils.getTypeRef(taskAttempt.getConfiguration(), LzoThriftB64LineInputFormat.class);
    }
    return new LzoThriftB64LineRecordReader<M>(typeRef_);
  }
Code Example #29
  @Override
  public RecordWriter<K, V> getRecordWriter(final TaskAttemptContext context) throws IOException {
    // Open data output stream

    Path outPath = getDefaultWorkFile(context, ".bson");
    LOG.info("output going into " + outPath);

    FileSystem fs = outPath.getFileSystem(context.getConfiguration());
    FSDataOutputStream outFile = fs.create(outPath);

    FSDataOutputStream splitFile = null;
    if (MongoConfigUtil.getBSONOutputBuildSplits(context.getConfiguration())) {
      Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits");
      splitFile = fs.create(splitPath);
    }

    long splitSize = BSONSplitter.getSplitSize(context.getConfiguration(), null);
    return new BSONFileRecordWriter<K, V>(outFile, splitFile, splitSize);
  }
Code Example #30
 public void initialize(InputSplit split, TaskAttemptContext context)
     throws IOException, InterruptedException {
   Path p = ((FileSplit) split).getPath();
   FileSystem fs = p.getFileSystem(context.getConfiguration());
   in = fs.open(p);
   long start = ((FileSplit) split).getStart();
   // find the offset to start at a record boundary
   offset = (RECORD_LENGTH - (start % RECORD_LENGTH)) % RECORD_LENGTH;
   in.seek(start + offset);
   length = ((FileSplit) split).getLength();
 }
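
The offset arithmetic above aligns the reader to the next record boundary: with RECORD_LENGTH = 100 and start = 250, offset = (100 - 50) % 100 = 50, so reading begins at byte 300. A minimal sketch of a matching nextKeyValue(), assuming hypothetical currentKey/currentValue fields (they are not part of the original):

  @Override
  public boolean nextKeyValue() throws IOException {
    // Stop once the next record would start beyond this split's length
    if (offset >= length) {
      return false;
    }
    byte[] record = new byte[(int) RECORD_LENGTH];
    in.readFully(record); // one fixed-length record
    currentKey = new LongWritable(offset);
    currentValue = new BytesWritable(record);
    offset += RECORD_LENGTH;
    return true;
  }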