Example #1
  public static ProcedureWALTrailer readTrailer(FSDataInputStream stream, long startPos, long size)
      throws IOException {
    // Beginning of the Trailer Jump. 17 = 1 byte version + 8 byte magic + 8 byte offset
    long trailerPos = size - 17;

    if (trailerPos < startPos) {
      throw new InvalidWALDataException("Missing trailer: size=" + size + " startPos=" + startPos);
    }

    stream.seek(trailerPos);
    int version = stream.read();
    if (version != TRAILER_VERSION) {
      throw new InvalidWALDataException(
          "Invalid Trailer version. got " + version + " expected " + TRAILER_VERSION);
    }

    long magic = StreamUtils.readLong(stream);
    if (magic != TRAILER_MAGIC) {
      throw new InvalidWALDataException(
          "Invalid Trailer magic. got " + magic + " expected " + TRAILER_MAGIC);
    }

    long trailerOffset = StreamUtils.readLong(stream);
    stream.seek(trailerOffset);

    ProcedureWALEntry entry = readEntry(stream);
    if (entry.getType() != ProcedureWALEntry.Type.PROCEDURE_WAL_EOF) {
      throw new InvalidWALDataException("Invalid Trailer begin");
    }

    ProcedureWALTrailer trailer =
        ProcedureWALTrailer.newBuilder().setVersion(version).setTrackerPos(stream.getPos()).build();
    return trailer;
  }
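For context, a minimal sketch of the writer side that would produce the 17 trailing bytes this reader expects. This is an assumption-based sketch, not the authoritative write path: it presumes the same TRAILER_* constants, a hypothetical StreamUtils.writeLong mirroring StreamUtils.readLong, and that the PROCEDURE_WAL_EOF entry has already been written at trailerOffset.

  public static void writeTrailer(FSDataOutputStream stream, long trailerOffset)
      throws IOException {
    // 1 byte version + 8 byte magic + 8 byte offset = the 17 bytes read above
    stream.write(TRAILER_VERSION);
    StreamUtils.writeLong(stream, TRAILER_MAGIC);   // assumed counterpart of readLong
    StreamUtils.writeLong(stream, trailerOffset);   // points back at the EOF entry
  }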
Example #2
  /*
   * Read some data, skip a few bytes and read more. HADOOP-922.
   */
  private void smallReadSeek(FileSystem fileSys, Path name) throws IOException {
    if (fileSys instanceof ChecksumFileSystem) {
      fileSys = ((ChecksumFileSystem) fileSys).getRawFileSystem();
    }
    // Make the buffer size small to trigger code for HADOOP-922
    FSDataInputStream stmRaw = fileSys.open(name, 1);
    byte[] expected = new byte[ONEMB];
    Random rand = new Random(seed);
    rand.nextBytes(expected);

    // Issue a simple read first.
    byte[] actual = new byte[128];
    stmRaw.seek(100000);
    stmRaw.read(actual, 0, actual.length);
    checkAndEraseData(actual, 100000, expected, "First Small Read Test");

    // now do a small seek of 4 bytes, within the same block.
    int newpos1 = 100000 + 128 + 4;
    stmRaw.seek(newpos1);
    stmRaw.read(actual, 0, actual.length);
    checkAndEraseData(actual, newpos1, expected, "Small Seek Bug 1");

    // seek another 256 bytes this time
    int newpos2 = newpos1 + 256;
    stmRaw.seek(newpos2);
    stmRaw.read(actual, 0, actual.length);
    checkAndEraseData(actual, newpos2, expected, "Small Seek Bug 2");

    // all done
    stmRaw.close();
  }
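checkAndEraseData() is not shown in this snippet; a plausible sketch, assuming it compares each byte read against the expected buffer at the given file offset and then zeroes the actual buffer so the next read can reuse it (assertEquals as in JUnit):

  private void checkAndEraseData(byte[] actual, int from, byte[] expected, String message) {
    for (int idx = 0; idx < actual.length; idx++) {
      assertEquals(message + " byte " + (from + idx) + " differs",
          expected[from + idx], actual[idx]);
      actual[idx] = 0; // erase so the buffer can be reused by the next read
    }
  }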
Example #3
 // to be used for testing
 public WikipediaRecordReader(URL fileURL, long start, long end) throws IOException {
   this.start = start;
   this.end = end;
   Path path = new Path("file://", fileURL.getPath());
   fsin = FileSystem.getLocal(new Configuration()).open(path);
   // position the stream at the requested start position
   fsin.seek(start);
 }
Example #4
 public void readTracker(ProcedureStoreTracker tracker) throws IOException {
   ProcedureWALTrailer trailer = readTrailer();
   try {
     stream.seek(trailer.getTrackerPos());
     tracker.readFrom(stream);
   } finally {
     stream.seek(startPos);
   }
 }
Example #5
 public ProcedureWALTrailer readTrailer() throws IOException {
   try {
     return ProcedureWALFormat.readTrailer(stream, startPos, logStatus.getLen());
   } finally {
     stream.seek(startPos);
   }
 }
Example #6
  private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset)
      throws IOException {
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
      Path path = rit.next().getPath();
      String filename =
          path.toString().substring(path.getParent().toString().length(), path.toString().length());

      if (filename.startsWith("/part-")) {
        long filesize = fs.getFileStatus(path).getLen();
        if (offset < filesize) {
          FSDataInputStream handle = fs.open(path);
          if (offset > 0) {
            handle.seek(offset);
          }
          fileHandleList.add(handle);
        }
        offset -= filesize;
      }
    }
    if (fileHandleList.size() == 1) return fileHandleList.get(0);
    else if (fileHandleList.size() > 1) {
      Enumeration<FSDataInputStream> enu = fileHandleList.elements();
      return new SequenceInputStream(enu);
    } else {
      System.err.println("Error, no source file loaded. run genSeedDataset.sh fisrt!");
      return null;
    }
  }
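A hedged usage sketch: stream the concatenated part-* files starting at a global byte offset. The directory path is hypothetical, and the null return from the method above must be handled by the caller.

  InputStream in = OpenMultiplePartsWithOffset(fs, new Path("/data/seed"), 1 << 20);
  if (in != null) {
    byte[] buf = new byte[4096];
    for (int n; (n = in.read(buf)) > 0; ) {
      // consume buf[0..n)
    }
    in.close();
  }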
Example #7
  public MutilCharRecordReader(FileSplit inputSplit, Configuration job) throws IOException {

    maxLineLength = job.getInt("mapred.mutilCharRecordReader.maxlength", Integer.MAX_VALUE);
    start = inputSplit.getStart();
    end = start + inputSplit.getLength();
    final Path file = inputSplit.getPath();

    compressionCodecs = new CompressionCodecFactory(job);

    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file system
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;

    if (codec != null) {
      lineReader = new LineReader(codec.createInputStream(fileIn), job);
      end = Long.MAX_VALUE;
    } else {
      if (start != 0) {
        skipFirstLine = true;
        --start;
        fileIn.seek(start);
      }
      lineReader = new LineReader(fileIn, job);
    }

    if (skipFirstLine) {
      start +=
          lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
  }
Example #8
 @Override
 public ModelInput<StringBuilder> createInput(
     Class<? extends StringBuilder> dataType,
     FileSystem fileSystem,
     Path path,
     long offset,
     long fragmentSize,
     Counter counter)
     throws IOException, InterruptedException {
   FileSystem fs = FileSystem.get(path.toUri(), getConf());
   FSDataInputStream in = fs.open(path);
   boolean succeed = false;
   try {
     in.seek(offset);
     ModelInput<StringBuilder> result =
         format.createInput(
             dataType, path.toString(), new CountInputStream(in, counter), offset, fragmentSize);
     succeed = true;
     return result;
   } finally {
      if (!succeed) {
       in.close();
     }
   }
 }
Example #9
 /**
  * Positions {@link #mHdfsInputStream} at the given position, opening a stream from the
  * under storage system first if necessary. {@link #mCurrentPosition} is not updated.
  *
  * @throws IOException if opening the file fails
  */
 private void getHdfsInputStream(long position) throws IOException {
   if (mHdfsInputStream == null) {
     org.apache.hadoop.fs.FileSystem fs = mHdfsPath.getFileSystem(mHadoopConf);
     mHdfsInputStream = fs.open(mHdfsPath, mHadoopBufferSize);
   }
   mHdfsInputStream.seek(position);
 }
Example #10
  public static void main(String[] args) throws IOException {
    String uri = args[0];

    Configuration configuration = new Configuration();
    System.out.println("Trying to get the file system object");
    URI uriObj = URI.create(uri);
    System.out.println("Got URI object " + uri);
    FileSystem fs = FileSystem.get(uriObj, configuration);
    FSDataInputStream fsDataInputStream = null;

    Path hdfsPath = new Path(uri);

    fsDataInputStream = fs.open(hdfsPath);
    // Start reading from the 0th byte of the file.
    fsDataInputStream.seek(0);
    IOUtils.copyBytes(fsDataInputStream, System.out, 4096, false);
    System.out.println("*******************************************");

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(hdfsPath)));

    try {
      String line;
      line = br.readLine();
      while (line != null) {
        System.out.println("################ Line is###### " + line);
        // be sure to read the next line otherwise you'll get an infinite loop
        line = br.readLine();
      }
    } finally {
      // you should close out the BufferedReader
      br.close();
    }
  }
Example #11
 private void init(Counters.Counter readsCounter) throws IOException {
   if (reader == null) {
     FSDataInputStream in = fs.open(file);
     in.seek(segmentOffset);
     reader = new Reader<K, V>(conf, in, segmentLength, codec, readsCounter);
   }
 }
Example #12
  @Override
  public int read(byte b[], int off, int len) throws IOException {
    if (mTachyonFileInputStream != null) {
      try {
        int ret = mTachyonFileInputStream.read(b, off, len);
        // Only advance the position on a successful read; ret is -1 at EOF.
        if (ret != -1) {
          mCurrentPosition += ret;
        }
        return ret;
      } catch (IOException e) {
        LOG.error(e.getMessage(), e);
        mTachyonFileInputStream = null;
      }
    }

    if (mHdfsInputStream == null) {
      FileSystem fs = mHdfsPath.getFileSystem(mHadoopConf);
      mHdfsInputStream = fs.open(mHdfsPath, mHadoopBufferSize);
      mHdfsInputStream.seek(mCurrentPosition);
    }

    // Check for EOF before narrowing to byte: a legitimate 0xFF data byte
    // would compare equal to -1 after the cast.
    int value = readFromHdfsBuffer();
    if (value == -1) {
      return -1;
    }
    b[off] = (byte) value;
    return 1;
  }
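The EOF handling above assumes readFromHdfsBuffer() follows the InputStream.read() contract: a value in 0..255 for data, -1 at end of stream. A minimal sketch under that assumption; mBuffer, mBufferPosition, and mBufferLimit are hypothetical fields, not part of the original snippet.

  private int readFromHdfsBuffer() throws IOException {
    if (mBufferPosition >= mBufferLimit) {
      mBufferLimit = mHdfsInputStream.read(mBuffer); // refill from HDFS
      if (mBufferLimit == -1) {
        return -1; // end of stream
      }
      mBufferPosition = 0;
    }
    return mBuffer[mBufferPosition++] & 0xFF; // next byte as 0..255
  }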
Example #13
 @Test(timeout = 120000)
 public void testSeekAfterSetDropBehind() throws Exception {
   // start a cluster
   LOG.info("testSeekAfterSetDropBehind");
   Configuration conf = new HdfsConfiguration();
   MiniDFSCluster cluster = null;
   String TEST_PATH = "/test";
   int TEST_PATH_LEN = MAX_TEST_FILE_LEN;
   try {
     cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
     cluster.waitActive();
     FileSystem fs = cluster.getFileSystem();
     createHdfsFile(fs, new Path(TEST_PATH), TEST_PATH_LEN, false);
     // verify that we can seek after setDropBehind
     FSDataInputStream fis = fs.open(new Path(TEST_PATH));
     try {
       Assert.assertTrue(fis.read() != -1); // create BlockReader
       fis.setDropBehind(false); // clear BlockReader
       fis.seek(2); // seek
     } finally {
       fis.close();
     }
   } finally {
     if (cluster != null) {
       cluster.shutdown();
     }
   }
 }
Example #14
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
      FileSplit split = (FileSplit) genericSplit;
      Configuration job = context.getConfiguration();
      m_Sb.setLength(0);
      m_Start = split.getStart();
      m_End = m_Start + split.getLength();
      final Path file = split.getPath();
      compressionCodecs = new CompressionCodecFactory(job);
      final CompressionCodec codec = compressionCodecs.getCodec(file);

      // open the file and seek to the m_Start of the split
      FileSystem fs = file.getFileSystem(job);
      //  getFileStatus fileStatus = fs.getFileStatus(split.getPath());
      //noinspection deprecation
      @SuppressWarnings(value = "deprecated")
      long length = fs.getLength(file);
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (m_Start > 0) fileIn.seek(m_Start);
      if (codec != null) {
        CompressionInputStream inputStream = codec.createInputStream(fileIn);
        m_Input = new BufferedReader(new InputStreamReader(inputStream));
        m_End = length;
      } else {
        m_Input = new BufferedReader(new InputStreamReader(fileIn));
      }
      m_Current = m_Start;
      m_Key = split.getPath().getName();
    }
Example #15
 /*
  * Position the input stream at the start of the first record.
  */
 private void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
   if (start > 0) {
     // Advance to the start of the first line in our slice.
     // We use a temporary LineReader to read a partial line and find the
     // start of the first one on or after our starting position.
     // In case our slice starts right at the beginning of a line, we need to back
     // up by one position and then discard the first line.
     start -= 1;
     stream.seek(start);
     LineReader reader = new LineReader(stream);
     int bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
     start = start + bytesRead;
     stream.seek(start);
   }
    // else: if start == 0, we're already positioned at the beginning of a line
   pos = start;
 }
Example #16
 public WikipediaRecordReader(FileSplit split, JobConf jobConf) throws IOException {
   // open the file and seek to the start of the split
   start = split.getStart();
   end = start + split.getLength();
   Path file = split.getPath();
   FileSystem fs = file.getFileSystem(jobConf);
   fsin = fs.open(split.getPath());
   fsin.seek(start);
 }
Example #17
  @Override
  public boolean nextKeyValue() throws IOException {
    int uncompressedBlockSize = rawInputStream.readInt();
    if (uncompressedBlockSize == 0) {
      // An uncompressed block size of zero means end of file.
      return false;
    } else if (uncompressedBlockSize < 0) {
      throw new EOFException(
          "Could not read uncompressed block size at position "
              + rawInputStream.getPos()
              + " in file "
              + lzoFile);
    }

    int compressedBlockSize = rawInputStream.readInt();
    if (compressedBlockSize <= 0) {
      throw new EOFException(
          "Could not read compressed block size at position "
              + rawInputStream.getPos()
              + " in file "
              + lzoFile);
    }

    // See LzopInputStream.getCompressedData
    boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
    int numChecksumsToSkip =
        isUncompressedBlock
            ? numDecompressedChecksums
            : numDecompressedChecksums + numCompressedChecksums;

    // Get the current position.  Since we've read two ints, the current block started 8 bytes ago.
    long pos = rawInputStream.getPos();

    curValue.set(pos - 8, uncompressedOffset, uncompressedBlockSize);

    uncompressedOffset += uncompressedBlockSize;

    // Seek beyond the checksums and beyond the block data to the beginning of the next block.
    rawInputStream.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
    ++numBlocksRead;

    // Log some progress every so often.
    if (numBlocksRead % LOG_EVERY_N_BLOCKS == 0) {
      LOG.info(
          "Reading block "
              + numBlocksRead
              + " at pos "
              + pos
              + " of "
              + totalFileSize
              + ". Read is "
              + (100.0 * getProgress())
              + "% done. ");
    }

    return true;
  }
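To make the seek arithmetic concrete, a worked example with assumed numbers: if a block header starts at byte 1000, then after the two 4-byte size ints getPos() returns 1008 (hence curValue records pos - 8 = 1000), and with a 65536-byte payload and two 4-byte checksum words to skip, the next header begins at 66552.

  long pos = 1008;                  // position after reading the two size ints
  int compressedBlockSize = 65536;  // assumed payload size
  int numChecksumsToSkip = 2;       // assumed checksum count
  long blockStart = pos - 8;                                             // 1000
  long nextBlock = pos + compressedBlockSize + (4 * numChecksumsToSkip); // 66552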
Example #18
 public WikipediaRecordReader(FileSplit split, TaskAttemptContext context) throws IOException {
   // open the file and seek to the start of the split
   start = split.getStart();
   end = start + split.getLength();
   Path file = split.getPath();
   FileSystem fs = file.getFileSystem(context.getConfiguration());
   fsin = fs.open(file);
   fsin.seek(start);
 }
Example #19
  @Override
  public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being.
    if (isInitialized) close();
    isInitialized = true;

    final Configuration conf = ctx.getConfiguration();

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);

    final SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
    codec = new BAMRecordCodec(header);

    in.seek(0);
    bci =
        new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
      final long recordStart = virtualStart & 0xffff;
      System.err.println(
          "XXX inizialized BAMRecordReader byte offset: "
              + fileStart
              + " record offset: "
              + recordStart);
    }

    keepReadPairsTogether =
        SortOrder.queryname.equals(header.getSortOrder())
            && conf.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false);
    readPair = false;
    lastOfPair = false;
    intervals = BAMInputFormat.getIntervals(conf);
    if (intervals != null) {
      overlapDetector = new OverlapDetector<>(0, 0);
      overlapDetector.addAll(intervals, intervals);
    }
  }
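The shift and mask above unpack a BGZF virtual file offset: the high 48 bits give the byte offset of the compressed block within the file, and the low 16 bits give the record's offset inside that block once decompressed. A worked example with assumed values:

  long virtualOffset = (12345L << 16) | 7;   // block at file byte 12345, record at 7
  long fileStart = virtualOffset >>> 16;     // 12345
  long recordStart = virtualOffset & 0xffff; // 7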
Example #20
 /**
  * Constructor to map a file containing Succinct data structures via streams.
  *
  * @param filePath Path of the file.
  * @param conf Configuration for the filesystem.
  * @throws IOException
  */
 public SuccinctIndexedFileStream(Path filePath, Configuration conf) throws IOException {
   super(filePath, conf);
   FSDataInputStream is = getStream(filePath);
   is.seek(endOfCoreStream);
   int len = is.readInt();
   offsets = new int[len];
   for (int i = 0; i < len; i++) {
     offsets[i] = is.readInt();
   }
 }
Example #21
 /** Tests seek(). */
 @Test
 public void testSeek() throws IOException {
   final Path testFile = new Path("/testfile+1");
   FSDataOutputStream out = hdfs.create(testFile, true);
   out.writeBytes("0123456789");
   out.close();
   FSDataInputStream in = hftpFs.open(testFile);
   in.seek(7);
   assertEquals('7', in.read());
 }
Example #22
  @Test(timeout = 10000)
  public void testSeek() throws IOException {
    Path path = path("/tests3a/testfile.seek");
    writeFile(path, TEST_BUFFER_SIZE * 10);

    FSDataInputStream inputStream = fs.open(path, TEST_BUFFER_SIZE);
    inputStream.seek(inputStream.getPos() + MODULUS);

    testReceivedData(inputStream, TEST_BUFFER_SIZE * 10 - MODULUS);
  }
Example #23
 public void initialize(InputSplit split, TaskAttemptContext context)
     throws IOException, InterruptedException {
   Path p = ((FileSplit) split).getPath();
   FileSystem fs = p.getFileSystem(context.getConfiguration());
   in = fs.open(p);
   long start = ((FileSplit) split).getStart();
   // find the offset to start at a record boundary
   offset = (RECORD_LENGTH - (start % RECORD_LENGTH)) % RECORD_LENGTH;
   in.seek(start + offset);
   length = ((FileSplit) split).getLength();
 }
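The modulo expression rounds start up to the next record boundary; a worked example, assuming a RECORD_LENGTH of 100 bytes:

  long RECORD_LENGTH = 100;
  long start = 250;
  long offset = (RECORD_LENGTH - (start % RECORD_LENGTH)) % RECORD_LENGTH; // 50
  // the first read begins at start + offset = 300, a record boundary;
  // when start is already aligned (e.g. 300), offset is 0.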
Example #24
  public void open() throws IOException {
    if (stream == null) {
      stream = fs.open(logFile);
    }

    if (header == null) {
      header = ProcedureWALFormat.readHeader(stream);
      startPos = stream.getPos();
    } else {
      stream.seek(startPos);
    }
  }
Example #25
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      // Initialization
      // System.out.println("initialize");
      // Logger.getLogger("KirchhoffMigration").log(Level.INFO,
      // "enter initialize()");
      FileSplit inputsplit = (FileSplit) split;
      Configuration conf = context.getConfiguration();
      LENGTH = conf.getLong("FileSplitLength", 0);
      SPILL = LENGTH / conf.getInt("SplitPerMap", 1);
      // LENGTH = 8;
      // SPILL = 8;
      assert (LENGTH >= SPILL);
      // System.out.println("length:" + LENGTH);
      // System.out.println("spill:" + SPILL);
      String filename = inputsplit.getPath().toString();
      // System.out.println("filename:" + filename);
      //            String buf = filename.substring(filename.lastIndexOf("fcxy") + 4,
      //                    filename.lastIndexOf("."));
      //            int count = Integer.parseInt(buf);
      // System.out.println(filename);
      // start = inputsplit.getStart(); // get the start position of this split
      start = inputsplit.getStart();
      shotNum += start * 8 / Float.SIZE;
      long offset = LENGTH >= inputsplit.getLength() ? inputsplit.getLength() : LENGTH;
      end = start + offset; // end position of this split
      // System.out.println("inputSplitLength:" + split.getLength());
      // System.out.println("end:" + end);
      // start = inputsplit.getStart(); // get the start position of this split
      // end = start + inputsplit.getLength(); // end position of this split
      // System.out.println("start:" + start + " ,end:" + end);
      final Path file = inputsplit.getPath();
      // System.out.println(file.toString());
      // Open the file
      FileSystem fs = file.getFileSystem(context.getConfiguration());
      fileIn = fs.open(inputsplit.getPath());

      // Key point 2: move the file pointer to the start of this split,
      // since a newly opened file is positioned at the beginning.
      fileIn.seek(start);

      // in = new LineReader(fileIn, context.getConfiguration());

      // if (start != 0) {
      // System.out.println("not the first split");
      // // Key fix point 1:
      // // If this is not the first split (say the first split covers 0-4),
      // // then position 4 has already been read and needs to be skipped;
      // // otherwise we would re-read data that was already consumed.
      // start += (end - pos + 1);
      // }
      pos = start;
    }
Example #26
    public XmlRecordReader(FileSplit split, TaskAttemptContext context) throws IOException {
      Configuration conf = context.getConfiguration();
      startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
      endTag = conf.get(END_TAG_KEY).getBytes("utf-8");

      // open the file and seek to the start of the split
      start = split.getStart();
      end = start + split.getLength();
      Path file = split.getPath();
      FileSystem fs = file.getFileSystem(conf);
      fsin = fs.open(split.getPath());
      fsin.seek(start);
    }
Example #27
 @Override
 public void seek(long pos) throws IOException {
   try {
     in.seek(pos);
   } catch (FileNotFoundException e) {
     tryOpen().seek(pos);
   } catch (NullPointerException e) { // HDFS 1.x - DFSInputStream.getBlockAt()
     tryOpen().seek(pos);
   } catch (AssertionError e) { // assert in HDFS 1.x - DFSInputStream.getBlockAt()
     tryOpen().seek(pos);
   }
   this.pos = pos;
 }
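tryOpen() is not shown in this snippet; a minimal sketch under the assumption that it reopens the file on the wrapped FileSystem and swaps in a fresh stream (fs and file are assumed fields of the enclosing wrapper):

 private FSDataInputStream tryOpen() throws IOException {
   in = fs.open(file); // replace the stream the failed call was using
   return in;
 }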
Example #28
 public static void main(String[] args) throws Exception {
   String uri = args[0];
   Configuration conf = new Configuration();
   FileSystem fs = FileSystem.get(URI.create(uri), conf);
   FSDataInputStream in = null;
   try {
     in = fs.open(new Path(uri));
     IOUtils.copyBytes(in, System.out, 4096, false);
     in.seek(3); // go back to pos 3 of the file
     IOUtils.copyBytes(in, System.out, 4096, false);
   } finally {
     IOUtils.closeStream(in);
   }
 }
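For intuition, a hedged trace with an assumed input: if the file holds "hello\n", the first copyBytes prints the whole file, and after seek(3) the second copyBytes prints from byte 3 onward.

  // Assumed input: "hello\n" (6 bytes)
  // First copyBytes            -> hello
  // seek(3), second copyBytes  -> lo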
Example #29
  /**
   * From Design Pattern, O'Reilly... This method takes as arguments the map task’s assigned
   * InputSplit and TaskAttemptContext, and prepares the record reader. For file-based input
   * formats, this is a good place to seek to the byte position in the file to begin reading.
   */
  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {

    // This InputSplit is a FileInputSplit
    FileSplit split = (FileSplit) genericSplit;

    // Retrieve configuration, and Max allowed
    // bytes for a single record
    Configuration job = context.getConfiguration();
    this.delimiterRegex = "^.*<REUTERS.*$";

    delimiterPattern = Pattern.compile(delimiterRegex);

    // Split "S" is responsible for all records
    // starting from "start" and "end" positions
    start = split.getStart();
    end = start + split.getLength();

    // Retrieve file containing Split "S"
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    // If Split "S" starts at byte 0, first line will be processed
    // If Split "S" does not start at byte 0, first line has been already
    // processed by "S-1" and therefore needs to be silently ignored
    boolean skipFirstLine = false;
    if (start != 0) {
      skipFirstLine = true;
      // Set the file pointer at "start - 1" position.
      // This is to make sure we won't miss any line
      // It could happen if "start" is located on a EOL
      --start;
      fileIn.seek(start);
    }

    in = new LineReader(fileIn, job);

    // If first line needs to be skipped, read first line
    // and stores its content to a dummy Text
    if (skipFirstLine) {
      Text dummy = new Text();
      // Reset "start" to "start + line offset"
      start += in.readLine(dummy, 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }

    // Position is the actual start
    this.pos = start;
  }
Example #30
      @Override
      public void initialize(InputSplit split, TaskAttemptContext context)
          throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
        endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
        FileSplit fileSplit = (FileSplit) split;

        // open the file and seek to the start of the split
        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);
        fsin = fs.open(fileSplit.getPath());
        fsin.seek(start);
      }