public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    Configuration conf = ContextUtil.getConfiguration(context);
    this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);

    FileSplit split = (FileSplit) genericSplit;
    // Convert the split's physical file offsets into BGZF virtual offsets
    // (compressed offset in the upper 48 bits, intra-block offset of zero).
    start = split.getStart() << 16;
    end = (split.getStart() + split.getLength()) << 16;

    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);

    bin = new BlockCompressedInputStream(
        new WrapSeekable<FSDataInputStream>(
            fs.open(file), fs.getFileStatus(file).getLen(), file));

    in = new LineReader(bin, conf);

    if (start != 0) {
        bin.seek(start);

        // Skip the first line: it belongs to the previous split.
        in.readLine(new Text());
        start = bin.getFilePointer();
    }
    this.pos = start;
}
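The `<< 16` shifts above follow the BGZF virtual file offset convention used throughout these readers: the upper 48 bits hold the on-disk offset of a BGZF block, and the lower 16 bits hold the offset within that block's uncompressed data. A minimal sketch of the packing, not part of the original code (helper names are illustrative):

// Sketch only: how BGZF virtual offsets are packed and unpacked.
static long toVirtualOffset(long blockFileOffset, int offsetInBlock) {
    return (blockFileOffset << 16) | (offsetInBlock & 0xffffL);
}

static long compressedOffsetOf(long virtualOffset) {
    return virtualOffset >>> 16; // on-disk position of the BGZF block
}

static int uncompressedOffsetOf(long virtualOffset) {
    return (int) (virtualOffset & 0xffff); // position within the inflated block
}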
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being.
    if (isInitialized)
        close();
    isInitialized = true;

    final Configuration conf = ctx.getConfiguration();

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);

    final SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
    codec = new BAMRecordCodec(header);

    in.seek(0);
    bci = new BlockCompressedInputStream(
        new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    virtualStart = split.getStartVirtualOffset();
    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
        final long recordStart = virtualStart & 0xffff;
        System.err.println("XXX initialized BAMRecordReader byte offset: " +
            fileStart + " record offset: " + recordStart);
    }

    keepReadPairsTogether = SortOrder.queryname.equals(header.getSortOrder()) &&
        conf.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false);
    readPair = false;
    lastOfPair = false;
    intervals = BAMInputFormat.getIntervals(conf);
    if (intervals != null) {
        overlapDetector = new OverlapDetector<>(0, 0);
        overlapDetector.addAll(intervals, intervals);
    }
}
public boolean nextKeyValue() throws IOException {
    while (pos <= end) {
        int newSize = in.readLine(value, maxLineLength);
        if (newSize == 0)
            return false;

        pos = bin.getFilePointer();
        if (newSize < maxLineLength)
            return true;

        // Line exceeded maxLineLength: discard it and keep reading.
    }
    return false;
}
/**
 * Unless the end has been reached, this only takes file position into account, not the
 * position within the block.
 */
@Override
public float getProgress() {
    final long virtPos = bci.getFilePointer();
    final long filePos = virtPos >>> 16;
    if (virtPos >= virtualEnd)
        return 1;
    else {
        final long fileEnd = virtualEnd >>> 16;
        // Add 1 to the denominator to make sure it doesn't reach 1 here when
        // filePos == fileEnd.
        return (float) (filePos - fileStart) / (fileEnd - fileStart + 1);
    }
}
public static void main(String[] args) {
    try {
        String inFile = "/psychipc01/disk2/references/1000Genome/release/20130502_v5a/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz";
        String outFile = "/psychipc01/disk2/references/1000Genome/release/20130502_v5a/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes1.vcf.gz";
        BlockCompressedInputStream br = new BlockCompressedInputStream(new File(inFile));
        BlockCompressedOutputStream bw = new BlockCompressedOutputStream(new File(outFile));
        String line = null;

        // Note: the following locals are never used below; they appear to be
        // leftovers from a column-filtering version of this code.
        String[] cells = null;
        int[] orgIndices = new int[]{0, 1, 2, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 33, 34, 35, 36, 37, 38, 39, 40};
        int selectedColNum = orgIndices.length;
        int i, pos;
        String delimiter = "\t";

        while ((line = br.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty()) {
                continue;
            }
            // Replace phased genotype separators '|' with '/' and write the line back out as BGZF.
            bw.write(line.replaceAll("[|]", "/").getBytes());
            bw.write("\n".getBytes());
        }
        bw.close();
        br.close();
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
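Because BlockCompressedOutputStream writes proper BGZF blocks and appends the empty terminator block when closed, the rewritten file stays readable by BlockCompressedInputStream and tabix. An optional sanity check, sketched under the assumption that the htsjdk version in use provides BlockCompressedInputStream.checkTermination():

// Sketch (hypothetical helper): confirm the output ends with the BGZF
// terminator block that BlockCompressedOutputStream writes on close().
static boolean hasBgzfTerminator(String path) throws IOException {
    return BlockCompressedInputStream.checkTermination(new File(path))
            == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK;
}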
/**
 * @param stream stream.markSupported() must be true
 * @return true if this looks like a BAM file.
 */
public static boolean isBAMFile(final InputStream stream) throws IOException {
    if (!BlockCompressedInputStream.isValidFile(stream)) {
        return false;
    }
    final int buffSize = BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE;
    stream.mark(buffSize);
    final byte[] buffer = new byte[buffSize];
    readBytes(stream, buffer, 0, buffSize);
    stream.reset();
    try (final BlockCompressedInputStream bcis =
            new BlockCompressedInputStream(new ByteArrayInputStream(buffer))) {
        // Once the BGZF layer is removed, a BAM file starts with the magic bytes "BAM\1".
        final byte[] magicBuf = new byte[4];
        final int magicLength = readBytes(bcis, magicBuf, 0, 4);
        return magicLength == BAMFileConstants.BAM_MAGIC.length
            && Arrays.equals(BAMFileConstants.BAM_MAGIC, magicBuf);
    }
}
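A hypothetical usage sketch (the file name and helper name are illustrative, not part of the code above): since isBAMFile() requires mark() support, a raw FileInputStream should be wrapped in a BufferedInputStream first.

// Sketch: wrap the stream so mark()/reset() are available to isBAMFile().
public static boolean looksLikeBAM(final File f) throws IOException {
    try (final InputStream in = new BufferedInputStream(new FileInputStream(f))) {
        return isBAMFile(in);
    }
}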
@Override
public boolean nextKeyValue() {
    long virtPos;
    while ((virtPos = bci.getFilePointer()) < virtualEnd ||
            (keepReadPairsTogether && readPair && !lastOfPair)) {

        final SAMRecord r = codec.decode();
        if (r == null)
            return false;

        // Since we're reading from a BAMRecordCodec directly we have to set the
        // validation stringency ourselves.
        if (this.stringency != null)
            r.setValidationStringency(this.stringency);

        readPair = r.getReadPairedFlag();
        if (readPair) {
            boolean first = r.getFirstOfPairFlag(), second = r.getSecondOfPairFlag();
            // According to the SAM spec (section 1.4) it is possible for pairs to have
            // multiple segments (i.e. more than two), in which case both `first` and
            // `second` will be true.
            boolean firstOfPair = first && !second;
            lastOfPair = !first && second;
            // Ignore any template that is not first in a pair right at the start of a
            // split, since it will have been returned in the previous split.
            if (virtPos == virtualStart && keepReadPairsTogether && !firstOfPair) {
                continue;
            }
        }
        if (!overlaps(r)) {
            continue;
        }
        key.set(getKey(r));
        record.set(r);
        return true;
    }
    return false;
}
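For reference, the pair-related accessors consulted above map onto the standard SAM FLAG bits. This background sketch is not part of the reader itself; the constant names are illustrative:

// Standard SAM FLAG bits behind the htsjdk accessors used above.
static final int FLAG_READ_PAIRED   = 0x1;  // r.getReadPairedFlag()
static final int FLAG_FIRST_SEGMENT = 0x40; // r.getFirstOfPairFlag()
static final int FLAG_LAST_SEGMENT  = 0x80; // r.getSecondOfPairFlag()

// In a multi-segment template an inner segment carries both 0x40 and 0x80,
// which is why the code tests `first && !second` to find the true first segment.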
@Override
public void run() {
    if (myStartSite >= mySiteCount) {
        return;
    }
    BlockCompressedInputStream reader = getReader();
    try {
        reader.seek(myIndex.virtualOffset(mySeekIndex));

        int numSites = Math.min(myNumLinesPerInterval, mySiteCount - myStartSite);
        byte[][] result = new byte[numSites][];
        for (int i = 0; i < numSites; i++) {
            result[i] = parseLine(reader.readLine(), myTaxaCount, myStartSite + i, myIsOneLetter);
            CompletableFuture<byte[]> future = myFutureQueue.remove(myStartSite + i);
            if (future != null) {
                future.complete(result[i]);
            }
        }
        myGenoCache.put(myProcessBlock, result);
        // This get is to prevent early eviction from the cache.
        myGenoCache.getIfPresent(myProcessBlock);
        myCurrentlyProcessingBlocks.remove(myProcessBlock);

        // Complete any futures for these sites that were registered in the meantime.
        for (int i = 0; i < numSites; i++) {
            CompletableFuture<byte[]> future = myFutureQueue.remove(myStartSite + i);
            if (future != null) {
                future.complete(result[i]);
            }
        }

        myStartSite += myNumLinesPerInterval;
        if (myStartSite >= mySiteCount) {
            return;
        }

        // Look ahead: parse the next few blocks while the reader is already positioned here.
        for (int b = 1; b < NUM_LOOK_AHEAD_BLOCKS; b++) {
            if (myGenoCache.getIfPresent(myProcessBlock + b) != null) {
                return;
            }
            if (!myCurrentlyProcessingBlocks.add(myProcessBlock + b)) {
                return;
            }
            numSites = Math.min(myNumLinesPerInterval, mySiteCount - myStartSite);
            result = new byte[numSites][];
            for (int i = 0; i < numSites; i++) {
                result[i] = parseLine(reader.readLine(), myTaxaCount, myStartSite + i, myIsOneLetter);
            }
            myGenoCache.put(myProcessBlock + b, result);
            // This get is to prevent early eviction from the cache.
            myGenoCache.getIfPresent(myProcessBlock + b);
            myCurrentlyProcessingBlocks.remove(myProcessBlock + b);

            for (int i = 0; i < numSites; i++) {
                CompletableFuture<byte[]> future = myFutureQueue.remove(myStartSite + i);
                if (future != null) {
                    future.complete(result[i]);
                }
            }
            myStartSite += myNumLinesPerInterval;
            if (myStartSite >= mySiteCount) {
                return;
            }
        }
    } catch (Exception e) {
        myLogger.error(e.getMessage(), e);
    } finally {
        myReaders.add(reader);
    }
}
@Override
public void close() throws IOException {
    bci.close();
}
/**
 * Finds a virtual BAM record position in the physical position range [beg,end). Returns end
 * if no BAM record was found.
 */
public long guessNextBAMRecordStart(long beg, long end) throws IOException {
    // Buffer what we need to go through.
    byte[] arr = new byte[MAX_BYTES_READ];

    this.inFile.seek(beg);
    int totalRead = 0;
    for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) {
        final int r = inFile.read(arr, totalRead, left);
        if (r < 0)
            break;
        totalRead += r;
        left -= r;
    }
    arr = Arrays.copyOf(arr, totalRead);

    this.in = new SeekableArrayStream(arr);

    this.bgzf = new BlockCompressedInputStream(this.in);
    this.bgzf.setCheckCrcs(true);

    this.bamCodec.setInputStream(bgzf);

    final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff);

    // cp: Compressed Position, indexes the entire BGZF input.
    for (int cp = 0; ; ++cp) {
        final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
        if (psz == null)
            return end;
        final int cp0 = cp = psz.pos;
        final long cp0Virt = (long) cp0 << 16;
        try {
            bgzf.seek(cp0Virt);

            // This has to catch Throwable, because it's possible to get an
            // OutOfMemoryError due to an overly large size.
        } catch (Throwable e) {
            // Guessed BGZF position incorrectly: try the next guess.
            continue;
        }

        // up: Uncompressed Position, indexes the data inside the BGZF block.
        for (int up = 0; ; ++up) {
            final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size);

            if (up0 < 0) {
                // No BAM records found in the BGZF block: try the next BGZF block.
                break;
            }

            // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth
            // of records starting at (cp0,up0).
            bgzf.seek(cp0Virt | up0);
            boolean decodedAny = false;
            try {
                byte b = 0;
                int prevCP = cp0;
                while (b < BLOCKS_NEEDED_FOR_GUESS) {
                    SAMRecord record = bamCodec.decode();
                    if (record == null) {
                        break;
                    }
                    record.getCigar(); // force decoding of CIGAR
                    decodedAny = true;

                    final int cp2 = (int) (bgzf.getFilePointer() >>> 16);
                    if (cp2 != prevCP) {
                        // The compressed position changed so we must be in a new block.
                        assert cp2 > prevCP;
                        prevCP = cp2;
                        ++b;
                    }
                }

                // Running out of records to verify is fine as long as we
                // verified at least something. It should only happen if we
                // couldn't fill the array.
                if (b < BLOCKS_NEEDED_FOR_GUESS) {
                    assert arr.length < MAX_BYTES_READ;
                    if (!decodedAny)
                        continue;
                }
            } catch (SAMFormatException e) {
                continue;
            } catch (FileTruncatedException e) {
                continue;
            } catch (OutOfMemoryError e) {
                continue;
            } catch (IllegalArgumentException e) {
                continue;
            } catch (RuntimeIOException e) {
                continue;
            } catch (RuntimeEOFException e) {
                // This can happen legitimately if the [beg,end) range is too
                // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut
                // off in the middle of a record. In that case, our stream
                // should have hit EOF as well. If we've then verified at least
                // something, go ahead with it and hope for the best.
                if (!decodedAny && this.in.eof())
                    continue;
            }

            return ((beg + cp0) << 16) | up0;
        }
    }
}
private int guessNextBAMPos(long cpVirt, int up, int cSize) {
    // What we're actually searching for is what's at offset [4], not [0]. So
    // skip ahead by 4, thus ensuring that whenever we find a valid [0] it's
    // at position up or greater.
    up += 4;

    try {
        while (up + SHORTEST_POSSIBLE_BAM_RECORD - 4 < cSize) {
            bgzf.seek(cpVirt | up);
            bgzf.read(buf.array(), 0, 8);

            // If the first two checks fail we have what looks like a valid
            // reference sequence ID. Assume we're at offset [4] or [24], i.e.
            // the ID of either this read or its mate, respectively. So check
            // the next integer ([8] or [28]) to make sure it's a 0-based
            // leftmost coordinate.
            final int id = buf.getInt(0);
            final int pos = buf.getInt(4);
            if (id < -1 || id > referenceSequenceCount || pos < -1) {
                ++up;
                continue;
            }

            // Okay, we could be at [4] or [24]. Assuming we're at [4], check
            // that [24] is valid. Assume [4] because we should hit it first:
            // the only time we expect to hit [24] is at the beginning of the
            // split, as part of the first read we should skip.
            bgzf.seek(cpVirt | (up + 20));
            bgzf.read(buf.array(), 0, 8);
            final int nid = buf.getInt(0);
            final int npos = buf.getInt(4);
            if (nid < -1 || nid > referenceSequenceCount || npos < -1) {
                ++up;
                continue;
            }

            // So far so good: [4] and [24] seem okay. Now do something a bit
            // more involved: make sure that [36 + [12]&0xff - 1] == 0: that
            // is, the name of the read should be null terminated.

            // Move up to 0 just to make it less likely that we get confused
            // with offsets. Remember where we should continue from if we
            // reject this up.
            final int nextUP = up + 1;
            up -= 4;

            bgzf.seek(cpVirt | (up + 12));
            bgzf.read(buf.array(), 0, 4);
            final int nameLength = buf.getInt(0) & 0xff;
            if (nameLength < 1) {
                // Names are null-terminated so length must be at least one.
                up = nextUP;
                continue;
            }

            final int nullTerminator = up + 36 + nameLength - 1;
            if (nullTerminator >= cSize) {
                // This BAM record can't fit here. But maybe there's another in
                // the remaining space, so try again.
                up = nextUP;
                continue;
            }

            bgzf.seek(cpVirt | nullTerminator);
            bgzf.read(buf.array(), 0, 1);
            if (buf.get(0) != 0) {
                up = nextUP;
                continue;
            }

            // All of [4], [24], and [36 + [12]&0xff] look good. If [0] is also
            // sensible, that's good enough for us. "Sensible" to us means the
            // following:
            //
            // [0] >= 4*([16]&0xffff) + [20] + ([20]+1)/2 + 4*8 + ([12]&0xff)
            //
            // Note that [0] is "length of the _remainder_ of the alignment
            // record", which is why this uses 4*8 instead of 4*9.
            int zeroMin = 4 * 8 + nameLength;

            bgzf.seek(cpVirt | (up + 16));
            bgzf.read(buf.array(), 0, 8);

            zeroMin += (buf.getInt(0) & 0xffff) * 4;
            zeroMin += buf.getInt(4) + (buf.getInt(4) + 1) / 2;

            bgzf.seek(cpVirt | up);
            bgzf.read(buf.array(), 0, 4);

            if (buf.getInt(0) < zeroMin) {
                up = nextUP;
                continue;
            }
            return up;
        }
    } catch (IOException e) {
        // Fall through: treat I/O trouble as "no record found here".
    }
    return -1;
}
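The fixed byte offsets probed above ([0], [4], [12], [16], [20], [24], [36]) come from the BAM alignment record layout in the SAM/BAM specification. A reference sketch follows; the constant names are illustrative and not part of the code above:

// Layout of an uncompressed BAM alignment record (little-endian), relative
// to the start of the record.
static final int OFF_BLOCK_SIZE  = 0;  // int32: length of the rest of the record
static final int OFF_REF_ID      = 4;  // int32: reference sequence ID, -1 if unmapped
static final int OFF_POS         = 8;  // int32: 0-based leftmost coordinate, -1 if none
static final int OFF_L_READ_NAME = 12; // uint8 read name length incl. NUL (low byte; mapq and bin follow)
static final int OFF_NCIGAR_FLAG = 16; // uint16 n_cigar_op (low half), uint16 FLAG (high half)
static final int OFF_L_SEQ       = 20; // int32: length of the sequence
static final int OFF_NEXT_REF_ID = 24; // int32: mate's reference sequence ID
static final int OFF_NEXT_POS    = 28; // int32: mate's 0-based position
static final int OFF_READ_NAME   = 36; // NUL-terminated read name starts here

// The zeroMin bound above is the minimum legal block_size:
//   32 fixed bytes after block_size + l_read_name + 4*n_cigar_op + l_seq + (l_seq+1)/2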