private GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { GATKSAMRecord unclipped = (GATKSAMRecord) read.clone(); Cigar unclippedCigar = new Cigar(); int matchesCount = 0; for (CigarElement element : read.getCigar().getCigarElements()) { if (element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) matchesCount += element.getLength(); else if (matchesCount > 0) { unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); matchesCount = 0; unclippedCigar.add(element); } else unclippedCigar.add(element); } if (matchesCount > 0) unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); unclipped.setCigar(unclippedCigar); final int newStart = read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar); unclipped.setAlignmentStart(newStart); if (newStart <= 0) { // if the start of the unclipped read occurs before the contig, // we must hard clip away the bases since we cannot represent reads with // negative or 0 alignment start values in the SAMRecord (e.g., 0 means unaligned) return hardClip(unclipped, 0, -newStart); } else { return unclipped; } }
/** * Creates an empty GATKSAMRecord with the read's header, read group and mate information, but * empty (not-null) fields: - Cigar String - Read Bases - Base Qualities * * <p>Use this method if you want to create a new empty GATKSAMRecord based on another * GATKSAMRecord * * @param read a read to copy the header from * @return a read with no bases but safe for the GATK */ public static GATKSAMRecord emptyRead(GATKSAMRecord read) { final GATKSAMRecord emptyRead = new GATKSAMRecord(read.getHeader()); emptyRead.setReferenceIndex(read.getReferenceIndex()); emptyRead.setAlignmentStart(0); emptyRead.setMappingQuality(0); // setting read indexing bin last emptyRead.setFlags(read.getFlags()); emptyRead.setMateReferenceIndex(read.getMateReferenceIndex()); emptyRead.setMateAlignmentStart(read.getMateAlignmentStart()); emptyRead.setInferredInsertSize(read.getInferredInsertSize()); emptyRead.setCigarString(""); emptyRead.setReadBases(new byte[0]); emptyRead.setBaseQualities(new byte[0]); SAMReadGroupRecord samRG = read.getReadGroup(); emptyRead.clearAttributes(); if (samRG != null) { GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(samRG); emptyRead.setReadGroup(rg); } GATKBin.setReadIndexingBin(emptyRead, 0); return emptyRead; }
/** * Hard clip bases from read, from start to stop in base coordinates * * <p>If start == 0, then we will clip from the front of the read, otherwise we clip from the * right. If start == 0 and stop == 10, this would clip out the first 10 bases of the read. * * <p>Note that this function works with reads with negative alignment starts, in order to allow * us to hardClip reads that have had their soft clips reverted and so might have negative * alignment starts * * <p>Works properly with reduced reads and insertion/deletion base qualities * * @param read a non-null read * @param start a start >= 0 and < read.length * @param stop a stop >= 0 and < read.length. * @return a cloned version of read that has been properly trimmed down */ private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { // If the read is unmapped there is no Cigar string and neither should we create a new cigar // string final CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); // the cigar may force a shift left or right (or both) in case we are left with insertions // starting or ending the read after applying the hard clip on start/stop. final int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; final byte[] newBases = new byte[newLength]; final byte[] newQuals = new byte[newLength]; final int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); final GATKSAMRecord hardClippedRead = (GATKSAMRecord) read.clone(); hardClippedRead .resetSoftStartAndEnd(); // reset the cached soft start and end because they may have // changed now that the read was hard clipped. No need to calculate // them now. They'll be lazily calculated on the next call to // getSoftStart()/End() hardClippedRead.setBaseQualities(newQuals); hardClippedRead.setReadBases(newBases); hardClippedRead.setCigar(cigarShift.cigar); if (start == 0) hardClippedRead.setAlignmentStart( read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar)); if (read.hasBaseIndelQualities()) { final byte[] newBaseInsertionQuals = new byte[newLength]; final byte[] newBaseDeletionQuals = new byte[newLength]; System.arraycopy( read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength); System.arraycopy( read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength); hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION); hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION); } return hardClippedRead; }
/** * Clips the bases in read according to this operation's start and stop. Uses the clipping * representation used is the one provided by algorithm argument. * * @param algorithm clipping algorithm to use * @param originalRead the read to be clipped */ public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord originalRead) { GATKSAMRecord read = (GATKSAMRecord) originalRead.clone(); byte[] quals = read.getBaseQualities(); byte[] bases = read.getReadBases(); byte[] newBases = new byte[bases.length]; byte[] newQuals = new byte[quals.length]; switch (algorithm) { // important note: // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0 // because you're not guaranteed to get a pointer to the actual array of bytes in the // GATKSAMRecord case WRITE_NS: for (int i = 0; i < bases.length; i++) { if (i >= start && i <= stop) { newBases[i] = 'N'; } else { newBases[i] = bases[i]; } } read.setReadBases(newBases); break; case WRITE_Q0S: for (int i = 0; i < quals.length; i++) { if (i >= start && i <= stop) { newQuals[i] = 0; } else { newQuals[i] = quals[i]; } } read.setBaseQualities(newQuals); break; case WRITE_NS_Q0S: for (int i = 0; i < bases.length; i++) { if (i >= start && i <= stop) { newQuals[i] = 0; newBases[i] = 'N'; } else { newQuals[i] = quals[i]; newBases[i] = bases[i]; } } read.setBaseQualities(newBases); read.setReadBases(newBases); break; case HARDCLIP_BASES: read = hardClip(read, start, stop); break; case SOFTCLIP_BASES: if (read.getReadUnmappedFlag()) { // we can't process unmapped reads throw new UserException("Read Clipper cannot soft clip unmapped reads"); } // System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); int myStop = stop; if ((stop + 1 - start) == read.getReadLength()) { // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it // alone // Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be // represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); // break; myStop--; // just decrement stop } if (start > 0 && myStop != read.getReadLength() - 1) throw new RuntimeException( String.format( "Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); Cigar oldCigar = read.getCigar(); int scLeft = 0, scRight = read.getReadLength(); if (start == 0) scLeft = myStop + 1; else scRight = start; Cigar newCigar = softClip(oldCigar, scLeft, scRight); read.setCigar(newCigar); int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); int newStart = read.getAlignmentStart() + newClippedStart; read.setAlignmentStart(newStart); break; case REVERT_SOFTCLIPPED_BASES: read = revertSoftClippedBases(read); break; default: throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } return read; }