/** * Use the given read to build a list of aligned read and reference base information. * * @param read The read with the alignment information * @return read and reference information. For a read without an alignment or cigar units, null is * returned. */ public static List<ReadBaseWithReference> extractReadBases(Read read) { // Make sure this read has a valid alignment with Cigar Units if (!read.hasAlignment() || (read.getAlignment().getCigarCount() == 0)) { return null; } ImmutableList.Builder<ReadBaseWithReference> bases = ImmutableList.builder(); String readSeq = read.getAlignedSequence(); List<Integer> readQual = read.getAlignedQualityList(); String refSeq = UNINITIALIZED_REFERENCE_SEQUENCE; int refPosAbsoluteOffset = 0; int readOffset = 0; for (CigarUnit unit : read.getAlignment().getCigarList()) { switch (unit.getOperation()) { case ALIGNMENT_MATCH: case SEQUENCE_MISMATCH: case SEQUENCE_MATCH: for (int i = 0; i < unit.getOperationLength(); i++) { String refBase = ""; if (unit.getOperation().equals(CigarUnit.Operation.SEQUENCE_MATCH)) { refBase = readSeq.substring(readOffset, readOffset + 1); } else if (!unit.getReferenceSequence().isEmpty()) { // try to get the ref sequence from the Cigar unit refBase = unit.getReferenceSequence().substring(i, i + 1); } else { // try to get the ref sequence by fully parsing the MD tag if not already cached if (refSeq != null && refSeq.equals(UNINITIALIZED_REFERENCE_SEQUENCE)) { refSeq = com.google.cloud.genomics.utils.grpc.ReadUtils .inferReferenceSequenceByParsingMdFlag(read); } if (refSeq != null) { refBase = refSeq.substring(readOffset, readOffset + 1); } } String name = read.getAlignment().getPosition().getReferenceName(); Matcher m = Pattern.compile("^(chr)?(X|Y|([12]?\\d))$").matcher(name); if (m.matches()) { name = m.group(m.groupCount() - 1); } Position refPosition = Position.newBuilder() .setReferenceName(name) .setPosition( read.getAlignment().getPosition().getPosition() + refPosAbsoluteOffset) .build(); 
bases.add( new ReadBaseWithReference( new ReadBaseQuality( readSeq.substring(readOffset, readOffset + 1), readQual.get(readOffset)), refBase, refPosition)); refPosAbsoluteOffset++; readOffset++; } break; case PAD: // padding (silent deletion from padded reference) case CLIP_HARD: // hard clipping (clipped sequences NOT present in seq) break; case CLIP_SOFT: // soft clipping (clipped sequences present in SEQ) case INSERT: // insertion to the reference readOffset += unit.getOperationLength(); break; case DELETE: // deletion from the reference case SKIP: // intron (mRNA-to-genome alignment only) refPosAbsoluteOffset += unit.getOperationLength(); break; default: throw new IllegalArgumentException("Illegal cigar code: " + unit.getOperation()); } } return bases.build(); }
/**
 * Tests whether the reference name of the read's alignment position consists of an optional
 * {@code chr} prefix followed by {@code X}, {@code Y}, or a one- or two-digit number whose tens
 * digit, if present, is 1 or 2.
 *
 * @param r the read whose alignment position supplies the reference name
 * @return {@code true} when the whole reference name has that form, {@code false} otherwise
 */
@Override
public Boolean apply(Read r) {
  final String referenceName = r.getAlignment().getPosition().getReferenceName();
  return Pattern.matches("^(chr)?(X|Y|([12]?\\d))$", referenceName);
}