예제 #1
0
  /**
   * Use the given read to build a list of aligned read and reference base information.
   *
   * @param read The read with the alignment information
   * @return read and reference information. For a read without an alignment or cigar units, null is
   *     returned.
   */
  public static List<ReadBaseWithReference> extractReadBases(Read read) {

    // Make sure this read has a valid alignment with Cigar Units
    if (!read.hasAlignment() || (read.getAlignment().getCigarCount() == 0)) {
      return null;
    }

    ImmutableList.Builder<ReadBaseWithReference> bases = ImmutableList.builder();

    String readSeq = read.getAlignedSequence();
    List<Integer> readQual = read.getAlignedQualityList();
    String refSeq = UNINITIALIZED_REFERENCE_SEQUENCE;

    int refPosAbsoluteOffset = 0;
    int readOffset = 0;
    for (CigarUnit unit : read.getAlignment().getCigarList()) {
      switch (unit.getOperation()) {
        case ALIGNMENT_MATCH:
        case SEQUENCE_MISMATCH:
        case SEQUENCE_MATCH:
          for (int i = 0; i < unit.getOperationLength(); i++) {
            String refBase = "";
            if (unit.getOperation().equals(CigarUnit.Operation.SEQUENCE_MATCH)) {
              refBase = readSeq.substring(readOffset, readOffset + 1);
            } else if (!unit.getReferenceSequence().isEmpty()) {
              // try to get the ref sequence from the Cigar unit
              refBase = unit.getReferenceSequence().substring(i, i + 1);
            } else {
              // try to get the ref sequence by fully parsing the MD tag if not already cached
              if (refSeq != null && refSeq.equals(UNINITIALIZED_REFERENCE_SEQUENCE)) {
                refSeq =
                    com.google.cloud.genomics.utils.grpc.ReadUtils
                        .inferReferenceSequenceByParsingMdFlag(read);
              }
              if (refSeq != null) {
                refBase = refSeq.substring(readOffset, readOffset + 1);
              }
            }
            String name = read.getAlignment().getPosition().getReferenceName();
            Matcher m = Pattern.compile("^(chr)?(X|Y|([12]?\\d))$").matcher(name);
            if (m.matches()) {
              name = m.group(m.groupCount() - 1);
            }
            Position refPosition =
                Position.newBuilder()
                    .setReferenceName(name)
                    .setPosition(
                        read.getAlignment().getPosition().getPosition() + refPosAbsoluteOffset)
                    .build();
            bases.add(
                new ReadBaseWithReference(
                    new ReadBaseQuality(
                        readSeq.substring(readOffset, readOffset + 1), readQual.get(readOffset)),
                    refBase,
                    refPosition));
            refPosAbsoluteOffset++;
            readOffset++;
          }
          break;

        case PAD: // padding (silent deletion from padded reference)
        case CLIP_HARD: // hard clipping (clipped sequences NOT present in seq)
          break;

        case CLIP_SOFT: // soft clipping (clipped sequences present in SEQ)
        case INSERT: // insertion to the reference
          readOffset += unit.getOperationLength();
          break;

        case DELETE: // deletion from the reference
        case SKIP: // intron (mRNA-to-genome alignment only)
          refPosAbsoluteOffset += unit.getOperationLength();
          break;

        default:
          throw new IllegalArgumentException("Illegal cigar code: " + unit.getOperation());
      }
    }

    return bases.build();
  }
예제 #2
0
 @Override
 public Boolean apply(Read r) {
   return Pattern.compile("^(chr)?(X|Y|([12]?\\d))$")
       .matcher(r.getAlignment().getPosition().getReferenceName())
       .matches();
 }