コード例 #1
0
  /**
   * Ensure that splitting read groups still works when dealing with a sample-split pileup.
   *
   * <p>Two read groups ({@code rg1} in {@code sample1}, {@code rg2} in {@code sample2}) each own
   * two reads. A composite pileup is assembled from one pileup per sample, and the test then checks
   * that asking the composite for each read group returns exactly that group's reads, in order.
   */
  @Test
  public void testSplitBySample() {
    SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1");
    readGroupOne.setSample("sample1");
    SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2");
    readGroupTwo.setSample("sample2");

    SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    header.addReadGroup(readGroupOne);
    header.addReadGroup(readGroupTwo);

    // Reads 1 and 3 belong to rg1; reads 2 and 4 belong to rg2.
    GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, 10);
    read1.setAttribute("RG", readGroupOne.getId());
    GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 1, 10);
    read2.setAttribute("RG", readGroupTwo.getId());
    GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "read3", 0, 1, 10);
    read3.setAttribute("RG", readGroupOne.getId());
    GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header, "read4", 0, 1, 10);
    read4.setAttribute("RG", readGroupTwo.getId());

    // Build one pileup per sample and combine them into a sample-split composite pileup.
    ReadBackedPileupImpl sample1Pileup =
        new ReadBackedPileupImpl(null, Arrays.asList(read1, read3), Arrays.asList(1, 1));
    ReadBackedPileupImpl sample2Pileup =
        new ReadBackedPileupImpl(null, Arrays.asList(read2, read4), Arrays.asList(1, 1));
    Map<String, ReadBackedPileupImpl> sampleToPileupMap =
        new HashMap<String, ReadBackedPileupImpl>();
    sampleToPileupMap.put(readGroupOne.getSample(), sample1Pileup);
    sampleToPileupMap.put(readGroupTwo.getSample(), sample2Pileup);

    ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null, sampleToPileupMap);

    ReadBackedPileup rg1Pileup = compositePileup.getPileupForReadGroup("rg1");
    List<GATKSAMRecord> rg1Reads = rg1Pileup.getReads();

    Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg1");
    Assert.assertEquals(
        rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't");
    Assert.assertEquals(
        rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be in rg1 but isn't");

    ReadBackedPileup rg2Pileup = compositePileup.getPileupForReadGroup("rg2");
    List<GATKSAMRecord> rg2Reads = rg2Pileup.getReads();

    // Check the rg2 list itself; asserting on rg1Reads here would leave rg2's size unverified.
    Assert.assertEquals(rg2Reads.size(), 2, "Wrong number of reads in read group rg2");
    Assert.assertEquals(
        rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't");
    Assert.assertEquals(
        rg2Reads.get(1), read4, "Read " + read4.getReadName() + " should be in rg2 but isn't");
  }
コード例 #2
0
  /**
   * Checks per-sample lookup on a sample-split pileup: a known sample yields a pileup holding
   * exactly its read, while sample names that were never added yield {@code null}.
   */
  @Test
  public void testGetPileupForSample() {
    final String firstSample = "sample1";
    final String secondSample = "sample2";

    // One read group per sample, both registered on the same artificial header.
    final SAMReadGroupRecord firstGroup = new SAMReadGroupRecord("rg1");
    firstGroup.setSample(firstSample);
    final SAMReadGroupRecord secondGroup = new SAMReadGroupRecord("rg2");
    secondGroup.setSample(secondSample);

    final SAMFileHeader samHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
    samHeader.addReadGroup(firstGroup);
    samHeader.addReadGroup(secondGroup);

    final Map<String, ReadBackedPileupImpl> pileupsBySample =
        new HashMap<String, ReadBackedPileupImpl>();

    // A single read tagged with the first read group backs the first sample's pileup.
    final GATKSAMRecord firstRead =
        ArtificialSAMUtils.createArtificialRead(samHeader, "read1", 0, 1, 10);
    firstRead.setAttribute("RG", firstGroup.getId());
    pileupsBySample.put(
        firstSample, new ReadBackedPileupImpl(null, Collections.singletonList(firstRead), 0));

    // Likewise for the second sample.
    final GATKSAMRecord secondRead =
        ArtificialSAMUtils.createArtificialRead(samHeader, "read2", 0, 1, 10);
    secondRead.setAttribute("RG", secondGroup.getId());
    pileupsBySample.put(
        secondSample, new ReadBackedPileupImpl(null, Collections.singletonList(secondRead), 0));

    final ReadBackedPileup composite = new ReadBackedPileupImpl(null, pileupsBySample);

    final ReadBackedPileup secondSamplePileup = composite.getPileupForSample(secondSample);
    Assert.assertEquals(
        secondSamplePileup.getNumberOfElements(),
        1,
        "Sample 2 pileup has wrong number of elements");
    Assert.assertEquals(
        secondSamplePileup.getReads().get(0), secondRead, "Sample 2 pileup has incorrect read");

    // Samples that were never registered must not produce a pileup.
    for (final String absentSample : new String[] {"missing", "not here"}) {
      Assert.assertNull(
          composite.getPileupForSample(absentSample),
          "Pileup for sample '" + absentSample + "' should be null but isn't");
    }
  }
コード例 #3
0
  /**
   * Configures the given read group and record-set builder, then adds {@code NUM_READS} read pairs
   * spread over the first three reference sequences.
   *
   * <p>Start positions come from a {@link Random} seeded with a fixed value, so the generated
   * records are the same on every run. Each start is additionally shifted by {@code ID}.
   *
   * @param ID numeric identifier embedded in every read name and added to every start position
   * @param readGroupId value stored as the read group's platform unit
   * @param readGroupRecord read group to populate and attach to {@code setBuilder}
   * @param sample sample name to set on the read group
   * @param library library name to set on the read group
   * @param header header to install on {@code setBuilder}
   * @param setBuilder builder that receives the read group, the header and the generated pairs
   * @throws IOException declared by the signature; nothing in this body throws it directly
   */
  public void setupTest2(
      final int ID,
      final String readGroupId,
      final SAMReadGroupRecord readGroupRecord,
      final String sample,
      final String library,
      final SAMFileHeader header,
      final SAMRecordSetBuilder setBuilder)
      throws IOException {

    // Fill in the read group before handing it to the builder.
    readGroupRecord.setSample(sample);
    readGroupRecord.setPlatform(platform);
    readGroupRecord.setLibrary(library);
    readGroupRecord.setPlatformUnit(readGroupId);

    setBuilder.setReadGroup(readGroupRecord);
    setBuilder.setUseNmFlag(true);
    setBuilder.setHeader(header);

    // Every read name shares the "<READ_NAME>:<ID>:" prefix and ends with the read's index.
    final String readNamePrefix = READ_NAME + ":" + ID + ":";
    final Random startGenerator = new Random(5);

    // Distribute the pairs over reference indices 0, 1 and 2, roughly a third each.
    for (int readIndex = 0; readIndex < NUM_READS; readIndex++) {
      // Random base position in [1, 800], shifted by ID.
      final int pairStart = startGenerator.nextInt(800) + 1 + ID;

      final int contigIndex;
      if (readIndex <= NUM_READS / 3) {
        contigIndex = 0;
      } else if (readIndex < (NUM_READS - (NUM_READS / 3))) {
        contigIndex = 1;
      } else {
        contigIndex = 2;
      }

      setBuilder.addPair(readNamePrefix + readIndex, contigIndex, pairStart, pairStart + LENGTH);
    }
  }
コード例 #4
0
  /**
   * Labels the kmers of annotated contigs with their inheritance code and writes the result to
   * several outputs.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Build the code-to-index and code-to-name maps; names may be overridden through {@code
   *       KMER_CODE_NAMES}.
   *   <li>If {@code ANN} is non-empty, load one annotation row per contig, derive the kmer size
   *       from the first row, and print two lines per contig to {@code cout} from the {@code
   *       ref0ToCanonicalExact} / {@code ref1ToCanonicalExact} columns.
   *   <li>Add one read group per code name to the header of {@code CONTIGS} and open a BAM writer
   *       on {@code bout}.
   *   <li>For each selected, annotated contig: treat soft clips as matches, write one record per
   *       kmer to the BAM, collect an {@code IGVEntry} per kmer, write a line to {@code beout} for
   *       records that are not secondary or supplementary, and add a per-contig count row to the
   *       table written on {@code sout}.
   *   <li>Write the collected entries to {@code out} as a tab-separated table.
   * </ol>
   *
   * <p>The BAM writer is closed in a {@code finally} block so that it is released even when
   * processing fails part-way through.
   */
  @Override
  public void execute() {
    log.info("Initializing kmer code map...");
    Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
    kmerCodeIndices.put('0', 1);
    kmerCodeIndices.put('A', 3);
    kmerCodeIndices.put('B', 4);
    kmerCodeIndices.put('C', 5);
    kmerCodeIndices.put('_', 6);
    kmerCodeIndices.put('.', 7);
    kmerCodeIndices.put('1', 9);

    // Insertion-ordered so that logging, read groups and table columns share one code order.
    Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
    kmerCodeNames.put('0', "ref0");
    kmerCodeNames.put('A', "repetitive");
    kmerCodeNames.put('B', "both");
    kmerCodeNames.put('C', "lowcoverage");
    kmerCodeNames.put('_', "lowconfidence");
    kmerCodeNames.put('.', "novel");
    kmerCodeNames.put('1', "ref1");

    // Apply user-supplied display names. Only values of existing keys are replaced, so updating
    // the map while iterating its key set is not a structural modification.
    if (KMER_CODE_NAMES != null) {
      for (Character c : kmerCodeNames.keySet()) {
        String cStr = String.valueOf(c);
        if (KMER_CODE_NAMES.containsKey(cStr)) {
          kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
        }
      }
    }

    for (Character c : kmerCodeNames.keySet()) {
      log.info("  {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
    }

    log.info("Loading annotated contigs...");
    Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
    int kmerSize = 0;

    if (ANN.length() > 0) {
      TableReader tr = new TableReader(ANN);
      for (Map<String, String> te : tr) {
        String contigName = te.get("contigName");

        // kmerOrigin holds one code per kmer, so the first row fixes the kmer size:
        // (number of bases) - (number of kmers) + 1.
        if (kmerSize == 0) {
          kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
        }

        annotatedContigs.put(contigName, te);

        // Normalise "NA" and "*:0-0" to "NA:0-0" so that splitting always yields three fields.
        String[] ref0ToCanonicalExact =
            (te.get("ref0ToCanonicalExact").equals("NA")
                        || te.get("ref0ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref0ToCanonicalExact"))
                .split("[:-]");
        String[] ref1ToCanonicalExact =
            (te.get("ref1ToCanonicalExact").equals("NA")
                        || te.get("ref1ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref1ToCanonicalExact"))
                .split("[:-]");

        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref0ToCanonicalExact[0]
                + " "
                + ref0ToCanonicalExact[1]
                + " "
                + ref0ToCanonicalExact[2]
                + " radius1=0.8r");
        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref1ToCanonicalExact[0]
                + " "
                + ref1ToCanonicalExact[1]
                + " "
                + ref1ToCanonicalExact[2]
                + " radius2=0.6r");
      }
    }

    log.info("    contigs: {}", annotatedContigs.size());
    log.info("  kmer size: {}", kmerSize);

    log.info("Computing kmer inheritance information...");

    // Register one read group per code name; per-kmer records are assigned to these below.
    SAMFileHeader sfh = CONTIGS.getFileHeader();
    for (Character c : kmerCodeNames.keySet()) {
      SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
      rgr.setSample(kmerCodeNames.get(c));
      sfh.addReadGroup(rgr);
    }

    SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
    sfwf.setCreateIndex(true);
    SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

    try {
      TableWriter tw = new TableWriter(sout);

      // TreeSet: entries are kept in IGVEntry's own ordering and written out in that order.
      Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
      int numContigs = 0;
      for (SAMRecord contig : CONTIGS) {
        // An absent or empty CONTIG_NAMES selects every contig.
        boolean selected =
            CONTIG_NAMES == null
                || CONTIG_NAMES.isEmpty()
                || CONTIG_NAMES.contains(contig.getReadName());
        if (!selected) {
          continue;
        }

        // Contigs without an annotation row are skipped.
        Map<String, String> te = annotatedContigs.get(contig.getReadName());
        if (te == null) {
          continue;
        }

        String seq = contig.getReadString();

        String annSeq = te.get("seq");
        String kmerOrigin = te.get("kmerOrigin");

        // Map each kmer of the annotated sequence to the code at the same offset in kmerOrigin.
        Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
        for (int i = 0; i < kmerOrigin.length(); i++) {
          CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
          Character code = kmerOrigin.charAt(i);

          kmerCodes.put(kmer, code);
        }

        Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
        for (Character c : kmerCodeNames.keySet()) {
          kmerStats.put(c, 0);
        }

        boolean changed = false;

        // We want to be able to examine soft-clipped regions as well.
        List<CigarElement> ces = new ArrayList<CigarElement>();
        for (CigarElement ce : contig.getCigar().getCigarElements()) {
          if (ce.getOperator().equals(CigarOperator.S)) {
            ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
            changed = true;
          } else {
            ces.add(ce);
          }
        }

        if (changed) {
          CigarElement firstCe = contig.getCigar().getCigarElements().get(0);

          // A leading soft clip that becomes a match moves the alignment start to the left.
          if (firstCe.getOperator().equals(CigarOperator.S)) {
            contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
          }

          contig.setCigar(new Cigar(ces));
        }

        for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
          for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
            // NOTE(review): the strict '<' never emits a kmer that ends exactly on the last base
            // of seq, although kmerCodes holds an entry for that offset when seq and the
            // annotated sequence have equal length. Confirm whether '<=' was intended.
            if (i + kmerSize < seq.length()) {
              String kmerSeq = seq.substring(i, i + kmerSize);
              CortexKmer kmer = new CortexKmer(kmerSeq);

              // Build the record against the same header instance that the writer was given.
              SAMRecord skmer = new SAMRecord(sfh);
              skmer.setReadBases(kmerSeq.getBytes());

              List<CigarElement> cigarElements = new ArrayList<CigarElement>();
              cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
              Cigar cigar = new Cigar(cigarElements);

              skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
              skmer.setReferenceName(contig.getReferenceName());
              skmer.setCigar(cigar);
              skmer.setReadPairedFlag(false);
              skmer.setDuplicateReadFlag(false);
              skmer.setMateNegativeStrandFlag(false);
              skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);

              Character c = kmerCodes.get(kmer);
              String codeName = kmerCodeNames.get(c);

              // Prefer the read group named after the kmer's code; otherwise fall back to the
              // read group whose sample matches the contig's own sample.
              String parentReadGroupId = null;
              String sampleReadGroupId = null;
              for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                if (rgr.getSample().equals(codeName)) {
                  parentReadGroupId = rgr.getReadGroupId();
                }

                if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                  sampleReadGroupId = rgr.getReadGroupId();
                }
              }

              skmer.setAttribute(
                  "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
              skmer.setMappingQuality(99);

              sfw.addAlignment(skmer);

              // NOTE(review): if c is null or not a key of kmerCodeNames, kmerStats.get(c) is
              // null and the unboxing below throws NullPointerException. Confirm whether every
              // kmer is guaranteed to carry a known code.
              kmerStats.put(c, kmerStats.get(c) + 1);

              // The entry start is one less than the alignment start set on the record above.
              IGVEntry igvEntry = new IGVEntry();
              igvEntry.chromosome = contig.getReferenceName();
              igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
              igvEntry.parentageName = kmerCodeNames.get(c);
              igvEntry.parentage = kmerCodeIndices.get(c);
              igvEntries.add(igvEntry);
            }
          }
        }

        if (!contig.isSecondaryOrSupplementary()) {
          beout.println(
              contig.getReferenceName()
                  + "\t"
                  + contig.getAlignmentStart()
                  + "\t"
                  + contig.getAlignmentEnd()
                  + "\t"
                  + contig.getReadName()
                  + "."
                  + contig.getReadGroup().getSample());

          // Report progress about every tenth of the annotated contigs; the size check keeps
          // the modulus divisor non-zero.
          if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
            log.info("  processed {}/{} contigs", numContigs, annotatedContigs.size());
          }
          numContigs++;
        }

        // One table row per contig record: name, sample, then one count column per code.
        Map<String, String> stats = new LinkedHashMap<String, String>();
        stats.put("contigName", contig.getReadName());
        stats.put("sampleName", contig.getReadGroup().getSample());
        for (Character c : kmerCodeNames.keySet()) {
          stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
        }
        tw.addEntry(stats);
      }

      log.info("Writing kmer inheritance information...");
      out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
      for (IGVEntry igvEntry : igvEntries) {
        out.printf(
            "%s\t%d\t%d\t%s\t%d\n",
            igvEntry.chromosome,
            igvEntry.start,
            igvEntry.start + 1,
            igvEntry.parentageName,
            igvEntry.parentage);
      }
    } finally {
      // Close the BAM writer on failure as well as on success.
      sfw.close();
    }
  }