Example #1
  @Test
  public void testBamIntegers() throws IOException {
    final List<String> errorMessages = new ArrayList<String>();
    final SamReader bamReader = SamReaderFactory.makeDefault().open(BAM_INPUT);
    final File bamOutput = File.createTempFile("test", ".bam");
    final File samOutput = File.createTempFile("test", ".sam");
    final SAMFileWriter samWriter =
        new SAMFileWriterFactory().makeWriter(bamReader.getFileHeader(), true, samOutput, null);
    final SAMFileWriter bamWriter =
        new SAMFileWriterFactory().makeWriter(bamReader.getFileHeader(), true, bamOutput, null);
    final SAMRecordIterator iterator = bamReader.iterator();
    while (iterator.hasNext()) {
      try {
        final SAMRecord rec = iterator.next();
        samWriter.addAlignment(rec);
        bamWriter.addAlignment(rec);
      } catch (final Throwable e) {
        System.out.println(e.getMessage());
        errorMessages.add(e.getMessage());
      }
    }

    CloserUtil.close(bamReader);
    samWriter.close();
    bamWriter.close();
    Assert.assertEquals(errorMessages.size(), 0);
    bamOutput.deleteOnExit();
    samOutput.deleteOnExit();
  }
  @Test(
      description =
          "Read and then write SAM to verify header attribute ordering does not change depending on JVM version")
  public void samRoundTrip() throws Exception {
    final File input = new File(TEST_DATA_DIR, "roundtrip.sam");

    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    final File outputFile = File.createTempFile("roundtrip-out", ".sam");
    outputFile.delete();
    outputFile.deleteOnExit();
    FileOutputStream os = new FileOutputStream(outputFile);
    final SAMFileWriterFactory factory = new SAMFileWriterFactory();
    final SAMFileWriter writer = factory.makeSAMWriter(reader.getFileHeader(), false, os);
    for (SAMRecord rec : reader) {
      writer.addAlignment(rec);
    }
    writer.close();
    os.close();

    InputStream is = new FileInputStream(input);
    String originalsam = IOUtil.readFully(is);
    is.close();

    is = new FileInputStream(outputFile);
    String writtensam = IOUtil.readFully(is);
    is.close();

    Assert.assertEquals(writtensam, originalsam);
  }
  @Test(description = "Write SAM records with null SAMFileHeader")
  public void samNullHeaderRoundTrip() throws Exception {
    final File input = new File(TEST_DATA_DIR, "roundtrip.sam");

    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    final File outputFile = File.createTempFile("nullheader-out", ".sam");
    outputFile.delete();
    outputFile.deleteOnExit();
    FileOutputStream os = new FileOutputStream(outputFile);
    final SAMFileWriterFactory factory = new SAMFileWriterFactory();
    final SAMFileWriter writer = factory.makeSAMWriter(reader.getFileHeader(), false, os);
    for (SAMRecord rec : reader) {
      rec.setHeader(null);
      writer.addAlignment(rec);
    }
    writer.close();
    os.close();

    InputStream is = new FileInputStream(input);
    String originalsam = IOUtil.readFully(is);
    is.close();

    is = new FileInputStream(outputFile);
    String writtensam = IOUtil.readFully(is);
    is.close();

    Assert.assertEquals(writtensam, originalsam);
  }
  @Test(dataProvider = "data")
  public void testProgramGroupAndReadGroupMerge(File[] inputFiles, File expectedOutputFile)
      throws IOException {

    // Read the expected output into a string, preserving line breaks.
    final StringBuilder expectedBuilder = new StringBuilder();
    final BufferedReader reader = new BufferedReader(new FileReader(expectedOutputFile));
    String line;
    while ((line = reader.readLine()) != null) {
      expectedBuilder.append(line).append('\n');
    }
    reader.close();
    final String expected_output = expectedBuilder.toString();

    final List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
    for (final File inFile : inputFiles) {
      IOUtil.assertFileIsReadable(inFile);
      final SAMFileReader in = new SAMFileReader(inFile);
      // We are now checking for zero-length reads, so suppress complaint about that.
      in.setValidationStringency(ValidationStringency.SILENT);
      readers.add(in);
      headers.add(in.getFileHeader());
    }
    final MergingSamRecordIterator iterator;

    final SamFileHeaderMerger headerMerger =
        new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true);
    iterator = new MergingSamRecordIterator(headerMerger, readers, false);

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    SAMFileWriter writer =
        new SAMFileWriterFactory().makeSAMWriter(headerMerger.getMergedHeader(), true, baos);
    while (iterator.hasNext()) {
      writer.addAlignment(iterator.next());
    }
    writer.close();

    String actual_output = StringUtil.bytesToString(baos.toByteArray());

    List<String> actual = Arrays.asList(actual_output.split("\\n"));
    List<String> expected = Arrays.asList(expected_output.split("\\n"));
    for (int i = 0; i < expected.size(); i++) {
      if (expected.get(i).startsWith("@")) {
        Assert.assertTrue(headersEquivalent(actual.get(i), expected.get(i)));
      } else {
        // Split SAM records into fields on runs of whitespace; splitting on "\\s*"
        // would split between every character because it also matches the empty string.
        List<String> expectedSamParts = Arrays.asList(expected.get(i).split("\\s+"));
        List<String> actualSamParts = Arrays.asList(actual.get(i).split("\\s+"));
        for (String exp : expectedSamParts) {
          Assert.assertTrue(actualSamParts.contains(exp));
        }
        for (String act : actualSamParts) {
          Assert.assertTrue(expectedSamParts.contains(act));
        }
      }
    }
  }
 private void createSmallBam(final File outputFile) {
   final SAMFileWriterFactory factory = new SAMFileWriterFactory();
   factory.setCreateIndex(true);
   factory.setCreateMd5File(true);
   final SAMFileHeader header = new SAMFileHeader();
   // index only created if coordinate sorted
   header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
   header.addSequence(new SAMSequenceRecord("chr1", 123));
   final SAMFileWriter writer = factory.makeBAMWriter(header, false, outputFile);
   fillSmallBam(writer);
   writer.close();
 }
 private void createSmallBamToOutputStream(final OutputStream outputStream, boolean binary) {
   final SAMFileWriterFactory factory = new SAMFileWriterFactory();
   factory.setCreateIndex(false);
   factory.setCreateMd5File(false);
   final SAMFileHeader header = new SAMFileHeader();
   // index only created if coordinate sorted
   header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
   header.addSequence(new SAMSequenceRecord("chr1", 123));
   final SAMFileWriter writer =
       (binary
           ? factory.makeBAMWriter(header, false, outputStream)
           : factory.makeSAMWriter(header, false, outputStream));
   fillSmallBam(writer);
   writer.close();
 }
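
Both helpers above delegate to a fillSmallBam method that is not included in this snippet. The following is only a minimal sketch of what such a helper could look like, assuming htsjdk's SAMRecordSetBuilder test utility is available; the method name comes from the calls above, everything else is an assumption.

  // Hypothetical sketch of the fillSmallBam helper referenced above; it is not
  // part of the original snippet. SAMRecordSetBuilder is htsjdk's test utility
  // for fabricating records (import htsjdk.samtools.SAMRecordSetBuilder).
  private void fillSmallBam(final SAMFileWriter writer) {
    final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();
    // One unmapped fragment is enough to give the writer something to emit.
    builder.addUnmappedFragment("record1");
    for (final SAMRecord rec : builder.getRecords()) {
      writer.addAlignment(rec);
    }
  }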
Example #7
  private void createBAM(final List<GATKSAMRecord> reads) throws IOException {
    testBAM = File.createTempFile("TraverseActiveRegionsUnitTest", ".bam");
    testBAM.deleteOnExit();

    SAMFileWriter out =
        new SAMFileWriterFactory()
            .setCreateIndex(true)
            .makeBAMWriter(reads.get(0).getHeader(), true, testBAM);
    for (GATKSAMRecord read : reads) {
      out.addAlignment(read);
    }
    out.close();

    new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit();
    new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit();
  }
  protected int doWork() {
    IoUtil.assertFileIsReadable(INPUT);
    IoUtil.assertFileIsWritable(OUTPUT);

    final SAMFileReader in = new SAMFileReader(INPUT);

    // create the read group we'll be using
    final SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID);
    rg.setLibrary(RGLB);
    rg.setPlatform(RGPL);
    rg.setSample(RGSM);
    rg.setPlatformUnit(RGPU);
    if (RGCN != null) rg.setSequencingCenter(RGCN);
    if (RGDS != null) rg.setDescription(RGDS);
    if (RGDT != null) rg.setRunDate(RGDT);

    log.info(
        String.format(
            "Created read group ID=%s PL=%s LB=%s SM=%s%n",
            rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample()));

    // create the new header and output file
    final SAMFileHeader inHeader = in.getFileHeader();
    final SAMFileHeader outHeader = inHeader.clone();
    outHeader.setReadGroups(Arrays.asList(rg));
    if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER);

    final SAMFileWriter outWriter =
        new SAMFileWriterFactory()
            .makeSAMOrBAMWriter(
                outHeader, outHeader.getSortOrder() == inHeader.getSortOrder(), OUTPUT);

    final ProgressLogger progress = new ProgressLogger(log);
    for (final SAMRecord read : in) {
      read.setAttribute(SAMTag.RG.name(), RGID);
      outWriter.addAlignment(read);
      progress.record(read);
    }

    // cleanup
    in.close();
    outWriter.close();
    return 0;
  }
Example #9
 public void close() {
   for (SAMFileWriter w : writerMap.values()) w.close();
 }
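
The close() method above assumes a writerMap field holding one open SAMFileWriter per key (for example, per sample or read group). A minimal sketch of how such a map might be populated follows; the field and method names are hypothetical and not part of the original example.

 // Hypothetical sketch (field and method names are assumptions, not from the
 // original example): lazily create one BAM writer per sample name.
 private final Map<String, SAMFileWriter> writerMap = new HashMap<String, SAMFileWriter>();

 private SAMFileWriter writerFor(final String sample, final SAMFileHeader header, final File outDir) {
   SAMFileWriter writer = writerMap.get(sample);
   if (writer == null) {
     // presorted = false lets the factory sort records according to the header's sort order.
     writer = new SAMFileWriterFactory()
         .makeBAMWriter(header, false, new File(outDir, sample + ".bam"));
     writerMap.put(sample, writer);
   }
   return writer;
 }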
Example #10
  protected void PicardPreprocess(
      Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
      throws InterruptedException, QualityException, IOException, URISyntaxException {
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    // tmp files
    String tmpOut1 = tmpFileBase + "-p1.bam";
    String tmpOut2 = tmpFileBase + "-p2.bam";
    String tmpOut3 = tmpFileBase + "-p3.sam";
    String fCounts = tmpFileBase + "-features.count";
    String tmpMetrics = tmpFileBase + "-p3-metrics.txt";
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    if (!inputIsBam) {
      outHeader.addReadGroup(bamrg);
    }
    SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1));

    long startTime = System.currentTimeMillis();

    int count = 0;
    SAMRecord sam;
    while (input.hasNext()) {
      sam = input.next();
      writer.addAlignment(sam);
      count++;
    }
    int reads = input.getCount();
    writer.close();

    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
    long estimatedTime = System.currentTimeMillis() - startTime;
    context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime);
    Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000);

    Logger.DEBUG("clean sam");
    context.setStatus("clean sam");
    tools.runCleanSam(tmpOut1, tmpOut2);
    Logger.DEBUG("mark duplicates");
    context.setStatus("mark duplicates");
    tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics);

    if (gff != null) {
      // tmpOut3 is sam for htseq count!
      Logger.DEBUG("featureCounts");
      context.setStatus("featureCounts");
      tools.runFeatureCounts(gff, tmpOut3, fCounts, threads);
      HalvadeFileUtils.uploadFileToHDFS(
          context,
          FileSystem.get(new URI(outputdir), context.getConfiguration()),
          fCounts,
          outputdir + context.getTaskAttemptID().toString() + ".count");
    }

    if (!inputIsBam) {
      Logger.DEBUG("add read-group");
      context.setStatus("add read-group");
      tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM);
    } else {
      context.setStatus("convert SAM to BAM");
      Logger.DEBUG("convert SAM to BAM");
      tools.callSAMToBAM(tmpOut3, output, threads);
    }

    Logger.DEBUG("build bam index");
    context.setStatus("build bam index");
    tools.runBuildBamIndex(output);

    estimatedTime = System.currentTimeMillis() - startTime;
    Logger.DEBUG("estimated time: " + estimatedTime / 1000);

    // remove all temporary files now!
    HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
  }
  @Override
  public void execute() {
    log.info("Initializing kmer code map...");
    Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
    kmerCodeIndices.put('0', 1);
    kmerCodeIndices.put('A', 3);
    kmerCodeIndices.put('B', 4);
    kmerCodeIndices.put('C', 5);
    kmerCodeIndices.put('_', 6);
    kmerCodeIndices.put('.', 7);
    kmerCodeIndices.put('1', 9);

    Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
    kmerCodeNames.put('0', "ref0");
    kmerCodeNames.put('A', "repetitive");
    kmerCodeNames.put('B', "both");
    kmerCodeNames.put('C', "lowcoverage");
    kmerCodeNames.put('_', "lowconfidence");
    kmerCodeNames.put('.', "novel");
    kmerCodeNames.put('1', "ref1");

    if (KMER_CODE_NAMES != null) {
      for (Character c : kmerCodeNames.keySet()) {
        String cStr = String.valueOf(c);
        if (KMER_CODE_NAMES.containsKey(cStr)) {
          kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
        }
      }
    }

    for (Character c : kmerCodeNames.keySet()) {
      log.info("  {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
    }

    log.info("Loading annotated contigs...");
    Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
    int kmerSize = 0;

    if (ANN.length() > 0) {
      TableReader tr = new TableReader(ANN);
      for (Map<String, String> te : tr) {
        String contigName = te.get("contigName");

        if (kmerSize == 0) {
          kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
        }

        annotatedContigs.put(contigName, te);

        String[] ref0ToCanonicalExact =
            (te.get("ref0ToCanonicalExact").equals("NA")
                        || te.get("ref0ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref0ToCanonicalExact"))
                .split("[:-]");
        String[] ref1ToCanonicalExact =
            (te.get("ref1ToCanonicalExact").equals("NA")
                        || te.get("ref1ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref1ToCanonicalExact"))
                .split("[:-]");

        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref0ToCanonicalExact[0]
                + " "
                + ref0ToCanonicalExact[1]
                + " "
                + ref0ToCanonicalExact[2]
                + " radius1=0.8r");
        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref1ToCanonicalExact[0]
                + " "
                + ref1ToCanonicalExact[1]
                + " "
                + ref1ToCanonicalExact[2]
                + " radius2=0.6r");
      }
    }

    log.info("    contigs: {}", annotatedContigs.size());
    log.info("  kmer size: {}", kmerSize);

    log.info("Computing kmer inheritance information...");

    SAMFileHeader sfh = CONTIGS.getFileHeader();
    for (Character c : kmerCodeNames.keySet()) {
      SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
      rgr.setSample(kmerCodeNames.get(c));
      sfh.addReadGroup(rgr);
    }

    SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
    sfwf.setCreateIndex(true);
    SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

    TableWriter tw = new TableWriter(sout);

    Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
    int numContigs = 0;
    for (SAMRecord contig : CONTIGS) {
      if (CONTIG_NAMES == null
          || CONTIG_NAMES.isEmpty()
          || CONTIG_NAMES.contains(contig.getReadName())) {
        Map<String, String> te = annotatedContigs.get(contig.getReadName());

        if (annotatedContigs.containsKey(contig.getReadName())) {
          String seq = contig.getReadString();

          // log.debug("  te: {}", te);

          String annSeq = te.get("seq");
          String kmerOrigin = te.get("kmerOrigin");

          Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
          for (int i = 0; i < kmerOrigin.length(); i++) {
            CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
            Character code = kmerOrigin.charAt(i);

            kmerCodes.put(kmer, code);
          }

          Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
          for (Character c : kmerCodeNames.keySet()) {
            kmerStats.put(c, 0);
          }

          boolean changed = false;

          // We want to be able to examine soft-clipped regions as well.
          List<CigarElement> ces = new ArrayList<CigarElement>();
          for (CigarElement ce : contig.getCigar().getCigarElements()) {
            if (ce.getOperator().equals(CigarOperator.S)) {
              ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
              changed = true;
            } else {
              ces.add(ce);
            }
          }

          if (changed) {
            CigarElement firstCe = contig.getCigar().getCigarElements().get(0);

            if (firstCe.getOperator().equals(CigarOperator.S)) {
              contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
            }

            contig.setCigar(new Cigar(ces));
          }

          // Emit one kmer-sized SAMRecord per read position covered by each alignment block.
          for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
            for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
              if (i + kmerSize < seq.length()) {
                CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

                SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
                skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());

                List<CigarElement> cigarElements = new ArrayList<CigarElement>();
                cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
                Cigar cigar = new Cigar(cigarElements);

                skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
                skmer.setReferenceName(contig.getReferenceName());
                skmer.setCigar(cigar);
                skmer.setReadPairedFlag(false);
                skmer.setDuplicateReadFlag(false);
                skmer.setMateNegativeStrandFlag(false);
                skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
                // Placeholder values; both are overwritten below once the owning
                // read group has been resolved.
                skmer.setAttribute("RG", "none");
                skmer.setMappingQuality(0);

                Character c = kmerCodes.get(kmer);
                String codeName = kmerCodeNames.get(c);

                String parentReadGroupId = null;
                String sampleReadGroupId = null;
                for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                  if (rgr.getSample().equals(codeName)) {
                    parentReadGroupId = rgr.getReadGroupId();
                  }

                  if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                    sampleReadGroupId = rgr.getReadGroupId();
                  }
                }

                skmer.setAttribute(
                    "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
                skmer.setMappingQuality(99);

                sfw.addAlignment(skmer);

                kmerStats.put(c, kmerStats.get(c) + 1);

                IGVEntry igvEntry = new IGVEntry();
                igvEntry.chromosome = contig.getReferenceName();
                igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
                igvEntry.parentageName = kmerCodeNames.get(c);
                igvEntry.parentage = kmerCodeIndices.get(c);
                igvEntries.add(igvEntry);
              }
            }
          }

          if (!contig.isSecondaryOrSupplementary()) {
            beout.println(
                contig.getReferenceName()
                    + "\t"
                    + contig.getAlignmentStart()
                    + "\t"
                    + contig.getAlignmentEnd()
                    + "\t"
                    + contig.getReadName()
                    + "."
                    + contig.getReadGroup().getSample());

            if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
              log.info("  processed {}/{} contigs", numContigs, annotatedContigs.size());
            }
            numContigs++;
          }

          Map<String, String> stats = new LinkedHashMap<String, String>();
          stats.put("contigName", contig.getReadName());
          stats.put("sampleName", contig.getReadGroup().getSample());
          for (Character c : kmerCodeNames.keySet()) {
            stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
          }
          tw.addEntry(stats);
        }
      }
    }

    log.info("Writing kmer inheritance information...");
    out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
    for (IGVEntry igvEntry : igvEntries) {
      out.printf(
          "%s\t%d\t%d\t%s\t%d\n",
          igvEntry.chromosome,
          igvEntry.start,
          igvEntry.start + 1,
          igvEntry.parentageName,
          igvEntry.parentage);
    }

    sfw.close();
  }