private void createSmallBam(final File outputFile) {
   final SAMFileWriterFactory factory = new SAMFileWriterFactory();
   factory.setCreateIndex(true);
   factory.setCreateMd5File(true);
   final SAMFileHeader header = new SAMFileHeader();
   // index only created if coordinate sorted
   header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
   header.addSequence(new SAMSequenceRecord("chr1", 123));
   final SAMFileWriter writer = factory.makeBAMWriter(header, false, outputFile);
   fillSmallBam(writer);
   writer.close();
 }
 private void createSmallBamToOutputStream(final OutputStream outputStream, boolean binary) {
   final SAMFileWriterFactory factory = new SAMFileWriterFactory();
   factory.setCreateIndex(false);
   factory.setCreateMd5File(false);
   final SAMFileHeader header = new SAMFileHeader();
   // index only created if coordinate sorted
   header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
   header.addSequence(new SAMSequenceRecord("chr1", 123));
   final SAMFileWriter writer =
       (binary
           ? factory.makeBAMWriter(header, false, outputStream)
           : factory.makeSAMWriter(header, false, outputStream));
   fillSmallBam(writer);
   writer.close();
 }
Esempio n. 3
0
  @Override
  public int doWork(String[] args) {
    boolean compressed = false;
    int maxRecordsInRAM = 100000;
    long count = -1L;
    File fileout = null;
    com.github.lindenb.jvarkit.util.cli.GetOpt opt =
        new com.github.lindenb.jvarkit.util.cli.GetOpt();
    int c;
    while ((c = opt.getopt(args, getGetOptDefault() + "o:n:N:T:b")) != -1) {
      switch (c) {
        case 'b':
          compressed = true;
          break;
        case 'N':
          maxRecordsInRAM = Integer.parseInt(opt.getOptArg());
          break;
        case 'n':
          count = Long.parseLong(opt.getOptArg());
          break;
        case 'o':
          fileout = new File(opt.getOptArg());
          break;
        case 'T':
          this.addTmpDirectory(new File(opt.getOptArg()));
          break;
        default:
          {
            switch (handleOtherOptions(c, opt, null)) {
              case EXIT_FAILURE:
                return -1;
              case EXIT_SUCCESS:
                return 0;
              default:
                break;
            }
          }
      }
    }
    if (count < -1L) // -1 == infinite
    {
      error("Bad count:" + count);
      return -1;
    }
    SamReader samReader = null;
    SAMRecordIterator iter = null;
    SAMFileWriter samWriter = null;
    Random random = new Random();
    CloseableIterator<RandSamRecord> iter2 = null;
    try {
      SamFileReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT);
      if (opt.getOptInd() == args.length) {
        info("Reading from stdin");
        samReader = SamFileReaderFactory.mewInstance().openStdin();
      } else if (opt.getOptInd() + 1 == args.length) {
        File filename = new File(args[opt.getOptInd()]);
        info("Reading from " + filename);
        samReader = SamFileReaderFactory.mewInstance().open(filename);
      } else {
        error("Illegal number of arguments.");
        return -1;
      }
      SAMFileHeader header = samReader.getFileHeader();

      header = header.clone();
      header.setSortOrder(SortOrder.unsorted);
      header.addComment("Processed with " + getProgramName() + " : " + getProgramCommandLine());
      SAMFileWriterFactory sfw = new SAMFileWriterFactory();
      sfw.setCreateIndex(false);
      sfw.setCreateMd5File(false);
      if (fileout == null) {
        if (compressed) {
          samWriter = sfw.makeBAMWriter(header, true, System.out);
        } else {
          samWriter = sfw.makeSAMWriter(header, true, System.out);
        }
      } else {
        samWriter = sfw.makeSAMOrBAMWriter(header, true, fileout);
        this.addTmpDirectory(fileout);
      }
      iter = samReader.iterator();
      SAMSequenceDictionaryProgress progress =
          new SAMSequenceDictionaryProgress(samReader.getFileHeader().getSequenceDictionary());

      SortingCollection<RandSamRecord> sorter =
          SortingCollection.newInstance(
              RandSamRecord.class,
              new RandSamRecordCodec(header),
              new RandSamRecordComparator(),
              maxRecordsInRAM,
              getTmpDirectories());
      sorter.setDestructiveIteration(true);
      while (iter.hasNext()) {
        RandSamRecord r = new RandSamRecord();
        r.rand_index = random.nextInt();
        r.samRecord = progress.watch(iter.next());

        sorter.add(r);
      }
      iter.close();
      iter = null;

      sorter.doneAdding();
      iter2 = sorter.iterator();
      if (count == -1) {
        while (iter2.hasNext()) {
          samWriter.addAlignment(iter2.next().samRecord);
        }
      } else {
        while (iter2.hasNext() && count > 0) {
          samWriter.addAlignment(iter2.next().samRecord);
          count--;
        }
      }
      iter2.close();
      iter2 = null;
      sorter.cleanup();
      progress.finish();
    } catch (Exception e) {
      error(e);
      return -1;
    } finally {
      CloserUtil.close(iter);
      CloserUtil.close(iter2);
      CloserUtil.close(samReader);
      CloserUtil.close(samWriter);
    }
    return 0;
  }
Esempio n. 4
0
  protected void PicardPreprocess(
      Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
      throws InterruptedException, QualityException, IOException, URISyntaxException {
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    // tmp files
    String tmpOut1 = tmpFileBase + "-p1.bam";
    String tmpOut2 = tmpFileBase + "-p2.bam";
    String tmpOut3 = tmpFileBase + "-p3.sam";
    String fCounts = tmpFileBase + "-features.count";
    String tmpMetrics = tmpFileBase + "-p3-metrics.txt";
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    if (!inputIsBam) {
      outHeader.addReadGroup(bamrg);
    }
    SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1));

    long startTime = System.currentTimeMillis();

    int count = 0;
    SAMRecord sam;
    while (input.hasNext()) {
      sam = input.next();
      writer.addAlignment(sam);
      count++;
    }
    int reads = input.getCount();
    writer.close();

    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
    long estimatedTime = System.currentTimeMillis() - startTime;
    context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime);
    Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000);

    Logger.DEBUG("clean sam");
    context.setStatus("clean sam");
    tools.runCleanSam(tmpOut1, tmpOut2);
    Logger.DEBUG("mark duplicates");
    context.setStatus("mark duplicates");
    tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics);

    if (gff != null) {
      // tmpOut3 is sam for htseq count!
      Logger.DEBUG("featureCounts");
      context.setStatus("featureCounts");
      tools.runFeatureCounts(gff, tmpOut3, fCounts, threads);
      HalvadeFileUtils.uploadFileToHDFS(
          context,
          FileSystem.get(new URI(outputdir), context.getConfiguration()),
          fCounts,
          outputdir + context.getTaskAttemptID().toString() + ".count");
    }

    if (!inputIsBam) {
      Logger.DEBUG("add read-group");
      context.setStatus("add read-group");
      tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM);
    } else {
      context.setStatus("convert SAM to BAM");
      Logger.DEBUG("convert SAM to BAM");
      tools.callSAMToBAM(tmpOut3, output, threads);
    }

    Logger.DEBUG("build bam index");
    context.setStatus("build bam index");
    tools.runBuildBamIndex(output);

    estimatedTime = System.currentTimeMillis() - startTime;
    Logger.DEBUG("estimated time: " + estimatedTime / 1000);

    // remove all temporary files now!
    HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
  }
  @Override
  public void execute() {
    log.info("Initializing kmer code map...");
    Map<Character, Integer> kmerCodeIndices = new HashMap<Character, Integer>();
    kmerCodeIndices.put('0', 1);
    kmerCodeIndices.put('A', 3);
    kmerCodeIndices.put('B', 4);
    kmerCodeIndices.put('C', 5);
    kmerCodeIndices.put('_', 6);
    kmerCodeIndices.put('.', 7);
    kmerCodeIndices.put('1', 9);

    Map<Character, String> kmerCodeNames = new LinkedHashMap<Character, String>();
    kmerCodeNames.put('0', "ref0");
    kmerCodeNames.put('A', "repetitive");
    kmerCodeNames.put('B', "both");
    kmerCodeNames.put('C', "lowcoverage");
    kmerCodeNames.put('_', "lowconfidence");
    kmerCodeNames.put('.', "novel");
    kmerCodeNames.put('1', "ref1");

    if (KMER_CODE_NAMES != null) {
      for (Character c : kmerCodeNames.keySet()) {
        String cStr = String.valueOf(c);
        if (KMER_CODE_NAMES.containsKey(cStr)) {
          kmerCodeNames.put(c, KMER_CODE_NAMES.get(cStr));
        }
      }
    }

    for (Character c : kmerCodeNames.keySet()) {
      log.info("  {} {}: {}", c, kmerCodeIndices.get(c), kmerCodeNames.get(c));
    }

    log.info("Loading annotated contigs...");
    Map<String, Map<String, String>> annotatedContigs = new HashMap<String, Map<String, String>>();
    int kmerSize = 0;

    if (ANN.length() > 0) {
      TableReader tr = new TableReader(ANN);
      for (Map<String, String> te : tr) {
        String contigName = te.get("contigName");

        if (kmerSize == 0) {
          kmerSize = te.get("seq").length() - te.get("kmerOrigin").length() + 1;
        }

        annotatedContigs.put(contigName, te);

        String[] ref0ToCanonicalExact =
            (te.get("ref0ToCanonicalExact").equals("NA")
                        || te.get("ref0ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref0ToCanonicalExact"))
                .split("[:-]");
        String[] ref1ToCanonicalExact =
            (te.get("ref1ToCanonicalExact").equals("NA")
                        || te.get("ref1ToCanonicalExact").equals("*:0-0")
                    ? "NA:0-0"
                    : te.get("ref1ToCanonicalExact"))
                .split("[:-]");

        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref0ToCanonicalExact[0]
                + " "
                + ref0ToCanonicalExact[1]
                + " "
                + ref0ToCanonicalExact[2]
                + " radius1=0.8r");
        cout.println(
            te.get("sampleName")
                + "_"
                + te.get("accession")
                + "_"
                + contigName
                + " "
                + ref1ToCanonicalExact[0]
                + " "
                + ref1ToCanonicalExact[1]
                + " "
                + ref1ToCanonicalExact[2]
                + " radius2=0.6r");
      }
    }

    log.info("    contigs: {}", annotatedContigs.size());
    log.info("  kmer size: {}", kmerSize);

    log.info("Computing kmer inheritance information...");

    SAMFileHeader sfh = CONTIGS.getFileHeader();
    for (Character c : kmerCodeNames.keySet()) {
      SAMReadGroupRecord rgr = new SAMReadGroupRecord(kmerCodeNames.get(c));
      rgr.setSample(kmerCodeNames.get(c));
      sfh.addReadGroup(rgr);
    }

    SAMFileWriterFactory sfwf = new SAMFileWriterFactory();
    sfwf.setCreateIndex(true);
    SAMFileWriter sfw = sfwf.makeBAMWriter(sfh, false, bout);

    TableWriter tw = new TableWriter(sout);

    Set<IGVEntry> igvEntries = new TreeSet<IGVEntry>();
    int numContigs = 0;
    for (SAMRecord contig : CONTIGS) {
      if (CONTIG_NAMES == null
          || CONTIG_NAMES.isEmpty()
          || CONTIG_NAMES.contains(contig.getReadName())) {
        Map<String, String> te = annotatedContigs.get(contig.getReadName());

        if (annotatedContigs.containsKey(contig.getReadName())) {
          String seq = contig.getReadString();

          // log.debug("  te: {}", te);

          String annSeq = te.get("seq");
          String kmerOrigin = te.get("kmerOrigin");

          Map<CortexKmer, Character> kmerCodes = new HashMap<CortexKmer, Character>();
          for (int i = 0; i < kmerOrigin.length(); i++) {
            CortexKmer kmer = new CortexKmer(annSeq.substring(i, i + kmerSize));
            Character code = kmerOrigin.charAt(i);

            kmerCodes.put(kmer, code);
          }

          Map<Character, Integer> kmerStats = new HashMap<Character, Integer>();
          for (Character c : kmerCodeNames.keySet()) {
            kmerStats.put(c, 0);
          }

          boolean changed = false;

          // We want to be able to examine soft-clipped regions as well.
          List<CigarElement> ces = new ArrayList<CigarElement>();
          for (CigarElement ce : contig.getCigar().getCigarElements()) {
            if (ce.getOperator().equals(CigarOperator.S)) {
              ces.add(new CigarElement(ce.getLength(), CigarOperator.M));
              changed = true;
            } else {
              ces.add(ce);
            }
          }

          if (changed) {
            CigarElement firstCe = contig.getCigar().getCigarElements().get(0);

            if (firstCe.getOperator().equals(CigarOperator.S)) {
              contig.setAlignmentStart(contig.getAlignmentStart() - firstCe.getLength());
            }

            contig.setCigar(new Cigar(ces));
          }

          for (AlignmentBlock ab : contig.getAlignmentBlocks()) {
            for (int i = ab.getReadStart() - 1; i < ab.getReadStart() + ab.getLength(); i++) {
              if (i + kmerSize < seq.length()) {
                CortexKmer kmer = new CortexKmer(seq.substring(i, i + kmerSize));

                SAMRecord skmer = new SAMRecord(CONTIGS.getFileHeader());
                skmer.setReadBases(seq.substring(i, i + kmerSize).getBytes());

                List<CigarElement> cigarElements = new ArrayList<CigarElement>();
                cigarElements.add(new CigarElement(kmerSize, CigarOperator.M));
                Cigar cigar = new Cigar(cigarElements);

                skmer.setReadName(contig.getReadName() + "." + kmer.getKmerAsString());
                skmer.setReferenceName(contig.getReferenceName());
                skmer.setCigar(cigar);
                skmer.setReadPairedFlag(false);
                skmer.setDuplicateReadFlag(false);
                skmer.setMateNegativeStrandFlag(false);
                skmer.setAlignmentStart(ab.getReferenceStart() - ab.getReadStart() + 1 + i);
                skmer.setAttribute("RG", "none");
                skmer.setMappingQuality(0);

                Character c = kmerCodes.get(kmer);
                String codeName = kmerCodeNames.get(c);

                String parentReadGroupId = null;
                String sampleReadGroupId = null;
                for (SAMReadGroupRecord rgr : sfh.getReadGroups()) {
                  if (rgr.getSample().equals(codeName)) {
                    parentReadGroupId = rgr.getReadGroupId();
                  }

                  if (rgr.getSample().equals(contig.getReadGroup().getSample())) {
                    sampleReadGroupId = rgr.getReadGroupId();
                  }
                }

                skmer.setAttribute(
                    "RG", parentReadGroupId != null ? parentReadGroupId : sampleReadGroupId);
                skmer.setMappingQuality(99);

                sfw.addAlignment(skmer);

                kmerStats.put(c, kmerStats.get(c) + 1);

                IGVEntry igvEntry = new IGVEntry();
                igvEntry.chromosome = contig.getReferenceName();
                igvEntry.start = ab.getReferenceStart() - ab.getReadStart() + i;
                igvEntry.parentageName = kmerCodeNames.get(c);
                igvEntry.parentage = kmerCodeIndices.get(c);
                igvEntries.add(igvEntry);
              }
            }
          }

          if (!contig.isSecondaryOrSupplementary()) {
            beout.println(
                contig.getReferenceName()
                    + "\t"
                    + contig.getAlignmentStart()
                    + "\t"
                    + contig.getAlignmentEnd()
                    + "\t"
                    + contig.getReadName()
                    + "."
                    + contig.getReadGroup().getSample());

            if (annotatedContigs.size() > 10 && numContigs % (annotatedContigs.size() / 10) == 0) {
              log.info("  processed {}/{} contigs", numContigs, annotatedContigs.size());
            }
            numContigs++;
          }

          Map<String, String> stats = new LinkedHashMap<String, String>();
          stats.put("contigName", contig.getReadName());
          stats.put("sampleName", contig.getReadGroup().getSample());
          for (Character c : kmerCodeNames.keySet()) {
            stats.put(kmerCodeNames.get(c), String.valueOf(kmerStats.get(c)));
          }
          tw.addEntry(stats);
        }
      }
    }

    log.info("Writing kmer inheritance information...");
    out.printf("%s\t%s\t%s\t%s\t%s\n", "Chromosome", "Start", "End", "Feature", "Parentage");
    for (IGVEntry igvEntry : igvEntries) {
      out.printf(
          "%s\t%d\t%d\t%s\t%d\n",
          igvEntry.chromosome,
          igvEntry.start,
          igvEntry.start + 1,
          igvEntry.parentageName,
          igvEntry.parentage);
    }

    sfw.close();
  }