/**
 * Round-trip check of {@code HuffmanIntegerEncoding}: read-name lengths sampled from a BAM file
 * are fed to a {@code HuffmanParamsCalculator}, written through the codec built from the
 * resulting parameters, read back, and compared with the original lengths.
 *
 * <p>At most 100000 records are sampled from a hard-coded local file. Only the names that were
 * actually read are encoded and decoded, so an input with fewer records than the sample size is
 * handled without touching unfilled array slots.
 *
 * @throws IOException declared for the stream and codec calls made here
 */
@Test
public void test2() throws IOException {
    SAMFileReader r =
        new SAMFileReader(
            new File("c:/temp/HG00096.mapped.illumina.mosaik.GBR.exome.20110411.chr20.bam"));
    CompressionHeaderFactory.HuffmanParamsCalculator c = new HuffmanParamsCalculator();
    String[] names = new String[100000];
    // Number of slots of 'names' that hold a real read name.
    int nameCount = 0;
    try {
        SAMRecordIterator iterator = r.iterator();
        try {
            while (nameCount < names.length && iterator.hasNext()) {
                names[nameCount] = iterator.next().getReadName();
                c.add(names[nameCount].length());
                nameCount++;
            }
        } finally {
            iterator.close();
        }
    } finally {
        r.close();
    }

    c.calculate();
    int[] values = c.values();
    int[] lens = c.bitLens();
    System.out.println(Arrays.toString(values));
    System.out.println(Arrays.toString(lens));

    // Serialise the parameters and rebuild the encoding from bytes, so the codec under test is
    // the one produced by fromByteArray rather than by the calculator directly.
    EncodingParams params = HuffmanIntegerEncoding.toParam(values, lens);
    HuffmanIntegerEncoding e = new HuffmanIntegerEncoding();
    e.fromByteArray(params.params);

    BitCodec<Integer> codec = e.buildCodec(null, null);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DefaultBitOutputStream bos = new DefaultBitOutputStream(baos);
    for (int i = 0; i < nameCount; i++) {
        codec.write(bos, names[i].length());
    }
    bos.close();

    // A fresh codec instance is used for decoding.
    codec = e.buildCodec(null, null);
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    DefaultBitInputStream bis = new DefaultBitInputStream(bais);
    for (int i = 0; i < nameCount; i++) {
        int v = codec.read(bis);
        if (v != names[i].length()) {
            fail("Mismatch: " + v + " vs " + names[i].length());
        }
    }
}
/**
 * Builds a {@code BamStats01Report} from the reader's header and feeds it the first alignment
 * of the input, if there is one.
 *
 * <p>NOTE(review): nothing in this method assigns {@code samFileReader}; until the reader is
 * actually opened here, the method reports failure instead of dereferencing a null reference.
 *
 * @return 0 on success; -1 when no reader is available or when any exception is raised
 */
@Override
protected int doWork() {
    // NOTE(review): the input source is not visible from this method - wire in the real reader.
    SAMFileReader samFileReader = null;
    SAMRecordIterator iter = null;
    try {
        if (samFileReader == null) {
            return -1;
        }
        BamStats01Report report = new BamStats01Report(samFileReader.getFileHeader());
        iter = samFileReader.iterator();
        // An empty input is not an error: there is simply nothing to add to the report.
        if (iter.hasNext()) {
            report.addAlignment(iter.next());
        }
        return 0;
    } catch (Exception err) {
        return -1;
    } finally {
        if (iter != null) {
            iter.close();
        }
    }
}
public void timeDownsampling(int reps) { for (int i = 0; i < reps; i++) { SAMFileReader reader = new SAMFileReader(inputFile); ReadProperties readProperties = new ReadProperties( Collections.<SAMReaderID>singletonList(new SAMReaderID(inputFile, new Tags())), reader.getFileHeader(), false, SAMFileReader.ValidationStringency.SILENT, downsampling.create(), new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), Collections.<ReadFilter>emptyList(), false, false, BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ null, // no BQSR (byte) 0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally // filters these out? Iterator<SAMRecord> readIterator = new FilteringIterator(reader.iterator(), new UnmappedReadFilter()); LocusIteratorByState locusIteratorByState = new LocusIteratorByState( readIterator, readProperties, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); while (locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } reader.close(); } }
/**
 * Checks that the inputs are readable and the outputs writable, verifies that the target,
 * probe and (optional) reference sequence dictionaries match the one in the reads file, then
 * passes every record of {@code INPUT} to a {@code TargetMetricsCollector} and writes the
 * collected metrics to {@code OUTPUT}.
 *
 * <p>The reads file is closed before returning, whether or not collection succeeds.
 *
 * @return 0 once the metrics file has been written
 */
protected int doWork() {
    // Resolve the probe interval file once and reuse it for the check and the collector.
    final File probeIntervals = getProbeIntervals();

    IoUtil.assertFileIsReadable(probeIntervals);
    IoUtil.assertFileIsReadable(TARGET_INTERVALS);
    IoUtil.assertFileIsReadable(INPUT);
    IoUtil.assertFileIsWritable(OUTPUT);
    if (PER_TARGET_COVERAGE != null) {
        IoUtil.assertFileIsWritable(PER_TARGET_COVERAGE);
    }

    final SAMFileReader samReader = new SAMFileReader(INPUT);
    try {
        // Validate that the targets and baits have the same references as the reads file
        SequenceUtil.assertSequenceDictionariesEqual(
            samReader.getFileHeader().getSequenceDictionary(),
            IntervalList.fromFile(TARGET_INTERVALS).getHeader().getSequenceDictionary(),
            INPUT,
            TARGET_INTERVALS);
        SequenceUtil.assertSequenceDictionariesEqual(
            samReader.getFileHeader().getSequenceDictionary(),
            IntervalList.fromFile(probeIntervals).getHeader().getSequenceDictionary(),
            INPUT,
            probeIntervals);

        ReferenceSequenceFile ref = null;
        if (REFERENCE_SEQUENCE != null) {
            IoUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
            ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
            SequenceUtil.assertSequenceDictionariesEqual(
                samReader.getFileHeader().getSequenceDictionary(),
                ref.getSequenceDictionary(),
                INPUT,
                REFERENCE_SEQUENCE);
        }

        final TargetMetricsCollector collector =
            makeCollector(
                METRIC_ACCUMULATION_LEVEL,
                samReader.getFileHeader().getReadGroups(),
                ref,
                PER_TARGET_COVERAGE,
                TARGET_INTERVALS,
                probeIntervals,
                getProbeSetName());

        // Add each record to the requested collectors
        final Iterator<SAMRecord> records = samReader.iterator();
        final ProgressLogger progress = new ProgressLogger(log);
        while (records.hasNext()) {
            final SAMRecord sam = records.next();
            collector.acceptRecord(sam, null);
            progress.record(sam);
        }

        // Write the output file
        final MetricsFile<HsMetrics, Integer> metrics = getMetricsFile();
        collector.finish();
        collector.addAllLevelsToFile(metrics);
        metrics.write(OUTPUT);
        return 0;
    } finally {
        samReader.close();
    }
}
@Override public int doWork(String[] args) { File refFile = null; com.github.lindenb.jvarkit.util.cli.GetOpt getopt = new com.github.lindenb.jvarkit.util.cli.GetOpt(); int c; while ((c = getopt.getopt(args, "hvL:r:")) != -1) { switch (c) { case 'h': printUsage(); return 0; case 'v': System.out.println(getVersion()); return 0; case 'L': getLogger().setLevel(java.util.logging.Level.parse(getopt.getOptArg())); break; case 'r': refFile = new File(getopt.getOptArg()); break; case ':': System.err.println("Missing argument for option -" + getopt.getOptOpt()); return -1; default: System.err.println("Unknown option -" + getopt.getOptOpt()); return -1; } } if (refFile == null) { error("Undefined REF file"); return -1; } File bamFile = null; if (getopt.getOptInd() + 1 != args.length) { info("reading from stdin."); } else { bamFile = new File(args[getopt.getOptInd()]); } IndexedFastaSequenceFile indexedFastaSequenceFile = null; SAMFileReader samFileReader = null; try { GenomicSequence genomicSequence = null; indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile); SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); samFileReader = null; if (bamFile == null) { samFileReader = new SAMFileReader(System.in); } else { samFileReader = new SAMFileReader(bamFile); } XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance(); XMLStreamWriter w = xmlfactory.createXMLStreamWriter(System.out, "UTF-8"); w.writeStartDocument("UTF-8", "1.0"); w.writeStartElement("sam"); w.writeComment(getProgramCommandLine()); w.writeAttribute("ref", (bamFile == null ? 
"stdin" : bamFile.getPath())); w.writeAttribute("bam", args[1]); SAMRecordIterator iter = samFileReader.iterator(); while (iter.hasNext()) { SAMRecord rec = iter.next(); final byte readbases[] = rec.getReadBases(); w.writeStartElement("read"); w.writeStartElement("name"); w.writeCharacters(rec.getReadName()); w.writeEndElement(); w.writeStartElement("sequence"); w.writeCharacters(new String(readbases)); w.writeEndElement(); w.writeStartElement("flags"); w.writeAttribute("paired", String.valueOf(rec.getReadPairedFlag())); w.writeAttribute( "failsVendorQual", String.valueOf(rec.getReadFailsVendorQualityCheckFlag())); w.writeAttribute("mapped", String.valueOf(!rec.getReadUnmappedFlag())); w.writeAttribute("strand", (rec.getReadNegativeStrandFlag() ? "-" : "+")); if (rec.getReadPairedFlag()) { w.writeAttribute("mate-mapped", String.valueOf(!rec.getMateUnmappedFlag())); w.writeAttribute("mate-strand", (rec.getMateNegativeStrandFlag() ? "-" : "+")); w.writeAttribute("proper-pair", String.valueOf(rec.getProperPairFlag())); } w.writeCharacters(String.valueOf(rec.getFlags())); w.writeEndElement(); if (!rec.getReadUnmappedFlag()) { w.writeStartElement("qual"); w.writeCharacters(String.valueOf(rec.getMappingQuality())); w.writeEndElement(); w.writeStartElement("chrom"); w.writeAttribute("index", String.valueOf(rec.getReferenceIndex())); w.writeCharacters(rec.getReferenceName()); w.writeEndElement(); w.writeStartElement("pos"); w.writeCharacters(String.valueOf(rec.getAlignmentStart())); w.writeEndElement(); w.writeStartElement("cigar"); w.writeCharacters(rec.getCigarString()); w.writeEndElement(); } if (!rec.getMateUnmappedFlag()) { w.writeStartElement("mate-chrom"); w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex())); w.writeCharacters(rec.getMateReferenceName()); w.writeEndElement(); w.writeStartElement("mate-pos"); w.writeCharacters(String.valueOf(rec.getMateAlignmentStart())); w.writeEndElement(); } if (!rec.getReadUnmappedFlag()) { if (genomicSequence 
== null || genomicSequence.getChrom().equals(rec.getReferenceName())) { genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName()); } w.writeStartElement("align"); int readIndex = 0; int refIndex = rec.getAlignmentStart(); for (final CigarElement e : rec.getCigar().getCigarElements()) { switch (e.getOperator()) { case H: break; // ignore hard clips case P: break; // ignore pads case I: // cont. case S: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); w.writeAttribute("read-index", String.valueOf(readIndex + 1)); if (readIndex >= 0 && readIndex < readbases.length) { w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex]))); } readIndex++; } break; } case N: // cont. -- reference skip case D: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); w.writeAttribute("ref-index", String.valueOf(refIndex)); if (refIndex >= 1 && refIndex <= genomicSequence.length()) { w.writeAttribute( "ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1))); } refIndex++; } break; } case M: case EQ: case X: { final int length = e.getLength(); for (int i = 0; i < length; ++i) { w.writeEmptyElement(e.getOperator().name()); char baseRead = '\0'; if (readIndex >= 0 && readIndex < readbases.length) { baseRead = (char) (rec.getReadBases()[readIndex]); w.writeAttribute("read-index", String.valueOf(readIndex + 1)); w.writeAttribute("read-base", String.valueOf(baseRead)); } w.writeAttribute("ref-index", String.valueOf(refIndex)); if (refIndex >= 1 && refIndex <= genomicSequence.length()) { char baseRef = genomicSequence.charAt(refIndex - 1); w.writeAttribute("ref-base", String.valueOf(baseRef)); if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) { w.writeAttribute("mismatch", "true"); } } refIndex++; readIndex++; } break; } default: throw new IllegalStateException( "Case statement didn't deal 
with cigar op: " + e.getOperator()); } } } w.writeEndElement(); w.writeEndElement(); iter.close(); w.writeEndElement(); } w.writeEndElement(); w.writeEndDocument(); w.flush(); w.close(); } catch (Exception err) { error(err); return -1; } finally { CloserUtil.close(samFileReader); CloserUtil.close(indexedFastaSequenceFile); } return 0; }
public static void main(String[] args) { args = new String[] { "/commun/data/users/cfaucheron/aln_20120329/S0529/data_S0529/S0529_sort.nodup.bam" }; ReferenceSequenceFile rsf = ReferenceSequenceFileFactory.getReferenceSequenceFile( new File("/commun/data/pubdb/ucsc/hg19/chromosomes/hg19.fa")); int count = 0; for (String filename : args) { File file = new File(filename); SAMFileReader samIn = new SAMFileReader(file); SAMRecordIterator r = samIn.iterator(); while (r.hasNext()) { SAMRecord rec = r.next(); if (rec.getReadUnmappedFlag()) continue; if (++count > 10000) break; if (rec.getAlignmentStart() > rec.getAlignmentEnd()) throw new IllegalStateException(); byte bases[] = rsf.getSubsequenceAt( rec.getReferenceName(), rec.getAlignmentStart(), Math.max( rec.getAlignmentEnd(), rec.getAlignmentStart() + rec.getCigar().getPaddedReferenceLength())) .getBases(); Iterator<CigarAlignment> i = CigarAlignment.iterator(rec); /*System.err.println(rec.getCigarString()); System.err.println(bases.length); System.err.println("start:"+rec.getAlignmentStart());*/ StringBuilder s1 = new StringBuilder(); StringBuilder s2 = new StringBuilder(); while (i.hasNext()) { CigarAlignment caln = i.next(); /* System.err.println(rec.getCigarString()); System.err.println("bases.length:"+bases.length); System.err.println("refpos:"+caln.getReferencePosition1()); System.err.println("readpos:"+rec.getAlignmentStart()); */ if (caln.getReferencePosition1() - rec.getAlignmentStart() >= bases.length) { System.out.println("SHORT!"); System.out.println("op:" + caln.getCigarOperator()); System.out.println("read start:" + rec.getAlignmentStart()); System.out.println("clan.pos1:" + caln.getReferencePosition1()); System.out.println("read end:" + rec.getAlignmentEnd()); System.out.println("bases.length:" + bases.length); System.out.println( "getPaddedReferenceLength:" + rec.getCigar().getPaddedReferenceLength()); System.out.println("getReferenceLength:" + rec.getCigar().getReferenceLength()); 
System.out.println("getReadLength:" + rec.getCigar().getReadLength()); System.out.println( "cigar.read.length:" + Cigar.getReadLength(rec.getCigar().getCigarElements())); count = 2000; break; } if (caln.isInsertRef()) { s2.append("-"); s1.append(caln.getReadBase()); } else if (caln.isDeletionRef()) { s2.append((char) bases[caln.getReferencePosition1() - rec.getAlignmentStart()]); s1.append("-"); } else { s2.append((char) bases[caln.getReferencePosition1() - rec.getAlignmentStart()]); s1.append(caln.getReadBase()); } // System.out.println(s1); // System.out.println(s2); // System.out.println(); } System.out.println( rec.getCigarString() + " " + rec.getReferenceName() + ":" + rec.getAlignmentStart()); System.out.println("ref :" + new String(bases)); System.out.println("read:" + new String(rec.getReadBases())); System.out.println(); System.out.println(s1); System.out.println(s2); System.out.println(); } samIn.close(); } }
/**
 * Converts a SAM/BAM input into FASTQ.
 *
 * <p>Primary records are turned into {@code MappedFastq} entries (reverse-strand mapped reads
 * are reverse-complemented, empty reads become a single {@code N}/{@code #}), collected in a
 * disk-backed sorting collection, and then written out. Paired input is written to the
 * {@code -F} and {@code -R} files, or interleaved to one stream when {@code -R} is absent;
 * single-end input is written to {@code -F} or standard output. Input mixing paired and single
 * reads is rejected. With {@code -r}, a missing mate is replaced by a one-base placeholder
 * instead of failing.
 *
 * <p>The reader, its iterator, the sorted iterator and the FASTQ writers are released in the
 * {@code finally} clause, so they are closed on failure as well as on success.
 *
 * @param args command-line arguments; zero positional arguments means standard input, one is
 *     the input file name
 * @return 0 on success; -1 on a bad option, a wrong number of arguments, or any exception
 */
@Override
public int doWork(String[] args) {
    boolean repair_missing_read = false;
    SortingCollectionFactory<MappedFastq> sortingFactory =
        new SortingCollectionFactory<MappedFastq>();
    File forwardFile = null;
    File reverseFile = null;
    com.github.lindenb.jvarkit.util.cli.GetOpt opt =
        new com.github.lindenb.jvarkit.util.cli.GetOpt();
    int c;

    sortingFactory.setComponentType(MappedFastq.class);
    sortingFactory.setCodec(new MappedFastqCodec());
    sortingFactory.setComparator(new MappedFastqComparator());

    while ((c = opt.getopt(args, super.getGetOptDefault() + "F:R:N:r")) != -1) {
        switch (c) {
            case 'F':
                forwardFile = new File(opt.getOptArg());
                break;
            case 'R':
                reverseFile = new File(opt.getOptArg());
                break;
            case 't':
                addTmpDirectory(new File(opt.getOptArg()));
                break;
            case 'N':
                sortingFactory.setMaxRecordsInRAM(
                    Math.max(Integer.parseInt(opt.getOptArg()), 100));
                break;
            case 'r':
                repair_missing_read = true;
                break;
            case ':':
                System.err.println("Missing argument for option -" + opt.getOptOpt());
                return -1;
            default:
                {
                    switch (handleOtherOptions(c, opt, args)) {
                        case EXIT_FAILURE:
                            return -1;
                        case EXIT_SUCCESS:
                            return 0;
                        default:
                            break;
                    }
                }
        }
    }

    // Everything that needs releasing is declared here so the finally clause can reach it.
    SAMFileReader sfr = null;
    SAMRecordIterator iter = null;
    SortingCollection<MappedFastq> fastqCollection = null;
    CloseableIterator<MappedFastq> r = null;
    FastqWriter fqw1 = null;
    FastqWriter fqw2 = null;
    try {
        sortingFactory.setTmpDirs(this.getTmpDirectories());
        fastqCollection = sortingFactory.make();
        fastqCollection.setDestructiveIteration(true);
        boolean found_single = false;
        boolean found_paired = false;
        long non_primary_alignmaned_flag = 0L;

        if (opt.getOptInd() == args.length) {
            info("Reading from stdin");
            sfr = new SAMFileReader(System.in);
        } else if (opt.getOptInd() + 1 == args.length) {
            String filename = args[opt.getOptInd()];
            sfr = new SAMFileReader(new File(filename));
        } else {
            error(getMessageBundle("illegal.number.of.arguments"));
            return -1;
        }
        sfr.setValidationStringency(ValidationStringency.LENIENT);
        iter = sfr.iterator();
        SAMSequenceDictionaryProgress progress =
            new SAMSequenceDictionaryProgress(sfr.getFileHeader().getSequenceDictionary());

        while (iter.hasNext()) {
            SAMRecord rec = iter.next();
            progress.watch(rec);

            if (rec.isSecondaryOrSupplementary()) {
                // Warn once, on the first skipped record, then just count.
                if (non_primary_alignmaned_flag == 0) {
                    warning(
                        "SKIPPING NON-PRIMARY "
                            + (non_primary_alignmaned_flag + 1)
                            + " ALIGNMENTS");
                }
                non_primary_alignmaned_flag++;
                continue;
            }

            MappedFastq m = new MappedFastq();
            m.name = rec.getReadName();
            if (m.name == null) m.name = "";
            m.hash = m.name.hashCode();
            m.seq = rec.getReadString();
            if (m.seq.equals(SAMRecord.NULL_SEQUENCE_STRING)) m.seq = "";
            m.qual = rec.getBaseQualityString();
            if (m.qual.equals(SAMRecord.NULL_QUALS_STRING)) m.qual = "";

            // Restore the orientation the read had before it was aligned to the reverse strand.
            if (!rec.getReadUnmappedFlag() && rec.getReadNegativeStrandFlag()) {
                m.seq = AcidNucleics.reverseComplement(m.seq);
                m.qual = new StringBuilder(m.qual).reverse().toString();
            }
            if (m.seq.length() != m.qual.length()) {
                error("length(seq)!=length(qual) in " + m.name);
                continue;
            }
            if (m.seq.isEmpty() && m.qual.isEmpty()) {
                m.seq = "N";
                m.qual = "#";
            }

            if (rec.getReadPairedFlag()) {
                found_paired = true;
                if (found_single) {
                    throw new PicardException("input is a mix of paired/singled reads");
                }
                m.side = (byte) (rec.getSecondOfPairFlag() ? 2 : 1);
            } else {
                found_single = true;
                if (found_paired) {
                    throw new PicardException("input is a mix of paired/singled reads");
                }
                m.side = (byte) 0;
            }
            fastqCollection.add(m);
        }

        // Release the input as soon as it has been consumed.
        iter.close();
        iter = null;
        CloserUtil.close(sfr);
        sfr = null;
        progress.finish();
        fastqCollection.doneAdding();
        info("Done reading.");

        if (found_paired) {
            if (forwardFile != null) {
                info("Writing to " + forwardFile);
                fqw1 = new BasicFastqWriter(forwardFile);
            } else {
                info("Writing to stdout");
                fqw1 = new BasicFastqWriter(new PrintStream(System.out));
            }
            if (reverseFile != null) {
                info("Writing to " + reverseFile);
                fqw2 = new BasicFastqWriter(reverseFile);
            } else {
                info("Writing to interlaced stdout");
                fqw2 = fqw1;
            }

            // 'row' accumulates consecutive entries sharing one read name; it is flushed when
            // the name changes or the input ends (curr == null).
            List<MappedFastq> row = new ArrayList<MappedFastq>();
            r = fastqCollection.iterator();
            for (; ; ) {
                MappedFastq curr = null;
                if (r.hasNext()) curr = r.next();
                if (curr == null || (!row.isEmpty() && !row.get(0).name.equals(curr.name))) {
                    if (!row.isEmpty()) {
                        if (row.size() > 2) {
                            warning("WTF :" + row);
                        }
                        boolean found_F = false;
                        boolean found_R = false;
                        for (MappedFastq m : row) {
                            switch ((int) m.side) {
                                case 1:
                                    if (found_F)
                                        throw new PicardException(
                                            "two forward reads found for " + row.get(0).name);
                                    found_F = true;
                                    echo(fqw1, m);
                                    break;
                                case 2:
                                    if (found_R)
                                        throw new PicardException(
                                            "two reverse reads found for " + row.get(0).name);
                                    found_R = true;
                                    echo(fqw2, m);
                                    break;
                                default:
                                    throw new IllegalStateException("uh???");
                            }
                        }
                        if (!found_F) {
                            if (repair_missing_read) {
                                warning("forward not found for " + row.get(0));
                                MappedFastq pad = new MappedFastq();
                                pad.side = (byte) 1;
                                pad.name = row.get(0).name;
                                pad.seq = "N";
                                pad.qual = "#";
                                echo(fqw1, pad);
                            } else {
                                throw new PicardException("forward not found for " + row);
                            }
                        }
                        if (!found_R) {
                            if (repair_missing_read) {
                                warning("reverse not found for " + row.get(0));
                                MappedFastq pad = new MappedFastq();
                                pad.side = (byte) 2;
                                pad.name = row.get(0).name;
                                pad.seq = "N";
                                pad.qual = "#";
                                echo(fqw2, pad);
                            } else {
                                throw new PicardException("reverse not found for " + row);
                            }
                        }
                    }
                    if (curr == null) break;
                    row.clear();
                }
                row.add(curr);
            }
            r.close();
            r = null;
            fqw1.close();
            // In interleaved mode both names refer to one writer; close it only once.
            if (fqw2 != fqw1) {
                fqw2.close();
            }
            fqw1 = null;
            fqw2 = null;
        } else if (found_single) {
            if (forwardFile != null) {
                info("Writing to " + forwardFile);
                fqw1 = new BasicFastqWriter(forwardFile);
            } else {
                info("Writing to stdout");
                fqw1 = new BasicFastqWriter(new PrintStream(System.out));
            }
            r = fastqCollection.iterator();
            while (r.hasNext()) {
                echo(fqw1, r.next());
            }
            r.close();
            r = null;
            fqw1.close();
            fqw1 = null;
        }
        return 0;
    } catch (Exception err) {
        error(err);
        return -1;
    } finally {
        // Anything still non-null here was left open by an early return or an exception.
        CloserUtil.close(r);
        CloserUtil.close(fqw1);
        if (fqw2 != fqw1) {
            CloserUtil.close(fqw2);
        }
        CloserUtil.close(iter);
        CloserUtil.close(sfr);
        if (fastqCollection != null) fastqCollection.cleanup();
    }
}