/**
 * Both CollectTargetedPcrMetrics and CalculateHsMetrics share virtually identical program
 * structures except for the name of their targeting mechanisms (e.g. bait set or amplicon set).
 * The shared behavior of these programs is encapsulated in CollectTargetedMetrics, which is then
 * subclassed by CalculateHsMetrics and CollectTargetedPcrMetrics.
 *
 * <p>This program verifies the input parameters to TargetMetricsCollector and converts all files
 * to the format desired by TargetMetricsCollector. Then it instantiates a TargetMetricsCollector
 * and collects metric information for all reads in the INPUT sam file.
 */
public abstract class CollectTargetedMetrics extends CommandLineProgram {

  private static final Log log = Log.getInstance(CalculateHsMetrics.class);

  /**
   * The interval file to be fed to TargetMetricsCollector.
   *
   * @return An interval file that denotes the intervals of the regions targeted by the probes for
   *     this run; it is passed to the TargetMetricsCollector produced by makeCollector.
   */
  protected abstract File getProbeIntervals();

  /**
   * @return The name of the probe set used in this run; getProbeIntervals().getName() is a
   *     reasonable default.
   */
  protected abstract String getProbeSetName();

  /**
   * A factory method for the TargetMetricsCollector to use for this run. Examples of
   * TargetMetricsCollector: TargetedPcrMetricsCollector, HsMetricsCalculator.
   *
   * @return A TargetMetricsCollector to which we will pass SAMRecords.
   */
  protected abstract TargetMetricsCollector makeCollector(
      final Set<MetricAccumulationLevel> accumulationLevels,
      final List<SAMReadGroupRecord> samRgRecords,
      final ReferenceSequenceFile refFile,
      final File perTargetCoverage,
      final File targetIntervals,
      final File probeIntervals,
      final String probeSetName);

  @Option(
      shortName = "TI",
      doc = "An interval list file that contains the locations of the targets.")
  public File TARGET_INTERVALS;

  @Option(
      shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
      doc = "An aligned SAM or BAM file.")
  public File INPUT;

  @Option(
      shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      doc = "The output file to write the metrics to.")
  public File OUTPUT;

  @Option(shortName = "LEVEL", doc = "The level(s) at which to accumulate metrics.")
  public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL =
      CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);

  @Option(
      shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME,
      optional = true,
      doc = "The reference sequence aligned to.")
  public File REFERENCE_SEQUENCE;

  @Option(optional = true, doc = "An optional file to output per target coverage information to.")
  public File PER_TARGET_COVERAGE;

  /**
   * Asserts that files are readable and writable and then fires off an HsMetricsCalculator
   * instance to do the real work.
*/ protected int doWork() { IoUtil.assertFileIsReadable(getProbeIntervals()); IoUtil.assertFileIsReadable(TARGET_INTERVALS); IoUtil.assertFileIsReadable(INPUT); IoUtil.assertFileIsWritable(OUTPUT); if (PER_TARGET_COVERAGE != null) IoUtil.assertFileIsWritable(PER_TARGET_COVERAGE); final SAMFileReader samReader = new SAMFileReader(INPUT); final File probeIntervals = getProbeIntervals(); // Validate that the targets and baits have the same references as the reads file SequenceUtil.assertSequenceDictionariesEqual( samReader.getFileHeader().getSequenceDictionary(), IntervalList.fromFile(TARGET_INTERVALS).getHeader().getSequenceDictionary(), INPUT, TARGET_INTERVALS); SequenceUtil.assertSequenceDictionariesEqual( samReader.getFileHeader().getSequenceDictionary(), IntervalList.fromFile(probeIntervals).getHeader().getSequenceDictionary(), INPUT, probeIntervals); ReferenceSequenceFile ref = null; if (REFERENCE_SEQUENCE != null) { IoUtil.assertFileIsReadable(REFERENCE_SEQUENCE); ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE); SequenceUtil.assertSequenceDictionariesEqual( samReader.getFileHeader().getSequenceDictionary(), ref.getSequenceDictionary(), INPUT, REFERENCE_SEQUENCE); } final TargetMetricsCollector collector = makeCollector( METRIC_ACCUMULATION_LEVEL, samReader.getFileHeader().getReadGroups(), ref, PER_TARGET_COVERAGE, TARGET_INTERVALS, probeIntervals, getProbeSetName()); // Add each record to the requested collectors final Iterator<SAMRecord> records = samReader.iterator(); final ProgressLogger progress = new ProgressLogger(log); while (records.hasNext()) { final SAMRecord sam = records.next(); collector.acceptRecord(sam, null); progress.record(sam); } // Write the output file final MetricsFile<HsMetrics, Integer> metrics = getMetricsFile(); collector.finish(); collector.addAllLevelsToFile(metrics); metrics.write(OUTPUT); return 0; } protected String[] customCommandLineValidation() { if (PER_TARGET_COVERAGE != null && (METRIC_ACCUMULATION_LEVEL.size() != 1 || METRIC_ACCUMULATION_LEVEL.iterator().next() != MetricAccumulationLevel.ALL_READS)) { return new String[] { "PER_TARGET_COVERAGE can be specified only when METRIC_ACCUMULATION_LEVEL is set " + "to ALL_READS." }; } if (PER_TARGET_COVERAGE != null && REFERENCE_SEQUENCE == null) { return new String[] {"Must supply REFERENCE_SEQUENCE when supplying PER_TARGET_COVERAGE"}; } return super.customCommandLineValidation(); } }
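The factory-method design described in the class Javadoc above can be made concrete with a minimal subclass sketch. Apart from the three overridden signatures, everything below is an illustrative assumption: CollectExampleTargetedMetrics, its AMPLICON_INTERVALS option, and the placeholder makeCollector body are not the real CalculateHsMetrics or CollectTargetedPcrMetrics implementations.

// Minimal subclass sketch of CollectTargetedMetrics; imports mirror the surrounding file.
public class CollectExampleTargetedMetrics extends CollectTargetedMetrics {

  @Option(shortName = "AI", doc = "Interval list of the targeted regions (illustrative placeholder option).")
  public File AMPLICON_INTERVALS;

  @Override
  protected File getProbeIntervals() {
    return AMPLICON_INTERVALS;
  }

  @Override
  protected String getProbeSetName() {
    // The abstract class suggests getProbeIntervals().getName() as a reasonable default.
    return getProbeIntervals().getName();
  }

  @Override
  protected TargetMetricsCollector makeCollector(
      final Set<MetricAccumulationLevel> accumulationLevels,
      final List<SAMReadGroupRecord> samRgRecords,
      final ReferenceSequenceFile refFile,
      final File perTargetCoverage,
      final File targetIntervals,
      final File probeIntervals,
      final String probeSetName) {
    // The real subclasses construct a concrete collector (e.g. TargetedPcrMetricsCollector or
    // HsMetricsCalculator) from exactly these parameters; the constructor call is omitted here
    // because its signature is not shown in this file.
    throw new UnsupportedOperationException("placeholder: build a concrete TargetMetricsCollector");
  }
}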
/** * Java port of UCSC liftOver. Only the most basic liftOver functionality is implemented. Internally * coordinates are 0-based, half-open. The API is standard Picard 1-based, inclusive. * * @author [email protected] */ public class LiftOver { private static final Log LOG = Log.getInstance(LiftOver.class); public static final double DEFAULT_LIFTOVER_MINMATCH = 0.95; private double liftOverMinMatch = DEFAULT_LIFTOVER_MINMATCH; private final OverlapDetector<Chain> chains; /** Load UCSC chain file in order to lift over Intervals. */ public LiftOver(File chainFile) { IoUtil.assertFileIsReadable(chainFile); chains = Chain.loadChains(chainFile); } /** * Throw an exception if all the "to" sequence names in the chains are not found in the given * sequence dictionary. */ public void validateToSequences(final SAMSequenceDictionary sequenceDictionary) { for (final Chain chain : chains.getAll()) { if (sequenceDictionary.getSequence(chain.toSequenceName) == null) { throw new PicardException( "Sequence " + chain.toSequenceName + " from chain file is not found in sequence dictionary."); } } } /** * Lift over the given interval to the new genome build using the liftOverMinMatch set for this * LiftOver object. * * @param interval Interval to be lifted over. * @return Interval in the output build coordinates, or null if it cannot be lifted over. */ public Interval liftOver(final Interval interval) { return liftOver(interval, liftOverMinMatch); } /** * Lift over the given interval to the new genome build. * * @param interval Interval to be lifted over. * @param liftOverMinMatch Minimum fraction of bases that must remap. * @return Interval in the output build coordinates, or null if it cannot be lifted over. */ public Interval liftOver(final Interval interval, final double liftOverMinMatch) { if (interval.length() == 0) { throw new IllegalArgumentException( "Zero-length interval cannot be lifted over. Interval: " + interval.getName()); } Chain chainHit = null; TargetIntersection targetIntersection = null; // Number of bases in interval that can be lifted over must be >= this. double minMatchSize = liftOverMinMatch * interval.length(); // Find the appropriate Chain, and the part of the chain corresponding to the interval to be // lifted over. for (final Chain chain : chains.getOverlaps(interval)) { final TargetIntersection candidateIntersection = targetIntersection(chain, interval); if (candidateIntersection != null && candidateIntersection.intersectionLength >= minMatchSize) { if (chainHit != null) { // In basic liftOver, multiple hits are not allowed. return null; } chainHit = chain; targetIntersection = candidateIntersection; } else if (candidateIntersection != null) { LOG.info( "Interval " + interval.getName() + " failed to match chain " + chain.id + " because intersection length " + candidateIntersection.intersectionLength + " < minMatchSize " + minMatchSize + " (" + (candidateIntersection.intersectionLength / (float) interval.length()) + " < " + liftOverMinMatch + ")"); } } if (chainHit == null) { // Can't be lifted over. return null; } return createToInterval(interval.getName(), targetIntersection); } public List<PartialLiftover> diagnosticLiftover(final Interval interval) { final List<PartialLiftover> ret = new ArrayList<PartialLiftover>(); if (interval.length() == 0) { throw new IllegalArgumentException( "Zero-length interval cannot be lifted over. 
Interval: " + interval.getName()); } for (final Chain chain : chains.getOverlaps(interval)) { Interval intersectingChain = interval.intersect(chain.interval); final TargetIntersection targetIntersection = targetIntersection(chain, intersectingChain); if (targetIntersection == null) { ret.add(new PartialLiftover(intersectingChain, chain.id)); } else { Interval toInterval = createToInterval(interval.getName(), targetIntersection); float percentLiftedOver = targetIntersection.intersectionLength / (float) interval.length(); ret.add( new PartialLiftover( intersectingChain, toInterval, targetIntersection.chain.id, percentLiftedOver)); } } return ret; } private static Interval createToInterval( final String intervalName, final TargetIntersection targetIntersection) { // Compute the query interval given the offsets of the target interval start and end into the // first and // last ContinuousBlocks. int toStart = targetIntersection.chain.getBlock(targetIntersection.firstBlockIndex).toStart + targetIntersection.startOffset; int toEnd = targetIntersection.chain.getBlock(targetIntersection.lastBlockIndex).getToEnd() - targetIntersection.offsetFromEnd; if (toEnd <= toStart || toStart < 0) { throw new PicardException("Something strange lifting over interval " + intervalName); } if (targetIntersection.chain.toNegativeStrand) { // Flip if query is negative. int negativeStart = targetIntersection.chain.toSequenceSize - toEnd; int negativeEnd = targetIntersection.chain.toSequenceSize - toStart; toStart = negativeStart; toEnd = negativeEnd; } // Convert to 1-based, inclusive. return new Interval( targetIntersection.chain.toSequenceName, toStart + 1, toEnd, targetIntersection.chain.toNegativeStrand, intervalName); } /** * Add up overlap btw the blocks in this chain and the given interval. * * @return Length of overlap, offsets into first and last ContinuousBlocks, and indices of first * and last ContinuousBlocks. */ private static TargetIntersection targetIntersection(final Chain chain, final Interval interval) { int intersectionLength = 0; // Convert interval to 0-based, half-open int start = interval.getStart() - 1; int end = interval.getEnd(); int firstBlockIndex = -1; int lastBlockIndex = -1; int startOffset = -1; int offsetFromEnd = -1; List<Chain.ContinuousBlock> blockList = chain.getBlocks(); for (int i = 0; i < blockList.size(); ++i) { final Chain.ContinuousBlock block = blockList.get(i); if (block.fromStart >= end) { break; } else if (block.getFromEnd() <= start) { continue; } if (firstBlockIndex == -1) { firstBlockIndex = i; if (start > block.fromStart) { startOffset = start - block.fromStart; } else { startOffset = 0; } } lastBlockIndex = i; if (block.getFromEnd() > end) { offsetFromEnd = block.getFromEnd() - end; } else { offsetFromEnd = 0; } int thisIntersection = Math.min(end, block.getFromEnd()) - Math.max(start, block.fromStart); if (thisIntersection <= 0) { throw new PicardException("Should have been some intersection."); } intersectionLength += thisIntersection; } if (intersectionLength == 0) { return null; } return new TargetIntersection( chain, intersectionLength, startOffset, offsetFromEnd, firstBlockIndex, lastBlockIndex); } /** Get minimum fraction of bases that must remap. */ public double getLiftOverMinMatch() { return liftOverMinMatch; } /** Set minimum fraction of bases that must remap. 
*/ public void setLiftOverMinMatch(final double liftOverMinMatch) { this.liftOverMinMatch = liftOverMinMatch; } /** Value class returned by targetIntersection() */ private static class TargetIntersection { /** Chain used for this intersection */ final Chain chain; /** Total intersectionLength length */ final int intersectionLength; /** Offset of target interval start in first block. */ final int startOffset; /** Distance from target interval end to end of last block. */ final int offsetFromEnd; /** Index of first ContinuousBlock matching interval. */ final int firstBlockIndex; /** Index of last ContinuousBlock matching interval. */ final int lastBlockIndex; TargetIntersection( final Chain chain, final int intersectionLength, final int startOffset, final int offsetFromEnd, final int firstBlockIndex, final int lastBlockIndex) { this.chain = chain; this.intersectionLength = intersectionLength; this.startOffset = startOffset; this.offsetFromEnd = offsetFromEnd; this.firstBlockIndex = firstBlockIndex; this.lastBlockIndex = lastBlockIndex; } } /** Represents a portion of a liftover operation, for use in diagnosing liftover failures. */ public static class PartialLiftover { /** Intersection between "from" interval and "from" region of a chain. */ final Interval fromInterval; /** * Result of lifting over fromInterval (with no percentage mapped requirement). This is null if * fromInterval falls entirely with a gap of the chain. */ final Interval toInterval; /** id of chain used for this liftover */ final int chainId; /** * Percentage of bases in fromInterval that lifted over. 0 if fromInterval is not covered by any * chain. */ final float percentLiftedOver; PartialLiftover( final Interval fromInterval, final Interval toInterval, final int chainId, final float percentLiftedOver) { this.fromInterval = fromInterval; this.toInterval = toInterval; this.chainId = chainId; this.percentLiftedOver = percentLiftedOver; } PartialLiftover(final Interval fromInterval, final int chainId) { this.fromInterval = fromInterval; this.toInterval = null; this.chainId = chainId; this.percentLiftedOver = 0.0f; } public String toString() { if (toInterval == null) { // Matched a chain, but entirely within a gap. return fromInterval.toString() + " (len " + fromInterval.length() + ")=>null using chain " + chainId; } final String strand = toInterval.isNegativeStrand() ? "-" : "+"; return fromInterval.toString() + " (len " + fromInterval.length() + ")=>" + toInterval + "(" + strand + ") using chain " + chainId + " ; pct matched " + percentLiftedOver; } } }
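Because the LiftOver Javadoc stresses the coordinate conventions (0-based, half-open internally; 1-based, inclusive at the API), a short usage sketch may help. Only the constructors and the liftOver() call are taken from the class above; the chain-file path and the net.sf.picard package locations are assumptions.

// Minimal usage sketch for LiftOver; the chain file path is a placeholder.
import java.io.File;
import net.sf.picard.liftover.LiftOver; // assumed package
import net.sf.picard.util.Interval;     // assumed package

public class LiftOverExample {
  public static void main(final String[] args) {
    final LiftOver liftOver = new LiftOver(new File("hg18ToHg19.over.chain"));
    // Intervals handed to liftOver() are 1-based, inclusive; the 0-based, half-open
    // arithmetic happens inside targetIntersection() and createToInterval().
    final Interval source = new Interval("chr1", 1000000, 1000500, false, "exampleInterval");
    final Interval target = liftOver.liftOver(source); // null if it cannot be lifted over
    System.out.println(target == null ? "could not lift over" : target.toString());
  }
}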
/** * Replaces read groups in a BAM file * * @author mdepristo */ public class AddOrReplaceReadGroups extends CommandLineProgram { @Usage(programVersion = "1.0") public String USAGE = "Replaces all read groups in the INPUT file with a new read group and assigns " + "all reads to this read group in the OUTPUT BAM"; @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file (bam or sam).") public File INPUT = null; @Option( shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (bam or sam).") public File OUTPUT = null; @Option( shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME, optional = true, doc = "Optional sort order to output in. If not supplied OUTPUT is in the same order as INPUT.") public SortOrder SORT_ORDER; @Option(shortName = "ID", doc = "Read Group ID") public String RGID = "1"; @Option(shortName = "LB", doc = "Read Group Library") public String RGLB; @Option(shortName = "PL", doc = "Read Group platform (e.g. illumina, solid)") public String RGPL; @Option(shortName = "PU", doc = "Read Group platform unit (eg. run barcode)") public String RGPU; @Option(shortName = "SM", doc = "Read Group sample name") public String RGSM; @Option(shortName = "CN", doc = "Read Group sequencing center name", optional = true) public String RGCN; @Option(shortName = "DS", doc = "Read Group description", optional = true) public String RGDS; @Option(shortName = "DT", doc = "Read Group run date", optional = true) public Iso8601Date RGDT; private final Log log = Log.getInstance(AddOrReplaceReadGroups.class); /** Required main method implementation. */ public static void main(final String[] argv) { new AddOrReplaceReadGroups().instanceMainWithExit(argv); } protected int doWork() { IoUtil.assertFileIsReadable(INPUT); IoUtil.assertFileIsWritable(OUTPUT); final SAMFileReader in = new SAMFileReader(INPUT); // create the read group we'll be using final SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID); rg.setLibrary(RGLB); rg.setPlatform(RGPL); rg.setSample(RGSM); rg.setPlatformUnit(RGPU); if (RGCN != null) rg.setSequencingCenter(RGCN); if (RGDS != null) rg.setDescription(RGDS); if (RGDT != null) rg.setRunDate(RGDT); log.info( String.format( "Created read group ID=%s PL=%s LB=%s SM=%s%n", rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample())); // create the new header and output file final SAMFileHeader inHeader = in.getFileHeader(); final SAMFileHeader outHeader = inHeader.clone(); outHeader.setReadGroups(Arrays.asList(rg)); if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER); final SAMFileWriter outWriter = new SAMFileWriterFactory() .makeSAMOrBAMWriter( outHeader, outHeader.getSortOrder() == inHeader.getSortOrder(), OUTPUT); final ProgressLogger progress = new ProgressLogger(log); for (final SAMRecord read : in) { read.setAttribute(SAMTag.RG.name(), RGID); outWriter.addAlignment(read); progress.record(read); } // cleanup in.close(); outWriter.close(); return 0; } }
/** * Command line program to read non-duplicate insert sizes, create a histogram and report * distribution statistics. * * @author Doug Voet (dvoet at broadinstitute dot org) */ public class CollectInsertSizeMetrics extends SinglePassSamProgram { private static final Log log = Log.getInstance(CollectInsertSizeMetrics.class); private static final String HISTOGRAM_R_SCRIPT = "net/sf/picard/analysis/insertSizeHistogram.R"; // Usage and parameters @Usage public String USAGE = getStandardUsagePreamble() + "Reads a SAM or BAM file and writes a file containing metrics about " + "the statistical distribution of insert size (excluding duplicates) " + "and generates a histogram plot.\n"; @Option(shortName = "H", doc = "File to write insert size histogram chart to.") public File HISTOGRAM_FILE; @Option( doc = "Generate mean, sd and plots by trimming the data down to MEDIAN + DEVIATIONS*MEDIAN_ABSOLUTE_DEVIATION. " + "This is done because insert size data typically includes enough anomalous values from chimeras and other " + "artifacts to make the mean and sd grossly misleading regarding the real distribution.") public double DEVIATIONS = 10; @Option( shortName = "W", doc = "Explicitly sets the histogram width, overriding automatic truncation of histogram tail. " + "Also, when calculating mean and standard deviation, only bins <= HISTOGRAM_WIDTH will be included.", optional = true) public Integer HISTOGRAM_WIDTH = null; @Option( shortName = "M", doc = "When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this " + "percentage of overall reads. (Range: 0 to 1).") public float MINIMUM_PCT = 0.05f; @Option(shortName = "LEVEL", doc = "The level(s) at which to accumulate metrics. ") private Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS); // Calculates InsertSizeMetrics for all METRIC_ACCUMULATION_LEVELs provided private InsertSizeMetricsCollector multiCollector; /** Required main method implementation. */ public static void main(final String[] argv) { new CollectInsertSizeMetrics().instanceMainWithExit(argv); } /** * Put any custom command-line validation in an override of this method. clp is initialized at * this point and can be used to print usage and access argv. Any options set by command-line * parser can be validated. * * @return null if command line is valid. If command line is invalid, returns an array of error * message to be written to the appropriate place. */ @Override protected String[] customCommandLineValidation() { if (MINIMUM_PCT < 0 || MINIMUM_PCT > 0.5) { return new String[] { "MINIMUM_PCT was set to " + MINIMUM_PCT + ". It must be between 0 and 0.5 so all data categories don't get discarded." 
}; } return super.customCommandLineValidation(); } @Override protected boolean usesNoRefReads() { return false; } @Override protected void setup(final SAMFileHeader header, final File samFile) { IoUtil.assertFileIsWritable(OUTPUT); IoUtil.assertFileIsWritable(HISTOGRAM_FILE); // Delegate actual collection to InsertSizeMetricCollector multiCollector = new InsertSizeMetricsCollector( METRIC_ACCUMULATION_LEVEL, header.getReadGroups(), MINIMUM_PCT, HISTOGRAM_WIDTH, DEVIATIONS); } @Override protected void acceptRead(final SAMRecord record, final ReferenceSequence ref) { multiCollector.acceptRecord(record, ref); } @Override protected void finish() { multiCollector.finish(); final MetricsFile<InsertSizeMetrics, Integer> file = getMetricsFile(); multiCollector.addAllLevelsToFile(file); if (file.getNumHistograms() == 0) { // can happen if user sets MINIMUM_PCT = 0.5, etc. log.warn( "All data categories were discarded because they contained < " + MINIMUM_PCT + " of the total aligned paired data."); final InsertSizeMetricsCollector.PerUnitInsertSizeMetricsCollector allReadsCollector = (InsertSizeMetricsCollector.PerUnitInsertSizeMetricsCollector) multiCollector.getAllReadsCollector(); log.warn( "Total mapped pairs in all categories: " + (allReadsCollector == null ? allReadsCollector : allReadsCollector.getTotalInserts())); } else { file.write(OUTPUT); final int rResult; if (HISTOGRAM_WIDTH == null) { rResult = RExecutor.executeFromClasspath( HISTOGRAM_R_SCRIPT, OUTPUT.getAbsolutePath(), HISTOGRAM_FILE.getAbsolutePath(), INPUT.getName()); } else { rResult = RExecutor.executeFromClasspath( HISTOGRAM_R_SCRIPT, OUTPUT.getAbsolutePath(), HISTOGRAM_FILE.getAbsolutePath(), INPUT.getName(), String.valueOf( HISTOGRAM_WIDTH)); // HISTOGRAM_WIDTH is passed because R automatically sets // histogram width to the last // bin that has data, which may be less than HISTOGRAM_WIDTH and confuse the user. } if (rResult != 0) { throw new PicardException( "R script " + HISTOGRAM_R_SCRIPT + " failed with return code " + rResult); } } } }
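As a quick illustration of the DEVIATIONS option documented above, the snippet below simply evaluates the trimming cutoff MEDIAN + DEVIATIONS * MEDIAN_ABSOLUTE_DEVIATION; all of the numbers are made up.

// Worked example of the DEVIATIONS trimming rule; every value here is hypothetical.
public class InsertSizeTrimmingExample {
  public static void main(final String[] args) {
    final double median = 300.0;                 // hypothetical insert-size median
    final double medianAbsoluteDeviation = 20.0; // hypothetical MAD of the distribution
    final double deviations = 10.0;              // the default DEVIATIONS value above
    final double cutoff = median + deviations * medianAbsoluteDeviation;
    // With these numbers, insert sizes above 500 are excluded from the reported mean/sd and plot.
    System.out.println("Trimming histogram at insert size " + cutoff);
  }
}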
/**
 * This class is a reader of an Illumina control file.
 *
 * @author Guoying Qi
 * @author Staffan Living
 */
public class ControlFileReader extends IlluminaFileReader {

  private final Log log = Log.getInstance(ControlFileReader.class);
  private final int EXPECTED_CONTROL_VERSION = 2;
  private int currentCluster = 0;
  private int totalClusters = 0;
  private int currentControlClusters = 0;

  /**
   * @param controlFileName control file name
   * @throws Exception
   */
  public ControlFileReader(String controlFileName) throws Exception {
    super(controlFileName);
    this.readFileHeader();
  }

  /** @throws Exception */
  private void readFileHeader() throws Exception {
    // first four bytes are empty
    // they should be zero for the new version of the control file; non-zero means an old format
    int emptyBytes = this.readFourBytes(inputStream);
    if (emptyBytes != 0) {
      log.warn(
          "The first four bytes are not zero: "
              + emptyBytes
              + ". This is an old format control file.");
      this.totalClusters = emptyBytes;
      return;
    }

    // next four bytes are the version; it must match the expected value
    int version = this.readFourBytes(inputStream);
    if (version != this.EXPECTED_CONTROL_VERSION) {
      log.error("Unexpected version byte: " + version);
      throw new Exception("Unexpected version number in control file");
    }

    // next four bytes are the total number of clusters
    this.totalClusters = this.readFourBytes(inputStream);
    log.info("The total number of clusters: " + this.getTotalClusters());
  }

  @Override
  public boolean hasNext() {
    return this.getCurrentCluster() < this.getTotalClusters();
  }

  @Override
  public Object next() {
    try {
      int nextByte = this.inputStream.readUnsignedShort();
      if (nextByte == -1) {
        log.warn(
            "There are no more clusters in the control file after cluster "
                + this.getCurrentCluster()
                + " in file "
                + this.getFileName());
        return null;
      }
      this.currentCluster++;

      /*
      Bit0: always empty (0)
      Bit1: was the read identified as a control?
      Bit2: was the match ambiguous?
      Bit3: did the read match the phiX tag?
      Bit4: did the read align to match the phiX tag?
      Bit5: did the read match the control index sequence? (specified in controls.fasta, TGTCACA)
      Bits6,7: reserved for future use
      Bits8..15: the report key for the matched record in the controls.fasta file
                 (specified by the REPORT_KEY metadata)
      */
      nextByte = nextByte & 0x2;
      if (nextByte != 0) {
        this.currentControlClusters++;
      }
      return new Integer(nextByte);
    } catch (IOException ex) {
      log.error(ex, "Problem reading control file");
    }
    return null;
  }

  /** @return the currentCluster */
  public int getCurrentCluster() {
    return currentCluster;
  }

  /** @return the totalClusters */
  public int getTotalClusters() {
    return totalClusters;
  }

  /** @return the currentControlClusters */
  public int getCurrentControlClusters() {
    return currentControlClusters;
  }

  public static void main(String args[]) throws Exception {

    String controlFileName =
        "testdata/110323_HS13_06000_B_B039WABXX/Data/Intensities/BaseCalls/L001/s_1_1101.control";
    if (args.length > 0 && args[0] != null) {
      controlFileName = args[0];
    }

    ControlFileReader control = new ControlFileReader(controlFileName);
    int numberControlCluster = 0;
    while (control.hasNext()) {
      int nextCluster = (Integer) control.next();
      if (nextCluster != 0) {
        numberControlCluster++;
      }
    }
    System.out.println(numberControlCluster);
    System.out.println(control.getCurrentCluster());
    System.out.println(control.getCurrentControlClusters());
    // control.next();
  }
}
@SuppressWarnings("rawtypes") public class VcfToSql extends CommandLineProgram { @Usage(programVersion = "1.0") public String USAGE = getStandardUsagePreamble() + "Creates the code to insert one or more VCF into a SQL database. "; @Option( shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "VCF files to process.", minElements = 0) public List<File> IN = new ArrayList<File>(); @Option(shortName = "SFX", doc = "Table suffix", optional = true) public String SUFFIX = ""; @Option(shortName = "VEP", doc = "Use and explode VEP predictions", optional = true) public boolean USE_VEP = true; @Option(shortName = "SNPEFF", doc = "Use and explode SNPEFF predictions", optional = true) public boolean USE_SNPEFF = true; @Option(shortName = "SQLIDX", doc = "Create misc SQL Indexes.", optional = true) public boolean SQLINDEX = true; @Option(shortName = "EGN", doc = "sql engine [sqlite,hsql]", optional = true) public String ENGINE = SQLEngine.sqlite.name(); @Option(shortName = "S4", doc = "Split DP4", optional = true) public boolean SPLIT4 = false; private SQLEngine engine = SQLEngine.sqlite; private enum SQLEngine { sqlite, hsql }; private static Log LOG = Log.getInstance(VcfToSql.class); private PrintWriter out = new PrintWriter(System.out); @Override public String getVersion() { return "1.0"; } private String columnId() { switch (this.engine) { case hsql: return "id INTEGER GENERATED ALWAYS AS IDENTITY(START WITH 1, INCREMENT BY 1) PRIMARY KEY,"; default: return "id INTEGER PRIMARY KEY AUTOINCREMENT,"; } } private String varchar(int length) { switch (this.engine) { case hsql: return "VARCHAR(" + length + ")"; default: return "TEXT"; } } private String text() { switch (this.engine) { case hsql: return "LONGVARCHAR"; default: return "TEXT"; } } @Override protected int doWork() { try { try { this.engine = SQLEngine.valueOf(this.ENGINE); } catch (Exception err) { LOG.error("BAD SQL ENGINE " + this.ENGINE); return -1; } out.println( "create table if not exists FILE" + SUFFIX + "(" + columnId() + "filename " + varchar(255) + " NOT NULL" + ");"); out.println( "create table if not exists HEADER" + SUFFIX + "(" + columnId() + "file_id INT NOT NULL REFERENCES FILE" + SUFFIX + "(id) ON DELETE CASCADE," + "header " + text() + ");"); out.println( "create table if not exists SAMPLE" + SUFFIX + "(" + columnId() + "name " + varchar(100) + " NOT NULL UNIQUE" + ");"); out.println( "create table if not exists VARIATION" + SUFFIX + "(" + columnId() + "file_id INT NOT NULL REFERENCES FILE" + SUFFIX + "(id) ON DELETE CASCADE," + "CHROM VARCHAR(20) NOT NULL," + "POS INT NOT NULL," + "START0 INT NOT NULL," + "END0 INT NOT NULL," + "RS_ID VARCHAR(50)," + "REF " + text() + " NOT NULL," + "QUAL FLOAT" + ");"); out.println( "create table if not exists ALT" + SUFFIX + "(" + columnId() + "var_id INT NOT NULL REFERENCES VARIATION" + SUFFIX + "(id) ON DELETE CASCADE," + "ALT " + text() + ");"); out.println( "create table if not exists FILTER" + SUFFIX + "(" + columnId() + "var_id INT NOT NULL REFERENCES VARIATION" + SUFFIX + "(id) ON DELETE CASCADE," + "FILTER varchar(50) not null" + ");"); out.println( "create table if not exists INFO" + SUFFIX + "(" + columnId() + "var_id INT NOT NULL REFERENCES VARIATION" + SUFFIX + "(id) ON DELETE CASCADE," + "k varchar(50) not null," + "v " + text() + " not null" + ");"); out.println( "create table if not exists EXTRAINFO" + SUFFIX + "(" + columnId() + "info_id INT NOT NULL REFERENCES INFO" + SUFFIX + "(id) ON DELETE CASCADE," + "type varchar(50) not null" + ");"); out.println( "create 
table if not exists EXTRAINFOPROP" + SUFFIX + "(" + columnId() + "extrainfo_id INT NOT NULL REFERENCES EXTRAINFO" + SUFFIX + "(id) ON DELETE CASCADE," + "k varchar(50) not null," + "v " + text() + " not null" + ");"); out.println( "create table if not exists GENOTYPE" + SUFFIX + "(" + columnId() + "var_id INT NOT NULL REFERENCES VARIATION" + SUFFIX + "(id) ON DELETE CASCADE," + "sample_id INT NOT NULL REFERENCES SAMPLE" + SUFFIX + "(id) ON DELETE CASCADE," + "A1 " + text() + ", A2 " + text() + ", dp int, ad varchar(50), gq float,pl " + text() + "," + "is_phased SMALLINT not null,is_hom SMALLINT not null,is_homref SMALLINT not null,is_homvar SMALLINT not null,is_mixed SMALLINT not null," + "is_nocall SMALLINT not null,is_noninformative SMALLINT not null,is_available SMALLINT not null,is_called SMALLINT not null,is_filtered SMALLINT not null" + ");"); out.println( "create table if not exists GTPROP" + SUFFIX + "(" + columnId() + "genotype_id INT NOT NULL REFERENCES GENOTYPE" + SUFFIX + "(id) ON DELETE CASCADE," + "k varchar(50) not null," + "v " + text() + " not null" + ");"); switch (this.engine) { case sqlite: out.println("begin transaction;"); break; default: break; } if (IN.isEmpty()) { LOG.info("reading from stdin"); read(System.in, "<stdin>"); } else { for (File input : IN) { LOG.info("opening " + input); InputStream in = IOUtils.openFileForReading(input); read(in, input.toString()); in.close(); } } if (SQLINDEX) { index("SAMPLE", "name"); index("EXTRAINFO", "type"); index("EXTRAINFOPROP", "k"); index("EXTRAINFOPROP", "v"); index("INFO", "var_id"); index("INFO", "k"); index("EXTRAINFO", "info_id"); index("EXTRAINFOPROP", "extrainfo_id"); index("GENOTYPE", "var_id"); index("GENOTYPE", "sample_id"); } switch (this.engine) { case sqlite: out.println("commit;"); break; default: break; } out.flush(); } catch (IOException err) { err.printStackTrace(); return -1; } return 0; } private void index(String table, String column) { out.print("create index "); switch (this.engine) { case hsql: break; default: out.print(" if not exists "); break; } out.print( " " + (table + SUFFIX + "_" + column + "_IDX").toUpperCase() + " on " + table + SUFFIX + "(" + column + ");"); } private void read(InputStream in, String filename) throws IOException { // Pattern comma=Pattern.compile("[,]"); Pattern pipe = Pattern.compile("[\\|]"); Pattern amp = Pattern.compile("&"); out.println("insert into FILE" + SUFFIX + "(filename) values (" + quote(filename) + ");"); VcfIterator r = new VcfIterator(in); VCFHeader header = r.getHeader(); String csqColumns[] = null; VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine("CSQ"); if (infoHeader != null && this.USE_VEP) { LOG.info("parsing VEP " + infoHeader.getDescription()); final String formatStr = "Format: "; int i = infoHeader.getDescription().indexOf(formatStr); if (i != -1) { csqColumns = pipe.split(infoHeader.getDescription().substring(i + formatStr.length()).trim()); LOG.debug(Arrays.asList(csqColumns)); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String snpEffColumns[] = null; infoHeader = header.getInfoHeaderLine("EFF"); if (infoHeader != null && this.USE_SNPEFF) { LOG.info("parsing EFF " + infoHeader.getDescription()); final String formatStr = ".Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); if (i != -1) i = desc.indexOf('(', i + formatStr.length()); int j = desc.lastIndexOf(')'); if (i != -1 && j > i) { snpEffColumns = pipe.split(desc.substring(i + 1, j).replaceAll("[ \\[\\]()\\.]", 
"").trim()); LOG.info(Arrays.asList(snpEffColumns)); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String nmdColumns[] = null; infoHeader = header.getInfoHeaderLine("NMD"); if (infoHeader != null && this.USE_SNPEFF) { final String formatStr = " Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); int j = (i == -1 ? -1 : desc.lastIndexOf('\'')); if (i != -1 && j > i) { nmdColumns = pipe.split( desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim()); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } String lofColumns[] = null; infoHeader = header.getInfoHeaderLine("LOF"); if (infoHeader != null && this.USE_SNPEFF) { final String formatStr = " Format: '"; final String desc = infoHeader.getDescription(); int i = desc.indexOf(formatStr); int j = (i == -1 ? -1 : desc.lastIndexOf('\'')); if (i != -1 && j > i) { lofColumns = pipe.split( desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim()); } else { LOG.error("Cannot parse " + infoHeader.getDescription()); } } for (String S : header.getSampleNamesInOrder()) { // merge into SAMPLE using (select 1+MAX(id),'azdazd' from SAMPLE) as vals(x,y) on // SAMPLE.name=vals.y when NOT MATCHED THEN INSERT VALUES vals.x,vals.y; switch (this.engine) { case hsql: out.println( "merge into SAMPLE" + SUFFIX + " using ( values(" + quote(S) + ") ) " + "AS vals(y) ON SAMPLE" + SUFFIX + ".name = vals.y " + "WHEN NOT MATCHED THEN INSERT VALUES (NULL,vals.y);"); break; default: out.println( "insert or ignore into SAMPLE" + SUFFIX + "(name) values (" + quote(S) + ");"); break; } } List<String> headers = new ArrayList<String>(); for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) { if (VCFHeaderVersion.isFormatString(line.getKey())) continue; headers.add(VCFHeader.METADATA_INDICATOR + line); } String chromLine = VCFHeader.HEADER_INDICATOR; for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) { if (!VCFHeader.HEADER_INDICATOR.equals(chromLine)) chromLine += (VCFConstants.FIELD_SEPARATOR); chromLine += (field); } if (header.hasGenotypingData()) { chromLine += VCFConstants.FIELD_SEPARATOR + "FORMAT"; for (String sample : header.getGenotypeSamples()) { chromLine += VCFConstants.FIELD_SEPARATOR; chromLine += sample; } } headers.add(chromLine); for (String line : headers) { out.println( "insert into HEADER" + SUFFIX + "(file_id,header) values (" + "(select max(id) from FILE" + SUFFIX + ")," + quote(line) + ");"); } while (r.hasNext()) { VariantContext var = r.next(); if (var == null) { LOG.error("Cannot parse VCF"); continue; } // "create table if not exists FILE(id,filename text)"; // "create table if not exists VARIATION(id,file_id,chrom,pos,start0,end0,rs_id,ref,qual)"; out.println( "insert into VARIATION" + SUFFIX + "(file_id,chrom,pos,START0,END0,rs_id,ref,qual) values (" + "(select max(id) from FILE" + SUFFIX + ")," + quote(var.getChr()) + "," + var.getStart() + "," + (var.getStart() - 1) + "," + var.getEnd() + "," + (var.getID() == null || var.getID().equals(VCFConstants.EMPTY_ID_FIELD) ? "NULL" : quote(var.getID())) + "," + quote(var.getReference().getDisplayString()) + "," + (var.getPhredScaledQual() < 0 ? 
"NULL" : var.getPhredScaledQual()) + ");"); // "create table if not exists ALT(id,var_id,alt)"; for (Allele alt : var.getAlternateAlleles()) { out.println( "insert into ALT" + SUFFIX + "(var_id,alt) values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + quote(alt.getDisplayString()) + ");"); } // "create table if not exists FILTER(id,var_id,filter)"; for (String filter : var.getFilters()) { out.println( "insert into FILTER" + SUFFIX + "(var_id,filter) values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + quote(filter) + ");"); } CommonInfo infos = var.getCommonInfo(); for (String key : infos.getAttributes().keySet()) { Object val = infos.getAttribute(key); // "create table if not exists INFO(id,var_id,k,v)"; if (SPLIT4 && key.equals("DP4")) { String dp4[] = infotoString(val).split("[,]"); insertIntoInfo(quote(key + "[refFor]"), quote(dp4[0])); insertIntoInfo(quote(key + "[refRev]"), quote(dp4[1])); insertIntoInfo(quote(key + "[altFor]"), quote(dp4[2])); insertIntoInfo(quote(key + "[altRev]"), quote(dp4[3])); } else { insertIntoInfo(quote(key), quote(infotoString(val))); } if (key.equals("CSQ") && csqColumns != null) { List as_array = castToStringArray(val); for (Object csqs : as_array) { if (csqs.toString().isEmpty()) continue; String tokens[] = pipe.split(csqs.toString()); List<String> extraInfo = new ArrayList<String>(); for (int t = 0; t < tokens.length && t < csqColumns.length; ++t) { if (tokens[t].isEmpty()) continue; if (csqColumns[t].equals("Consequence")) { for (String pred : amp.split(tokens[t])) { if (pred.isEmpty()) continue; extraInfo.add(csqColumns[t]); extraInfo.add(pred); } } else { extraInfo.add(csqColumns[t]); extraInfo.add(tokens[t]); } } insertExtraInfos("CSQ", extraInfo); } } if (key.equals("EFF") && snpEffColumns != null) { for (Object item : castToStringArray(val)) { String snpeff = item.toString(); if (snpeff.isEmpty()) continue; int opar = snpeff.indexOf('('); if (opar == -1) continue; int cpar = snpeff.lastIndexOf(')'); if (cpar == -1) continue; String tokens[] = pipe.split(snpeff.substring(opar + 1, cpar)); List<String> h = new ArrayList<String>(); h.add("Effect"); h.add(snpeff.substring(0, opar)); for (int t = 0; t < tokens.length && t < snpEffColumns.length; ++t) { if (tokens[t].isEmpty()) continue; h.add(snpEffColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } if (key.equals("NMD") && nmdColumns != null) { for (Object item : castToStringArray(val)) { String nmd = item.toString(); if (nmd.isEmpty()) continue; String tokens[] = pipe.split(nmd); List<String> h = new ArrayList<String>(nmdColumns.length * 2); for (int t = 0; t < tokens.length && t < nmdColumns.length; ++t) { if (tokens[t].isEmpty()) continue; h.add(nmdColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } if (key.equals("LOF") && lofColumns != null) { for (Object item : castToStringArray(val)) { String lof = item.toString(); if (lof.isEmpty()) continue; String tokens[] = pipe.split(lof); List<String> h = new ArrayList<String>(lofColumns.length * 2); for (int t = 0; t < tokens.length && t < lofColumns.length; ++t) { if (tokens[t].isEmpty()) continue; h.add(lofColumns[t]); h.add(tokens[t]); } insertExtraInfos(key, h); } } } GenotypesContext genotypesCtx = var.getGenotypes(); for (Genotype g : genotypesCtx) { // "create table if not exists GENOTYPE(id,var_id,k,v)"; List<Allele> alleles = g.getAlleles(); out.println( "insert into GENOTYPE" + SUFFIX + "(var_id,sample_id,A1,A2,dp,ad,gq,pl," + "is_phased,is_hom,is_homref,is_homvar,is_mixed," + 
"is_nocall,is_noninformative,is_available,is_called,is_filtered" + ") values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + "(select id from SAMPLE" + SUFFIX + " where name=" + quote(g.getSampleName()) + ")," + (alleles.size() == 2 ? quote(alleles.get(0).getBaseString()) : "NULL") + "," + (alleles.size() == 2 ? quote(alleles.get(1).getBaseString()) : "NULL") + "," + (g.hasDP() ? g.getDP() : "NULL") + "," + (g.hasAD() ? quote(infotoString(g.getAD())) : "NULL") + "," + (g.hasGQ() ? g.getGQ() : "NULL") + "," + (g.hasPL() ? quote(infotoString(g.getPL())) : "NULL") + "," + (g.isPhased() ? 1 : 0) + "," + (g.isHom() ? 1 : 0) + "," + (g.isHomRef() ? 1 : 0) + "," + (g.isHomVar() ? 1 : 0) + "," + (g.isMixed() ? 1 : 0) + "," + (g.isNoCall() ? 1 : 0) + "," + (g.isNonInformative() ? 1 : 0) + "," + (g.isAvailable() ? 1 : 0) + "," + (g.isCalled() ? 1 : 0) + "," + (g.isFiltered() ? 1 : 0) + ");"); for (String key : g.getExtendedAttributes().keySet()) { Object val = g.getExtendedAttribute(key); if (val == null) continue; out.println( "insert into GTPROP" + SUFFIX + "(genotype_id,k,v) values (" + "(select max(id) from GENOTYPE" + SUFFIX + ")," + quote(key) + "," + quote(infotoString(val)) + ");"); } } } r.close(); } private String quote(String s) { if (s == null) return "NULL"; StringBuilder b = new StringBuilder(); b.append("\'"); for (int i = 0; i < s.length(); ++i) { char c = s.charAt(i); switch (c) { case '\'': b.append("''"); break; default: b.append(c); break; } } b.append("\'"); return b.toString(); } private void insertExtraInfos(String type, List<String> h) { boolean first = true; for (int i = 0; i + 1 < h.size(); i += 2) { if (h.get(i + 1).isEmpty()) continue; if (first) { out.println( "insert into EXTRAINFO" + SUFFIX + "(info_id,type) values (" + "(select max(id) from INFO" + SUFFIX + ")," + quote(type) + ");"); first = false; } out.println( "insert into EXTRAINFOPROP" + SUFFIX + "(extrainfo_id,k,v) values (" + "(select max(id) from EXTRAINFO" + SUFFIX + ")," + quote(h.get(i)) + "," + quote(h.get(i + 1)) + ");"); } } @SuppressWarnings("unchecked") private List castToStringArray(Object val) { if (val instanceof List) { return (List) val; } else { return new ArrayList(Collections.singleton(val.toString())); } } private String infotoString(Object o) { if (o instanceof int[]) { int array[] = (int[]) o; StringBuilder b = new StringBuilder(); for (int i = 0; i < array.length; ++i) { if (i > 0) b.append(","); b.append(infotoString(array[i])); } return b.toString(); } if (o instanceof List) { List<?> L = List.class.cast(o); StringBuilder b = new StringBuilder(); for (int i = 0; i < L.size(); ++i) { if (i > 0) b.append(","); b.append(infotoString(L.get(i))); } return b.toString(); } return o.toString(); } private void insertIntoInfo(String key, String val) { out.println( "insert into INFO" + SUFFIX + "(var_id,k,v) values (" + "(select max(id) from VARIATION" + SUFFIX + ")," + key + "," + val + ");"); } public static void main(String[] args) { new VcfToSql().instanceMainWithExit(args); } }