/**
 * Both CollectTargetedPCRMetrics and CalculateHybridSelection metrics share virtually identical
 * program structures except for the name of their targeting mechanisms (e.g. bait set or amplicon
 * set). The shared behavior of these programs is encapsulated in CollectTargetedMetrics which is
 * then subclassed by CalculateHsMetrics and CollectTargetedPcrMetrics.
 *
 * <p>This program verifies the input parameters to TargetMetricsCollector and converts all files to
 * the format desired by TargetMetricsCollector. Then it instantiates a TargetMetricsCollector and
 * collects metric information for all reads in the INPUT sam file.
 */
public abstract class CollectTargetedMetrics extends CommandLineProgram {

  private static final Log log = Log.getInstance(CollectTargetedMetrics.class);

  /**
   * The interval file to be fed to TargetMetricsCollector
   *
   * @return An interval file that denotes the intervals of the regions targeted by the probes for
   *     this run that is passed to the TargetMetricsCollector produced by makeCollector
   */
  protected abstract File getProbeIntervals();

  /** @return The name of the probe set used in this run; getProbeIntervals().getName() is a reasonable default. */
  protected abstract String getProbeSetName();

  /**
   * A factory method for the TargetMetricsCollector to use this time. Examples of
   * TargetMetricsCollector: (TargetedPcrMetricsCollector, HsMetricsCalculator)
   *
   * @return A TargetMetricsCollector to which we will pass SAMRecords
   */
  protected abstract TargetMetricsCollector makeCollector(
      final Set<MetricAccumulationLevel> accumulationLevels,
      final List<SAMReadGroupRecord> samRgRecords,
      final ReferenceSequenceFile refFile,
      final File perTargetCoverage,
      final File targetIntervals,
      final File probeIntervals,
      final String probeSetName);

  @Option(
      shortName = "TI",
      doc = "An interval list file that contains the locations of the targets.")
  public File TARGET_INTERVALS;

  @Option(
      shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
      doc = "An aligned SAM or BAM file.")
  public File INPUT;

  @Option(
      shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      doc = "The output file to write the metrics to.")
  public File OUTPUT;

  @Option(shortName = "LEVEL", doc = "The level(s) at which to accumulate metrics.  ")
  public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL =
      CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);

  @Option(
      shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME,
      optional = true,
      doc = "The reference sequence aligned to.")
  public File REFERENCE_SEQUENCE;

  @Option(optional = true, doc = "An optional file to output per target coverage information to.")
  public File PER_TARGET_COVERAGE;

  /**
   * Asserts that files are readable and writable and then fires off an HsMetricsCalculator instance
   * to do the real work.
   */
  protected int doWork() {
    IoUtil.assertFileIsReadable(getProbeIntervals());
    IoUtil.assertFileIsReadable(TARGET_INTERVALS);
    IoUtil.assertFileIsReadable(INPUT);
    IoUtil.assertFileIsWritable(OUTPUT);
    if (PER_TARGET_COVERAGE != null) IoUtil.assertFileIsWritable(PER_TARGET_COVERAGE);

    final SAMFileReader samReader = new SAMFileReader(INPUT);

    final File probeIntervals = getProbeIntervals();

    // Validate that the targets and baits have the same references as the reads file
    SequenceUtil.assertSequenceDictionariesEqual(
        samReader.getFileHeader().getSequenceDictionary(),
        IntervalList.fromFile(TARGET_INTERVALS).getHeader().getSequenceDictionary(),
        INPUT,
        TARGET_INTERVALS);
    SequenceUtil.assertSequenceDictionariesEqual(
        samReader.getFileHeader().getSequenceDictionary(),
        IntervalList.fromFile(probeIntervals).getHeader().getSequenceDictionary(),
        INPUT,
        probeIntervals);

    ReferenceSequenceFile ref = null;
    if (REFERENCE_SEQUENCE != null) {
      IoUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
      ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE);
      SequenceUtil.assertSequenceDictionariesEqual(
          samReader.getFileHeader().getSequenceDictionary(),
          ref.getSequenceDictionary(),
          INPUT,
          REFERENCE_SEQUENCE);
    }

    final TargetMetricsCollector collector =
        makeCollector(
            METRIC_ACCUMULATION_LEVEL,
            samReader.getFileHeader().getReadGroups(),
            ref,
            PER_TARGET_COVERAGE,
            TARGET_INTERVALS,
            probeIntervals,
            getProbeSetName());

    // Add each record to the requested collectors
    final Iterator<SAMRecord> records = samReader.iterator();
    final ProgressLogger progress = new ProgressLogger(log);

    while (records.hasNext()) {
      final SAMRecord sam = records.next();
      collector.acceptRecord(sam, null);
      progress.record(sam);
    }

    // Write the output file
    final MetricsFile<HsMetrics, Integer> metrics = getMetricsFile();
    collector.finish();

    collector.addAllLevelsToFile(metrics);

    metrics.write(OUTPUT);

    return 0;
  }

  protected String[] customCommandLineValidation() {
    if (PER_TARGET_COVERAGE != null
        && (METRIC_ACCUMULATION_LEVEL.size() != 1
            || METRIC_ACCUMULATION_LEVEL.iterator().next() != MetricAccumulationLevel.ALL_READS)) {
      return new String[] {
        "PER_TARGET_COVERAGE can be specified only when METRIC_ACCUMULATION_LEVEL is set "
            + "to ALL_READS."
      };
    }

    if (PER_TARGET_COVERAGE != null && REFERENCE_SEQUENCE == null) {
      return new String[] {"Must supply REFERENCE_SEQUENCE when supplying PER_TARGET_COVERAGE"};
    }

    return super.customCommandLineValidation();
  }
}
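To make the template concrete, here is a minimal sketch of what a concrete subclass could look like. It is not Picard code: the class name and the PROBE_INTERVALS option are invented, and the TargetedPcrMetricsCollector constructor is assumed to mirror makeCollector's parameter list.

class CollectMyPanelMetricsSketch extends CollectTargetedMetrics {

  @Option(shortName = "PI", doc = "Interval list of probe locations.")
  public File PROBE_INTERVALS; // hypothetical option, analogous to BAIT_INTERVALS

  @Override
  protected File getProbeIntervals() {
    return PROBE_INTERVALS;
  }

  @Override
  protected String getProbeSetName() {
    // Fall back to the interval file's name, per the javadoc above.
    return getProbeIntervals().getName();
  }

  @Override
  protected TargetMetricsCollector makeCollector(
      final Set<MetricAccumulationLevel> accumulationLevels,
      final List<SAMReadGroupRecord> samRgRecords,
      final ReferenceSequenceFile refFile,
      final File perTargetCoverage,
      final File targetIntervals,
      final File probeIntervals,
      final String probeSetName) {
    // Assumption: the concrete collector takes the same arguments it is handed here.
    return new TargetedPcrMetricsCollector(
        accumulationLevels, samRgRecords, refFile,
        perTargetCoverage, targetIntervals, probeIntervals, probeSetName);
  }
}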
/**
 * Java port of UCSC liftOver. Only the most basic liftOver functionality is implemented. Internally
 * coordinates are 0-based, half-open. The API is standard Picard 1-based, inclusive.
 *
 * @author [email protected]
 */
public class LiftOver {
  private static final Log LOG = Log.getInstance(LiftOver.class);

  public static final double DEFAULT_LIFTOVER_MINMATCH = 0.95;

  private double liftOverMinMatch = DEFAULT_LIFTOVER_MINMATCH;
  private final OverlapDetector<Chain> chains;

  /** Load UCSC chain file in order to lift over Intervals. */
  public LiftOver(File chainFile) {
    IoUtil.assertFileIsReadable(chainFile);
    chains = Chain.loadChains(chainFile);
  }

  /**
   * Throw an exception if all the "to" sequence names in the chains are not found in the given
   * sequence dictionary.
   */
  public void validateToSequences(final SAMSequenceDictionary sequenceDictionary) {
    for (final Chain chain : chains.getAll()) {
      if (sequenceDictionary.getSequence(chain.toSequenceName) == null) {
        throw new PicardException(
            "Sequence "
                + chain.toSequenceName
                + " from chain file is not found in sequence dictionary.");
      }
    }
  }

  /**
   * Lift over the given interval to the new genome build using the liftOverMinMatch set for this
   * LiftOver object.
   *
   * @param interval Interval to be lifted over.
   * @return Interval in the output build coordinates, or null if it cannot be lifted over.
   */
  public Interval liftOver(final Interval interval) {
    return liftOver(interval, liftOverMinMatch);
  }

  /**
   * Lift over the given interval to the new genome build.
   *
   * @param interval Interval to be lifted over.
   * @param liftOverMinMatch Minimum fraction of bases that must remap.
   * @return Interval in the output build coordinates, or null if it cannot be lifted over.
   */
  public Interval liftOver(final Interval interval, final double liftOverMinMatch) {
    if (interval.length() == 0) {
      throw new IllegalArgumentException(
          "Zero-length interval cannot be lifted over.  Interval: " + interval.getName());
    }
    Chain chainHit = null;
    TargetIntersection targetIntersection = null;
    // Number of bases in interval that can be lifted over must be >= this.
    double minMatchSize = liftOverMinMatch * interval.length();

    // Find the appropriate Chain, and the part of the chain corresponding to the interval to be
    // lifted over.
    for (final Chain chain : chains.getOverlaps(interval)) {
      final TargetIntersection candidateIntersection = targetIntersection(chain, interval);
      if (candidateIntersection != null
          && candidateIntersection.intersectionLength >= minMatchSize) {
        if (chainHit != null) {
          // In basic liftOver, multiple hits are not allowed.
          return null;
        }
        chainHit = chain;
        targetIntersection = candidateIntersection;
      } else if (candidateIntersection != null) {
        LOG.info(
            "Interval "
                + interval.getName()
                + " failed to match chain "
                + chain.id
                + " because intersection length "
                + candidateIntersection.intersectionLength
                + " < minMatchSize "
                + minMatchSize
                + " ("
                + (candidateIntersection.intersectionLength / (float) interval.length())
                + " < "
                + liftOverMinMatch
                + ")");
      }
    }
    if (chainHit == null) {
      // Can't be lifted over.
      return null;
    }

    return createToInterval(interval.getName(), targetIntersection);
  }

  public List<PartialLiftover> diagnosticLiftover(final Interval interval) {
    final List<PartialLiftover> ret = new ArrayList<PartialLiftover>();
    if (interval.length() == 0) {
      throw new IllegalArgumentException(
          "Zero-length interval cannot be lifted over.  Interval: " + interval.getName());
    }
    for (final Chain chain : chains.getOverlaps(interval)) {
      Interval intersectingChain = interval.intersect(chain.interval);
      final TargetIntersection targetIntersection = targetIntersection(chain, intersectingChain);
      if (targetIntersection == null) {
        ret.add(new PartialLiftover(intersectingChain, chain.id));
      } else {
        Interval toInterval = createToInterval(interval.getName(), targetIntersection);
        float percentLiftedOver = targetIntersection.intersectionLength / (float) interval.length();
        ret.add(
            new PartialLiftover(
                intersectingChain, toInterval, targetIntersection.chain.id, percentLiftedOver));
      }
    }
    return ret;
  }

  private static Interval createToInterval(
      final String intervalName, final TargetIntersection targetIntersection) {
    // Compute the query interval given the offsets of the target interval start
    // and end into the first and last ContinuousBlocks.
    int toStart =
        targetIntersection.chain.getBlock(targetIntersection.firstBlockIndex).toStart
            + targetIntersection.startOffset;
    int toEnd =
        targetIntersection.chain.getBlock(targetIntersection.lastBlockIndex).getToEnd()
            - targetIntersection.offsetFromEnd;
    if (toEnd <= toStart || toStart < 0) {
      throw new PicardException("Something strange lifting over interval " + intervalName);
    }

    if (targetIntersection.chain.toNegativeStrand) {
      // Flip if query is negative.
      int negativeStart = targetIntersection.chain.toSequenceSize - toEnd;
      int negativeEnd = targetIntersection.chain.toSequenceSize - toStart;
      toStart = negativeStart;
      toEnd = negativeEnd;
    }
    // Convert to 1-based, inclusive.
    return new Interval(
        targetIntersection.chain.toSequenceName,
        toStart + 1,
        toEnd,
        targetIntersection.chain.toNegativeStrand,
        intervalName);
  }

  /**
   * Add up overlap btw the blocks in this chain and the given interval.
   *
   * @return Length of overlap, offsets into first and last ContinuousBlocks, and indices of first
   *     and last ContinuousBlocks.
   */
  private static TargetIntersection targetIntersection(final Chain chain, final Interval interval) {
    int intersectionLength = 0;
    // Convert interval to 0-based, half-open
    int start = interval.getStart() - 1;
    int end = interval.getEnd();
    int firstBlockIndex = -1;
    int lastBlockIndex = -1;
    int startOffset = -1;
    int offsetFromEnd = -1;
    List<Chain.ContinuousBlock> blockList = chain.getBlocks();
    for (int i = 0; i < blockList.size(); ++i) {
      final Chain.ContinuousBlock block = blockList.get(i);
      if (block.fromStart >= end) {
        break;
      } else if (block.getFromEnd() <= start) {
        continue;
      }
      if (firstBlockIndex == -1) {
        firstBlockIndex = i;
        if (start > block.fromStart) {
          startOffset = start - block.fromStart;
        } else {
          startOffset = 0;
        }
      }
      lastBlockIndex = i;
      if (block.getFromEnd() > end) {
        offsetFromEnd = block.getFromEnd() - end;
      } else {
        offsetFromEnd = 0;
      }
      int thisIntersection = Math.min(end, block.getFromEnd()) - Math.max(start, block.fromStart);
      if (thisIntersection <= 0) {
        throw new PicardException("Should have been some intersection.");
      }
      intersectionLength += thisIntersection;
    }
    if (intersectionLength == 0) {
      return null;
    }
    return new TargetIntersection(
        chain, intersectionLength, startOffset, offsetFromEnd, firstBlockIndex, lastBlockIndex);
  }

  /** Get minimum fraction of bases that must remap. */
  public double getLiftOverMinMatch() {
    return liftOverMinMatch;
  }

  /** Set minimum fraction of bases that must remap. */
  public void setLiftOverMinMatch(final double liftOverMinMatch) {
    this.liftOverMinMatch = liftOverMinMatch;
  }

  /** Value class returned by targetIntersection() */
  private static class TargetIntersection {
    /** Chain used for this intersection */
    final Chain chain;
    /** Total intersection length. */
    final int intersectionLength;
    /** Offset of target interval start in first block. */
    final int startOffset;
    /** Distance from target interval end to end of last block. */
    final int offsetFromEnd;
    /** Index of first ContinuousBlock matching interval. */
    final int firstBlockIndex;
    /** Index of last ContinuousBlock matching interval. */
    final int lastBlockIndex;

    TargetIntersection(
        final Chain chain,
        final int intersectionLength,
        final int startOffset,
        final int offsetFromEnd,
        final int firstBlockIndex,
        final int lastBlockIndex) {
      this.chain = chain;
      this.intersectionLength = intersectionLength;
      this.startOffset = startOffset;
      this.offsetFromEnd = offsetFromEnd;
      this.firstBlockIndex = firstBlockIndex;
      this.lastBlockIndex = lastBlockIndex;
    }
  }

  /** Represents a portion of a liftover operation, for use in diagnosing liftover failures. */
  public static class PartialLiftover {
    /** Intersection between "from" interval and "from" region of a chain. */
    final Interval fromInterval;
    /**
     * Result of lifting over fromInterval (with no percentage mapped requirement). This is null if
     * fromInterval falls entirely within a gap of the chain.
     */
    final Interval toInterval;
    /** id of chain used for this liftover */
    final int chainId;
    /**
     * Percentage of bases in fromInterval that lifted over. 0 if fromInterval is not covered by any
     * chain.
     */
    final float percentLiftedOver;

    PartialLiftover(
        final Interval fromInterval,
        final Interval toInterval,
        final int chainId,
        final float percentLiftedOver) {
      this.fromInterval = fromInterval;
      this.toInterval = toInterval;
      this.chainId = chainId;
      this.percentLiftedOver = percentLiftedOver;
    }

    PartialLiftover(final Interval fromInterval, final int chainId) {
      this.fromInterval = fromInterval;
      this.toInterval = null;
      this.chainId = chainId;
      this.percentLiftedOver = 0.0f;
    }

    @Override
    public String toString() {
      if (toInterval == null) {
        // Matched a chain, but entirely within a gap.
        return fromInterval.toString()
            + " (len "
            + fromInterval.length()
            + ")=>null using chain "
            + chainId;
      }
      final String strand = toInterval.isNegativeStrand() ? "-" : "+";
      return fromInterval.toString()
          + " (len "
          + fromInterval.length()
          + ")=>"
          + toInterval
          + "("
          + strand
          + ") using chain "
          + chainId
          + " ; pct matched "
          + percentLiftedOver;
    }
  }
}
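A minimal usage sketch for LiftOver, assuming the same net.sf.picard imports as the class above and a locally available UCSC chain file (the path and coordinates below are placeholders). Note that Interval is 1-based inclusive, per the class javadoc; the 0-based, half-open arithmetic stays internal.

class LiftOverSketch {
  public static void main(final String[] args) {
    // Placeholder chain file; any UCSC .over.chain file works here.
    final LiftOver liftOver = new LiftOver(new File("hg18ToHg19.over.chain"));
    final Interval source = new Interval("chr1", 1000000, 1000500);
    final Interval target = liftOver.liftOver(source);
    if (target == null) {
      // Fewer than liftOverMinMatch (default 0.95) of the bases remapped,
      // or the interval overlapped more than one matching chain.
      System.out.println("Cannot lift over " + source);
    } else {
      System.out.println(source + " -> " + target);
    }
  }
}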
/**
 * Replaces read groups in a BAM file
 *
 * @author mdepristo
 */
public class AddOrReplaceReadGroups extends CommandLineProgram {
  @Usage(programVersion = "1.0")
  public String USAGE =
      "Replaces all read groups in the INPUT file with a new read group and assigns "
          + "all reads to this read group in the OUTPUT BAM";

  @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file (bam or sam).")
  public File INPUT = null;

  @Option(
      shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      doc = "Output file (bam or sam).")
  public File OUTPUT = null;

  @Option(
      shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME,
      optional = true,
      doc =
          "Optional sort order to output in. If not supplied OUTPUT is in the same order as INPUT.")
  public SortOrder SORT_ORDER;

  @Option(shortName = "ID", doc = "Read Group ID")
  public String RGID = "1";

  @Option(shortName = "LB", doc = "Read Group Library")
  public String RGLB;

  @Option(shortName = "PL", doc = "Read Group platform (e.g. illumina, solid)")
  public String RGPL;

  @Option(shortName = "PU", doc = "Read Group platform unit (eg. run barcode)")
  public String RGPU;

  @Option(shortName = "SM", doc = "Read Group sample name")
  public String RGSM;

  @Option(shortName = "CN", doc = "Read Group sequencing center name", optional = true)
  public String RGCN;

  @Option(shortName = "DS", doc = "Read Group description", optional = true)
  public String RGDS;

  @Option(shortName = "DT", doc = "Read Group run date", optional = true)
  public Iso8601Date RGDT;

  private final Log log = Log.getInstance(AddOrReplaceReadGroups.class);

  /** Required main method implementation. */
  public static void main(final String[] argv) {
    new AddOrReplaceReadGroups().instanceMainWithExit(argv);
  }

  protected int doWork() {
    IoUtil.assertFileIsReadable(INPUT);
    IoUtil.assertFileIsWritable(OUTPUT);

    final SAMFileReader in = new SAMFileReader(INPUT);

    // create the read group we'll be using
    final SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID);
    rg.setLibrary(RGLB);
    rg.setPlatform(RGPL);
    rg.setSample(RGSM);
    rg.setPlatformUnit(RGPU);
    if (RGCN != null) rg.setSequencingCenter(RGCN);
    if (RGDS != null) rg.setDescription(RGDS);
    if (RGDT != null) rg.setRunDate(RGDT);

    log.info(
        String.format(
            "Created read group ID=%s PL=%s LB=%s SM=%s%n",
            rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample()));

    // create the new header and output file
    final SAMFileHeader inHeader = in.getFileHeader();
    final SAMFileHeader outHeader = inHeader.clone();
    outHeader.setReadGroups(Arrays.asList(rg));
    if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER);

    final SAMFileWriter outWriter =
        new SAMFileWriterFactory()
            .makeSAMOrBAMWriter(
                outHeader, outHeader.getSortOrder() == inHeader.getSortOrder(), OUTPUT);

    final ProgressLogger progress = new ProgressLogger(log);
    for (final SAMRecord read : in) {
      read.setAttribute(SAMTag.RG.name(), RGID);
      outWriter.addAlignment(read);
      progress.record(read);
    }

    // cleanup
    in.close();
    outWriter.close();
    return 0;
  }
}
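A hypothetical way to drive the tool above programmatically, calling instanceMain (which instanceMainWithExit wraps) directly; the paths and read-group values are placeholders, and the option names are the shortName values declared above.

class AddOrReplaceReadGroupsSketch {
  public static void main(final String[] args) {
    // Each element is one OPTION=value pair, exactly as on the command line.
    final int rc = new AddOrReplaceReadGroups().instanceMain(new String[] {
      "INPUT=in.bam",
      "OUTPUT=out.bam",
      "ID=run42.lane1",
      "LB=lib1",
      "PL=illumina",
      "PU=run42.lane1.ACGT",
      "SM=sampleA"
    });
    System.exit(rc);
  }
}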
/**
 * Command line program to read non-duplicate insert sizes, create a histogram and report
 * distribution statistics.
 *
 * @author Doug Voet (dvoet at broadinstitute dot org)
 */
public class CollectInsertSizeMetrics extends SinglePassSamProgram {
  private static final Log log = Log.getInstance(CollectInsertSizeMetrics.class);
  private static final String HISTOGRAM_R_SCRIPT = "net/sf/picard/analysis/insertSizeHistogram.R";
  // Usage and parameters
  @Usage
  public String USAGE =
      getStandardUsagePreamble()
          + "Reads a SAM or BAM file and writes a file containing metrics about "
          + "the statistical distribution of insert size (excluding duplicates) "
          + "and generates a histogram plot.\n";

  @Option(shortName = "H", doc = "File to write insert size histogram chart to.")
  public File HISTOGRAM_FILE;

  @Option(
      doc =
          "Generate mean, sd and plots by trimming the data down to MEDIAN + DEVIATIONS*MEDIAN_ABSOLUTE_DEVIATION. "
              + "This is done because insert size data typically includes enough anomalous values from chimeras and other "
              + "artifacts to make the mean and sd grossly misleading regarding the real distribution.")
  public double DEVIATIONS = 10;

  @Option(
      shortName = "W",
      doc =
          "Explicitly sets the histogram width, overriding automatic truncation of histogram tail. "
              + "Also, when calculating mean and standard deviation, only bins <= HISTOGRAM_WIDTH will be included.",
      optional = true)
  public Integer HISTOGRAM_WIDTH = null;

  @Option(
      shortName = "M",
      doc =
          "When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this "
              + "percentage of overall reads. (Range: 0 to 1).")
  public float MINIMUM_PCT = 0.05f;

  @Option(shortName = "LEVEL", doc = "The level(s) at which to accumulate metrics.  ")
  public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL =
      CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);

  // Calculates InsertSizeMetrics for all METRIC_ACCUMULATION_LEVELs provided
  private InsertSizeMetricsCollector multiCollector;

  /** Required main method implementation. */
  public static void main(final String[] argv) {
    new CollectInsertSizeMetrics().instanceMainWithExit(argv);
  }

  /**
   * Put any custom command-line validation in an override of this method. clp is initialized at
   * this point and can be used to print usage and access argv. Any options set by command-line
   * parser can be validated.
   *
   * @return null if command line is valid. If command line is invalid, returns an array of error
   *     message to be written to the appropriate place.
   */
  @Override
  protected String[] customCommandLineValidation() {
    if (MINIMUM_PCT < 0 || MINIMUM_PCT > 0.5) {
      return new String[] {
        "MINIMUM_PCT was set to "
            + MINIMUM_PCT
            + ". It must be between 0 and 0.5 so all data categories don't get discarded."
      };
    }

    return super.customCommandLineValidation();
  }

  @Override
  protected boolean usesNoRefReads() {
    return false;
  }

  @Override
  protected void setup(final SAMFileHeader header, final File samFile) {
    IoUtil.assertFileIsWritable(OUTPUT);
    IoUtil.assertFileIsWritable(HISTOGRAM_FILE);

    // Delegate actual collection to InsertSizeMetricCollector
    multiCollector =
        new InsertSizeMetricsCollector(
            METRIC_ACCUMULATION_LEVEL,
            header.getReadGroups(),
            MINIMUM_PCT,
            HISTOGRAM_WIDTH,
            DEVIATIONS);
  }

  @Override
  protected void acceptRead(final SAMRecord record, final ReferenceSequence ref) {
    multiCollector.acceptRecord(record, ref);
  }

  @Override
  protected void finish() {
    multiCollector.finish();

    final MetricsFile<InsertSizeMetrics, Integer> file = getMetricsFile();
    multiCollector.addAllLevelsToFile(file);

    if (file.getNumHistograms() == 0) {
      // can happen if user sets MINIMUM_PCT = 0.5, etc.
      log.warn(
          "All data categories were discarded because they contained < "
              + MINIMUM_PCT
              + " of the total aligned paired data.");
      final InsertSizeMetricsCollector.PerUnitInsertSizeMetricsCollector allReadsCollector =
          (InsertSizeMetricsCollector.PerUnitInsertSizeMetricsCollector)
              multiCollector.getAllReadsCollector();
      log.warn(
          "Total mapped pairs in all categories: "
              + (allReadsCollector == null
                  ? "null" // no ALL_READS collector was configured
                  : allReadsCollector.getTotalInserts()));
    } else {
      file.write(OUTPUT);

      final int rResult;
      if (HISTOGRAM_WIDTH == null) {
        rResult =
            RExecutor.executeFromClasspath(
                HISTOGRAM_R_SCRIPT,
                OUTPUT.getAbsolutePath(),
                HISTOGRAM_FILE.getAbsolutePath(),
                INPUT.getName());
      } else {
        rResult =
            RExecutor.executeFromClasspath(
                HISTOGRAM_R_SCRIPT,
                OUTPUT.getAbsolutePath(),
                HISTOGRAM_FILE.getAbsolutePath(),
                INPUT.getName(),
                String.valueOf(HISTOGRAM_WIDTH));
        // HISTOGRAM_WIDTH is passed because R automatically sets the histogram width
        // to the last bin that has data, which may be less than HISTOGRAM_WIDTH and
        // confuse the user.
      }

      if (rResult != 0) {
        throw new PicardException(
            "R script " + HISTOGRAM_R_SCRIPT + " failed with return code " + rResult);
      }
    }
  }
}
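The same pattern applies to CollectInsertSizeMetrics; a hedged sketch with made-up file names, where H and W are the shortNames of HISTOGRAM_FILE and HISTOGRAM_WIDTH declared above.

class CollectInsertSizeMetricsSketch {
  public static void main(final String[] args) {
    final int rc = new CollectInsertSizeMetrics().instanceMain(new String[] {
      "INPUT=in.bam",
      "OUTPUT=insert_size_metrics.txt",
      "H=insert_size_histogram.pdf", // HISTOGRAM_FILE
      "W=800"                        // HISTOGRAM_WIDTH, optional
    });
    System.exit(rc);
  }
}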
/**
 * This class reads an Illumina control file.
 *
 * @author Guoying Qi
 * @author Staffan Living
 */
public class ControlFileReader extends IlluminaFileReader {

  private final Log log = Log.getInstance(ControlFileReader.class);

  private final int EXPECTED_CONTROL_VERSION = 2;
  private int currentCluster = 0;
  private int totalClusters = 0;
  private int currentControlClusters = 0;

  /**
   * @param controlFileName control file name
   * @throws Exception
   */
  public ControlFileReader(String controlFileName) throws Exception {

    super(controlFileName);
    this.readFileHeader();
  }

  /** @throws Exception if the header cannot be read or the version is unexpected */
  private void readFileHeader() throws Exception {

    // The first four bytes should be zero in the new control-file format.
    // A non-zero value indicates an old-format file, kept for backward compatibility.
    int emptyBytes = this.readFourBytes(inputStream);
    if (emptyBytes != 0) {

      log.warn(
          "The first four bytes are not zero: "
              + emptyBytes
              + ". This is an old format control file.");
      this.totalClusters = emptyBytes;
      return;
    }

    // The next four bytes hold the version, which must match the expected value.
    int version = this.readFourBytes(inputStream);
    if (version != this.EXPECTED_CONTROL_VERSION) {
      log.error("Unexpected version byte: " + version);
      throw new Exception("Unexpected version number in control file");
    }

    // next four bytes should be the total number of clusters
    this.totalClusters = this.readFourBytes(inputStream);
    log.info("The total number of clusters: " + this.getTotalClusters());
  }

  @Override
  public boolean hasNext() {

    return this.getCurrentCluster() < this.getTotalClusters();
  }

  @Override
  public Object next() {

    try {
      // Note: readUnsignedShort() on a DataInputStream throws EOFException at end
      // of stream rather than returning -1, so the guard below is defensive only.
      int nextByte = this.inputStream.readUnsignedShort();

      if (nextByte == -1) {
        log.warn(
            "There is no more cluster in Control file after cluster "
                + this.getCurrentCluster()
                + " in file "
                + this.getFileName());
        return null;
      }

      this.currentCluster++;
      /*
      Bit0: always empty (0)
      Bit1: was the read identified as a control?
      Bit2: was the match ambiguous?
      Bit3: did the read match the phiX tag?
      Bit4: did the read align to match the phiX tag?
      Bit5: did the read match the control index sequence? (specified in controls.fasta, TGTCACA)
      Bits6,7: reserved for future use
      Bits8..15: the report key for the matched record in the controls.fasta file (specified by the REPORT_KEY metadata)
      */
      nextByte = nextByte & 0x2; // keep Bit1: was this cluster identified as a control?
      if (nextByte != 0) {
        this.currentControlClusters++;
      }

      return Integer.valueOf(nextByte);

    } catch (IOException ex) {
      log.error(ex, "Problem to read control file");
    }

    return null;
  }

  /** @return the currentCluster */
  public int getCurrentCluster() {
    return currentCluster;
  }

  /** @return the totalClusters */
  public int getTotalClusters() {
    return totalClusters;
  }

  /** @return the currentControlClusters */
  public int getCurrentControlClusters() {
    return currentControlClusters;
  }

  public static void main(String[] args) throws Exception {

    String controlFileName =
        "testdata/110323_HS13_06000_B_B039WABXX/Data/Intensities/BaseCalls/L001/s_1_1101.control";
    if (args.length > 0 && args[0] != null) {
      controlFileName = args[0];
    }

    ControlFileReader control = new ControlFileReader(controlFileName);

    int numberControlCluster = 0;
    while (control.hasNext()) {
      int nextCluster = (Integer) control.next();

      if (nextCluster != 0) {
        numberControlCluster++;
      }
    }
    System.out.println(numberControlCluster);
    System.out.println(control.getCurrentCluster());
    System.out.println(control.getCurrentControlClusters());

    // control.next();
  }
}
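A self-contained sketch of the bit arithmetic next() performs on each 16-bit control word. The sample values are invented, but the masks follow the bit layout documented in next(): bit 1 flags a control cluster and bits 8..15 carry the report key.

class ControlWordSketch {
  public static void main(final String[] args) {
    final int[] words = {0x0000, 0x0002, 0x0006, 0x0302};
    for (final int word : words) {
      final boolean isControl = (word & 0x2) != 0; // Bit1: control cluster?
      final int reportKey = (word >> 8) & 0xFF;    // Bits8..15: report key
      System.out.printf("word=0x%04X control=%b reportKey=%d%n",
          word, isControl, reportKey);
    }
  }
}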
  private void read(InputStream in, String filename) throws IOException {
    // Pattern comma=Pattern.compile("[,]");
    Pattern pipe = Pattern.compile("[\\|]");
    Pattern amp = Pattern.compile("&");

    out.println("insert into FILE" + SUFFIX + "(filename) values (" + quote(filename) + ");");
    VcfIterator r = new VcfIterator(in);

    VCFHeader header = r.getHeader();

    String csqColumns[] = null;
    VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine("CSQ");
    if (infoHeader != null && this.USE_VEP) {
      LOG.info("parsing VEP " + infoHeader.getDescription());
      final String formatStr = "Format: ";
      int i = infoHeader.getDescription().indexOf(formatStr);
      if (i != -1) {
        csqColumns =
            pipe.split(infoHeader.getDescription().substring(i + formatStr.length()).trim());
        LOG.debug(Arrays.asList(csqColumns));
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }
    String snpEffColumns[] = null;
    infoHeader = header.getInfoHeaderLine("EFF");
    if (infoHeader != null && this.USE_SNPEFF) {
      LOG.info("parsing EFF " + infoHeader.getDescription());

      final String formatStr = ".Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      if (i != -1) i = desc.indexOf('(', i + formatStr.length());
      int j = desc.lastIndexOf(')');
      if (i != -1 && j > i) {
        snpEffColumns =
            pipe.split(desc.substring(i + 1, j).replaceAll("[ \\[\\]()\\.]", "").trim());
        LOG.info(Arrays.asList(snpEffColumns));
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    String nmdColumns[] = null;
    infoHeader = header.getInfoHeaderLine("NMD");
    if (infoHeader != null && this.USE_SNPEFF) {

      final String formatStr = " Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

      if (i != -1 && j > i) {
        nmdColumns =
            pipe.split(
                desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim());
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    String lofColumns[] = null;
    infoHeader = header.getInfoHeaderLine("LOF");
    if (infoHeader != null && this.USE_SNPEFF) {

      final String formatStr = " Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

      if (i != -1 && j > i) {
        lofColumns =
            pipe.split(
                desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim());
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    for (String S : header.getSampleNamesInOrder()) {
      // hsql upsert pattern: merge into SAMPLE using (values(...)) as vals(y)
      // on SAMPLE.name = vals.y when not matched then insert values (NULL, vals.y);
      switch (this.engine) {
        case hsql:
          out.println(
              "merge into SAMPLE"
                  + SUFFIX
                  + " using ( values("
                  + quote(S)
                  + ") ) "
                  + "AS vals(y) ON SAMPLE"
                  + SUFFIX
                  + ".name = vals.y "
                  + "WHEN NOT MATCHED THEN INSERT VALUES  (NULL,vals.y);");
          break;
        default:
          out.println(
              "insert or ignore into SAMPLE" + SUFFIX + "(name) values (" + quote(S) + ");");
          break;
      }
    }

    List<String> headers = new ArrayList<String>();

    for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) {
      if (VCFHeaderVersion.isFormatString(line.getKey())) continue;
      headers.add(VCFHeader.METADATA_INDICATOR + line);
    }

    String chromLine = VCFHeader.HEADER_INDICATOR;
    for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
      if (!VCFHeader.HEADER_INDICATOR.equals(chromLine))
        chromLine += (VCFConstants.FIELD_SEPARATOR);
      chromLine += (field);
    }

    if (header.hasGenotypingData()) {
      chromLine += VCFConstants.FIELD_SEPARATOR + "FORMAT";
      for (String sample : header.getGenotypeSamples()) {
        chromLine += VCFConstants.FIELD_SEPARATOR;
        chromLine += sample;
      }
    }
    headers.add(chromLine);

    for (String line : headers) {
      out.println(
          "insert into HEADER"
              + SUFFIX
              + "(file_id,header) values ("
              + "(select max(id) from FILE"
              + SUFFIX
              + "),"
              + quote(line)
              + ");");
    }

    while (r.hasNext()) {
      VariantContext var = r.next();

      if (var == null) {
        LOG.error("Cannot parse VCF");
        continue;
      }
      // "create table if not exists FILE(id,filename text)";
      // "create table if not exists VARIATION(id,file_id,chrom,pos,start0,end0,rs_id,ref,qual)";

      out.println(
          "insert into VARIATION"
              + SUFFIX
              + "(file_id,chrom,pos,START0,END0,rs_id,ref,qual) values ("
              + "(select max(id) from FILE"
              + SUFFIX
              + "),"
              + quote(var.getChr())
              + ","
              + var.getStart()
              + ","
              + (var.getStart() - 1)
              + ","
              + var.getEnd()
              + ","
              + (var.getID() == null || var.getID().equals(VCFConstants.EMPTY_ID_FIELD)
                  ? "NULL"
                  : quote(var.getID()))
              + ","
              + quote(var.getReference().getDisplayString())
              + ","
              + (var.getPhredScaledQual() < 0 ? "NULL" : var.getPhredScaledQual())
              + ");");
      // "create table if not exists ALT(id,var_id,alt)";

      for (Allele alt : var.getAlternateAlleles()) {
        out.println(
            "insert into ALT"
                + SUFFIX
                + "(var_id,alt) values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + quote(alt.getDisplayString())
                + ");");
      }
      // "create table if not exists FILTER(id,var_id,filter)";

      for (String filter : var.getFilters()) {
        out.println(
            "insert into FILTER"
                + SUFFIX
                + "(var_id,filter) values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + quote(filter)
                + ");");
      }
      CommonInfo infos = var.getCommonInfo();
      for (String key : infos.getAttributes().keySet()) {
        Object val = infos.getAttribute(key);
        // "create table if not exists INFO(id,var_id,k,v)";

        if (SPLIT4 && key.equals("DP4")) {
          String dp4[] = infotoString(val).split("[,]");
          insertIntoInfo(quote(key + "[refFor]"), quote(dp4[0]));
          insertIntoInfo(quote(key + "[refRev]"), quote(dp4[1]));
          insertIntoInfo(quote(key + "[altFor]"), quote(dp4[2]));
          insertIntoInfo(quote(key + "[altRev]"), quote(dp4[3]));
        } else {
          insertIntoInfo(quote(key), quote(infotoString(val)));
        }

        if (key.equals("CSQ") && csqColumns != null) {
          List as_array = castToStringArray(val);

          for (Object csqs : as_array) {
            if (csqs.toString().isEmpty()) continue;
            String tokens[] = pipe.split(csqs.toString());
            List<String> extraInfo = new ArrayList<String>();
            for (int t = 0; t < tokens.length && t < csqColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              if (csqColumns[t].equals("Consequence")) {
                for (String pred : amp.split(tokens[t])) {
                  if (pred.isEmpty()) continue;
                  extraInfo.add(csqColumns[t]);
                  extraInfo.add(pred);
                }

              } else {
                extraInfo.add(csqColumns[t]);
                extraInfo.add(tokens[t]);
              }
            }
            insertExtraInfos("CSQ", extraInfo);
          }
        }

        if (key.equals("EFF") && snpEffColumns != null) {
          for (Object item : castToStringArray(val)) {
            String snpeff = item.toString();
            if (snpeff.isEmpty()) continue;
            int opar = snpeff.indexOf('(');
            if (opar == -1) continue;
            int cpar = snpeff.lastIndexOf(')');
            if (cpar == -1) continue;
            String tokens[] = pipe.split(snpeff.substring(opar + 1, cpar));
            List<String> h = new ArrayList<String>();
            h.add("Effect");
            h.add(snpeff.substring(0, opar));
            for (int t = 0; t < tokens.length && t < snpEffColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(snpEffColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }

        if (key.equals("NMD") && nmdColumns != null) {

          for (Object item : castToStringArray(val)) {
            String nmd = item.toString();
            if (nmd.isEmpty()) continue;
            String tokens[] = pipe.split(nmd);
            List<String> h = new ArrayList<String>(nmdColumns.length * 2);
            for (int t = 0; t < tokens.length && t < nmdColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(nmdColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }

        if (key.equals("LOF") && lofColumns != null) {

          for (Object item : castToStringArray(val)) {
            String lof = item.toString();
            if (lof.isEmpty()) continue;
            String tokens[] = pipe.split(lof);
            List<String> h = new ArrayList<String>(lofColumns.length * 2);
            for (int t = 0; t < tokens.length && t < lofColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(lofColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }
      }
      GenotypesContext genotypesCtx = var.getGenotypes();
      for (Genotype g : genotypesCtx) {
        // "create table if not exists GENOTYPE(id,var_id,k,v)";

        List<Allele> alleles = g.getAlleles();

        out.println(
            "insert into GENOTYPE"
                + SUFFIX
                + "(var_id,sample_id,A1,A2,dp,ad,gq,pl,"
                + "is_phased,is_hom,is_homref,is_homvar,is_mixed,"
                + "is_nocall,is_noninformative,is_available,is_called,is_filtered"
                + ") values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + "(select id from SAMPLE"
                + SUFFIX
                + " where name="
                + quote(g.getSampleName())
                + "),"
                + (alleles.size() == 2 ? quote(alleles.get(0).getBaseString()) : "NULL")
                + ","
                + (alleles.size() == 2 ? quote(alleles.get(1).getBaseString()) : "NULL")
                + ","
                + (g.hasDP() ? g.getDP() : "NULL")
                + ","
                + (g.hasAD() ? quote(infotoString(g.getAD())) : "NULL")
                + ","
                + (g.hasGQ() ? g.getGQ() : "NULL")
                + ","
                + (g.hasPL() ? quote(infotoString(g.getPL())) : "NULL")
                + ","
                + (g.isPhased() ? 1 : 0)
                + ","
                + (g.isHom() ? 1 : 0)
                + ","
                + (g.isHomRef() ? 1 : 0)
                + ","
                + (g.isHomVar() ? 1 : 0)
                + ","
                + (g.isMixed() ? 1 : 0)
                + ","
                + (g.isNoCall() ? 1 : 0)
                + ","
                + (g.isNonInformative() ? 1 : 0)
                + ","
                + (g.isAvailable() ? 1 : 0)
                + ","
                + (g.isCalled() ? 1 : 0)
                + ","
                + (g.isFiltered() ? 1 : 0)
                + ");");

        for (String key : g.getExtendedAttributes().keySet()) {
          Object val = g.getExtendedAttribute(key);
          if (val == null) continue;
          out.println(
              "insert into GTPROP"
                  + SUFFIX
                  + "(genotype_id,k,v) values ("
                  + "(select max(id) from GENOTYPE"
                  + SUFFIX
                  + "),"
                  + quote(key)
                  + ","
                  + quote(infotoString(val))
                  + ");");
        }
      }
    }
    r.close();
  }
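quote() is used throughout read() but defined outside this excerpt; the sketch below is an assumption of what it presumably does (render null as SQL NULL and double embedded single quotes), not the author's code.

  private static String quote(final String s) {
    if (s == null) return "NULL"; // assumed null handling
    final StringBuilder b = new StringBuilder("'");
    for (int i = 0; i < s.length(); i++) {
      final char c = s.charAt(i);
      if (c == '\'') b.append("''"); // escape by doubling, per SQL
      else b.append(c);
    }
    return b.append('\'').toString();
  }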
@SuppressWarnings("rawtypes")
public class VcfToSql extends CommandLineProgram {

  @Usage(programVersion = "1.0")
  public String USAGE =
      getStandardUsagePreamble()
          + "Creates the code to insert one or more VCF into a SQL database. ";

  @Option(
      shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
      doc = "VCF files to process.",
      minElements = 0)
  public List<File> IN = new ArrayList<File>();

  @Option(shortName = "SFX", doc = "Table suffix", optional = true)
  public String SUFFIX = "";

  @Option(shortName = "VEP", doc = "Use  and explode VEP predictions", optional = true)
  public boolean USE_VEP = true;

  @Option(shortName = "SNPEFF", doc = "Use and explode SNPEFF predictions", optional = true)
  public boolean USE_SNPEFF = true;

  @Option(shortName = "SQLIDX", doc = "Create misc SQL Indexes.", optional = true)
  public boolean SQLINDEX = true;

  @Option(shortName = "EGN", doc = "sql engine [sqlite,hsql]", optional = true)
  public String ENGINE = SQLEngine.sqlite.name();

  @Option(shortName = "S4", doc = "Split DP4", optional = true)
  public boolean SPLIT4 = false;

  private SQLEngine engine = SQLEngine.sqlite;

  private enum SQLEngine {
    sqlite,
    hsql
  };

  private static Log LOG = Log.getInstance(VcfToSql.class);

  private PrintWriter out = new PrintWriter(System.out);

  @Override
  public String getVersion() {
    return "1.0";
  }

  private String columnId() {
    switch (this.engine) {
      case hsql:
        return "id INTEGER GENERATED ALWAYS AS IDENTITY(START WITH 1, INCREMENT BY 1) PRIMARY KEY,";
      default:
        return "id INTEGER PRIMARY KEY AUTOINCREMENT,";
    }
  }

  private String varchar(int length) {
    switch (this.engine) {
      case hsql:
        return "VARCHAR(" + length + ")";
      default:
        return "TEXT";
    }
  }

  private String text() {
    switch (this.engine) {
      case hsql:
        return "LONGVARCHAR";
      default:
        return "TEXT";
    }
  }

  @Override
  protected int doWork() {
    try {
      try {
        this.engine = SQLEngine.valueOf(this.ENGINE);
      } catch (Exception err) {
        LOG.error("BAD SQL ENGINE " + this.ENGINE);
        return -1;
      }
      out.println(
          "create table if not exists FILE"
              + SUFFIX
              + "("
              + columnId()
              + "filename "
              + varchar(255)
              + " NOT NULL"
              + ");");

      out.println(
          "create table if not exists HEADER"
              + SUFFIX
              + "("
              + columnId()
              + "file_id INT NOT NULL REFERENCES FILE"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "header "
              + text()
              + ");");

      out.println(
          "create table if not exists SAMPLE"
              + SUFFIX
              + "("
              + columnId()
              + "name "
              + varchar(100)
              + " NOT NULL UNIQUE"
              + ");");
      out.println(
          "create table if not exists VARIATION"
              + SUFFIX
              + "("
              + columnId()
              + "file_id INT NOT NULL REFERENCES FILE"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "CHROM VARCHAR(20) NOT NULL,"
              + "POS INT NOT NULL,"
              + "START0 INT NOT NULL,"
              + "END0 INT NOT NULL,"
              + "RS_ID VARCHAR(50),"
              + "REF "
              + text()
              + " NOT NULL,"
              + "QUAL FLOAT"
              + ");");

      out.println(
          "create table if not exists ALT"
              + SUFFIX
              + "("
              + columnId()
              + "var_id INT NOT NULL REFERENCES VARIATION"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "ALT "
              + text()
              + ");");
      out.println(
          "create table if not exists FILTER"
              + SUFFIX
              + "("
              + columnId()
              + "var_id INT NOT NULL REFERENCES VARIATION"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "FILTER varchar(50) not null"
              + ");");

      out.println(
          "create table if not exists INFO"
              + SUFFIX
              + "("
              + columnId()
              + "var_id INT NOT NULL REFERENCES VARIATION"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "k varchar(50) not null,"
              + "v "
              + text()
              + " not null"
              + ");");

      out.println(
          "create table if not exists EXTRAINFO"
              + SUFFIX
              + "("
              + columnId()
              + "info_id INT NOT NULL REFERENCES INFO"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "type varchar(50) not null"
              + ");");

      out.println(
          "create table if not exists EXTRAINFOPROP"
              + SUFFIX
              + "("
              + columnId()
              + "extrainfo_id INT NOT NULL REFERENCES EXTRAINFO"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "k varchar(50) not null,"
              + "v "
              + text()
              + " not null"
              + ");");

      out.println(
          "create table if not exists GENOTYPE"
              + SUFFIX
              + "("
              + columnId()
              + "var_id INT NOT NULL REFERENCES VARIATION"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "sample_id INT NOT NULL REFERENCES SAMPLE"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "A1 "
              + text()
              + ", A2 "
              + text()
              + ", dp int, ad varchar(50), gq float,pl "
              + text()
              + ","
              + "is_phased SMALLINT not null,is_hom SMALLINT not null,is_homref  SMALLINT not null,is_homvar  SMALLINT not null,is_mixed  SMALLINT not null,"
              + "is_nocall SMALLINT not null,is_noninformative SMALLINT not null,is_available SMALLINT not null,is_called SMALLINT not null,is_filtered  SMALLINT not null"
              + ");");
      out.println(
          "create table if not exists GTPROP"
              + SUFFIX
              + "("
              + columnId()
              + "genotype_id INT NOT NULL REFERENCES GENOTYPE"
              + SUFFIX
              + "(id) ON DELETE CASCADE,"
              + "k varchar(50) not null,"
              + "v "
              + text()
              + " not null"
              + ");");
      switch (this.engine) {
        case sqlite:
          out.println("begin transaction;");
          break;
        default:
          break;
      }

      if (IN.isEmpty()) {
        LOG.info("reading from stdin");
        read(System.in, "<stdin>");
      } else {
        for (File input : IN) {
          LOG.info("opening " + input);
          InputStream in = IOUtils.openFileForReading(input);
          read(in, input.toString());
          in.close();
        }
      }
      if (SQLINDEX) {
        index("SAMPLE", "name");
        index("EXTRAINFO", "type");
        index("EXTRAINFOPROP", "k");
        index("EXTRAINFOPROP", "v");

        index("INFO", "var_id");
        index("INFO", "k");
        index("EXTRAINFO", "info_id");
        index("EXTRAINFOPROP", "extrainfo_id");
        index("GENOTYPE", "var_id");
        index("GENOTYPE", "sample_id");
      }
      switch (this.engine) {
        case sqlite:
          out.println("commit;");
          break;
        default:
          break;
      }

      out.flush();
    } catch (IOException err) {
      err.printStackTrace();
      return -1;
    }
    return 0;
  }
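  // Added commentary -- schema sketch recovered from the DDL above:
  //   FILE 1->n HEADER, FILE 1->n VARIATION,
  //   VARIATION 1->n ALT / FILTER / INFO / GENOTYPE,
  //   INFO 1->n EXTRAINFO 1->n EXTRAINFOPROP,
  //   GENOTYPE 1->n GTPROP, with SAMPLE referenced by GENOTYPE.sample_id.
  // Every child table declares ON DELETE CASCADE, so deleting a FILE row
  // removes everything loaded from that file.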

  /** Emits a CREATE INDEX statement for the given table/column pair. */
  private void index(String table, String column) {
    out.print("create index ");

    switch (this.engine) {
      case hsql:
        break;
      default:
        out.print(" if not exists ");
        break;
    }

    out.print(
        " "
            + (table + SUFFIX + "_" + column + "_IDX").toUpperCase()
            + " on "
            + table
            + SUFFIX
            + "("
            + column
            + ");");
  }
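  // Added commentary -- with an empty SUFFIX and a non-hsql engine,
  // index("SAMPLE", "name") prints roughly:
  //   create index if not exists SAMPLE_NAME_IDX on SAMPLE(name);
  // The hsql branch omits "if not exists", presumably because the targeted
  // HSQLDB version did not support that clause.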

  /** Reads one VCF stream and writes the corresponding INSERT statements to 'out'. */
  private void read(InputStream in, String filename) throws IOException {
    Pattern pipe = Pattern.compile("[\\|]");
    Pattern amp = Pattern.compile("&");

    out.println("insert into FILE" + SUFFIX + "(filename) values (" + quote(filename) + ");");
    VcfIterator r = new VcfIterator(in);

    VCFHeader header = r.getHeader();

    String[] csqColumns = null;
    VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine("CSQ");
    if (infoHeader != null && this.USE_VEP) {
      LOG.info("parsing VEP " + infoHeader.getDescription());
      final String formatStr = "Format: ";
      int i = infoHeader.getDescription().indexOf(formatStr);
      if (i != -1) {
        csqColumns =
            pipe.split(infoHeader.getDescription().substring(i + formatStr.length()).trim());
        LOG.debug(Arrays.asList(csqColumns));
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }
    String[] snpEffColumns = null;
    infoHeader = header.getInfoHeaderLine("EFF");
    if (infoHeader != null && this.USE_SNPEFF) {
      LOG.info("parsing EFF " + infoHeader.getDescription());

      final String formatStr = ".Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      if (i != -1) i = desc.indexOf('(', i + formatStr.length());
      int j = desc.lastIndexOf(')');
      if (i != -1 && j > i) {
        snpEffColumns =
            pipe.split(desc.substring(i + 1, j).replaceAll("[ \\[\\]()\\.]", "").trim());
        LOG.info(Arrays.asList(snpEffColumns));
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    String[] nmdColumns = null;
    infoHeader = header.getInfoHeaderLine("NMD");
    if (infoHeader != null && this.USE_SNPEFF) {

      final String formatStr = " Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

      if (i != -1 && j > i) {
        nmdColumns =
            pipe.split(
                desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim());
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    String[] lofColumns = null;
    infoHeader = header.getInfoHeaderLine("LOF");
    if (infoHeader != null && this.USE_SNPEFF) {

      final String formatStr = " Format: '";
      final String desc = infoHeader.getDescription();
      int i = desc.indexOf(formatStr);
      int j = (i == -1 ? -1 : desc.lastIndexOf('\''));

      if (i != -1 && j > i) {
        lofColumns =
            pipe.split(
                desc.substring(i + formatStr.length(), j).replaceAll("[ \\[\\]()\\.]", "").trim());
      } else {
        LOG.error("Cannot parse " + infoHeader.getDescription());
      }
    }

    for (String S : header.getSampleNamesInOrder()) {
      // merge into SAMPLE using (select 1+MAX(id),'azdazd' from SAMPLE) as vals(x,y) on
      // SAMPLE.name=vals.y when  NOT MATCHED THEN INSERT VALUES vals.x,vals.y;
      switch (this.engine) {
        case hsql:
          out.println(
              "merge into SAMPLE"
                  + SUFFIX
                  + " using ( values("
                  + quote(S)
                  + ") ) "
                  + "AS vals(y) ON SAMPLE"
                  + SUFFIX
                  + ".name = vals.y "
                  + "WHEN NOT MATCHED THEN INSERT VALUES  (NULL,vals.y);");
          break;
        default:
          out.println(
              "insert or ignore into SAMPLE" + SUFFIX + "(name) values (" + quote(S) + ");");
          break;
      }
    }

    List<String> headers = new ArrayList<String>();

    for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) {
      if (VCFHeaderVersion.isFormatString(line.getKey())) continue;
      headers.add(VCFHeader.METADATA_INDICATOR + line);
    }

    String chromLine = VCFHeader.HEADER_INDICATOR;
    for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
      if (!VCFHeader.HEADER_INDICATOR.equals(chromLine))
        chromLine += (VCFConstants.FIELD_SEPARATOR);
      chromLine += (field);
    }

    if (header.hasGenotypingData()) {
      chromLine += VCFConstants.FIELD_SEPARATOR + "FORMAT";
      for (String sample : header.getGenotypeSamples()) {
        chromLine += VCFConstants.FIELD_SEPARATOR;
        chromLine += sample;
      }
    }
    headers.add(chromLine);

    for (String line : headers) {
      out.println(
          "insert into HEADER"
              + SUFFIX
              + "(file_id,header) values ("
              + "(select max(id) from FILE"
              + SUFFIX
              + "),"
              + quote(line)
              + ");");
    }

    while (r.hasNext()) {
      VariantContext var = r.next();

      if (var == null) {
        LOG.error("Cannot parse VCF");
        continue;
      }
      // "create table if not exists FILE(id,filename text)";
      // "create table if not exists VARIATION(id,file_id,chrom,pos,start0,end0,rs_id,ref,qual)";

      out.println(
          "insert into VARIATION"
              + SUFFIX
              + "(file_id,chrom,pos,START0,END0,rs_id,ref,qual) values ("
              + "(select max(id) from FILE"
              + SUFFIX
              + "),"
              + quote(var.getChr())
              + ","
              + var.getStart()
              + ","
              + (var.getStart() - 1)
              + ","
              + var.getEnd()
              + ","
              + (var.getID() == null || var.getID().equals(VCFConstants.EMPTY_ID_FIELD)
                  ? "NULL"
                  : quote(var.getID()))
              + ","
              + quote(var.getReference().getDisplayString())
              + ","
              + (var.getPhredScaledQual() < 0 ? "NULL" : var.getPhredScaledQual())
              + ");");
      // "create table if not exists ALT(id,var_id,alt)";

      for (Allele alt : var.getAlternateAlleles()) {
        out.println(
            "insert into ALT"
                + SUFFIX
                + "(var_id,alt) values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + quote(alt.getDisplayString())
                + ");");
      }
      // "create table if not exists FILTER(id,var_id,filter)";

      for (String filter : var.getFilters()) {
        out.println(
            "insert into FILTER"
                + SUFFIX
                + "(var_id,filter) values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + quote(filter)
                + ");");
      }
      CommonInfo infos = var.getCommonInfo();
      for (String key : infos.getAttributes().keySet()) {
        Object val = infos.getAttribute(key);
        // "create table if not exists INFO(id,var_id,k,v)";

        if (SPLIT4 && key.equals("DP4")) {
          // DP4 holds four comma-separated counts: ref-forward, ref-reverse,
          // alt-forward, alt-reverse. Guard against malformed values rather
          // than risking an ArrayIndexOutOfBoundsException.
          String[] dp4 = infotoString(val).split("[,]");
          if (dp4.length == 4) {
            insertIntoInfo(quote(key + "[refFor]"), quote(dp4[0]));
            insertIntoInfo(quote(key + "[refRev]"), quote(dp4[1]));
            insertIntoInfo(quote(key + "[altFor]"), quote(dp4[2]));
            insertIntoInfo(quote(key + "[altRev]"), quote(dp4[3]));
          } else {
            insertIntoInfo(quote(key), quote(infotoString(val)));
          }
        } else {
          insertIntoInfo(quote(key), quote(infotoString(val)));
        }

        if (key.equals("CSQ") && csqColumns != null) {
          List<?> asArray = castToStringArray(val);

          for (Object csqs : asArray) {
            if (csqs.toString().isEmpty()) continue;
            String[] tokens = pipe.split(csqs.toString());
            List<String> extraInfo = new ArrayList<String>();
            for (int t = 0; t < tokens.length && t < csqColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              if (csqColumns[t].equals("Consequence")) {
                for (String pred : amp.split(tokens[t])) {
                  if (pred.isEmpty()) continue;
                  extraInfo.add(csqColumns[t]);
                  extraInfo.add(pred);
                }

              } else {
                extraInfo.add(csqColumns[t]);
                extraInfo.add(tokens[t]);
              }
            }
            insertExtraInfos("CSQ", extraInfo);
          }
        }

        if (key.equals("EFF") && snpEffColumns != null) {
          for (Object item : castToStringArray(val)) {
            String snpeff = item.toString();
            if (snpeff.isEmpty()) continue;
            int opar = snpeff.indexOf('(');
            if (opar == -1) continue;
            int cpar = snpeff.lastIndexOf(')');
            if (cpar == -1) continue;
            String[] tokens = pipe.split(snpeff.substring(opar + 1, cpar));
            List<String> h = new ArrayList<String>();
            h.add("Effect");
            h.add(snpeff.substring(0, opar));
            for (int t = 0; t < tokens.length && t < snpEffColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(snpEffColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }

        if (key.equals("NMD") && nmdColumns != null) {

          for (Object item : castToStringArray(val)) {
            String nmd = item.toString();
            if (nmd.isEmpty()) continue;
            String[] tokens = pipe.split(nmd);
            List<String> h = new ArrayList<String>(nmdColumns.length * 2);
            for (int t = 0; t < tokens.length && t < nmdColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(nmdColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }

        if (key.equals("LOF") && lofColumns != null) {

          for (Object item : castToStringArray(val)) {
            String lof = item.toString();
            if (lof.isEmpty()) continue;
            String[] tokens = pipe.split(lof);
            List<String> h = new ArrayList<String>(lofColumns.length * 2);
            for (int t = 0; t < tokens.length && t < lofColumns.length; ++t) {
              if (tokens[t].isEmpty()) continue;
              h.add(lofColumns[t]);
              h.add(tokens[t]);
            }
            insertExtraInfos(key, h);
          }
        }
      }
      GenotypesContext genotypesCtx = var.getGenotypes();
      for (Genotype g : genotypesCtx) {
        // "create table if not exists GENOTYPE(id,var_id,k,v)";

        List<Allele> alleles = g.getAlleles();

        out.println(
            "insert into GENOTYPE"
                + SUFFIX
                + "(var_id,sample_id,A1,A2,dp,ad,gq,pl,"
                + "is_phased,is_hom,is_homref,is_homvar,is_mixed,"
                + "is_nocall,is_noninformative,is_available,is_called,is_filtered"
                + ") values ("
                + "(select max(id) from VARIATION"
                + SUFFIX
                + "),"
                + "(select id from SAMPLE"
                + SUFFIX
                + " where name="
                + quote(g.getSampleName())
                + "),"
                // only diploid calls are stored; other ploidies yield NULL alleles
                + (alleles.size() == 2 ? quote(alleles.get(0).getBaseString()) : "NULL")
                + ","
                + (alleles.size() == 2 ? quote(alleles.get(1).getBaseString()) : "NULL")
                + ","
                + (g.hasDP() ? g.getDP() : "NULL")
                + ","
                + (g.hasAD() ? quote(infotoString(g.getAD())) : "NULL")
                + ","
                + (g.hasGQ() ? g.getGQ() : "NULL")
                + ","
                + (g.hasPL() ? quote(infotoString(g.getPL())) : "NULL")
                + ","
                + (g.isPhased() ? 1 : 0)
                + ","
                + (g.isHom() ? 1 : 0)
                + ","
                + (g.isHomRef() ? 1 : 0)
                + ","
                + (g.isHomVar() ? 1 : 0)
                + ","
                + (g.isMixed() ? 1 : 0)
                + ","
                + (g.isNoCall() ? 1 : 0)
                + ","
                + (g.isNonInformative() ? 1 : 0)
                + ","
                + (g.isAvailable() ? 1 : 0)
                + ","
                + (g.isCalled() ? 1 : 0)
                + ","
                + (g.isFiltered() ? 1 : 0)
                + ");");

        for (String key : g.getExtendedAttributes().keySet()) {
          Object val = g.getExtendedAttribute(key);
          if (val == null) continue;
          out.println(
              "insert into GTPROP"
                  + SUFFIX
                  + "(genotype_id,k,v) values ("
                  + "(select max(id) from GENOTYPE"
                  + SUFFIX
                  + "),"
                  + quote(key)
                  + ","
                  + quote(infotoString(val))
                  + ");");
        }
      }
    }
    r.close();
  }
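  // Added commentary -- every child INSERT above links to its parent with
  // "(select max(id) from PARENT)". This works only because the script is
  // generated, and meant to be executed, strictly sequentially: the most
  // recently inserted parent row is always the one the child belongs to.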

  /** Escapes a string as a single-quoted SQL literal, doubling any embedded quote. */
  private String quote(String s) {
    if (s == null) return "NULL";
    StringBuilder b = new StringBuilder();
    b.append("\'");
    for (int i = 0; i < s.length(); ++i) {
      char c = s.charAt(i);
      switch (c) {
        case '\'':
          b.append("''");
          break;
        default:
          b.append(c);
          break;
      }
    }
    b.append("\'");
    return b.toString();
  }
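  // Added commentary -- quote("it's") returns 'it''s' and quote(null) returns
  // the bare keyword NULL, following the SQL convention of doubling embedded
  // single quotes.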

  /** Writes one EXTRAINFO row of the given type, then one EXTRAINFOPROP row per (key,value) pair in the flat list h. */
  private void insertExtraInfos(String type, List<String> h) {
    boolean first = true;
    for (int i = 0; i + 1 < h.size(); i += 2) {
      if (h.get(i + 1).isEmpty()) continue;
      if (first) {

        out.println(
            "insert into EXTRAINFO"
                + SUFFIX
                + "(info_id,type) values ("
                + "(select max(id) from INFO"
                + SUFFIX
                + "),"
                + quote(type)
                + ");");
        first = false;
      }

      out.println(
          "insert into EXTRAINFOPROP"
              + SUFFIX
              + "(extrainfo_id,k,v) values ("
              + "(select max(id) from EXTRAINFO"
              + SUFFIX
              + "),"
              + quote(h.get(i))
              + ","
              + quote(h.get(i + 1))
              + ");");
    }
  }
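  // Added commentary -- a call such as (hypothetical values)
  //   insertExtraInfos("CSQ", Arrays.asList("Gene", "BRCA1", "Consequence", "missense_variant"))
  // emits one EXTRAINFO row typed 'CSQ' plus one EXTRAINFOPROP row per
  // non-empty key/value pair; the EXTRAINFO row is skipped entirely when
  // every value is empty.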

  /** Returns the value as a List; a scalar INFO value is wrapped in a one-element list. */
  @SuppressWarnings("unchecked")
  private List castToStringArray(Object val) {
    if (val instanceof List) {
      return (List) val;
    } else {
      return new ArrayList(Collections.singleton(val.toString()));
    }
  }
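  // Added commentary -- an INFO attribute that htsjdk already parsed as a List
  // is returned as-is; a scalar such as Integer 42 becomes the one-element
  // list ["42"].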

  /** Converts an INFO/FORMAT value to a comma-separated string, recursing into arrays and lists. */
  private String infotoString(Object o) {
    if (o instanceof int[]) {
      int[] array = (int[]) o;
      StringBuilder b = new StringBuilder();
      for (int i = 0; i < array.length; ++i) {
        if (i > 0) b.append(",");
        b.append(infotoString(array[i]));
      }
      return b.toString();
    }
    if (o instanceof List) {
      List<?> list = List.class.cast(o);
      StringBuilder b = new StringBuilder();
      for (int i = 0; i < list.size(); ++i) {
        if (i > 0) b.append(",");
        b.append(infotoString(list.get(i)));
      }
      return b.toString();
    }
    return o.toString();
  }
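  // Added commentary -- infotoString(new int[] {1, 2, 3}) and
  // infotoString(Arrays.asList(1, 2, 3)) both yield "1,2,3"; any other type
  // falls through to toString().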

  /** Writes an INFO row for the most recently inserted VARIATION; key and val must already be SQL-quoted. */
  private void insertIntoInfo(String key, String val) {
    out.println(
        "insert into INFO"
            + SUFFIX
            + "(var_id,k,v) values ("
            + "(select max(id) from VARIATION"
            + SUFFIX
            + "),"
            + key
            + ","
            + val
            + ");");
  }
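  // Added commentary -- with an empty SUFFIX, insertIntoInfo(quote("DP"), quote("42"))
  // prints:
  //   insert into INFO(var_id,k,v) values ((select max(id) from VARIATION),'DP','42');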

  public static void main(String[] args) {
    new VcfToSql().instanceMainWithExit(args);
  }
}
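
// Added commentary -- a minimal usage sketch. The option names below (IN,
// ENGINE, SQLINDEX) mirror the fields used in doWork() and follow the
// Picard-style "KEY=value" convention this tool family uses; the jar name is
// hypothetical and not confirmed by this listing:
//
//   java -jar vcf2sql.jar IN=input.vcf ENGINE=sqlite SQLINDEX=true > load.sql
//   sqlite3 variants.db < load.sql
//
// The generated script is plain SQL text, so it can be piped straight into
// the target database client.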