Exemplo n.º 1
0
  /**
   * Instantiates multiple underlying SAM writers, one per input SAM reader registered with the
   * GATK engine (the readers are retrieved from <code>toolkit</code>). The output file names are
   * generated automatically by stripping a trailing ".sam" or ".bam" (matched case-insensitively)
   * off the input file name and appending <code>ext</code> instead (e.g. ".cleaned.bam").
   *
   * @param toolkit engine whose reads data source supplies the reader IDs and their input files
   * @param ext suffix appended to each input file name once its extension has been removed
   * @param order passed through unchanged to <code>addWriter</code>
   * @param presorted passed through unchanged to <code>addWriter</code>
   * @param indexOnTheFly passed through unchanged to <code>addWriter</code>
   * @param generateMD5 passed through unchanged to <code>addWriter</code>
   * @param pRecord passed through unchanged to <code>addWriter</code>
   * @throws UserException.BadInput if an input file name ends with neither .sam nor .bam
   * @throws StingException if a writer is already registered for one of the reader IDs
   */
  public void setupByReader(
      GenomeAnalysisEngine toolkit,
      String ext,
      SAMFileHeader.SortOrder order,
      boolean presorted,
      boolean indexOnTheFly,
      boolean generateMD5,
      SAMProgramRecord pRecord) {
    // ".sam" and ".bam" are both four characters long
    final int extensionLength = 4;

    for (SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs()) {

      String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName();

      // Match the extension against the ORIGINAL name. Upper-casing the name first is
      // locale-dependent and can change its length, which would make an index computed on the
      // upper-cased copy point at the wrong character of fName.
      final int pos = fName.length() - extensionLength;
      final boolean isBam = fName.regionMatches(true, pos, ".bam", 0, extensionLength);
      final boolean isSam = fName.regionMatches(true, pos, ".sam", 0, extensionLength);
      if (!isBam && !isSam) {
        throw new UserException.BadInput(
            "Input file name " + fName + " does not end with .sam or .bam");
      }

      String outName = fName.substring(0, pos) + ext;

      if (writerMap.containsKey(rid)) {
        throw new StingException(
            "nWayOut mode: Reader id for input sam file " + fName + " is already registered");
      }
      addWriter(rid, outName, order, presorted, indexOnTheFly, generateMD5, pRecord);
    }
  }
Exemplo n.º 2
0
  /**
   * Creates a genotyper engine from explicitly supplied collaborators.
   *
   * <p>Besides storing its arguments, the constructor copies the sample names into a {@link
   * TreeSet}, sizes the two log10 allele-frequency prior arrays to {@code samples.size() * ploidy +
   * 1} entries, fills them through {@code computeAlleleFrequencyPriors}, and registers {@code
   * LOW_QUAL_FILTER_NAME} in {@code filter}.
   *
   * <p>NOTE(review): the original body carried a remark that minMQ cannot be less than minBQ
   * because base quality is capped by mapping quality; nothing in this constructor enforces that.
   *
   * @param toolkit engine providing the BAQ mode argument and the genome loc parser
   * @param UAC argument collection; its two heterozygosity values seed the priors
   * @param logger logger stored for later use
   * @param verboseWriter stream stored for later use; not required to be non-null by the contract
   * @param engine annotation engine stored for later use; not required to be non-null by the
   *     contract
   * @param samples non-empty set of sample names
   * @param ploidy positive ploidy, multiplied by the sample count to obtain N
   */
  @Requires({
    "toolkit != null",
    "UAC != null",
    "logger != null",
    "samples != null && samples.size() > 0",
    "ploidy>0"
  })
  public UnifiedGenotyperEngine(
      GenomeAnalysisEngine toolkit,
      UnifiedArgumentCollection UAC,
      Logger logger,
      PrintStream verboseWriter,
      VariantAnnotatorEngine engine,
      Set<String> samples,
      int ploidy) {
    // values read from the engine
    this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF;
    this.genomeLocParser = toolkit.getGenomeLocParser();

    // collaborators handed in by the caller
    this.UAC = UAC;
    this.logger = logger;
    this.verboseWriter = verboseWriter;
    this.annotationEngine = engine;

    // sample bookkeeping
    this.samples = new TreeSet<String>(samples);
    this.ploidy = ploidy;
    this.N = samples.size() * ploidy;

    // one prior per possible allele count 0..N, separately for SNPs and indels
    this.log10AlleleFrequencyPriorsSNPs = new double[N + 1];
    computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);

    this.log10AlleleFrequencyPriorsIndels = new double[N + 1];
    computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);

    filter.add(LOW_QUAL_FILTER_NAME);
  }
Exemplo n.º 3
0
  /**
   * Creates one underlying SAM writer for every input SAM reader registered with the GATK engine
   * (the readers are obtained from <code>toolkit</code>). Output file names are looked up in
   * <code>in2out</code>, which must contain an entry keyed by each input file name.
   *
   * @param toolkit engine whose reads data source supplies the reader IDs and their input files
   * @param in2out map from input file name to output file name; must not be null
   * @param order passed through unchanged to <code>addWriter</code>
   * @param presorted passed through unchanged to <code>addWriter</code>
   * @param indexOnTheFly passed through unchanged to <code>addWriter</code>
   * @param generateMD5 passed through unchanged to <code>addWriter</code>
   * @param pRecord passed through unchanged to <code>addWriter</code>
   * @throws StingException if <code>in2out</code> is null or a reader ID already has a writer
   * @throws UserException.BadInput if the map has no entry for one of the input file names
   */
  public void setupByReader(
      GenomeAnalysisEngine toolkit,
      Map<String, String> in2out,
      SAMFileHeader.SortOrder order,
      boolean presorted,
      boolean indexOnTheFly,
      boolean generateMD5,
      SAMProgramRecord pRecord) {
    if (in2out == null) {
      throw new StingException("input-output bam filename map for n-way-out writing is NULL");
    }

    for (final SAMReaderID readerId : toolkit.getReadsDataSource().getReaderIDs()) {
      final String inputName = toolkit.getReadsDataSource().getSAMFile(readerId).getName();

      // every input must be mapped to an output name
      if (!in2out.containsKey(inputName)) {
        throw new UserException.BadInput(
            "Input-output bam filename map does not contain an entry for the input file "
                + inputName);
      }

      // a reader may only be given a writer once
      if (writerMap.containsKey(readerId)) {
        throw new StingException(
            "nWayOut mode: Reader id for input sam file "
                + inputName
                + " is already registered; map file likely contains multiple entries for this input file");
      }

      addWriter(
          readerId, in2out.get(inputName), order, presorted, indexOnTheFly, generateMD5, pRecord);
    }
  }
Exemplo n.º 4
0
 /**
  * Adds a record by delegating to the two-argument overload with the reader ID that the toolkit
  * reports for the record.
  *
  * <p>If the record has an "RG" attribute, that attribute is first replaced by the value the reads
  * data source returns from {@code getOriginalReadGroupId} for it.
  *
  * @param samRecord the record to add; its "RG" attribute may be modified in place
  */
 public void addAlignment(SAMRecord samRecord) {
   final SAMReaderID readerId = toolkit.getReaderIDForRead(samRecord);

   final String currentReadGroup = samRecord.getStringAttribute("RG");
   if (currentReadGroup != null) {
     samRecord.setAttribute(
         "RG", toolkit.getReadsDataSource().getOriginalReadGroupId(currentReadGroup));
   }

   addAlignment(samRecord, readerId);
 }
Exemplo n.º 5
0
 /**
  * Creates random read bases.
  *
  * @param length the length of the read
  * @param allowNs whether or not to allow N's in the read
  * @return an array of {@code length} bases, each picked with {@code Random.nextInt} from A, C, G,
  *     T (plus N when {@code allowNs} is true)
  */
 public static byte[] createRandomReadBases(int length, boolean allowNs) {
   Random random = GenomeAnalysisEngine.getRandomGenerator();
   // 'N' sits last so that it is only reachable when five symbols are in play
   final byte[] alphabet = {'A', 'C', 'G', 'T', 'N'};
   final int numberOfBases = allowNs ? alphabet.length : alphabet.length - 1;

   final byte[] bases = new byte[length];
   for (int i = 0; i < bases.length; i++) {
     bases[i] = alphabet[random.nextInt(numberOfBases)];
   }
   return bases;
 }
  /**
   * Builds the VCF header line that describes this GATK run.
   *
   * <p>The line is keyed by {@code GATK_COMMAND_LINE_KEY} and carries, in this order, the
   * attributes ID (walker name), Version, Date, Epoch (milliseconds of the same date) and
   * CommandLineOptions.
   *
   * @param engine the GATK engine that holds the walker name and can render the command line
   * @param argumentSources argument sources to convert into a command line string; should come
   *     from the parsing engine. May be empty, in which case the command line will be the empty
   *     string.
   * @return VCF header line describing this run of the GATK
   * @throws IllegalArgumentException if either argument is null
   */
  public static VCFHeaderLine getCommandLineArgumentHeaderLine(
      final GenomeAnalysisEngine engine, final Collection<Object> argumentSources) {
    if (engine == null) {
      throw new IllegalArgumentException("engine cannot be null");
    }
    if (argumentSources == null) {
      throw new IllegalArgumentException("argumentSources cannot be null");
    }

    final String walkerName = engine.getWalkerName();
    final String version = CommandLineGATK.getVersionNumber();
    final Date now = new Date();
    final String commandLine =
        engine.createApproximateCommandLineArgumentString(argumentSources.toArray());

    // LinkedHashMap keeps the attributes in insertion order
    final Map<String, String> lineAttributes = new LinkedHashMap<>();
    lineAttributes.put("ID", walkerName);
    lineAttributes.put("Version", version);
    lineAttributes.put("Date", now.toString());
    lineAttributes.put("Epoch", String.valueOf(now.getTime()));
    lineAttributes.put("CommandLineOptions", commandLine);

    return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, lineAttributes);
  }
Exemplo n.º 7
0
 // ---------------------------------------------------------------------------------------------------------
 //
 // Public interface functions
 //
 // ---------------------------------------------------------------------------------------------------------
 /**
  * Convenience constructor that forwards to the seven-argument constructor.
  *
  * <p>It supplies a logger obtained for {@code UnifiedGenotyperEngine.class}, {@code null} for the
  * two arguments that follow the logger, the samples extracted from the toolkit's SAM file header
  * via {@code SampleUtils.getSAMFileSamples}, and {@code VariantContextUtils.DEFAULT_PLOIDY}.
  *
  * @param toolkit the GATK engine; must not be null
  * @param UAC the argument collection; must not be null
  */
 @Requires({"toolkit != null", "UAC != null"})
 public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
   this(
       toolkit,
       UAC,
       Logger.getLogger(UnifiedGenotyperEngine.class),
       null,
       null,
       SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()),
       VariantContextUtils.DEFAULT_PLOIDY);
 }
Exemplo n.º 8
0
 /**
  * Offers a variant to the random selection kept in {@code variantArray}.
  *
  * <p>While fewer than {@code numRandom} variants have been added, the variant is stored in the
  * next free slot. After that, a random draw decides whether it overwrites the slot at {@code
  * positionToAdd}: this happens with probability {@code 1 / (rank - numRandom + 1)}, after which
  * the counter is incremented and {@code positionToAdd} advances via {@code nextCircularPosition}.
  *
  * @param rank running number of the variant being offered
  * @param vc the variant to store
  * @param refBase not used by this method
  */
 private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) {
   // initial fill phase: take everything until numRandom entries exist
   if (nVariantsAdded < numRandom) {
     final int slot = nVariantsAdded++;
     variantArray[slot] = new RandomVariantStructure(vc);
     return;
   }

   final double draw = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
   final double acceptanceProbability = 1.0 / (rank - numRandom + 1);
   if (draw < acceptanceProbability) {
     variantArray[positionToAdd].set(vc);
     nVariantsAdded++;
     positionToAdd = nextCircularPosition(positionToAdd);
   }
 }
Exemplo n.º 9
0
 /**
  * Builds a program record (@PG) for a tool.
  *
  * <p>The version is read from the "StingText" resource bundle; if the version key is missing the
  * record is left without a version. The command line is rendered by the toolkit.
  *
  * @param toolkit the engine
  * @param walker the walker object (so we can extract the command line)
  * @param PROGRAM_RECORD_NAME the name for the PG tag
  * @return a program record for the tool
  */
 public static SAMProgramRecord createProgramRecord(
     GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) {
   final SAMProgramRecord pgRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);

   final ResourceBundle stingText = TextFormattingUtils.loadResourceBundle("StingText");
   try {
     pgRecord.setProgramVersion(stingText.getString("org.broadinstitute.sting.gatk.version"));
   } catch (MissingResourceException ignored) {
     // deliberate best effort: a missing version resource just leaves the version unset
   }

   final String commandLine =
       toolkit.createApproximateCommandLineArgumentString(toolkit, walker);
   pgRecord.setCommandLine(commandLine);
   return pgRecord;
 }
Exemplo n.º 10
0
  /**
   * Collects the VCF headers of all ROD data sources whose name starts with a given prefix.
   *
   * @param toolkit GATK engine supplying the ROD data sources
   * @param prefix name prefix a data source must have to be considered
   * @return map from data source name to its header, containing only sources whose header is a
   *     {@code VCFHeader}
   */
  public static Map<String, VCFHeader> getVCFHeadersFromRodPrefix(
      GenomeAnalysisEngine toolkit, String prefix) {
    final Map<String, VCFHeader> headersByName = new HashMap<String, VCFHeader>();

    for (final ReferenceOrderedDataSource source : toolkit.getRodDataSources()) {
      // only rods carrying the prefix are of interest
      if (source.getName().startsWith(prefix)) {
        // instanceof is false for null, so a missing header is skipped as well
        final Object header = source.getHeader();
        if (header instanceof VCFHeader) {
          headersByName.put(source.getName(), (VCFHeader) header);
        }
      }
    }

    return headersByName;
  }
Exemplo n.º 11
0
  /**
   * Collects the VCF headers of the named ROD data sources.
   *
   * @param toolkit GATK engine supplying the ROD data sources
   * @param rodNames names of the data sources to include, or null to consider all of them
   * @return map from data source name to its header, containing only sources whose header is a
   *     {@code VCFHeader}
   */
  public static Map<String, VCFHeader> getVCFHeadersFromRods(
      GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
    final Map<String, VCFHeader> headersByName = new HashMap<String, VCFHeader>();

    for (final ReferenceOrderedDataSource source : toolkit.getRodDataSources()) {
      // a null list means "no restriction"
      final boolean requested = rodNames == null || rodNames.contains(source.getName());
      if (requested) {
        // instanceof is false for null, so a missing header is skipped as well
        final Object header = source.getHeader();
        if (header instanceof VCFHeader) {
          headersByName.put(source.getName(), (VCFHeader) header);
        }
      }
    }

    return headersByName;
  }
 /**
  * Round-trip check for {@code ReadUtils.getBasesReverseComplement}: complementing its output base
  * by base while reversing the order must reproduce the read's original bases. Runs 1000 trials on
  * random reads whose length is drawn with {@code nextInt(1000)}.
  */
 @Test(enabled = true)
 public void testGetBasesReverseComplement() {
   final Random rng = GenomeAnalysisEngine.getRandomGenerator();
   for (int trial = 0; trial < 1000; trial++) {
     final int readLength = rng.nextInt(1000);
     final GATKSAMRecord read = GATKSAMRecord.createRandomRead(readLength);
     final byte[] expected = read.getReadBases();
     final byte[] roundTripped = new byte[readLength];
     final String reverseComplement = ReadUtils.getBasesReverseComplement(read);

     // walk the reverse complement forwards while filling the result backwards
     for (int src = 0, dst = readLength - 1; src < readLength; src++, dst--) {
       roundTripped[dst] = BaseUtils.getComplement((byte) reverseComplement.charAt(src));
     }
     Assert.assertEquals(roundTripped, expected);
   }
 }
Exemplo n.º 13
0
 /**
  * Creates a SAM/BAM writer for one reader and registers it in {@code writerMap}.
  *
  * <p>The output header is produced by {@code Utils.setupWriter} from the toolkit's header for the
  * reader, {@code KEEP_ALL_PG_RECORDS} and the given program record.
  *
  * @param id reader ID the writer is registered under
  * @param outName path of the output file
  * @param order not used by this method
  * @param presorted forwarded to the writer factory when the writer is made
  * @param indexOnTheFly whether the factory should create an index
  * @param generateMD5 whether the factory should create an MD5 file
  * @param programRecord program record handed to {@code Utils.setupWriter}
  */
 private void addWriter(
     SAMReaderID id,
     String outName,
     SAMFileHeader.SortOrder order,
     boolean presorted,
     boolean indexOnTheFly,
     boolean generateMD5,
     SAMProgramRecord programRecord) {
   final File outputFile = new File(outName);
   final SAMFileHeader outputHeader =
       Utils.setupWriter(
           toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord);

   final SAMFileWriterFactory writerFactory = new SAMFileWriterFactory();
   writerFactory.setCreateIndex(indexOnTheFly);
   writerFactory.setCreateMd5File(generateMD5);

   writerMap.put(id, writerFactory.makeSAMOrBAMWriter(outputHeader, presorted, outputFile));
 }
  /**
   * Runs the same two-BAM command twice, once with a per-sample contamination file and once with a
   * flat contamination value, expecting the same MD5 from both runs. The shared random generator
   * is re-seeded with the same value before each run.
   *
   * @param bam1 first BAM file name, appended to {@code ArtificalBAMLocation}
   * @param bam2 second BAM file name, appended to {@code ArtificalBAMLocation}
   * @param persampleFile value for the -contaminationFile argument
   * @param downsampling value for the -contamination argument
   * @param md5 expected MD5 for both runs
   */
  private void testPerSampleEqualsFlat(
      final String bam1,
      final String bam2,
      final String persampleFile,
      final Double downsampling,
      final String md5) {
    final long seed = 123451; // same seed for both runs so they have a hope of agreeing
    final String inputArgs =
        " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2;
    final String command = baseCommand3 + inputArgs + " -o %s  ";
    final String bamPair = bam1 + " and " + bam2 + " with ";

    // run 1: per-sample contamination file
    final WalkerTestSpec perSampleSpec =
        new WalkerTestSpec(command + " -contaminationFile " + persampleFile, 1, Arrays.asList(md5));
    final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
    rnd.setSeed(seed);
    executeTest(
        "test contamination on Artificial Contamination, with per-sample file on "
            + bamPair
            + persampleFile,
        perSampleSpec);

    // run 2: flat contamination value
    final WalkerTestSpec flatSpec =
        new WalkerTestSpec(
            command + "-contamination " + downsampling.toString(), 1, Arrays.asList(md5));
    rnd.setSeed(seed);
    executeTest(
        "test contamination on Artificial Contamination, with flat contamination on "
            + bamPair
            + downsampling.toString(),
        flatSpec);
  }
Exemplo n.º 15
0
  /**
   * Gathers the header lines of all VCF rods input by the user.
   *
   * <p>Only data sources whose record type is {@code VariantContext} and whose header is non-null
   * contribute lines.
   *
   * @param toolkit GATK engine
   * @param rodNames names of rods to use, or null if we should use all possible ones
   * @return a sorted set of all header lines found
   */
  public static Set<VCFHeaderLine> getHeaderFields(
      GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
    final TreeSet<VCFHeaderLine> headerLines = new TreeSet<VCFHeaderLine>();

    for (final ReferenceOrderedDataSource source : toolkit.getRodDataSources()) {
      // a null list means "no restriction"
      final boolean requested = rodNames == null || rodNames.contains(source.getName());
      if (!requested || !source.getRecordType().equals(VariantContext.class)) {
        continue;
      }

      final VCFHeader header = (VCFHeader) source.getHeader();
      if (header != null) {
        headerLines.addAll(header.getMetaDataInSortedOrder());
      }
    }

    return headerLines;
  }
Exemplo n.º 16
0
 /**
  * Creates random read qualities.
  *
  * @param length the length of the read
  * @return an array of {@code length} base qualities, each drawn with {@code nextInt(50)} and
  *     therefore between 0 and 49 inclusive
  */
 public static byte[] createRandomReadQuals(int length) {
   final Random rng = GenomeAnalysisEngine.getRandomGenerator();
   final byte[] qualities = new byte[length];
   int index = 0;
   while (index < qualities.length) {
     qualities[index++] = (byte) rng.nextInt(50);
   }
   return qualities;
 }
Exemplo n.º 17
0
 /**
  * Returns the SAM file header reported by the toolkit.
  *
  * @return the result of {@code toolkit.getSAMFileHeader()}
  */
 public SAMFileHeader getFileHeader() {
   return toolkit.getSAMFileHeader();
 }
Exemplo n.º 18
0
 /**
  * Add / replace the contig header lines in the VCFHeader with the information in the GATK engine.
  *
  * <p>Delegates to {@code VCFUtils.withUpdatedContigs}, passing the {@code referenceFile} from the
  * engine's arguments and the engine's master sequence dictionary.
  *
  * @param header the header to update
  * @param engine the GATK engine containing command line arguments and the master sequence
  *     dictionary
  * @return the header returned by {@code VCFUtils.withUpdatedContigs}
  */
 public static VCFHeader withUpdatedContigs(
     final VCFHeader header, final GenomeAnalysisEngine engine) {
   return VCFUtils.withUpdatedContigs(
       header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary());
 }
Exemplo n.º 19
0
  /**
   * Subsets each variant record at this locus if necessary and emits the modified record, provided
   * it satisfies the selection criteria.
   *
   * <p>Per variant the checks run in this order: Mendelian violation (optionally logging each
   * violation to the MV output stream), discordance, concordance, allele-count restriction,
   * variant type, polymorphic/filtered exclusion, JEXL expressions, and finally random selection.
   *
   * @param tracker the ROD tracker
   * @param ref reference information
   * @param context alignment info
   * @return 0 if the tracker is null, no variants are present, or a variant fails the
   *     discordance, concordance or JEXL check; 1 otherwise. Note that 1 is returned when the loop
   *     completes even if no record was actually written.
   */
  @Override
  public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
    if (tracker == null) {
      return 0;
    }

    Collection<VariantContext> vcs =
        tracker.getValues(variantCollection.variants, context.getLocation());

    if (vcs == null || vcs.size() == 0) {
      return 0;
    }

    for (VariantContext vc : vcs) {
      if (MENDELIAN_VIOLATIONS) {
        boolean foundMV = false;
        for (MendelianViolation mv : mvSet) {
          if (mv.isViolation(vc)) {
            foundMV = true;
            if (outMVFile != null) {
              outMVFileStream.format(
                  "MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, "
                      + "childG=%s childGL=%s\n",
                  vc.getChr(),
                  vc.getStart(),
                  vc.getReference().getDisplayString(),
                  vc.getAlternateAllele(0).getDisplayString(),
                  vc.getChromosomeCount(vc.getAlternateAllele(0)),
                  mv.getSampleMom(),
                  mv.getSampleDad(),
                  mv.getSampleChild(),
                  vc.getGenotype(mv.getSampleMom()).toBriefString(),
                  vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
                  vc.getGenotype(mv.getSampleDad()).toBriefString(),
                  // dadGL must come from the father's genotype (was the mother's by copy-paste)
                  vc.getGenotype(mv.getSampleDad()).getLikelihoods().getAsString(),
                  vc.getGenotype(mv.getSampleChild()).toBriefString(),
                  vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString());
            }
          }
        }

        // NOTE(review): this stops processing ALL remaining variants at the locus and falls
        // through to "return 1"; confirm that skipping only this variant was not intended.
        if (!foundMV) {
          break;
        }
      }

      if (DISCORDANCE_ONLY) {
        Collection<VariantContext> compVCs =
            tracker.getValues(discordanceTrack, context.getLocation());
        if (!isDiscordant(vc, compVCs)) {
          return 0;
        }
      }
      if (CONCORDANCE_ONLY) {
        Collection<VariantContext> compVCs =
            tracker.getValues(concordanceTrack, context.getLocation());
        if (!isConcordant(vc, compVCs)) {
          return 0;
        }
      }

      if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic()) {
        continue;
      }

      if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic()) {
        continue;
      }

      if (!selectedTypes.contains(vc.getType())) {
        continue;
      }

      VariantContext sub = subsetRecord(vc, samples);
      if ((sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS)
          && (!sub.isFiltered() || !EXCLUDE_FILTERED)) {
        // every JEXL expression has to match, otherwise the whole locus is abandoned
        for (VariantContextUtils.JexlVCMatchExp jexl : jexls) {
          if (!VariantContextUtils.match(sub, jexl)) {
            return 0;
          }
        }
        if (SELECT_RANDOM_NUMBER) {
          // written later from the random selection rather than immediately
          randomlyAddVariant(++variantNumber, sub, ref.getBase());
        } else if (!SELECT_RANDOM_FRACTION
            || (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
          vcfWriter.add(sub);
        }
      }
    }

    return 1;
  }
Exemplo n.º 20
0
  /**
   * Writes one Beagle line for a site: a "contig:start" marker, the site's alleles, and then the
   * likelihoods of every sample in {@code samples}.
   *
   * <p>If {@code markers} is non-null, a matching tab-separated line (marker, running counter,
   * alleles) is appended to it as well.
   *
   * @param preferredVC variant supplying the location, the alleles and the first-choice genotypes
   * @param otherVC variant whose genotypes are used for samples missing from {@code preferredVC},
   *     provided it passes {@code goodSite}
   * @param isValidationSite validation flag applied to genotypes taken from {@code preferredVC};
   *     the negated flag is applied to genotypes taken from {@code otherVC}
   * @param prior probability assigned to the called genotype when a non-validation genotype has no
   *     likelihoods; a negative value makes validation genotypes use their own likelihoods
   * @throws StingException if a sample has a genotype in neither variant
   */
  public void writeBeagleOutput(
      VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
    GenomeLoc currentLoc =
        VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
    StringBuffer beagleOut = new StringBuffer();

    String marker = String.format("%s:%d ", currentLoc.getContig(), currentLoc.getStart());
    beagleOut.append(marker);
    if (markers != null) {
      markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
    }
    for (Allele allele : preferredVC.getAlleles()) {
      String bglPrintString;
      if (allele.isNoCall() || allele.isNull()) {
        bglPrintString = "-";
      } else {
        bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
      }

      beagleOut.append(String.format("%s ", bglPrintString));
      if (markers != null) {
        markers.append(bglPrintString).append("\t");
      }
    }
    if (markers != null) {
      markers.append("\n");
    }

    GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
    GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
    for (String sample : samples) {
      boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;

      Genotype genotype;
      boolean isValidation;
      // use sample as key into genotypes structure
      if (preferredGenotypes.containsSample(sample)) {
        genotype = preferredGenotypes.get(sample);
        isValidation = isValidationSite;
      } else if (otherGenotypes != null && otherGenotypes.containsSample(sample)) {
        genotype = otherGenotypes.get(sample);
        isValidation = !isValidationSite;
      } else {
        // there is magically no genotype for this sample.
        throw new StingException(
            "Sample "
                + sample
                + " arose with no genotype in variant or validation VCF. This should never happen.");
      }

      /*
       * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
       */
      double[] log10Likelihoods = null;
      if ((isValidation && prior < 0.0) || genotype.hasLikelihoods()) {
        log10Likelihoods = genotype.getLikelihoods().getAsVector();

        // see if we need to randomly mask out genotype in this position.
        if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate) {
          // we are masking out this genotype
          log10Likelihoods =
              isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
        }

        if (isMaleOnChrX) {
          // At this point the array is either the shared flat-likelihood constant or the vector
          // handed out by the genotype; copy it so the write below cannot leak into either.
          log10Likelihoods = log10Likelihoods.clone();
          log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
        }
      }
      // otherwise, use the prior uniformly
      else if (!isValidation && genotype.isCalled() && !genotype.hasLikelihoods()) {
        // hack to deal with input VCFs with no genotype likelihoods.  Just assume the called
        // genotype is confident.  This is useful for Hapmap and 1KG release VCFs.
        double AA = (1.0 - prior) / 2.0;
        double AB = (1.0 - prior) / 2.0;
        double BB = (1.0 - prior) / 2.0;

        if (genotype.isHomRef()) {
          AA = prior;
        } else if (genotype.isHet()) {
          AB = prior;
        } else if (genotype.isHomVar()) {
          BB = prior;
        }

        log10Likelihoods = MathUtils.toLog10(new double[] {AA, isMaleOnChrX ? 0.0 : AB, BB});
      } else {
        log10Likelihoods =
            isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
      }

      writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
    }

    beagleWriter.println(beagleOut.toString());
  }