/**
 * Calculates the reference coordinate for the end of the read taking into account soft clips but
 * not hard clips.
 *
 * <p>Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
 *
 * @param read the read
 * @param cigar the read's cigar
 *     <p>Note: this overload of the function takes the cigar as input for speed because getCigar
 *     is an expensive operation. Most callers should use the overload that does not take the
 *     cigar.
 * @return the unclipped end of the read taking soft clips (but not hard clips) into account
 */
public static int getSoftEnd(final GATKRead read, final Cigar cigar) {
    Utils.nonNull(read, "read");
    Utils.nonNull(cigar, "cigar");

    boolean foundAlignedBase = false;
    int softEnd = read.getEnd();
    final List<CigarElement> cigs = cigar.getCigarElements();
    for (int i = cigs.size() - 1; i >= 0; --i) {
        final CigarElement cig = cigs.get(i);
        final CigarOperator op = cig.getOperator();

        if (op == CigarOperator.SOFT_CLIP) {
            // assumes the soft clip that we found is at the end of the aligned read
            softEnd += cig.getLength();
        } else if (op != CigarOperator.HARD_CLIP) {
            foundAlignedBase = true;
            break;
        }
    }
    if (!foundAlignedBase) {
        // for example 64H14S, the soft end is actually the same as the alignment end
        softEnd = read.getEnd();
    }
    return softEnd;
}
/**
 * Combines the right table into the left table, in-place (without making a copy)
 *
 * @param left first table to combine
 * @param right second table to combine
 * @return modified version of left with the contents of right incorporated into it
 */
public static RecalibrationTables inPlaceCombine(
        final RecalibrationTables left, final RecalibrationTables right) {
    Utils.nonNull(left);
    Utils.nonNull(right);
    return left.combine(right);
}
/**
 * Do the full tangent normalization process given proportional coverage data.
 *
 * <p>This includes:
 *
 * <ul>
 *   <li>normalization by target factors
 *   <li>projection of the normalized coverage profile into the hyperplane from the PoN
 * </ul>
 *
 * @param pon -- never {@code null}
 * @param pcov -- never {@code null}. Must contain data for at least one sample.
 * @param ctx spark context. Use {@code null} if no context is available
 * @return never {@code null}
 */
public static TangentNormalizationResult tangentNormalizePcov(
        final PoN pon, final ReadCountCollection pcov, final JavaSparkContext ctx) {
    Utils.nonNull(pon, "PoN cannot be null.");
    Utils.nonNull(pcov, "input pcov read counts cannot be null when creating a coverage profile.");
    ParamUtils.isPositive(
        pcov.columnNames().size(), "input cov profile column names cannot be an empty list.");

    final ReadCountCollection coverageProfile = createCoverageProfile(pon, pcov);

    return TangentNormalizer.tangentNormalize(pon, coverageProfile, ctx);
}
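// Illustrative usage sketch (not part of the original sources). It assumes a PoN and a
// proportional-coverage ReadCountCollection have already been loaded elsewhere; passing a
// null JavaSparkContext selects the non-Spark code path, as documented above.
private static TangentNormalizationResult normalizeWithoutSpark(
        final PoN pon, final ReadCountCollection proportionalCoverage) {
    // No Spark context available: the normalization runs locally.
    return tangentNormalizePcov(pon, proportionalCoverage, null);
}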
/**
 * Set the base qualities from a string of ASCII encoded values
 *
 * @param read read whose base qualities should be set
 * @param baseQualityString ASCII encoded (encoded as a FASTQ string) values of base qualities.
 */
public static void setBaseQualityString(final GATKRead read, final String baseQualityString) {
    Utils.nonNull(read);
    Utils.nonNull(baseQualityString);
    if (SAMRecord.NULL_QUALS_STRING.equals(baseQualityString)) {
        read.setBaseQualities(SAMRecord.NULL_QUALS);
    } else {
        read.setBaseQualities(SAMUtils.fastqToPhred(baseQualityString));
    }
}
/**
 * Creates a new target-out-info enum value given the composer function reference and the list of
 * output column names.
 *
 * @param composer the composer lambda reference.
 * @param headerNames list of info column names in the order in which they are going to be laid
 *     out in output files.
 * @throws IllegalArgumentException if {@code composer} or {@code headerNames} are {@code null},
 *     or {@code headerNames} contains a {@code null}.
 */
TargetOutInfo(final Composer composer, final String... headerNames) {
    this.composer = Utils.nonNull(composer, "the info string composer cannot be null");
    this.headerNames =
        Collections.unmodifiableList(
            Arrays.asList(
                Utils.nonNull(headerNames, "the header name list provided cannot be null")));
    if (this.headerNames.stream().anyMatch(Objects::isNull)) {
        throw new IllegalArgumentException("the input header-name cannot contain nulls");
    }
}
/**
 * Combines the two tables into a new table (allocating a new table in the process)
 *
 * @param left first table to combine
 * @param right second table to combine
 * @return a new table with the merged contents of left and right
 */
public static RecalibrationTables safeCombine(
        final RecalibrationTables left, final RecalibrationTables right) {
    Utils.nonNull(left);
    Utils.nonNull(right);

    final RecalibrationTables newTable =
        new RecalibrationTables(left.covariates, left.numReadGroups);
    newTable.combine(left);
    newTable.combine(right);
    return newTable;
}
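// Illustrative sketch (not part of the original sources) contrasting the two combine flavors
// above when folding per-shard tables into one result: safeCombine leaves both inputs
// untouched, while inPlaceCombine mutates its left argument, which is fine once the
// accumulator is a fresh table that we own. Assumes the list holds at least two tables.
private static RecalibrationTables mergeShardTables(final List<RecalibrationTables> shardTables) {
    // Allocate a fresh accumulator so the first two input tables are not modified.
    RecalibrationTables merged = safeCombine(shardTables.get(0), shardTables.get(1));
    for (int i = 2; i < shardTables.size(); i++) {
        // From here on the accumulator is ours to mutate; skip the extra allocation.
        merged = inPlaceCombine(merged, shardTables.get(i));
    }
    return merged;
}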
/**
 * Create a common SAMFileWriter from a factory for use with GATK tools. Assumes that if the
 * factory has been set to create an index, the header must be set to coordinate sorted.
 *
 * @param outputFile if this file has a .cram extension then a reference is required. Cannot be
 *     null.
 * @param referenceFile the reference source to use. Cannot be null if the output file has a
 *     .cram extension.
 * @param header header to be used for the output writer
 * @param preSorted if true then records must already be sorted to match the header sort order
 * @param factory SAMFileWriterFactory factory to use
 * @return SAMFileWriter
 */
public static SAMFileWriter createCommonSAMWriterFromFactory(
        final SAMFileWriterFactory factory,
        final File outputFile,
        final File referenceFile,
        final SAMFileHeader header,
        final boolean preSorted) {
    Utils.nonNull(outputFile);
    Utils.nonNull(header);

    if (null == referenceFile && outputFile.getName().endsWith(CramIO.CRAM_FILE_EXTENSION)) {
        throw new UserException("A reference file is required for writing CRAM files");
    }

    return factory.makeWriter(header.clone(), preSorted, outputFile, referenceFile);
}
/**
 * Calculates the reference coordinate for the beginning of the read taking into account soft
 * clips but not hard clips.
 *
 * <p>Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips.
 *
 * @param read the read
 * @param cigar the read's cigar
 *     <p>Note: this overload of the function takes the cigar as input for speed because getCigar
 *     is an expensive operation. Most callers should use the overload that does not take the
 *     cigar.
 * @return the unclipped start of the read taking soft clips (but not hard clips) into account
 */
public static int getSoftStart(final GATKRead read, final Cigar cigar) {
    Utils.nonNull(read, "read");
    Utils.nonNull(cigar, "cigar");

    int softStart = read.getStart();
    for (final CigarElement cig : cigar.getCigarElements()) {
        final CigarOperator op = cig.getOperator();

        if (op == CigarOperator.SOFT_CLIP) {
            softStart -= cig.getLength();
        } else if (op != CigarOperator.HARD_CLIP) {
            break;
        }
    }
    return softStart;
}
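// Illustrative sketch (not part of the original sources): computing the soft-clip-inclusive
// reference span of a read with the two helpers above. Fetching the cigar once and passing it
// to both overloads avoids calling the expensive getCigar() twice.
private static void printSoftClippedSpan(final GATKRead read) {
    final Cigar cigar = read.getCigar();
    final int softStart = getSoftStart(read, cigar);
    final int softEnd = getSoftEnd(read, cigar);
    System.out.printf(
        "%s: soft span %d-%d (aligned span %d-%d)%n",
        read.getName(), softStart, softEnd, read.getStart(), read.getEnd());
}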
/**
 * Returns the base qualities for the read as a string.
 *
 * @param read read whose base qualities should be returned
 * @return Base qualities string as printable ASCII values (encoded as a FASTQ string).
 */
public static String getBaseQualityString(final GATKRead read) {
    Utils.nonNull(read);
    if (Arrays.equals(SAMRecord.NULL_QUALS, read.getBaseQualities())) {
        return SAMRecord.NULL_QUALS_STRING;
    }
    return SAMUtils.phredToFastq(read.getBaseQualities());
}
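// Illustrative sketch (not part of the original sources): the getter and setter above are
// round-trip inverses, so copying qualities between reads via the FASTQ string form preserves
// them, including the "no qualities" case. A hypothetical helper, for illustration only.
private static void copyBaseQualities(final GATKRead source, final GATKRead target) {
    // getBaseQualityString returns SAMRecord.NULL_QUALS_STRING when no qualities are present,
    // and setBaseQualityString maps that back to SAMRecord.NULL_QUALS.
    setBaseQualityString(target, getBaseQualityString(source));
}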
private static ReadCountCollection createCoverageProfile(
        final PoN pon, final ReadCountCollection inputReadCounts) {
    Utils.nonNull(pon, "PoN cannot be null.");
    Utils.nonNull(
        inputReadCounts, "input read counts cannot be null when creating a coverage profile.");
    ParamUtils.isPositive(
        inputReadCounts.columnNames().size(),
        "inputReadCounts column names cannot be an empty list.");
    final Case2PoNTargetMapper targetMapper =
        new Case2PoNTargetMapper(inputReadCounts.targets(), pon.getTargetNames());
    final RealMatrix inputCounts = targetMapper.fromCaseToPoNCounts(inputReadCounts.counts());
    final RealMatrix targetNormalizedCounts = pon.factorNormalization(inputCounts);
    return targetMapper.fromPoNtoCaseCountCollection(
        targetNormalizedCounts, inputReadCounts.columnNames());
}
@Override
protected AFCalculationResult computeLog10PNonRef(
        final VariantContext vc,
        final int defaultPloidy,
        final double[] log10AlleleFrequencyPriors,
        final StateTracker stateTracker) {
    Utils.nonNull(vc, "vc is null");
    Utils.nonNull(log10AlleleFrequencyPriors, "log10AlleleFrequencyPriors is null");
    Utils.nonNull(stateTracker, "stateTracker is null");

    final int numAlternateAlleles = vc.getNAlleles() - 1;

    final List<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
    final int numSamples = genotypeLikelihoods.size() - 1;
    final int numChr = 2 * numSamples;

    // queue of AC conformations to process
    final Deque<ExactACset> ACqueue = new LinkedList<>();

    // mapping of ExactACset indexes to the objects
    final Map<ExactACcounts, ExactACset> indexesToACset = new HashMap<>(numChr + 1);

    // add AC=0 to the queue
    final int[] zeroCounts = new int[numAlternateAlleles];
    final ExactACset zeroSet = new ExactACset(numSamples + 1, new ExactACcounts(zeroCounts));
    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.getACcounts(), zeroSet);

    while (!ACqueue.isEmpty()) {
        // compute log10Likelihoods
        final ExactACset set = ACqueue.remove();

        calculateAlleleCountConformation(
            set,
            genotypeLikelihoods,
            numChr,
            ACqueue,
            indexesToACset,
            log10AlleleFrequencyPriors,
            stateTracker);

        // clean up memory
        indexesToACset.remove(set.getACcounts());
    }

    return getResultFromFinalState(vc, log10AlleleFrequencyPriors, stateTracker);
}
/**
 * Returns the value that corresponds to a given implementation calculator class.
 *
 * @param clazz the target class.
 * @throws IllegalArgumentException if {@code clazz} is {@code null} or if it is abstract.
 * @return never {@code null}.
 */
public static AFCalculatorImplementation fromCalculatorClass(
        final Class<? extends AFCalculator> clazz) {
    Utils.nonNull(clazz, "input class cannot be null");
    Utils.validateArg(
        !Modifier.isAbstract(clazz.getModifiers()),
        "class " + clazz.getCanonicalName() + " should not be abstract");

    // Using iteration instead of a static map to avoid static state.
    for (final AFCalculatorImplementation impl : AFCalculatorImplementation.values()) {
        if (clazz.equals(impl.newInstance().getClass())) {
            return impl;
        }
    }
    throw new IllegalArgumentException(
        "Attempt to retrieve AFCalculatorImplementation instance from a non-registered calculator class "
            + clazz.getName());
}
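// Illustrative sketch (not part of the original sources): because the lookup above compares
// against impl.newInstance().getClass(), every enum constant should map back to itself. A
// hypothetical sanity check, useful mainly in tests.
private static void checkCalculatorClassRoundTrip() {
    for (final AFCalculatorImplementation impl : AFCalculatorImplementation.values()) {
        final Class<? extends AFCalculator> clazz = impl.newInstance().getClass();
        if (fromCalculatorClass(clazz) != impl) {
            throw new IllegalStateException("round-trip failed for " + impl);
        }
    }
}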
/**
 * Constructs the target collection from a target file passed by the user.
 *
 * @return never {@code null}.
 */
private TargetCollection<Target> resolveTargetsFromFile() {
    Utils.regularReadableUserFile(targetsFile);
    logger.log(
        Level.INFO,
        String.format(
            "Reading target intervals from targets file '%s' ...", targetsFile.getAbsolutePath()));
    final List<Target> targets = TargetTableReader.readTargetFile(targetsFile);
    return new HashedListTargetCollection<>(targets);
}
/**
 * Constructs a new instance given all its properties
 *
 * @param afCalculatorSupplier the supplier of calculator instances that realize this
 *     implementation.
 * @param requiredPloidy the required ploidy; zero or greater or {@link #UNBOUND_PLOIDY} to
 *     indicate that any ploidy is supported.
 * @param maxAltAlleles the maximum alternative alleles; zero or greater or {@link
 *     #UNBOUND_ALTERNATIVE_ALLELE_COUNT} to indicate that any maximum number of alternative
 *     alleles is supported.
 */
AFCalculatorImplementation(
        final Supplier<AFCalculator> afCalculatorSupplier,
        final int requiredPloidy,
        final int maxAltAlleles) {
    Utils.nonNull(afCalculatorSupplier);
    this.afCalculatorSupplier = afCalculatorSupplier;
    this.requiredPloidy = requiredPloidy;
    this.maxAltAlleles = maxAltAlleles;
}
/**
 * Create a common SAMFileWriter for use with GATK tools.
 *
 * @param outputFile - if this file has a .cram extension then a reference is required. Cannot
 *     be null.
 * @param referenceFile - the reference source to use. Cannot be null if the output file has a
 *     .cram extension.
 * @param header - header to be used for the output writer
 * @param preSorted - if true then the records must already be sorted to match the header sort
 *     order
 * @param createOutputBamIndex - if true an index will be created for .BAM and .CRAM files
 * @param createMD5 - if true an MD5 file will be created
 * @return SAMFileWriter
 */
public static SAMFileWriter createCommonSAMWriter(
        final File outputFile,
        final File referenceFile,
        final SAMFileHeader header,
        final boolean preSorted,
        boolean createOutputBamIndex,
        final boolean createMD5) {
    Utils.nonNull(outputFile);
    Utils.nonNull(header);

    if (createOutputBamIndex && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
        logger.warn(
            "Skipping index file creation for: "
                + outputFile.getAbsolutePath()
                + ". Index file creation requires reads in coordinate sorted order.");
        createOutputBamIndex = false;
    }

    final SAMFileWriterFactory factory =
        new SAMFileWriterFactory().setCreateIndex(createOutputBamIndex).setCreateMd5File(createMD5);
    return ReadUtils.createCommonSAMWriterFromFactory(
        factory, outputFile, referenceFile, header, preSorted);
}
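// Illustrative usage sketch (not part of the original sources): writing a coordinate-sorted
// BAM with an index and an MD5 file. The output path is a hypothetical example; no reference
// file is needed because the output is not CRAM.
private static void writeSortedBam(final SAMFileHeader header, final Iterable<SAMRecord> records) {
    final File output = new File("example.bam"); // hypothetical output path
    header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    try (final SAMFileWriter writer =
        ReadUtils.createCommonSAMWriter(
            output, null, header, true /* preSorted */, true /* index */, true /* MD5 */)) {
        for (final SAMRecord record : records) {
            writer.addAlignment(record);
        }
    }
}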
/**
 * Merges the argument into this histogram generator. Returns the modified 'this' object. Note:
 * you can only merge HistogramGenerators if they have the same 'useOriginalQualities' value.
 */
public HistogramGenerator merge(final HistogramGenerator hg2) {
    Utils.nonNull(hg2);
    if (this.useOriginalQualities != hg2.useOriginalQualities) {
        throw new IllegalArgumentException(
            "unequal useOriginalQualities. This has " + this.useOriginalQualities);
    }
    ensureArraysBigEnough(hg2.maxLengthSoFar);
    for (int i = 0; i < hg2.firstReadTotalsByCycle.length; i++) {
        this.firstReadTotalsByCycle[i] += hg2.firstReadTotalsByCycle[i];
    }
    for (int i = 0; i < hg2.secondReadTotalsByCycle.length; i++) {
        this.secondReadTotalsByCycle[i] += hg2.secondReadTotalsByCycle[i];
    }
    for (int i = 0; i < hg2.firstReadCountsByCycle.length; i++) {
        this.firstReadCountsByCycle[i] += hg2.firstReadCountsByCycle[i];
    }
    for (int i = 0; i < hg2.secondReadCountsByCycle.length; i++) {
        this.secondReadCountsByCycle[i] += hg2.secondReadCountsByCycle[i];
    }
    return this;
}
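// Illustrative sketch (not part of the original sources): folding per-shard generators into a
// single histogram. merge() mutates and returns its receiver, so the first generator in the
// list becomes the accumulator. Assumes a non-empty list whose generators were all built with
// the same useOriginalQualities setting; otherwise merge() throws.
private static HistogramGenerator mergeAll(final List<HistogramGenerator> perShard) {
    final HistogramGenerator combined = perShard.get(0);
    for (int i = 1; i < perShard.size(); i++) {
        combined.merge(perShard.get(i));
    }
    return combined;
}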
public Pulldown(final SAMFileHeader header) {
    this.header = Utils.nonNull(header, "SAMFileHeader must be supplied.");
}
/**
 * Calculates the reference coordinate for the beginning of the read taking into account soft
 * clips but not hard clips.
 *
 * <p>Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips.
 *
 * @return the unclipped start of the read taking soft clips (but not hard clips) into account
 */
public static int getSoftStart(final GATKRead read) {
    Utils.nonNull(read);
    return getSoftStart(read, read.getCigar());
}
/**
 * Transforms and composes the string representation of an individual count.
 *
 * <p>The output string must be a fully formatted, human-friendly representation of the
 * transformed value.
 *
 * @param count the individual count value.
 * @param columnTotal the corresponding column total sum.
 * @return never {@code null}.
 * @throws IllegalArgumentException if {@code count} is less than 0 or greater than {@code
 *     columnTotal}.
 */
protected String apply(final int count, final long columnTotal) {
    ParamUtils.isPositiveOrZero(count, "the count cannot be less than 0");
    Utils.validateArg(count <= columnTotal, "the count cannot be larger than the column total");
    return operator.apply(count, columnTotal);
}
/**
 * Composes the target information output string.
 *
 * @param index the index of the target in the collection.
 * @param collection the target containing collection.
 * @throws IllegalArgumentException if {@code collection} is {@code null} or {@code index} is
 *     not a valid index in {@code collection}.
 */
protected String composeTargetOutInfoString(
        final int index, final TargetCollection<Target> collection) {
    Utils.nonNull(collection, "the collection cannot be null");
    Utils.validIndex(index, collection.targetCount());
    return composer.apply(index, collection);
}
/**
 * Constructor that reads (sequence, position, reference count, alternate count) from the
 * specified file and uses external SAMFile header to construct Pulldown.
 *
 * @param inputFile file to read from
 * @param header SAMFile header for IntervalList
 */
public Pulldown(final File inputFile, final SAMFileHeader header) {
    super(inputFile);
    this.header = Utils.nonNull(header, "SAMFileHeader must be supplied.");
}
public VariantTypesVariantFilter(Set<VariantContext.Type> includeTypes) {
    Utils.nonNull(includeTypes);
    sampleTypes = includeTypes;
}
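// Illustrative usage sketch (not part of the original sources): building a filter that keeps
// only SNPs. VariantContext.Type.SNP comes from htsjdk; how the filter is applied downstream
// (e.g. as a predicate over a variant stream) depends on the enclosing tool.
private static VariantTypesVariantFilter snpOnlyFilter() {
    return new VariantTypesVariantFilter(Collections.singleton(VariantContext.Type.SNP));
}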
/**
 * Tangent normalize a coverage profile.
 *
 * <p>Notes about the Spark tangent normalization can be found in docs/PoN/
 *
 * @param pon Not {@code null}
 * @param targetFactorNormalizedCounts ReadCountCollection of counts that have already been
 *     normalized fully (typically, including the target factor normalization), i.e. a coverage
 *     profile. The column names should be intact. Not {@code null}. See {@link
 *     TangentNormalizer#createCoverageProfile}
 * @return never {@code null}
 */
private static TangentNormalizationResult tangentNormalize(
        final PoN pon,
        final ReadCountCollection targetFactorNormalizedCounts,
        JavaSparkContext ctx) {
    Utils.nonNull(pon, "PoN cannot be null.");
    Utils.nonNull(targetFactorNormalizedCounts, "targetFactorNormalizedCounts cannot be null.");
    Utils.nonNull(
        targetFactorNormalizedCounts.columnNames(),
        "targetFactorNormalizedCounts column names cannot be null.");
    ParamUtils.isPositive(
        targetFactorNormalizedCounts.columnNames().size(),
        "targetFactorNormalizedCounts column names cannot be an empty list.");

    final Case2PoNTargetMapper targetMapper =
        new Case2PoNTargetMapper(targetFactorNormalizedCounts.targets(), pon.getPanelTargetNames());

    // The input counts with rows (targets) sorted so that they match the PoN's order.
    final RealMatrix tangentNormalizationRawInputCounts =
        targetMapper.fromCaseToPoNCounts(targetFactorNormalizedCounts.counts());

    // We prepare the counts for tangent normalization.
    final RealMatrix tangentNormalizationInputCounts =
        composeTangentNormalizationInputMatrix(tangentNormalizationRawInputCounts);

    if (ctx == null) {

        // Calculate the beta-hats for the input read count columns (samples).
        logger.info("Calculating beta hats...");
        final RealMatrix tangentBetaHats =
            pon.betaHats(tangentNormalizationInputCounts, true, EPSILON);

        // Actual tangent normalization step.
        logger.info(
            "Performing actual tangent normalization ("
                + tangentNormalizationInputCounts.getColumnDimension()
                + " columns)...");
        final RealMatrix tangentNormalizedCounts =
            pon.tangentNormalization(tangentNormalizationInputCounts, tangentBetaHats, true);

        // Output the tangent normalized counts.
        logger.info("Post-processing tangent normalization results...");
        final ReadCountCollection tangentNormalized =
            targetMapper.fromPoNtoCaseCountCollection(
                tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
        final ReadCountCollection preTangentNormalized =
            targetMapper.fromPoNtoCaseCountCollection(
                tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());

        return new TangentNormalizationResult(
            tangentNormalized, preTangentNormalized, tangentBetaHats, targetFactorNormalizedCounts);

    } else {

        /*
         * Using Spark: the code here is a little more complex for optimization purposes.
         *
         * Please see notes in docs/PoN ...
         *
         *  Ahat^T = (C^T P^T) A^T
         *  Therefore, C^T is the RowMatrix
         *
         *  pinv: P
         *  panel: A
         *  projection: Ahat
         *  cases: C
         *  betahat: C^T P^T
         *  tangentNormalizedCounts: C - Ahat
         */
        final RealMatrix pinv = pon.getReducedPanelPInverseCounts();
        final RealMatrix panel = pon.getReducedPanelCounts();

        // Make the C^T a distributed matrix (RowMatrix)
        final RowMatrix caseTDistMat =
            SparkConverter.convertRealMatrixToSparkRowMatrix(
                ctx, tangentNormalizationInputCounts.transpose(), TN_NUM_SLICES_SPARK);

        // Spark local matrices (transposed)
        final Matrix pinvTLocalMat =
            new DenseMatrix(
                    pinv.getRowDimension(),
                    pinv.getColumnDimension(),
                    Doubles.concat(pinv.getData()),
                    true)
                .transpose();
        final Matrix panelTLocalMat =
            new DenseMatrix(
                    panel.getRowDimension(),
                    panel.getColumnDimension(),
                    Doubles.concat(panel.getData()),
                    true)
                .transpose();

        // Calculate the projection transpose in a distributed matrix, then convert to Apache
        // Commons matrix (not transposed)
        final RowMatrix betahatDistMat = caseTDistMat.multiply(pinvTLocalMat);
        final RowMatrix projectionTDistMat = betahatDistMat.multiply(panelTLocalMat);
        final RealMatrix projection =
            SparkConverter.convertSparkRowMatrixToRealMatrix(
                    projectionTDistMat,
                    tangentNormalizationInputCounts.transpose().getRowDimension())
                .transpose();

        // Subtract the cases from the projection
        final RealMatrix tangentNormalizedCounts =
            tangentNormalizationInputCounts.subtract(projection);

        // Construct the result object and return it with the correct targets.
        final ReadCountCollection tangentNormalized =
            targetMapper.fromPoNtoCaseCountCollection(
                tangentNormalizedCounts, targetFactorNormalizedCounts.columnNames());
        final ReadCountCollection preTangentNormalized =
            targetMapper.fromPoNtoCaseCountCollection(
                tangentNormalizationInputCounts, targetFactorNormalizedCounts.columnNames());
        final RealMatrix tangentBetaHats =
            SparkConverter.convertSparkRowMatrixToRealMatrix(
                betahatDistMat, tangentNormalizedCounts.getColumnDimension());

        return new TangentNormalizationResult(
            tangentNormalized,
            preTangentNormalized,
            tangentBetaHats.transpose(),
            targetFactorNormalizedCounts);
    }
}
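// Illustrative sketch (not part of the original sources): the distributed branch above
// computes, in transposed form, exactly
//
//   betahat    = P * C
//   projection = A * betahat
//   result     = C - projection
//
// The hypothetical helper below spells that out with plain Apache Commons Math matrices; the
// real code distributes C^T as a Spark RowMatrix instead so the multiplications scale across
// many samples. Here pinv is P, panel is A, and counts is C (targets x samples).
private static RealMatrix tangentNormalizeLocally(
        final RealMatrix pinv, final RealMatrix panel, final RealMatrix counts) {
    final RealMatrix betaHats = pinv.multiply(counts);       // betahat = P * C
    final RealMatrix projection = panel.multiply(betaHats);  // Ahat = A * betahat
    return counts.subtract(projection);                      // C - Ahat
}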