/** * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, * AC, AF). * * @param vc the VariantContext record to subset * @param samples the samples to extract * @return the subsetted VariantContext */ private VariantContext subsetRecord(VariantContext vc, Set<String> samples) { if (samples == null || samples.isEmpty()) return vc; ArrayList<Genotype> genotypes = new ArrayList<Genotype>(); for (Map.Entry<String, Genotype> genotypePair : vc.getGenotypes().entrySet()) { if (samples.contains(genotypePair.getKey())) genotypes.add(genotypePair.getValue()); } VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles()); // if we have fewer alternate alleles in the selected VC than in the original VC, we need to // strip out the GL/PLs (because they are no longer accurate) if (vc.getAlleles().size() != sub.getAlleles().size()) sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(vc.getGenotypes())); HashMap<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes()); int depth = 0; for (String sample : sub.getSampleNames()) { Genotype g = sub.getGenotype(sample); if (g.isNotFiltered() && g.isCalled()) { String dp = (String) g.getAttribute("DP"); if (dp != null && !dp.equals(VCFConstants.MISSING_DEPTH_v3) && !dp.equals(VCFConstants.MISSING_VALUE_v4)) { depth += Integer.valueOf(dp); } } } if (KEEP_ORIGINAL_CHR_COUNTS) { if (attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY)) attributes.put("AC_Orig", attributes.get(VCFConstants.ALLELE_COUNT_KEY)); if (attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY)) attributes.put("AF_Orig", attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY)); if (attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY)) attributes.put("AN_Orig", attributes.get(VCFConstants.ALLELE_NUMBER_KEY)); } VariantContextUtils.calculateChromosomeCounts(sub, attributes, false); attributes.put("DP", depth); sub = VariantContext.modifyAttributes(sub, attributes); return sub; }
/** * Subset VC record if necessary and emit the modified record (provided it satisfies criteria for * printing) * * @param tracker the ROD tracker * @param ref reference information * @param context alignment info * @return 1 if the record was printed to the output file, 0 if otherwise */ @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (tracker == null) return 0; Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation()); if (vcs == null || vcs.size() == 0) { return 0; } for (VariantContext vc : vcs) { if (MENDELIAN_VIOLATIONS) { boolean foundMV = false; for (MendelianViolation mv : mvSet) { if (mv.isViolation(vc)) { foundMV = true; // System.out.println(vc.toString()); if (outMVFile != null) outMVFileStream.format( "MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + "childG=%s childGL=%s\n", vc.getChr(), vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getChromosomeCount(vc.getAlternateAllele(0)), mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(), vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), vc.getGenotype(mv.getSampleChild()).toBriefString(), vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString()); } } if (!foundMV) break; } if (DISCORDANCE_ONLY) { Collection<VariantContext> compVCs = tracker.getValues(discordanceTrack, context.getLocation()); if (!isDiscordant(vc, compVCs)) return 0; } if (CONCORDANCE_ONLY) { Collection<VariantContext> compVCs = tracker.getValues(concordanceTrack, context.getLocation()); if (!isConcordant(vc, compVCs)) return 0; } if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic()) continue; if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic()) continue; if (!selectedTypes.contains(vc.getType())) continue; VariantContext sub = subsetRecord(vc, samples); if ((sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED)) { for (VariantContextUtils.JexlVCMatchExp jexl : jexls) { if (!VariantContextUtils.match(sub, jexl)) { return 0; } } if (SELECT_RANDOM_NUMBER) { randomlyAddVariant(++variantNumber, sub, ref.getBase()); } else if (!SELECT_RANDOM_FRACTION || (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { vcfWriter.add(sub); } } } return 1; }
/** Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ public void initialize() { // Get list of samples to include in the output List<String> rodNames = Arrays.asList(variantCollection.variants.getName()); Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); TreeSet<String> vcfSamples = new TreeSet<String>( SampleUtils.getSampleList( vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); // first, add any requested samples samples.addAll(samplesFromFile); samples.addAll(samplesFromExpressions); samples.addAll(sampleNames); // if none were requested, we want all of them if (samples.isEmpty()) { samples.addAll(vcfSamples); NO_SAMPLES_SPECIFIED = true; } // now, exclude any requested samples Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); samples.removeAll(XLsamplesFromFile); samples.removeAll(XLsampleNames); if (samples.size() == 0 && !NO_SAMPLES_SPECIFIED) throw new UserException( "All samples requested to be included were also requested to be excluded."); for (String sample : samples) logger.info("Including sample '" + sample + "'"); // if user specified types to include, add these, otherwise, add all possible variant context // types to list of vc types to include if (TYPES_TO_INCLUDE.isEmpty()) { for (VariantContext.Type t : VariantContext.Type.values()) selectedTypes.add(t); } else { for (VariantContext.Type t : TYPES_TO_INCLUDE) selectedTypes.add(t); } // Initialize VCF header Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); if (KEEP_ORIGINAL_CHR_COUNTS) { headerLines.add( new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC")); headerLines.add( new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF")); headerLines.add( new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN")); } vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { // It's not necessary that the user supply select names for the JEXL expressions, since those // expressions will only be needed for omitting records. Make up the select names here. selectNames.add(String.format("select-%d", i)); } jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS); // Look at the parameters to decide which analysis to perform DISCORDANCE_ONLY = discordanceTrack.isBound(); if (DISCORDANCE_ONLY) logger.info( "Selecting only variants discordant with the track: " + discordanceTrack.getName()); CONCORDANCE_ONLY = concordanceTrack.isBound(); if (CONCORDANCE_ONLY) logger.info( "Selecting only variants concordant with the track: " + concordanceTrack.getName()); if (MENDELIAN_VIOLATIONS) { if (FAMILY_STRUCTURE_FILE != null) { try { for (final String line : new XReadLines(FAMILY_STRUCTURE_FILE)) { MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD); if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom())) mvSet.add(mv); } } catch (FileNotFoundException e) { throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); } if (outMVFile != null) try { outMVFileStream = new PrintStream(outMVFile); } catch (FileNotFoundException e) { throw new UserException.CouldNotCreateOutputFile( outMVFile, "Can't open output file", e); } } else mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); } else if (!FAMILY_STRUCTURE.isEmpty()) { mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); MENDELIAN_VIOLATIONS = true; } SELECT_RANDOM_NUMBER = numRandom > 0; if (SELECT_RANDOM_NUMBER) { logger.info("Selecting " + numRandom + " variants at random from the variant track"); variantArray = new RandomVariantStructure[numRandom]; } SELECT_RANDOM_FRACTION = fractionRandom > 0; if (SELECT_RANDOM_FRACTION) logger.info( "Selecting approximately " + 100.0 * fractionRandom + "% of the variants at random from the variant track"); }