public static Map<VariantContext.Type, List<VariantContext>> separateVariantContextsByType( Collection<VariantContext> VCs) { HashMap<VariantContext.Type, List<VariantContext>> mappedVCs = new HashMap<VariantContext.Type, List<VariantContext>>(); for (VariantContext vc : VCs) { // look at previous variant contexts of different type. If: // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC // to vc's list // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, // do nothing since vc will be added automatically to its list) // c) neither: do nothing, just add vc to its own list boolean addtoOwnList = true; for (VariantContext.Type type : VariantContext.Type.values()) { if (type.equals(vc.getType())) continue; if (!mappedVCs.containsKey(type)) continue; List<VariantContext> vcList = mappedVCs.get(type); for (int k = 0; k < vcList.size(); k++) { VariantContext otherVC = vcList.get(k); if (allelesAreSubset(otherVC, vc)) { // otherVC has a type different than vc and its alleles are a subset of vc: remove // otherVC from its list and add it to vc's type list vcList.remove(k); // avoid having empty lists if (vcList.size() == 0) mappedVCs.remove(vcList); if (!mappedVCs.containsKey(vc.getType())) mappedVCs.put(vc.getType(), new ArrayList<VariantContext>()); mappedVCs.get(vc.getType()).add(otherVC); break; } else if (allelesAreSubset(vc, otherVC)) { // vc has a type different than otherVC and its alleles are a subset of VC: add vc to // otherVC's type list and don't add to its own mappedVCs.get(type).add(vc); addtoOwnList = false; break; } } } if (addtoOwnList) { if (!mappedVCs.containsKey(vc.getType())) mappedVCs.put(vc.getType(), new ArrayList<VariantContext>()); mappedVCs.get(vc.getType()).add(vc); } } return mappedVCs; }
/** Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ public void initialize() { // Get list of samples to include in the output List<String> rodNames = Arrays.asList(variantCollection.variants.getName()); Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); TreeSet<String> vcfSamples = new TreeSet<String>( SampleUtils.getSampleList( vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); // first, add any requested samples samples.addAll(samplesFromFile); samples.addAll(samplesFromExpressions); samples.addAll(sampleNames); // if none were requested, we want all of them if (samples.isEmpty()) { samples.addAll(vcfSamples); NO_SAMPLES_SPECIFIED = true; } // now, exclude any requested samples Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); samples.removeAll(XLsamplesFromFile); samples.removeAll(XLsampleNames); if (samples.size() == 0 && !NO_SAMPLES_SPECIFIED) throw new UserException( "All samples requested to be included were also requested to be excluded."); for (String sample : samples) logger.info("Including sample '" + sample + "'"); // if user specified types to include, add these, otherwise, add all possible variant context // types to list of vc types to include if (TYPES_TO_INCLUDE.isEmpty()) { for (VariantContext.Type t : VariantContext.Type.values()) selectedTypes.add(t); } else { for (VariantContext.Type t : TYPES_TO_INCLUDE) selectedTypes.add(t); } // Initialize VCF header Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); if (KEEP_ORIGINAL_CHR_COUNTS) { headerLines.add( new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC")); headerLines.add( new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF")); headerLines.add( new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN")); } vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { // It's not necessary that the user supply select names for the JEXL expressions, since those // expressions will only be needed for omitting records. Make up the select names here. selectNames.add(String.format("select-%d", i)); } jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS); // Look at the parameters to decide which analysis to perform DISCORDANCE_ONLY = discordanceTrack.isBound(); if (DISCORDANCE_ONLY) logger.info( "Selecting only variants discordant with the track: " + discordanceTrack.getName()); CONCORDANCE_ONLY = concordanceTrack.isBound(); if (CONCORDANCE_ONLY) logger.info( "Selecting only variants concordant with the track: " + concordanceTrack.getName()); if (MENDELIAN_VIOLATIONS) { if (FAMILY_STRUCTURE_FILE != null) { try { for (final String line : new XReadLines(FAMILY_STRUCTURE_FILE)) { MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD); if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom())) mvSet.add(mv); } } catch (FileNotFoundException e) { throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); } if (outMVFile != null) try { outMVFileStream = new PrintStream(outMVFile); } catch (FileNotFoundException e) { throw new UserException.CouldNotCreateOutputFile( outMVFile, "Can't open output file", e); } } else mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); } else if (!FAMILY_STRUCTURE.isEmpty()) { mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); MENDELIAN_VIOLATIONS = true; } SELECT_RANDOM_NUMBER = numRandom > 0; if (SELECT_RANDOM_NUMBER) { logger.info("Selecting " + numRandom + " variants at random from the variant track"); variantArray = new RandomVariantStructure[numRandom]; } SELECT_RANDOM_FRACTION = fractionRandom > 0; if (SELECT_RANDOM_FRACTION) logger.info( "Selecting approximately " + 100.0 * fractionRandom + "% of the variants at random from the variant track"); }