/** * Concatenate VCF files of non-overlapping genome intervals, all with the same set of samples * * <p>The main purpose of this tool is to speed up the gather function when using scatter-gather * parallelization. This tool concatenates the scattered output VCF files. It assumes that: * * <ul> * <li>All the input VCFs (or BCFs) contain the same samples in the same order. * <li>The variants in each input file are from non-overlapping (scattered) intervals. * </ul> * * <p>When the input files are already sorted based on the intervals start positions, use * -assumeSorted. * * <h3>Input</h3> * * <p>Two or more variant sets to combine. They should be of non-overlapping genome intervals and * with the same samples (sorted in the same order). If the files are ordered according to the * appearance of intervals in the ref genome, then one can use the -assumeSorted flag. * * <h3>Output</h3> * * <p>A combined VCF or BCF. The output file should have the same extension as the input(s). <\p> * * <h3>Important note</h3> * * <p>This is a command-line utility that bypasses the GATK engine. As a result, the command-line * you must use to invoke it is a little different from other GATK tools (see example below), and it * does not accept any of the classic "CommandLineGATK" arguments. * * <h3>Usage example</h3> * * <pre> * java -cp GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants \ * -R reference.fasta \ * -V input1.vcf \ * -V input2.vcf \ * -out output.vcf \ * -assumeSorted * </pre> * * <h3>Caveat</h3> * * <p>Currently the tool is more efficient when working with VCFs than with BCFs. * * @author Ami Levy Moonshine * @since Jan 2012 */ @DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_VARMANIP) public class CatVariants extends CommandLineProgram { // setup the logging system, used by some codecs private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); @Input( fullName = "reference", shortName = "R", doc = "genome reference file <name>.fasta", required = true) private File refFile = null; /** * The VCF or BCF files to merge together * * <p>CatVariants can take any number of -V arguments on the command line. Each -V argument will * be included in the final merged output VCF/BCF. The order of arguments does not matter, but it * runs more efficiently if they are sorted based on the intervals and the assumeSorted argument * is used. */ @Input(fullName = "variant", shortName = "V", doc = "Input VCF file/s", required = true) private List<File> variant = null; @Output(fullName = "outputFile", shortName = "out", doc = "output file", required = true) private File outputFile = null; @Argument( fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if the input files are already sorted (based on the position of the variants)", required = false) private Boolean assumeSorted = false; @Argument( fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false) private GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; @Argument( fullName = "variant_index_parameter", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false) private Integer variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; /* * print usage information */ private static void printUsage() { System.err.println( "Usage: java -cp target/GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants --reference <reference> --variant <input VCF or BCF file; can specify --variant multiple times> --outputFile <outputFile> [--assumeSorted]"); System.err.println(" The output file must be of the same type as all input files."); System.err.println( " If the input files are already sorted, then indicate that with --assumeSorted to improve performance."); } private enum FileType { VCF, BCF, BLOCK_COMPRESSED_VCF, INVALID } private FileType fileExtensionCheck(File inFile, FileType previousFileType) { final String inFileName = inFile.toString().toLowerCase(); if (inFileName.endsWith(".vcf")) { if (previousFileType == FileType.VCF || previousFileType == null) { return FileType.VCF; } } if (inFileName.endsWith(".bcf")) { if (previousFileType == FileType.BCF || previousFileType == null) { return FileType.BCF; } } for (String extension : AbstractFeatureReader.BLOCK_COMPRESSED_EXTENSIONS) { if (inFileName.endsWith(".vcf" + extension)) { if (previousFileType == FileType.BLOCK_COMPRESSED_VCF || previousFileType == null) { return FileType.BLOCK_COMPRESSED_VCF; } } } System.err.println( String.format("File extension for input file %s is not valid for CatVariants", inFile)); printUsage(); return FileType.INVALID; } private FeatureReader<VariantContext> getFeatureReader(final FileType fileType, final File file) { FeatureReader<VariantContext> reader = null; switch (fileType) { case VCF: case BLOCK_COMPRESSED_VCF: // getFeatureReader will handle both block-compressed and plain text VCFs reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false); break; case BCF: reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false); break; } return reader; } /** * Replaces any .list files in rawFileList with the files named in said .list file * * @param rawFileList the original file list, possibly including .list files * @return a new List, with .list files replaced */ private List<File> parseVariantList(final List<File> rawFileList) { final List<File> result = new ArrayList<>(rawFileList.size()); for (final File rawFile : rawFileList) { if (rawFile.getName().endsWith(".list")) { try { for (final String line : new XReadLines(rawFile, true)) result.add(new File(line)); } catch (IOException e) { throw new UserException.CouldNotReadInputFile(rawFile, e); } } else { result.add(rawFile); } } return result; } @Override protected int execute() throws Exception { BasicConfigurator.configure(); logger.setLevel(Level.INFO); final ReferenceSequenceFile ref; try { ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); } catch (Exception e) { throw new UserException("Couldn't load provided reference sequence file " + refFile, e); } variant = parseVariantList(variant); Comparator<Pair<Integer, File>> positionComparator = new PositionComparator(); Queue<Pair<Integer, File>> priorityQueue; if (assumeSorted) priorityQueue = new LinkedList<>(); else priorityQueue = new PriorityQueue<>(10000, positionComparator); FileType fileType = null; for (File file : variant) { // if it returns a valid type, it will be the same for all files fileType = fileExtensionCheck(file, fileType); if (fileType == FileType.INVALID) return 1; if (assumeSorted) { priorityQueue.add(new Pair<>(0, file)); } else { if (!file.exists()) { throw new UserException(String.format("File %s doesn't exist", file.getAbsolutePath())); } FeatureReader<VariantContext> reader = getFeatureReader(fileType, file); Iterator<VariantContext> it = reader.iterator(); if (!it.hasNext()) { System.err.println( String.format("File %s is empty. This file will be ignored", file.getAbsolutePath())); continue; } VariantContext vc = it.next(); int firstPosition = vc.getStart(); reader.close(); priorityQueue.add(new Pair<>(firstPosition, file)); } } FileOutputStream outputStream = new FileOutputStream(outputFile); EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY); IndexCreator idxCreator = GATKVCFUtils.makeIndexCreator( variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary()); final VariantContextWriter outputWriter = VariantContextWriterFactory.create( outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); boolean firstFile = true; int count = 0; while (!priorityQueue.isEmpty()) { count++; File file = priorityQueue.remove().getSecond(); if (!file.exists()) { throw new UserException(String.format("File %s doesn't exist", file.getAbsolutePath())); } FeatureReader<VariantContext> reader = getFeatureReader(fileType, file); if (count % 10 == 0) System.out.print(count); else System.out.print("."); if (firstFile) { VCFHeader header = (VCFHeader) reader.getHeader(); outputWriter.writeHeader(header); firstFile = false; } Iterator<VariantContext> it = reader.iterator(); while (it.hasNext()) { VariantContext vc = it.next(); outputWriter.add(vc); } reader.close(); } System.out.println(); outputWriter.close(); return 0; } public static void main(String[] args) { try { CatVariants instance = new CatVariants(); start(instance, args); System.exit(CommandLineProgram.result); } catch (UserException e) { printUsage(); exitSystemWithUserError(e); } catch (Exception e) { exitSystemWithError(e); } } private static class PositionComparator implements Comparator<Pair<Integer, File>> { @Override public int compare(Pair<Integer, File> p1, Pair<Integer, File> p2) { int startPositionP1 = p1.getFirst(); int startPositionP2 = p2.getFirst(); if (startPositionP1 == startPositionP2) return 0; return startPositionP1 < startPositionP2 ? -1 : 1; } } }