Exemplo n.º 1
0
  private void writeDifferences(
      final List<VariantContext> source1Alleles, final List<VariantContext> source2Alleles) {
    int currentIndex1 = 0, currentIndex2 = 0;
    final int size1 = source1Alleles.size(), size2 = source2Alleles.size();
    VariantContext current1 = source1Alleles.get(0);
    VariantContext current2 = source2Alleles.get(0);

    while (currentIndex1 < size1 || currentIndex2 < size2) {
      if (current1 == null) {
        writeOne(current2, source2, null);
        currentIndex2++;
        current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
      } else if (current2 == null) {
        writeOne(current1, source1, null);
        currentIndex1++;
        current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
      } else {

        final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
        final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);

        if (loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2)) {
          String status;
          if (loc1.getStart() == loc2.getStart()) {
            final String allele1 = current1.getAlternateAllele(0).getBaseString();
            final String allele2 = current2.getAlternateAllele(0).getBaseString();
            if (allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1)
              status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS;
            else status = SAME_START_DIFFERENT_ALLELES_STATUS;
          } else {
            status = OVERLAPPING_EVENTS_STATUS;
          }

          writeOne(current1, INTERSECTION_SET, status);
          currentIndex1++;
          currentIndex2++;
          current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
          current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
        } else if (loc1.isBefore(loc2)) {
          writeOne(current1, source1, null);
          currentIndex1++;
          current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1) : null);
        } else {
          writeOne(current2, source2, null);
          currentIndex2++;
          current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2) : null);
        }
      }
    }
  }
Exemplo n.º 2
0
  private byte[] generateHaplotype(
      final List<VariantContext> sourceVCs, final ReferenceContext refContext) {

    final StringBuilder sb = new StringBuilder();

    final int startPos = refContext.getWindow().getStart();
    int currentPos = startPos;
    final byte[] reference = refContext.getBases();

    for (final VariantContext vc : sourceVCs) {
      // add any missing reference context
      int vcStart = vc.getStart();
      final int refAlleleLength = vc.getReference().length();
      if (refAlleleLength
          == vc.getEnd()
              - vc.getStart()) // this is a deletion (whereas for other events the padding base
        // isn't part of the position)
        vcStart++;

      while (currentPos < vcStart) sb.append((char) reference[currentPos++ - startPos]);

      // add the alt allele
      sb.append(vc.getAlternateAllele(0).getBaseString());

      // skip the reference allele
      currentPos += refAlleleLength;
    }
    // add any missing reference context
    final int stopPos = refContext.getWindow().getStop();
    while (currentPos < stopPos) sb.append((char) reference[currentPos++ - startPos]);

    return sb.toString().getBytes();
  }
Exemplo n.º 3
0
  @Test
  public void shouldPreserveSymbolicAlleleCase() {
    VCFFileReader reader =
        new VCFFileReader(new File(VariantBaseTest.variantTestDataRoot + "breakpoint.vcf"), false);
    VariantContext variant = reader.iterator().next();
    reader.close();

    // VCF v4.1 s1.4.5
    // Tools processing VCF files are not required to preserve case in the allele String, except for
    // IDs, which are case sensitive.
    Assert.assertTrue(variant.getAlternateAllele(0).getDisplayString().contains("chr12"));
  }
  @Override
  public void runCommand() {

    logger.info("MergeVCFColumnsCommand");
    /*
     * Assumptions
     * (1) Only two vcfs that are sorted with the same contig order
     * (2) if contigs on it same order, then we will just skip that contig
     * (3) No overlapping samples allowed
     *
     * Output:
     * A vcf where intersecting sites are merged together and will only return biallelic markers
     * the info field will be cleared
     * the only GT FORMAT field will be there
     */

    Collection<File> vcfs = applicationOptions.getVcfs();
    String outfile = applicationOptions.getOutFile();

    if (vcfs.size() != 2) {
      throw new IllegalArgumentException("This function requires exactly two vcfs");
    }

    Iterator<File> vcfFileIter = vcfs.iterator();

    File vcf1 = vcfFileIter.next();
    File vcf2 = vcfFileIter.next();
    VCFFileReader reader1 = new VCFFileReader(vcf1, false);
    VCFFileReader reader2 = new VCFFileReader(vcf2, false);

    Iterator<VariantContext> iter1 = reader1.iterator();
    Iterator<VariantContext> iter2 = reader2.iterator();

    VariantContextComparator comparator = new VariantContextComparator();

    /*
     * Merge headers
     */
    VCFHeader header1 = reader1.getFileHeader();

    VCFHeader header2 = reader2.getFileHeader();

    List<String> samples1 = header1.getGenotypeSamples();
    List<String> samples2 = header2.getGenotypeSamples();

    List<String> mergedSamples = new ArrayList<>(samples1.size() + samples2.size());
    mergedSamples.addAll(samples1);
    mergedSamples.addAll(samples2);

    // Validate that there are no duplicates
    HashSet<String> sampleSet = new HashSet<String>();
    for (String id : mergedSamples) {
      if (sampleSet.contains(id)) {
        throw new IllegalArgumentException("Duplicate id found: " + id);
      } else {
        sampleSet.add(id);
      }
    }

    HashSet<VCFHeaderLine> meta = new HashSet<>();
    meta.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "GT"));
    meta.addAll(header1.getContigLines());
    VCFHeader mergedHeader = new VCFHeader(meta, mergedSamples);

    /*
     * Create encoder
     */
    VCFEncoder encoder = new VCFEncoder(mergedHeader, false, false);

    BufferedWriter writer = null;
    try {
      if (outfile.endsWith(".gz")) {
        BlockCompressedOutputStream outstream = new BlockCompressedOutputStream(new File(outfile));
        writer = new BufferedWriter(new OutputStreamWriter(outstream));
      } else {
        writer =
            Files.newBufferedWriter(
                Paths.get(outfile),
                Charset.defaultCharset(),
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING);
      }

      /*
       * Write header
       */
      VCFHeaderWriter.writeHeader(writer, mergedHeader);
      logger.info("Wrote header");

      VariantContext previous1 = null;
      VariantContext previous2 = null;

      int count = 0;

      int countFile1 = 0;
      int countFile2 = 0;

      boolean usePrevious1 = false;
      boolean usePrevious2 = false;
      while (iter1.hasNext() || iter2.hasNext()) {

        if ((iter1.hasNext() || usePrevious1) && (iter2.hasNext() || usePrevious2)) {

          VariantContext variant1 = null;

          VariantContext variant2 = null;

          //					if(usePrevious1 == true && usePrevious2 == true &&
          // comparator.compare(previous1,previous2) != 0) {
          //						//then skip both
          //						usePrevious1 = false;
          //						usePrevious2 = false;
          //					}

          if (usePrevious1) {
            variant1 = previous1;
          } else {
            variant1 = iter1.next();
            countFile1++;
          }

          if (usePrevious2) {
            variant2 = previous2;
          } else {
            variant2 = iter2.next();
            countFile2++;
          }

          // check that variants are ordered correctly
          if (previous1 != null
              && previous1 != variant1
              && comparator.compare(previous1, variant1) > 0) {
            throw new IllegalStateException(
                previous1.getContig()
                    + ":"
                    + previous1.getStart()
                    + " > "
                    + variant1.getContig()
                    + ":"
                    + variant1.getStart());
          }

          if (previous2 != null
              && previous2 != variant2
              && comparator.compare(previous2, variant2) > 0) {
            throw new IllegalStateException(
                previous2.getContig()
                    + ":"
                    + previous2.getStart()
                    + " > "
                    + variant2.getContig()
                    + ":"
                    + variant2.getStart());
          }

          int cmp = comparator.compare(variant1, variant2);

          if (cmp < 0) {
            // logger.info("Skipping VCF1: " + variant1.getContig() + ":" + variant1.getStart() +
            // "\t" + variant1.getReference().toString() +  "\t" + variant1.getAlternateAlleles());

            if (usePrevious1 == true && usePrevious2 == true) {
              // variant1 < variant2
              // we need to go to next variant in vcf1
              usePrevious1 = false;
            }

            usePrevious2 = true;
          } else if (cmp > 0) {

            if (usePrevious1 == true && usePrevious2 == true) {
              // variant1 > variant2
              // we need to go to next variant in vcf2
              usePrevious2 = false;
            }

            usePrevious1 = true;
            // logger.info("Skipping VCF2: " + variant2.getContig() + ":" + variant2.getStart() +
            // "\t" + variant2.getReference().toString() +  "\t" + variant2.getAlternateAlleles());
          } else {
            // they equal position
            usePrevious1 = false;
            usePrevious2 = false;

            if (variant1.isBiallelic()
                && variant2.isBiallelic()
                && variant1.getReference().equals(variant2.getReference())
                && variant1.getAlternateAllele(0).equals(variant2.getAlternateAllele(0))) {

              // TODO: Finish merging
              // both variants are bialleleic and the reference and alternative alleles match

              count++;
              if (count % 10000 == 0) {
                logger.info(count + " mergeable variants found");
              }

              VariantContext merged = VariantContextMerger.merge(variant1, variant2);

              writer.write(encoder.encode(merged));
              writer.write("\n");

            } else {
              // skip if they do not equal

              //			logger.info("Skipping: " + variant1.getContig() + ":" + variant1.getStart() +
              // "\t" + variant1.getReference().toString() +  "\t" +
              // variant1.getAlternateAlleles());
              //			logger.info("Skipping: " + variant2.getContig() + ":" + variant2.getStart() +
              // "\t" + variant2.getReference().toString() +  "\t" +
              // variant2.getAlternateAlleles());
            }
          }

          previous1 = variant1;
          previous2 = variant2;
        } else if (iter1.hasNext()) {
          // just skip remaining variants
          VariantContext current = iter1.next();
          countFile1++;

          if (previous1 != null && current != null && comparator.compare(previous1, current) > 0) {
            throw new IllegalStateException(
                previous1.getContig()
                    + ":"
                    + previous1.getStart()
                    + " > "
                    + current.getContig()
                    + ":"
                    + current.getStart());
          }

          previous1 = current;

          // logger.info("Skipping: " + previous1.getContig() + ":" + previous1.getStart() + "\t" +
          // previous1.getReference().toString() +  "\t" + previous1.getAlternateAlleles());

        } else if (iter2.hasNext()) {
          // just skip remaining variants
          // fixed bug/ was iter1 changed to iter2
          VariantContext current = iter2.next();
          countFile2++;

          if (previous2 != null && current != null && comparator.compare(previous2, current) > 0) {
            throw new IllegalStateException(
                previous2.getContig()
                    + ":"
                    + previous2.getStart()
                    + " > "
                    + current.getContig()
                    + ":"
                    + current.getStart());
          }

          previous2 = current;

          // logger.info("Skipping: " + previous2.getContig() + ":" + previous2.getStart() + "\t" +
          // previous2.getReference().toString() +  "\t" + previous2.getAlternateAlleles());
        } else {
          throw new IllegalStateException("Error should not of reached this point");
        }
      }

      reader1.close();
      reader2.close();

      logger.info(count + "  merged variants");
      logger.info(countFile1 + "  variants in " + vcf1.getAbsolutePath());
      logger.info(countFile2 + "  variants in " + vcf2.getAbsolutePath());

    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (writer != null) {
        try {
          logger.info("Flushing writer");
          writer.close();
        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }

    logger.info("finished merging vcfs");
  }