Exemplo n.º 1
0
  /**
   * Assembles contigs from the reads of all input files that overlap the given regions.
   *
   * <p>The method proceeds in this order:
   *
   * <ol>
   *   <li>If no kmer sizes are configured, or the first one is below
   *       {@code KmerSizeEvaluator.MIN_KMER}, {@code this.kmers} is replaced with sizes derived
   *       via {@code KmerSizeEvaluator.identifyMinKmer} and {@code realigner.toKmerArray}.
   *   <li>Reads are loaded per input file. Reads that trigger assembly are counted per sample and,
   *       when SV searching is enabled, the mate position of every SV candidate read is recorded.
   *   <li>Assembly runs when {@code c2r} is {@code null} or when at least one sample has more
   *       trigger reads than {@code minCandidateCount} returns. Each kmer size is tried in order
   *       until {@code assemble} returns something other than {@code "<REPEAT>"}.
   *   <li>When SV searching is enabled, the recorded mate positions are clustered into candidate
   *       breakpoint regions.
   * </ol>
   *
   * <p>Side effects: may overwrite {@code kmers}; appends to {@code svCandidates} and
   * {@code svCandidateRegions}; may set {@code isCycleExceedingThresholdDetected}; writes a timing
   * line to {@code System.err} when {@code isDebug} is set.
   *
   * @param inputFiles paths of the alignment files to read, one per sample
   * @param output prefix of the per-kmer output file name ({@code output + "_k" + kmer})
   * @param tempDir not used by this method
   * @param regions regions to load reads for; the first region is used for thresholds and logging
   * @param prefix passed through to {@code assemble}
   * @param checkForDupes not used by this method
   * @param realigner supplies read filtering settings and the kmer array
   * @param c2r reference comparison helper; {@code null} marks the unaligned region, which is
   *     always assembled
   * @return the value returned by the last {@code assemble} call (this may be {@code "<REPEAT>"}
   *     if every kmer size hit a repeat), or the empty string if assembly was skipped
   * @throws RuntimeException wrapping any exception raised while loading reads or assembling
   */
  public String assembleContigs(
      List<String> inputFiles,
      String output,
      String tempDir,
      List<Feature> regions,
      String prefix,
      boolean checkForDupes,
      ReAligner realigner,
      CompareToReference2 c2r) {

    if ((kmers.length == 0) || (kmers[0] < KmerSizeEvaluator.MIN_KMER)) {
      KmerSizeEvaluator kmerEval = new KmerSizeEvaluator();
      int minKmer = kmerEval.identifyMinKmer(readLength, c2r, regions);
      this.kmers = realigner.toKmerArray(minKmer, readLength);
    }

    String contigs = "";
    long start = System.currentTimeMillis();
    int readCount = 0;
    int minReadCount = Integer.MAX_VALUE;

    // If c2r is null, this is the unaligned region, which is always assembled.
    boolean isAssemblyCandidate = (c2r == null);

    try {
      List<List<SAMRecord>> readsList = getReads(inputFiles, regions, realigner);

      for (List<SAMRecord> reads : readsList) {
        int candidateReadCount = 0;

        for (SAMRecord read : reads) {
          if (!isAssemblyCandidate && isAssemblyTriggerCandidate(read, c2r)) {
            candidateReadCount++;
          }

          if (shouldSearchForSv && isSvCandidate(read)) {
            svCandidates.add(
                new Position(read.getMateReferenceName(), read.getMateAlignmentStart()));
          }
        }

        // A single sample exceeding its threshold is enough to trigger assembly.
        if (candidateReadCount > minCandidateCount(reads.size(), regions.get(0))) {
          isAssemblyCandidate = true;
        }

        minReadCount = Math.min(minReadCount, reads.size());
        readCount += reads.size();
      }

      // Encode once; the same input is reused for every kmer size attempted below.
      String readData = null;
      if (isAssemblyCandidate) {
        readData = encodeReadsForAssembly(readsList, desiredNumberOfReads(regions));
      }

      readsList.clear();

      if (isAssemblyCandidate) {
        for (int kmer : kmers) {
          String outputFile = output + "_k" + kmer;

          contigs =
              assemble(
                  readData,
                  outputFile,
                  prefix,
                  truncateOnRepeat ? 1 : 0,
                  maxContigs,
                  maxPathsFromRoot,
                  readLength,
                  kmer,
                  minKmerFrequency,
                  minBaseQuality,
                  minEdgeRatio,
                  isDebug ? 1 : 0,
                  maxNodes);

          if (!contigs.equals("<REPEAT>")) {
            break;
          }

          // A repeat was reported for this kmer size; the next (if any) is tried.
          if (kmer >= readLength / 2 || kmer >= CYCLE_KMER_LENGTH_THRESHOLD) {
            isCycleExceedingThresholdDetected = true;
          }
        }
      }
    } catch (Exception e) {
      // Kept for parity with existing behaviour: the trace is printed here and the exception is
      // rethrown unchecked with its cause preserved.
      e.printStackTrace();
      throw new RuntimeException(e);
    }

    if (this.shouldSearchForSv) {
      collectSvCandidateRegions(minReadCount);
    }

    long end = System.currentTimeMillis();

    if (isDebug) {
      int reportedKmer = (kmers.length > 0) ? kmers[0] : readLength + 1;

      System.err.println(
          "Elapsed_msecs_in_NativeAssembler\tRegion:\t"
              + regions.get(0).getDescriptor()
              + "\tLength:\t"
              + regions.get(0).getLength()
              + "\tReadCount:\t"
              + readCount
              + "\tElapsed\t"
              + (end - start)
              + "\tAssembled\t"
              + isAssemblyCandidate
              + "\t"
              + reportedKmer);
    }

    return contigs;
  }

  /**
   * Serializes the reads of every sample into the single string handed to {@code assemble}.
   *
   * <p>Each kept read contributes: one char holding the 1-based sample index, {@code "1"} or
   * {@code "0"} for the negative strand flag, the bases padded with {@code 'N'} up to
   * {@code readLength}, then the base qualities padded with {@code '!'} by the same amount.
   * Samples larger than {@code downsampleTarget} are thinned with a fixed-seed random generator,
   * so the selection is repeatable. Each sample's list is cleared after it has been encoded.
   */
  private String encodeReadsForAssembly(List<List<SAMRecord>> readsList, int downsampleTarget) {
    StringBuilder readBuffer = new StringBuilder();
    char sampleId = 1;

    for (List<SAMRecord> reads : readsList) {
      // Default to always keep (nextDouble() is always below 1.1).
      double keepProbability = 1.1;

      if (reads.size() > downsampleTarget) {
        keepProbability = (double) downsampleTarget / (double) reads.size();
      }

      Random random = new Random(1);

      for (SAMRecord read : reads) {
        if (random.nextDouble() < keepProbability) {
          String bases = read.getReadString();
          int padLength = readLength - bases.length();

          readBuffer.append(sampleId);
          readBuffer.append(read.getReadNegativeStrandFlag() ? "1" : "0");

          readBuffer.append(bases);
          for (int i = 0; i < padLength; i++) {
            readBuffer.append('N');
          }

          readBuffer.append(read.getBaseQualityString());
          for (int i = 0; i < padLength; i++) {
            readBuffer.append('!');
          }
        }
      }

      // Make this set of reads eligible for GC
      reads.clear();
      sampleId += 1;
    }

    return readBuffer.toString();
  }

  /**
   * Sorts the recorded SV candidate mate positions and groups neighbouring ones (same chromosome,
   * less than the window size apart) into candidate breakpoint regions.
   */
  private void collectSvCandidateRegions(int minReadCount) {
    Collections.sort(this.svCandidates);

    // TODO: Calc this dynamically
    int windowSize = 500;

    Position last = null;
    String featureChr = null;
    int featureStart = -1;
    int featureStop = -1;
    int featureCount = 0;

    for (Position pos : this.svCandidates) {
      boolean nearPrevious =
          (last != null)
              && pos.getChromosome().equals(last.getChromosome())
              && Math.abs(pos.getPosition() - last.getPosition()) < windowSize;

      if (nearPrevious) {
        if (featureChr == null) {
          // Re-open a feature starting at the previous position.
          featureChr = pos.getChromosome();
          featureStart = last.getPosition();
          featureCount = 1;
        } else {
          featureCount++;
        }
        featureStop = pos.getPosition() + readLength;
      } else if (featureChr != null) {
        // Gap or chromosome change: close the open feature. The current position is not dropped;
        // it becomes 'last', so a following nearby position re-opens a feature starting there.
        addSvCandidateRegionIfSupported(
            featureChr, featureStart, featureStop, featureCount, minReadCount);
        featureChr = null;
        featureStart = -1;
        featureStop = -1;
        featureCount = 0;
      } else {
        featureChr = pos.getChromosome();
        featureStart = pos.getPosition();
        featureStop = pos.getPosition() + readLength;
        featureCount = 1;
      }

      last = pos;
    }

    // Don't forget last SV candidate region
    addSvCandidateRegionIfSupported(
        featureChr, featureStart, featureStop, featureCount, minReadCount);
  }

  /**
   * Registers a breakpoint candidate, widened by {@code readLength} on both sides, when the number
   * of supporting reads exceeds the threshold derived from {@code minReadCount}. Does nothing when
   * no feature is open ({@code chromosome} is {@code null}).
   */
  private void addSvCandidateRegionIfSupported(
      String chromosome, int featureStart, int featureStop, int supportingReads, int minReadCount) {

    if (chromosome == null) {
      return;
    }

    if (supportingReads
        > (minReadCount / MAX_READ_LENGTHS_PER_REGION) * minReadCandidateFraction) {
      Feature region =
          new Feature(chromosome, featureStart - readLength, featureStop + readLength);
      BreakpointCandidate candidate = new BreakpointCandidate(region, supportingReads);
      this.svCandidateRegions.add(candidate);
    }
  }
Exemplo n.º 2
0
  /**
   * Returns a downsampled set of reads for each sample.
   *
   * <p>For every input file, each region is queried in turn (a {@code null} region iterates the
   * whole file) and the accepted reads are added to that file's {@code DownsampledReadList}. A
   * read is accepted when all of the following hold:
   *
   * <ul>
   *   <li>{@code realigner.isFiltered(read)} is false;
   *   <li>it is not flagged as a duplicate and does not fail the vendor quality check;
   *   <li>its mapping quality is at least {@code realigner.getMinMappingQuality()}, or it is
   *       unmapped;
   *   <li>its identifier has not already been seen for this input file.
   * </ul>
   *
   * @param inputFiles paths of the alignment files, one per sample
   * @param regions regions to query; entries may be {@code null}
   * @param realigner supplies the read filter and the minimum mapping quality
   * @return one list of reads per input file, in the order of {@code inputFiles}
   * @throws IllegalArgumentException if an accepted read is longer than {@code readLength}
   */
  private List<List<SAMRecord>> getReads(
      List<String> inputFiles, List<Feature> regions, ReAligner realigner) {

    int downsampleTarget = desiredNumberOfReads(regions);
    List<DownsampledReadList> readsList = new ArrayList<DownsampledReadList>();

    for (String input : inputFiles) {
      // Identifiers are tracked per input file so a read overlapping several regions is only
      // counted once.
      Set<String> readIds = new HashSet<String>();
      DownsampledReadList reads = new DownsampledReadList(downsampleTarget);
      readsList.add(reads);

      for (Feature region : regions) {
        SAMFileReader reader = new SAMFileReader(new File(input));

        // try/finally guarantees the reader is released even when the query, the iteration or
        // the read length check throws.
        try {
          reader.setValidationStringency(ValidationStringency.SILENT);

          Iterator<SAMRecord> iter;
          if (region != null) {
            iter =
                reader.queryOverlapping(
                    region.getSeqname(), (int) region.getStart(), (int) region.getEnd());
          } else {
            iter = reader.iterator();
          }

          while (iter.hasNext()) {
            SAMRecord read = iter.next();

            boolean passesFilters =
                (!realigner.isFiltered(read))
                    && (!read.getDuplicateReadFlag())
                    && (!read.getReadFailsVendorQualityCheckFlag())
                    && (read.getMappingQuality() >= realigner.getMinMappingQuality()
                        || read.getReadUnmappedFlag());

            if (!passesFilters) {
              continue;
            }

            // Don't allow same read to be counted twice.
            String readId = getIdentifier(read);
            if (readIds.contains(readId)) {
              continue;
            }

            if (read.getReadString().length() > readLength) {
              throw new IllegalArgumentException(
                  "Maximum read length of: "
                      + readLength
                      + " exceeded for: "
                      + read.getSAMString());
            }

            readIds.add(readId);
            reads.add(read);
          }

          // The counts are cumulative for this input file, and the message always names the
          // first region regardless of which region was just processed.
          if (reads.getTotalReadCount() != reads.getReads().size()) {
            if (isDebug) {
              System.err.println(
                  "downsampled: "
                      + regions.get(0).getDescriptor()
                      + ": "
                      + reads.getTotalReadCount()
                      + " -> "
                      + reads.getReads().size());
            }
          }
        } finally {
          reader.close();
        }
      }
    }

    List<List<SAMRecord>> sampleReads = new ArrayList<List<SAMRecord>>();

    for (DownsampledReadList downsampledReads : readsList) {
      sampleReads.add(downsampledReads.getReads());
    }

    return sampleReads;
  }