Exemplo n.º 1
0
  /**
   * Reads the refFlat file and builds an overlap detector holding the genes it describes.
   *
   * <p>Rows whose chromosome is rejected by {@code isSequenceRecognized} are skipped with a debug
   * message. The remaining rows are grouped by gene name, and each group is converted into a
   * {@code Gene} by {@code makeGeneFromRefFlatLines}. A group that raises an
   * {@code AnnotationException} is logged at debug level and left out of the result.
   *
   * <p>The parser is closed before this method returns, whether it completes normally or throws.
   *
   * @return an overlap detector in which every successfully built gene is registered under itself
   * @throws AnnotationException if a row does not contain exactly one field per
   *     {@code RefFlatColumns} value
   */
  OverlapDetector<Gene> load() {
    final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0);

    final int expectedColumns = RefFlatColumns.values().length;
    final TabbedTextFileWithHeaderParser parser =
        new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
    try {
      final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene =
          new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>();

      for (final TabbedTextFileWithHeaderParser.Row row : parser) {
        final int lineNumber =
            parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line
        if (row.getFields().length != expectedColumns) {
          throw new AnnotationException(
              "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber);
        }
        final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
        final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
        final String transcriptDescription = geneName + ":" + transcriptName;
        final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
        if (!isSequenceRecognized(chromosome)) {
          LOG.debug(
              "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
        } else {
          // Collect all transcript rows of a gene so the gene can be built in one go below.
          List<TabbedTextFileWithHeaderParser.Row> transcriptLines =
              refFlatLinesByGene.get(geneName);
          if (transcriptLines == null) {
            transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>();
            refFlatLinesByGene.put(geneName, transcriptLines);
          }
          transcriptLines.add(row);
        }
      }

      int longestInterval = 0;
      int numIntervalsOver1MB = 0;

      for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines :
          refFlatLinesByGene.values()) {
        try {
          final Gene gene = makeGeneFromRefFlatLines(transcriptLines);
          overlapDetector.addLhs(gene, gene);
          final int geneLength = gene.length();
          if (geneLength > longestInterval) {
            longestInterval = geneLength;
          }
          if (geneLength > 1000000) {
            ++numIntervalsOver1MB;
          }
        } catch (AnnotationException e) {
          // Deliberate best effort: one inconsistent gene must not abort the whole load.
          LOG.debug(e.getMessage() + " -- skipping");
        }
      }
      LOG.debug(
          "Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
      return overlapDetector;
    } finally {
      // Always release the parser, including when a malformed row aborts the load. It is closed
      // only after the genes are built so the collected rows are never used after close().
      parser.close();
    }
  }
Exemplo n.º 2
0
  /**
   * For each line in the MULTIPLEX_PARAMS file create a FastqRecordsWriter and put it in the
   * sampleBarcodeFastqWriterMap map, where the key to the map is the concatenation of all
   * sampleBarcodes in order for the given line.
   *
   * <p>The key is {@code null} when no BARCODE_n columns are expected or when any barcode value on
   * the line is "N". The parser is closed before this method returns, whether it completes
   * normally or throws.
   *
   * @throws PicardException if two rows map to the same key, or if the file has no data rows
   */
  private void populateWritersFromMultiplexParams() {
    final TabbedTextFileWithHeaderParser libraryParamsParser =
        new TabbedTextFileWithHeaderParser(MULTIPLEX_PARAMS);
    try {
      final Set<String> expectedColumnLabels = CollectionUtil.makeSet("OUTPUT_PREFIX");
      final List<String> sampleBarcodeColumnLabels = new ArrayList<String>();
      for (int i = 1; i <= readStructure.sampleBarcodes.length(); i++) {
        sampleBarcodeColumnLabels.add("BARCODE_" + i);
      }

      expectedColumnLabels.addAll(sampleBarcodeColumnLabels);
      assertExpectedColumns(libraryParamsParser.columnLabels(), expectedColumnLabels);

      for (final TabbedTextFileWithHeaderParser.Row row : libraryParamsParser) {
        List<String> sampleBarcodeValues = null;

        if (!sampleBarcodeColumnLabels.isEmpty()) {
          sampleBarcodeValues = new ArrayList<String>();
          for (final String sampleBarcodeLabel : sampleBarcodeColumnLabels) {
            sampleBarcodeValues.add(row.getField(sampleBarcodeLabel));
          }
        }

        final String key =
            (sampleBarcodeValues == null || sampleBarcodeValues.contains("N"))
                ? null
                : StringUtil.join("", sampleBarcodeValues);
        // This will also catch the case of having more than 1 line in a non-barcoded
        // MULTIPLEX_PARAMS file, because every such line maps to the null key.
        if (sampleBarcodeFastqWriterMap.containsKey(key)) {
          throw new PicardException(
              "Row for barcode "
                  + key
                  + " appears more than once in MULTIPLEX_PARAMS file "
                  + MULTIPLEX_PARAMS);
        }

        final FastqRecordsWriter writer = buildWriter(new File(row.getField("OUTPUT_PREFIX")));
        sampleBarcodeFastqWriterMap.put(key, writer);
      }
      if (sampleBarcodeFastqWriterMap.isEmpty()) {
        throw new PicardException(
            "MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS + " does not have any data rows.");
      }
    } finally {
      // Close on every exit path; previously any exception above leaked the parser.
      libraryParamsParser.close();
    }
  }
Exemplo n.º 3
0
  /**
   * Builds one transcript from a single refFlat row and registers it on the supplied gene.
   *
   * <p>Conversion from 0-based half-open to 1-based inclusive intervals is done here: every start
   * coordinate read from the row is incremented by one, while end coordinates are used as read.
   *
   * @param gene the gene that receives the new transcript
   * @param row the refFlat row describing the transcript
   * @return the transcript returned by {@code gene.addTranscript}, populated with its exons
   * @throws AnnotationException if the exon count disagrees with the number of exon starts or
   *     ends, if an exon's start lies beyond its end, or if consecutive exons overlap
   */
  private Gene.Transcript makeTranscriptFromRefFlatLine(
      final Gene gene, final TabbedTextFileWithHeaderParser.Row row) {
    final String owningGeneName = row.getField(RefFlatColumns.GENE_NAME.name());
    final String txName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
    // Label used in every error message raised for this row.
    final String label = owningGeneName + ":" + txName;

    final int numExons = Integer.parseInt(row.getField(RefFlatColumns.EXON_COUNT.name()));
    final String[] startTokens = row.getField(RefFlatColumns.EXON_STARTS.name()).split(",");
    final String[] endTokens = row.getField(RefFlatColumns.EXON_ENDS.name()).split(",");

    if (startTokens.length != numExons) {
      throw new AnnotationException(
          "Number of exon starts does not agree with number of exons for " + label);
    }
    if (endTokens.length != numExons) {
      throw new AnnotationException(
          "Number of exon ends does not agree with number of exons for " + label);
    }

    // Starts are shifted by one; ends are taken unchanged.
    final int txStart = row.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
    final int txEnd = row.getIntegerField(RefFlatColumns.TX_END.name());
    final int cdsStart = row.getIntegerField(RefFlatColumns.CDS_START.name()) + 1;
    final int cdsEnd = row.getIntegerField(RefFlatColumns.CDS_END.name());

    final Transcript tx = gene.addTranscript(txName, txStart, txEnd, cdsStart, cdsEnd, numExons);

    int index = 0;
    while (index < numExons) {
      final int exonStart = Integer.parseInt(startTokens[index]) + 1;
      final int exonEnd = Integer.parseInt(endTokens[index]);
      final Exon exon = tx.addExon(exonStart, exonEnd);

      if (exon.start > exon.end) {
        throw new AnnotationException("Exon has 0 or negative extent for " + label);
      }
      // Each exon must begin strictly after the previous one ends.
      if (index > 0) {
        if (tx.exons[index - 1].end >= tx.exons[index].start) {
          throw new AnnotationException("Exons overlap for " + label);
        }
      }
      ++index;
    }

    return tx;
  }
Exemplo n.º 4
0
  /**
   * Builds a gene from all refFlat rows that share one gene name.
   *
   * <p>Gene name, strand and chromosome are taken from the first row. The gene spans from the
   * smallest transcription start (plus one) to the largest transcription end found across the
   * rows. Each row is then checked against the first row's strand and chromosome and added to the
   * gene as a transcript.
   *
   * @param transcriptLines the rows for one gene; the first element is read unconditionally
   * @return the gene with one transcript added per row
   * @throws AnnotationException if any row disagrees with the first row on strand or chromosome,
   *     or if building a transcript from a row fails
   */
  private Gene makeGeneFromRefFlatLines(
      final List<TabbedTextFileWithHeaderParser.Row> transcriptLines) {
    final TabbedTextFileWithHeaderParser.Row firstRow = transcriptLines.get(0);
    final String geneName = firstRow.getField(RefFlatColumns.GENE_NAME.name());
    final String strandStr = firstRow.getField(RefFlatColumns.STRAND.name());
    final boolean negative = strandStr.equals("-");
    final String chromosome = firstRow.getField(RefFlatColumns.CHROMOSOME.name());

    // Figure out the extent of the gene
    int geneStart = Integer.MAX_VALUE;
    int geneEnd = Integer.MIN_VALUE;
    for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) {
      final int txStart = row.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
      if (txStart < geneStart) {
        geneStart = txStart;
      }
      final int txEnd = row.getIntegerField(RefFlatColumns.TX_END.name());
      if (txEnd > geneEnd) {
        geneEnd = txEnd;
      }
    }

    final Gene gene = new Gene(chromosome, geneStart, geneEnd, negative, geneName);

    for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) {
      final String rowStrand = row.getField(RefFlatColumns.STRAND.name());
      if (!strandStr.equals(rowStrand)) {
        throw new AnnotationException("Strand disagreement in refFlat file for gene " + geneName);
      }
      final String rowChromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
      if (!chromosome.equals(rowChromosome)) {
        throw new AnnotationException(
            "Chromosome disagreement("
                + chromosome
                + " != "
                + rowChromosome
                + ") in refFlat file for gene "
                + geneName);
      }

      // The call registers the transcript on the gene; the returned value is not needed here.
      makeTranscriptFromRefFlatLine(gene, row);
    }

    return gene;
  }