Ejemplo n.º 1
0
  /**
   * Reads the refFlat file and builds an {@code OverlapDetector} holding the genes described by it.
   *
   * <p>Rows are first grouped by gene name; rows whose chromosome is rejected by {@code
   * isSequenceRecognized} are skipped with a debug message. Each group is then turned into a
   * {@code Gene} by {@code makeGeneFromRefFlatLines}; groups for which that call throws an {@code
   * AnnotationException} are logged at debug level and skipped.
   *
   * @return the overlap detector populated with every gene that could be built
   * @throws AnnotationException if a row's field count differs from the number of {@code
   *     RefFlatColumns} values
   */
  OverlapDetector<Gene> load() {
    final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0);

    final int expectedColumns = RefFlatColumns.values().length;
    final TabbedTextFileWithHeaderParser parser =
        new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
    final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene =
        new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>();

    try {
      for (final TabbedTextFileWithHeaderParser.Row row : parser) {
        // getCurrentLineNumber returns the number of the next line.
        // NOTE(review): the value is reported unadjusted in the error below; confirm whether it
        // should be decremented to point at the offending row.
        final int lineNumber = parser.getCurrentLineNumber();
        if (row.getFields().length != expectedColumns) {
          throw new AnnotationException(
              "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber);
        }
        final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
        final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
        final String transcriptDescription = geneName + ":" + transcriptName;
        final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
        if (!isSequenceRecognized(chromosome)) {
          LOG.debug(
              "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
        } else {
          // Collect every transcript row of a gene so the gene can be built from all of them.
          List<TabbedTextFileWithHeaderParser.Row> transcriptLines =
              refFlatLinesByGene.get(geneName);
          if (transcriptLines == null) {
            transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>();
            refFlatLinesByGene.put(geneName, transcriptLines);
          }
          transcriptLines.add(row);
        }
      }
    } finally {
      // Release the parser's resources even when a malformed row aborts the load.
      parser.close();
    }

    // Statistics reported in the debug message at the end of the method.
    int longestInterval = 0;
    int numIntervalsOver1MB = 0;

    for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines :
        refFlatLinesByGene.values()) {
      try {
        final Gene gene = makeGeneFromRefFlatLines(transcriptLines);
        overlapDetector.addLhs(gene, gene);
        final int geneLength = gene.length();
        if (geneLength > longestInterval) {
          longestInterval = geneLength;
        }
        if (geneLength > 1000000) {
          ++numIntervalsOver1MB;
        }
      } catch (AnnotationException e) {
        // Deliberate best-effort: a gene that cannot be built is skipped, not fatal.
        LOG.debug(e.getMessage() + " -- skipping");
      }
    }
    LOG.debug(
        "Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
    return overlapDetector;
  }
Ejemplo n.º 2
0
  /**
   * For each line in the MULTIPLEX_PARAMS file create a FastqRecordsWriter and put it in the
   * sampleBarcodeFastqWriterMap map, where the key to the map is the concatenation of all
   * sampleBarcodes in order for the given line.
   *
   * <p>A row is stored under the {@code null} key when the read structure has no sample barcodes,
   * or when any of the row's barcode values is exactly {@code "N"}.
   *
   * @throws PicardException if two rows map to the same key, or if the file has no data rows
   */
  private void populateWritersFromMultiplexParams() {
    final TabbedTextFileWithHeaderParser libraryParamsParser =
        new TabbedTextFileWithHeaderParser(MULTIPLEX_PARAMS);

    try {
      final Set<String> expectedColumnLabels = CollectionUtil.makeSet("OUTPUT_PREFIX");
      final List<String> sampleBarcodeColumnLabels = new ArrayList<String>();
      for (int i = 1; i <= readStructure.sampleBarcodes.length(); i++) {
        sampleBarcodeColumnLabels.add("BARCODE_" + i);
      }

      expectedColumnLabels.addAll(sampleBarcodeColumnLabels);
      assertExpectedColumns(libraryParamsParser.columnLabels(), expectedColumnLabels);

      for (final TabbedTextFileWithHeaderParser.Row row : libraryParamsParser) {
        final List<String> sampleBarcodeValues = new ArrayList<String>();
        for (final String sampleBarcodeLabel : sampleBarcodeColumnLabels) {
          sampleBarcodeValues.add(row.getField(sampleBarcodeLabel));
        }

        // No barcode columns, or an "N" in any barcode column, maps the row to the null key.
        final String key =
            (sampleBarcodeColumnLabels.isEmpty() || sampleBarcodeValues.contains("N"))
                ? null
                : StringUtil.join("", sampleBarcodeValues);

        // This will catch the case of having more than 1 line in a non-barcoded
        // MULTIPLEX_PARAMS file
        if (sampleBarcodeFastqWriterMap.containsKey(key)) {
          throw new PicardException(
              "Row for barcode "
                  + key
                  + " appears more than once in MULTIPLEX_PARAMS file "
                  + MULTIPLEX_PARAMS);
        }

        final FastqRecordsWriter writer = buildWriter(new File(row.getField("OUTPUT_PREFIX")));
        sampleBarcodeFastqWriterMap.put(key, writer);
      }
      if (sampleBarcodeFastqWriterMap.isEmpty()) {
        throw new PicardException(
            "MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS + " does not have any data rows.");
      }
    } finally {
      // Close the parser on every path, including the validation failures thrown above.
      libraryParamsParser.close();
    }
  }