OverlapDetector<Gene> load() { final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0); final int expectedColumns = RefFlatColumns.values().length; final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels); final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene = new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>(); for (final TabbedTextFileWithHeaderParser.Row row : parser) { final int lineNumber = parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line if (row.getFields().length != expectedColumns) { throw new AnnotationException( "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber); } final String geneName = row.getField(RefFlatColumns.GENE_NAME.name()); final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name()); final String transcriptDescription = geneName + ":" + transcriptName; final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name()); if (!isSequenceRecognized(chromosome)) { LOG.debug( "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome); } else { List<TabbedTextFileWithHeaderParser.Row> transcriptLines = refFlatLinesByGene.get(geneName); if (transcriptLines == null) { transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>(); refFlatLinesByGene.put(geneName, transcriptLines); } transcriptLines.add(row); } } int longestInterval = 0; int numIntervalsOver1MB = 0; for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines : refFlatLinesByGene.values()) { try { final Gene gene = makeGeneFromRefFlatLines(transcriptLines); overlapDetector.addLhs(gene, gene); if (gene.length() > longestInterval) longestInterval = gene.length(); if (gene.length() > 1000000) ++numIntervalsOver1MB; } catch (AnnotationException e) { LOG.debug(e.getMessage() + " -- skipping"); } } LOG.debug( "Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB); return overlapDetector; }
/** * For each line in the MULTIPLEX_PARAMS file create a FastqRecordsWriter and put it in the * sampleBarcodeFastqWriterMap map, where the key to the map is the concatenation of all * sampleBarcodes in order for the given line. */ private void populateWritersFromMultiplexParams() { final TabbedTextFileWithHeaderParser libraryParamsParser = new TabbedTextFileWithHeaderParser(MULTIPLEX_PARAMS); final Set<String> expectedColumnLabels = CollectionUtil.makeSet("OUTPUT_PREFIX"); final List<String> sampleBarcodeColumnLabels = new ArrayList<String>(); for (int i = 1; i <= readStructure.sampleBarcodes.length(); i++) { sampleBarcodeColumnLabels.add("BARCODE_" + i); } expectedColumnLabels.addAll(sampleBarcodeColumnLabels); assertExpectedColumns(libraryParamsParser.columnLabels(), expectedColumnLabels); for (final TabbedTextFileWithHeaderParser.Row row : libraryParamsParser) { List<String> sampleBarcodeValues = null; if (sampleBarcodeColumnLabels.size() > 0) { sampleBarcodeValues = new ArrayList<String>(); for (final String sampleBarcodeLabel : sampleBarcodeColumnLabels) { sampleBarcodeValues.add(row.getField(sampleBarcodeLabel)); } } final String key = (sampleBarcodeValues == null || sampleBarcodeValues.contains("N")) ? null : StringUtil.join("", sampleBarcodeValues); if (sampleBarcodeFastqWriterMap.containsKey( key)) { // This will catch the case of having more than 1 line in a non-barcoded // MULTIPLEX_PARAMS file throw new PicardException( "Row for barcode " + key + " appears more than once in MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS); } final FastqRecordsWriter writer = buildWriter(new File(row.getField("OUTPUT_PREFIX"))); sampleBarcodeFastqWriterMap.put(key, writer); } if (sampleBarcodeFastqWriterMap.isEmpty()) { throw new PicardException( "MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS + " does have any data rows."); } libraryParamsParser.close(); }