OverlapDetector<Gene> load() { final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0); final int expectedColumns = RefFlatColumns.values().length; final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels); final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene = new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>(); for (final TabbedTextFileWithHeaderParser.Row row : parser) { final int lineNumber = parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line if (row.getFields().length != expectedColumns) { throw new AnnotationException( "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber); } final String geneName = row.getField(RefFlatColumns.GENE_NAME.name()); final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name()); final String transcriptDescription = geneName + ":" + transcriptName; final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name()); if (!isSequenceRecognized(chromosome)) { LOG.debug( "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome); } else { List<TabbedTextFileWithHeaderParser.Row> transcriptLines = refFlatLinesByGene.get(geneName); if (transcriptLines == null) { transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>(); refFlatLinesByGene.put(geneName, transcriptLines); } transcriptLines.add(row); } } int longestInterval = 0; int numIntervalsOver1MB = 0; for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines : refFlatLinesByGene.values()) { try { final Gene gene = makeGeneFromRefFlatLines(transcriptLines); overlapDetector.addLhs(gene, gene); if (gene.length() > longestInterval) longestInterval = gene.length(); if (gene.length() > 1000000) ++numIntervalsOver1MB; } catch (AnnotationException e) { LOG.debug(e.getMessage() + " -- skipping"); } } LOG.debug( "Longest gene: " + 
longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB); return overlapDetector; }
/** * For each line in the MULTIPLEX_PARAMS file create a FastqRecordsWriter and put it in the * sampleBarcodeFastqWriterMap map, where the key to the map is the concatenation of all * sampleBarcodes in order for the given line. */ private void populateWritersFromMultiplexParams() { final TabbedTextFileWithHeaderParser libraryParamsParser = new TabbedTextFileWithHeaderParser(MULTIPLEX_PARAMS); final Set<String> expectedColumnLabels = CollectionUtil.makeSet("OUTPUT_PREFIX"); final List<String> sampleBarcodeColumnLabels = new ArrayList<String>(); for (int i = 1; i <= readStructure.sampleBarcodes.length(); i++) { sampleBarcodeColumnLabels.add("BARCODE_" + i); } expectedColumnLabels.addAll(sampleBarcodeColumnLabels); assertExpectedColumns(libraryParamsParser.columnLabels(), expectedColumnLabels); for (final TabbedTextFileWithHeaderParser.Row row : libraryParamsParser) { List<String> sampleBarcodeValues = null; if (sampleBarcodeColumnLabels.size() > 0) { sampleBarcodeValues = new ArrayList<String>(); for (final String sampleBarcodeLabel : sampleBarcodeColumnLabels) { sampleBarcodeValues.add(row.getField(sampleBarcodeLabel)); } } final String key = (sampleBarcodeValues == null || sampleBarcodeValues.contains("N")) ? null : StringUtil.join("", sampleBarcodeValues); if (sampleBarcodeFastqWriterMap.containsKey( key)) { // This will catch the case of having more than 1 line in a non-barcoded // MULTIPLEX_PARAMS file throw new PicardException( "Row for barcode " + key + " appears more than once in MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS); } final FastqRecordsWriter writer = buildWriter(new File(row.getField("OUTPUT_PREFIX"))); sampleBarcodeFastqWriterMap.put(key, writer); } if (sampleBarcodeFastqWriterMap.isEmpty()) { throw new PicardException( "MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS + " does have any data rows."); } libraryParamsParser.close(); }
/**
 * Creates a transcript on {@code gene} from a single refFlat row and attaches its exons.
 *
 * <p>Conversion from 0-based half-open to 1-based inclusive intervals is done here: every start
 * coordinate read from the row is incremented by one, every end coordinate is used as-is.
 *
 * @param gene the gene that the new transcript is added to
 * @param row the refFlat row describing the transcript
 * @return the transcript that was added to {@code gene}
 * @throws AnnotationException if the exon start or end lists disagree with the exon count, if an
 *     exon starts after it ends, or if an exon does not start after the previous exon's end
 */
private Gene.Transcript makeTranscriptFromRefFlatLine(
    final Gene gene, final TabbedTextFileWithHeaderParser.Row row) {
  final String owningGeneName = row.getField(RefFlatColumns.GENE_NAME.name());
  final String txName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
  // Label used in every error message raised below.
  final String label = owningGeneName + ":" + txName;

  final int numExons = Integer.parseInt(row.getField(RefFlatColumns.EXON_COUNT.name()));
  final String[] rawStarts = row.getField(RefFlatColumns.EXON_STARTS.name()).split(",");
  final String[] rawEnds = row.getField(RefFlatColumns.EXON_ENDS.name()).split(",");

  if (rawStarts.length != numExons) {
    throw new AnnotationException(
        "Number of exon starts does not agree with number of exons for " + label);
  }
  if (rawEnds.length != numExons) {
    throw new AnnotationException(
        "Number of exon ends does not agree with number of exons for " + label);
  }

  final int txStart = row.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
  final int txEnd = row.getIntegerField(RefFlatColumns.TX_END.name());
  final int cdsStart = row.getIntegerField(RefFlatColumns.CDS_START.name()) + 1;
  final int cdsEnd = row.getIntegerField(RefFlatColumns.CDS_END.name());

  final Transcript transcript =
      gene.addTranscript(txName, txStart, txEnd, cdsStart, cdsEnd, numExons);

  int idx = 0;
  while (idx < numExons) {
    final int exonStart = Integer.parseInt(rawStarts[idx]) + 1;
    final int exonEnd = Integer.parseInt(rawEnds[idx]);
    final Exon added = transcript.addExon(exonStart, exonEnd);

    if (added.end < added.start) {
      throw new AnnotationException("Exon has 0 or negative extent for " + label);
    }

    // Each exon after the first must begin strictly after its predecessor ends.
    final boolean hasPredecessor = idx > 0;
    if (hasPredecessor && transcript.exons[idx].start <= transcript.exons[idx - 1].end) {
      throw new AnnotationException("Exons overlap for " + label);
    }

    idx++;
  }

  return transcript;
}
private Gene makeGeneFromRefFlatLines( final List<TabbedTextFileWithHeaderParser.Row> transcriptLines) { final String geneName = transcriptLines.get(0).getField(RefFlatColumns.GENE_NAME.name()); final String strandStr = transcriptLines.get(0).getField(RefFlatColumns.STRAND.name()); final boolean negative = strandStr.equals("-"); final String chromosome = transcriptLines.get(0).getField(RefFlatColumns.CHROMOSOME.name()); // Figure out the extend of the gene int start = Integer.MAX_VALUE; int end = Integer.MIN_VALUE; for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) { start = Math.min(start, row.getIntegerField(RefFlatColumns.TX_START.name()) + 1); end = Math.max(end, row.getIntegerField(RefFlatColumns.TX_END.name())); } final Gene gene = new Gene(chromosome, start, end, negative, geneName); for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) { if (!strandStr.equals(row.getField(RefFlatColumns.STRAND.name()))) { throw new AnnotationException("Strand disagreement in refFlat file for gene " + geneName); } if (!chromosome.equals(row.getField(RefFlatColumns.CHROMOSOME.name()))) { throw new AnnotationException( "Chromosome disagreement(" + chromosome + " != " + row.getField(RefFlatColumns.CHROMOSOME.name()) + ") in refFlat file for gene " + geneName); } // This adds it to the Gene also final Transcript tx = makeTranscriptFromRefFlatLine(gene, row); } return gene; }