OverlapDetector<Gene> load() { final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0); final int expectedColumns = RefFlatColumns.values().length; final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels); final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene = new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>(); for (final TabbedTextFileWithHeaderParser.Row row : parser) { final int lineNumber = parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line if (row.getFields().length != expectedColumns) { throw new AnnotationException( "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber); } final String geneName = row.getField(RefFlatColumns.GENE_NAME.name()); final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name()); final String transcriptDescription = geneName + ":" + transcriptName; final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name()); if (!isSequenceRecognized(chromosome)) { LOG.debug( "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome); } else { List<TabbedTextFileWithHeaderParser.Row> transcriptLines = refFlatLinesByGene.get(geneName); if (transcriptLines == null) { transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>(); refFlatLinesByGene.put(geneName, transcriptLines); } transcriptLines.add(row); } } int longestInterval = 0; int numIntervalsOver1MB = 0; for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines : refFlatLinesByGene.values()) { try { final Gene gene = makeGeneFromRefFlatLines(transcriptLines); overlapDetector.addLhs(gene, gene); if (gene.length() > longestInterval) longestInterval = gene.length(); if (gene.length() > 1000000) ++numIntervalsOver1MB; } catch (AnnotationException e) { LOG.debug(e.getMessage() + " -- skipping"); } } LOG.debug( "Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB); return overlapDetector; }
private Gene makeGeneFromRefFlatLines( final List<TabbedTextFileWithHeaderParser.Row> transcriptLines) { final String geneName = transcriptLines.get(0).getField(RefFlatColumns.GENE_NAME.name()); final String strandStr = transcriptLines.get(0).getField(RefFlatColumns.STRAND.name()); final boolean negative = strandStr.equals("-"); final String chromosome = transcriptLines.get(0).getField(RefFlatColumns.CHROMOSOME.name()); // Figure out the extend of the gene int start = Integer.MAX_VALUE; int end = Integer.MIN_VALUE; for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) { start = Math.min(start, row.getIntegerField(RefFlatColumns.TX_START.name()) + 1); end = Math.max(end, row.getIntegerField(RefFlatColumns.TX_END.name())); } final Gene gene = new Gene(chromosome, start, end, negative, geneName); for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) { if (!strandStr.equals(row.getField(RefFlatColumns.STRAND.name()))) { throw new AnnotationException("Strand disagreement in refFlat file for gene " + geneName); } if (!chromosome.equals(row.getField(RefFlatColumns.CHROMOSOME.name()))) { throw new AnnotationException( "Chromosome disagreement(" + chromosome + " != " + row.getField(RefFlatColumns.CHROMOSOME.name()) + ") in refFlat file for gene " + geneName); } // This adds it to the Gene also final Transcript tx = makeTranscriptFromRefFlatLine(gene, row); } return gene; }