Exemplo n.º 1
0
  OverlapDetector<Gene> load() {
    final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0);

    final int expectedColumns = RefFlatColumns.values().length;
    final TabbedTextFileWithHeaderParser parser =
        new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
    final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene =
        new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>();

    for (final TabbedTextFileWithHeaderParser.Row row : parser) {
      final int lineNumber =
          parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line
      if (row.getFields().length != expectedColumns) {
        throw new AnnotationException(
            "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber);
      }
      final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
      final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
      final String transcriptDescription = geneName + ":" + transcriptName;
      final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
      if (!isSequenceRecognized(chromosome)) {
        LOG.debug(
            "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
      } else {
        List<TabbedTextFileWithHeaderParser.Row> transcriptLines = refFlatLinesByGene.get(geneName);
        if (transcriptLines == null) {
          transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>();
          refFlatLinesByGene.put(geneName, transcriptLines);
        }
        transcriptLines.add(row);
      }
    }

    int longestInterval = 0;
    int numIntervalsOver1MB = 0;

    for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines :
        refFlatLinesByGene.values()) {
      try {
        final Gene gene = makeGeneFromRefFlatLines(transcriptLines);
        overlapDetector.addLhs(gene, gene);
        if (gene.length() > longestInterval) longestInterval = gene.length();
        if (gene.length() > 1000000) ++numIntervalsOver1MB;
      } catch (AnnotationException e) {
        LOG.debug(e.getMessage() + " -- skipping");
      }
    }
    LOG.debug(
        "Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
    return overlapDetector;
  }