Ejemplo n.º 1
0
  /**
   * Parses the refFlat file, buckets its rows by gene name, and registers one {@code Gene} per
   * bucket in an {@code OverlapDetector}.
   *
   * <p>Rows whose chromosome is rejected by {@code isSequenceRecognized} are logged at debug level
   * and dropped. Genes for which {@code makeGeneFromRefFlatLines} raises an
   * {@code AnnotationException} are likewise logged and skipped.
   *
   * @return the detector holding every gene that was built successfully
   * @throws AnnotationException if a row does not have exactly one field per
   *     {@code RefFlatColumns} value
   */
  OverlapDetector<Gene> load() {
    final OverlapDetector<Gene> detector = new OverlapDetector<Gene>(0, 0);

    final int requiredFieldCount = RefFlatColumns.values().length;
    final TabbedTextFileWithHeaderParser refFlatParser =
        new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
    final Map<String, List<TabbedTextFileWithHeaderParser.Row>> rowsByGeneName =
        new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>();

    // Pass 1: validate each row and group the usable ones under their gene name.
    for (final TabbedTextFileWithHeaderParser.Row record : refFlatParser) {
      // Per the original note, getCurrentLineNumber returns the number of the next line.
      final int reportedLine = refFlatParser.getCurrentLineNumber();
      if (row​FieldCount(record) != requiredFieldCount) {
        throw new AnnotationException(
            "Wrong number of fields in refFlat file " + refFlatFile + " at line " + reportedLine);
      }

      final String geneName = record.getField(RefFlatColumns.GENE_NAME.name());
      final String transcriptName = record.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
      final String chromosome = record.getField(RefFlatColumns.CHROMOSOME.name());

      if (!isSequenceRecognized(chromosome)) {
        LOG.debug(
            "Skipping "
                + geneName
                + ":"
                + transcriptName
                + " due to unrecognized sequence "
                + chromosome);
        continue;
      }

      List<TabbedTextFileWithHeaderParser.Row> rowsForGene = rowsByGeneName.get(geneName);
      if (rowsForGene == null) {
        rowsForGene = new ArrayList<TabbedTextFileWithHeaderParser.Row>();
        rowsByGeneName.put(geneName, rowsForGene);
      }
      rowsForGene.add(record);
    }

    // Pass 2: build a Gene from each group and collect simple size statistics for the log.
    final int oneMegabase = 1000000;
    int maxGeneLength = 0;
    int genesOverOneMegabase = 0;

    for (final List<TabbedTextFileWithHeaderParser.Row> rowsForGene : rowsByGeneName.values()) {
      try {
        final Gene built = makeGeneFromRefFlatLines(rowsForGene);
        detector.addLhs(built, built);
        final int geneLength = built.length();
        maxGeneLength = Math.max(maxGeneLength, geneLength);
        if (geneLength > oneMegabase) {
          genesOverOneMegabase++;
        }
      } catch (AnnotationException e) {
        // A gene that cannot be built is reported and left out; loading continues.
        LOG.debug(e.getMessage() + " -- skipping");
      }
    }

    LOG.debug(
        "Longest gene: " + maxGeneLength + "; number of genes > 1MB: " + genesOverOneMegabase);
    return detector;
  }

  /** Returns the number of fields present in the given row. */
  private static int row​FieldCount(final TabbedTextFileWithHeaderParser.Row record) {
    return record.getFields().length;
  }
Ejemplo n.º 2
0
  /**
   * Builds a single {@code Gene} from all refFlat rows that share a gene name.
   *
   * <p>The gene's name, strand and chromosome are taken from the first row. Its start is the
   * smallest {@code TX_START + 1} and its end the largest {@code TX_END} across all rows (the
   * {@code + 1} presumably converts a 0-based refFlat start to a 1-based coordinate; confirm
   * against the refFlat specification). Every row is then turned into a transcript via
   * {@code makeTranscriptFromRefFlatLine}.
   *
   * @param transcriptLines the rows for one gene; the first element is read unconditionally
   * @return the gene spanning all supplied rows
   * @throws AnnotationException if any row's strand or chromosome differs from the first row's
   */
  private Gene makeGeneFromRefFlatLines(
      final List<TabbedTextFileWithHeaderParser.Row> transcriptLines) {
    final TabbedTextFileWithHeaderParser.Row firstRow = transcriptLines.get(0);
    final String geneName = firstRow.getField(RefFlatColumns.GENE_NAME.name());
    final String geneStrand = firstRow.getField(RefFlatColumns.STRAND.name());
    final boolean onNegativeStrand = geneStrand.equals("-");
    final String geneChromosome = firstRow.getField(RefFlatColumns.CHROMOSOME.name());

    // Determine the extent of the gene: the union of all transcript spans.
    int geneStart = Integer.MAX_VALUE;
    int geneEnd = Integer.MIN_VALUE;
    for (final TabbedTextFileWithHeaderParser.Row line : transcriptLines) {
      final int txStart = line.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
      if (txStart < geneStart) {
        geneStart = txStart;
      }
      final int txEnd = line.getIntegerField(RefFlatColumns.TX_END.name());
      if (txEnd > geneEnd) {
        geneEnd = txEnd;
      }
    }

    final Gene result =
        new Gene(geneChromosome, geneStart, geneEnd, onNegativeStrand, geneName);

    for (final TabbedTextFileWithHeaderParser.Row line : transcriptLines) {
      // All rows of one gene must agree with the first row on strand and chromosome.
      final String lineStrand = line.getField(RefFlatColumns.STRAND.name());
      if (!geneStrand.equals(lineStrand)) {
        throw new AnnotationException("Strand disagreement in refFlat file for gene " + geneName);
      }
      final String lineChromosome = line.getField(RefFlatColumns.CHROMOSOME.name());
      if (!geneChromosome.equals(lineChromosome)) {
        throw new AnnotationException(
            "Chromosome disagreement("
                + geneChromosome
                + " != "
                + lineChromosome
                + ") in refFlat file for gene "
                + geneName);
      }

      // Per the original note, this call also adds the transcript to the gene, so the returned
      // Transcript is not needed here.
      makeTranscriptFromRefFlatLine(result, line);
    }

    return result;
  }