示例#1
0
  /**
   * Estimates the number of lines in the given file, or the combined number of lines of the files
   * resolved from a directory or from a ".list" file.
   *
   * <p>A directory, or a file whose name ends with ".list", is expanded through
   * {@code getFilesFromDirOrList}; entries of the expanded list that are themselves directories
   * are skipped. Any other file is estimated directly.
   *
   * @param file a data file, a directory, or a ".list" file.
   * @return the sum of the estimates returned by {@code ParsingUtils.estimateLineCount} for each
   *     file considered.
   */
  private int estimateLineCount(File file) {
    boolean isCollection = file.isDirectory() || file.getName().endsWith(".list");
    if (!isCollection) {
      // Plain file: a single estimate is all that is needed.
      return ParsingUtils.estimateLineCount(file.getAbsolutePath());
    }

    int total = 0;
    for (File member : getFilesFromDirOrList(file)) {
      if (member.isDirectory()) {
        continue;
      }
      total += ParsingUtils.estimateLineCount(member.getAbsolutePath());
    }
    return total;
  }
示例#2
0
  /**
   * Scans the data file once, recording a summary for each run of rows that share a chromosome
   * and populating the dataset with what is learned along the way.
   *
   * <p>The file is read as: any number of leading lines that start with "#" (passed to
   * {@code parseDirective}) or are blank, then one tab-delimited line of column headings, then
   * data rows. Within a run of rows for one chromosome the start positions must not decrease.
   *
   * <p>Side effects on {@code dataset}: the track type (for copy-number and LOH file extensions),
   * the windowing function, the data headings, the longest-feature map, the log-normalized flag
   * (set when any negative value is seen) and the data minimum and maximum. Whole-genome data is
   * pushed through {@code updateWholeGenome} each time a chromosome run ends. The instance field
   * {@code lastDataColumn} is initialised from the heading line if it was negative.
   *
   * @param dataset the dataset to populate.
   * @return one summary per run of consecutive rows with the same chromosome, in file order. Each
   *     summary is created with the reader position recorded after the previous data row (0 for
   *     the first run) and is given the number of rows in its run.
   * @throws ParserException if the file ends before a heading line is found, a start or end
   *     column is not an integer, the rows are not sorted, or any other exception occurs once at
   *     least one line has been read.
   * @throws RuntimeException if the file cannot be found, or another exception occurs before any
   *     line has been read.
   */
  public List<ChromosomeSummary> scan(IGVDataset dataset) {

    int estLineCount = ParsingUtils.estimateLineCount(dataResourceLocator.getPath());
    Map<String, Integer> longestFeatureMap = new HashMap<String, Integer>();

    // Both extremes start at 0, so the reported range always includes 0.
    float dataMin = 0;
    float dataMax = 0;

    InputStream is = null;
    AsciiLineReader reader = null;
    String nextLine = null;
    ChromosomeSummary chrSummary = null;
    List<ChromosomeSummary> chrSummaries = new ArrayList<ChromosomeSummary>();
    String[] headings = null;
    WholeGenomeData wgData = null;
    int nRows = 0;

    // headerRows + count is the line number reported in parser errors.
    int headerRows = 0;
    int count = 0;

    // Infer if the data is logNormalized by looking for negative data values.
    // Assume it is not until proven otherwise
    boolean logNormalized = false;
    try {

      int skipColumns = hasCalls ? 2 : 1;

      is = ParsingUtils.openInputStreamGZ(dataResourceLocator);
      reader = new AsciiLineReader(is);

      // Infer datatype from extension.  This can be overridden in the
      // comment section
      if (isCopyNumberFileExt(dataResourceLocator.getPath())) {
        dataset.setTrackType(TrackType.COPY_NUMBER);
      } else if (isLOHFileExt(dataResourceLocator.getPath())) {
        dataset.setTrackType(TrackType.LOH);
      }
      dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean);

      // Parse comments and directives, if any. Stop at end of file so that an empty or
      // comment-only file is reported clearly instead of failing with a NullPointerException.
      nextLine = reader.readLine();
      while (nextLine != null && (nextLine.startsWith("#") || (nextLine.trim().length() == 0))) {
        headerRows++;

        if (nextLine.length() > 0) {
          parseDirective(nextLine, dataset);
        }
        nextLine = reader.readLine();
      }

      if (nextLine == null) {
        throw new ParserException(
            "Unexpected end of file, no column headings found: " + dataResourceLocator.getPath(),
            headerRows);
      }

      if (chrColumn < 0) {
        setColumnDefaults();
      }

      // Parse column headings
      String[] data = nextLine.trim().split("\t");

      // Set last data column
      if (lastDataColumn < 0) {
        lastDataColumn = data.length - 1;
      }

      headings = getHeadings(data, skipColumns);

      dataset.setDataHeadings(headings);

      wgData = new WholeGenomeData(headings);

      // Number of rows between status bar updates
      int updateCount = 5000;
      long lastPosition = 0;
      while ((nextLine = reader.readLine()) != null) {

        // Count every row, whether or not there is a UI to report progress to; the count is
        // also used for the line numbers in error messages.
        count++;
        if (igv != null && count % updateCount == 0) {
          igv.setStatusBarMessage("Loaded: " + count + " / " + estLineCount + " (est)");
        }

        String[] tokens = Globals.tabPattern.split(nextLine, -1);
        int nTokens = tokens.length;
        if (nTokens > 0) {
          String thisChr = genome.getChromosomeAlias(tokens[chrColumn]);
          if (chrSummary == null || !thisChr.equals(chrSummary.getName())) {
            // Update whole genome and previous chromosome summary, unless this is
            // the first chromosome
            if (chrSummary != null) {
              updateWholeGenome(chrSummary.getName(), dataset, headings, wgData);
              chrSummary.setNDataPoints(nRows);
            }

            // Start the next chromosome
            chrSummary = new ChromosomeSummary(thisChr, lastPosition);
            chrSummaries.add(chrSummary);
            nRows = 0;
            wgData = new WholeGenomeData(headings);
          }
          lastPosition = reader.getPosition();

          int location = -1;
          try {
            location = ParsingUtils.parseInt(tokens[startColumn]) - startBase;

          } catch (NumberFormatException numberFormatException) {
            log.error("Column " + tokens[startColumn] + " is not a number");
            throw new ParserException(
                "Column "
                    + (startColumn + 1)
                    + " must contain an integer value."
                    + " Found: "
                    + tokens[startColumn],
                count + headerRows,
                nextLine);
          }

          int length = 1;
          if (hasEndLocations) {
            try {
              length = ParsingUtils.parseInt(tokens[endColumn].trim()) - location + 1;

            } catch (NumberFormatException numberFormatException) {
              log.error("Column " + tokens[endColumn] + " is not a number");
              throw new ParserException(
                  "Column "
                      + (endColumn + 1)
                      + " must contain an integer value."
                      + " Found: "
                      + tokens[endColumn],
                  count + headerRows,
                  nextLine);
            }
          }

          updateLongestFeature(longestFeatureMap, thisChr, length);

          // Locations are collected per chromosome run, so this only compares against the
          // previous row of the same run.
          if (wgData.locations.size() > 0
              && wgData.locations.get(wgData.locations.size() - 1) > location) {
            throw new ParserException(
                "File is not sorted, .igv and .cn files must be sorted by start position."
                    + " Use igvtools (File > Run igvtools..) to sort the file.",
                count + headerRows);
          }

          wgData.locations.add(location);

          for (int idx = 0; idx < headings.length; idx++) {
            int i = firstDataColumn + idx * skipColumns;

            // Rows shorter than the heading line are padded with NaN
            float copyNo = i < tokens.length ? readFloat(tokens[i]) : Float.NaN;

            if (!Float.isNaN(copyNo)) {
              dataMin = Math.min(dataMin, copyNo);
              dataMax = Math.max(dataMax, copyNo);
            }
            if (copyNo < 0) {
              logNormalized = true;
            }
            String heading = headings[idx];
            wgData.data.get(heading).add(copyNo);
          }

          nRows++;
        }
      }

      dataset.setLongestFeatureMap(longestFeatureMap);

    } catch (ParserException pe) {
      throw pe;
    } catch (FileNotFoundException e) {
      log.error("File not found: " + dataResourceLocator, e);
      throw new RuntimeException(e);
    } catch (Exception e) {
      log.error("Exception when loading: " + dataResourceLocator.getPath(), e);
      // Attach line information only when at least one line has actually been read.
      if (nextLine != null && (count + headerRows != 0)) {
        throw new ParserException(e.getMessage(), e, count + headerRows, nextLine);
      } else {
        throw new RuntimeException(e);
      }
    } finally {
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          log.error("Error closing IGVDataset stream", e);
        }
      }
    }

    // Update last chromosome
    if (chrSummary != null) {
      updateWholeGenome(chrSummary.getName(), dataset, headings, wgData);
      chrSummary.setNDataPoints(nRows);
    }

    dataset.setLogNormalized(logNormalized);
    dataset.setDataMin(dataMin);
    dataset.setDataMax(dataMax);

    return chrSummaries;
  }