Beispiel #1
0
  /**
   * Tests whether a file looks like a MAGE-TAB data matrix that can be parsed here.
   *
   * <p>The first row is skipped. The second row must contain one of the MAGE-TAB reference column
   * identifiers; if it does, up to five following rows are inspected and the file is accepted as
   * soon as one of them (after trimming) starts with {@code SNP_A} or {@code CN_}.
   *
   * @param file locator of the file to inspect
   * @return {@code true} if the identifier row and a matching probe row were both found
   * @throws IOException if the file cannot be opened or read
   */
  public static boolean parsableMAGE_TAB(ResourceLocator file) throws IOException {
    final String[] mageTabIdentifiers = {
      "Reporter REF",
      "Composite Element REF",
      "Term Source REF",
      "CompositeElement REF",
      "TermSource REF",
      "Coordinates REF"
    };

    AsciiLineReader lineReader = null;
    try {
      lineReader = ParsingUtils.openAsciiReader(file);

      // The first row is never examined.
      lineReader.readLine();

      // The second row must carry at least one MAGE-TAB identifier.
      String identifierRow = lineReader.readLine();
      if (identifierRow == null) {
        return false;
      }
      boolean identifierFound = false;
      for (String identifier : mageTabIdentifiers) {
        if (identifierRow.contains(identifier)) {
          identifierFound = true;
          break;
        }
      }
      if (!identifierFound) {
        return false;
      }

      // Look at no more than five data rows for a probe name this class understands.
      int rowsExamined = 0;
      String row = lineReader.readLine();
      while (row != null && rowsExamined < 5) {
        String trimmedRow = row.trim();
        if (trimmedRow.startsWith("SNP_A") || trimmedRow.startsWith("CN_")) {
          return true;
        }
        rowsExamined++;
        row = lineReader.readLine();
      }
      return false;
    } finally {
      if (lineReader != null) {
        lineReader.close();
      }
    }
  }
Beispiel #2
0
  /**
   * Load data for a single chromosome.
   *
   * <p>Seeks to the start position recorded in {@code chrSummary} and reads rows until the end of
   * the file, a blank line, or the first row for a different chromosome that follows rows of the
   * requested one. Lines starting with {@code #} are ignored, and a line whose fields raise a
   * {@link NumberFormatException} is logged and skipped in its entirety.
   *
   * @param chrSummary summary supplying the chromosome name, the file offset to seek to and an
   *     estimate of the number of rows
   * @param dataHeaders headings of the data columns to load; one float array is stored per heading
   * @return the probes, start locations, end locations (only when the file has them) and
   *     per-heading data read for the chromosome
   * @throws RuntimeException if an I/O error occurs while reading the file
   */
  public ChromosomeData loadChromosomeData(ChromosomeSummary chrSummary, String[] dataHeaders) {

    SeekableStream is = null;
    try {
      int skipColumns = hasCalls ? 2 : 1;

      // Get an estimate of the number of snps (rows).  THIS IS ONLY AN ESTIMATE
      int nRowsEst = chrSummary.getNDataPts();

      is = IGVSeekableStreamFactory.getStreamFor(dataResourceLocator.getPath());
      is.seek(chrSummary.getStartPosition());
      AsciiLineReader reader = new AsciiLineReader(is);

      // Create containers to hold data
      IntArrayList startLocations = new IntArrayList(nRowsEst);
      IntArrayList endLocations = (hasEndLocations ? new IntArrayList(nRowsEst) : null);
      List<String> probes = new ArrayList<String>(nRowsEst);

      Map<String, FloatArrayList> dataMap = new HashMap<String, FloatArrayList>();
      for (String h : dataHeaders) {
        dataMap.put(h, new FloatArrayList(nRowsEst));
      }

      // Scratch buffer for one row's data values, reused for every row
      float[] rowValues = new float[dataHeaders.length];

      // Begin loop through rows
      String chromosome = chrSummary.getName();
      boolean chromosomeStarted = false;
      String nextLine = reader.readLine();

      while ((nextLine != null) && (nextLine.trim().length() > 0)) {

        if (!nextLine.startsWith("#")) {
          try {
            String[] tokens = Globals.tabPattern.split(nextLine, -1);
            String thisChromosome = genome.getChromosomeAlias(tokens[chrColumn].trim());
            if (thisChromosome.equals(chromosome)) {
              chromosomeStarted = true;

              // Parse every field of the row before appending anything, so that a row skipped
              // because of a NumberFormatException cannot leave the containers misaligned.

              // The probe.  A new string is created to prevent holding on to the entire row through
              // a substring reference
              String probe = new String(tokens[probeColumn]);

              int start = ParsingUtils.parseInt(tokens[startColumn].trim()) - startBase;
              int end = hasEndLocations ? ParsingUtils.parseInt(tokens[endColumn].trim()) : -1;

              for (int idx = 0; idx < dataHeaders.length; idx++) {
                int i = firstDataColumn + idx * skipColumns;
                // Columns missing from a short row are treated as NaN, as scan() does.
                rowValues[idx] =
                    (i <= lastDataColumn && i < tokens.length) ? readFloat(tokens[i]) : Float.NaN;
              }

              // The whole row parsed; commit it.
              probes.add(probe);
              startLocations.add(start);
              if (hasEndLocations) {
                endLocations.add(end);
              }
              for (int idx = 0; idx < dataHeaders.length; idx++) {
                dataMap.get(dataHeaders[idx]).add(rowValues[idx]);
              }

            } else if (chromosomeStarted) {
              // Rows for the requested chromosome are contiguous; a different name ends them.
              break;
            }

          } catch (NumberFormatException numberFormatException) {

            // Skip line
            log.info("Skipping line (NumberFormatException) " + nextLine);
          }
        }

        nextLine = reader.readLine();
      }

      // Loop complete
      ChromosomeData cd = new ChromosomeData(chrSummary.getName());
      cd.setProbes(probes.toArray(new String[0]));
      cd.setStartLocations(startLocations.toArray());
      if (hasEndLocations) {
        cd.setEndLocations(endLocations.toArray());
      }

      for (String h : dataHeaders) {
        cd.setData(h, dataMap.get(h).toArray());
      }

      return cd;

    } catch (IOException ex) {
      log.error("Error parsing cn file", ex);
      throw new RuntimeException("Error parsing cn file", ex);
    } finally {
      // Always release the underlying stream; a failure to close is logged, not propagated.
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          log.error("Error closing cn file stream", e);
        }
      }
    }
  }
Beispiel #3
0
  /**
   * Scan the datafile for chromosome breaks.
   *
   * <p>Reads leading comment/directive lines and the column-heading row, then walks every data
   * row. A new {@link ChromosomeSummary} is started each time the chromosome name changes, and
   * whole-genome data, the longest feature per chromosome and the overall data range are
   * accumulated along the way. The results are stored on {@code dataset}: track type and windowing
   * function, data headings, longest-feature map, log-normalized flag, and data min/max.
   *
   * @param dataset the dataset to populate with the values found during the scan
   * @return one summary per run of consecutive rows sharing a chromosome name, in file order
   * @throws ParserException if the file has no heading row, a start or end column is not an
   *     integer, the rows are not sorted by start position, or any other error occurs after at
   *     least one line has been read
   * @throws RuntimeException if the file is not found or an error occurs before any line is read
   */
  public List<ChromosomeSummary> scan(IGVDataset dataset) {

    int estLineCount = ParsingUtils.estimateLineCount(dataResourceLocator.getPath());
    Map<String, Integer> longestFeatureMap = new HashMap<String, Integer>();

    float dataMin = 0;
    float dataMax = 0;

    InputStream is = null;
    AsciiLineReader reader = null;
    String nextLine = null;
    ChromosomeSummary chrSummary = null;
    List<ChromosomeSummary> chrSummaries = new ArrayList<ChromosomeSummary>();
    String[] headings = null;
    WholeGenomeData wgData = null;
    int nRows = 0;

    int headerRows = 0;
    int count = 0;

    // Infer if the data is logNormalized by looking for negative data values.
    // Assume it is not until proven otherwise
    boolean logNormalized = false;
    try {

      int skipColumns = hasCalls ? 2 : 1;

      is = ParsingUtils.openInputStreamGZ(dataResourceLocator);
      reader = new AsciiLineReader(is);

      // Infer datatype from extension.  This can be overridden in the
      // comment section
      if (isCopyNumberFileExt(dataResourceLocator.getPath())) {
        dataset.setTrackType(TrackType.COPY_NUMBER);
      } else if (isLOHFileExt(dataResourceLocator.getPath())) {
        dataset.setTrackType(TrackType.LOH);
      }
      // Every file type uses the same windowing function.
      dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean);

      // Parse comments and directives, if any
      nextLine = reader.readLine();
      while (nextLine != null && (nextLine.startsWith("#") || (nextLine.trim().length() == 0))) {
        headerRows++;

        if (nextLine.length() > 0) {
          parseDirective(nextLine, dataset);
        }
        nextLine = reader.readLine();
      }

      // An empty file, or one holding only comments, has no heading row to parse.
      if (nextLine == null) {
        throw new ParserException(
            "No column headings found: file is empty or contains only comments.", headerRows);
      }

      if (chrColumn < 0) {
        setColumnDefaults();
      }

      // Parse column headings
      String[] data = nextLine.trim().split("\t");

      // Set last data column
      if (lastDataColumn < 0) {
        lastDataColumn = data.length - 1;
      }

      headings = getHeadings(data, skipColumns);

      dataset.setDataHeadings(headings);

      wgData = new WholeGenomeData(headings);

      // Number of rows between status bar updates
      final int updateCount = 5000;
      long lastPosition = 0;
      while ((nextLine = reader.readLine()) != null) {

        // Count every data row whether or not a UI is attached; the count is also used for the
        // line numbers reported in parser errors.
        count++;
        if (igv != null && count % updateCount == 0) {
          igv.setStatusBarMessage("Loaded: " + count + " / " + estLineCount + " (est)");
        }

        String[] tokens = Globals.tabPattern.split(nextLine, -1);
        int nTokens = tokens.length;
        if (nTokens > 0) {
          String thisChr = genome.getChromosomeAlias(tokens[chrColumn]);
          if (chrSummary == null || !thisChr.equals(chrSummary.getName())) {
            // Update whole genome and previous chromosome summary, unless this is
            // the first chromosome
            if (chrSummary != null) {
              updateWholeGenome(chrSummary.getName(), dataset, headings, wgData);
              chrSummary.setNDataPoints(nRows);
            }

            // Start the next chromosome at the reader position recorded after the previous row
            chrSummary = new ChromosomeSummary(thisChr, lastPosition);
            chrSummaries.add(chrSummary);
            nRows = 0;
            wgData = new WholeGenomeData(headings);
          }
          lastPosition = reader.getPosition();

          int location = -1;
          try {
            location = ParsingUtils.parseInt(tokens[startColumn]) - startBase;

          } catch (NumberFormatException numberFormatException) {
            log.error("Column " + tokens[startColumn] + " is not a number");
            throw new ParserException(
                "Column "
                    + (startColumn + 1)
                    + " must contain an integer value."
                    + " Found: "
                    + tokens[startColumn],
                count + headerRows,
                nextLine);
          }

          int length = 1;
          if (hasEndLocations) {
            try {
              length = ParsingUtils.parseInt(tokens[endColumn].trim()) - location + 1;

            } catch (NumberFormatException numberFormatException) {
              log.error("Column " + tokens[endColumn] + " is not a number");
              throw new ParserException(
                  "Column "
                      + (endColumn + 1)
                      + " must contain an integer value."
                      + " Found: "
                      + tokens[endColumn],
                  count + headerRows,
                  nextLine);
            }
          }

          updateLongestFeature(longestFeatureMap, thisChr, length);

          // Within a chromosome, start positions must not decrease.
          if (wgData.locations.size() > 0
              && wgData.locations.get(wgData.locations.size() - 1) > location) {
            throw new ParserException(
                "File is not sorted, .igv and .cn files must be sorted by start position."
                    + " Use igvtools (File > Run igvtools..) to sort the file.",
                count + headerRows);
          }

          wgData.locations.add(location);

          for (int idx = 0; idx < headings.length; idx++) {
            int i = firstDataColumn + idx * skipColumns;

            // Columns missing from a short row are recorded as NaN.
            float copyNo = i < tokens.length ? readFloat(tokens[i]) : Float.NaN;

            if (!Float.isNaN(copyNo)) {
              dataMin = Math.min(dataMin, copyNo);
              dataMax = Math.max(dataMax, copyNo);
            }
            if (copyNo < 0) {
              logNormalized = true;
            }
            String heading = headings[idx];
            wgData.data.get(heading).add(copyNo);
          }

          nRows++;
        }
      }

      dataset.setLongestFeatureMap(longestFeatureMap);

    } catch (ParserException pe) {
      throw pe;
    } catch (FileNotFoundException e) {
      log.error("File not found: " + dataResourceLocator);
      throw new RuntimeException(e);
    } catch (Exception e) {
      log.error("Exception when loading: " + dataResourceLocator.getPath(), e);
      // Attach the line number and text when at least one line has been read.
      if (nextLine != null && (count + headerRows != 0)) {
        throw new ParserException(e.getMessage(), e, count + headerRows, nextLine);
      } else {
        throw new RuntimeException(e);
      }
    } finally {
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          log.error("Error closing IGVDataset stream", e);
        }
      }
    }

    // Update last chromosome
    if (chrSummary != null) {
      updateWholeGenome(chrSummary.getName(), dataset, headings, wgData);
      chrSummary.setNDataPoints(nRows);
    }

    dataset.setLogNormalized(logNormalized);
    dataset.setDataMin(dataMin);
    dataset.setDataMax(dataMax);

    return chrSummaries;
  }