/**
 * Estimates the number of lines in the given file. If {@code file} is a directory, or its
 * name ends with ".list", the estimate is the sum over every non-directory entry returned
 * by {@code getFilesFromDirOrList(file)}; directory entries in that list are skipped.
 *
 * @param file a single data file, a directory, or a ".list" file.
 * @return the estimated line count, as reported by {@code ParsingUtils.estimateLineCount}.
 */
private int estimateLineCount(File file) {
    boolean isFileCollection = file.isDirectory() || file.getName().endsWith(".list");
    if (!isFileCollection) {
        // Plain file: a single estimate is all that is needed.
        return ParsingUtils.estimateLineCount(file.getAbsolutePath());
    }

    int total = 0;
    for (File member : getFilesFromDirOrList(file)) {
        if (member.isDirectory()) {
            continue;
        }
        total += ParsingUtils.estimateLineCount(member.getAbsolutePath());
    }
    return total;
}
/** * Scan the datafile for chromosome breaks. * * @param dataset * @return */ public List<ChromosomeSummary> scan(IGVDataset dataset) { int estLineCount = ParsingUtils.estimateLineCount(dataResourceLocator.getPath()); Map<String, Integer> longestFeatureMap = new HashMap(); float dataMin = 0; float dataMax = 0; InputStream is = null; AsciiLineReader reader = null; String nextLine = null; ChromosomeSummary chrSummary = null; List<ChromosomeSummary> chrSummaries = new ArrayList(); String[] headings = null; WholeGenomeData wgData = null; int nRows = 0; int headerRows = 0; int count = 0; boolean logNormalized; try { int skipColumns = hasCalls ? 2 : 1; // BufferedReader reader = ParsingUtils.openBufferedReader(dataResourceLocator); is = ParsingUtils.openInputStreamGZ(dataResourceLocator); reader = new AsciiLineReader(is); // Infer datatype from extension. This can be overriden in the // comment section if (isCopyNumberFileExt(dataResourceLocator.getPath())) { dataset.setTrackType(TrackType.COPY_NUMBER); dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean); } else if (isLOHFileExt(dataResourceLocator.getPath())) { dataset.setTrackType(TrackType.LOH); dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean); } else { dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean); } // Parse comments and directives, if any nextLine = reader.readLine(); while (nextLine.startsWith("#") || (nextLine.trim().length() == 0)) { headerRows++; if (nextLine.length() > 0) { parseDirective(nextLine, dataset); } nextLine = reader.readLine(); } if (chrColumn < 0) { setColumnDefaults(); } // Parse column headings String[] data = nextLine.trim().split("\t"); // Set last data column if (lastDataColumn < 0) { lastDataColumn = data.length - 1; } headings = getHeadings(data, skipColumns); dataset.setDataHeadings(headings); // Infer if the data is logNormalized by looking for negative data values. 
// Assume it is not until proven otherwise logNormalized = false; wgData = new WholeGenomeData(headings); int chrRowCount = 0; // Update int updateCount = 5000; long lastPosition = 0; while ((nextLine = reader.readLine()) != null) { if (igv != null && ++count % updateCount == 0) { igv.setStatusBarMessage("Loaded: " + count + " / " + estLineCount + " (est)"); } // Distance since last sample String[] tokens = Globals.tabPattern.split(nextLine, -1); int nTokens = tokens.length; if (nTokens > 0) { String thisChr = genome.getChromosomeAlias(tokens[chrColumn]); if (chrSummary == null || !thisChr.equals(chrSummary.getName())) { // Update whole genome and previous chromosome summary, unless this is // the first chromosome if (chrSummary != null) { updateWholeGenome(chrSummary.getName(), dataset, headings, wgData); chrSummary.setNDataPoints(nRows); } // Shart the next chromosome chrSummary = new ChromosomeSummary(thisChr, lastPosition); chrSummaries.add(chrSummary); nRows = 0; wgData = new WholeGenomeData(headings); chrRowCount = 0; } lastPosition = reader.getPosition(); int location = -1; try { location = ParsingUtils.parseInt(tokens[startColumn]) - startBase; } catch (NumberFormatException numberFormatException) { log.error("Column " + tokens[startColumn] + " is not a number"); throw new ParserException( "Column " + (startColumn + 1) + " must contain an integer value." + " Found: " + tokens[startColumn], count + headerRows, nextLine); } int length = 1; if (hasEndLocations) { try { length = ParsingUtils.parseInt(tokens[endColumn].trim()) - location + 1; } catch (NumberFormatException numberFormatException) { log.error("Column " + tokens[endColumn] + " is not a number"); throw new ParserException( "Column " + (endColumn + 1) + " must contain an integer value." 
+ " Found: " + tokens[endColumn], count + headerRows, nextLine); } } updateLongestFeature(longestFeatureMap, thisChr, length); if (wgData.locations.size() > 0 && wgData.locations.get(wgData.locations.size() - 1) > location) { throw new ParserException( "File is not sorted, .igv and .cn files must be sorted by start position." + " Use igvtools (File > Run igvtools..) to sort the file.", count + headerRows); } wgData.locations.add(location); for (int idx = 0; idx < headings.length; idx++) { int i = firstDataColumn + idx * skipColumns; float copyNo = i < tokens.length ? readFloat(tokens[i]) : Float.NaN; if (!Float.isNaN(copyNo)) { dataMin = Math.min(dataMin, copyNo); dataMax = Math.max(dataMax, copyNo); } if (copyNo < 0) { logNormalized = true; } String heading = headings[idx]; wgData.data.get(heading).add(copyNo); } nRows++; } chrRowCount++; } dataset.setLongestFeatureMap(longestFeatureMap); } catch (ParserException pe) { throw pe; } catch (FileNotFoundException e) { // DialogUtils.showError("SNP file not found: " + dataSource.getCopyNoFile()); log.error("File not found: " + dataResourceLocator); throw new RuntimeException(e); } catch (Exception e) { log.error("Exception when loading: " + dataResourceLocator.getPath(), e); if (nextLine != null && (count + headerRows != 0)) { throw new ParserException(e.getMessage(), e, count + headerRows, nextLine); } else { throw new RuntimeException(e); } } finally { if (is != null) { try { is.close(); } catch (IOException e) { log.error("Error closing IGVDataset stream", e); } } } // Update last chromosome if (chrSummary != null) { updateWholeGenome(chrSummary.getName(), dataset, headings, wgData); chrSummary.setNDataPoints(nRows); } dataset.setLogNormalized(logNormalized); dataset.setDataMin(dataMin); dataset.setDataMax(dataMax); return chrSummaries; }