public static boolean parsableMAGE_TAB(ResourceLocator file) throws IOException { AsciiLineReader reader = null; try { reader = ParsingUtils.openAsciiReader(file); String nextLine = null; // skip first row reader.readLine(); // check second row for MAGE_TAB identifiers if ((nextLine = reader.readLine()) != null && (nextLine.contains("Reporter REF") || nextLine.contains("Composite Element REF") || nextLine.contains("Term Source REF") || nextLine.contains("CompositeElement REF") || nextLine.contains("TermSource REF") || nextLine.contains("Coordinates REF"))) { int count = 0; // check if this mage_tab data matrix can be parsed by this class while ((nextLine = reader.readLine()) != null && count < 5) { nextLine = nextLine.trim(); if (nextLine.startsWith("SNP_A") || nextLine.startsWith("CN_")) { return true; } count++; } return false; } } finally { if (reader != null) { reader.close(); } } return false; }
/** * Load data for a single chromosome. * * @param chrSummary * @param dataHeaders * @return */ public ChromosomeData loadChromosomeData(ChromosomeSummary chrSummary, String[] dataHeaders) { // InputStream is = null; try { int skipColumns = hasCalls ? 2 : 1; // Get an estimate of the number of snps (rows). THIS IS ONLY AN ESTIMATE int nRowsEst = chrSummary.getNDataPts(); SeekableStream is = IGVSeekableStreamFactory.getStreamFor(dataResourceLocator.getPath()); is.seek(chrSummary.getStartPosition()); AsciiLineReader reader = new AsciiLineReader(is); // Create containers to hold data IntArrayList startLocations = new IntArrayList(nRowsEst); IntArrayList endLocations = (hasEndLocations ? new IntArrayList(nRowsEst) : null); List<String> probes = new ArrayList(nRowsEst); Map<String, FloatArrayList> dataMap = new HashMap(); for (String h : dataHeaders) { dataMap.put(h, new FloatArrayList(nRowsEst)); } // Begin loop through rows String chromosome = chrSummary.getName(); boolean chromosomeStarted = false; String nextLine = reader.readLine(); while ((nextLine != null) && (nextLine.trim().length() > 0)) { if (!nextLine.startsWith("#")) { try { String[] tokens = Globals.tabPattern.split(nextLine, -1); String thisChromosome = genome.getChromosomeAlias(tokens[chrColumn].trim()); if (thisChromosome.equals(chromosome)) { chromosomeStarted = true; // chromosomeData.setMarkerId(nRows, tokens[0]); // The probe. A new string is created to prevent holding on to the entire row through // a substring reference String probe = new String(tokens[probeColumn]); probes.add(probe); int start = ParsingUtils.parseInt(tokens[startColumn].trim()) - startBase; if (hasEndLocations) { endLocations.add(ParsingUtils.parseInt(tokens[endColumn].trim())); } startLocations.add(start); for (int idx = 0; idx < dataHeaders.length; idx++) { int i = firstDataColumn + idx * skipColumns; float copyNo = i <= lastDataColumn ? readFloat(tokens[i]) : Float.NaN; String heading = dataHeaders[idx]; dataMap.get(heading).add(copyNo); } } else if (chromosomeStarted) { break; } } catch (NumberFormatException numberFormatException) { // Skip line log.info("Skipping line (NumberFormatException) " + nextLine); } } nextLine = reader.readLine(); } // Loop complete ChromosomeData cd = new ChromosomeData(chrSummary.getName()); cd.setProbes(probes.toArray(new String[] {})); cd.setStartLocations(startLocations.toArray()); if (hasEndLocations) { cd.setEndLocations(endLocations.toArray()); } for (String h : dataHeaders) { cd.setData(h, dataMap.get(h).toArray()); } return cd; } catch (IOException ex) { log.error("Error parsing cn file", ex); throw new RuntimeException("Error parsing cn file", ex); } }
/** * Scan the datafile for chromosome breaks. * * @param dataset * @return */ public List<ChromosomeSummary> scan(IGVDataset dataset) { int estLineCount = ParsingUtils.estimateLineCount(dataResourceLocator.getPath()); Map<String, Integer> longestFeatureMap = new HashMap(); float dataMin = 0; float dataMax = 0; InputStream is = null; AsciiLineReader reader = null; String nextLine = null; ChromosomeSummary chrSummary = null; List<ChromosomeSummary> chrSummaries = new ArrayList(); String[] headings = null; WholeGenomeData wgData = null; int nRows = 0; int headerRows = 0; int count = 0; boolean logNormalized; try { int skipColumns = hasCalls ? 2 : 1; // BufferedReader reader = ParsingUtils.openBufferedReader(dataResourceLocator); is = ParsingUtils.openInputStreamGZ(dataResourceLocator); reader = new AsciiLineReader(is); // Infer datatype from extension. This can be overriden in the // comment section if (isCopyNumberFileExt(dataResourceLocator.getPath())) { dataset.setTrackType(TrackType.COPY_NUMBER); dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean); } else if (isLOHFileExt(dataResourceLocator.getPath())) { dataset.setTrackType(TrackType.LOH); dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean); } else { dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean); } // Parse comments and directives, if any nextLine = reader.readLine(); while (nextLine.startsWith("#") || (nextLine.trim().length() == 0)) { headerRows++; if (nextLine.length() > 0) { parseDirective(nextLine, dataset); } nextLine = reader.readLine(); } if (chrColumn < 0) { setColumnDefaults(); } // Parse column headings String[] data = nextLine.trim().split("\t"); // Set last data column if (lastDataColumn < 0) { lastDataColumn = data.length - 1; } headings = getHeadings(data, skipColumns); dataset.setDataHeadings(headings); // Infer if the data is logNormalized by looking for negative data values. // Assume it is not until proven otherwise logNormalized = false; wgData = new WholeGenomeData(headings); int chrRowCount = 0; // Update int updateCount = 5000; long lastPosition = 0; while ((nextLine = reader.readLine()) != null) { if (igv != null && ++count % updateCount == 0) { igv.setStatusBarMessage("Loaded: " + count + " / " + estLineCount + " (est)"); } // Distance since last sample String[] tokens = Globals.tabPattern.split(nextLine, -1); int nTokens = tokens.length; if (nTokens > 0) { String thisChr = genome.getChromosomeAlias(tokens[chrColumn]); if (chrSummary == null || !thisChr.equals(chrSummary.getName())) { // Update whole genome and previous chromosome summary, unless this is // the first chromosome if (chrSummary != null) { updateWholeGenome(chrSummary.getName(), dataset, headings, wgData); chrSummary.setNDataPoints(nRows); } // Shart the next chromosome chrSummary = new ChromosomeSummary(thisChr, lastPosition); chrSummaries.add(chrSummary); nRows = 0; wgData = new WholeGenomeData(headings); chrRowCount = 0; } lastPosition = reader.getPosition(); int location = -1; try { location = ParsingUtils.parseInt(tokens[startColumn]) - startBase; } catch (NumberFormatException numberFormatException) { log.error("Column " + tokens[startColumn] + " is not a number"); throw new ParserException( "Column " + (startColumn + 1) + " must contain an integer value." + " Found: " + tokens[startColumn], count + headerRows, nextLine); } int length = 1; if (hasEndLocations) { try { length = ParsingUtils.parseInt(tokens[endColumn].trim()) - location + 1; } catch (NumberFormatException numberFormatException) { log.error("Column " + tokens[endColumn] + " is not a number"); throw new ParserException( "Column " + (endColumn + 1) + " must contain an integer value." + " Found: " + tokens[endColumn], count + headerRows, nextLine); } } updateLongestFeature(longestFeatureMap, thisChr, length); if (wgData.locations.size() > 0 && wgData.locations.get(wgData.locations.size() - 1) > location) { throw new ParserException( "File is not sorted, .igv and .cn files must be sorted by start position." + " Use igvtools (File > Run igvtools..) to sort the file.", count + headerRows); } wgData.locations.add(location); for (int idx = 0; idx < headings.length; idx++) { int i = firstDataColumn + idx * skipColumns; float copyNo = i < tokens.length ? readFloat(tokens[i]) : Float.NaN; if (!Float.isNaN(copyNo)) { dataMin = Math.min(dataMin, copyNo); dataMax = Math.max(dataMax, copyNo); } if (copyNo < 0) { logNormalized = true; } String heading = headings[idx]; wgData.data.get(heading).add(copyNo); } nRows++; } chrRowCount++; } dataset.setLongestFeatureMap(longestFeatureMap); } catch (ParserException pe) { throw pe; } catch (FileNotFoundException e) { // DialogUtils.showError("SNP file not found: " + dataSource.getCopyNoFile()); log.error("File not found: " + dataResourceLocator); throw new RuntimeException(e); } catch (Exception e) { log.error("Exception when loading: " + dataResourceLocator.getPath(), e); if (nextLine != null && (count + headerRows != 0)) { throw new ParserException(e.getMessage(), e, count + headerRows, nextLine); } else { throw new RuntimeException(e); } } finally { if (is != null) { try { is.close(); } catch (IOException e) { log.error("Error closing IGVDataset stream", e); } } } // Update last chromosome if (chrSummary != null) { updateWholeGenome(chrSummary.getName(), dataset, headings, wgData); chrSummary.setNDataPoints(nRows); } dataset.setLogNormalized(logNormalized); dataset.setDataMin(dataMin); dataset.setDataMax(dataMax); return chrSummaries; }