@Override public void process(final DAVOptions options) { final String[] args = getOriginalArgs(); final UseModality<DAVOptions> executed; final int maxSplitIndex = splitPlan.getMaxSplitIndex(); final ProgressLogger logger = new ProgressLogger(LOGGER); logger.expectedUpdates = maxSplitIndex; logger.itemsName = "splits"; logger.priority = Level.INFO; logger.start("Parallel split processing"); final SplitParallelRegion region = new SplitParallelRegion(maxSplitIndex, args, logger); try { getParallelTeam().execute(region); } catch (Exception e) { LOGGER.error("An exception occurred.", e); } logger.stop(); /** Time the duration of the sequence: */ timeService.setModelId(modelId); timeService.stop(); executed = region.getExecuted(); if (executed != null && executed instanceof SequenceMode) { // if we executed SequenceMode final SequenceMode sequenceMode = (SequenceMode) executed; if (evaluateStatistics) { final String label = sequenceMode.getValue("label"); final String statsFilename = sequenceMode.getValue("predictions-filename"); if (statsFilename != null && label != null) { // and the sequence defined the variables "predictions-filename" and "label" try { final List<String> statsModeArgs = new ObjectArrayList<String>( new String[] { "--mode", "stats", "--predictions", statsFilename, "--submission-file", labelPrefix(label) + "-maqcii-submission.txt", "--label", label, "--model-id", modelId, "--dataset-name", options.datasetName, "--other-measures", "prec,rec,F-1,MCC,binary-auc" }); if (options.adjustSignalToFloorValue) { statsModeArgs.add("--floor"); statsModeArgs.add(Double.toString(options.signalFloorValue)); } // extract survival options if any // TODO: clean this up - we should not be checking for "%survival%" final String survivalFileName = sequenceMode.getValue("survival"); if (StringUtils.isNotBlank(survivalFileName) && !"%survival%".equals(survivalFileName)) { statsModeArgs.add("--survival"); statsModeArgs.add(survivalFileName); } LOGGER.debug("Estimating statistics: " + statsModeArgs); // we create a new DAVMode here since we want to use the old StatsMode code // which is no longer exposed by DiscoverAndValidate (BDVal main method) final DAVMode statsMode = new DAVMode(); statsMode.registerMode("stats", StatsMode.class); final DAVOptions statsModeOptions = new DAVOptions(); statsMode.process( statsModeArgs.toArray(new String[statsModeArgs.size()]), statsModeOptions); } catch (Exception e) { LOGGER.error("Error executing --mode stats for all splits", e); } } } } }
/** * Perform the split transcripts mode. * * @throws IOException error reading / writing */ @Override public void execute() throws IOException { // Load the gene to transcripts file if (!config.validate()) { throw new IOException("Invalid SplitTranscripts configuration"); } final GeneTranscriptRelationships gtr = new GeneTranscriptRelationships(); final IndexedIdentifier transcriptIdents = new IndexedIdentifier(); final Int2ObjectMap<MutableString> transcriptIndexToIdMap = new Int2ObjectOpenHashMap<MutableString>(); final List<FastXEntry> fastxEntries = new LinkedList<FastXEntry>(); // // Pass through the file once to collect the transcript - gene relationships // int entryCount = 0; try { for (final FastXEntry entry : new FastXReader(config.getInputFile())) { entryCount++; parseHeader(entry.getEntryHeader()); final MutableString transcriptId = transcriptHeader.get("transcriptId"); final MutableString geneId = transcriptHeader.get("geneId"); final int transcriptIndex = transcriptIdents.registerIdentifier(transcriptId); gtr.addRelationship(geneId, transcriptIndex); transcriptIndexToIdMap.put(transcriptIndex, transcriptId); fastxEntries.add(entry.clone()); } } catch (CloneNotSupportedException e) { LOG.error("Couldn't clone for some reason", e); throw new GobyRuntimeException("Couldn't clone for some reason", e); } LOG.info("Loading map of genes-transcripts complete."); // // Scan through the transcript-gene relationships to determine which // transcript id goes into which file // final Int2IntMap transcriptIndex2FileIndex = new Int2IntOpenHashMap(); final String configOutputFilename = config.getOutputBase() + ".config"; final String configOutputPath = FilenameUtils.getFullPath(configOutputFilename); if (StringUtils.isNotBlank(configOutputPath)) { LOG.info("Creating output directory: " + configOutputPath); FileUtils.forceMkdir(new File(configOutputPath)); } PrintWriter configOutput = null; try { configOutput = new PrintWriter(configOutputFilename); configOutput.println("Ensembl Gene ID\tEnsembl Transcript ID"); final Int2IntMap fileIndex2NumberOfEntries = new Int2IntOpenHashMap(); fileIndex2NumberOfEntries.defaultReturnValue(0); transcriptIndex2FileIndex.defaultReturnValue(-1); final int initialNumberOfFiles = getNumberOfFiles(gtr, transcriptIndex2FileIndex); for (int geneIndex = 0; geneIndex < gtr.getNumberOfGenes(); geneIndex++) { final MutableString geneId = gtr.getGeneId(geneIndex); final IntSet transcriptIndices = gtr.getTranscriptSet(geneIndex); int fileNum = 0; for (final int transcriptIndex : transcriptIndices) { if (transcriptIndex2FileIndex.get(transcriptIndex) != -1) { LOG.warn("Skipping repeated transcriptIndex: " + transcriptIndex); continue; } final int maxEntriesPerFile = config.getMaxEntriesPerFile(); final int numberOfEntriesInOriginalBucket = fileIndex2NumberOfEntries.get(fileNum); final int adjustedFileIndex = fileNum + initialNumberOfFiles * (numberOfEntriesInOriginalBucket / maxEntriesPerFile); transcriptIndex2FileIndex.put(transcriptIndex, adjustedFileIndex); fileIndex2NumberOfEntries.put(fileNum, fileIndex2NumberOfEntries.get(fileNum) + 1); final MutableString transcriptId = transcriptIndexToIdMap.get(transcriptIndex); configOutput.printf("%s\t%s%n", geneId, transcriptId); fileNum++; } } } finally { IOUtils.closeQuietly(configOutput); } final int numFiles = getFileIndices(transcriptIndex2FileIndex).size(); if (LOG.isInfoEnabled()) { LOG.info( NumberFormat.getInstance().format(entryCount) + " entries will be written to " + numFiles + " files"); final int maxEntriesPerFile = config.getMaxEntriesPerFile(); if (maxEntriesPerFile < Integer.MAX_VALUE) { LOG.info("Each file will contain at most " + maxEntriesPerFile + " entries"); } } // formatter for uniquely numbering files each with the same number of digits final NumberFormat fileNumberFormatter = getNumberFormatter(numFiles - 1); final ProgressLogger progressLogger = new ProgressLogger(); progressLogger.expectedUpdates = entryCount; progressLogger.itemsName = "entries"; progressLogger.start(); // Write each file one at a time rather than in the order they appear in the input file // to avoid the issue of having too many streams open at the same or continually opening // and closing streams which is quite costly. We could store the gene/transcripts in // memory and then just write the files at the end but that could be worse. for (final int fileIndex : getFileIndices(transcriptIndex2FileIndex)) { final String filename = config.getOutputBase() + "." + fileNumberFormatter.format(fileIndex) + ".fa.gz"; PrintStream printStream = null; try { // each file is compressed printStream = new PrintStream(new GZIPOutputStream(new FileOutputStream(filename))); // // Read through the input file get the actual sequence information // final Iterator<FastXEntry> entries = fastxEntries.iterator(); while (entries.hasNext()) { final FastXEntry entry = entries.next(); parseHeader(entry.getEntryHeader()); final MutableString transcriptId = transcriptHeader.get("transcriptId"); final MutableString geneId = transcriptHeader.get("geneId"); final int transcriptIndex = transcriptIdents.getInt(transcriptId); final int transcriptFileIndex = transcriptIndex2FileIndex.get(transcriptIndex); if (transcriptFileIndex == fileIndex) { printStream.print(entry.getHeaderSymbol()); printStream.print(transcriptId); printStream.print(" gene:"); printStream.println(geneId); printStream.println(entry.getEntrySansHeader()); entries.remove(); progressLogger.lightUpdate(); } } } finally { IOUtils.closeQuietly(printStream); } } assert progressLogger.count == entryCount : "Some entries were not processed!"; progressLogger.done(); }
/** Computes the next step of the Power Method. */ public void step() throws IOException { double[] oldRank = rank, newRank = previousRank; DoubleArrays.fill(newRank, 0.0); // for each node, calculate its outdegree and redistribute its rank among pointed nodes double accum = 0.0; progressLogger.expectedUpdates = numNodes; progressLogger.start("Iteration " + (++iterationNumber) + "..."); final ArcLabelledNodeIterator nodeIterator = g.nodeIterator(); int i, outdegree, j, n = numNodes; int[] succ; Label[] lab; while (n-- != 0) { i = nodeIterator.nextInt(); outdegree = nodeIterator.outdegree(); if (outdegree == 0 || buckets != null && buckets.get(i)) accum += oldRank[i]; else { j = outdegree; succ = nodeIterator.successorArray(); lab = nodeIterator.labelArray(); while (j-- != 0) { newRank[succ[j]] += (oldRank[i] * lab[j].getFloat()) / sumoutweight[i]; } } progressLogger.update(); } progressLogger.done(); final double accumOverNumNodes = accum / numNodes; final double oneOverNumNodes = 1.0 / numNodes; if (preference != null) if (preferentialAdjustment == null) for (i = numNodes; i-- != 0; ) newRank[i] = alpha * newRank[i] + (1 - alpha) * preference.getDouble(i) + alpha * accumOverNumNodes; else for (i = numNodes; i-- != 0; ) newRank[i] = alpha * newRank[i] + (1 - alpha) * preference.getDouble(i) + alpha * accum * preferentialAdjustment.getDouble(i); else if (preferentialAdjustment == null) for (i = numNodes; i-- != 0; ) newRank[i] = alpha * newRank[i] + (1 - alpha) * oneOverNumNodes + alpha * accumOverNumNodes; else for (i = numNodes; i-- != 0; ) newRank[i] = alpha * newRank[i] + (1 - alpha) * oneOverNumNodes + alpha * accum * preferentialAdjustment.getDouble(i); // make the rank just computed the new rank rank = newRank; previousRank = oldRank; // Compute derivatives. n = iterationNumber; if (subset == null) { for (i = 0; i < order.length; i++) { final int k = order[i]; final double alphak = Math.pow(alpha, k); final double nFallingK = Util.falling(n, k); for (j = 0; j < numNodes; j++) derivative[i][j] += nFallingK * (rank[j] - previousRank[j]) / alphak; } } else { for (i = 0; i < order.length; i++) { final int k = order[i]; final double alphak = Math.pow(alpha, k); final double nFallingK = Util.falling(n, k); for (int t : subset) derivative[i][t] += nFallingK * (rank[t] - previousRank[t]) / alphak; } } // Compute coefficients, if required. if (coeffBasename != null) { final DataOutputStream coefficients = new DataOutputStream( new FastBufferedOutputStream( new FileOutputStream(coeffBasename + "-" + (iterationNumber)))); final double alphaN = Math.pow(alpha, n); for (i = 0; i < numNodes; i++) coefficients.writeDouble((rank[i] - previousRank[i]) / alphaN); coefficients.close(); } }
/** * For a specific sub-set of blocks (child nodes), find a 'base' subset of parents for which the * block's logLikelihood is not -Infinity * * @param candidateParentsPerNode * @param chosenArcsPerNode * @param setOfBlocks * @return */ protected double getOutOfMinusInfinity( Int2ObjectOpenHashMap<IntOpenHashSet> candidateParentsPerNode, Int2ObjectOpenHashMap<ObjectOpenHashSet<Arc>> chosenArcsPerNode, IntOpenHashSet setOfBlocks, TIntDoubleHashMap logLPerNode) { double totalLogL = 0; ProgressLogger pl = new ProgressLogger(LOGGER, ProgressLogger.TEN_SECONDS, "blocks"); pl.start("Begin initializing, to avoid zero likelihood, using set-cover heuristic"); pl.expectedUpdates = setOfBlocks.size(); int nArcs = 0; for (int v : setOfBlocks) { pl.update(); IntOpenHashSet vParents = candidateParentsPerNode.get(v); Int2ObjectOpenHashMap<IntOpenHashSet> parentActions = new Int2ObjectOpenHashMap<IntOpenHashSet>(); Int2ObjectOpenHashMap<IntArrayList> cPlusV = auxiliary.getCplusOnline(v); Int2ObjectOpenHashMap<IntArrayList> cMinusV = auxiliary.getCminusOnline(v); if (cPlusV != null) { IntSet actions = cPlusV.keySet(); // Heuristic: first add the parents that participate in A+ for // most actions for (int action : actions) { for (int u : cPlusV.get(action)) { if (!parentActions.containsKey(u)) { parentActions.put(u, new IntOpenHashSet()); } parentActions.get(u).add(action); } } } KeepMaximum km = new KeepMaximum(); km.addAllKey2Listsize(parentActions); IntOpenHashSet baseSetOfParents = new IntOpenHashSet(); double logL = Double.NEGATIVE_INFINITY; while (logL == Double.NEGATIVE_INFINITY && (km.getMaximumKey() != -1)) { int u = km.getMaximumKey(); if (baseSetOfParents.contains(u)) { throw new IllegalStateException("Attempted to add twice the same parent"); } baseSetOfParents.add(u); logL = blockLogLikelihood(v, cPlusV, cMinusV, baseSetOfParents); IntOpenHashSet uActions = parentActions.get(u); for (int parent : vParents) { parentActions.get(parent).removeAll(uActions); } vParents.remove(u); parentActions.remove(u); km.reset(); km.addAllKey2Listsize(parentActions); } // keep track of the likelihood totalLogL += logL; if (logLPerNode != null) { logLPerNode.put(v, logL); } chosenArcsPerNode.put(v, new ObjectOpenHashSet<Arc>()); for (int u : baseSetOfParents) { nArcs++; chosenArcsPerNode.get(v).add(new Arc(u, v)); } } pl.stop("Done initialization. Added " + nArcs + " arcs, logLikelihood=" + totalLogL); return totalLogL; }