Example #1
0
  @Override
  public void process(final DAVOptions options) {
    final String[] args = getOriginalArgs();
    final UseModality<DAVOptions> executed;
    final int maxSplitIndex = splitPlan.getMaxSplitIndex();
    final ProgressLogger logger = new ProgressLogger(LOGGER);
    logger.expectedUpdates = maxSplitIndex;
    logger.itemsName = "splits";
    logger.priority = Level.INFO;
    logger.start("Parallel split processing");

    final SplitParallelRegion region = new SplitParallelRegion(maxSplitIndex, args, logger);
    try {
      getParallelTeam().execute(region);
    } catch (Exception e) {
      LOGGER.error("An exception occurred.", e);
    }
    logger.stop();

    /** Time the duration of the sequence: */
    timeService.setModelId(modelId);
    timeService.stop();

    executed = region.getExecuted();
    if (executed != null && executed instanceof SequenceMode) {
      // if we executed SequenceMode
      final SequenceMode sequenceMode = (SequenceMode) executed;
      if (evaluateStatistics) {
        final String label = sequenceMode.getValue("label");
        final String statsFilename = sequenceMode.getValue("predictions-filename");

        if (statsFilename != null && label != null) {
          // and the sequence defined the variables "predictions-filename" and "label"
          try {
            final List<String> statsModeArgs =
                new ObjectArrayList<String>(
                    new String[] {
                      "--mode",
                      "stats",
                      "--predictions",
                      statsFilename,
                      "--submission-file",
                      labelPrefix(label) + "-maqcii-submission.txt",
                      "--label",
                      label,
                      "--model-id",
                      modelId,
                      "--dataset-name",
                      options.datasetName,
                      "--other-measures",
                      "prec,rec,F-1,MCC,binary-auc"
                    });

            if (options.adjustSignalToFloorValue) {
              statsModeArgs.add("--floor");
              statsModeArgs.add(Double.toString(options.signalFloorValue));
            }

            // extract survival options if any
            // TODO: clean this up - we should not be checking for "%survival%"
            final String survivalFileName = sequenceMode.getValue("survival");
            if (StringUtils.isNotBlank(survivalFileName)
                && !"%survival%".equals(survivalFileName)) {
              statsModeArgs.add("--survival");
              statsModeArgs.add(survivalFileName);
            }

            LOGGER.debug("Estimating statistics: " + statsModeArgs);

            // we create a new DAVMode here since we want to use the old StatsMode code
            // which is no longer exposed by DiscoverAndValidate (BDVal main method)
            final DAVMode statsMode = new DAVMode();
            statsMode.registerMode("stats", StatsMode.class);
            final DAVOptions statsModeOptions = new DAVOptions();
            statsMode.process(
                statsModeArgs.toArray(new String[statsModeArgs.size()]), statsModeOptions);
          } catch (Exception e) {
            LOGGER.error("Error executing --mode stats for all splits", e);
          }
        }
      }
    }
  }
  /**
   * Perform the split transcripts mode.
   *
   * @throws IOException error reading / writing
   */
  @Override
  public void execute() throws IOException {
    // Load the gene to transcripts file
    if (!config.validate()) {
      throw new IOException("Invalid SplitTranscripts configuration");
    }
    final GeneTranscriptRelationships gtr = new GeneTranscriptRelationships();
    final IndexedIdentifier transcriptIdents = new IndexedIdentifier();
    final Int2ObjectMap<MutableString> transcriptIndexToIdMap =
        new Int2ObjectOpenHashMap<MutableString>();
    final List<FastXEntry> fastxEntries = new LinkedList<FastXEntry>();
    //
    // Pass through the file once to collect the transcript - gene relationships
    //
    int entryCount = 0;
    try {
      for (final FastXEntry entry : new FastXReader(config.getInputFile())) {
        entryCount++;
        parseHeader(entry.getEntryHeader());
        final MutableString transcriptId = transcriptHeader.get("transcriptId");
        final MutableString geneId = transcriptHeader.get("geneId");

        final int transcriptIndex = transcriptIdents.registerIdentifier(transcriptId);
        gtr.addRelationship(geneId, transcriptIndex);

        transcriptIndexToIdMap.put(transcriptIndex, transcriptId);

        fastxEntries.add(entry.clone());
      }
    } catch (CloneNotSupportedException e) {
      LOG.error("Couldn't clone for some reason", e);
      throw new GobyRuntimeException("Couldn't clone for some reason", e);
    }

    LOG.info("Loading map of genes-transcripts complete.");

    //
    // Scan through the transcript-gene relationships to determine which
    // transcript id goes into which file
    //
    final Int2IntMap transcriptIndex2FileIndex = new Int2IntOpenHashMap();
    final String configOutputFilename = config.getOutputBase() + ".config";
    final String configOutputPath = FilenameUtils.getFullPath(configOutputFilename);
    if (StringUtils.isNotBlank(configOutputPath)) {
      LOG.info("Creating output directory: " + configOutputPath);
      FileUtils.forceMkdir(new File(configOutputPath));
    }

    PrintWriter configOutput = null;
    try {
      configOutput = new PrintWriter(configOutputFilename);
      configOutput.println("Ensembl Gene ID\tEnsembl Transcript ID");

      final Int2IntMap fileIndex2NumberOfEntries = new Int2IntOpenHashMap();
      fileIndex2NumberOfEntries.defaultReturnValue(0);
      transcriptIndex2FileIndex.defaultReturnValue(-1);

      final int initialNumberOfFiles = getNumberOfFiles(gtr, transcriptIndex2FileIndex);

      for (int geneIndex = 0; geneIndex < gtr.getNumberOfGenes(); geneIndex++) {
        final MutableString geneId = gtr.getGeneId(geneIndex);
        final IntSet transcriptIndices = gtr.getTranscriptSet(geneIndex);
        int fileNum = 0;

        for (final int transcriptIndex : transcriptIndices) {
          if (transcriptIndex2FileIndex.get(transcriptIndex) != -1) {
            LOG.warn("Skipping repeated transcriptIndex: " + transcriptIndex);
            continue;
          }
          final int maxEntriesPerFile = config.getMaxEntriesPerFile();
          final int numberOfEntriesInOriginalBucket = fileIndex2NumberOfEntries.get(fileNum);
          final int adjustedFileIndex =
              fileNum
                  + initialNumberOfFiles * (numberOfEntriesInOriginalBucket / maxEntriesPerFile);

          transcriptIndex2FileIndex.put(transcriptIndex, adjustedFileIndex);
          fileIndex2NumberOfEntries.put(fileNum, fileIndex2NumberOfEntries.get(fileNum) + 1);
          final MutableString transcriptId = transcriptIndexToIdMap.get(transcriptIndex);
          configOutput.printf("%s\t%s%n", geneId, transcriptId);

          fileNum++;
        }
      }
    } finally {
      IOUtils.closeQuietly(configOutput);
    }

    final int numFiles = getFileIndices(transcriptIndex2FileIndex).size();
    if (LOG.isInfoEnabled()) {
      LOG.info(
          NumberFormat.getInstance().format(entryCount)
              + " entries will be written to "
              + numFiles
              + " files");
      final int maxEntriesPerFile = config.getMaxEntriesPerFile();
      if (maxEntriesPerFile < Integer.MAX_VALUE) {
        LOG.info("Each file will contain at most " + maxEntriesPerFile + " entries");
      }
    }

    // formatter for uniquely numbering files each with the same number of digits
    final NumberFormat fileNumberFormatter = getNumberFormatter(numFiles - 1);

    final ProgressLogger progressLogger = new ProgressLogger();
    progressLogger.expectedUpdates = entryCount;
    progressLogger.itemsName = "entries";
    progressLogger.start();

    // Write each file one at a time rather than in the order they appear in the input file
    // to avoid the issue of having too many streams open at the same or continually opening
    // and closing streams which is quite costly.  We could store the gene/transcripts in
    // memory and then just write the files at the end but that could be worse.
    for (final int fileIndex : getFileIndices(transcriptIndex2FileIndex)) {
      final String filename =
          config.getOutputBase() + "." + fileNumberFormatter.format(fileIndex) + ".fa.gz";
      PrintStream printStream = null;
      try {
        // each file is compressed
        printStream = new PrintStream(new GZIPOutputStream(new FileOutputStream(filename)));

        //
        // Read through the input file get the actual sequence information
        //
        final Iterator<FastXEntry> entries = fastxEntries.iterator();
        while (entries.hasNext()) {
          final FastXEntry entry = entries.next();
          parseHeader(entry.getEntryHeader());
          final MutableString transcriptId = transcriptHeader.get("transcriptId");
          final MutableString geneId = transcriptHeader.get("geneId");
          final int transcriptIndex = transcriptIdents.getInt(transcriptId);
          final int transcriptFileIndex = transcriptIndex2FileIndex.get(transcriptIndex);
          if (transcriptFileIndex == fileIndex) {
            printStream.print(entry.getHeaderSymbol());
            printStream.print(transcriptId);
            printStream.print(" gene:");
            printStream.println(geneId);
            printStream.println(entry.getEntrySansHeader());
            entries.remove();
            progressLogger.lightUpdate();
          }
        }
      } finally {
        IOUtils.closeQuietly(printStream);
      }
    }

    assert progressLogger.count == entryCount : "Some entries were not processed!";
    progressLogger.done();
  }
  /** Computes the next step of the Power Method. */
  public void step() throws IOException {
    double[] oldRank = rank, newRank = previousRank;
    DoubleArrays.fill(newRank, 0.0);

    // for each node, calculate its outdegree and redistribute its rank among pointed nodes
    double accum = 0.0;

    progressLogger.expectedUpdates = numNodes;
    progressLogger.start("Iteration " + (++iterationNumber) + "...");

    final ArcLabelledNodeIterator nodeIterator = g.nodeIterator();
    int i, outdegree, j, n = numNodes;
    int[] succ;
    Label[] lab;

    while (n-- != 0) {
      i = nodeIterator.nextInt();
      outdegree = nodeIterator.outdegree();

      if (outdegree == 0 || buckets != null && buckets.get(i)) accum += oldRank[i];
      else {
        j = outdegree;
        succ = nodeIterator.successorArray();
        lab = nodeIterator.labelArray();
        while (j-- != 0) {
          newRank[succ[j]] += (oldRank[i] * lab[j].getFloat()) / sumoutweight[i];
        }
      }
      progressLogger.update();
    }
    progressLogger.done();

    final double accumOverNumNodes = accum / numNodes;

    final double oneOverNumNodes = 1.0 / numNodes;
    if (preference != null)
      if (preferentialAdjustment == null)
        for (i = numNodes; i-- != 0; )
          newRank[i] =
              alpha * newRank[i]
                  + (1 - alpha) * preference.getDouble(i)
                  + alpha * accumOverNumNodes;
      else
        for (i = numNodes; i-- != 0; )
          newRank[i] =
              alpha * newRank[i]
                  + (1 - alpha) * preference.getDouble(i)
                  + alpha * accum * preferentialAdjustment.getDouble(i);
    else if (preferentialAdjustment == null)
      for (i = numNodes; i-- != 0; )
        newRank[i] = alpha * newRank[i] + (1 - alpha) * oneOverNumNodes + alpha * accumOverNumNodes;
    else
      for (i = numNodes; i-- != 0; )
        newRank[i] =
            alpha * newRank[i]
                + (1 - alpha) * oneOverNumNodes
                + alpha * accum * preferentialAdjustment.getDouble(i);

    // make the rank just computed the new rank
    rank = newRank;
    previousRank = oldRank;

    // Compute derivatives.
    n = iterationNumber;

    if (subset == null) {
      for (i = 0; i < order.length; i++) {
        final int k = order[i];
        final double alphak = Math.pow(alpha, k);
        final double nFallingK = Util.falling(n, k);
        for (j = 0; j < numNodes; j++)
          derivative[i][j] += nFallingK * (rank[j] - previousRank[j]) / alphak;
      }
    } else {
      for (i = 0; i < order.length; i++) {
        final int k = order[i];
        final double alphak = Math.pow(alpha, k);
        final double nFallingK = Util.falling(n, k);

        for (int t : subset) derivative[i][t] += nFallingK * (rank[t] - previousRank[t]) / alphak;
      }
    }

    // Compute coefficients, if required.

    if (coeffBasename != null) {
      final DataOutputStream coefficients =
          new DataOutputStream(
              new FastBufferedOutputStream(
                  new FileOutputStream(coeffBasename + "-" + (iterationNumber))));
      final double alphaN = Math.pow(alpha, n);
      for (i = 0; i < numNodes; i++) coefficients.writeDouble((rank[i] - previousRank[i]) / alphaN);
      coefficients.close();
    }
  }
Example #4
0
  /**
   * For a specific sub-set of blocks (child nodes), find a 'base' subset of parents for which the
   * block's logLikelihood is not -Infinity
   *
   * @param candidateParentsPerNode
   * @param chosenArcsPerNode
   * @param setOfBlocks
   * @return
   */
  protected double getOutOfMinusInfinity(
      Int2ObjectOpenHashMap<IntOpenHashSet> candidateParentsPerNode,
      Int2ObjectOpenHashMap<ObjectOpenHashSet<Arc>> chosenArcsPerNode,
      IntOpenHashSet setOfBlocks,
      TIntDoubleHashMap logLPerNode) {
    double totalLogL = 0;

    ProgressLogger pl = new ProgressLogger(LOGGER, ProgressLogger.TEN_SECONDS, "blocks");
    pl.start("Begin initializing, to avoid zero likelihood, using set-cover heuristic");
    pl.expectedUpdates = setOfBlocks.size();
    int nArcs = 0;
    for (int v : setOfBlocks) {
      pl.update();

      IntOpenHashSet vParents = candidateParentsPerNode.get(v);

      Int2ObjectOpenHashMap<IntOpenHashSet> parentActions =
          new Int2ObjectOpenHashMap<IntOpenHashSet>();

      Int2ObjectOpenHashMap<IntArrayList> cPlusV = auxiliary.getCplusOnline(v);
      Int2ObjectOpenHashMap<IntArrayList> cMinusV = auxiliary.getCminusOnline(v);

      if (cPlusV != null) {
        IntSet actions = cPlusV.keySet();
        // Heuristic: first add the parents that participate in A+ for
        // most actions
        for (int action : actions) {
          for (int u : cPlusV.get(action)) {
            if (!parentActions.containsKey(u)) {
              parentActions.put(u, new IntOpenHashSet());
            }
            parentActions.get(u).add(action);
          }
        }
      }

      KeepMaximum km = new KeepMaximum();
      km.addAllKey2Listsize(parentActions);

      IntOpenHashSet baseSetOfParents = new IntOpenHashSet();
      double logL = Double.NEGATIVE_INFINITY;
      while (logL == Double.NEGATIVE_INFINITY && (km.getMaximumKey() != -1)) {
        int u = km.getMaximumKey();
        if (baseSetOfParents.contains(u)) {
          throw new IllegalStateException("Attempted to add twice the same parent");
        }
        baseSetOfParents.add(u);
        logL = blockLogLikelihood(v, cPlusV, cMinusV, baseSetOfParents);
        IntOpenHashSet uActions = parentActions.get(u);
        for (int parent : vParents) {
          parentActions.get(parent).removeAll(uActions);
        }
        vParents.remove(u);
        parentActions.remove(u);
        km.reset();
        km.addAllKey2Listsize(parentActions);
      }

      // keep track of the likelihood
      totalLogL += logL;
      if (logLPerNode != null) {
        logLPerNode.put(v, logL);
      }

      chosenArcsPerNode.put(v, new ObjectOpenHashSet<Arc>());
      for (int u : baseSetOfParents) {
        nArcs++;
        chosenArcsPerNode.get(v).add(new Arc(u, v));
      }
    }
    pl.stop("Done initialization. Added " + nArcs + " arcs, logLikelihood=" + totalLogL);
    return totalLogL;
  }