예제 #1
0
  public void cleanup() {

    if (tmpDirFile != null) {
      if (doCleanup) MetaUtils.recursiveDelete(tmpDirFile);
      tmpDirFile = null;
    }
  }
예제 #2
0
  /** new blast command based on default parameters */
  public BlatCommand() throws IOException {
    // look in configuration file to determine default values
    commandLine = DEFAULTCOMMANDLINE;
    commandPath = DEFAULTCOMMANDPATH;
    tmpDir = DEFAULTTMPDIR;

    tmpDirFile = MetaUtils.createTempDir("blat_", tmpDir);
  }
예제 #3
0
  public DataBag exec(Tuple input) throws IOException {

    DataBag output = DefaultBagFactory.getInstance().newDefaultBag();

    if (input == null || input.size() == 0) return null;
    try {
      String seq = ((String) input.get(0));
      // byte[] ba  = ((DataByteArray) input.get(0)).get();
      int distance = (Integer) input.get(1);
      // int seqLength = SequenceString.numBases(ba);
      // String seq = SequenceString.byteArrayToSequence(ba);

      Set<String> neighbors = MetaUtils.generateAllNeighborsWithinDistance(seq, distance);
      for (String n : neighbors) {
        Tuple t = DefaultTupleFactory.getInstance().newTuple(1);
        t.set(0, n);
        output.add(t);
      }
    } catch (Exception e) {
      System.err.println("HammingDistance: failed to process input; error - " + e.getMessage());
      return null;
    }
    return output;
  }
예제 #4
0
  /**
   * new blast command based on values stored in the configuration.
   *
   * <p>Looks for the following config values: blast.commandline, blast.commandpath, and
   * blast.tmpdir
   *
   * @param config is the hadoop configuration with overriding values for commandline options and
   *     paths
   */
  public BlatCommand(Configuration config) throws IOException {

    String c;

    if ((c = config.get("blat.commandline")) != null) {
      commandLine = c;
    } else {
      commandLine = DEFAULTCOMMANDLINE;
    }
    if ((c = config.get("blat.commandpath")) != null) {
      commandPath = c;
    } else {
      commandPath = DEFAULTCOMMANDPATH;
    }
    if ((c = config.get("blat.tmpdir")) != null) {
      tmpDir = c;
    } else {
      tmpDir = DEFAULTTMPDIR;
    }

    doCleanup = config.getBoolean("blat.cleanup", true);
    paired = config.getBoolean("blat.paired", true);

    /*
    do sanity check to make sure all paths exist
     */
    // checkFileExists(commandLine);
    // checkFileExists(commandPath);
    // checkDirExists(tmpDir);

    /*
    if all is good, create a working space inside tmpDir
     */

    tmpDirFile = MetaUtils.createTempDir("blat_", tmpDir);
  }
예제 #5
0
  /**
   * execute the blat command and return a list of sequence ids that match
   *
   * @param seqDatabase is the key/value map of sequences that act as reference keyed by name
   * @param seqQueryFilepath is the path the the blast output results
   * @return a list of sequence ids in the reference that match the cazy database
   */
  public Set<String> exec(
      Map<String, String> seqDatabase, String seqQueryFilepath, Mapper.Context context)
      throws IOException, InterruptedException {

    /*
    first, take the blatInputFile and find the corresponding sequence in the
    seqMap.  find both the exact sequence id, as well as its matching pair
    and write to temporary file.
    */
    // Map<String,String> l = new HashMap<String,String>();

    File seqQueryFile = null;

    log.info("Preparing Blat execution");
    if (context != null) context.setStatus("Preparing Blat execution");

    Map<String, String> l = new HashMap<String, String>();
    int numGroups = 0;
    int numReads = 0;

    /*
    open query file.
     */

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    for (Path filenamePath : MetaUtils.findAllPaths(new Path(seqQueryFilepath))) {
      if (!fs.exists(filenamePath)) {
        throw new IOException("file not found: " + seqQueryFilepath);
      }

      FSDataInputStream in = fs.open(filenamePath);
      BufferedReader bufRead = new BufferedReader(new InputStreamReader(in));

      /*
      Filter FileReader through a Buffered read to read a line at a time
      */

      String line = bufRead.readLine(); // String that holds current file line

      /*
      read the line into key/value with key being the first column, value is all the
      remaining columns
      */
      while (line != null) {
        numGroups++;
        String[] a = line.split("\t", 2);
        l.put(a[0], a[1]);
        numReads += a[1].split("\t").length;
        line = bufRead.readLine();
      }
      bufRead.close();
    }

    if (context != null)
      context.getCounter("blat.input", "NUMBER_OF_INPUT_READS").increment(numReads);
    if (context != null)
      context.getCounter("blat.input", "NUMBER_OF_INPUT_GROUPS").increment(numGroups);
    log.info("read " + numReads + " Reads in " + numGroups + " gene groups");

    /*
    now dump the database from the map to a file
     */
    String seqFilepath = dumpToFile(seqDatabase);

    if (seqFilepath == null) {
      /*
      return with fail
       */
      throw new IOException("unable to write " + seqDatabase + " to filesystem");
    }

    Map<String, Set<String>> s = new HashMap<String, Set<String>>();

    /*
    now loop through all the lines previously read in, write out a seqfile in temp directory
    then execute blat.
     */
    int numBlats = 0;
    int totalBlats = l.size();

    for (String k : l.keySet()) {
      numBlats++;

      /*
      k is a grouping key
       */

      log.info("processing group " + k);

      if (context != null) {
        context.setStatus("Executing Blat " + numBlats + "/" + totalBlats);
      }
      /*
      create a new file in temp direectory
       */
      seqQueryFile = new File(tmpDirFile, "blatquery.fa");
      BufferedWriter out = new BufferedWriter(new FileWriter(seqQueryFile.getPath()));

      /*
      look up all the sequences and write them to the file.  include the paired ends
       */
      int queryCount = 0;
      for (String key : l.get(k).split("\t")) {

        if (paired) {

          /*
          for paired end data, look for both pairs
           */

          String key1 = key + "/1"; // forward
          String key2 = key + "/2"; // backward

          if (seqDatabase.containsKey(key1)) {
            queryCount++;
            out.write(">" + key1 + "\n");
            out.write(seqDatabase.get(key1) + "\n");
          }
          if (seqDatabase.containsKey(key2)) {
            queryCount++;
            out.write(">" + key2 + "\n");
            out.write(seqDatabase.get(key2) + "\n");
          }
        } else {

          /*
          if data is not paired, just look up key
           */

          if (seqDatabase.containsKey(key)) {
            queryCount++;
            out.write(">" + key + "\n");
            out.write(seqDatabase.get(key) + "\n");
          }
        }
      }
      /*
      close the temporary file
      */
      out.close();

      if (queryCount == 0) {
        /*
        means that none of these queries were in this portion of the database.  no point
        executing blat, so just return
         */
        log.info("skipping blat since i didn't find any query sequences in this database");
        continue;
      }

      /*
      now set up a blat execution
       */
      List<String> commands = new ArrayList<String>();
      commands.add("/bin/sh");
      commands.add("-c");
      commands.add(
          commandPath
              + " "
              + commandLine
              + " "
              + seqFilepath
              + " "
              + seqQueryFile.getPath()
              + " "
              + tmpDirFile.getPath()
              + "/blat.output");

      log.info("command = " + commands);

      SystemCommandExecutor commandExecutor = new SystemCommandExecutor(commands);
      exitValue = commandExecutor.executeCommand();

      // stdout and stderr of the command are returned as StringBuilder objects
      stdout = commandExecutor.getStandardOutputFromCommand().toString();
      stderr = commandExecutor.getStandardErrorFromCommand().toString();

      log.debug("exit = " + exitValue);
      log.debug("stdout = " + stdout);
      log.debug("stderr = " + stderr);

      /*
      now parse the output and clean up
      */

      log.debug("reading outputfile: " + tmpDirFile.getPath() + "/blat.output");

      FileReader input = new FileReader(tmpDirFile.getPath() + "/blat.output");

      /*
      Filter FileReader through a Buffered read to read a line at a time
      */
      BufferedReader bufRead2 = new BufferedReader(input);

      String line2; // String that holds current file line
      int count = 0; // Line number of count

      // Read first line
      line2 = bufRead2.readLine();

      // Read through file one line at time. Print line # and line
      while (line2 != null) {
        String[] a = line2.split("\t", 3);
        if (s.containsKey(k)) {
          s.get(k).add(a[1]);
        } else {
          s.put(k, new HashSet<String>());
          s.get(k).add(a[1]);
        }
        line2 = bufRead2.readLine();
        count++;
      }

      bufRead2.close();

      log.debug("done reading file");

      /*
      should clean up - note: files get overwritten, so don't worry about it. :-)
       */

    }

    if (context != null) context.setStatus("Postprocessing Blat output");
    /*
    post processing.  since i need to return in the format of
    <groupid> <readid1> <readid2> <readid3> ...
    as a single string (one string per line).
     */

    log.info("Postprocessing Blat");
    log.info("  numGroups = " + s.keySet().size());

    Set<String> ss = new HashSet<String>();

    for (String k : s.keySet()) {
      StringBuilder stringBuilder = new StringBuilder();
      for (Iterator iter = s.get(k).iterator(); iter.hasNext(); ) {
        stringBuilder.append(", " + iter.next());
      }
      ss.add(k + stringBuilder);
    }

    return ss;
  }