public void map(LongWritable key, Text value, Mapper.Context context)
     throws IOException, InterruptedException {
   parser.parse(value);
   if (parser.isValidTemperature()) {
     int airTemperature = parser.getAirTemperature();
     if (airTemperature > 1000) {
       System.err.println("Temperature over 100 degrees for input: " + value);
       context.setStatus("Detected possibly corrupt record: see logs.");
       context.getCounter(Temperature.OVER_100).increment(1);
     }
     LOG.info("Map key:" + key);
     if (LOG.isDebugEnabled()) {
       LOG.debug("Map value" + value);
     }
     context.write(new Text(parser.getYear()), new IntWritable(airTemperature));
   }
 }
Пример #2
0
  /**
   * execute the blat command and return a list of sequence ids that match
   *
   * @param seqDatabase is the key/value map of sequences that act as reference keyed by name
   * @param seqQueryFilepath is the path the the blast output results
   * @return a list of sequence ids in the reference that match the cazy database
   */
  public Set<String> exec(
      Map<String, String> seqDatabase, String seqQueryFilepath, Mapper.Context context)
      throws IOException, InterruptedException {

    /*
    first, take the blatInputFile and find the corresponding sequence in the
    seqMap.  find both the exact sequence id, as well as its matching pair
    and write to temporary file.
    */
    // Map<String,String> l = new HashMap<String,String>();

    File seqQueryFile = null;

    log.info("Preparing Blat execution");
    if (context != null) context.setStatus("Preparing Blat execution");

    Map<String, String> l = new HashMap<String, String>();
    int numGroups = 0;
    int numReads = 0;

    /*
    open query file.
     */

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    for (Path filenamePath : MetaUtils.findAllPaths(new Path(seqQueryFilepath))) {
      if (!fs.exists(filenamePath)) {
        throw new IOException("file not found: " + seqQueryFilepath);
      }

      FSDataInputStream in = fs.open(filenamePath);
      BufferedReader bufRead = new BufferedReader(new InputStreamReader(in));

      /*
      Filter FileReader through a Buffered read to read a line at a time
      */

      String line = bufRead.readLine(); // String that holds current file line

      /*
      read the line into key/value with key being the first column, value is all the
      remaining columns
      */
      while (line != null) {
        numGroups++;
        String[] a = line.split("\t", 2);
        l.put(a[0], a[1]);
        numReads += a[1].split("\t").length;
        line = bufRead.readLine();
      }
      bufRead.close();
    }

    if (context != null)
      context.getCounter("blat.input", "NUMBER_OF_INPUT_READS").increment(numReads);
    if (context != null)
      context.getCounter("blat.input", "NUMBER_OF_INPUT_GROUPS").increment(numGroups);
    log.info("read " + numReads + " Reads in " + numGroups + " gene groups");

    /*
    now dump the database from the map to a file
     */
    String seqFilepath = dumpToFile(seqDatabase);

    if (seqFilepath == null) {
      /*
      return with fail
       */
      throw new IOException("unable to write " + seqDatabase + " to filesystem");
    }

    Map<String, Set<String>> s = new HashMap<String, Set<String>>();

    /*
    now loop through all the lines previously read in, write out a seqfile in temp directory
    then execute blat.
     */
    int numBlats = 0;
    int totalBlats = l.size();

    for (String k : l.keySet()) {
      numBlats++;

      /*
      k is a grouping key
       */

      log.info("processing group " + k);

      if (context != null) {
        context.setStatus("Executing Blat " + numBlats + "/" + totalBlats);
      }
      /*
      create a new file in temp direectory
       */
      seqQueryFile = new File(tmpDirFile, "blatquery.fa");
      BufferedWriter out = new BufferedWriter(new FileWriter(seqQueryFile.getPath()));

      /*
      look up all the sequences and write them to the file.  include the paired ends
       */
      int queryCount = 0;
      for (String key : l.get(k).split("\t")) {

        if (paired) {

          /*
          for paired end data, look for both pairs
           */

          String key1 = key + "/1"; // forward
          String key2 = key + "/2"; // backward

          if (seqDatabase.containsKey(key1)) {
            queryCount++;
            out.write(">" + key1 + "\n");
            out.write(seqDatabase.get(key1) + "\n");
          }
          if (seqDatabase.containsKey(key2)) {
            queryCount++;
            out.write(">" + key2 + "\n");
            out.write(seqDatabase.get(key2) + "\n");
          }
        } else {

          /*
          if data is not paired, just look up key
           */

          if (seqDatabase.containsKey(key)) {
            queryCount++;
            out.write(">" + key + "\n");
            out.write(seqDatabase.get(key) + "\n");
          }
        }
      }
      /*
      close the temporary file
      */
      out.close();

      if (queryCount == 0) {
        /*
        means that none of these queries were in this portion of the database.  no point
        executing blat, so just return
         */
        log.info("skipping blat since i didn't find any query sequences in this database");
        continue;
      }

      /*
      now set up a blat execution
       */
      List<String> commands = new ArrayList<String>();
      commands.add("/bin/sh");
      commands.add("-c");
      commands.add(
          commandPath
              + " "
              + commandLine
              + " "
              + seqFilepath
              + " "
              + seqQueryFile.getPath()
              + " "
              + tmpDirFile.getPath()
              + "/blat.output");

      log.info("command = " + commands);

      SystemCommandExecutor commandExecutor = new SystemCommandExecutor(commands);
      exitValue = commandExecutor.executeCommand();

      // stdout and stderr of the command are returned as StringBuilder objects
      stdout = commandExecutor.getStandardOutputFromCommand().toString();
      stderr = commandExecutor.getStandardErrorFromCommand().toString();

      log.debug("exit = " + exitValue);
      log.debug("stdout = " + stdout);
      log.debug("stderr = " + stderr);

      /*
      now parse the output and clean up
      */

      log.debug("reading outputfile: " + tmpDirFile.getPath() + "/blat.output");

      FileReader input = new FileReader(tmpDirFile.getPath() + "/blat.output");

      /*
      Filter FileReader through a Buffered read to read a line at a time
      */
      BufferedReader bufRead2 = new BufferedReader(input);

      String line2; // String that holds current file line
      int count = 0; // Line number of count

      // Read first line
      line2 = bufRead2.readLine();

      // Read through file one line at time. Print line # and line
      while (line2 != null) {
        String[] a = line2.split("\t", 3);
        if (s.containsKey(k)) {
          s.get(k).add(a[1]);
        } else {
          s.put(k, new HashSet<String>());
          s.get(k).add(a[1]);
        }
        line2 = bufRead2.readLine();
        count++;
      }

      bufRead2.close();

      log.debug("done reading file");

      /*
      should clean up - note: files get overwritten, so don't worry about it. :-)
       */

    }

    if (context != null) context.setStatus("Postprocessing Blat output");
    /*
    post processing.  since i need to return in the format of
    <groupid> <readid1> <readid2> <readid3> ...
    as a single string (one string per line).
     */

    log.info("Postprocessing Blat");
    log.info("  numGroups = " + s.keySet().size());

    Set<String> ss = new HashSet<String>();

    for (String k : s.keySet()) {
      StringBuilder stringBuilder = new StringBuilder();
      for (Iterator iter = s.get(k).iterator(); iter.hasNext(); ) {
        stringBuilder.append(", " + iter.next());
      }
      ss.add(k + stringBuilder);
    }

    return ss;
  }