/**
 * Parses one input record and, when the parser reports a valid temperature, emits a
 * (year, air temperature) pair.
 *
 * <p>Readings above 1000 are treated as suspicious: the record is echoed to standard error, the
 * task status is updated and the {@code Temperature.OVER_100} counter is incremented. The record
 * is still emitted afterwards. (The threshold of 1000 paired with the "100 degrees" message
 * suggests the parser reports tenths of a degree -- TODO confirm against the parser.)
 *
 * @param key input key; only logged
 * @param value raw record handed to the parser
 * @param context task context used for status, counters and output
 * @throws IOException propagated from writing the output pair
 * @throws InterruptedException propagated from writing the output pair
 */
public void map(LongWritable key, Text value, Mapper.Context context)
    throws IOException, InterruptedException {
  parser.parse(value);

  // Nothing to emit for records without a usable temperature.
  if (!parser.isValidTemperature()) {
    return;
  }

  final int reading = parser.getAirTemperature();
  final boolean suspicious = reading > 1000;
  if (suspicious) {
    System.err.println("Temperature over 100 degrees for input: " + value);
    context.setStatus("Detected possibly corrupt record: see logs.");
    context.getCounter(Temperature.OVER_100).increment(1);
  }

  LOG.info("Map key:" + key);
  // Guarded so the value is only stringified when debug output is actually on.
  if (LOG.isDebugEnabled()) {
    LOG.debug("Map value" + value);
  }

  Text year = new Text(parser.getYear());
  IntWritable temperature = new IntWritable(reading);
  context.write(year, temperature);
}
/** * execute the blat command and return a list of sequence ids that match * * @param seqDatabase is the key/value map of sequences that act as reference keyed by name * @param seqQueryFilepath is the path the the blast output results * @return a list of sequence ids in the reference that match the cazy database */ public Set<String> exec( Map<String, String> seqDatabase, String seqQueryFilepath, Mapper.Context context) throws IOException, InterruptedException { /* first, take the blatInputFile and find the corresponding sequence in the seqMap. find both the exact sequence id, as well as its matching pair and write to temporary file. */ // Map<String,String> l = new HashMap<String,String>(); File seqQueryFile = null; log.info("Preparing Blat execution"); if (context != null) context.setStatus("Preparing Blat execution"); Map<String, String> l = new HashMap<String, String>(); int numGroups = 0; int numReads = 0; /* open query file. */ Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); for (Path filenamePath : MetaUtils.findAllPaths(new Path(seqQueryFilepath))) { if (!fs.exists(filenamePath)) { throw new IOException("file not found: " + seqQueryFilepath); } FSDataInputStream in = fs.open(filenamePath); BufferedReader bufRead = new BufferedReader(new InputStreamReader(in)); /* Filter FileReader through a Buffered read to read a line at a time */ String line = bufRead.readLine(); // String that holds current file line /* read the line into key/value with key being the first column, value is all the remaining columns */ while (line != null) { numGroups++; String[] a = line.split("\t", 2); l.put(a[0], a[1]); numReads += a[1].split("\t").length; line = bufRead.readLine(); } bufRead.close(); } if (context != null) context.getCounter("blat.input", "NUMBER_OF_INPUT_READS").increment(numReads); if (context != null) context.getCounter("blat.input", "NUMBER_OF_INPUT_GROUPS").increment(numGroups); log.info("read " + numReads + " Reads in " + 
numGroups + " gene groups"); /* now dump the database from the map to a file */ String seqFilepath = dumpToFile(seqDatabase); if (seqFilepath == null) { /* return with fail */ throw new IOException("unable to write " + seqDatabase + " to filesystem"); } Map<String, Set<String>> s = new HashMap<String, Set<String>>(); /* now loop through all the lines previously read in, write out a seqfile in temp directory then execute blat. */ int numBlats = 0; int totalBlats = l.size(); for (String k : l.keySet()) { numBlats++; /* k is a grouping key */ log.info("processing group " + k); if (context != null) { context.setStatus("Executing Blat " + numBlats + "/" + totalBlats); } /* create a new file in temp direectory */ seqQueryFile = new File(tmpDirFile, "blatquery.fa"); BufferedWriter out = new BufferedWriter(new FileWriter(seqQueryFile.getPath())); /* look up all the sequences and write them to the file. include the paired ends */ int queryCount = 0; for (String key : l.get(k).split("\t")) { if (paired) { /* for paired end data, look for both pairs */ String key1 = key + "/1"; // forward String key2 = key + "/2"; // backward if (seqDatabase.containsKey(key1)) { queryCount++; out.write(">" + key1 + "\n"); out.write(seqDatabase.get(key1) + "\n"); } if (seqDatabase.containsKey(key2)) { queryCount++; out.write(">" + key2 + "\n"); out.write(seqDatabase.get(key2) + "\n"); } } else { /* if data is not paired, just look up key */ if (seqDatabase.containsKey(key)) { queryCount++; out.write(">" + key + "\n"); out.write(seqDatabase.get(key) + "\n"); } } } /* close the temporary file */ out.close(); if (queryCount == 0) { /* means that none of these queries were in this portion of the database. 
no point executing blat, so just return */ log.info("skipping blat since i didn't find any query sequences in this database"); continue; } /* now set up a blat execution */ List<String> commands = new ArrayList<String>(); commands.add("/bin/sh"); commands.add("-c"); commands.add( commandPath + " " + commandLine + " " + seqFilepath + " " + seqQueryFile.getPath() + " " + tmpDirFile.getPath() + "/blat.output"); log.info("command = " + commands); SystemCommandExecutor commandExecutor = new SystemCommandExecutor(commands); exitValue = commandExecutor.executeCommand(); // stdout and stderr of the command are returned as StringBuilder objects stdout = commandExecutor.getStandardOutputFromCommand().toString(); stderr = commandExecutor.getStandardErrorFromCommand().toString(); log.debug("exit = " + exitValue); log.debug("stdout = " + stdout); log.debug("stderr = " + stderr); /* now parse the output and clean up */ log.debug("reading outputfile: " + tmpDirFile.getPath() + "/blat.output"); FileReader input = new FileReader(tmpDirFile.getPath() + "/blat.output"); /* Filter FileReader through a Buffered read to read a line at a time */ BufferedReader bufRead2 = new BufferedReader(input); String line2; // String that holds current file line int count = 0; // Line number of count // Read first line line2 = bufRead2.readLine(); // Read through file one line at time. Print line # and line while (line2 != null) { String[] a = line2.split("\t", 3); if (s.containsKey(k)) { s.get(k).add(a[1]); } else { s.put(k, new HashSet<String>()); s.get(k).add(a[1]); } line2 = bufRead2.readLine(); count++; } bufRead2.close(); log.debug("done reading file"); /* should clean up - note: files get overwritten, so don't worry about it. :-) */ } if (context != null) context.setStatus("Postprocessing Blat output"); /* post processing. since i need to return in the format of <groupid> <readid1> <readid2> <readid3> ... as a single string (one string per line). 
*/ log.info("Postprocessing Blat"); log.info(" numGroups = " + s.keySet().size()); Set<String> ss = new HashSet<String>(); for (String k : s.keySet()) { StringBuilder stringBuilder = new StringBuilder(); for (Iterator iter = s.get(k).iterator(); iter.hasNext(); ) { stringBuilder.append(", " + iter.next()); } ss.add(k + stringBuilder); } return ss; }