/**
 * Releases the temporary working directory held by this command.
 *
 * <p>Does nothing when no working directory is currently held. Otherwise the directory is
 * deleted recursively only if {@code doCleanup} is set, and the reference is cleared either way
 * so that a repeated call is a no-op.
 */
public void cleanup() {
  if (tmpDirFile == null) {
    return;
  }
  if (doCleanup) {
    MetaUtils.recursiveDelete(tmpDirFile);
  }
  tmpDirFile = null;
}
/**
 * Creates a blat command that uses the compiled-in defaults only.
 *
 * <p>The command line, command path and temporary directory are taken from
 * {@code DEFAULTCOMMANDLINE}, {@code DEFAULTCOMMANDPATH} and {@code DEFAULTTMPDIR}; no
 * configuration is consulted. A fresh working directory prefixed {@code blat_} is then created
 * inside the temporary directory.
 *
 * @throws IOException declared for the working-directory creation step
 */
public BlatCommand() throws IOException {
  this.commandPath = DEFAULTCOMMANDPATH;
  this.commandLine = DEFAULTCOMMANDLINE;
  this.tmpDir = DEFAULTTMPDIR;
  this.tmpDirFile = MetaUtils.createTempDir("blat_", this.tmpDir);
}
public DataBag exec(Tuple input) throws IOException { DataBag output = DefaultBagFactory.getInstance().newDefaultBag(); if (input == null || input.size() == 0) return null; try { String seq = ((String) input.get(0)); // byte[] ba = ((DataByteArray) input.get(0)).get(); int distance = (Integer) input.get(1); // int seqLength = SequenceString.numBases(ba); // String seq = SequenceString.byteArrayToSequence(ba); Set<String> neighbors = MetaUtils.generateAllNeighborsWithinDistance(seq, distance); for (String n : neighbors) { Tuple t = DefaultTupleFactory.getInstance().newTuple(1); t.set(0, n); output.add(t); } } catch (Exception e) { System.err.println("HammingDistance: failed to process input; error - " + e.getMessage()); return null; } return output; }
/** * new blast command based on values stored in the configuration. * * <p>Looks for the following config values: blast.commandline, blast.commandpath, and * blast.tmpdir * * @param config is the hadoop configuration with overriding values for commandline options and * paths */ public BlatCommand(Configuration config) throws IOException { String c; if ((c = config.get("blat.commandline")) != null) { commandLine = c; } else { commandLine = DEFAULTCOMMANDLINE; } if ((c = config.get("blat.commandpath")) != null) { commandPath = c; } else { commandPath = DEFAULTCOMMANDPATH; } if ((c = config.get("blat.tmpdir")) != null) { tmpDir = c; } else { tmpDir = DEFAULTTMPDIR; } doCleanup = config.getBoolean("blat.cleanup", true); paired = config.getBoolean("blat.paired", true); /* do sanity check to make sure all paths exist */ // checkFileExists(commandLine); // checkFileExists(commandPath); // checkDirExists(tmpDir); /* if all is good, create a working space inside tmpDir */ tmpDirFile = MetaUtils.createTempDir("blat_", tmpDir); }
/** * execute the blat command and return a list of sequence ids that match * * @param seqDatabase is the key/value map of sequences that act as reference keyed by name * @param seqQueryFilepath is the path the the blast output results * @return a list of sequence ids in the reference that match the cazy database */ public Set<String> exec( Map<String, String> seqDatabase, String seqQueryFilepath, Mapper.Context context) throws IOException, InterruptedException { /* first, take the blatInputFile and find the corresponding sequence in the seqMap. find both the exact sequence id, as well as its matching pair and write to temporary file. */ // Map<String,String> l = new HashMap<String,String>(); File seqQueryFile = null; log.info("Preparing Blat execution"); if (context != null) context.setStatus("Preparing Blat execution"); Map<String, String> l = new HashMap<String, String>(); int numGroups = 0; int numReads = 0; /* open query file. */ Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); for (Path filenamePath : MetaUtils.findAllPaths(new Path(seqQueryFilepath))) { if (!fs.exists(filenamePath)) { throw new IOException("file not found: " + seqQueryFilepath); } FSDataInputStream in = fs.open(filenamePath); BufferedReader bufRead = new BufferedReader(new InputStreamReader(in)); /* Filter FileReader through a Buffered read to read a line at a time */ String line = bufRead.readLine(); // String that holds current file line /* read the line into key/value with key being the first column, value is all the remaining columns */ while (line != null) { numGroups++; String[] a = line.split("\t", 2); l.put(a[0], a[1]); numReads += a[1].split("\t").length; line = bufRead.readLine(); } bufRead.close(); } if (context != null) context.getCounter("blat.input", "NUMBER_OF_INPUT_READS").increment(numReads); if (context != null) context.getCounter("blat.input", "NUMBER_OF_INPUT_GROUPS").increment(numGroups); log.info("read " + numReads + " Reads in " + 
numGroups + " gene groups"); /* now dump the database from the map to a file */ String seqFilepath = dumpToFile(seqDatabase); if (seqFilepath == null) { /* return with fail */ throw new IOException("unable to write " + seqDatabase + " to filesystem"); } Map<String, Set<String>> s = new HashMap<String, Set<String>>(); /* now loop through all the lines previously read in, write out a seqfile in temp directory then execute blat. */ int numBlats = 0; int totalBlats = l.size(); for (String k : l.keySet()) { numBlats++; /* k is a grouping key */ log.info("processing group " + k); if (context != null) { context.setStatus("Executing Blat " + numBlats + "/" + totalBlats); } /* create a new file in temp direectory */ seqQueryFile = new File(tmpDirFile, "blatquery.fa"); BufferedWriter out = new BufferedWriter(new FileWriter(seqQueryFile.getPath())); /* look up all the sequences and write them to the file. include the paired ends */ int queryCount = 0; for (String key : l.get(k).split("\t")) { if (paired) { /* for paired end data, look for both pairs */ String key1 = key + "/1"; // forward String key2 = key + "/2"; // backward if (seqDatabase.containsKey(key1)) { queryCount++; out.write(">" + key1 + "\n"); out.write(seqDatabase.get(key1) + "\n"); } if (seqDatabase.containsKey(key2)) { queryCount++; out.write(">" + key2 + "\n"); out.write(seqDatabase.get(key2) + "\n"); } } else { /* if data is not paired, just look up key */ if (seqDatabase.containsKey(key)) { queryCount++; out.write(">" + key + "\n"); out.write(seqDatabase.get(key) + "\n"); } } } /* close the temporary file */ out.close(); if (queryCount == 0) { /* means that none of these queries were in this portion of the database. 
no point executing blat, so just return */ log.info("skipping blat since i didn't find any query sequences in this database"); continue; } /* now set up a blat execution */ List<String> commands = new ArrayList<String>(); commands.add("/bin/sh"); commands.add("-c"); commands.add( commandPath + " " + commandLine + " " + seqFilepath + " " + seqQueryFile.getPath() + " " + tmpDirFile.getPath() + "/blat.output"); log.info("command = " + commands); SystemCommandExecutor commandExecutor = new SystemCommandExecutor(commands); exitValue = commandExecutor.executeCommand(); // stdout and stderr of the command are returned as StringBuilder objects stdout = commandExecutor.getStandardOutputFromCommand().toString(); stderr = commandExecutor.getStandardErrorFromCommand().toString(); log.debug("exit = " + exitValue); log.debug("stdout = " + stdout); log.debug("stderr = " + stderr); /* now parse the output and clean up */ log.debug("reading outputfile: " + tmpDirFile.getPath() + "/blat.output"); FileReader input = new FileReader(tmpDirFile.getPath() + "/blat.output"); /* Filter FileReader through a Buffered read to read a line at a time */ BufferedReader bufRead2 = new BufferedReader(input); String line2; // String that holds current file line int count = 0; // Line number of count // Read first line line2 = bufRead2.readLine(); // Read through file one line at time. Print line # and line while (line2 != null) { String[] a = line2.split("\t", 3); if (s.containsKey(k)) { s.get(k).add(a[1]); } else { s.put(k, new HashSet<String>()); s.get(k).add(a[1]); } line2 = bufRead2.readLine(); count++; } bufRead2.close(); log.debug("done reading file"); /* should clean up - note: files get overwritten, so don't worry about it. :-) */ } if (context != null) context.setStatus("Postprocessing Blat output"); /* post processing. since i need to return in the format of <groupid> <readid1> <readid2> <readid3> ... as a single string (one string per line). 
*/ log.info("Postprocessing Blat"); log.info(" numGroups = " + s.keySet().size()); Set<String> ss = new HashSet<String>(); for (String k : s.keySet()) { StringBuilder stringBuilder = new StringBuilder(); for (Iterator iter = s.get(k).iterator(); iter.hasNext(); ) { stringBuilder.append(", " + iter.next()); } ss.add(k + stringBuilder); } return ss; }