/** * Converts the fasta sequences into phylip format: <number of sequences> <length of aligned * sequences> <id - exactly ten-space character padded><sequence for id> ... * * @param fasta_aligned_sequences * @return the phylip formatted result */ protected String fasta2phylip(String fasta_aligned_sequences) throws IOException { StringBuffer seq = new StringBuffer(); int count = 0; int length = 0; boolean prev = false; BufferedReader rdr = new BufferedReader(new StringReader(fasta_aligned_sequences)); String line; String cur_id = null; StringBuffer tmp = new StringBuffer(); while ((line = rdr.readLine()) != null) { // System.err.println(cur_id+" "+line); if (line.startsWith(">")) { count++; if (prev) { seq.append(fasta_accession2phylip(cur_id) + tmp + "\n"); cur_id = line.trim().substring(1); length = tmp.length(); tmp = new StringBuffer(); } else { prev = true; cur_id = line.trim().substring(1); } } else { tmp.append(line.trim()); } } // dont forget the last sequence if (tmp.length() > 0) { seq.append(fasta_accession2phylip(cur_id) + tmp + "\n"); } return " " + count + " " + length + "\n" + seq.toString(); }
/** * Ensures an accession from MUSCLE is in a format suitable for phylip. Each accession must be * unique to 10chars (sigh) */ protected String fasta_accession2phylip(String accsn) { if (accsn.length() > 10) { return accsn.substring(0, 10); } else { StringBuffer tmp = new StringBuffer(accsn); int n_spaces = 10 - accsn.length(); while (n_spaces-- > 0) { tmp.append(" "); } return tmp.toString(); } }
/** {@inheritDoc} */ @Override protected BufferedDataTable[] execute( final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception { if (m_email.equals(DEFAULT_EMAIL)) { throw new Exception( "You must set a valid E-Mail for EBI to contact you in the event of problems with the service!"); } int n_rows = inData[0].getRowCount(); int seq_idx = inData[0].getSpec().findColumnIndex(((SettingsModelString) m_seq_col).getStringValue()); int accsn_idx = inData[0].getSpec().findColumnIndex(((SettingsModelString) m_accsn_col).getStringValue()); if (seq_idx < 0 || accsn_idx < 0) { throw new Exception("Cannot find columns... valid data?"); } int done = 0; // create the output columns (raw format for use with R) DataTableSpec outputSpec = new DataTableSpec(inData[0].getDataTableSpec(), make_output_spec()); BufferedDataContainer container = exec.createDataContainer(outputSpec, false, 0); // instantiate MUSCLE client MuscleClient cli = new MuscleClient(); // each row is a separate MUSCLE job, the sequences are in one collection cell, the accessions // (IDs) in the other RowIterator it = inData[0].iterator(); while (it.hasNext()) { DataRow r = it.next(); ListCell seqs = (ListCell) r.getCell(seq_idx); ListCell accsns = (ListCell) r.getCell(accsn_idx); if (seqs.size() != accsns.size()) { throw new Exception( "Every sequence must have a corresponding accession: error at row " + r.getKey().getString()); } if (seqs.size() < 1) { throw new Exception("Cannot MUSCLE zero sequences: error at row " + r.getKey().getString()); } if (seqs.size() > 1000) { throw new Exception("Too many sequences in row " + r.getKey().getString()); } // dummy a fake "FASTA" file (in memory) and then submit that to MUSCLE@EBI along with other // necessary parameters StringBuffer seq_as_fasta = new StringBuffer(); for (int i = 0; i < seqs.size(); i++) { seq_as_fasta.append(">"); seq_as_fasta.append(accsns.get(i).toString()); seq_as_fasta.append("\n"); seq_as_fasta.append(seqs.get(i).toString()); seq_as_fasta.append("\n"); } // System.err.println(seq_as_fasta); // lodge the muscle job and store the results in the output table InputParameters ip = new InputParameters(); ip.setSequence(seq_as_fasta.toString()); // start the job String jobId = cli.runApp(m_email.getStringValue(), r.getKey().getString(), ip); exec.checkCanceled(); exec.setProgress(((double) done) / n_rows, "Executing " + jobId); Thread.sleep(20 * 1000); // 20 seconds waitForCompletion(cli, exec, jobId); done++; // process results and add them into the table... // 1. fasta alignment data byte[] bytes = cli.getSrvProxy().getResult(jobId, "aln-fasta", null); DataCell[] cells = new DataCell[3]; cells[0] = new StringCell(jobId); // compute the base64 encoded phylip aligned sequences suitable for use by R's phangorn // package String fasta = new String(bytes); String ret = fasta2phylip(fasta); // it must be encoded (I chose base64) as it is common to both Java and R and it must be // encoded due to containing multiple lines, which confuses the CSV passed between KNIME and R String rk = r.getKey().getString(); DataCell mac = AlignmentCellFactory.createCell(fasta, AlignmentType.AL_AA); if (mac instanceof MultiAlignmentCell) m_muscle_map.put(rk, (MultiAlignmentCell) mac); cells[1] = mac; bytes = cli.getSrvProxy().getResult(jobId, "out", null); cells[2] = new StringCell("<html><pre>" + new String(bytes)); container.addRowToTable(new JoinedRow(r, new DefaultRow(r.getKey(), cells))); } container.close(); BufferedDataTable out = container.getTable(); return new BufferedDataTable[] {out}; }