/**
 * Converts each row of the supplied table into a {@code MarkerDataType}.
 *
 * <p>Before any row is read, the two table specs held by this instance and the table itself are
 * passed through {@code checkNotNull}, each with its own message.
 *
 * @param inTable table whose rows are transformed, visited in iterator order
 * @return a list holding the result of {@code transformRow} for every row, in the same order
 */
@Override
public List<MarkerDataType> transform(final DataTable inTable) {
    checkNotNull(this.minimalTableSpec, "Minimal DataTableSpec hat to be set");
    checkNotNull(this.inputTableSpec, "Input DataTableSpec hat to be set");
    checkNotNull(inTable, "InTable has to be set");

    // 1000 is only a sizing hint for the backing list, not a limit on the row count.
    final List<MarkerDataType> transformed = Lists.newArrayListWithExpectedSize(1000);
    for (RowIterator rows = inTable.iterator(); rows.hasNext(); ) {
        transformed.add(transformRow(rows.next()));
    }
    return transformed;
}
/**
 * Runs one MUSCLE job per input row and appends the results to that row.
 *
 * <p>Each row must carry two list cells of equal length (1 to 1000 entries): the sequences and
 * their accessions. They are combined into an in-memory FASTA text, submitted through
 * {@code MuscleClient}, and after the job completes three cells are appended to the row: the job
 * id, the alignment cell built from the "aln-fasta" result, and the "out" result prefixed with
 * {@code <html><pre>}.
 *
 * {@inheritDoc}
 *
 * @throws Exception if the e-mail is still the default, a configured column cannot be found, a
 *     row's cells are not list cells, the list sizes are invalid, or anything called from here
 *     fails
 */
@Override
protected BufferedDataTable[] execute(
        final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    // m_email exposes getStringValue(), so it is a settings model rather than a String. If
    // DEFAULT_EMAIL is a plain String, m_email.equals(DEFAULT_EMAIL) can never be true and the
    // guard would be dead; comparing the stored value as well makes the check effective.
    // NOTE(review): DEFAULT_EMAIL's declaration is not visible here; the original comparison is
    // kept so behaviour is unchanged if it is a settings model instead.
    if (m_email.equals(DEFAULT_EMAIL) || DEFAULT_EMAIL.equals(m_email.getStringValue())) {
        throw new Exception(
                "You must set a valid E-Mail for EBI to contact you in the event of problems with the service!");
    }

    int n_rows = inData[0].getRowCount();
    int seq_idx =
            inData[0].getSpec().findColumnIndex(((SettingsModelString) m_seq_col).getStringValue());
    int accsn_idx =
            inData[0].getSpec().findColumnIndex(((SettingsModelString) m_accsn_col).getStringValue());
    if (seq_idx < 0 || accsn_idx < 0) {
        throw new Exception("Cannot find columns... valid data?");
    }
    int done = 0;

    // create the output columns (raw format for use with R)
    DataTableSpec outputSpec = new DataTableSpec(inData[0].getDataTableSpec(), make_output_spec());
    BufferedDataContainer container = exec.createDataContainer(outputSpec, false, 0);

    // instantiate MUSCLE client
    MuscleClient cli = new MuscleClient();

    // each row is a separate MUSCLE job, the sequences are in one collection cell, the accessions
    // (IDs) in the other
    RowIterator it = inData[0].iterator();
    while (it.hasNext()) {
        DataRow r = it.next();
        String rk = r.getKey().getString();

        // Check the cell types up front so a missing or non-list cell is reported with its row
        // key instead of surfacing as a bare ClassCastException.
        DataCell seqCell = r.getCell(seq_idx);
        DataCell accsnCell = r.getCell(accsn_idx);
        if (!(seqCell instanceof ListCell) || !(accsnCell instanceof ListCell)) {
            throw new Exception(
                    "Sequence and accession cells must both be list cells: error at row " + rk);
        }
        ListCell seqs = (ListCell) seqCell;
        ListCell accsns = (ListCell) accsnCell;

        if (seqs.size() != accsns.size()) {
            throw new Exception(
                    "Every sequence must have a corresponding accession: error at row " + rk);
        }
        if (seqs.size() < 1) {
            throw new Exception("Cannot MUSCLE zero sequences: error at row " + rk);
        }
        if (seqs.size() > 1000) {
            throw new Exception("Too many sequences in row " + rk);
        }

        // dummy a fake "FASTA" file (in memory) and then submit that to MUSCLE@EBI along with
        // other necessary parameters
        InputParameters ip = new InputParameters();
        ip.setSequence(buildMuscleFastaInput(seqs, accsns));

        // start the job; the row key is passed as the second argument of runApp
        String jobId = cli.runApp(m_email.getStringValue(), rk, ip);
        exec.checkCanceled();
        exec.setProgress(((double) done) / n_rows, "Executing " + jobId);
        Thread.sleep(20 * 1000); // 20 seconds
        waitForCompletion(cli, exec, jobId);
        done++;

        // process results and add them into the table...
        DataCell[] cells = new DataCell[3];
        cells[0] = new StringCell(jobId);

        // 1. fasta alignment data
        byte[] bytes = cli.getSrvProxy().getResult(jobId, "aln-fasta", null);
        String fasta = new String(bytes);

        // The phylip conversion result is not used in this method. The call is retained because
        // fasta2phylip is not visible from here and may validate the alignment or have side
        // effects. NOTE(review): remove the call if it turns out to be side-effect free.
        fasta2phylip(fasta);

        DataCell mac = AlignmentCellFactory.createCell(fasta, AlignmentType.AL_AA);
        if (mac instanceof MultiAlignmentCell) {
            // keep the alignment reachable by row key
            m_muscle_map.put(rk, (MultiAlignmentCell) mac);
        }
        cells[1] = mac;

        // 2. the job's "out" result, shown as preformatted HTML
        bytes = cli.getSrvProxy().getResult(jobId, "out", null);
        cells[2] = new StringCell("<html><pre>" + new String(bytes));

        container.addRowToTable(new JoinedRow(r, new DefaultRow(r.getKey(), cells)));
    }
    container.close();
    BufferedDataTable out = container.getTable();
    return new BufferedDataTable[] {out};
}

/**
 * Builds the FASTA text for one MUSCLE job: for every index a {@code >accession} header line
 * followed by the sequence line, each terminated by a newline.
 *
 * @param seqs sequences of one row
 * @param accsns accessions of the same row; the caller has verified it has the same size
 * @return the concatenated FASTA records
 */
private String buildMuscleFastaInput(final ListCell seqs, final ListCell accsns) {
    // Method-local buffer, so the unsynchronized StringBuilder is sufficient.
    final StringBuilder fasta = new StringBuilder();
    for (int i = 0; i < seqs.size(); i++) {
        fasta.append(">");
        fasta.append(accsns.get(i).toString());
        fasta.append("\n");
        fasta.append(seqs.get(i).toString());
        fasta.append("\n");
    }
    return fasta.toString();
}