Exemplo n.º 1
0
  /**
   * Converts aligned FASTA sequences into PHYLIP format: a header line holding the number of
   * sequences and the aligned sequence length, followed by one line per sequence consisting of the
   * identifier (truncated or space-padded to exactly ten characters) immediately followed by the
   * sequence residues.
   *
   * <p>Sequence data that spans several FASTA lines is concatenated into one line. Any text that
   * appears before the first {@code >} header does not belong to a record and is ignored. The
   * reported length is the length of the last record in the input; for a valid alignment every
   * record has the same length.
   *
   * @param fasta_aligned_sequences the aligned sequences in FASTA format, must not be null
   * @return the phylip formatted result
   * @throws IOException if reading from the in-memory text fails
   */
  protected String fasta2phylip(String fasta_aligned_sequences) throws IOException {
    StringBuilder records = new StringBuilder();
    StringBuilder residues = new StringBuilder();
    int count = 0;
    int length = 0;
    String cur_id = null;

    BufferedReader rdr = new BufferedReader(new StringReader(fasta_aligned_sequences));
    try {
      String line;
      while ((line = rdr.readLine()) != null) {
        if (line.startsWith(">")) {
          // a new header terminates the record collected so far (if there is one)
          if (cur_id != null) {
            records.append(fasta_accession2phylip(cur_id)).append(residues).append("\n");
            length = residues.length();
            residues.setLength(0);
          }
          count++;
          cur_id = line.trim().substring(1);
        } else if (cur_id != null) {
          residues.append(line.trim());
        }
      }
    } finally {
      rdr.close();
    }

    // Flush the last record exactly like the others. This also records its length, so an input
    // with a single sequence no longer reports a length of zero, and keeps the number of emitted
    // rows equal to the count written in the header.
    if (cur_id != null) {
      records.append(fasta_accession2phylip(cur_id)).append(residues).append("\n");
      length = residues.length();
    }
    return " " + count + " " + length + "\n" + records.toString();
  }
Exemplo n.º 2
0
 /**
  * Turns an accession into a phylip identifier of exactly ten characters: longer accessions are
  * cut down to their first ten characters, shorter ones are right-padded with spaces.
  *
  * <p>No uniqueness check is made here, so accessions need to be distinguishable within their
  * first ten characters.
  *
  * @param accsn the accession to convert, must not be null
  * @return the ten character identifier
  */
 protected String fasta_accession2phylip(String accsn) {
   final int width = 10;
   if (accsn.length() > width) {
     return accsn.substring(0, width);
   }
   StringBuilder padded = new StringBuilder(accsn);
   for (int i = accsn.length(); i < width; i++) {
     padded.append(' ');
   }
   return padded.toString();
 }
Exemplo n.º 3
0
  /**
   * {@inheritDoc}
   *
   * <p>Submits one MUSCLE job per input row. The row's sequence collection cell and accession
   * collection cell are combined into an in-memory FASTA document, the job is run through the
   * {@code MuscleClient}, and the input row is extended with three cells: the job id, the aligned
   * sequences as an alignment cell, and the job's textual output.
   *
   * @throws Exception if no e-mail address other than the default has been configured, if the
   *     configured columns cannot be found, if a row has mismatching, zero or more than 1000
   *     sequences, or if job submission / result retrieval fails
   */
  @Override
  protected BufferedDataTable[] execute(
      final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    // m_email is a settings model (its value is read with getStringValue() when the job is
    // submitted), so the configured string - not the model object - has to be compared with the
    // default. NOTE(review): assumes DEFAULT_EMAIL is the default address as a String - confirm.
    final String email = m_email.getStringValue();
    if (email == null || email.equals(DEFAULT_EMAIL)) {
      throw new Exception(
          "You must set a valid E-Mail for EBI to contact you in the event of problems with the service!");
    }
    int n_rows = inData[0].getRowCount();
    int seq_idx =
        inData[0].getSpec().findColumnIndex(((SettingsModelString) m_seq_col).getStringValue());
    int accsn_idx =
        inData[0].getSpec().findColumnIndex(((SettingsModelString) m_accsn_col).getStringValue());
    if (seq_idx < 0 || accsn_idx < 0) {
      throw new Exception("Cannot find columns... valid data?");
    }
    int done = 0;

    // output table = input columns followed by the columns from make_output_spec()
    DataTableSpec outputSpec = new DataTableSpec(inData[0].getDataTableSpec(), make_output_spec());
    BufferedDataContainer container = exec.createDataContainer(outputSpec, false, 0);

    // instantiate MUSCLE client
    MuscleClient cli = new MuscleClient();

    // each row is a separate MUSCLE job, the sequences are in one collection cell, the accessions
    // (IDs) in the other
    RowIterator it = inData[0].iterator();
    while (it.hasNext()) {
      DataRow r = it.next();
      ListCell seqs = (ListCell) r.getCell(seq_idx);
      ListCell accsns = (ListCell) r.getCell(accsn_idx);
      if (seqs.size() != accsns.size()) {
        throw new Exception(
            "Every sequence must have a corresponding accession: error at row "
                + r.getKey().getString());
      }
      if (seqs.size() < 1) {
        throw new Exception("Cannot MUSCLE zero sequences: error at row " + r.getKey().getString());
      }
      if (seqs.size() > 1000) {
        throw new Exception("Too many sequences in row " + r.getKey().getString());
      }
      // dummy a fake "FASTA" file (in memory) and then submit that to MUSCLE@EBI along with other
      // necessary parameters
      StringBuffer seq_as_fasta = new StringBuffer();
      for (int i = 0; i < seqs.size(); i++) {
        seq_as_fasta.append(">");
        seq_as_fasta.append(accsns.get(i).toString());
        seq_as_fasta.append("\n");
        seq_as_fasta.append(seqs.get(i).toString());
        seq_as_fasta.append("\n");
      }

      // lodge the muscle job and store the results in the output table
      InputParameters ip = new InputParameters();
      ip.setSequence(seq_as_fasta.toString());

      // start the job; the row key is passed along as the second argument
      String jobId = cli.runApp(email, r.getKey().getString(), ip);

      exec.checkCanceled();
      exec.setProgress(((double) done) / n_rows, "Executing " + jobId);
      // pause for 20 seconds before waiting on the job
      Thread.sleep(20 * 1000);
      waitForCompletion(cli, exec, jobId);
      done++;

      // process results and add them into the table...
      // 1. fasta alignment data
      byte[] bytes = cli.getSrvProxy().getResult(jobId, "aln-fasta", null);

      DataCell[] cells = new DataCell[3];
      cells[0] = new StringCell(jobId);

      // 2. the aligned FASTA wrapped in an alignment cell; multi-alignment results are also
      // remembered in m_muscle_map under the row key
      String fasta = new String(bytes);
      String rk = r.getKey().getString();
      DataCell mac = AlignmentCellFactory.createCell(fasta, AlignmentType.AL_AA);
      if (mac instanceof MultiAlignmentCell) {
        m_muscle_map.put(rk, (MultiAlignmentCell) mac);
      }
      cells[1] = mac;

      // 3. the job's "out" result, prefixed with html/pre markup
      bytes = cli.getSrvProxy().getResult(jobId, "out", null);
      cells[2] = new StringCell("<html><pre>" + new String(bytes));

      container.addRowToTable(new JoinedRow(r, new DefaultRow(r.getKey(), cells)));
    }
    container.close();
    BufferedDataTable out = container.getTable();
    return new BufferedDataTable[] {out};
  }