Ejemplo n.º 1
0
  /**
   * Loads a biological assembly of a PDB entry, optionally falling back to the asymmetric unit.
   *
   * <p>The request is delegated to {@code StructureIO.getBiologicalAssembly} with this object as
   * the cache. The number of biological assemblies varies per entry: many entries have none
   * (e.g. NMR structures), many have exactly one (bioAssemblyId=1), and a few have several. When
   * the delegate yields {@code null} and {@code bioAssemblyFallback} is true, the request is
   * repeated with assembly id 0, i.e. the original PDB file.
   *
   * @param pdbId the PDB ID
   * @param bioAssemblyId the 1-based index of the biological assembly (0 gets the asymmetric unit)
   * @param bioAssemblyFallback if true, try reading the original PDB file when the requested
   *     biological assembly is not available
   * @return the loaded structure; may be {@code null} when the delegate returns {@code null} and
   *     no fallback is requested
   * @throws IOException propagated from the underlying load
   * @throws StructureException if {@code bioAssemblyId} is negative, or propagated from the
   *     underlying load
   * @author Peter Rose
   * @since 3.2
   */
  public Structure getBiologicalAssembly(
      String pdbId, int bioAssemblyId, boolean bioAssemblyFallback)
      throws StructureException, IOException {

    // Reject negative ids up front; 0 is valid and denotes the asymmetric unit.
    boolean negativeId = bioAssemblyId < 0;
    if (negativeId) {
      String message =
          "bioAssemblyID must be nonnegative: " + pdbId + " bioAssemblyId " + bioAssemblyId;
      throw new StructureException(message);
    }

    Structure assembly = StructureIO.getBiologicalAssembly(pdbId, bioAssemblyId, this);

    // Hand back whatever was loaded unless it is missing AND the caller asked for a fallback.
    if (assembly != null || !bioAssemblyFallback) {
      return assembly;
    }

    return StructureIO.getBiologicalAssembly(pdbId, 0, this);
  }
  /**
   * Writes the given PDB entries to a Hadoop sequence file.
   *
   * <p>Each ID is loaded with {@code StructureIO.getStructure}. Entries that throw while loading
   * or load as {@code null} are counted as failures and skipped; entries without chains are
   * skipped silently. Every remaining structure is passed to {@code append}, and its return value
   * is added to the chain total. The file is created with {@code Text} keys, {@code
   * IntArrayWritable} values and block-level BZip2 compression.
   *
   * @param fileName path of the sequence file to create
   * @param pdbIds the PDB IDs to process
   * @param verbose if true, print each ID as it is processed and a summary at the end
   * @throws IOException propagated from creating the writer, from {@code append}, or from closing
   *     the writer
   */
  public static void toSequenceFile(String fileName, Collection<String> pdbIds, boolean verbose)
      throws IOException {

    int failure = 0;
    int success = 0;
    int chains = 0;

    // The writer is closed exactly once, by try-with-resources, so an IOException thrown by
    // close() reaches the caller instead of being discarded by a quiet-close helper.
    try (SequenceFile.Writer writer =
        SequenceFile.createWriter(
            new Configuration(),
            SequenceFile.Writer.file(new Path(fileName)),
            SequenceFile.Writer.keyClass(Text.class),
            SequenceFile.Writer.valueClass(IntArrayWritable.class),
            SequenceFile.Writer.compression(
                SequenceFile.CompressionType.BLOCK, new BZip2Codec()))) {
      for (String pdbId : pdbIds) {
        if (verbose) {
          System.out.println(pdbId);
        }

        Structure s;
        try {
          s = StructureIO.getStructure(pdbId);
        } catch (Exception e) {
          // Some files can't be read. Deliberately best effort: report which entry failed and
          // carry on with the next one.
          System.err.println("could not load structure: " + pdbId);
          e.printStackTrace();
          failure++;
          continue;
        }

        if (s == null) {
          // A null result is not a usable structure, so it counts as a failure.
          System.err.println("structure null: " + pdbId);
          failure++;
          continue;
        }
        success++;

        // Loaded fine but there is nothing to write.
        if (s.getChains().isEmpty()) {
          continue;
        }

        chains += append(writer, pdbId, s);
      }
    }

    if (verbose) {
      System.out.println("Total structures: " + pdbIds.size());
      System.out.println("Success: " + success);
      System.out.println("Failure: " + failure);
      System.out.println("Chains: " + chains);
    }
  }
  /**
   * Command-line entry point: writes all entries returned by {@code getAll()} to a time-stamped
   * sequence file and prints the elapsed time.
   *
   * <p>The output file is named {@code <args[0]>_<yyyyMMdd_HHmmss>.seq}.
   *
   * @param args {@code args[0]} is the output file prefix (required); {@code args[1]} is an
   *     optional atom-cache directory that replaces the built-in default path
   * @throws IOException propagated from {@code getAll()} or {@code toSequenceFile}
   * @throws IllegalArgumentException if no output file prefix is given
   */
  public static void main(String[] args) throws IOException {
    // Fail with a usage hint rather than a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1) {
      throw new IllegalArgumentException("Usage: <outputFilePrefix> [atomCacheDirectory]");
    }

    String timeStamp =
        new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime());

    String uri = args[0] + "_" + timeStamp + ".seq";

    // The default is kept for backward compatibility; it only exists on the original author's
    // machine, so other users should pass the cache directory as the second argument.
    String cachePath =
        args.length > 1 ? args[1] : "/Users/Chris/Documents/RCSB/Data/Protein_chains/cache/";

    Set<String> pdbIds = getAll();

    StructureIO.setAtomCache(cache);
    cache.setPath(cachePath);

    long start = System.nanoTime();
    toSequenceFile(uri, pdbIds, true);
    long end = System.nanoTime();

    System.out.println("Time: " + (end - start) / 1E9 + " sec.");
  }