Beispiel #1
1
  /**
   * Print an alignment map in a concise representation. Edges are given as two numbers separated by
   * '>'. They are chained together where possible, or separated by spaces where disjoint or
   * branched.
   *
   * <p>Note that more concise representations may be possible. Examples:
   *
   * <ul>
   *   <li>1>2>3>1
   *   <li>1>2>3>2 4>3
   * </ul>
   *
   * @param alignment The input function, as a map (see {@link
   *     AlignmentTools#alignmentAsMap(AFPChain)}). The map is copied and never modified.
   * @param identity An identity-like function providing the isomorphism between the codomain of
   *     alignment (of type <T>) and the domain (type <S>).
   * @return The concise string; the empty string if {@code alignment} has no entries.
   */
  public static <S, T> String toConciseAlignmentString(Map<S, T> alignment, Map<T, S> identity) {
    // Work on a copy: edges are deleted from it as they are printed
    Map<S, T> alig = new HashMap<S, T>(alignment);

    // Inverse alignment: target node -> every source node with an edge into it
    Map<S, List<S>> inverse = new HashMap<S, List<S>>();
    for (Entry<S, T> e : alig.entrySet()) {
      S val = identity.get(e.getValue());
      List<S> sources = inverse.get(val);
      if (sources == null) {
        sources = new ArrayList<S>();
        inverse.put(val, sources);
      }
      sources.add(e.getKey());
    }

    StringBuilder str = new StringBuilder();

    while (!alig.isEmpty()) {
      // Pick an edge and work upstream to a root or cycle
      S seedNode = alig.keySet().iterator().next();
      S node = seedNode;
      if (inverse.containsKey(seedNode)) {
        node = inverse.get(seedNode).iterator().next();
        // Compare by value (null-safe) rather than by reference; stop once we are back at the
        // seed (a cycle) or have reached a node nothing points at (a root).
        while ((node == null ? seedNode != null : !node.equals(seedNode))
            && inverse.containsKey(node)) {
          node = inverse.get(node).iterator().next();
        }
      }

      // Now work downstream, deleting edges as we go
      str.append(node);

      while (alig.containsKey(node)) {
        S lastNode = node;
        node = identity.get(alig.get(lastNode));

        // Output
        str.append('>');
        str.append(node);

        // Remove the edge lastNode -> node from both the alignment and its inverse
        alig.remove(lastNode);
        List<S> inv = inverse.get(node);
        if (inv.size() > 1) {
          // The source of the deleted edge is lastNode, so that is what leaves the list
          inv.remove(lastNode);
        } else {
          inverse.remove(node);
        }
      }
      if (!alig.isEmpty()) {
        str.append(' ');
      }
    }

    return str.toString();
  }
Beispiel #2
0
  /**
   * Guesses a sequential alignment from a potentially non-sequential one. The aligned residues of
   * each structure are sorted in ascending order and the two sorted sequences are then paired
   * position by position.
   *
   * <p>The results of this method are consistent with what one might expect from an identity
   * function, and are therefore useful with {@link #getSymmetryOrder(Map, Map identity, int,
   * float)}.
   *
   * <ul>
   *   <li>Perfect self-alignments will have the same pre-image and image, so will map X->X
   *   <li>Gaps and alignment errors will cause errors in the resulting map, but only locally.
   *       Errors do not propagate through the whole alignment.
   * </ul>
   *
   * <h4>Example:</h4>
   *
   * A non sequential alignment, represented schematically as
   *
   * <pre>
   * 12456789
   * 78912345</pre>
   *
   * would result in a map
   *
   * <pre>
   * 12456789
   * 12345789</pre>
   *
   * @param alignment The non-sequential input alignment
   * @param inverseAlignment If false, map from structure1 to structure2. If true, generate the
   *     inverse of that map.
   * @return A mapping from sequential residues of one protein to those of the other
   * @throws IllegalArgumentException if the input alignment is not one-to-one.
   */
  public static Map<Integer, Integer> guessSequentialAlignment(
      Map<Integer, Integer> alignment, boolean inverseAlignment) {
    // Gather the residues of each structure in ascending order
    SortedSet<Integer> sortedFirst = new TreeSet<Integer>();
    SortedSet<Integer> sortedSecond = new TreeSet<Integer>();
    for (Entry<Integer, Integer> edge : alignment.entrySet()) {
      sortedFirst.add(edge.getKey());
      boolean unseen = sortedSecond.add(edge.getValue());
      if (!unseen) {
        // Two different residues of structure 1 point at the same residue of structure 2
        throw new IllegalArgumentException(
            "Alignment is not one-to-one for residue "
                + edge.getValue()
                + " of the second structure.");
      }
    }

    // Decide the direction once, then zip the two sorted sequences together.
    // Both sets have the same size here: keys are unique and values were checked above.
    Iterator<Integer> from = (inverseAlignment ? sortedSecond : sortedFirst).iterator();
    Iterator<Integer> to = (inverseAlignment ? sortedFirst : sortedSecond).iterator();

    Map<Integer, Integer> sequential = new HashMap<Integer, Integer>();
    while (from.hasNext()) {
      Integer source = from.next();
      Integer target = to.next();
      sequential.put(source, target);
    }
    return sequential;
  }
Beispiel #3
0
  /**
   * Takes an AFPChain and replaces the optimal alignment based on an alignment map.
   *
   * <p>The map is converted into the block/array representation and handed to the
   * {@code replaceOptAln} overload taking the block count, block lengths and alignment array.
   * Blocks are found by walking the keys in ascending order and starting a new block whenever the
   * mapped value does not increase.
   *
   * <p>For a way to create a new AFPChain, see {@link AlignmentTools#createAFPChain(Atom[], Atom[],
   * ResidueNumber[], ResidueNumber[])}
   *
   * @param afpChain The alignment to be modified
   * @param ca1 Atoms of the first structure; passed through unchanged to the overload
   * @param ca2 Atoms of the second structure; passed through unchanged to the overload
   * @param alignment The new alignment, as a Map. Must contain at least one entry; values are
   *     expected to be non-null.
   * @return Whatever the delegated {@code replaceOptAln} overload returns
   * @throws StructureException if an error occurred during superposition
   * @see AlignmentTools#createAFPChain(Atom[], Atom[], ResidueNumber[], ResidueNumber[])
   */
  public static AFPChain replaceOptAln(
      AFPChain afpChain, Atom[] ca1, Atom[] ca2, Map<Integer, Integer> alignment)
      throws StructureException {

    // Indices into ca1, ascending
    Integer[] sortedRes1 = alignment.keySet().toArray(new Integer[0]);
    Arrays.sort(sortedRes1);

    // Pass 1: measure the blocks. A block ends wherever the ca2 index fails to increase.
    List<Integer> blockSizes = new ArrayList<Integer>(2);
    int totalAligned = 0;
    Integer previous = alignment.get(sortedRes1[0]);
    int currentSize = (previous != null) ? 1 : 0;
    for (int idx = 1; idx < sortedRes1.length; idx++) {
      Integer current = alignment.get(sortedRes1[idx]); // res2 index
      assert (current != null);
      if (previous >= current) {
        // Circular permutation (or other break): close the running block
        blockSizes.add(currentSize);
        totalAligned += currentSize;
        currentSize = 1;
      } else {
        currentSize++;
      }
      previous = current;
    }
    blockSizes.add(currentSize);
    totalAligned += currentSize;

    // Pass 2: fill the per-block arrays and the length array together
    int numBlocks = blockSizes.size();
    int[] optLens = new int[numBlocks];
    int[][][] optAln = new int[numBlocks][2][];
    int cursor = 0; // index into sortedRes1
    for (int b = 0; b < numBlocks; b++) {
      int size = blockSizes.get(b);
      optLens[b] = size;
      optAln[b][0] = new int[size];
      optAln[b][1] = new int[size];
      for (int p = 0; p < size; p++, cursor++) {
        Integer r1 = sortedRes1[cursor];
        optAln[b][0][p] = r1;
        optAln[b][1][p] = alignment.get(r1);
      }
    }
    assert (cursor == totalAligned);

    return replaceOptAln(afpChain, ca1, ca2, numBlocks, optLens, optAln);
  }
Beispiel #4
0
 /**
  * Parses a concise alignment string back into a map. Every occurrence of
  * {@code <digits>><digits>} contributes one entry; in a chain such as {@code 1>2>3} the middle
  * number is both the target of one edge and the source of the next, giving 1->2 and 2->3.
  * Text that does not take part in such a pair (for instance separating spaces) is skipped.
  *
  * @param string The concise representation to parse; must not be null
  * @return A new map with one entry per edge. If a source number occurs more than once, the last
  *     occurrence wins.
  * @throws NumberFormatException if a number does not fit in an {@code int}
  * @see #toConciseAlignmentString(Map, Map)
  */
 public static Map<Integer, Integer> fromConciseAlignmentString(String string) {
   Map<Integer, Integer> map = new HashMap<Integer, Integer>();
   // One pattern and one matcher for the whole input, instead of recompiling and re-copying
   // the remaining text for every edge.
   Matcher matcher = Pattern.compile("(\\d+)>(\\d+)").matcher(string);
   int searchFrom = 0;
   while (matcher.find(searchFrom)) {
     Integer from = Integer.parseInt(matcher.group(1));
     Integer to = Integer.parseInt(matcher.group(2));
     map.put(from, to);
     // Resume just after the '>' so the target can also act as the next edge's source.
     // This index is the first digit of group 2, hence always inside the string.
     searchFrom = matcher.end(1) + 1;
   }
   return map;
 }
  /**
   * Identify additional groups that are not directly attached to amino acids.
   *
   * <p>Starting from the groups already recorded in {@code mc}, ligands that have an atom linkage
   * to an already identified group are attached, and the search is then repeated from the newly
   * attached ligands until a pass adds nothing. Each linkage found is added to {@code mc}.
   *
   * @param mc the {@link ModifiedCompound} to extend; atom linkages are added to it.
   * @param ligands candidate ligand groups. Nothing is done if this list is empty.
   * @param mapChainIdChain lookup from chain ID to {@link Chain}, used to resolve the groups of
   *     {@code mc}.
   */
  private void identifyAdditionalAttachments(
      ModifiedCompound mc, List<Group> ligands, Map<String, Chain> mapChainIdChain) {
    if (ligands.isEmpty()) {
      return;
    }

    // TODO: should the additional groups only be allowed to the identified
    // ligands or both amino acids and ligands? Currently only on ligands
    // ligands to amino acid bonds for same modification of unknown category
    // will be combined in mergeModComps()
    // TODO: how about chain-chain links?

    // Resolve the groups already part of the compound
    List<Group> identifiedGroups = new ArrayList<Group>();
    for (StructureGroup member : mc.getGroups(false)) {
      try {
        ResidueNumber resNum = new ResidueNumber();
        resNum.setChainId(member.getChainId());
        resNum.setSeqNum(member.getResidueNumber());
        resNum.setInsCode(member.getInsCode());
        Group resolved = mapChainIdChain.get(member.getChainId()).getGroupByPDB(resNum);
        identifiedGroups.add(resolved);
      } catch (StructureException e) {
        // should not happen; the group is skipped
        logger.error("Exception: ", e);
      }
    }

    // Each pass only tests ligands against the groups added by the previous pass
    // (indices [from, to)); it stops once a pass adds nothing.
    for (int from = 0, to = identifiedGroups.size();
        from < to;
        from = to, to = identifiedGroups.size()) {
      for (Group candidate : ligands) {
        if (identifiedGroups.contains(candidate)) {
          continue;
        }
        for (int i = from; i < to; i++) {
          Group anchor = identifiedGroups.get(i);
          List<Atom[]> linkedAtoms =
              StructureUtil.findAtomLinkages(candidate, anchor, false, bondLengthTolerance);
          if (linkedAtoms.isEmpty()) {
            continue;
          }
          for (Atom[] atoms : linkedAtoms) {
            mc.addAtomLinkage(
                StructureUtil.getStructureAtomLinkage(atoms[0], false, atoms[1], false));
          }
          identifiedGroups.add(candidate);
          break; // one anchor is enough for this candidate
        }
      }
    }
  }
Beispiel #6
0
  /**
   * Converts the optimal alignment of an AFPChain into a Map from residue indices of protein 1 to
   * residue indices of protein 2.
   *
   * <p>For example,
   *
   * <pre>
   * 1234
   * 5678</pre>
   *
   * becomes
   *
   * <pre>
   * 1->5
   * 2->6
   * 3->7
   * 4->8</pre>
   *
   * @param afpChain An alignment
   * @return A mapping from aligned residues of protein 1 to their partners in protein 2; empty if
   *     the alignment length is below 1.
   * @throws StructureException If a residue of protein 1 appears more than once in the alignment
   */
  public static Map<Integer, Integer> alignmentAsMap(AFPChain afpChain) throws StructureException {
    Map<Integer, Integer> pairs = new HashMap<Integer, Integer>();

    if (afpChain.getAlnLength() >= 1) {
      int[][][] optAln = afpChain.getOptAln();
      int[] optLen = afpChain.getOptLen();
      for (int b = 0; b < afpChain.getBlockNum(); b++) {
        for (int p = 0; p < optLen[b]; p++) {
          int first = optAln[b][0][p];
          int second = optAln[b][1][p];
          // put() hands back the earlier partner, if any; that means first was aligned twice
          Integer earlier = pairs.put(first, second);
          if (earlier != null) {
            throw new StructureException(
                String.format("Residue %d aligned to both %d and %d.", first, earlier, second));
          }
        }
      }
    }
    return pairs;
  }
  /**
   * Uses two sequences each with a corresponding structure to create an AFPChain corresponding to
   * the alignment. Provided only for convenience since FastaReaders return such maps.
   *
   * <p>The two sequences are taken in the iteration order of {@code sequences} and passed, together
   * with {@code structure1} and {@code structure2}, to {@link #fastaToAfpChain(ProteinSequence,
   * ProteinSequence, Structure, Structure)}.
   *
   * @param sequences A Map containing exactly two entries from sequence names as Strings to gapped
   *     ProteinSequences; the name is ignored
   * @param structure1 The first structure; must not be null
   * @param structure2 The second structure; must not be null
   * @return The AFPChain produced by the delegated overload
   * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure)
   * @throws IllegalArgumentException if {@code sequences} does not hold exactly two entries or if
   *     either structure is null
   * @throws StructureException if thrown by the delegated overload
   */
  public static AFPChain fastaToAfpChain(
      Map<String, ProteinSequence> sequences, Structure structure1, Structure structure2)
      throws StructureException {

    if (sequences.size() != 2) {
      throw new IllegalArgumentException(
          "There must be exactly 2 sequences, but there were " + sequences.size());
    }

    if (structure1 == null || structure2 == null) {
      throw new IllegalArgumentException("A structure is null");
    }

    // Only the sequences are needed; the names (keys) are ignored
    List<ProteinSequence> seqs = new ArrayList<ProteinSequence>(sequences.values());

    return fastaToAfpChain(seqs.get(0), seqs.get(1), structure1, structure2);
  }
Beispiel #8
0
  /**
   * Applies an alignment k times. Eg if alignmentMap defines function f(x), this returns a function
   * f^k(x)=f(f(...f(x)...)).
   *
   * <p>To allow for functions with different domains and codomains, the identity function allows
   * converting back in a reasonable way. For instance, if alignmentMap represented an alignment
   * between two proteins with different numbering schemes, the identity function could calculate
   * the offset between residue numbers, eg I(x) = x-offset.
   *
   * <p>When an identity function is provided, the returned function calculates f^k(x) = f(I( f(I(
   * ... f(x) ... )) )).
   *
   * @param <S> Type of the domain of the alignment
   * @param <T> Type of the codomain of the alignment
   * @param alignmentMap The input function, as a map (see {@link
   *     AlignmentTools#alignmentAsMap(AFPChain)}). Not modified.
   * @param identity An identity-like function providing the isomorphism between the codomain of
   *     alignmentMap (of type <T>) and the domain (type <S>).
   * @param k The number of times to apply the alignment; must be at least 1
   * @return A new alignment with the same keys as alignmentMap. If the input function is not
   *     automorphic (one-to-one), then some inputs may map to null, indicating that the function
   *     is undefined for that input.
   * @throws IllegalArgumentException if k is less than 1
   */
  public static <S, T> Map<S, T> applyAlignment(Map<S, T> alignmentMap, Map<T, S> identity, int k) {

    // This implementation simply applies the map k times.
    // If k were large, it would be more efficient to do this recursively,
    // (eg f^4 = (f^2)^2) but k will usually be small.

    // f^0 would be the identity on S, which cannot be expressed as a Map<S, T>, so zero is
    // rejected together with negative values.
    if (k < 1) throw new IllegalArgumentException("k must be positive");
    if (k == 1) {
      return new HashMap<S, T>(alignmentMap);
    }
    // Convert to lists to establish a fixed order
    List<S> preimage = new ArrayList<S>(alignmentMap.keySet()); // never modified
    List<S> image = new ArrayList<S>(preimage);

    // After this loop image.get(i) holds preimage.get(i) pushed through f and I (k-1) times,
    // or null as soon as one of the two maps has no entry for an intermediate value.
    for (int n = 1; n < k; n++) {
      for (int i = 0; i < image.size(); i++) {
        S pre = image.get(i);
        T intermediate = (pre == null ? null : alignmentMap.get(pre));
        S post = (intermediate == null ? null : identity.get(intermediate));
        image.set(i, post);
      }
    }

    Map<S, T> imageMap = new HashMap<S, T>(alignmentMap.size());

    // TODO handle nulls consistently.
    for (int i = 0; i < preimage.size(); i++) {
      S pre = preimage.get(i);

      // image is currently f^k-1(x), so take the final step
      S preK1 = image.get(i);
      T postK = (preK1 == null ? null : alignmentMap.get(preK1));
      imageMap.put(pre, postK);
    }
    return imageMap;
  }
Beispiel #9
0
  /**
   * Tries to detect symmetry in an alignment.
   *
   * <p>Conceptually, an alignment is a function f:A->B between two sets of integers. The function
   * may have simple topology (meaning that if two elements of A are close, then their images in B
   * will also be close), or may have more complex topology (such as a circular permutation). This
   * function checks <i>alignment</i> against a reference function <i>identity</i>, which should
   * have simple topology. It then tries to determine the symmetry order of <i>alignment</i>
   * relative to <i>identity</i>, up to a maximum order of <i>maxSymmetry</i>.
   *
   * <p><strong>Details</strong><br>
   * Considers the offset (in number of residues) which a residue moves after undergoing <i>n</i>
   * alternating transforms by alignment and identity. If <i>n</i> corresponds to the intrinsic
   * order of the alignment, this will be small. This algorithm tries increasing values of <i>n</i>
   * and looks for abrupt decreases in the root mean squared offset. If none are found at
   * <i>n</i><=maxSymmetry, the alignment is reported as non-symmetric.
   *
   * @param alignment The alignment to test for symmetry
   * @param identity An alignment with simple topology which approximates the sequential
   *     relationship between the two proteins. Should map in the reverse direction from alignment.
   * @param maxSymmetry Maximum symmetry to consider. High values increase the calculation time and
   *     can lead to overfitting.
   * @param minimumMetricChange Factor applied to the best root mean squared offset seen so far; an
   *     order is only accepted if its root mean squared offset is below that product. 0.4f seems
   *     to work well for CeSymm.
   * @return The order of symmetry of alignment, or 1 if no order <= maxSymmetry is found.
   * @see IdentityMap For a simple identity function
   */
  public static int getSymmetryOrder(
      Map<Integer, Integer> alignment,
      Map<Integer, Integer> identity,
      final int maxSymmetry,
      final float minimumMetricChange) {
    List<Integer> preimage = new ArrayList<Integer>(alignment.keySet()); // never modified
    // After pass n, image.get(i) is preimage.get(i) sent through alignment and identity n times,
    // or null once either map had no entry for it.
    List<Integer> image = new ArrayList<Integer>(preimage);

    int bestSymmetry = 1;
    double bestMetric = Double.POSITIVE_INFINITY; // lower is better
    boolean foundSymmetry = false;

    if (debug) {
      logger.trace("Symm\tPos\tDelta");
    }

    for (int n = 1; n <= maxSymmetry; n++) {
      long deltasSq = 0; // long so that summing many squared offsets cannot overflow
      int numDeltas = 0;
      // apply alignment
      for (int i = 0; i < image.size(); i++) {
        Integer pre = image.get(i);
        Integer intermediate = (pre == null ? null : alignment.get(pre));
        Integer post = (intermediate == null ? null : identity.get(intermediate));
        image.set(i, post);

        if (post != null) {
          int delta = post - preimage.get(i);

          deltasSq += (long) delta * delta;
          numDeltas++;

          if (debug) {
            // The logger substitutes {} placeholders, not printf-style %d
            logger.debug("{}\t{}\t{}", n, preimage.get(i), delta);
          }
        }
      }

      // Metrics: RMS compensates for the trend of smaller numDeltas with higher order
      // Not normalizing by numDeltas favors smaller orders

      // root mean squared distance; NaN when no residue survived, which fails the test below
      double metric = Math.sqrt((double) deltasSq / numDeltas);

      if (!foundSymmetry && metric < bestMetric * minimumMetricChange) {
        // n = 1 is never the best symmetry: bestMetric is still infinite on the first pass,
        // so that pass only records the baseline.
        if (bestMetric < Double.POSITIVE_INFINITY) {
          foundSymmetry = true;
        }
        bestSymmetry = n;
        bestMetric = metric;
      }

      // When debugging need to loop over everything. Unneeded in production
      if (!debug && foundSymmetry) {
        break;
      }
    }
    if (foundSymmetry) {
      return bestSymmetry;
    } else {
      return 1;
    }
  }
  /**
   * Identify a set of modifications in a list of chains.
   *
   * <p>Any previous state is cleared via {@code reset()} first. If {@code potentialModifications}
   * is empty, nothing further is done.
   *
   * @param chains query {@link Chain}s.
   * @param potentialModifications query {@link ProteinModification}s.
   * @throws IllegalArgumentException if either argument is null.
   */
  public void identify(
      final List<Chain> chains, final Set<ProteinModification> potentialModifications) {

    if (chains == null) {
      throw new IllegalArgumentException("Null structure.");
    }
    if (potentialModifications == null) {
      throw new IllegalArgumentException("Null potentialModifications.");
    }

    reset();

    if (potentialModifications.isEmpty()) {
      return;
    }

    Map<String, Chain> chainsById = new HashMap<String, Chain>(chains.size());
    residues = new ArrayList<Group>();
    List<Group> allLigands = new ArrayList<Group>();
    Map<Component, Set<Group>> groupsByComponent = new HashMap<Component, Set<Group>>();

    // Collect amino acids and ligands chain by chain
    for (Chain chain : chains) {
      chainsById.put(chain.getChainID(), chain);

      List<Group> aminoAcids = StructureUtil.getAminoAcids(chain);
      List<Group> chainLigands = StructureTools.filterLigands(chain.getAtomGroups());

      residues.addAll(aminoAcids);
      residues.removeAll(chainLigands);
      allLigands.addAll(chainLigands);
      addModificationGroups(potentialModifications, aminoAcids, chainLigands, groupsByComponent);
    }

    if (residues.isEmpty()) {
      // Name the structure in the warning when it can be determined
      Structure parent = chains.isEmpty() ? null : chains.get(0).getParent();
      String pdbId = (parent == null) ? "?" : parent.getPDBCode();
      logger.warn(
          "No amino acids found for {}. Either you did not parse the PDB file with alignSEQRES records, or this record does not contain any amino acids.",
          pdbId);
    }

    List<ModifiedCompound> found = new ArrayList<ModifiedCompound>();

    for (ProteinModification mod : potentialModifications) {
      ModificationCondition condition = mod.getCondition();
      List<Component> components = condition.getComponents();
      // Only consider modifications for which every component is present
      if (groupsByComponent.keySet().containsAll(components)) {
        if (components.size() == 1) {
          processCrosslink1(groupsByComponent, found, mod, components);
        } else {
          processMultiCrosslink(groupsByComponent, found, mod, condition);
        }
      }
    }

    if (recordAdditionalAttachments) {
      // identify additional groups that are not directly attached to amino acids.
      for (ModifiedCompound compound : found) {
        identifyAdditionalAttachments(compound, allLigands, chainsById);
      }
    }

    mergeModComps(found);

    identifiedModifiedCompounds.addAll(found);

    // record unidentifiable linkage
    if (recordUnidentifiableModifiedCompounds) {
      recordUnidentifiableAtomLinkages(found, allLigands);
      recordUnidentifiableModifiedResidues(found);
    }
  }