/** * Print an alignment map in a concise representation. Edges are given as two numbers separated by * '>'. They are chained together where possible, or separated by spaces where disjoint or * branched. * * <p>Note that more concise representations may be possible. Examples: * <li>1>2>3>1 * <li>1>2>3>2 4>3 * * @param alignment The input function, as a map (see {@link * AlignmentTools#alignmentAsMap(AFPChain)}) * @param identity An identity-like function providing the isomorphism between the codomain of * alignment (of type <T>) and the domain (type <S>). * @return */ public static <S, T> String toConciseAlignmentString(Map<S, T> alignment, Map<T, S> identity) { // Clone input to prevent changes Map<S, T> alig = new HashMap<S, T>(alignment); // Generate inverse alignment Map<S, List<S>> inverse = new HashMap<S, List<S>>(); for (Entry<S, T> e : alig.entrySet()) { S val = identity.get(e.getValue()); if (inverse.containsKey(val)) { List<S> l = inverse.get(val); l.add(e.getKey()); } else { List<S> l = new ArrayList<S>(); l.add(e.getKey()); inverse.put(val, l); } } StringBuilder str = new StringBuilder(); while (!alig.isEmpty()) { // Pick an edge and work upstream to a root or cycle S seedNode = alig.keySet().iterator().next(); S node = seedNode; if (inverse.containsKey(seedNode)) { node = inverse.get(seedNode).iterator().next(); while (node != seedNode && inverse.containsKey(node)) { node = inverse.get(node).iterator().next(); } } // Now work downstream, deleting edges as we go seedNode = node; str.append(node); while (alig.containsKey(node)) { S lastNode = node; node = identity.get(alig.get(lastNode)); // Output str.append('>'); str.append(node); // Remove edge alig.remove(lastNode); List<S> inv = inverse.get(node); if (inv.size() > 1) { inv.remove(node); } else { inverse.remove(node); } } if (!alig.isEmpty()) { str.append(' '); } } return str.toString(); }
/** * Takes a potentially non-sequential alignment and guesses a sequential version of it. Residues * from each structure are sorted sequentially and then compared directly. * * <p>The results of this method are consistent with what one might expect from an identity * function, and are therefore useful with {@link #getSymmetryOrder(Map, Map identity, int, * float)}. * * <ul> * <li>Perfect self-alignments will have the same pre-image and image, so will map X->X * <li>Gaps and alignment errors will cause errors in the resulting map, but only locally. * Errors do not propagate through the whole alignment. * </ul> * * <h4>Example:</h4> * * A non sequential alignment, represented schematically as * * <pre> * 12456789 * 78912345</pre> * * would result in a map * * <pre> * 12456789 * 12345789</pre> * * @param alignment The non-sequential input alignment * @param inverseAlignment If false, map from structure1 to structure2. If true, generate the * inverse of that map. * @return A mapping from sequential residues of one protein to those of the other * @throws IllegalArgumentException if the input alignment is not one-to-one. */ public static Map<Integer, Integer> guessSequentialAlignment( Map<Integer, Integer> alignment, boolean inverseAlignment) { Map<Integer, Integer> identity = new HashMap<Integer, Integer>(); SortedSet<Integer> aligned1 = new TreeSet<Integer>(); SortedSet<Integer> aligned2 = new TreeSet<Integer>(); for (Entry<Integer, Integer> pair : alignment.entrySet()) { aligned1.add(pair.getKey()); if (!aligned2.add(pair.getValue())) throw new IllegalArgumentException( "Alignment is not one-to-one for residue " + pair.getValue() + " of the second structure."); } Iterator<Integer> it1 = aligned1.iterator(); Iterator<Integer> it2 = aligned2.iterator(); while (it1.hasNext()) { if (inverseAlignment) { // 2->1 identity.put(it2.next(), it1.next()); } else { // 1->2 identity.put(it1.next(), it2.next()); } } return identity; }
/** * Takes an AFPChain and replaces the optimal alignment based on an alignment map * * <p>Parameters are filled with defaults (often null) or sometimes calculated. * * <p>For a way to create a new AFPChain, see {@link AlignmentTools#createAFPChain(Atom[], Atom[], * ResidueNumber[], ResidueNumber[])} * * @param afpChain The alignment to be modified * @param alignment The new alignment, as a Map * @throws StructureException if an error occurred during superposition * @see AlignmentTools#createAFPChain(Atom[], Atom[], ResidueNumber[], ResidueNumber[]) */ public static AFPChain replaceOptAln( AFPChain afpChain, Atom[] ca1, Atom[] ca2, Map<Integer, Integer> alignment) throws StructureException { // Determine block lengths // Sort ca1 indices, then start a new block whenever ca2 indices aren't // increasing monotonically. Integer[] res1 = alignment.keySet().toArray(new Integer[0]); Arrays.sort(res1); List<Integer> blockLens = new ArrayList<Integer>(2); int optLength = 0; Integer lastRes = alignment.get(res1[0]); int blkLen = lastRes == null ? 0 : 1; for (int i = 1; i < res1.length; i++) { Integer currRes = alignment.get(res1[i]); // res2 index assert (currRes != null); // could be converted to if statement if assertion doesn't hold; just modify // below as well. if (lastRes < currRes) { blkLen++; } else { // CP! 
blockLens.add(blkLen); optLength += blkLen; blkLen = 1; } lastRes = currRes; } blockLens.add(blkLen); optLength += blkLen; // Create array structure for alignment int[][][] optAln = new int[blockLens.size()][][]; int pos1 = 0; // index into res1 for (int blk = 0; blk < blockLens.size(); blk++) { optAln[blk] = new int[2][]; blkLen = blockLens.get(blk); optAln[blk][0] = new int[blkLen]; optAln[blk][1] = new int[blkLen]; int pos = 0; // index into optAln while (pos < blkLen) { optAln[blk][0][pos] = res1[pos1]; Integer currRes = alignment.get(res1[pos1]); optAln[blk][1][pos] = currRes; pos++; pos1++; } } assert (pos1 == optLength); // Create length array int[] optLens = new int[blockLens.size()]; for (int i = 0; i < blockLens.size(); i++) { optLens[i] = blockLens.get(i); } return replaceOptAln(afpChain, ca1, ca2, blockLens.size(), optLens, optAln); }
/**
 * Parses a concise alignment string into a map. Every occurrence of {@code a>b} (two runs of
 * decimal digits separated by {@code '>'}) contributes the entry {@code a -> b}. In a chain such
 * as {@code 1>2>3} the middle number is reused as the source of the next edge, giving {@code
 * 1 -> 2} and {@code 2 -> 3}. Text that does not form part of such a pair is ignored, and a later
 * edge with the same source overwrites an earlier one.
 *
 * @param string the concise representation to parse
 * @return a new map with one entry per edge found; empty if there are none
 * @throws NumberFormatException if a number does not fit in an {@code int}
 * @see #toConciseAlignmentString(Map, Map)
 */
public static Map<Integer, Integer> fromConciseAlignmentString(String string) {
  Map<Integer, Integer> map = new HashMap<Integer, Integer>();

  // Compile once and scan the same input with a moving start index, rather than recompiling
  // the pattern and copying the remaining input on every iteration.
  Matcher matcher = Pattern.compile("(\\d+)>(\\d+)").matcher(string);
  int searchFrom = 0;
  while (matcher.find(searchFrom)) {
    Integer from = Integer.parseInt(matcher.group(1));
    Integer to = Integer.parseInt(matcher.group(2));
    map.put(from, to);

    // Resume just after the '>' so the target can also be the source of the next edge.
    searchFrom = matcher.end(1) + 1;
  }
  return map;
}
/** * identify additional groups that are not directly attached to amino acids. * * @param mc {@link ModifiedCompound}. * @param chain a {@link Chain}. * @return a list of added groups. */ private void identifyAdditionalAttachments( ModifiedCompound mc, List<Group> ligands, Map<String, Chain> mapChainIdChain) { if (ligands.isEmpty()) { return; } // TODO: should the additional groups only be allowed to the identified // ligands or both amino acids and ligands? Currently only on ligands // ligands to amino acid bonds for same modification of unknown category // will be combined in mergeModComps() // TODO: how about chain-chain links? List<Group> identifiedGroups = new ArrayList<Group>(); for (StructureGroup num : mc.getGroups(false)) { Group group; try { // String numIns = "" + num.getResidueNumber(); // if (num.getInsCode() != null) { // numIns += num.getInsCode(); // } ResidueNumber resNum = new ResidueNumber(); resNum.setChainId(num.getChainId()); resNum.setSeqNum(num.getResidueNumber()); resNum.setInsCode(num.getInsCode()); // group = chain.getGroupByPDB(numIns); group = mapChainIdChain.get(num.getChainId()).getGroupByPDB(resNum); } catch (StructureException e) { logger.error("Exception: ", e); // should not happen continue; } identifiedGroups.add(group); } int start = 0; int n = identifiedGroups.size(); while (n > start) { for (Group group1 : ligands) { for (int i = start; i < n; i++) { Group group2 = identifiedGroups.get(i); if (!identifiedGroups.contains(group1)) { List<Atom[]> linkedAtoms = StructureUtil.findAtomLinkages(group1, group2, false, bondLengthTolerance); if (!linkedAtoms.isEmpty()) { for (Atom[] atoms : linkedAtoms) { mc.addAtomLinkage( StructureUtil.getStructureAtomLinkage(atoms[0], false, atoms[1], false)); } identifiedGroups.add(group1); break; } } } } start = n; n = identifiedGroups.size(); } }
/**
 * Creates a Map specifying the alignment as a mapping between residue indices of protein 1 and
 * residue indices of protein 2.
 *
 * <p>For example,
 *
 * <pre>
 * 1234
 * 5678</pre>
 *
 * becomes
 *
 * <pre>
 * 1->5
 * 2->6
 * 3->7
 * 4->8</pre>
 *
 * @param afpChain An alignment
 * @return A mapping from aligned residues of protein 1 to their partners in protein 2; empty if
 *     the alignment length is less than 1.
 * @throws StructureException If a residue of protein 1 appears more than once in the alignment
 */
public static Map<Integer, Integer> alignmentAsMap(AFPChain afpChain) throws StructureException {
  Map<Integer, Integer> res1ToRes2 = new HashMap<Integer, Integer>();

  if (afpChain.getAlnLength() >= 1) {
    int[][][] optAln = afpChain.getOptAln();
    int[] optLen = afpChain.getOptLen();
    for (int block = 0; block < afpChain.getBlockNum(); block++) {
      for (int pos = 0; pos < optLen[block]; pos++) {
        int res1 = optAln[block][0][pos];
        int res2 = optAln[block][1][pos];
        // put() returns the earlier partner if res1 was already aligned
        Integer earlierPartner = res1ToRes2.put(res1, res2);
        if (earlierPartner != null) {
          throw new StructureException(
              String.format(
                  "Residue %d aligned to both %d and %d.", res1, earlierPartner, res2));
        }
      }
    }
  }

  return res1ToRes2;
}
/**
 * Uses two sequences each with a corresponding structure to create an AFPChain corresponding to
 * the alignment. Provided only for convenience since FastaReaders return such maps.
 *
 * <p>The sequences are taken in the iteration order of {@code sequences}: the first one is paired
 * with {@code structure1} and the second with {@code structure2}. Pass a map with a defined
 * iteration order if that pairing matters.
 *
 * @param sequences A Map containing exactly two entries from sequence names as Strings to gapped
 *     ProteinSequences; the name is ignored
 * @param structure1 the structure corresponding to the first sequence; must not be null
 * @param structure2 the structure corresponding to the second sequence; must not be null
 * @return the AFPChain produced by {@link #fastaToAfpChain(ProteinSequence, ProteinSequence,
 *     Structure, Structure)}
 * @throws IllegalArgumentException if {@code sequences} does not contain exactly two entries or
 *     if either structure is null
 * @throws StructureException if thrown by the delegate overload
 * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure)
 */
public static AFPChain fastaToAfpChain(
    Map<String, ProteinSequence> sequences, Structure structure1, Structure structure2)
    throws StructureException {

  if (sequences.size() != 2) {
    throw new IllegalArgumentException(
        "There must be exactly 2 sequences, but there were " + sequences.size());
  }

  if (structure1 == null || structure2 == null) {
    throw new IllegalArgumentException("A structure is null");
  }

  // Only the values are needed; the sequence names are ignored.
  List<ProteinSequence> seqs = new ArrayList<ProteinSequence>(sequences.values());

  return fastaToAfpChain(seqs.get(0), seqs.get(1), structure1, structure2);
}
/** * Applies an alignment k times. Eg if alignmentMap defines function f(x), this returns a function * f^k(x)=f(f(...f(x)...)). * * <p>To allow for functions with different domains and codomains, the identity function allows * converting back in a reasonable way. For instance, if alignmentMap represented an alignment * between two proteins with different numbering schemes, the identity function could calculate * the offset between residue numbers, eg I(x) = x-offset. * * <p>When an identity function is provided, the returned function calculates f^k(x) = f(I( f(I( * ... f(x) ... )) )). * * @param <S> * @param <T> * @param alignmentMap The input function, as a map (see {@link * AlignmentTools#alignmentAsMap(AFPChain)}) * @param identity An identity-like function providing the isomorphism between the codomain of * alignmentMap (of type <T>) and the domain (type <S>). * @param k The number of times to apply the alignment * @return A new alignment. If the input function is not automorphic (one-to-one), then some * inputs may map to null, indicating that the function is undefined for that input. */ public static <S, T> Map<S, T> applyAlignment(Map<S, T> alignmentMap, Map<T, S> identity, int k) { // This implementation simply applies the map k times. // If k were large, it would be more efficient to do this recursively, // (eg f^4 = (f^2)^2) but k will usually be small. if (k < 0) throw new IllegalArgumentException("k must be positive"); if (k == 1) { return new HashMap<S, T>(alignmentMap); } // Convert to lists to establish a fixed order List<S> preimage = new ArrayList<S>(alignmentMap.keySet()); // currently unmodified List<S> image = new ArrayList<S>(preimage); for (int n = 1; n < k; n++) { // apply alignment for (int i = 0; i < image.size(); i++) { S pre = image.get(i); T intermediate = (pre == null ? null : alignmentMap.get(pre)); S post = (intermediate == null ? 
null : identity.get(intermediate)); image.set(i, post); } } Map<S, T> imageMap = new HashMap<S, T>(alignmentMap.size()); // TODO handle nulls consistently. // assure that all the residues in the domain are valid keys /* for(int i=0;i<preimage.size();i++) { S pre = preimage.get(i); T intermediate = (pre==null?null: alignmentMap.get(pre)); S post = (intermediate==null?null: identity.get(intermediate)); imageMap.put(post, null); } */ // now populate with actual values for (int i = 0; i < preimage.size(); i++) { S pre = preimage.get(i); // image is currently f^k-1(x), so take the final step S preK1 = image.get(i); T postK = (preK1 == null ? null : alignmentMap.get(preK1)); imageMap.put(pre, postK); } return imageMap; }
/** * Tries to detect symmetry in an alignment. * * <p>Conceptually, an alignment is a function f:A->B between two sets of integers. The function * may have simple topology (meaning that if two elements of A are close, then their images in B * will also be close), or may have more complex topology (such as a circular permutation). This * function checks <i>alignment</i> against a reference function <i>identity</i>, which should * have simple topology. It then tries to determine the symmetry order of <i>alignment</i> * relative to <i>identity</i>, up to a maximum order of <i>maxSymmetry</i>. * * <p><strong>Details</strong><br> * Considers the offset (in number of residues) which a residue moves after undergoing <i>n</i> * alternating transforms by alignment and identity. If <i>n</i> corresponds to the intrinsic * order of the alignment, this will be small. This algorithm tries increasing values of <i>n</i> * and looks for abrupt decreases in the root mean squared offset. If none are found at * <i>n</i><=maxSymmetry, the alignment is reported as non-symmetric. * * @param alignment The alignment to test for symmetry * @param identity An alignment with simple topology which approximates the sequential * relationship between the two proteins. Should map in the reverse direction from alignment. * @param maxSymmetry Maximum symmetry to consider. High values increase the calculation time and * can lead to overfitting. * @param minimumMetricChange Percent decrease in root mean squared offsets in order to declare * symmetry. 0.4f seems to work well for CeSymm. * @return The order of symmetry of alignment, or 1 if no order <= maxSymmetry is found. 
* @see IdentityMap For a simple identity function */ public static int getSymmetryOrder( Map<Integer, Integer> alignment, Map<Integer, Integer> identity, final int maxSymmetry, final float minimumMetricChange) { List<Integer> preimage = new ArrayList<Integer>(alignment.keySet()); // currently unmodified List<Integer> image = new ArrayList<Integer>(preimage); int bestSymmetry = 1; double bestMetric = Double.POSITIVE_INFINITY; // lower is better boolean foundSymmetry = false; if (debug) { logger.trace("Symm\tPos\tDelta"); } for (int n = 1; n <= maxSymmetry; n++) { int deltasSq = 0; int numDeltas = 0; // apply alignment for (int i = 0; i < image.size(); i++) { Integer pre = image.get(i); Integer intermediate = (pre == null ? null : alignment.get(pre)); Integer post = (intermediate == null ? null : identity.get(intermediate)); image.set(i, post); if (post != null) { int delta = post - preimage.get(i); deltasSq += delta * delta; numDeltas++; if (debug) { logger.debug("%d\t%d\t%d\n", n, preimage.get(i), delta); } } } // Metrics: RMS compensates for the trend of smaller numDeltas with higher order // Not normalizing by numDeltas favors smaller orders double metric = Math.sqrt((double) deltasSq / numDeltas); // root mean squared distance if (!foundSymmetry && metric < bestMetric * minimumMetricChange) { // n = 1 is never the best symmetry if (bestMetric < Double.POSITIVE_INFINITY) { foundSymmetry = true; } bestSymmetry = n; bestMetric = metric; } // When debugging need to loop over everything. Unneeded in production if (!debug && foundSymmetry) { break; } } if (foundSymmetry) { return bestSymmetry; } else { return 1; } }
/** * Identify a set of modifications in a a list of chains. * * @param chains query {@link Chain}s. * @param potentialModifications query {@link ProteinModification}s. */ public void identify( final List<Chain> chains, final Set<ProteinModification> potentialModifications) { if (chains == null) { throw new IllegalArgumentException("Null structure."); } if (potentialModifications == null) { throw new IllegalArgumentException("Null potentialModifications."); } reset(); if (potentialModifications.isEmpty()) { return; } Map<String, Chain> mapChainIdChain = new HashMap<String, Chain>(chains.size()); residues = new ArrayList<Group>(); List<Group> ligands = new ArrayList<Group>(); Map<Component, Set<Group>> mapCompGroups = new HashMap<Component, Set<Group>>(); for (Chain chain : chains) { mapChainIdChain.put(chain.getChainID(), chain); List<Group> ress = StructureUtil.getAminoAcids(chain); // List<Group> ligs = chain.getAtomLigands(); List<Group> ligs = StructureTools.filterLigands(chain.getAtomGroups()); residues.addAll(ress); residues.removeAll(ligs); ligands.addAll(ligs); addModificationGroups(potentialModifications, ress, ligs, mapCompGroups); } if (residues.isEmpty()) { String pdbId = "?"; if (chains.size() > 0) { Structure struc = chains.get(0).getParent(); if (struc != null) pdbId = struc.getPDBCode(); } logger.warn( "No amino acids found for {}. Either you did not parse the PDB file with alignSEQRES records, or this record does not contain any amino acids.", pdbId); } List<ModifiedCompound> modComps = new ArrayList<ModifiedCompound>(); for (ProteinModification mod : potentialModifications) { ModificationCondition condition = mod.getCondition(); List<Component> components = condition.getComponents(); if (!mapCompGroups.keySet().containsAll(components)) { // not all components exist for this mod. 
continue; } int sizeComps = components.size(); if (sizeComps == 1) { processCrosslink1(mapCompGroups, modComps, mod, components); } else { processMultiCrosslink(mapCompGroups, modComps, mod, condition); } } if (recordAdditionalAttachments) { // identify additional groups that are not directly attached to amino acids. for (ModifiedCompound mc : modComps) { identifyAdditionalAttachments(mc, ligands, mapChainIdChain); } } mergeModComps(modComps); identifiedModifiedCompounds.addAll(modComps); // record unidentifiable linkage if (recordUnidentifiableModifiedCompounds) { recordUnidentifiableAtomLinkages(modComps, ligands); recordUnidentifiableModifiedResidues(modComps); } }