/** * Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code * structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code * sequence2}. The sequences need not correspond to the entire structures, since local alignment * is performed to match the sequences to structures. Assumes that a residue is aligned if and * only if it is given by an uppercase letter. * * @param sequence1 <em>Must</em> have {@link ProteinSequence#getUserCollection()} set to document * upper- and lower-case as aligned and unaligned; see {@link * #getAlignedUserCollection(String)} * @throws StructureException */ public static AFPChain fastaToAfpChain( ProteinSequence sequence1, ProteinSequence sequence2, Structure structure1, Structure structure2) throws StructureException { if (structure1 == null || structure2 == null) { throw new IllegalArgumentException("A structure is null"); } if (sequence1 == null || sequence2 == null) { throw new IllegalArgumentException("A sequence is null"); } Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure1); Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure2); ResidueNumber[] residues1 = StructureSequenceMatcher.matchSequenceToStructure(sequence1, structure1); ResidueNumber[] residues2 = StructureSequenceMatcher.matchSequenceToStructure(sequence2, structure2); // nullify ResidueNumbers that have a lowercase sequence character if (sequence1.getUserCollection() != null) { CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence1, residues1); } if (sequence2.getUserCollection() != null) { CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence2, residues2); } return buildAlignment(ca1, ca2, residues1, residues2); }
/**
 * String-based convenience overload of {@code cpFastaToAfpChain}.
 *
 * <p>Each string is wrapped in a {@link ProteinSequence} whose user collection is taken from
 * {@link #getAlignedUserCollection(String)} applied to that same string; the work is then
 * delegated to the {@code ProteinSequence} overload.
 *
 * @param first gapped sequence string used to build the first {@code ProteinSequence}
 * @param second gapped sequence string used to build the second {@code ProteinSequence}
 * @param structure structure forwarded unchanged to the delegate
 * @param cpSite circular permutation site forwarded unchanged to the delegate
 * @return whatever the {@code ProteinSequence} overload returns
 * @throws StructureException propagated from the delegate
 * @throws CompoundNotFoundException if a {@code ProteinSequence} cannot be created from one of
 *     the strings
 */
public static AFPChain cpFastaToAfpChain(
    String first, String second, Structure structure, int cpSite)
    throws StructureException, CompoundNotFoundException {

  final ProteinSequence firstSequence = new ProteinSequence(first);
  firstSequence.setUserCollection(getAlignedUserCollection(first));

  final ProteinSequence secondSequence = new ProteinSequence(second);
  secondSequence.setUserCollection(getAlignedUserCollection(second));

  return cpFastaToAfpChain(firstSequence, secondSequence, structure, cpSite);
}
/**
 * String-based convenience overload of {@code fastaToAfpChain}.
 *
 * <p>Each string is wrapped in a {@link ProteinSequence} whose user collection is taken from
 * {@link #getAlignedUserCollection(String)} applied to that same string; the work is then
 * delegated to the {@code ProteinSequence} overload.
 *
 * @param sequence1 gapped sequence string paired with {@code structure1}
 * @param sequence2 gapped sequence string paired with {@code structure2}
 * @param structure1 structure forwarded unchanged to the delegate
 * @param structure2 structure forwarded unchanged to the delegate
 * @return whatever the {@code ProteinSequence} overload returns
 * @throws StructureException propagated from the delegate
 * @throws CompoundNotFoundException if a {@code ProteinSequence} cannot be created from one of
 *     the strings
 */
public static AFPChain fastaToAfpChain(
    String sequence1, String sequence2, Structure structure1, Structure structure2)
    throws StructureException, CompoundNotFoundException {

  final ProteinSequence firstProtein = new ProteinSequence(sequence1);
  firstProtein.setUserCollection(getAlignedUserCollection(sequence1));

  final ProteinSequence secondProtein = new ProteinSequence(sequence2);
  secondProtein.setUserCollection(getAlignedUserCollection(sequence2));

  return fastaToAfpChain(firstProtein, secondProtein, structure1, structure2);
}
/**
 * Collects the protein sequences found under {@code prediction/gene/protein} in {@code
 * geneidDoc}.
 *
 * <p>For every matching element, all non-word characters (regex {@code \W}) are stripped from
 * its text content before the {@link ProteinSequence} is created. The {@code idGene} attribute
 * of the element's parent node is used both as the sequence's accession and as the map key, so
 * a later element with the same {@code idGene} replaces an earlier one.
 *
 * @return the sequences keyed by {@code idGene}, in the order the elements were selected
 * @throws Exception whatever the XML selection or sequence construction throws
 */
public LinkedHashMap<String, ProteinSequence> getProteinSequences() throws Exception {
  final LinkedHashMap<String, ProteinSequence> sequencesByGeneId =
      new LinkedHashMap<String, ProteinSequence>();

  final ArrayList<Element> proteinNodes =
      XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/protein");
  logger.info("{} hits", proteinNodes.size());

  for (Element proteinNode : proteinNodes) {
    final Element geneNode = (Element) proteinNode.getParentNode();

    final String rawText = proteinNode.getTextContent();
    final ProteinSequence protein = new ProteinSequence(rawText.replaceAll("\\W", ""));

    final String geneId = geneNode.getAttribute("idGene");
    protein.setAccession(new AccessionID(geneId));
    sequencesByGeneId.put(geneId, protein);
  }

  return sequencesByGeneId;
}
/** Test of process method, of class FastaReader. */ @Test public void testProcess() throws Exception { logger.info("process"); InputStream inStream = this.getClass().getResourceAsStream("/PF00104_small.fasta"); assertNotNull(inStream); FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>( inStream, new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); LinkedHashMap<String, ProteinSequence> proteinSequences = fastaReader.process(); inStream.close(); // Should have 282 sequences // logger.debug("Expecting 283 got " + proteinSequences.size()); assertEquals(proteinSequences.size(), 283); int seqNum = 0; for (String id : proteinSequences.keySet()) { ProteinSequence proteinSequence = proteinSequences.get(id); switch (seqNum) { case 0: assertEquals(proteinSequence.getAccession().getID(), "A2D504_ATEGE/1-46"); assertEquals( proteinSequence.getSequenceAsString(), "-----------------FK-N----LP-LED----------------Q----ITL--IQY-----------SWM----------------------CL-SSFA------LSWRSYK---HTNSQFLYFAPDLVF-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"); break; case 281: // logger.debug("Get Accession: {}", proteinSequence.getAccession()); // logger.debug("Get Protein Sequence: {}", proteinSequence.getSequenceAsString()); assertEquals(proteinSequence.getAccession().getID(), "Q9PU76_CRONI/141-323"); assertEquals( proteinSequence.getSequenceAsString(), 
"VETVTELTEFAKSI-PGFS-N----LD-LND----------------Q----VTL--LKY-----------GVY----------------------EA-IFAM------LASVMNK---DGMPVAYGNGFITRE------------------------------------------------------------------------------------------------------------------------------------------------------------FLKSLRKPFCDIMEPKFDFA-MKF-NSL-E-LDDSDI--------------------SLFVA-AIIC-CGDRPG-------------------------------------------LVNV--GHIEKMQESIVHVLKL-H-----LQN---------NH---PD----------------------------DI------F--------LFP-KLLQKMAD-LRQLV-----------------TEH-AQLV--QIIKK---TESDAHLHPLL-------QEI---"); break; case 282: assertEquals(proteinSequence.getAccession().getID(), "Q98SJ1_CHICK/15-61"); assertEquals( proteinSequence.getSequenceAsString(), "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------Q-----------------NW------Q--------RFY-QLTKLLDS-MHDVV-----------------ENL-LSFC--FQTFLDKSM--SIEFPEML-------AEI---"); break; } seqNum++; } assertEquals(seqNum, 283); }
/** Test of process method, of class GenbankReader. */ @Test public void testProcess() throws Throwable { /* * Method 1: With the GenbankProxySequenceReader */ // Try with the GenbankProxySequenceReader GenbankProxySequenceReader<AminoAcidCompound> genbankProteinReader = new GenbankProxySequenceReader<AminoAcidCompound>( System.getProperty("java.io.tmpdir"), "NP_000257", AminoAcidCompoundSet.getAminoAcidCompoundSet()); ProteinSequence proteinSequence = new ProteinSequence(genbankProteinReader); genbankProteinReader .getHeaderParser() .parseHeader(genbankProteinReader.getHeader(), proteinSequence); logger.info( "Sequence({},{}) = {}...", proteinSequence.getAccession(), proteinSequence.getLength(), proteinSequence.getSequenceAsString().substring(0, 10)); GenbankProxySequenceReader<NucleotideCompound> genbankDNAReader = new GenbankProxySequenceReader<NucleotideCompound>( System.getProperty("java.io.tmpdir"), "NM_001126", DNACompoundSet.getDNACompoundSet()); DNASequence dnaSequence = new DNASequence(genbankDNAReader); genbankDNAReader.getHeaderParser().parseHeader(genbankDNAReader.getHeader(), dnaSequence); logger.info( "Sequence({},{}) = {}...", dnaSequence.getAccession(), dnaSequence.getLength(), dnaSequence.getSequenceAsString().substring(0, 10)); /* * Method 2: With the GenbankReaderHelper */ // Try with the GenbankReaderHelper ClasspathResource dnaResource = new ClasspathResource("NM_000266.gb", true); // File dnaFile = new File("src/test/resources/NM_000266.gb"); // File protFile = new File("src/test/resources/BondFeature.gb"); ClasspathResource protResource = new ClasspathResource("BondFeature.gb"); LinkedHashMap<String, DNASequence> dnaSequences = GenbankReaderHelper.readGenbankDNASequence(dnaResource.getInputStream()); for (DNASequence sequence : dnaSequences.values()) { logger.info("DNA Sequence: {}", sequence.getSequenceAsString()); } LinkedHashMap<String, ProteinSequence> protSequences = 
GenbankReaderHelper.readGenbankProteinSequence(protResource.getInputStream()); for (ProteinSequence sequence : protSequences.values()) { logger.info("Protein Sequence: {}", sequence.getSequenceAsString()); } /* * Method 3: With the GenbankReader Object */ // Try reading with the GanbankReader GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<DNASequence, NucleotideCompound>( dnaResource.getInputStream(), new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())); dnaSequences = dnaReader.process(); logger.info("DNA Sequence: {}", dnaSequences); GenbankReader<ProteinSequence, AminoAcidCompound> protReader = new GenbankReader<ProteinSequence, AminoAcidCompound>( protResource.getInputStream(), new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); protSequences = protReader.process(); logger.info("Protein Sequence: {}", protSequences); }
@Test public void processIntTest() throws Exception { logger.info("process(int)"); InputStream inStream = this.getClass().getResourceAsStream("/PF00104_small.fasta"); assertNotNull(inStream); FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>( inStream, new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); LinkedHashMap<String, ProteinSequence> proteinSequences = fastaReader.process(200); // Should have 200 sequences // logger.debug("Expecting 200 got " + proteinSequences.size()); assertEquals(proteinSequences.size(), 200); int seqNum = 0; for (String id : proteinSequences.keySet()) { ProteinSequence proteinSequence = proteinSequences.get(id); switch (seqNum) { case 0: assertEquals(proteinSequence.getAccession().getID(), "A2D504_ATEGE/1-46"); assertEquals( proteinSequence.getSequenceAsString(), "-----------------FK-N----LP-LED----------------Q----ITL--IQY-----------SWM----------------------CL-SSFA------LSWRSYK---HTNSQFLYFAPDLVF-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"); break; case 199: assertEquals(proteinSequence.getAccession().getID(), "Q5F0P7_HUMAN/248-428"); assertEquals( proteinSequence.getSequenceAsString(), 
"DRELVVIIGWAKHI-PGFS-S----LS-LGD----------------Q----MSL--LQS-----------AWM----------------------EI-LILG------IVYRSLP---YDDKLVYAEDYIMD-------------------------------------------------------------------------------------------------------------------------------------------------------------EEHSRLAGLLELYRAILQLV-RRY-KKL-K-VEKEEF--------------------VTLKA-LALA-NSDSMY-------------------------------------------IEDL--EAVQKLQDLLHEALQD-Y-----ELS---------QR---HE----------------------------EP------W--------RTG-KLLLTLPL-LRQTA-----------------AKA-VQHF--YSVKLQGKV--PMH--KLF-------LEM---"); break; } seqNum++; } assertEquals(seqNum, 200); // Should have 83 sequences proteinSequences = fastaReader.process(200); assertEquals(proteinSequences.size(), 83); seqNum = 0; for (String id : proteinSequences.keySet()) { ProteinSequence proteinSequence = proteinSequences.get(id); switch (seqNum) { case 0: assertEquals(proteinSequence.getAccession().getID(), "RARA_CANFA/233-413"); assertEquals( proteinSequence.getSequenceAsString(), "TKCIIKTVEFAKQL-PGFT-T----LT-IAD----------------Q----ITL--LKA-----------ACL----------------------DI-LILR------ICTRYTP---EQDTMTFSEGLTLN-------------------------------------------------------------------------------------------------------------------------------------------------------------RTQMHKAGFGPLTDLVFAFA-NQL-LPL-E-MDDAET--------------------GLLSA-ICLI-CGDRQD-------------------------------------------LEQP--DRVDMLQEPLLEALKV-Y-----VRK---------RR---PS----------------------------RP------H--------MFP-KMLMKITD-LRSIS-----------------AKG-AERV--ITLKMEIPG--SMP--PLI-------QEM---"); break; case 81: // logger.debug(proteinSequence.getAccession()); // logger.debug(proteinSequence.getSequenceAsString()); assertEquals(proteinSequence.getAccession().getID(), "Q9PU76_CRONI/141-323"); assertEquals( proteinSequence.getSequenceAsString(), 
"VETVTELTEFAKSI-PGFS-N----LD-LND----------------Q----VTL--LKY-----------GVY----------------------EA-IFAM------LASVMNK---DGMPVAYGNGFITRE------------------------------------------------------------------------------------------------------------------------------------------------------------FLKSLRKPFCDIMEPKFDFA-MKF-NSL-E-LDDSDI--------------------SLFVA-AIIC-CGDRPG-------------------------------------------LVNV--GHIEKMQESIVHVLKL-H-----LQN---------NH---PD----------------------------DI------F--------LFP-KLLQKMAD-LRQLV-----------------TEH-AQLV--QIIKK---TESDAHLHPLL-------QEI---"); break; case 82: assertEquals(proteinSequence.getAccession().getID(), "Q98SJ1_CHICK/15-61"); assertEquals( proteinSequence.getSequenceAsString(), "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------Q-----------------NW------Q--------RFY-QLTKLLDS-MHDVV-----------------ENL-LSFC--FQTFLDKSM--SIEFPEML-------AEI---"); break; } seqNum++; } assertEquals(seqNum, 83); fastaReader.close(); inStream.close(); }
/** * Takes a structure and sequence corresponding to an alignment between a structure or sequence * and itself (or even a structure with a sequence), where the result has a circular permutation * site {@link cpSite} residues to the right. * * @param first The unpermuted sequence * @param second The sequence permuted by cpSite * @param cpSite The number of residues from the beginning of the sequence at which the circular * permutation site occurs; can be positive or negative; values greater than the length of the * sequence are acceptable * @throws StructureException */ public static AFPChain cpFastaToAfpChain( ProteinSequence first, ProteinSequence second, Structure structure, int cpSite) throws StructureException { if (structure == null) { throw new IllegalArgumentException("The structure is null"); } if (first == null) { throw new IllegalArgumentException("The sequence is null"); } // we need to find the ungapped CP site int gappedCpShift = 0; int ungappedCpShift = 0; while (ungappedCpShift < Math.abs(cpSite)) { char c; try { if (cpSite <= 0) { c = second.getSequenceAsString().charAt(gappedCpShift); } else { c = second.getSequenceAsString().charAt(first.getLength() - 1 - gappedCpShift); } } catch (StringIndexOutOfBoundsException e) { throw new IllegalArgumentException("CP site of " + cpSite + " is wrong"); } if (c != '-') { ungappedCpShift++; } gappedCpShift++; } Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure); Atom[] ca2 = StructureTools.getRepresentativeAtomArray( structure); // can't use cloneCAArray because it doesn't set parent // group.chain.structure ProteinSequence antipermuted = null; try { antipermuted = new ProteinSequence( SequenceTools.permuteCyclic(second.getSequenceAsString(), gappedCpShift)); } catch (CompoundNotFoundException e) { // this can't happen, the original sequence comes from a ProteinSequence logger.error( "Unexpected error while creating protein sequence: {}. 
This is most likely a bug.", e.getMessage()); } ResidueNumber[] residues = StructureSequenceMatcher.matchSequenceToStructure(first, structure); ResidueNumber[] antipermutedResidues = StructureSequenceMatcher.matchSequenceToStructure(antipermuted, structure); ResidueNumber[] nonpermutedResidues = new ResidueNumber[antipermutedResidues.length]; SequenceTools.permuteCyclic(antipermutedResidues, nonpermutedResidues, -gappedCpShift); // nullify ResidueNumbers that have a lowercase sequence character if (first.getUserCollection() != null) { CasePreservingProteinSequenceCreator.setLowercaseToNull(first, residues); } if (second.getUserCollection() != null) { CasePreservingProteinSequenceCreator.setLowercaseToNull(second, nonpermutedResidues); } // for (int i = 0; i < residues.length; i++) { // if (residues[i] == null) { // System.out.print("="); // } else { // System.out.print(sequence.getSequenceAsString().charAt(i)); // } // } // System.out.println(); // for (int i = 0; i < residues.length; i++) { // if (nonpermutedResidues[i] == null) { // System.out.print("="); // } else { // System.out.print(second.getSequenceAsString().charAt(i)); // } // } // System.out.println(); return buildAlignment(ca1, ca2, residues, nonpermutedResidues); }