/**
   * Builds the {@link AFPChain} for the alignment of {@code structure1} against {@code
   * structure2}, where the alignment is supplied as the two gapped protein sequences {@code
   * sequence1} and {@code sequence2}. Each sequence is matched to its own structure with {@code
   * StructureSequenceMatcher.matchSequenceToStructure}; as the original documentation notes, the
   * sequences need not cover the entire structures because that matching is a local alignment.
   * A residue is treated as aligned if and only if it is written as an uppercase letter.
   *
   * @param sequence1 gapped sequence for {@code structure1}; <em>must</em> have {@link
   *     ProteinSequence#getUserCollection()} set to document upper- and lower-case as aligned and
   *     unaligned (see {@link #getAlignedUserCollection(String)}). When the user collection is
   *     {@code null}, no residue of this sequence is nullified.
   * @param sequence2 gapped sequence for {@code structure2}; handled exactly like {@code
   *     sequence1}
   * @param structure1 structure that {@code sequence1} is matched against
   * @param structure2 structure that {@code sequence2} is matched against
   * @return the result of {@code buildAlignment} for the representative atoms and matched residue
   *     numbers of both structures
   * @throws IllegalArgumentException if either structure or either sequence is {@code null}
   * @throws StructureException declared by the structure and matching helpers called here
   */
  public static AFPChain fastaToAfpChain(
      ProteinSequence sequence1,
      ProteinSequence sequence2,
      Structure structure1,
      Structure structure2)
      throws StructureException {

    // Structures are validated before sequences, so a call with everything null reports the
    // structure problem.
    final boolean structureMissing = structure1 == null || structure2 == null;
    if (structureMissing) {
      throw new IllegalArgumentException("A structure is null");
    }
    final boolean sequenceMissing = sequence1 == null || sequence2 == null;
    if (sequenceMissing) {
      throw new IllegalArgumentException("A sequence is null");
    }

    final Atom[] atoms1 = StructureTools.getRepresentativeAtomArray(structure1);
    final Atom[] atoms2 = StructureTools.getRepresentativeAtomArray(structure2);

    final ResidueNumber[] matched1 =
        StructureSequenceMatcher.matchSequenceToStructure(sequence1, structure1);
    final ResidueNumber[] matched2 =
        StructureSequenceMatcher.matchSequenceToStructure(sequence2, structure2);

    // Positions written in lowercase are unaligned: drop their residue numbers. Without a user
    // collection there is no case information, so the matches are left as they are.
    if (null != sequence1.getUserCollection()) {
      CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence1, matched1);
    }
    if (null != sequence2.getUserCollection()) {
      CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence2, matched2);
    }

    return buildAlignment(atoms1, atoms2, matched1, matched2);
  }
 /**
  * String-based convenience overload of {@code cpFastaToAfpChain}. Each string is wrapped in a
  * {@link ProteinSequence} whose user collection is set to the result of {@link
  * #getAlignedUserCollection(String)} for that same string; the call is then forwarded to the
  * {@code ProteinSequence}-based overload with {@code structure} and {@code cpSite} unchanged.
  *
  * @param first text of the first sequence
  * @param second text of the second sequence
  * @param structure passed through to the delegate
  * @param cpSite passed through to the delegate
  * @return whatever the {@code ProteinSequence}-based overload returns
  * @throws StructureException propagated from the delegate
  * @throws CompoundNotFoundException declared by the {@code ProteinSequence} construction
  */
 public static AFPChain cpFastaToAfpChain(
     String first, String second, Structure structure, int cpSite)
     throws StructureException, CompoundNotFoundException {
   final ProteinSequence firstSequence = new ProteinSequence(first);
   firstSequence.setUserCollection(getAlignedUserCollection(first));

   final ProteinSequence secondSequence = new ProteinSequence(second);
   secondSequence.setUserCollection(getAlignedUserCollection(second));

   return cpFastaToAfpChain(firstSequence, secondSequence, structure, cpSite);
 }
 /**
  * String-based convenience overload of {@code fastaToAfpChain}. Each string is wrapped in a
  * {@link ProteinSequence} whose user collection is set to the result of {@link
  * #getAlignedUserCollection(String)} for that same string; the call is then forwarded to the
  * {@code ProteinSequence}-based overload together with the two structures.
  *
  * @param sequence1 text of the sequence paired with {@code structure1}
  * @param sequence2 text of the sequence paired with {@code structure2}
  * @param structure1 passed through to the delegate
  * @param structure2 passed through to the delegate
  * @return whatever the {@code ProteinSequence}-based overload returns
  * @throws StructureException propagated from the delegate
  * @throws CompoundNotFoundException declared by the {@code ProteinSequence} construction
  */
 public static AFPChain fastaToAfpChain(
     String sequence1, String sequence2, Structure structure1, Structure structure2)
     throws StructureException, CompoundNotFoundException {
   final ProteinSequence firstSequence = new ProteinSequence(sequence1);
   firstSequence.setUserCollection(getAlignedUserCollection(sequence1));

   final ProteinSequence secondSequence = new ProteinSequence(sequence2);
   secondSequence.setUserCollection(getAlignedUserCollection(sequence2));

   return fastaToAfpChain(firstSequence, secondSequence, structure1, structure2);
 }
  /**
   * Collects one {@link ProteinSequence} per element selected by {@code
   * prediction/gene/protein} under the document element of {@code geneidDoc}.
   *
   * <p>For every selected element the text content has all non-word characters (regex {@code
   * \W}) removed and the remainder becomes the sequence. The {@code idGene} attribute of the
   * element's parent node is used both as the sequence accession and as the map key. The number
   * of selected elements is logged at info level.
   *
   * @return the sequences keyed by {@code idGene}; when the same {@code idGene} occurs more than
   *     once the later sequence replaces the earlier one
   * @throws Exception declared broadly; raised by the XML selection or the sequence construction
   */
  public LinkedHashMap<String, ProteinSequence> getProteinSequences() throws Exception {
    ArrayList<Element> proteinNodes =
        XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/protein");
    logger.info("{} hits", proteinNodes.size());

    LinkedHashMap<String, ProteinSequence> sequencesByGeneId =
        new LinkedHashMap<String, ProteinSequence>();
    for (Element proteinNode : proteinNodes) {
      // The enclosing gene element carries the identifier.
      Element geneNode = (Element) proteinNode.getParentNode();
      String geneId = geneNode.getAttribute("idGene");

      // Keep only word characters of the element text.
      String residues = proteinNode.getTextContent().replaceAll("\\W", "");

      ProteinSequence protein = new ProteinSequence(residues);
      protein.setAccession(new AccessionID(geneId));
      sequencesByGeneId.put(geneId, protein);
    }
    return sequencesByGeneId;
  }
Example #5
0
  /** Test of process method, of class FastaReader. */
  @Test
  public void testProcess() throws Exception {
    logger.info("process");
    InputStream inStream = this.getClass().getResourceAsStream("/PF00104_small.fasta");
    assertNotNull(inStream);

    FastaReader<ProteinSequence, AminoAcidCompound> fastaReader =
        new FastaReader<ProteinSequence, AminoAcidCompound>(
            inStream,
            new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
            new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
    LinkedHashMap<String, ProteinSequence> proteinSequences = fastaReader.process();
    inStream.close();

    // Should have 282 sequences
    // logger.debug("Expecting 283 got " + proteinSequences.size());
    assertEquals(proteinSequences.size(), 283);

    int seqNum = 0;
    for (String id : proteinSequences.keySet()) {
      ProteinSequence proteinSequence = proteinSequences.get(id);
      switch (seqNum) {
        case 0:
          assertEquals(proteinSequence.getAccession().getID(), "A2D504_ATEGE/1-46");
          assertEquals(
              proteinSequence.getSequenceAsString(),
              "-----------------FK-N----LP-LED----------------Q----ITL--IQY-----------SWM----------------------CL-SSFA------LSWRSYK---HTNSQFLYFAPDLVF-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");
          break;
        case 281:
          // logger.debug("Get Accession: {}", proteinSequence.getAccession());
          // logger.debug("Get Protein Sequence: {}", proteinSequence.getSequenceAsString());
          assertEquals(proteinSequence.getAccession().getID(), "Q9PU76_CRONI/141-323");
          assertEquals(
              proteinSequence.getSequenceAsString(),
              "VETVTELTEFAKSI-PGFS-N----LD-LND----------------Q----VTL--LKY-----------GVY----------------------EA-IFAM------LASVMNK---DGMPVAYGNGFITRE------------------------------------------------------------------------------------------------------------------------------------------------------------FLKSLRKPFCDIMEPKFDFA-MKF-NSL-E-LDDSDI--------------------SLFVA-AIIC-CGDRPG-------------------------------------------LVNV--GHIEKMQESIVHVLKL-H-----LQN---------NH---PD----------------------------DI------F--------LFP-KLLQKMAD-LRQLV-----------------TEH-AQLV--QIIKK---TESDAHLHPLL-------QEI---");
          break;
        case 282:
          assertEquals(proteinSequence.getAccession().getID(), "Q98SJ1_CHICK/15-61");
          assertEquals(
              proteinSequence.getSequenceAsString(),
              "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------Q-----------------NW------Q--------RFY-QLTKLLDS-MHDVV-----------------ENL-LSFC--FQTFLDKSM--SIEFPEML-------AEI---");
          break;
      }
      seqNum++;
    }
    assertEquals(seqNum, 283);
  }
  /**
   * Test of process method, of class GenbankReader.
   *
   * <p>Obtains sequences in three different ways (proxy reader, reader helper, reader object) and
   * logs what each one yields. Nothing is asserted, so the test fails only when one of the calls
   * throws.
   */
  @Test
  public void testProcess() throws Throwable {
    // ---- Way 1: GenbankProxySequenceReader, addressed by accession ----
    GenbankProxySequenceReader<AminoAcidCompound> proteinProxy =
        new GenbankProxySequenceReader<AminoAcidCompound>(
            System.getProperty("java.io.tmpdir"),
            "NP_000257",
            AminoAcidCompoundSet.getAminoAcidCompoundSet());
    ProteinSequence proxiedProtein = new ProteinSequence(proteinProxy);
    proteinProxy.getHeaderParser().parseHeader(proteinProxy.getHeader(), proxiedProtein);
    logger.info(
        "Sequence({},{}) = {}...",
        proxiedProtein.getAccession(),
        proxiedProtein.getLength(),
        proxiedProtein.getSequenceAsString().substring(0, 10));

    GenbankProxySequenceReader<NucleotideCompound> dnaProxy =
        new GenbankProxySequenceReader<NucleotideCompound>(
            System.getProperty("java.io.tmpdir"), "NM_001126", DNACompoundSet.getDNACompoundSet());
    DNASequence proxiedDna = new DNASequence(dnaProxy);
    dnaProxy.getHeaderParser().parseHeader(dnaProxy.getHeader(), proxiedDna);
    logger.info(
        "Sequence({},{}) = {}...",
        proxiedDna.getAccession(),
        proxiedDna.getLength(),
        proxiedDna.getSequenceAsString().substring(0, 10));

    // ---- Way 2: GenbankReaderHelper, fed from classpath resources ----
    ClasspathResource dnaGenbankFile = new ClasspathResource("NM_000266.gb", true);
    ClasspathResource proteinGenbankFile = new ClasspathResource("BondFeature.gb");

    LinkedHashMap<String, DNASequence> helperDna =
        GenbankReaderHelper.readGenbankDNASequence(dnaGenbankFile.getInputStream());
    for (DNASequence dna : helperDna.values()) {
      logger.info("DNA Sequence: {}", dna.getSequenceAsString());
    }

    LinkedHashMap<String, ProteinSequence> helperProteins =
        GenbankReaderHelper.readGenbankProteinSequence(proteinGenbankFile.getInputStream());
    for (ProteinSequence protein : helperProteins.values()) {
      logger.info("Protein Sequence: {}", protein.getSequenceAsString());
    }

    // ---- Way 3: GenbankReader objects over the same two resources ----
    GenbankReader<DNASequence, NucleotideCompound> dnaParser =
        new GenbankReader<DNASequence, NucleotideCompound>(
            dnaGenbankFile.getInputStream(),
            new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
            new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
    LinkedHashMap<String, DNASequence> parsedDna = dnaParser.process();

    logger.info("DNA Sequence: {}", parsedDna);

    GenbankReader<ProteinSequence, AminoAcidCompound> proteinParser =
        new GenbankReader<ProteinSequence, AminoAcidCompound>(
            proteinGenbankFile.getInputStream(),
            new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
            new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
    LinkedHashMap<String, ProteinSequence> parsedProteins = proteinParser.process();

    logger.info("Protein Sequence: {}", parsedProteins);
  }
Example #7
0
  @Test
  public void processIntTest() throws Exception {
    logger.info("process(int)");
    InputStream inStream = this.getClass().getResourceAsStream("/PF00104_small.fasta");
    assertNotNull(inStream);
    FastaReader<ProteinSequence, AminoAcidCompound> fastaReader =
        new FastaReader<ProteinSequence, AminoAcidCompound>(
            inStream,
            new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
            new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
    LinkedHashMap<String, ProteinSequence> proteinSequences = fastaReader.process(200);

    // Should have 200 sequences
    // logger.debug("Expecting 200 got " + proteinSequences.size());
    assertEquals(proteinSequences.size(), 200);

    int seqNum = 0;
    for (String id : proteinSequences.keySet()) {
      ProteinSequence proteinSequence = proteinSequences.get(id);
      switch (seqNum) {
        case 0:
          assertEquals(proteinSequence.getAccession().getID(), "A2D504_ATEGE/1-46");
          assertEquals(
              proteinSequence.getSequenceAsString(),
              "-----------------FK-N----LP-LED----------------Q----ITL--IQY-----------SWM----------------------CL-SSFA------LSWRSYK---HTNSQFLYFAPDLVF-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");
          break;
        case 199:
          assertEquals(proteinSequence.getAccession().getID(), "Q5F0P7_HUMAN/248-428");
          assertEquals(
              proteinSequence.getSequenceAsString(),
              "DRELVVIIGWAKHI-PGFS-S----LS-LGD----------------Q----MSL--LQS-----------AWM----------------------EI-LILG------IVYRSLP---YDDKLVYAEDYIMD-------------------------------------------------------------------------------------------------------------------------------------------------------------EEHSRLAGLLELYRAILQLV-RRY-KKL-K-VEKEEF--------------------VTLKA-LALA-NSDSMY-------------------------------------------IEDL--EAVQKLQDLLHEALQD-Y-----ELS---------QR---HE----------------------------EP------W--------RTG-KLLLTLPL-LRQTA-----------------AKA-VQHF--YSVKLQGKV--PMH--KLF-------LEM---");
          break;
      }
      seqNum++;
    }
    assertEquals(seqNum, 200);

    // Should have 83 sequences
    proteinSequences = fastaReader.process(200);
    assertEquals(proteinSequences.size(), 83);
    seqNum = 0;
    for (String id : proteinSequences.keySet()) {
      ProteinSequence proteinSequence = proteinSequences.get(id);
      switch (seqNum) {
        case 0:
          assertEquals(proteinSequence.getAccession().getID(), "RARA_CANFA/233-413");
          assertEquals(
              proteinSequence.getSequenceAsString(),
              "TKCIIKTVEFAKQL-PGFT-T----LT-IAD----------------Q----ITL--LKA-----------ACL----------------------DI-LILR------ICTRYTP---EQDTMTFSEGLTLN-------------------------------------------------------------------------------------------------------------------------------------------------------------RTQMHKAGFGPLTDLVFAFA-NQL-LPL-E-MDDAET--------------------GLLSA-ICLI-CGDRQD-------------------------------------------LEQP--DRVDMLQEPLLEALKV-Y-----VRK---------RR---PS----------------------------RP------H--------MFP-KMLMKITD-LRSIS-----------------AKG-AERV--ITLKMEIPG--SMP--PLI-------QEM---");
          break;
        case 81:
          // logger.debug(proteinSequence.getAccession());
          // logger.debug(proteinSequence.getSequenceAsString());
          assertEquals(proteinSequence.getAccession().getID(), "Q9PU76_CRONI/141-323");
          assertEquals(
              proteinSequence.getSequenceAsString(),
              "VETVTELTEFAKSI-PGFS-N----LD-LND----------------Q----VTL--LKY-----------GVY----------------------EA-IFAM------LASVMNK---DGMPVAYGNGFITRE------------------------------------------------------------------------------------------------------------------------------------------------------------FLKSLRKPFCDIMEPKFDFA-MKF-NSL-E-LDDSDI--------------------SLFVA-AIIC-CGDRPG-------------------------------------------LVNV--GHIEKMQESIVHVLKL-H-----LQN---------NH---PD----------------------------DI------F--------LFP-KLLQKMAD-LRQLV-----------------TEH-AQLV--QIIKK---TESDAHLHPLL-------QEI---");
          break;
        case 82:
          assertEquals(proteinSequence.getAccession().getID(), "Q98SJ1_CHICK/15-61");
          assertEquals(
              proteinSequence.getSequenceAsString(),
              "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------Q-----------------NW------Q--------RFY-QLTKLLDS-MHDVV-----------------ENL-LSFC--FQTFLDKSM--SIEFPEML-------AEI---");
          break;
      }
      seqNum++;
    }
    assertEquals(seqNum, 83);
    fastaReader.close();
    inStream.close();
  }
  /**
   * Takes a structure and sequence corresponding to an alignment between a structure or sequence
   * and itself (or even a structure with a sequence), where the result has a circular permutation
   * site {@code cpSite} residues to the right.
   *
   * <p>The method first converts {@code cpSite}, which counts residues, into a shift counted in
   * columns of the gapped {@code second} sequence by skipping over {@code '-'} characters. For a
   * negative {@code cpSite} the columns are scanned from the start of {@code second}; for a
   * positive one they are scanned backwards from column {@code first.getLength() - 1}. {@code
   * second} is then cyclically permuted by that shift, matched against the structure, and the
   * matched residue numbers are permuted back by the opposite shift before the alignment is
   * built.
   *
   * <p>NOTE(review): the backwards scan indexes {@code second} using the length of {@code first},
   * so it presumably assumes both gapped sequences have the same length; confirm against
   * callers.
   *
   * @param first The unpermuted sequence
   * @param second The sequence permuted by cpSite
   * @param structure The structure both sequences are matched against
   * @param cpSite The number of residues from the beginning of the sequence at which the circular
   *     permutation site occurs; can be positive or negative. NOTE(review): earlier documentation
   *     stated that values greater than the length of the sequence are acceptable, but the scan
   *     rejects a site it cannot reach with an {@code IllegalArgumentException}; confirm the
   *     intended contract.
   * @return the result of {@code buildAlignment} for the representative atoms of {@code
   *     structure} (obtained twice) and the residue numbers matched for both sequences
   * @throws IllegalArgumentException if {@code structure}, {@code first} or {@code second} is
   *     {@code null}, or if the scan for {@code cpSite} runs outside {@code second}
   * @throws IllegalStateException if the permuted sequence cannot be turned back into a {@code
   *     ProteinSequence}, which is not expected because its characters come from one
   * @throws StructureException declared by the structure and matching helpers called here
   */
  public static AFPChain cpFastaToAfpChain(
      ProteinSequence first, ProteinSequence second, Structure structure, int cpSite)
      throws StructureException {

    if (structure == null) {
      throw new IllegalArgumentException("The structure is null");
    }

    // Both sequences are dereferenced below, so reject either one being null up front.
    if (first == null || second == null) {
      throw new IllegalArgumentException("The sequence is null");
    }

    // Evaluated once: the gapped text does not change while scanning.
    final String gappedSecond = second.getSequenceAsString();

    // we need to find the ungapped CP site
    int gappedCpShift = 0;
    int ungappedCpShift = 0;
    final int residuesToSkip = Math.abs(cpSite);
    while (ungappedCpShift < residuesToSkip) {
      final int column = cpSite <= 0 ? gappedCpShift : first.getLength() - 1 - gappedCpShift;
      if (column < 0 || column >= gappedSecond.length()) {
        throw new IllegalArgumentException("CP site of " + cpSite + " is wrong");
      }
      // Only non-gap columns count towards the residue shift.
      if (gappedSecond.charAt(column) != '-') {
        ungappedCpShift++;
      }
      gappedCpShift++;
    }

    Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure);
    // can't use cloneCAArray because it doesn't set parent group.chain.structure
    Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure);

    final ProteinSequence antipermuted;
    try {
      antipermuted = new ProteinSequence(SequenceTools.permuteCyclic(gappedSecond, gappedCpShift));
    } catch (CompoundNotFoundException e) {
      // this can't happen, the original sequence comes from a ProteinSequence; fail loudly
      // instead of continuing with a null sequence
      throw new IllegalStateException(
          "Unexpected error while creating protein sequence: "
              + e.getMessage()
              + ". This is most likely a bug.",
          e);
    }

    ResidueNumber[] residues = StructureSequenceMatcher.matchSequenceToStructure(first, structure);
    ResidueNumber[] antipermutedResidues =
        StructureSequenceMatcher.matchSequenceToStructure(antipermuted, structure);

    // Undo the permutation on the matched residue numbers so they line up with `second` again.
    ResidueNumber[] nonpermutedResidues = new ResidueNumber[antipermutedResidues.length];
    SequenceTools.permuteCyclic(antipermutedResidues, nonpermutedResidues, -gappedCpShift);

    // nullify ResidueNumbers that have a lowercase sequence character
    if (first.getUserCollection() != null) {
      CasePreservingProteinSequenceCreator.setLowercaseToNull(first, residues);
    }
    if (second.getUserCollection() != null) {
      CasePreservingProteinSequenceCreator.setLowercaseToNull(second, nonpermutedResidues);
    }

    return buildAlignment(ca1, ca2, residues, nonpermutedResidues);
  }