/**
 * Extracts the substrate symbol from the given matcher and records it on the substrate entry.
 *
 * <p>If the matcher finds a match, its second capture group is used as the symbol; otherwise the
 * placeholder {@code "n.d."} is used. In both cases the value is written with
 * {@code setS_Symbol}.
 *
 * @param patternSubstrateSymbol matcher positioned on the page text; advanced by one
 *     {@code find()} call
 * @param substratedatabase substrate entry that receives the symbol
 * @return the symbol that was stored on the entry
 */
private String getSubstrateSymbol(
    Matcher patternSubstrateSymbol, SubstrateDatabaseEntry substratedatabase) {
  final String resolvedSymbol =
      patternSubstrateSymbol.find() ? patternSubstrateSymbol.group(2) : "n.d.";
  substratedatabase.setS_Symbol(resolvedSymbol);
  return resolvedSymbol;
}
 private String getSubstrateAccession(
     Matcher patternSubstrateAccession, SubstrateDatabaseEntry subtratedatabase) {
   String accession = null;
   if (patternSubstrateAccession.find()) {
     accession = patternSubstrateAccession.group(1);
     accession = accession.trim();
     subtratedatabase.setS_UniprotID(accession);
     // System.out.println(accession);
   } else {
     accession = "n.d.";
     subtratedatabase.setS_UniprotID(accession);
     // System.out.println(accession);
   }
   return accession;
 }
 private String getUniSubstrategenename(
     NodeList entries, int i, SubstrateDatabaseEntry substratedatabase) {
   // GET SUBSTRATE GENE NAME using getInformation method
   LinkedList<String> genenamelist =
       getInformation("./gene/name[@type][1]/text()", entries.item(i));
   String genename = null;
   if (!genenamelist.isEmpty()) {
     genename = genenamelist.getFirst();
     System.out.println(genename);
     substratedatabase.setS_Symbol(genename);
   }
   return genename;
 }
 private void getUniSubstratepproteinname(
     NodeList entries, int i, SubstrateDatabaseEntry substratedatabase) {
   // GET SUBSTRATE PROTEIN NAME using getInformation method
   LinkedList<String> protnamelist =
       getInformation("./protein/recommendedName/fullName/text()", entries.item(i));
   String protname = null;
   if (!protnamelist.isEmpty()) {
     protname = protnamelist.getFirst();
     protname = protname.replaceAll(",", "");
     System.out.println(protname);
     substratedatabase.setS_Name(protname);
   }
 }
  private String getSubstrateNameSymbolId(
      Matcher patternSubstrateName,
      SubstrateDatabaseEntry substratedatabase,
      CsDatabaseEntry csdatabase,
      String entry,
      String substrateTaxon)
      throws IOException {
    String commentS = null;
    Matcher patternSubstrateAccession =
        getPatternmatcher(
            "UniProt\\s+Accession:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">"
                + "([^<]+)",
            entry);
    String accession = getSubstrateAccession(patternSubstrateAccession, substratedatabase);

    Matcher patternSubstrateSymbol =
        getPatternmatcher(
            "Substrate[^<]+</th>[^<]+<td>[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>"
                + "[^<]+"
                + "</b></td>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Symbol:</th>"
                + "([^<]+<td><b>)?([^<]+)",
            entry);
    String symbol = getSubstrateSymbol(patternSubstrateSymbol, substratedatabase);

    if (patternSubstrateName.find()) {
      String Substratename = "to check";
      String Substratesymbol = "to check";
      String Substrateaccession = "to check";
      substratedatabase.setS_NL_Name(Substratename);
      substratedatabase.setS_Name(Substratename);
      substratedatabase.setS_Symbol(Substratesymbol);
      substratedatabase.setS_UniprotID(Substrateaccession);

      Substratename = patternSubstrateName.group(1);
      Substratename = Substratename.trim();
      Substratename = Substratename.replaceAll(",", "");
      Substratename = Substratename.replaceAll(";", "");
      substratedatabase.setS_NL_Name(Substratename);
      commentS = "Check Substrate Symbol and Accession; add to Substrate Librairy";

      BufferedReader bReader = null;
      if (substrateTaxon.contains("H**o")) {
        bReader =
            createBufferedreader(
                "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateHSALibrairy.txt");
      } else if (substrateTaxon.contains("Mus")) {
        bReader =
            createBufferedreader(
                "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateMMULibrairy.txt");
      } else if (substrateTaxon.contains("Rattus")) {
        bReader =
            createBufferedreader(
                "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateRNOLibrairy.txt");
      }
      String line;
      while ((line = bReader.readLine()) != null) {
        String splitarray[] = line.split("\t");
        String naturallanguage = splitarray[1];
        naturallanguage = naturallanguage.replaceAll("\"", "");
        naturallanguage = naturallanguage.replaceAll(",", "");
        naturallanguage = naturallanguage.replaceAll(";", "");
        if (naturallanguage.equalsIgnoreCase(Substratename)) {
          Substratesymbol = splitarray[0];
          Substratesymbol = Substratesymbol.replaceAll("sept-0", "SEPT");
          Substrateaccession = splitarray[2];

          if (Substrateaccession.contains("n.d.")) {
            substratedatabase.setS_Name("n.d.");
            substratedatabase.setS_UniprotID(Substrateaccession);
            substratedatabase.setS_Symbol(Substratesymbol);

          } else {
            String UniprotURL = "http://www.uniprot.org/uniprot/" + Substrateaccession + ".xml";
            NodeList entries = getEntries("/uniprot/entry", parseUniprot(UniprotURL));
            for (int i = 0; i < entries.getLength(); i++) {
              getUniSubstratepproteinname(entries, i, substratedatabase);
              String genename = getUniSubstrategenename(entries, i, substratedatabase);
            }
            // System.out.println(Substrateaccession);
            substratedatabase.setS_UniprotID(Substrateaccession);
            commentS = "-";
            System.out.println(commentS);
          }
        }
        csdatabase.setSubstrate(substratedatabase);
      }

    } else if (!symbol.contains("n.d.")) {
      String Substratename = "to check";
      String Substratesymbol = "to check";
      String Substrateaccession = "to check";
      substratedatabase.setS_NL_Name(Substratename);
      substratedatabase.setS_Name(Substratename);
      substratedatabase.setS_Symbol(Substratesymbol);
      substratedatabase.setS_UniprotID(Substrateaccession);

      BufferedReader bReader = null;
      if (substrateTaxon.contains("H**o")) {
        bReader =
            createBufferedreader(
                "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateHSALibrairy.txt");
      } else if (substrateTaxon.contains("Mus")) {
        bReader =
            createBufferedreader(
                "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateMMULibrairy.txt");
      } else if (substrateTaxon.contains("Rattus")) {
        bReader =
            createBufferedreader(
                "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateRNOLibrairy.txt");
      }
      String line;
      while ((line = bReader.readLine()) != null) {
        String splitarray[] = line.split("\t");
        String librairisymbol = splitarray[0];
        librairisymbol = librairisymbol.replaceAll("\"", "");
        librairisymbol = librairisymbol.replaceAll("sept-0", "SEPT");
        if (librairisymbol.equals(symbol)) {
          Substrateaccession = splitarray[2];

          if (Substrateaccession.contains("n.d.")) {
            substratedatabase.setS_Name("n.d.");
            substratedatabase.setS_UniprotID(Substrateaccession);
            substratedatabase.setS_Symbol(symbol);

          } else {
            String UniprotURL = "http://www.uniprot.org/uniprot/" + Substrateaccession + ".xml";
            NodeList entries = getEntries("/uniprot/entry", parseUniprot(UniprotURL));
            for (int i = 0; i < entries.getLength(); i++) {
              getUniSubstratepproteinname(entries, i, substratedatabase);
              String genename = getUniSubstrategenename(entries, i, substratedatabase);
            }
            // System.out.println(Substrateaccession);
            substratedatabase.setS_UniprotID(Substrateaccession);
            commentS = "-";
            System.out.println(commentS);
          }
        }
        csdatabase.setSubstrate(substratedatabase);
      }

    } else {
      String Substratename = "n.d.";
      String Substratesymbol = "n.d.";
      String Substrateaccession = "n.d";
      substratedatabase.setS_NL_Name(Substratename);
      substratedatabase.setS_Name(Substratename);
      substratedatabase.setS_Symbol(Substratesymbol);
      substratedatabase.setS_UniprotID(Substrateaccession);
      System.out.println(Substratename);
      System.out.println(Substratesymbol);
      System.out.println(Substrateaccession);
      commentS = "-";
      System.out.println(commentS);
      csdatabase.setSubstrate(substratedatabase);
    }
    return commentS;
  }
  /**
   * Runs the whole PMAP extraction: reads the saved listing pages from a fixed local directory,
   * downloads each linked relation page from {@code http://cutdb.burnham.org}, builds one
   * {@code CsDatabaseEntry} per relation, checks the cleavage-site position against the UniProt
   * sequence where possible, and writes all entries to
   * {@code Pmap5notcuratedProteasixDB_JUNE2012.csv} in the working directory.
   *
   * @throws MalformedURLException if a listing or relation URL cannot be built
   * @throws IOException if the input directory cannot be listed or a page cannot be read
   */
  public Main_PDB_Pmap() throws MalformedURLException, IOException {
    PrintStream csvWriter = null;
    LinkedList<CsDatabaseEntry> PmapnotcuratedProteasixDB = new LinkedList<CsDatabaseEntry>();

    String version = "JUNE2012";

    File f =
        new File(
            "//Users/julieklein/Dropbox/ProteasiX/ProteasiX/ProteasixVersionJune2012/PMAPJUIN2012_5");
    File[] files = f.listFiles();
    if (files == null) {
      // listFiles() returns null when the path is not a directory or cannot be read.
      throw new FileNotFoundException("PMAP input directory cannot be listed: " + f.getPath());
    }
    for (File file : files) {
      String filepath = "file://" + file.getPath() + "/";
      String htmlcontentmultipleentries = getHtmlcontent(new URL(filepath)).toString();

      Matcher splithtml =
          getPatternmatcher(
              "(<input\\s+id=\"ballot.*?>Detail</a></td>)", htmlcontentmultipleentries);
      String htmlsplitted = putSplittedhtmlintostringbuilder(splithtml);
      Matcher retrievepmapentryid =
          getPatternmatcher("<td><a\\s+href=\"" + "([^\"]+)" + "\"[^>]*>", htmlsplitted);

      while (retrievepmapentryid.find()) {
        String url = retrievepmapentryid.group(1);

        // Relations that are deliberately left out of the export.
        boolean excludedRelation =
            url.equalsIgnoreCase("/relation/show/16398")
                || url.equalsIgnoreCase("/relation/show/17178")
                || url.equalsIgnoreCase("/relation/show/17177")
                || url.equalsIgnoreCase("/relation/show/17074")
                || url.equalsIgnoreCase("/relation/show/17458")
                || url.equalsIgnoreCase("/relation/show/17467")
                || url.equalsIgnoreCase("/relation/show/16083")
                || url.equalsIgnoreCase("/relation/show/16082")
                || url.equalsIgnoreCase("/relation/show/16081")
                || url.equalsIgnoreCase("/relation/show/16080")
                || url.equalsIgnoreCase("/relation/show/16271");
        if (excludedRelation) {
          continue;
        }

        url = "http://cutdb.burnham.org" + url;
        String entry = getHtmlcontent(new URL(url)).toString();
        Matcher patternProteaseTaxon =
            getPatternmatcher(
                "<div\\s+id=\"protdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>"
                    + "([^<]+<td><b>[^<]+</b></td>)?"
                    + "[^<]+</tr>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Organism:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">"
                    + "([^<]+)",
                entry);
        String proteaseTaxon = getProteaseTaxon(patternProteaseTaxon);
        System.out.println(proteaseTaxon);

        Matcher patternSubstrateTaxon =
            getPatternmatcher(
                "<div\\s+id=\"sbstdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Organism:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">"
                    + "([^<]+)",
                entry);
        String substrateTaxon = getSubstrateTaxon(patternSubstrateTaxon);
        System.out.println(substrateTaxon);

        System.out.println("\n" + "******************************" + url);

        Matcher patternSubstrateName =
            getPatternmatcher(
                "Substrate[^<]+</th>[^<]+<td>[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>"
                    + "([^<]+)",
                entry);
        SubstrateDatabaseEntry substratedatabase = new SubstrateDatabaseEntry();
        CsDatabaseEntry csdatabase = new CsDatabaseEntry();
        ProteaseDatabaseEntry proteasedatabase = new ProteaseDatabaseEntry();

        substratedatabase.setS_Taxon(substrateTaxon);

        String commentS =
            getSubstrateNameSymbolId(
                patternSubstrateName, substratedatabase, csdatabase, entry, substrateTaxon);
        System.out.println("out");

        proteasedatabase.setP_Taxon(proteaseTaxon);

        Matcher patternProteaseName =
            getPatternmatcher(
                "<div\\s+id=\"protdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>"
                    + "([^<]+)",
                entry);
        String commentP =
            getProteaseNameSymbolId(
                patternProteaseName, csdatabase, proteasedatabase, proteaseTaxon, commentS);

        csdatabase.setComment(commentP);
        System.out.println(commentP);
        csdatabase.setExternal_Link(url);

        Matcher patternCleavagesitePosition =
            getPatternmatcher(
                "<div\\s+id=\"cleav2\">[^<]+<table>[^<]+<th\\s+class=\"th3\">Position:</th>[^<]+<td>"
                    + "([^<]+)?",
                entry);
        getCleavagesitePosition(patternCleavagesitePosition, csdatabase);

        Matcher patternCleavagesiteSequence =
            getPatternmatcher(
                "<div\\s+id=\"cleav\">[^<]+<table>[^<]+<th\\s+class=\"th3\">Sequence:</td>[^<]+<td>"
                    + "([^<]+)?",
                entry);
        String csSequence = getCleavagesiteSequence(patternCleavagesiteSequence, csdatabase);
        String csSequencenodash = csSequence.replaceAll("-", "");
        csSequencenodash = csSequencenodash.trim();

        Matcher patternPmid =
            getPatternmatcher(
                "<div\\s+id=\"pubmed\">[^<]+<table>[^<]+<td>[^<]+<a\\s+href=\"[^\"]+\"\\s+target=\"[^\"]+\"\\s+>"
                    + "([^<]+)",
                entry);
        getPmid(patternPmid, csdatabase);

        Matcher patternErrorUnmatched =
            getPatternmatcher("<td><font\\s+color=\"#FF0000\">" + "(\\*Unmatched)", entry);
        getErrorUnmatched(patternErrorUnmatched, csdatabase);

        SimpleDateFormat format = new SimpleDateFormat("dd-MM-yyyy");
        Calendar originalDate = Calendar.getInstance();
        String dateString = format.format(originalDate.getTime());
        System.out.println(dateString);
        csdatabase.setCreation_Date(dateString);

        if (!substratedatabase.getS_UniprotID().contains("n.d")
            && !substratedatabase.getS_UniprotID().equalsIgnoreCase("to check")
            && !csdatabase.getP1_Sequence().equalsIgnoreCase("?")
            && !csdatabase.getP1prime_Sequence().equalsIgnoreCase("?")
            && !(csdatabase.getP1_Position() == 0)
            && !(csdatabase.getP1prime_Position() == 0)
            && !csdatabase.getCuration_Status().contains("discarded")) {
          // Known accession and a stated position: check the position against UniProt.
          String motif = csdatabase.getP1_Sequence() + "-" + csdatabase.getP1prime_Sequence();
          // 1-based offset of the P1 residue inside the dashed cleavage-site string.
          int motifposition = csSequence.indexOf(motif) + 1;

          NodeList entries =
              getEntries(
                  "/uniprot/entry/sequence/text()",
                  parseUniprot(
                      "http://www.uniprot.org/uniprot/"
                          + substratedatabase.getS_UniprotID()
                          + ".xml"));
          for (int i = 0; i < entries.getLength(); i++) {
            String sequence = getUniSubstratesequence(entries, i, substratedatabase);

            // Expected 0-based start of the site in the protein; the offset shrinks with each
            // leading dash. Presumably dashes pad sites near the N-terminus - TODO confirm.
            int startcs = 0;

            if (csSequence.startsWith("---")) {
              startcs = csdatabase.getP1_Position() - 1;
            } else if (csSequence.startsWith("--")) {
              startcs = csdatabase.getP1_Position() - 2;
            } else if (csSequence.startsWith("-")) {
              startcs = csdatabase.getP1_Position() - 3;
            } else {
              startcs = csdatabase.getP1_Position() - 4;
            }

            int length = csSequencenodash.length();

            // A negative start used to reach substring() and abort the run with
            // StringIndexOutOfBoundsException; it is now handled like a window past the end.
            if (startcs >= 0 && (startcs + length) <= sequence.length()) {

              String csonsequence = sequence.substring(startcs, startcs + length);
              System.out.println("CS ON SEQUENCE " + csonsequence);

              if (!csonsequence.equals(csSequencenodash)) {
                if (sequence.contains(csSequencenodash)) {
                  int cleavagesiteposition = sequence.indexOf(csSequencenodash);
                  int newp1 = motifposition + cleavagesiteposition;
                  int newp1prime = newp1 + 1;
                  csdatabase.setP1_Position(newp1);
                  csdatabase.setP1prime_Position(newp1prime);
                  System.out.println(newp1);
                  System.out.println(newp1prime);
                  csdatabase.setCuration_Status(
                      "Cleavage site curated based on Uniprot protein sequence");
                  System.out.println("Cleavage site curated based on Uniprot protein sequence");
                } else {
                  csdatabase.setCuration_Status(
                      "Unmatched cleavage site; Cleavage site discarded");
                  System.out.println("Unmatched cleavage site; Cleavage site discarded");
                }
              }
            } else {
              // Stated position does not fit the sequence: search for the site instead.
              if (sequence.contains(csSequencenodash)) {
                int cleavagesiteposition = sequence.indexOf(csSequencenodash);
                int newp1 = motifposition + cleavagesiteposition;
                int newp1prime = newp1 + 1;
                if (!(newp1 == csdatabase.getP1_Position()
                    && newp1prime == csdatabase.getP1prime_Position())) {
                  csdatabase.setP1_Position(newp1);
                  csdatabase.setP1prime_Position(newp1prime);
                  System.out.println(newp1);
                  System.out.println(newp1prime);
                  csdatabase.setCuration_Status(
                      "Cleavage site curated based on Uniprot protein sequence");
                  System.out.println("Cleavage site curated based on Uniprot protein sequence");
                }
              }
            }
          }

          PmapnotcuratedProteasixDB.add(csdatabase);

        } else if (!substratedatabase.getS_UniprotID().contains("n.d")
            && !substratedatabase.getS_UniprotID().equalsIgnoreCase("to check")
            && !csdatabase.getP1_Sequence().equalsIgnoreCase("?")
            && !csdatabase.getP1prime_Sequence().equalsIgnoreCase("?")
            && csdatabase.getP1_Position() == 0
            && csdatabase.getP1prime_Position() == 0
            && !csdatabase.getCuration_Status().contains("discarded")) {
          // Known accession but no stated position: derive it from the UniProt sequence.
          String motif = csdatabase.getP1_Sequence() + "-" + csdatabase.getP1prime_Sequence();
          int motifposition = csSequence.indexOf(motif) + 1;

          NodeList entries =
              getEntries(
                  "/uniprot/entry/sequence/text()",
                  parseUniprot(
                      "http://www.uniprot.org/uniprot/"
                          + substratedatabase.getS_UniprotID()
                          + ".xml"));
          for (int i = 0; i < entries.getLength(); i++) {
            String sequence = getUniSubstratesequence(entries, i, substratedatabase);
            if (sequence.contains(csSequencenodash)) {
              int cleavagesiteposition = sequence.indexOf(csSequencenodash);
              int newp1 = motifposition + cleavagesiteposition;
              int newp1prime = newp1 + 1;
              if (!(newp1 == csdatabase.getP1_Position()
                  && newp1prime == csdatabase.getP1prime_Position())) {
                csdatabase.setP1_Position(newp1);
                csdatabase.setP1prime_Position(newp1prime);
                System.out.println(newp1);
                System.out.println(newp1prime);
                csdatabase.setCuration_Status(
                    "Cleavage site curated based on Uniprot protein sequence");
                System.out.println("Cleavage site curated based on Uniprot protein sequence");
              }

            } else {
              csdatabase.setCuration_Status("Unmatched cleavage site; Cleavage site discarded");
              System.out.println("Unmatched cleavage site; Cleavage site discarded");
            }
          }
          PmapnotcuratedProteasixDB.add(csdatabase);

        } else {
          // Nothing to verify against: export the entry as extracted.
          PmapnotcuratedProteasixDB.add(csdatabase);
        }
      }
    }
    try {
      System.out.println("-----------------");
      csvWriter = new PrintStream("Pmap5notcuratedProteasixDB" + "_" + version + ".csv");
      // populateHeaders(csvWriter);
      for (CsDatabaseEntry csDatabaseEntry : PmapnotcuratedProteasixDB) {
        System.out.println(csDatabaseEntry.getExternal_Link());
        populateData(csvWriter, csDatabaseEntry);
        System.out.println("OK");
      }

    } catch (FileNotFoundException ex) {
      Logger.getLogger(Main_PDB_Pmap.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
      // The writer is still null when the PrintStream constructor itself failed.
      if (csvWriter != null) {
        csvWriter.close();
      }
    }
  }