/**
 * Resolves the protease name captured by {@code patternProteaseName} and fills in the protease
 * record accordingly.
 *
 * <p>When the matcher finds a name, capture group 1 is trimmed, stripped of commas and
 * semicolons, and passed to {@code mapProteasetoLibrairy}; the result of that call is returned.
 * When nothing matches, placeholder values are written to {@code proteasedatabase}, the record
 * is attached to {@code csdatabase}, the placeholders are echoed to standard output and
 * {@code commentS + ";-"} is returned.
 *
 * @param patternProteaseName matcher whose first capture group holds the protease name
 * @param csdatabase cleavage-site entry that receives the protease record when no name is found
 * @param proteasedatabase protease record to populate
 * @param proteaseTaxon taxon forwarded to {@code mapProteasetoLibrairy}
 * @param commentS comment built so far; used as the prefix of the returned comment when no name
 *     is found
 * @return the comment produced by {@code mapProteasetoLibrairy}, or {@code commentS + ";-"} when
 *     the matcher finds nothing
 * @throws IOException declared for the library lookup performed by {@code mapProteasetoLibrairy}
 */
private String getProteaseNameSymbolId(
      Matcher patternProteaseName,
      CsDatabaseEntry csdatabase,
      ProteaseDatabaseEntry proteasedatabase,
      String proteaseTaxon,
      String commentS)
      throws IOException {
    if (!patternProteaseName.find()) {
      // No protease name on the page: fill the record with "not determined" placeholders.
      String notDetermined = "n.d.";
      // The UniProt placeholder is written without the trailing dot, exactly as before.
      String uniprotPlaceholder = "n.d";
      proteasedatabase.setP_NL_Name(notDetermined);
      proteasedatabase.setP_Symbol(notDetermined);
      proteasedatabase.setP_UniprotID(uniprotPlaceholder);
      proteasedatabase.setP_EC_Number(notDetermined);
      csdatabase.setProtease(proteasedatabase);
      // Echo name, symbol, UniProt id and EC number, in that order.
      System.out.println(notDetermined);
      System.out.println(notDetermined);
      System.out.println(uniprotPlaceholder);
      System.out.println(notDetermined);
      return commentS + ";-";
    }

    // Normalise the captured name before the library lookup: trim, then drop ',' and ';'.
    String cleanedName = patternProteaseName.group(1).trim().replace(",", "").replace(";", "");
    return mapProteasetoLibrairy(
        commentS, proteaseTaxon, cleanedName, csdatabase, proteasedatabase);
  }
  /**
   * Scans a tab-separated protease library for rows whose first column matches
   * {@code proteaseName} and copies the identifiers of each matching row into
   * {@code proteasedatabase}.
   *
   * <p>Before scanning, the record's natural-language name is set to {@code proteaseName} and its
   * name, EC number and UniProt id are set to {@code "to check"}; these values remain when no row
   * matches. The first column of each row is compared case-insensitively after removing double
   * quotes, commas and semicolons. For a matching row, column 2 is taken as the UniProt id and
   * column 3 as the EC number:
   *
   * <ul>
   *   <li>if the UniProt id contains {@code "n.d"}, the protein name is set to {@code "n.d."};
   *   <li>otherwise the UniProt XML entry is fetched and the protein name and gene symbol are
   *       filled in from it, and the returned comment becomes {@code commentS + ";-"}.
   * </ul>
   *
   * <p>The reader is consumed to the end and is not closed here. Every matching row is processed,
   * so a later match overwrites an earlier one.
   *
   * @param bReader reader over the tab-separated library
   * @param proteaseName protease name to look up
   * @param csdatabase cleavage-site entry that receives the protease record on a match
   * @param proteasedatabase protease record to populate
   * @param commentS comment built so far; prefix of the returned comment
   * @return {@code commentS + ";-"} when a row with a resolvable UniProt id matched, otherwise
   *     {@code null}
   * @throws IOException if reading from {@code bReader} fails
   */
  private String getProteaseInformation(
      BufferedReader bReader,
      String proteaseName,
      CsDatabaseEntry csdatabase,
      ProteaseDatabaseEntry proteasedatabase,
      String commentS)
      throws IOException {
    String commentP = null;

    // Defaults that stay in place when no library row matches.
    proteasedatabase.setP_NL_Name(proteaseName);
    proteasedatabase.setP_Name("to check");
    proteasedatabase.setP_EC_Number("to check");
    proteasedatabase.setP_UniprotID("to check");

    String line;
    while ((line = bReader.readLine()) != null) {
      String[] columns = line.split("\t");
      String naturallanguage = columns[0].replace("\"", "").replace(",", "").replace(";", "");
      if (!naturallanguage.equalsIgnoreCase(proteaseName)) {
        continue;
      }

      // Column 1 (the library's symbol) is not used: P_Symbol is only ever filled from the
      // UniProt gene name below. NOTE(review): confirm the "n.d" branch should leave it unset.
      String proteaseUniprot = columns[2];
      String proteaseBrenda = columns[3];

      if (proteaseUniprot.contains("n.d")) {
        // No UniProt accession available, so there is nothing to look up.
        // NOTE(review): commentP is left untouched on this path, so the method returns null
        // unless another row matches - confirm callers expect that.
        proteasedatabase.setP_Name("n.d.");
        proteasedatabase.setP_UniprotID(proteaseUniprot);
        proteasedatabase.setP_EC_Number(proteaseBrenda);
        csdatabase.setProtease(proteasedatabase);
      } else {
        String uniprotUrl = "http://www.uniprot.org/uniprot/" + proteaseUniprot + ".xml";
        NodeList entries = getEntries("/uniprot/entry", parseUniprot(uniprotUrl));
        for (int i = 0; i < entries.getLength(); i++) {
          // Both helpers write straight into proteasedatabase (and print what they found).
          getUniProteasepproteinname(entries, i, proteasedatabase);
          getUniProteasegenename(entries, i, proteasedatabase);
        }
        commentP = commentS + ";-";
        proteasedatabase.setP_UniprotID(proteaseUniprot);
        proteasedatabase.setP_EC_Number(proteaseBrenda);
        csdatabase.setProtease(proteasedatabase);
        System.out.println(proteaseUniprot);
        System.out.println(proteaseBrenda);
      }
    }
    return commentP;
  }
 /**
  * Copies the recommended full protein name of entry {@code i} into the protease record.
  *
  * <p>The first value that {@code getInformation} returns for
  * {@code ./protein/recommendedName/fullName/text()} has its commas removed, is printed to
  * standard output and is stored through {@code setP_Name}. Nothing is printed or stored when
  * the returned list is empty.
  *
  * @param entries node list holding the UniProt entries
  * @param i index of the entry to read
  * @param proteasedatabase protease record that receives the name
  */
 private void getUniProteasepproteinname(
     NodeList entries, int i, ProteaseDatabaseEntry proteasedatabase) {
   LinkedList<String> fullNames =
       getInformation("./protein/recommendedName/fullName/text()", entries.item(i));
   if (fullNames.isEmpty()) {
     return;
   }
   // Commas are dropped from the stored name.
   String cleanedName = fullNames.getFirst().replace(",", "");
   System.out.println(cleanedName);
   proteasedatabase.setP_Name(cleanedName);
 }
 /**
  * Copies the first typed gene name of entry {@code i} into the protease record as its symbol.
  *
  * <p>The first value that {@code getInformation} returns for
  * {@code ./gene/name[@type][1]/text()} is printed to standard output and stored through
  * {@code setP_Symbol}.
  *
  * @param entries node list holding the UniProt entries
  * @param i index of the entry to read
  * @param proteasedatabase protease record that receives the symbol
  * @return the gene name that was stored, or {@code null} when the returned list is empty
  */
 private String getUniProteasegenename(
     NodeList entries, int i, ProteaseDatabaseEntry proteasedatabase) {
   LinkedList<String> geneNames =
       getInformation("./gene/name[@type][1]/text()", entries.item(i));
   if (geneNames.isEmpty()) {
     return null;
   }
   String symbol = geneNames.getFirst();
   System.out.println(symbol);
   proteasedatabase.setP_Symbol(symbol);
   return symbol;
 }
  public Main_PDB_Pmap() throws MalformedURLException, IOException {
    PrintStream csvWriter = null;
    LinkedList<CsDatabaseEntry> PmapnotcuratedProteasixDB = new LinkedList<CsDatabaseEntry>();
    java.util.Calendar calendar = java.util.Calendar.getInstance();

    String version = "JUNE2012";

    File f =
        new File(
            "//Users/julieklein/Dropbox/ProteasiX/ProteasiX/ProteasixVersionJune2012/PMAPJUIN2012_5");
    File[] files = f.listFiles();
    for (File file : files) {
      String filepath = "file://" + file.getPath() + "/";
      String htmlcontentmultipleentries = getHtmlcontent(new URL(filepath)).toString();

      Matcher splithtml =
          getPatternmatcher(
              "(<input\\s+id=\"ballot.*?>Detail</a></td>)", htmlcontentmultipleentries);
      String htmlsplitted = putSplittedhtmlintostringbuilder(splithtml);
      Matcher retrievepmapentryid =
          getPatternmatcher("<td><a\\s+href=\"" + "([^\"]+)" + "\"[^>]*>", htmlsplitted);

      while (retrievepmapentryid.find()) {
        String url = retrievepmapentryid.group(1);
        if (url.equalsIgnoreCase("/relation/show/16398")
            || url.equalsIgnoreCase("/relation/show/17178")
            || url.equalsIgnoreCase("/relation/show/17177")
            || url.equalsIgnoreCase("/relation/show/17074")
            || url.equalsIgnoreCase("/relation/show/17458")
            || url.equalsIgnoreCase("/relation/show/17467")
            || url.equalsIgnoreCase("/relation/show/16083")
            || url.equalsIgnoreCase("/relation/show/16082")
            || url.equalsIgnoreCase("/relation/show/16081")
            || url.equalsIgnoreCase("/relation/show/16080")
            || url.equalsIgnoreCase("/relation/show/16398")
            || url.equalsIgnoreCase("/relation/show/16271")) {
        } else {
          url = "http://cutdb.burnham.org" + url;
          String entry = getHtmlcontent(new URL(url)).toString();
          Matcher patternProteaseTaxon =
              getPatternmatcher(
                  "<div\\s+id=\"protdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>"
                      + "([^<]+<td><b>[^<]+</b></td>)?"
                      + "[^<]+</tr>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Organism:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">"
                      + "([^<]+)",
                  entry);
          String proteaseTaxon = getProteaseTaxon(patternProteaseTaxon);
          System.out.println(proteaseTaxon);

          Matcher patternSubstrateTaxon =
              getPatternmatcher(
                  "<div\\s+id=\"sbstdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Organism:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">"
                      + "([^<]+)",
                  entry);
          String substrateTaxon = getSubstrateTaxon(patternSubstrateTaxon);
          System.out.println(substrateTaxon);

          System.out.println("\n" + "******************************" + url);

          Matcher patternSubstrateName =
              getPatternmatcher(
                  "Substrate[^<]+</th>[^<]+<td>[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>"
                      + "([^<]+)",
                  entry);
          SubstrateDatabaseEntry substratedatabase = new SubstrateDatabaseEntry();
          CsDatabaseEntry csdatabase = new CsDatabaseEntry();
          ProteaseDatabaseEntry proteasedatabase = new ProteaseDatabaseEntry();

          substratedatabase.setS_Taxon(substrateTaxon);

          String commentS =
              getSubstrateNameSymbolId(
                  patternSubstrateName, substratedatabase, csdatabase, entry, substrateTaxon);
          System.out.println("out");

          proteasedatabase.setP_Taxon(proteaseTaxon);
          // csdatabase.setSubstrate(substratedatabase);

          Matcher patternProteaseName =
              getPatternmatcher(
                  "<div\\s+id=\"protdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>"
                      + "([^<]+)",
                  entry);
          String commentP =
              getProteaseNameSymbolId(
                  patternProteaseName, csdatabase, proteasedatabase, proteaseTaxon, commentS);

          csdatabase.setComment(commentP);
          System.out.println(commentP);
          csdatabase.setExternal_Link(url);

          Matcher patternCleavagesitePosition =
              getPatternmatcher(
                  "<div\\s+id=\"cleav2\">[^<]+<table>[^<]+<th\\s+class=\"th3\">Position:</th>[^<]+<td>"
                      + "([^<]+)?",
                  entry);
          getCleavagesitePosition(patternCleavagesitePosition, csdatabase);

          Matcher patternCleavagesiteSequence =
              getPatternmatcher(
                  "<div\\s+id=\"cleav\">[^<]+<table>[^<]+<th\\s+class=\"th3\">Sequence:</td>[^<]+<td>"
                      + "([^<]+)?",
                  entry);
          String csSequence = getCleavagesiteSequence(patternCleavagesiteSequence, csdatabase);
          String csSequencenodash = csSequence.replaceAll("-", "");
          csSequencenodash = csSequencenodash.trim();

          Matcher patternPmid =
              getPatternmatcher(
                  "<div\\s+id=\"pubmed\">[^<]+<table>[^<]+<td>[^<]+<a\\s+href=\"[^\"]+\"\\s+target=\"[^\"]+\"\\s+>"
                      + "([^<]+)",
                  entry);
          getPmid(patternPmid, csdatabase);

          Matcher patternErrorUnmatched =
              getPatternmatcher("<td><font\\s+color=\"#FF0000\">" + "(\\*Unmatched)", entry);
          getErrorUnmatched(patternErrorUnmatched, csdatabase);

          SimpleDateFormat format = new SimpleDateFormat("dd-MM-yyyy");
          Calendar originalDate = Calendar.getInstance();
          String dateString = format.format(originalDate.getTime());
          System.out.println(dateString);
          csdatabase.setCreation_Date(dateString);

          if (!substratedatabase.getS_UniprotID().contains("n.d")
              && !substratedatabase.getS_UniprotID().equalsIgnoreCase("to check")
              && !csdatabase.getP1_Sequence().equalsIgnoreCase("?")
              && !csdatabase.getP1prime_Sequence().equalsIgnoreCase("?")
              && !(csdatabase.getP1_Position() == 0)
              && !(csdatabase.getP1prime_Position() == 0)
              && !csdatabase.getCuration_Status().contains("discarded")) {
            String motif = csdatabase.getP1_Sequence() + "-" + csdatabase.getP1prime_Sequence();
            int motifposition = csSequence.indexOf(motif) + 1;

            NodeList entries =
                getEntries(
                    "/uniprot/entry/sequence/text()",
                    parseUniprot(
                        "http://www.uniprot.org/uniprot/"
                            + substratedatabase.getS_UniprotID()
                            + ".xml"));
            for (int i = 0; i < entries.getLength(); i++) {
              String sequence = getUniSubstratesequence(entries, i, substratedatabase);

              int startcs = 0;

              if (csSequence.startsWith("---")) {
                startcs = csdatabase.getP1_Position() - 1;
              } else if (csSequence.startsWith("--")) {
                startcs = csdatabase.getP1_Position() - 2;
              } else if (csSequence.startsWith("-")) {
                startcs = csdatabase.getP1_Position() - 3;
              } else {
                startcs = csdatabase.getP1_Position() - 4;
              }

              int length = csSequencenodash.length();

              if ((startcs + length) < sequence.length()
                  || (startcs + length) == sequence.length()) {

                String csonsequence = sequence.substring(startcs, startcs + length);
                System.out.println("CS ON SEQUENCE " + csonsequence);

                if (!csonsequence.equals(csSequencenodash)) {
                  if (sequence.contains(csSequencenodash)) {
                    int cleavagesiteposition = sequence.indexOf(csSequencenodash);
                    int newp1 = motifposition + cleavagesiteposition;
                    int newp1prime = newp1 + 1;
                    csdatabase.setP1_Position(newp1);
                    csdatabase.setP1prime_Position(newp1prime);
                    System.out.println(newp1);
                    System.out.println(newp1prime);
                    csdatabase.setCuration_Status(
                        "Cleavage site curated based on Uniprot protein sequence");
                    System.out.println("Cleavage site curated based on Uniprot protein sequence");
                  } else {
                    csdatabase.setCuration_Status(
                        "Unmatched cleavage site; Cleavage site discarded");
                    System.out.println("Unmatched cleavage site; Cleavage site discarded");
                  }
                }
              } else {

                if (sequence.contains(csSequencenodash)) {
                  int cleavagesiteposition = sequence.indexOf(csSequencenodash);
                  int newp1 = motifposition + cleavagesiteposition;
                  int newp1prime = newp1 + 1;
                  if (newp1 == csdatabase.getP1_Position()
                      && newp1prime == csdatabase.getP1prime_Position()) {
                    continue;
                  } else {
                    csdatabase.setP1_Position(newp1);
                    csdatabase.setP1prime_Position(newp1prime);
                    System.out.println(newp1);
                    System.out.println(newp1prime);
                    csdatabase.setCuration_Status(
                        "Cleavage site curated based on Uniprot protein sequence");
                    System.out.println("Cleavage site curated based on Uniprot protein sequence");
                  }
                }
              }
            }

            PmapnotcuratedProteasixDB.add(csdatabase);

          } else if (!substratedatabase.getS_UniprotID().contains("n.d")
              && !substratedatabase.getS_UniprotID().equalsIgnoreCase("to check")
              && !csdatabase.getP1_Sequence().equalsIgnoreCase("?")
              && !csdatabase.getP1prime_Sequence().equalsIgnoreCase("?")
              && csdatabase.getP1_Position() == 0
              && csdatabase.getP1prime_Position() == 0
              && !csdatabase.getCuration_Status().contains("discarded")) {

            String motif = csdatabase.getP1_Sequence() + "-" + csdatabase.getP1prime_Sequence();
            int motifposition = csSequence.indexOf(motif) + 1;

            NodeList entries =
                getEntries(
                    "/uniprot/entry/sequence/text()",
                    parseUniprot(
                        "http://www.uniprot.org/uniprot/"
                            + substratedatabase.getS_UniprotID()
                            + ".xml"));
            for (int i = 0; i < entries.getLength(); i++) {
              String sequence = getUniSubstratesequence(entries, i, substratedatabase);
              if (sequence.contains(csSequencenodash)) {
                int cleavagesiteposition = sequence.indexOf(csSequencenodash);
                int newp1 = motifposition + cleavagesiteposition;
                int newp1prime = newp1 + 1;
                if (newp1 == csdatabase.getP1_Position()
                    && newp1prime == csdatabase.getP1prime_Position()) {
                  continue;
                } else {
                  csdatabase.setP1_Position(newp1);
                  csdatabase.setP1prime_Position(newp1prime);
                  System.out.println(newp1);
                  System.out.println(newp1prime);
                  csdatabase.setCuration_Status(
                      "Cleavage site curated based on Uniprot protein sequence");
                  System.out.println("Cleavage site curated based on Uniprot protein sequence");
                }

              } else {
                csdatabase.setCuration_Status("Unmatched cleavage site; Cleavage site discarded");
                System.out.println("Unmatched cleavage site; Cleavage site discarded");
              }
            }
            PmapnotcuratedProteasixDB.add(csdatabase);

          } else {
            PmapnotcuratedProteasixDB.add(csdatabase);
          }
        }
      }
    }
    try {
      System.out.println("-----------------");
      csvWriter = new PrintStream("Pmap5notcuratedProteasixDB" + "_" + version + ".csv");
      // populateHeaders(csvWriter);
      for (CsDatabaseEntry csDatabaseEntry : PmapnotcuratedProteasixDB) {
        System.out.println(csDatabaseEntry.getExternal_Link());
        populateData(csvWriter, csDatabaseEntry);
        System.out.println("OK");
      }

    } catch (FileNotFoundException ex) {
      Logger.getLogger(Main_PDB_Pmap.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
      csvWriter.close();
    }
  }