/**
 * Resolves the protease of a PMAP entry from the "Definition" match on the entry page.
 *
 * <p>When the matcher finds a name, the name is trimmed, stripped of commas and semicolons and
 * handed to {@code mapProteasetoLibrairy}, whose result is returned. When nothing matches, the
 * protease entry is filled with "not determined" placeholders, attached to {@code csdatabase},
 * and {@code commentS + ";-"} is returned.
 *
 * @param patternProteaseName matcher whose group 1 captures the protease name
 * @param csdatabase cleavage-site entry that receives the protease entry
 * @param proteasedatabase protease entry to populate
 * @param proteaseTaxon protease organism, forwarded to the library lookup
 * @param commentS comment produced while resolving the substrate
 * @return the comment to store on the cleavage-site entry
 * @throws IOException propagated from {@code mapProteasetoLibrairy}
 */
private String getProteaseNameSymbolId(
    Matcher patternProteaseName,
    CsDatabaseEntry csdatabase,
    ProteaseDatabaseEntry proteasedatabase,
    String proteaseTaxon,
    String commentS)
    throws IOException {
  if (patternProteaseName.find()) {
    String cleanedName =
        patternProteaseName.group(1).trim().replace(",", "").replace(";", "");
    return mapProteasetoLibrairy(
        commentS, proteaseTaxon, cleanedName, csdatabase, proteasedatabase);
  }

  // No protease name on the page: record placeholders. Note that the UniProt placeholder is
  // written without the trailing dot, exactly as before.
  String unknownName = "n.d.";
  String unknownSymbol = "n.d.";
  String unknownUniprot = "n.d";
  String unknownEcNumber = "n.d.";

  proteasedatabase.setP_NL_Name(unknownName);
  proteasedatabase.setP_Symbol(unknownSymbol);
  proteasedatabase.setP_UniprotID(unknownUniprot);
  proteasedatabase.setP_EC_Number(unknownEcNumber);
  csdatabase.setProtease(proteasedatabase);

  System.out.println(unknownName);
  System.out.println(unknownSymbol);
  System.out.println(unknownUniprot);
  System.out.println(unknownEcNumber);

  return commentS + ";-";
}
/**
 * Looks up {@code proteaseName} in a tab-separated mapping and fills the protease entry.
 *
 * <p>The entry first receives "to check" placeholders for name, EC number and UniProt id. Each
 * line of {@code bReader} is then split on tabs; column 0 (with double quotes, commas and
 * semicolons removed) is compared case-insensitively with {@code proteaseName}. For a matching
 * row, column 2 is used as the UniProt accession and column 3 as the EC number:
 *
 * <ul>
 *   <li>if the accession contains {@code "n.d"}, the name is set to {@code "n.d."} and no
 *       UniProt lookup is made;
 *   <li>otherwise the UniProt XML record is fetched and the protein/gene names are copied from
 *       it, and the returned comment becomes {@code commentS + ";-"}.
 * </ul>
 *
 * <p>Reading continues to the end of the stream, so a later matching row overrides an earlier
 * one. Rows that are empty or that match but carry fewer than four columns are skipped instead
 * of aborting the run; in that case the "to check" placeholders stay in place.
 *
 * @param bReader reader over the mapping file; not closed here
 * @param proteaseName cleaned protease name to look up
 * @param csdatabase cleavage-site entry that receives the protease entry
 * @param proteasedatabase protease entry to populate
 * @param commentS comment produced while resolving the substrate
 * @return {@code commentS + ";-"} if a row with a usable UniProt accession matched, else
 *     {@code null}
 * @throws IOException if the mapping cannot be read or the UniProt lookup fails
 */
private String getProteaseInformation(
    BufferedReader bReader,
    String proteaseName,
    CsDatabaseEntry csdatabase,
    ProteaseDatabaseEntry proteasedatabase,
    String commentS)
    throws IOException {
  String commentP = null;

  proteasedatabase.setP_NL_Name(proteaseName);
  proteasedatabase.setP_Name("to check");
  proteasedatabase.setP_EC_Number("to check");
  proteasedatabase.setP_UniprotID("to check");

  String line;
  while ((line = bReader.readLine()) != null) {
    String[] columns = line.split("\t");
    if (columns.length == 0) {
      // A line made only of tabs splits into an empty array; there is nothing to compare.
      continue;
    }

    String naturallanguage = columns[0].replace("\"", "").replace(",", "").replace(";", "");
    if (!naturallanguage.equalsIgnoreCase(proteaseName)) {
      continue;
    }

    if (columns.length < 4) {
      // split() drops trailing empty fields, so a row with blank UniProt/EC cells is shorter
      // than expected. Skip it rather than fail with ArrayIndexOutOfBoundsException.
      System.err.println(
          "Skipping incomplete protease mapping row for '" + proteaseName + "': " + line);
      continue;
    }

    // NOTE(review): column 1 (the symbol) used to be read and normalised here but was never
    // stored; P_Symbol is only set from the UniProt gene name below. Confirm that is intended.
    String proteaseUniprot = columns[2];
    String proteaseBrenda = columns[3];

    if (proteaseUniprot.contains("n.d")) {
      proteasedatabase.setP_Name("n.d.");
      proteasedatabase.setP_UniprotID(proteaseUniprot);
      proteasedatabase.setP_EC_Number(proteaseBrenda);
      csdatabase.setProtease(proteasedatabase);
    } else {
      String uniprotUrl = "http://www.uniprot.org/uniprot/" + proteaseUniprot + ".xml";
      NodeList entries = getEntries("/uniprot/entry", parseUniprot(uniprotUrl));
      for (int i = 0; i < entries.getLength(); i++) {
        getUniProteasepproteinname(entries, i, proteasedatabase);
        getUniProteasegenename(entries, i, proteasedatabase);
      }
      commentP = commentS + ";-";
      proteasedatabase.setP_UniprotID(proteaseUniprot);
      proteasedatabase.setP_EC_Number(proteaseBrenda);
      csdatabase.setProtease(proteasedatabase);
      System.out.println(proteaseUniprot);
      System.out.println(proteaseBrenda);
    }
  }
  return commentP;
}
private void getUniProteasepproteinname( NodeList entries, int i, ProteaseDatabaseEntry proteasedatabase) { // GET SUBSTRATE PROTEIN NAME using getInformation method LinkedList<String> protnamelist = getInformation("./protein/recommendedName/fullName/text()", entries.item(i)); String protname = null; if (!protnamelist.isEmpty()) { protname = protnamelist.getFirst(); protname = protname.replaceAll(",", ""); System.out.println(protname); proteasedatabase.setP_Name(protname); } }
private String getUniProteasegenename( NodeList entries, int i, ProteaseDatabaseEntry proteasedatabase) { // GET SUBSTRATE GENE NAME using getInformation method LinkedList<String> genenamelist = getInformation("./gene/name[@type][1]/text()", entries.item(i)); String genename = null; if (!genenamelist.isEmpty()) { genename = genenamelist.getFirst(); System.out.println(genename); proteasedatabase.setP_Symbol(genename); } return genename; }
public Main_PDB_Pmap() throws MalformedURLException, IOException { PrintStream csvWriter = null; LinkedList<CsDatabaseEntry> PmapnotcuratedProteasixDB = new LinkedList<CsDatabaseEntry>(); java.util.Calendar calendar = java.util.Calendar.getInstance(); String version = "JUNE2012"; File f = new File( "//Users/julieklein/Dropbox/ProteasiX/ProteasiX/ProteasixVersionJune2012/PMAPJUIN2012_5"); File[] files = f.listFiles(); for (File file : files) { String filepath = "file://" + file.getPath() + "/"; String htmlcontentmultipleentries = getHtmlcontent(new URL(filepath)).toString(); Matcher splithtml = getPatternmatcher( "(<input\\s+id=\"ballot.*?>Detail</a></td>)", htmlcontentmultipleentries); String htmlsplitted = putSplittedhtmlintostringbuilder(splithtml); Matcher retrievepmapentryid = getPatternmatcher("<td><a\\s+href=\"" + "([^\"]+)" + "\"[^>]*>", htmlsplitted); while (retrievepmapentryid.find()) { String url = retrievepmapentryid.group(1); if (url.equalsIgnoreCase("/relation/show/16398") || url.equalsIgnoreCase("/relation/show/17178") || url.equalsIgnoreCase("/relation/show/17177") || url.equalsIgnoreCase("/relation/show/17074") || url.equalsIgnoreCase("/relation/show/17458") || url.equalsIgnoreCase("/relation/show/17467") || url.equalsIgnoreCase("/relation/show/16083") || url.equalsIgnoreCase("/relation/show/16082") || url.equalsIgnoreCase("/relation/show/16081") || url.equalsIgnoreCase("/relation/show/16080") || url.equalsIgnoreCase("/relation/show/16398") || url.equalsIgnoreCase("/relation/show/16271")) { } else { url = "http://cutdb.burnham.org" + url; String entry = getHtmlcontent(new URL(url)).toString(); Matcher patternProteaseTaxon = getPatternmatcher( "<div\\s+id=\"protdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>" + "([^<]+<td><b>[^<]+</b></td>)?" 
+ "[^<]+</tr>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Organism:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">" + "([^<]+)", entry); String proteaseTaxon = getProteaseTaxon(patternProteaseTaxon); System.out.println(proteaseTaxon); Matcher patternSubstrateTaxon = getPatternmatcher( "<div\\s+id=\"sbstdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Organism:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">" + "([^<]+)", entry); String substrateTaxon = getSubstrateTaxon(patternSubstrateTaxon); System.out.println(substrateTaxon); System.out.println("\n" + "******************************" + url); Matcher patternSubstrateName = getPatternmatcher( "Substrate[^<]+</th>[^<]+<td>[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>" + "([^<]+)", entry); SubstrateDatabaseEntry substratedatabase = new SubstrateDatabaseEntry(); CsDatabaseEntry csdatabase = new CsDatabaseEntry(); ProteaseDatabaseEntry proteasedatabase = new ProteaseDatabaseEntry(); substratedatabase.setS_Taxon(substrateTaxon); String commentS = getSubstrateNameSymbolId( patternSubstrateName, substratedatabase, csdatabase, entry, substrateTaxon); System.out.println("out"); proteasedatabase.setP_Taxon(proteaseTaxon); // csdatabase.setSubstrate(substratedatabase); Matcher patternProteaseName = getPatternmatcher( "<div\\s+id=\"protdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>" + "([^<]+)", entry); String commentP = getProteaseNameSymbolId( patternProteaseName, csdatabase, proteasedatabase, proteaseTaxon, commentS); csdatabase.setComment(commentP); System.out.println(commentP); csdatabase.setExternal_Link(url); Matcher patternCleavagesitePosition = getPatternmatcher( "<div\\s+id=\"cleav2\">[^<]+<table>[^<]+<th\\s+class=\"th3\">Position:</th>[^<]+<td>" + "([^<]+)?", entry); getCleavagesitePosition(patternCleavagesitePosition, csdatabase); Matcher patternCleavagesiteSequence = getPatternmatcher( 
"<div\\s+id=\"cleav\">[^<]+<table>[^<]+<th\\s+class=\"th3\">Sequence:</td>[^<]+<td>" + "([^<]+)?", entry); String csSequence = getCleavagesiteSequence(patternCleavagesiteSequence, csdatabase); String csSequencenodash = csSequence.replaceAll("-", ""); csSequencenodash = csSequencenodash.trim(); Matcher patternPmid = getPatternmatcher( "<div\\s+id=\"pubmed\">[^<]+<table>[^<]+<td>[^<]+<a\\s+href=\"[^\"]+\"\\s+target=\"[^\"]+\"\\s+>" + "([^<]+)", entry); getPmid(patternPmid, csdatabase); Matcher patternErrorUnmatched = getPatternmatcher("<td><font\\s+color=\"#FF0000\">" + "(\\*Unmatched)", entry); getErrorUnmatched(patternErrorUnmatched, csdatabase); SimpleDateFormat format = new SimpleDateFormat("dd-MM-yyyy"); Calendar originalDate = Calendar.getInstance(); String dateString = format.format(originalDate.getTime()); System.out.println(dateString); csdatabase.setCreation_Date(dateString); if (!substratedatabase.getS_UniprotID().contains("n.d") && !substratedatabase.getS_UniprotID().equalsIgnoreCase("to check") && !csdatabase.getP1_Sequence().equalsIgnoreCase("?") && !csdatabase.getP1prime_Sequence().equalsIgnoreCase("?") && !(csdatabase.getP1_Position() == 0) && !(csdatabase.getP1prime_Position() == 0) && !csdatabase.getCuration_Status().contains("discarded")) { String motif = csdatabase.getP1_Sequence() + "-" + csdatabase.getP1prime_Sequence(); int motifposition = csSequence.indexOf(motif) + 1; NodeList entries = getEntries( "/uniprot/entry/sequence/text()", parseUniprot( "http://www.uniprot.org/uniprot/" + substratedatabase.getS_UniprotID() + ".xml")); for (int i = 0; i < entries.getLength(); i++) { String sequence = getUniSubstratesequence(entries, i, substratedatabase); int startcs = 0; if (csSequence.startsWith("---")) { startcs = csdatabase.getP1_Position() - 1; } else if (csSequence.startsWith("--")) { startcs = csdatabase.getP1_Position() - 2; } else if (csSequence.startsWith("-")) { startcs = csdatabase.getP1_Position() - 3; } else { startcs = 
csdatabase.getP1_Position() - 4; } int length = csSequencenodash.length(); if ((startcs + length) < sequence.length() || (startcs + length) == sequence.length()) { String csonsequence = sequence.substring(startcs, startcs + length); System.out.println("CS ON SEQUENCE " + csonsequence); if (!csonsequence.equals(csSequencenodash)) { if (sequence.contains(csSequencenodash)) { int cleavagesiteposition = sequence.indexOf(csSequencenodash); int newp1 = motifposition + cleavagesiteposition; int newp1prime = newp1 + 1; csdatabase.setP1_Position(newp1); csdatabase.setP1prime_Position(newp1prime); System.out.println(newp1); System.out.println(newp1prime); csdatabase.setCuration_Status( "Cleavage site curated based on Uniprot protein sequence"); System.out.println("Cleavage site curated based on Uniprot protein sequence"); } else { csdatabase.setCuration_Status( "Unmatched cleavage site; Cleavage site discarded"); System.out.println("Unmatched cleavage site; Cleavage site discarded"); } } } else { if (sequence.contains(csSequencenodash)) { int cleavagesiteposition = sequence.indexOf(csSequencenodash); int newp1 = motifposition + cleavagesiteposition; int newp1prime = newp1 + 1; if (newp1 == csdatabase.getP1_Position() && newp1prime == csdatabase.getP1prime_Position()) { continue; } else { csdatabase.setP1_Position(newp1); csdatabase.setP1prime_Position(newp1prime); System.out.println(newp1); System.out.println(newp1prime); csdatabase.setCuration_Status( "Cleavage site curated based on Uniprot protein sequence"); System.out.println("Cleavage site curated based on Uniprot protein sequence"); } } } } PmapnotcuratedProteasixDB.add(csdatabase); } else if (!substratedatabase.getS_UniprotID().contains("n.d") && !substratedatabase.getS_UniprotID().equalsIgnoreCase("to check") && !csdatabase.getP1_Sequence().equalsIgnoreCase("?") && !csdatabase.getP1prime_Sequence().equalsIgnoreCase("?") && csdatabase.getP1_Position() == 0 && csdatabase.getP1prime_Position() == 0 && 
!csdatabase.getCuration_Status().contains("discarded")) { String motif = csdatabase.getP1_Sequence() + "-" + csdatabase.getP1prime_Sequence(); int motifposition = csSequence.indexOf(motif) + 1; NodeList entries = getEntries( "/uniprot/entry/sequence/text()", parseUniprot( "http://www.uniprot.org/uniprot/" + substratedatabase.getS_UniprotID() + ".xml")); for (int i = 0; i < entries.getLength(); i++) { String sequence = getUniSubstratesequence(entries, i, substratedatabase); if (sequence.contains(csSequencenodash)) { int cleavagesiteposition = sequence.indexOf(csSequencenodash); int newp1 = motifposition + cleavagesiteposition; int newp1prime = newp1 + 1; if (newp1 == csdatabase.getP1_Position() && newp1prime == csdatabase.getP1prime_Position()) { continue; } else { csdatabase.setP1_Position(newp1); csdatabase.setP1prime_Position(newp1prime); System.out.println(newp1); System.out.println(newp1prime); csdatabase.setCuration_Status( "Cleavage site curated based on Uniprot protein sequence"); System.out.println("Cleavage site curated based on Uniprot protein sequence"); } } else { csdatabase.setCuration_Status("Unmatched cleavage site; Cleavage site discarded"); System.out.println("Unmatched cleavage site; Cleavage site discarded"); } } PmapnotcuratedProteasixDB.add(csdatabase); } else { PmapnotcuratedProteasixDB.add(csdatabase); } } } } try { System.out.println("-----------------"); csvWriter = new PrintStream("Pmap5notcuratedProteasixDB" + "_" + version + ".csv"); // populateHeaders(csvWriter); for (CsDatabaseEntry csDatabaseEntry : PmapnotcuratedProteasixDB) { System.out.println(csDatabaseEntry.getExternal_Link()); populateData(csvWriter, csDatabaseEntry); System.out.println("OK"); } } catch (FileNotFoundException ex) { Logger.getLogger(Main_PDB_Pmap.class.getName()).log(Level.SEVERE, null, ex); } finally { csvWriter.close(); } }