private String getSubstrateSymbol( Matcher patternSubstrateSymbol, SubstrateDatabaseEntry substratedatabase) { String Substratesymbol = null; if (patternSubstrateSymbol.find()) { Substratesymbol = patternSubstrateSymbol.group(2); substratedatabase.setS_Symbol(Substratesymbol); // System.out.println(Substratesymbol); } else { Substratesymbol = "n.d."; substratedatabase.setS_Symbol(Substratesymbol); // System.out.println(Substratesymbol); } return Substratesymbol; }
private String getSubstrateAccession( Matcher patternSubstrateAccession, SubstrateDatabaseEntry subtratedatabase) { String accession = null; if (patternSubstrateAccession.find()) { accession = patternSubstrateAccession.group(1); accession = accession.trim(); subtratedatabase.setS_UniprotID(accession); // System.out.println(accession); } else { accession = "n.d."; subtratedatabase.setS_UniprotID(accession); // System.out.println(accession); } return accession; }
private String getUniSubstrategenename( NodeList entries, int i, SubstrateDatabaseEntry substratedatabase) { // GET SUBSTRATE GENE NAME using getInformation method LinkedList<String> genenamelist = getInformation("./gene/name[@type][1]/text()", entries.item(i)); String genename = null; if (!genenamelist.isEmpty()) { genename = genenamelist.getFirst(); System.out.println(genename); substratedatabase.setS_Symbol(genename); } return genename; }
private void getUniSubstratepproteinname( NodeList entries, int i, SubstrateDatabaseEntry substratedatabase) { // GET SUBSTRATE PROTEIN NAME using getInformation method LinkedList<String> protnamelist = getInformation("./protein/recommendedName/fullName/text()", entries.item(i)); String protname = null; if (!protnamelist.isEmpty()) { protname = protnamelist.getFirst(); protname = protname.replaceAll(",", ""); System.out.println(protname); substratedatabase.setS_Name(protname); } }
private String getSubstrateNameSymbolId( Matcher patternSubstrateName, SubstrateDatabaseEntry substratedatabase, CsDatabaseEntry csdatabase, String entry, String substrateTaxon) throws IOException { String commentS = null; Matcher patternSubstrateAccession = getPatternmatcher( "UniProt\\s+Accession:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">" + "([^<]+)", entry); String accession = getSubstrateAccession(patternSubstrateAccession, substratedatabase); Matcher patternSubstrateSymbol = getPatternmatcher( "Substrate[^<]+</th>[^<]+<td>[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>" + "[^<]+" + "</b></td>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Symbol:</th>" + "([^<]+<td><b>)?([^<]+)", entry); String symbol = getSubstrateSymbol(patternSubstrateSymbol, substratedatabase); if (patternSubstrateName.find()) { String Substratename = "to check"; String Substratesymbol = "to check"; String Substrateaccession = "to check"; substratedatabase.setS_NL_Name(Substratename); substratedatabase.setS_Name(Substratename); substratedatabase.setS_Symbol(Substratesymbol); substratedatabase.setS_UniprotID(Substrateaccession); Substratename = patternSubstrateName.group(1); Substratename = Substratename.trim(); Substratename = Substratename.replaceAll(",", ""); Substratename = Substratename.replaceAll(";", ""); substratedatabase.setS_NL_Name(Substratename); commentS = "Check Substrate Symbol and Accession; add to Substrate Librairy"; BufferedReader bReader = null; if (substrateTaxon.contains("H**o")) { bReader = createBufferedreader( "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateHSALibrairy.txt"); } else if (substrateTaxon.contains("Mus")) { bReader = createBufferedreader( "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateMMULibrairy.txt"); } else if (substrateTaxon.contains("Rattus")) { bReader = createBufferedreader( "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateRNOLibrairy.txt"); } String line; while ((line = bReader.readLine()) != null) { String splitarray[] = line.split("\t"); String naturallanguage = splitarray[1]; naturallanguage = naturallanguage.replaceAll("\"", ""); naturallanguage = naturallanguage.replaceAll(",", ""); naturallanguage = naturallanguage.replaceAll(";", ""); if (naturallanguage.equalsIgnoreCase(Substratename)) { Substratesymbol = splitarray[0]; Substratesymbol = Substratesymbol.replaceAll("sept-0", "SEPT"); Substrateaccession = splitarray[2]; if (Substrateaccession.contains("n.d.")) { substratedatabase.setS_Name("n.d."); substratedatabase.setS_UniprotID(Substrateaccession); substratedatabase.setS_Symbol(Substratesymbol); } else { String UniprotURL = "http://www.uniprot.org/uniprot/" + Substrateaccession + ".xml"; NodeList entries = getEntries("/uniprot/entry", parseUniprot(UniprotURL)); for (int i = 0; i < entries.getLength(); i++) { getUniSubstratepproteinname(entries, i, substratedatabase); String genename = getUniSubstrategenename(entries, i, substratedatabase); } // System.out.println(Substrateaccession); substratedatabase.setS_UniprotID(Substrateaccession); commentS = "-"; System.out.println(commentS); } } csdatabase.setSubstrate(substratedatabase); } } else if (!symbol.contains("n.d.")) { String Substratename = "to check"; String Substratesymbol = "to check"; String Substrateaccession = "to check"; substratedatabase.setS_NL_Name(Substratename); substratedatabase.setS_Name(Substratename); substratedatabase.setS_Symbol(Substratesymbol); substratedatabase.setS_UniprotID(Substrateaccession); BufferedReader bReader = null; if (substrateTaxon.contains("H**o")) { bReader = createBufferedreader( "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateHSALibrairy.txt"); } else if (substrateTaxon.contains("Mus")) { bReader = createBufferedreader( "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateMMULibrairy.txt"); } else if (substrateTaxon.contains("Rattus")) { bReader = createBufferedreader( "/Users/julieklein/Dropbox/ProteasiX/LIBRAIRIES/SubstrateRNOLibrairy.txt"); } String line; while ((line = bReader.readLine()) != null) { String splitarray[] = line.split("\t"); String librairisymbol = splitarray[0]; librairisymbol = librairisymbol.replaceAll("\"", ""); librairisymbol = librairisymbol.replaceAll("sept-0", "SEPT"); if (librairisymbol.equals(symbol)) { Substrateaccession = splitarray[2]; if (Substrateaccession.contains("n.d.")) { substratedatabase.setS_Name("n.d."); substratedatabase.setS_UniprotID(Substrateaccession); substratedatabase.setS_Symbol(symbol); } else { String UniprotURL = "http://www.uniprot.org/uniprot/" + Substrateaccession + ".xml"; NodeList entries = getEntries("/uniprot/entry", parseUniprot(UniprotURL)); for (int i = 0; i < entries.getLength(); i++) { getUniSubstratepproteinname(entries, i, substratedatabase); String genename = getUniSubstrategenename(entries, i, substratedatabase); } // System.out.println(Substrateaccession); substratedatabase.setS_UniprotID(Substrateaccession); commentS = "-"; System.out.println(commentS); } } csdatabase.setSubstrate(substratedatabase); } } else { String Substratename = "n.d."; String Substratesymbol = "n.d."; String Substrateaccession = "n.d"; substratedatabase.setS_NL_Name(Substratename); substratedatabase.setS_Name(Substratename); substratedatabase.setS_Symbol(Substratesymbol); substratedatabase.setS_UniprotID(Substrateaccession); System.out.println(Substratename); System.out.println(Substratesymbol); System.out.println(Substrateaccession); commentS = "-"; System.out.println(commentS); csdatabase.setSubstrate(substratedatabase); } return commentS; }
public Main_PDB_Pmap() throws MalformedURLException, IOException { PrintStream csvWriter = null; LinkedList<CsDatabaseEntry> PmapnotcuratedProteasixDB = new LinkedList<CsDatabaseEntry>(); java.util.Calendar calendar = java.util.Calendar.getInstance(); String version = "JUNE2012"; File f = new File( "//Users/julieklein/Dropbox/ProteasiX/ProteasiX/ProteasixVersionJune2012/PMAPJUIN2012_5"); File[] files = f.listFiles(); for (File file : files) { String filepath = "file://" + file.getPath() + "/"; String htmlcontentmultipleentries = getHtmlcontent(new URL(filepath)).toString(); Matcher splithtml = getPatternmatcher( "(<input\\s+id=\"ballot.*?>Detail</a></td>)", htmlcontentmultipleentries); String htmlsplitted = putSplittedhtmlintostringbuilder(splithtml); Matcher retrievepmapentryid = getPatternmatcher("<td><a\\s+href=\"" + "([^\"]+)" + "\"[^>]*>", htmlsplitted); while (retrievepmapentryid.find()) { String url = retrievepmapentryid.group(1); if (url.equalsIgnoreCase("/relation/show/16398") || url.equalsIgnoreCase("/relation/show/17178") || url.equalsIgnoreCase("/relation/show/17177") || url.equalsIgnoreCase("/relation/show/17074") || url.equalsIgnoreCase("/relation/show/17458") || url.equalsIgnoreCase("/relation/show/17467") || url.equalsIgnoreCase("/relation/show/16083") || url.equalsIgnoreCase("/relation/show/16082") || url.equalsIgnoreCase("/relation/show/16081") || url.equalsIgnoreCase("/relation/show/16080") || url.equalsIgnoreCase("/relation/show/16398") || url.equalsIgnoreCase("/relation/show/16271")) { } else { url = "http://cutdb.burnham.org" + url; String entry = getHtmlcontent(new URL(url)).toString(); Matcher patternProteaseTaxon = getPatternmatcher( "<div\\s+id=\"protdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>" + "([^<]+<td><b>[^<]+</b></td>)?" + "[^<]+</tr>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Organism:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">" + "([^<]+)", entry); String proteaseTaxon = getProteaseTaxon(patternProteaseTaxon); System.out.println(proteaseTaxon); Matcher patternSubstrateTaxon = getPatternmatcher( "<div\\s+id=\"sbstdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Organism:</th>[^<]+<td><a\\s+href\\s+=\\s+\"[^\"]+\"\\s+target=\"[^\"]+\">" + "([^<]+)", entry); String substrateTaxon = getSubstrateTaxon(patternSubstrateTaxon); System.out.println(substrateTaxon); System.out.println("\n" + "******************************" + url); Matcher patternSubstrateName = getPatternmatcher( "Substrate[^<]+</th>[^<]+<td>[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>" + "([^<]+)", entry); SubstrateDatabaseEntry substratedatabase = new SubstrateDatabaseEntry(); CsDatabaseEntry csdatabase = new CsDatabaseEntry(); ProteaseDatabaseEntry proteasedatabase = new ProteaseDatabaseEntry(); substratedatabase.setS_Taxon(substrateTaxon); String commentS = getSubstrateNameSymbolId( patternSubstrateName, substratedatabase, csdatabase, entry, substrateTaxon); System.out.println("out"); proteasedatabase.setP_Taxon(proteaseTaxon); // csdatabase.setSubstrate(substratedatabase); Matcher patternProteaseName = getPatternmatcher( "<div\\s+id=\"protdata\">[^<]+<table>[^<]+<tr>[^<]+<th\\s+class=\"th3\">Definition:</th>[^<]+<td><b>" + "([^<]+)", entry); String commentP = getProteaseNameSymbolId( patternProteaseName, csdatabase, proteasedatabase, proteaseTaxon, commentS); csdatabase.setComment(commentP); System.out.println(commentP); csdatabase.setExternal_Link(url); Matcher patternCleavagesitePosition = getPatternmatcher( "<div\\s+id=\"cleav2\">[^<]+<table>[^<]+<th\\s+class=\"th3\">Position:</th>[^<]+<td>" + "([^<]+)?", entry); getCleavagesitePosition(patternCleavagesitePosition, csdatabase); Matcher patternCleavagesiteSequence = getPatternmatcher( "<div\\s+id=\"cleav\">[^<]+<table>[^<]+<th\\s+class=\"th3\">Sequence:</td>[^<]+<td>" + "([^<]+)?", entry); String csSequence = getCleavagesiteSequence(patternCleavagesiteSequence, csdatabase); String csSequencenodash = csSequence.replaceAll("-", ""); csSequencenodash = csSequencenodash.trim(); Matcher patternPmid = getPatternmatcher( "<div\\s+id=\"pubmed\">[^<]+<table>[^<]+<td>[^<]+<a\\s+href=\"[^\"]+\"\\s+target=\"[^\"]+\"\\s+>" + "([^<]+)", entry); getPmid(patternPmid, csdatabase); Matcher patternErrorUnmatched = getPatternmatcher("<td><font\\s+color=\"#FF0000\">" + "(\\*Unmatched)", entry); getErrorUnmatched(patternErrorUnmatched, csdatabase); SimpleDateFormat format = new SimpleDateFormat("dd-MM-yyyy"); Calendar originalDate = Calendar.getInstance(); String dateString = format.format(originalDate.getTime()); System.out.println(dateString); csdatabase.setCreation_Date(dateString); if (!substratedatabase.getS_UniprotID().contains("n.d") && !substratedatabase.getS_UniprotID().equalsIgnoreCase("to check") && !csdatabase.getP1_Sequence().equalsIgnoreCase("?") && !csdatabase.getP1prime_Sequence().equalsIgnoreCase("?") && !(csdatabase.getP1_Position() == 0) && !(csdatabase.getP1prime_Position() == 0) && !csdatabase.getCuration_Status().contains("discarded")) { String motif = csdatabase.getP1_Sequence() + "-" + csdatabase.getP1prime_Sequence(); int motifposition = csSequence.indexOf(motif) + 1; NodeList entries = getEntries( "/uniprot/entry/sequence/text()", parseUniprot( "http://www.uniprot.org/uniprot/" + substratedatabase.getS_UniprotID() + ".xml")); for (int i = 0; i < entries.getLength(); i++) { String sequence = getUniSubstratesequence(entries, i, substratedatabase); int startcs = 0; if (csSequence.startsWith("---")) { startcs = csdatabase.getP1_Position() - 1; } else if (csSequence.startsWith("--")) { startcs = csdatabase.getP1_Position() - 2; } else if (csSequence.startsWith("-")) { startcs = csdatabase.getP1_Position() - 3; } else { startcs = csdatabase.getP1_Position() - 4; } int length = csSequencenodash.length(); if ((startcs + length) < sequence.length() || (startcs + length) == sequence.length()) { String csonsequence = sequence.substring(startcs, startcs + length); System.out.println("CS ON SEQUENCE " + csonsequence); if (!csonsequence.equals(csSequencenodash)) { if (sequence.contains(csSequencenodash)) { int cleavagesiteposition = sequence.indexOf(csSequencenodash); int newp1 = motifposition + cleavagesiteposition; int newp1prime = newp1 + 1; csdatabase.setP1_Position(newp1); csdatabase.setP1prime_Position(newp1prime); System.out.println(newp1); System.out.println(newp1prime); csdatabase.setCuration_Status( "Cleavage site curated based on Uniprot protein sequence"); System.out.println("Cleavage site curated based on Uniprot protein sequence"); } else { csdatabase.setCuration_Status( "Unmatched cleavage site; Cleavage site discarded"); System.out.println("Unmatched cleavage site; Cleavage site discarded"); } } } else { if (sequence.contains(csSequencenodash)) { int cleavagesiteposition = sequence.indexOf(csSequencenodash); int newp1 = motifposition + cleavagesiteposition; int newp1prime = newp1 + 1; if (newp1 == csdatabase.getP1_Position() && newp1prime == csdatabase.getP1prime_Position()) { continue; } else { csdatabase.setP1_Position(newp1); csdatabase.setP1prime_Position(newp1prime); System.out.println(newp1); System.out.println(newp1prime); csdatabase.setCuration_Status( "Cleavage site curated based on Uniprot protein sequence"); System.out.println("Cleavage site curated based on Uniprot protein sequence"); } } } } PmapnotcuratedProteasixDB.add(csdatabase); } else if (!substratedatabase.getS_UniprotID().contains("n.d") && !substratedatabase.getS_UniprotID().equalsIgnoreCase("to check") && !csdatabase.getP1_Sequence().equalsIgnoreCase("?") && !csdatabase.getP1prime_Sequence().equalsIgnoreCase("?") && csdatabase.getP1_Position() == 0 && csdatabase.getP1prime_Position() == 0 && !csdatabase.getCuration_Status().contains("discarded")) { String motif = csdatabase.getP1_Sequence() + "-" + csdatabase.getP1prime_Sequence(); int motifposition = csSequence.indexOf(motif) + 1; NodeList entries = getEntries( "/uniprot/entry/sequence/text()", parseUniprot( "http://www.uniprot.org/uniprot/" + substratedatabase.getS_UniprotID() + ".xml")); for (int i = 0; i < entries.getLength(); i++) { String sequence = getUniSubstratesequence(entries, i, substratedatabase); if (sequence.contains(csSequencenodash)) { int cleavagesiteposition = sequence.indexOf(csSequencenodash); int newp1 = motifposition + cleavagesiteposition; int newp1prime = newp1 + 1; if (newp1 == csdatabase.getP1_Position() && newp1prime == csdatabase.getP1prime_Position()) { continue; } else { csdatabase.setP1_Position(newp1); csdatabase.setP1prime_Position(newp1prime); System.out.println(newp1); System.out.println(newp1prime); csdatabase.setCuration_Status( "Cleavage site curated based on Uniprot protein sequence"); System.out.println("Cleavage site curated based on Uniprot protein sequence"); } } else { csdatabase.setCuration_Status("Unmatched cleavage site; Cleavage site discarded"); System.out.println("Unmatched cleavage site; Cleavage site discarded"); } } PmapnotcuratedProteasixDB.add(csdatabase); } else { PmapnotcuratedProteasixDB.add(csdatabase); } } } } try { System.out.println("-----------------"); csvWriter = new PrintStream("Pmap5notcuratedProteasixDB" + "_" + version + ".csv"); // populateHeaders(csvWriter); for (CsDatabaseEntry csDatabaseEntry : PmapnotcuratedProteasixDB) { System.out.println(csDatabaseEntry.getExternal_Link()); populateData(csvWriter, csDatabaseEntry); System.out.println("OK"); } } catch (FileNotFoundException ex) { Logger.getLogger(Main_PDB_Pmap.class.getName()).log(Level.SEVERE, null, ex); } finally { csvWriter.close(); } }