/**
 * Downloads the HTML pages used as the ML dataset.
 *
 * <p>For every query in the two hard-coded query lists, the top 5 search results are fetched and
 * each non-empty page is stored as {@code dataset/ML_dataset_htmls/Q<query>D<doc>.html}. An index
 * file {@code files.txt} is written next to them; it holds one {@code "Q: <query>"} line per query
 * followed by one {@code "<fileName>,<title>"} line per stored page (commas in the title are
 * replaced by spaces so that the line keeps exactly two comma-separated fields).
 *
 * <p>Downloading is best effort: a result that cannot be fetched or stored is reported on
 * {@code System.err} and skipped, and does not consume a document index.
 */
private static void retriveHTMLfiles() {
    Printer.createFolder("dataset", "ML_dataset_htmls");

    String pidouQs = "eliza turing test,dzogchen mahamudra,coltrane olatunji concert,"
            + "systema martial art,bourbaki category theory,robin milner sml ocaml,"
            + "spinoza substance,freud nietzsche,nietzsche sister,heidegger being and time,"
            + "free will consciousness,david bohm thought,david bohm implicate order,"
            + "wolfram new kind of science,computational irreducibility,jhana theravada,"
            + "mont sainte victoire cezanne,first film louis auguste lumiere,"
            + "star wars kurosawa influence,matrix jean baudrillard";
    String mazenQs = "Karl Marx paris,french revolution causes,playstation history,the godfather,"
            + "stephen hawking theories,noam chomsky syria,whitney houston death,Android L,"
            + "Maya cities,best paid soccer players,machine learning algorithms,Ramadan,"
            + "Islamic State of Iraq and the Levant,Steve Jobs syrian,Supernatural powers,"
            + "noah sons,Francis Albert Sinatra,SWOT analysis,Proxy server,string theory";
    String qs = pidouQs + "," + mazenQs;

    // The index is appended to inside two nested loops, so build it with a StringBuilder.
    StringBuilder index = new StringBuilder();
    int qindex = 0;
    for (String q : qs.split(",")) {
        index.append("Q: ").append(q).append("\n");
        Searcher S = new Searcher();
        List<Result> Results = S.search(q, 5);
        int dindex = 0;
        for (Result r : Results) {
            String fileName = "Q" + qindex + "D" + dindex;
            try {
                String htmlCode = HTMLhandler.readURLHTMLasString(r.getLink());
                // Compare by content: "!=" on strings only compares references and does not
                // reliably detect an empty page.
                if (htmlCode == null || htmlCode.trim().isEmpty()) {
                    continue;
                }
                // Resolve the title before writing anything, so a failure here cannot leave a
                // stored file that is missing from the index and later overwritten.
                String title = r.getTitle().replace(",", " ");
                Printer.printToFile("dataset/ML_dataset_htmls/" + fileName + ".html", htmlCode);
                System.out.println(fileName);
                index.append(fileName).append(",").append(title).append("\n");
                dindex++;
            } catch (Exception e) {
                // Best effort: one unreachable or broken result must not abort the whole crawl,
                // but the failure should be visible.
                System.err.println("Skipping " + fileName + ": " + e);
            }
        }
        qindex++;
    }
    Printer.printToFile("dataset/ML_dataset_htmls/files.txt", index.toString());
}
private static void mergeDataWithEvaluation() { String physicalFolder = PropertiesManager.getProperty("webRootPath"); // Main sentence String mainSentenceDataFilePath = physicalFolder + "/ml/tomerge/mainSentences.csv"; String mainSentenceEvaluationFilePath = physicalFolder + "/ml/tomerge/main-sentences-evaluation.txt"; String mainSentenceData = readTextDocument(mainSentenceDataFilePath); String mainSentenceEval = readTextDocument(mainSentenceEvaluationFilePath); mainSentenceEval = mainSentenceEval.substring(0, mainSentenceEval.length() - 2) + ","; String mainSentenceOut = ""; System.out.println(mainSentenceEval); for (String line : mainSentenceData.split(System.lineSeparator())) { if (!line.contains("selected")) { String id = line.split(",")[0]; // System.out.println(id); if (mainSentenceEval.contains(id + ",")) { mainSentenceOut += line + "true" + System.lineSeparator(); } else { mainSentenceOut += line + "false" + System.lineSeparator(); } } else { mainSentenceOut += line + System.lineSeparator(); } } Printer.printToFile("ml/tomerge/mainSentenceMLDataset.csv", mainSentenceOut); // concepts' sentences String conceptSentenceDataFilePath = physicalFolder + "/ml/tomerge/conceptSentences.csv"; String conceptSentenceEvaluationFilePath = physicalFolder + "/ml/tomerge/concepts-sentences-evaluation.txt"; String conceptSentenceData = readTextDocument(conceptSentenceDataFilePath); String conceptSentenceEval = readTextDocument(conceptSentenceEvaluationFilePath); conceptSentenceEval = conceptSentenceEval.substring(0, conceptSentenceEval.length() - 2) + ","; String conceptSentenceOut = ""; for (String line : conceptSentenceData.split(System.lineSeparator())) { if (!line.contains("selected")) { String id = line.split(",")[0]; if (conceptSentenceEval.contains(id + ",")) { conceptSentenceOut += line + "true" + System.lineSeparator(); } else { conceptSentenceOut += line + "false" + System.lineSeparator(); } } else { conceptSentenceOut += line + System.lineSeparator(); } } // 
conceptSentenceOut = conceptSentenceOut.replace("\"", "”").replace(",,,", // ",\",\",").replace("'", "`"); Printer.printToFile("ml/tomerge/conceptSentenceMLDataset.csv", conceptSentenceOut); }
private static void runOverHTMLfiles() { System.out.println("Start ML generation"); // read files.txt String files = HTMLhandler.readURLTxtasString("http://localhost:8080/ensenTensorielWeb/ml/files.txt"); String out2 = "id,tct,cb,cn,cbl,cnl,ca,cp,qt,qr,mr,ar,arl,mrl,pt,cas,as,len,tt,se,ss,hl,sss,csc,cs,nch,d,php,fph,lph,selected\n"; // concepts Sentences.csv String out3 = "id,qt,qr,qmr,mr,ar,cns,arl,mrl,pt,as,len,tt,se,ss,hl,sss,cs,nch,d,php,fph,lph,selected\n"; // concepts Sentences.csv String Html1 = "<html> <head> <meta http-equiv=\"content-type\" content=\"text/html; charset=windows-1250\"> <meta name=\"generator\" content=\"PSPad editor, www.pspad.com\"> <script src='jquery.1.9.1.js'></script> <script src='jquery-ui.js'></script> <link rel='stylesheet' href='jquery-ui.css' type='text/css' charset=''utf-8' /> <link rel='stylesheet' href='jquery.ui.dialog.css' type='text/css' charset='utf-8' /><title></title> </head> <body>"; Html1 += " <script type='text/javascript'>"; Html1 += "$(function() { $( '#dialog' ).dialog({ autoOpen: false, height: 700, width:750, modal: true, position:['middle',20], }); $( '#opener' ).click(function() { save(); $( '#dialog' ).dialog( 'open' ); }); });"; Html1 += "var all1='';var all2='';"; Html1 += "function save(){ all1='';all2=''; $('input:checkbox:checked').each(function() { all2+= $(this).val()+' '; }); var res2 = all2.split(' '); document.getElementById('res2').value = (res2.sort()+''); $('input:radio:checked').each(function() { if($(this).val()!='') all1+= $(this).val()+' '; }); var res1 = all1.split(' '); document.getElementById('res1').value = res1.sort(); }"; Html1 += " </script>"; Html1 += " <div id='dialog' title='Results'> <p>Please copy this to <b>\"main-sentences-evaluation.txt\"</b>:<textarea name='res1' id='res1' cols='35' rows='10'></textarea> </p>"; Html1 += " <p>Please copy this to <b>\"concepts-sentences-evaluation.txt\"</b>:<textarea name='res2' id='res2' cols='35' rows='10'></textarea> </p></div>"; String Html2 
= " </body> </html>"; /*int phCounter=0; int conceptCounter=0;*/ String currQ = ""; Query query = null; int index = 0; int i = 0; String[] lines = files.split("\n"); for (int lineN = 0; lineN < lines.length; lineN++) { /*if (index == 2) break;*/ String line = lines[lineN]; if (line.contains("Q:")) // query { currQ = line.split(":")[1]; System.out.println(currQ); query = new Query(currQ); query.id = index; i = 0; index++; } else { // file String id = line.split(",")[0]; String title = line.split(",")[1]; String url = "http://localhost:8080/ensenTensorielWeb/ml/" + id + ".html"; System.out.println("Start id: " + id + ", title: " + title); DocumentAnalyzer analyzer = new DocumentAnalyzer(); String out = ""; // for HTML out += Html1; out += "<h1>Q" + query.id + " - " + currQ + "<a href='#' id='opener'>click here to save</a></h1></hr> <h2>Q" + query.id + "D" + i + " (<a target='blank' href='" + url + "'> " + title + "</a>)</h2>"; analyzer.run(url, query, title, i); System.out.println("Q" + analyzer.Doci.q.id + ", doc" + analyzer.Doci.Rank + " analyzed"); String[] results = generateConcepts(analyzer, id); if (results != null) { out += results[0]; out2 += results[1]; out3 += results[2]; out += Html2; Printer.printToFile("ml/dataset/" + id + "-sentences.html", out); out = ""; } System.out.println("finished id: " + id + ", title: " + title); System.out.println(); System.out.println(); System.out.println(); i++; } } Printer.printToFile("ml/dataset/conceptSentences.csv", out2); Printer.printToFile("ml/dataset/mainSentences.csv", out3); }
static void MLDataset() { // String qs = "Karl Marx, French revolution,Bermuda Triangle,Pineal Gland,Gray // wolf,PlayStation,Eurovision Song Contest,Flu,Gucci,the godfather"; String qs = "Karl Marx, French revolution"; int index = 0; String out2 = "id,tct,cb,cn,cbl,cnl,ca,cp,qt,qr,mr,ar,arl,mrl,pt,cas,as,len,tt,se,ss,hl,sss,csc,cs,selected\n"; // concepts Sentences.csv String out3 = "id,qt,qr,qmr,mr,ar,cns,arl,mrl,pt,as,len,tt,se,ss,hl,sss,cs,selected\n"; // concepts // Sentences.csv for (String q : qs.split(",")) { Query query = new Query(q); query.id = index++; Searcher S = new Searcher(); List<Result> Results = S.search(q, 5); ArrayList<Document> documents = new ArrayList<Document>(); DocumentAnalyzer analyzer = new DocumentAnalyzer(); int i = 0; String out = ""; // for HTML String Html1 = "<html> <head> <meta http-equiv=\"content-type\" content=\"text/html; charset=windows-1250\"> <meta name=\"generator\" content=\"PSPad editor, www.pspad.com\"> <script src='jquery.1.9.1.js'></script> <script src='jquery-ui.js'></script> <link rel='stylesheet' href='jquery-ui.css' type='text/css' charset=''utf-8' /> <link rel='stylesheet' href='jquery.ui.dialog.css' type='text/css' charset='utf-8' /><title></title> </head> <body>"; Html1 += " <script type='text/javascript'>"; Html1 += "$(function() { $( '#dialog' ).dialog({ autoOpen: false, height: 700, width:750, modal: true, position:['middle',20], }); $( '#opener' ).click(function() { save(); $( '#dialog' ).dialog( 'open' ); }); });"; Html1 += "var all1='';var all2='';"; Html1 += "function save(){ all1='';all2=''; $('input:checkbox:checked').each(function() { all2+= $(this).val()+' '; }); var res2 = all2.split(' '); document.getElementById('res2').value = (res2.sort()+''); $('input:radio:checked').each(function() { if($(this).val()!='') all1+= $(this).val()+' '; }); var res1 = all1.split(' '); document.getElementById('res1').value = res1.sort(); }"; Html1 += " </script>"; Html1 += " <div id='dialog' title='Results'> 
<p>Please copy this to <b>\"main-sentences-evaluation.txt\"</b>:<textarea name='res1' id='res1' cols='35' rows='10'></textarea> </p>"; Html1 += " <p>Please copy this to <b>\"concepts-sentences-evaluation.txt\"</b>:<textarea name='res2' id='res2' cols='35' rows='10'></textarea> </p></div>"; String Html2 = " </body> </html>"; for (Result res : Results) { out += Html1; out += "<h1>Q" + query.id + " - " + q + "<a href='#' id='opener'>click here to save</a></h1></hr> <h2>Q" + query.id + "D" + i + " (<a target='blank' href='" + res.getLink() + "'> " + res.getTitle() + "</a>)</h2>"; analyzer.run(res, i, query); String id = "Q" + query.id + "D" + analyzer.Doci.Rank; String[] results = generateConcepts(analyzer, id); if (results != null) { out += results[0]; out2 += results[1]; out3 += results[2]; out += Html2; Printer.printToFile("ML/Q" + query.id + "D" + i + ".html", out); out = ""; } i++; } } Printer.printToFile("ML/conceptSentences.csv", out2); Printer.printToFile("ML/mainSentences.csv", out3); }