/**
 * Builds the raw HTML dataset under {@code dataset/ML_dataset_htmls/}.
 *
 * <p>For every query in a fixed, comma-separated list, runs a search via
 * {@code Searcher.search(q, 5)}, downloads each result's page with
 * {@code HTMLhandler.readURLHTMLasString} and stores every non-empty page as
 * {@code Q<queryIndex>D<docIndex>.html}. An index of the stored files
 * (a {@code "Q: <query>"} line followed by one {@code "<fileName>,<title>"} line
 * per stored page) is written to {@code files.txt} in the same folder.
 *
 * <p>A result that cannot be downloaded or written is reported on
 * {@code System.err} and skipped; it does not consume a document index.
 */
private static void retriveHTMLfiles() {
    Printer.createFolder("dataset", "ML_dataset_htmls");

    String pidouQs = "eliza turing test,dzogchen mahamudra,coltrane olatunji concert,"
            + "systema martial art,bourbaki category theory,robin milner sml ocaml,"
            + "spinoza substance,freud nietzsche,nietzsche sister,heidegger being and time,"
            + "free will consciousness,david bohm thought,david bohm implicate order,"
            + "wolfram new kind of science,computational irreducibility,jhana theravada,"
            + "mont sainte victoire cezanne,first film louis auguste lumiere,"
            + "star wars kurosawa influence,matrix jean baudrillard";
    String mazenQs = "Karl Marx paris,french revolution causes,playstation history,"
            + "the godfather,stephen hawking theories,noam chomsky syria,"
            + "whitney houston death,Android L,Maya cities,best paid soccer players,"
            + "machine learning algorithms,Ramadan,Islamic State of Iraq and the Levant,"
            + "Steve Jobs syrian,Supernatural powers,noah sons,Francis Albert Sinatra,"
            + "SWOT analysis,Proxy server,string theory";
    String qs = pidouQs + "," + mazenQs;

    // Accumulates the content of files.txt.
    StringBuilder out = new StringBuilder();
    int qindex = 0;
    for (String q : qs.split(",")) {
        out.append("Q: ").append(q).append("\n");
        Searcher S = new Searcher();
        List<Result> Results = S.search(q, 5);
        int dindex = 0;
        for (Result r : Results) {
            try {
                String fileName = "Q" + qindex + "D" + dindex;
                String htmlCode = HTMLhandler.readURLHTMLasString(r.getLink());
                // Compare by content: the previous `!= ""` reference check let
                // empty pages through.
                if (htmlCode != null && !htmlCode.trim().isEmpty()) {
                    Printer.printToFile(
                            "dataset/ML_dataset_htmls/" + fileName + ".html", htmlCode);
                    System.out.println(fileName);
                    // Commas in the title would break the comma-separated index line.
                    out.append(fileName).append(",")
                            .append(r.getTitle().replace(",", " ")).append("\n");
                    dindex++;
                }
            } catch (Exception e) {
                // Best effort: one bad result must not abort the whole crawl,
                // but the failure should be visible.
                System.err.println("Skipping a result of query \"" + q + "\": " + e);
            }
        }
        qindex++;
    }
    Printer.printToFile("dataset/ML_dataset_htmls/files.txt", out.toString());
}
static void MLDataset() { // String qs = "Karl Marx, French revolution,Bermuda Triangle,Pineal Gland,Gray // wolf,PlayStation,Eurovision Song Contest,Flu,Gucci,the godfather"; String qs = "Karl Marx, French revolution"; int index = 0; String out2 = "id,tct,cb,cn,cbl,cnl,ca,cp,qt,qr,mr,ar,arl,mrl,pt,cas,as,len,tt,se,ss,hl,sss,csc,cs,selected\n"; // concepts Sentences.csv String out3 = "id,qt,qr,qmr,mr,ar,cns,arl,mrl,pt,as,len,tt,se,ss,hl,sss,cs,selected\n"; // concepts // Sentences.csv for (String q : qs.split(",")) { Query query = new Query(q); query.id = index++; Searcher S = new Searcher(); List<Result> Results = S.search(q, 5); ArrayList<Document> documents = new ArrayList<Document>(); DocumentAnalyzer analyzer = new DocumentAnalyzer(); int i = 0; String out = ""; // for HTML String Html1 = "<html> <head> <meta http-equiv=\"content-type\" content=\"text/html; charset=windows-1250\"> <meta name=\"generator\" content=\"PSPad editor, www.pspad.com\"> <script src='jquery.1.9.1.js'></script> <script src='jquery-ui.js'></script> <link rel='stylesheet' href='jquery-ui.css' type='text/css' charset=''utf-8' /> <link rel='stylesheet' href='jquery.ui.dialog.css' type='text/css' charset='utf-8' /><title></title> </head> <body>"; Html1 += " <script type='text/javascript'>"; Html1 += "$(function() { $( '#dialog' ).dialog({ autoOpen: false, height: 700, width:750, modal: true, position:['middle',20], }); $( '#opener' ).click(function() { save(); $( '#dialog' ).dialog( 'open' ); }); });"; Html1 += "var all1='';var all2='';"; Html1 += "function save(){ all1='';all2=''; $('input:checkbox:checked').each(function() { all2+= $(this).val()+' '; }); var res2 = all2.split(' '); document.getElementById('res2').value = (res2.sort()+''); $('input:radio:checked').each(function() { if($(this).val()!='') all1+= $(this).val()+' '; }); var res1 = all1.split(' '); document.getElementById('res1').value = res1.sort(); }"; Html1 += " </script>"; Html1 += " <div id='dialog' title='Results'> 
<p>Please copy this to <b>\"main-sentences-evaluation.txt\"</b>:<textarea name='res1' id='res1' cols='35' rows='10'></textarea> </p>"; Html1 += " <p>Please copy this to <b>\"concepts-sentences-evaluation.txt\"</b>:<textarea name='res2' id='res2' cols='35' rows='10'></textarea> </p></div>"; String Html2 = " </body> </html>"; for (Result res : Results) { out += Html1; out += "<h1>Q" + query.id + " - " + q + "<a href='#' id='opener'>click here to save</a></h1></hr> <h2>Q" + query.id + "D" + i + " (<a target='blank' href='" + res.getLink() + "'> " + res.getTitle() + "</a>)</h2>"; analyzer.run(res, i, query); String id = "Q" + query.id + "D" + analyzer.Doci.Rank; String[] results = generateConcepts(analyzer, id); if (results != null) { out += results[0]; out2 += results[1]; out3 += results[2]; out += Html2; Printer.printToFile("ML/Q" + query.id + "D" + i + ".html", out); out = ""; } i++; } } Printer.printToFile("ML/conceptSentences.csv", out2); Printer.printToFile("ML/mainSentences.csv", out3); }