public void testCrawl2() throws Exception { System.setProperty("http.proxyHost", "localhost"); System.setProperty("http.proxyPort", "3128"); Crawler c = new Crawler(1); Frontier frontier = new BasicFrontier(); frontier.add(new URI("http://harth.org/andreas/foaf.rdf")); // frontier.add(new URI("http://umbrich.net/foaf.rdf")); // frontier.setBlacklist(CrawlerConstants.BLACKLIST); LinkFilter lf = new LinkFilterDefault(frontier); c.setFetchFilter(new FetchFilterRdfXml()); c.setLinkFilter(lf); // c.setLinkFilter(new LinkFilterDummy()); ErrorHandler eh = new ErrorHandlerLogger(null, null); c.setErrorHandler(eh); Callback cb = new CallbackNxOutputStream(System.out); SinkCallback sc = new SinkCallback(cb, true); c.setOutputCallback(sc); c.evaluateBreadthFirst(frontier, 1, -1, -1); }
/**
 * Stress-tests {@code BreadthFirstQueue} scheduling: loads a gzipped list of
 * URIs (one per line, from {@code test/uris.txt.gz}) into a frontier,
 * schedules them, then drains the queue while randomly (~1% of polls)
 * marking a polled URI as a 303 redirect and re-adding it directly.
 * Prints queue state and timing statistics.
 *
 * <p>Fixes vs. the previous version: the input streams are now closed via
 * try-with-resources (they leaked if a line was not a valid URI or an I/O
 * error occurred), and the first polled URI now gets the same random
 * redirect chance as all others (it was polled before the loop and skipped).
 * The printed totals are unchanged: the old code's {@code j} counted the
 * terminating null poll instead of the first element, giving the same sum.
 *
 * @throws Exception on I/O or URI syntax errors (test method — not handled)
 */
public void testNormalise() throws Exception {
    long start = System.currentTimeMillis();

    TldManager tldm = new TldManager();
    BreadthFirstQueue fq =
            new BreadthFirstQueue(tldm, Integer.MAX_VALUE, Integer.MAX_VALUE);

    Frontier f = new BasicFrontier();
    int i = 0;
    // try-with-resources closes the whole stream chain even if a line
    // fails to parse as a URI.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
            new GZIPInputStream(new FileInputStream("test/uris.txt.gz"))))) {
        String line;
        while ((line = br.readLine()) != null) {
            i++;
            f.add(new URI(line));
        }
    }

    fq.schedule(f);
    int size = fq.size();

    Random r = new Random();
    int j = 0;
    int redirects = 0;
    URI u;
    while ((u = fq.poll()) != null) {
        j++;
        // ~1% of polled URIs are converted into 303 redirects and re-queued,
        // exercising the redirect bookkeeping under load.
        if (r.nextFloat() < 0.01) {
            fq.setRedirect(u, new URI("http://dbpedia.org/resource/Redirect"), 303);
            fq.addDirectly(u);
            System.out.println("adding " + u);
            redirects++;
        }
    }

    long end = System.currentTimeMillis();
    System.out.println(fq);
    System.out.println(fq.size());
    System.out.println("initial queue size " + size);
    System.out.println("redirects " + redirects);
    System.out.println(i + " uris, " + j + " polled, in " + (end - start) + " ms");
}
public static void breadthFirstCrawling(String configFilePath) { // read xml config file Hashtable<String, String> configMap = XMLParser.parse(configFilePath); int numberOfThread = Integer.parseInt(configMap.get("numberOfThread")); String seedFile = configMap.get("seedFile"); int depth = Integer.parseInt(configMap.get("depth")); int maxURIs = Integer.parseInt(configMap.get("maxURIs")); int maxplds = Integer.parseInt(configMap.get("maxplds")); int minActplds = Integer.parseInt(configMap.get("minActplds")); String linkFilterDomain = configMap.get("linkFilterDomain"); Callback cboutput1 = null, cboutput2 = null; // set Crawler Crawler crawler = new Crawler(numberOfThread); // Add seed URIs to the Frontier // Frontier frontier = new BasicFrontier(); Frontier frontier = new RankedFrontier(); try { BufferedReader bReader = new BufferedReader(new FileReader(new File(seedFile))); String URI; while ((URI = bReader.readLine()) != null) { frontier.add(new URI(URI)); } } catch (FileNotFoundException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (URISyntaxException e) { // TODO Auto-generated catch block e.printStackTrace(); } //// // "http://localhost:8080/marmotta/resource?uri=http%3A%2F%2Flocalhost%2Finterpro%3AIPR002061")); // // "http://localhost:8080/marmotta/resource?uri=http%3A%2F%2Flocalhost%2Finterpro%3AIPR018218")); //// "http://localhost:8080/marmotta/resource?uri=http%3A%2F%2Flocalhost%2FTestResult")); //// "http://bio2rdf.org/interpro/describe/rdf/interpro:IPR002061")); //// check html version on // http://localhost:8080/marmotta/meta/text/html?uri=http%3A%2F%2Flocalhost%2FTestResult // content handler ContentHandler contentHandler; try { contentHandler = new ContentHandlers( new ContentHandlerRdfXml(), new ContentHandlerAny23(new URI("http://localhost:8080/apache-any23-service/"))); crawler.setContentHandler(contentHandler); } catch (URISyntaxException e1) { 
// TODO Auto-generated catch block e1.printStackTrace(); } // sink // try { // OutputStream os = new FileOutputStream("crawlerLog"); // cboutput1 = new MyCallback(os,new PartialMatchingForEachItem()); // OutputStream os2 = new FileOutputStream("crawlerLog_ED"); // cboutput2 = new MyCallback(os2,new EditDistance(0.8)); // OutputStream os3 = new FileOutputStream("crawlerLog_Full"); // Callback cboutput3 = new CallbackNQOutputStream(os3);//if used the thread will be stoped // when a rdf file is finished, don't know why // OutputStream os2 = new BufferedOutputStream(new FileOutputStream("CrawlGraph.gexf")); // cboutput2 = new CallbackGEXFOutputStream(os2); Callbacks cbs = new Callbacks(new Callback[] {}); // Sink sink = new SinkCallback(cbs,false); // crawler.setOutputCallback(sink); // cboutput1.startDocument(); // cboutput2.startDocument(); // } catch (FileNotFoundException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } // link filter and blacklist String[] domains = linkFilterDomain.split(";"); LinkFilter linkFilter; if (domains.length >= 0 && !domains[0].equals("")) { linkFilter = new LinkFilterDomain(frontier); for (String domian : domains) { ((LinkFilterDomain) linkFilter).addHost(domian); } } else { linkFilter = new LinkFilterDefault(frontier); // linkedfilter can add new uri into Frontier } crawler.setLinkFilter(linkFilter); FetchFilter blackList = new FetchFilterSuffix(CrawlerConstants.BLACKLIST); crawler.setBlacklistFilter(blackList); // error handler try { // Print to Stdout PrintStream ps = System.out; // Print to file FileOutputStream fos; fos = new FileOutputStream("errorLogFile"); // Add printstream and file stream to error handler Callback rcb = new CallbackNQOutputStream(fos); ErrorHandler eh = new ErrorHandlerLogger(ps, rcb, true); rcb.startDocument(); // Connect hooks with error handler crawler.setErrorHandler(eh); frontier.setErrorHandler(eh); linkFilter.setErrorHandler(eh); // // 
java.util.logging.Logger.getLogger("com.ontologycentral.ldspider").setLevel(java.util.logging.Level.WARNING); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } crawler.evaluateBreadthFirst( frontier, new HashSetSeen(), new HashTableRedirects(), depth, maxURIs, maxplds, minActplds, false, Mode.ABOX_AND_TBOX); // ((CallbackGEXFOutputStream) cboutput2).readyToClose(); // cboutput2.endDocument(); // print callback1 result // System.out.println(cboutput1.toString()); ((LinkFilterDefault) linkFilter).printStatistics(); }