private void crawlForEmailAddresses(String urlSeedToBeScraped) {
    // SELECT shopurl FROM shops.shops where placetypes like '%clothing%'
    // and shopurl is not null and shopurl <> '';
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(
        "C:\\Users\\janitha\\Documents\\NetBeansProjects\\crawlshopsforcoucou\\target\\interim");
    config.setPolitenessDelay(1000);
    config.setMaxDepthOfCrawling(1);
    config.setIncludeBinaryContentInCrawling(false);
    config.setResumableCrawling(false);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = null;
    String shopUrl = urlSeedToBeScraped;
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed(shopUrl);
        EmailScraper.setStartWithDomain(shopUrl);
        controller.start(EmailScraper.class, 1);
    } catch (Exception ex) {
        Logger.getLogger(EmailScraperTest.class.getName())
            .log(Level.SEVERE, "Something went wrong: " + ex.getMessage(), ex);
    }
}
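/*
 * The snippet above references an EmailScraper crawler class that is not shown here.
 * What follows is a minimal sketch of what such a class could look like, assuming that
 * setStartWithDomain simply records the seed URL so shouldVisit can stay on-site, and
 * that e-mail addresses are extracted from the page text with a regex. The field names,
 * the pattern, and the prefix-based domain check are illustrative, not taken from the
 * original project. (In older crawler4j versions, shouldVisit takes only a WebURL.)
 */
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class EmailScraper extends WebCrawler {

    private static String startWithDomain = "";
    private static final Pattern EMAIL_PATTERN =
        Pattern.compile("[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}");

    public static void setStartWithDomain(String domain) {
        startWithDomain = domain;
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Only follow links that stay on the seed's site (simple prefix check).
        return url.getURL().toLowerCase().startsWith(startWithDomain.toLowerCase());
    }

    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            String text = ((HtmlParseData) page.getParseData()).getText();
            Matcher matcher = EMAIL_PATTERN.matcher(text);
            while (matcher.find()) {
                System.out.println("Found e-mail address: " + matcher.group());
            }
        }
    }
}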
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "H:\\DB_51aspx\\";
    int numberOfCrawlers = 7;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following the links
     * found in these pages.
     */
    controller.addSeed("http://manyou.189.cn/xzgj/index.html");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(MyCrawler.class, numberOfCrawlers);
}
public void runCrawler() {
    final int numberOfCrawlers = 4;
    final CrawlConfig config = getCrawlConfig();
    try {
        final CrawlController controller = getCrawlController(config);
        controller.addSeed(
            "http://www.gotvetesmen.com/recipes/categories/desserts/cake/index.php?pageID=1");
        controller.start(BasicCrawler.class, numberOfCrawlers);
    } catch (Exception e) {
        throw new RuntimeException("Could not initialize CrawlController", e);
    }
}
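/*
 * runCrawler() above relies on two helpers, getCrawlConfig() and getCrawlController(),
 * that are not shown. A minimal sketch under the same configuration pattern used by the
 * other snippets follows; the storage folder name "crawler_storage" and the politeness
 * and depth values are assumptions, not the original settings.
 */
private CrawlConfig getCrawlConfig() {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("crawler_storage"); // hypothetical folder name
    config.setPolitenessDelay(1000);
    config.setMaxDepthOfCrawling(2);
    return config;
}

private CrawlController getCrawlController(CrawlConfig config) throws Exception {
    // Same wiring as the other examples: page fetcher + robots.txt handling + controller.
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    return new CrawlController(config, pageFetcher, robotstxtServer);
}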
public static void main(String[] args) {
    try {
        String crawlStorageFolder = "crawler_data";
        int numberOfCrawlers = 1;

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);
        config.setPolitenessDelay(1000);
        config.setMaxDepthOfCrawling(0);
        config.setMaxPagesToFetch(10000);
        config.setResumableCrawling(false);

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        // Load the seed URLs from a properties file; each property value is one seed URL.
        Properties myProps = new Properties();
        try (FileInputStream propInputStream = new FileInputStream("chemsyno.properties")) {
            myProps.load(propInputStream);
        } catch (IOException e) {
            e.printStackTrace();
        }

        Enumeration<?> eProps = myProps.propertyNames();
        while (eProps.hasMoreElements()) {
            String key = (String) eProps.nextElement();
            String value = myProps.getProperty(key);
            System.out.println(value);
            controller.addSeed(value);
        }

        // controller.addSeed("http://en.wikipedia.org/wiki/Category:Annulenes");
        // controller.addSeed("http://en.wikipedia.org/wiki/Category:Cycloalkenes");
        // controller.addSeed("http://www.webref.org/scientists/scientists.htm");

        controller.start(BasicCrawler.class, numberOfCrawlers);
    } catch (Exception e) {
        System.err.println("Could not start the crawler: " + e.getMessage());
        e.printStackTrace();
    }
}
public static void main(String[] args) throws Exception {
    String rootFolder = "/tmp";
    int numberOfCrawlers = 1;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setPolitenessDelay(1000);
    config.setMaxDepthOfCrawling(2);
    config.setMaxPagesToFetch(1);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("http://hadoop.apache.org/");
    controller.addSeed("http://hadoop.apache.org/common/");
    controller.addSeed("http://hadoop.apache.org/hdfs/");
    controller.addSeed("http://hadoop.apache.org/mapreduce/");
    controller.addSeed("http://avro.apache.org/");
    controller.addSeed("http://hbase.apache.org/");
    controller.addSeed("http://hive.apache.org/");
    controller.addSeed("http://pig.apache.org/");
    controller.addSeed("http://zookeeper.apache.org/");

    controller.start(MyCrawler.class, numberOfCrawlers);
}
public static void main(String[] args) throws Exception {
    String rootFolder = "data/crowler";
    int numberOfCrawlers = 1;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setMaxPagesToFetch(4);
    config.setPolitenessDelay(1000);
    config.setMaxDepthOfCrawling(10);
    // config.setProxyHost("cache.mrt.ac.lk");
    // config.setProxyPort(3128);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("http://www.lankadeepa.lk/");
    controller.start(Crawler.class, numberOfCrawlers);
}
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        System.out.println("\t storageFolder (a folder for storing downloaded images)");
        return;
    }

    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    String storageFolder = args[2];

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);

    /*
     * Since images are binary content, we need to set this parameter to
     * true to make sure they are included in the crawl.
     */
    config.setIncludeBinaryContentInCrawling(true);

    String[] crawlDomains = new String[] {"http://uci.edu/"};

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    for (String domain : crawlDomains) {
        controller.addSeed(domain);
    }

    ImageCrawler.configure(crawlDomains, storageFolder);

    controller.start(ImageCrawler.class, numberOfCrawlers);
}
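/*
 * The ImageCrawler referenced above, with its static configure(crawlDomains, storageFolder)
 * hook, is not shown. Below is a minimal sketch of what it could look like: it only follows
 * links inside the configured domains, treats URLs ending in common image extensions as
 * images, and writes their raw bytes into the storage folder. The extension list and the
 * hash-based file naming are assumptions, not the original implementation, and the
 * two-argument shouldVisit again assumes crawler4j 4.x.
 */
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class ImageCrawler extends WebCrawler {

    private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpe?g|png)$");

    private static String[] crawlDomains;
    private static File storageFolder;

    public static void configure(String[] domains, String storageFolderName) {
        crawlDomains = domains;
        storageFolder = new File(storageFolderName);
        if (!storageFolder.exists()) {
            storageFolder.mkdirs();
        }
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Stay inside the configured crawl domains.
        String href = url.getURL().toLowerCase();
        for (String domain : crawlDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        // Only store responses whose URL looks like an image.
        if (!IMAGE_EXTENSIONS.matcher(url.toLowerCase()).matches()) {
            return;
        }
        try {
            // Hypothetical naming scheme: hash of the URL plus the original extension.
            String extension = url.substring(url.lastIndexOf('.'));
            File target = new File(storageFolder, url.hashCode() + extension);
            Files.write(target.toPath(), page.getContentData());
        } catch (IOException e) {
            System.err.println("Could not store image from " + url + ": " + e.getMessage());
        }
    }
}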
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];

    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Be polite: make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).
     */
    config.setPolitenessDelay(1000);

    /*
     * You can set the maximum crawl depth here. The default value is -1
     * for unlimited depth.
     */
    config.setMaxDepthOfCrawling(2);

    /*
     * You can set the maximum number of pages to crawl. The default value
     * is -1 for an unlimited number of pages.
     */
    config.setMaxPagesToFetch(1000);

    /*
     * Do you want crawler4j to also crawl binary data? For example, the
     * contents of PDFs, or the metadata of images, etc.
     */
    config.setIncludeBinaryContentInCrawling(false);

    /*
     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("proxyserver.example.com");
     * config.setProxyPort(8080);
     *
     * If your proxy also needs authentication:
     * config.setProxyUsername(username);
     * config.setProxyPassword(password);
     */

    /*
     * This config parameter can be used to make your crawl resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable the resuming feature
     * and want to start a fresh crawl, you need to delete the contents of
     * the root folder manually.
     */
    config.setResumableCrawling(false);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following the links
     * found in these pages.
     */
    controller.addSeed(UrlStartingSeed.ICS_DOMAIN);

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(BasicCrawler.class, numberOfCrawlers);
}
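/*
 * controller.start(...) above blocks until the crawl finishes. If the surrounding
 * application needs to keep working while the crawl runs, crawler4j also provides a
 * non-blocking variant. The sketch below assumes the same controller, BasicCrawler, and
 * numberOfCrawlers as in the example above and would replace its final controller.start(...)
 * call; the 30-second wait is only an illustrative placeholder for other application work.
 */
controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);

// The main thread is free to do other work while the crawlers run in the background.
Thread.sleep(30 * 1000);

// Ask the crawlers to stop gracefully, then wait for all crawler threads to finish.
controller.shutdown();
controller.waitUntilFinish();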