/** * Run this method to start the crawl. * * @throws IOException when the output folder cannot be created or emptied. */ public static void main(String[] args) throws IOException { CrawljaxConfigurationBuilder builder = CrawljaxConfiguration.builderFor(URL); builder.crawlRules().insertRandomDataInInputForms(false); // We are going to use the WebScarab proxy plugin WebScarabProxyPlugin proxyPlugin = new WebScarabProxyPlugin(); // Provide the JS file to be inserted proxyPlugin.addPlugin(new JSInjectorProxyAddon(new File("foo.js"))); builder.addPlugin(proxyPlugin); // Configure the proxy to use the port 8084 (you can change this of course) builder.setProxyConfig(ProxyConfiguration.manualProxyOn("127.0.0.1", 8084)); // click these elements builder.crawlRules().clickDefaultElements(); builder.crawlRules().click("div").withAttribute("class", "clickable"); builder.setMaximumStates(4); // Set timeouts builder.crawlRules().waitAfterReloadUrl(WAIT_TIME_AFTER_RELOAD, TimeUnit.MILLISECONDS); builder.crawlRules().waitAfterEvent(WAIT_TIME_AFTER_EVENT, TimeUnit.MILLISECONDS); builder.setBrowserConfig(new BrowserConfiguration(BrowserType.firefox, 1)); CrawljaxRunner crawljax = new CrawljaxRunner(builder.build()); crawljax.call(); }
/** entry point */ public static void main(String[] args) { CrawljaxConfigurationBuilder builder = CrawljaxConfiguration.builderFor(URL); builder.crawlRules().insertRandomDataInInputForms(false); builder.crawlRules().click("a"); // click these elements builder.crawlRules().clickDefaultElements(); builder.crawlRules().click("div").withAttribute("class", "clickable"); // but don't click these builder.crawlRules().dontClick("a").withAttribute("class", "ignore"); builder.crawlRules().dontClick("a").underXPath("//DIV[@id='footer']"); // Set timeouts builder.crawlRules().waitAfterReloadUrl(WAIT_TIME_AFTER_RELOAD, TimeUnit.MILLISECONDS); builder.crawlRules().waitAfterEvent(WAIT_TIME_AFTER_EVENT, TimeUnit.MILLISECONDS); // Add a condition that this XPath doesn't exits builder .crawlRules() .addCrawlCondition( "No spans with foo as class", new NotXPathCondition("//*[@class='foo']")); // Set some input for fields builder.crawlRules().setInputSpec(getInputSpecification()); // This will generate a nice output in the output directory. builder.addPlugin(new CrawlOverview(new File(outputDir))); CrawljaxController crawljax = new CrawljaxController(builder.build()); crawljax.run(); }
/** * Make sure InvariantViolationPlugin executed. * * @throws ConfigurationException when failure configuring Properties */ @Test public void testInvariantFailurePlugin() throws ConfigurationException { hit = false; CrawljaxConfigurationBuilder builder = CrawljaxConfiguration.builderFor("http://localhost") .addPlugin( new OnInvariantViolationPlugin() { @Override public void onInvariantViolation(Invariant invariant, CrawlSession session) { hit = true; } }); builder .crawlRules() .addInvariant( new Invariant( "Test123", new Condition() { @Override public NodeList getAffectedNodes() { return null; } @Override public boolean check(EmbeddedBrowser browser) { return false; } })); setStateMachineForConfig(builder.build()); // state2.equals(state3) StateVertex state2 = new StateVertex("state2", "<table><div>state2</div></table>"); StateVertex state3 = new StateVertex("state3", "<table><div>state2</div></table>"); Eventable c = new Eventable(new Identification(How.xpath, "/bla"), EventType.click); assertTrue(sm.updateAndCheckIfClone(c, state2, dummyBrowser, new CrawlSession(dummyPool))); // New State so hit must be true; assertTrue("InvariantViolationPlugin are exeucted", hit); hit = false; assertFalse("Hit reseted", hit); Eventable c2 = new Eventable(new Identification(How.xpath, "/bla"), EventType.click); assertFalse(sm.updateAndCheckIfClone(c2, state3, dummyBrowser, new CrawlSession(dummyPool))); // New State so plugin execution assertTrue("InvariantViolationPlugin are exeucted", hit); }
/** Entry point */ public static void main(String[] args) { System.setProperty( "webdriver.firefox.bin", "C:\\Program Files (x86)\\Mozilla Firefox\\Firefox.exe"); CrawljaxConfigurationBuilder builder = CrawljaxConfiguration.builderFor(URL); BrowserConfiguration browserConfig = new BrowserConfiguration(BrowserType.firefox); builder.setBrowserConfig(browserConfig); // PopUpCancel configuration = new PopUpCancel(); // limit the crawling scope builder.setMaximumStates(MAX_STATES); builder.setMaximumDepth(MAX_CRAWL_DEPTH); builder.crawlRules().setInputSpec(getInputSpecification()); CrawljaxController crawljax = new CrawljaxController(builder.build()); crawljax.run(); }
private CrawljaxConfiguration getCrawljaxBuilder() { String url = ""; if (options.getUrl().toLowerCase().startsWith("http://")) { url = options.getUrl(); } else { server = new Server(8080); ResourceHandler handler = new ResourceHandler(); try { File fileToCrawl = new File(options.getUrl()); handler.setBaseResource( Resource.newResource(fileToCrawl.getParentFile().getAbsolutePath())); server.setHandler(handler); server.start(); int port = ((ServerConnector) server.getConnectors()[0]).getLocalPort(); url = "http://localhost:" + port + "/" + fileToCrawl.getName(); // URI.create(url); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } CrawljaxConfigurationBuilder builder = CrawljaxConfiguration.builderFor(url); // builder.addPlugin(new CrawlOverview()); builder.addPlugin( new OnNewStatePlugin() { @Override public void onNewState(CrawlerContext arg0, StateVertex arg1) { if (cancelSupplier != null) { if (cancelSupplier.getAsBoolean()) { crawljax.stop(); } } Document document; try { document = arg1.getDocument(); documents.add(document); notifyObservers(document); } catch (IOException e) { e.printStackTrace(); } } }); builder.setBrowserConfig(new BrowserConfiguration(BrowserType.PHANTOMJS, 1)); builder.setOutputDirectory( new File(options.getOutputDirectory().getAbsolutePath() + "/crawljax")); builder.setMaximumDepth(options.getMaxDepth()); builder.setMaximumStates(options.getMaxStates()); CrawlRulesBuilder crawlRules = builder.crawlRules(); if (options.shouldClickDefaultElements()) { crawlRules.clickDefaultElements(); } if (options.getDontClickElements().size() > 0) { for (String dontClick : options.getDontClickElements()) { crawlRules.dontClick(dontClick); // TODO: .withAttribute("value", "I don't recognize"); // TODO: .underXPath("//*[@id='pageFooter']"); // .underXPath("//*[@id='content']/div/div[2]"); } } if (options.getClickElements().size() > 0) { for (String click : options.getClickElements()) { crawlRules.click(click); // TODO: .withAttribute("type", "submit"); } } if (options.getDontClickElementsChildrenOf().size() > 0) { for (String dontClick : options.getDontClickElementsChildrenOf()) { crawlRules.dontClickChildrenOf(dontClick); } } crawlRules.insertRandomDataInInputForms(options.shouldPutRandomDataInForms()); crawlRules.clickElementsInRandomOrder(options.shouldClickElementsInRandomOrder()); crawlRules.crawlFrames(options.shouldCrawlFrames()); crawlRules.waitAfterReloadUrl(options.getWaitTimeAferReload(), TimeUnit.MILLISECONDS); crawlRules.waitAfterEvent(options.getWaitTimeAfterEvent(), TimeUnit.MILLISECONDS); crawlRules.clickOnce(options.shouldClickOnce()); crawlRules.crawlHiddenAnchors(options.shouldCrawlHiddenAnchorsButton()); return builder.build(); }