Пример #1
0
  /**
   * Starts a crawl of {@code URL} with a WebScarab proxy plugin that carries a
   * JavaScript-injecting addon.
   *
   * @param args command-line arguments; never read.
   * @throws IOException propagated from the calls below; none of them is wrapped in a try block.
   */
  public static void main(String[] args) throws IOException {
    CrawljaxConfigurationBuilder config = CrawljaxConfiguration.builderFor(URL);

    // Browser: Firefox, constructed with the argument 1.
    config.setBrowserConfig(new BrowserConfiguration(BrowserType.firefox, 1));

    // Manual proxy on 127.0.0.1, port 8084 (pick another port if this one is taken).
    config.setProxyConfig(ProxyConfiguration.manualProxyOn("127.0.0.1", 8084));

    // WebScarab proxy plugin, with an injector addon built from the JS file foo.js.
    WebScarabProxyPlugin webScarab = new WebScarabProxyPlugin();
    webScarab.addPlugin(new JSInjectorProxyAddon(new File("foo.js")));
    config.addPlugin(webScarab);

    // Upper bound on the number of states.
    config.setMaximumStates(4);

    // Crawl rules: no random form input; click the defaults plus <div class="clickable">.
    config.crawlRules().insertRandomDataInInputForms(false);
    config.crawlRules().clickDefaultElements();
    config.crawlRules().click("div").withAttribute("class", "clickable");

    // Waiting times, both given in milliseconds.
    config.crawlRules().waitAfterReloadUrl(WAIT_TIME_AFTER_RELOAD, TimeUnit.MILLISECONDS);
    config.crawlRules().waitAfterEvent(WAIT_TIME_AFTER_EVENT, TimeUnit.MILLISECONDS);

    new CrawljaxRunner(config.build()).call();
  }
  /**
   * Entry point: configures click rules, timeouts, a crawl condition, form input and the
   * {@code CrawlOverview} plugin for {@code URL}, then runs the crawl.
   *
   * @param args command-line arguments; never read.
   */
  public static void main(String[] args) {
    CrawljaxConfigurationBuilder config = CrawljaxConfiguration.builderFor(URL);

    // Waiting times, both given in milliseconds.
    config.crawlRules().waitAfterReloadUrl(WAIT_TIME_AFTER_RELOAD, TimeUnit.MILLISECONDS);
    config.crawlRules().waitAfterEvent(WAIT_TIME_AFTER_EVENT, TimeUnit.MILLISECONDS);

    // Form handling: no random data.
    config.crawlRules().insertRandomDataInInputForms(false);

    // Elements to click: anchors, the defaults, and <div class="clickable">.
    config.crawlRules().click("a");
    config.crawlRules().clickDefaultElements();
    config.crawlRules().click("div").withAttribute("class", "clickable");

    // Anchors to leave alone: class "ignore", and anything under the footer div.
    config.crawlRules().dontClick("a").withAttribute("class", "ignore");
    config.crawlRules().dontClick("a").underXPath("//DIV[@id='footer']");

    // Crawl condition built from an XPath that must NOT match: any element with class "foo".
    config
        .crawlRules()
        .addCrawlCondition(
            "No spans with foo as class", new NotXPathCondition("//*[@class='foo']"));

    // Input values for form fields.
    config.crawlRules().setInputSpec(getInputSpecification());

    // Overview plugin, constructed with the output directory.
    config.addPlugin(new CrawlOverview(new File(outputDir)));

    new CrawljaxController(config.build()).run();
  }
Пример #3
0
  /**
   * Checks that a registered {@code OnInvariantViolationPlugin} is invoked on state-machine
   * updates when the configured invariant's condition reports {@code false}.
   *
   * @throws ConfigurationException when failure configuring Properties
   */
  @Test
  public void testInvariantFailurePlugin() throws ConfigurationException {
    hit = false;

    // Plugin that does nothing except record that it was called.
    OnInvariantViolationPlugin recorder =
        new OnInvariantViolationPlugin() {
          @Override
          public void onInvariantViolation(Invariant invariant, CrawlSession session) {
            hit = true;
          }
        };

    // Condition whose check always returns false and which reports no affected nodes.
    Condition alwaysFalse =
        new Condition() {
          @Override
          public boolean check(EmbeddedBrowser browser) {
            return false;
          }

          @Override
          public NodeList getAffectedNodes() {
            return null;
          }
        };

    CrawljaxConfigurationBuilder builder =
        CrawljaxConfiguration.builderFor("http://localhost").addPlugin(recorder);
    builder.crawlRules().addInvariant(new Invariant("Test123", alwaysFalse));
    setStateMachineForConfig(builder.build());

    // Two vertices with different names but the same DOM string.
    StateVertex firstVertex = new StateVertex("state2", "<table><div>state2</div></table>");
    StateVertex secondVertex = new StateVertex("state3", "<table><div>state2</div></table>");

    Eventable firstClick = new Eventable(new Identification(How.xpath, "/bla"), EventType.click);
    boolean firstResult =
        sm.updateAndCheckIfClone(
            firstClick, firstVertex, dummyBrowser, new CrawlSession(dummyPool));
    assertTrue(firstResult);

    // The first update must have triggered the plugin.
    assertTrue("InvariantViolationPlugin are exeucted", hit);

    // Reset the flag before the second update.
    hit = false;
    assertFalse("Hit reseted", hit);

    Eventable secondClick = new Eventable(new Identification(How.xpath, "/bla"), EventType.click);
    boolean secondResult =
        sm.updateAndCheckIfClone(
            secondClick, secondVertex, dummyBrowser, new CrawlSession(dummyPool));
    assertFalse(secondResult);

    // The plugin must have been triggered by the second update as well.
    assertTrue("InvariantViolationPlugin are exeucted", hit);
  }
Пример #4
0
  /**
   * Entry point: points WebDriver at a specific Firefox binary, then crawls {@code URL} with
   * bounded state count and depth.
   *
   * @param args command-line arguments; never read.
   */
  public static void main(String[] args) {
    // Must be set before the crawl starts; the path is a Windows-specific install location.
    System.setProperty(
        "webdriver.firefox.bin", "C:\\Program Files (x86)\\Mozilla Firefox\\Firefox.exe");

    CrawljaxConfigurationBuilder config = CrawljaxConfiguration.builderFor(URL);
    config.setBrowserConfig(new BrowserConfiguration(BrowserType.firefox));

    // Input values for form fields.
    config.crawlRules().setInputSpec(getInputSpecification());

    // Limit the crawling scope.
    config.setMaximumStates(MAX_STATES);
    config.setMaximumDepth(MAX_CRAWL_DEPTH);

    new CrawljaxController(config.build()).run();
  }
  /**
   * Builds the Crawljax configuration described by {@code options}.
   *
   * <p>If the configured URL starts with {@code http://} or {@code https://} (case-insensitive)
   * it is crawled as is. Otherwise it is treated as the path of a local file: an embedded
   * server is started that serves the file's parent directory, and the crawl targets the file
   * through {@code http://localhost:<port>/<file name>}. The started server is kept in the
   * {@code server} field.
   *
   * <p>A plugin is registered that, for every new state, stops the crawl when
   * {@code cancelSupplier} says so, stores the state's document in {@code documents} and passes
   * it to {@code notifyObservers}.
   *
   * @return the finished configuration
   * @throws IllegalArgumentException if the local path has no parent directory to serve
   * @throws IllegalStateException if the embedded server for a local file cannot be started
   */
  private CrawljaxConfiguration getCrawljaxBuilder() {
    String target = options.getUrl();
    String url;
    // Case-insensitive prefix test; https must be accepted too, otherwise a secure URL would
    // be mistaken for a file path.
    boolean isWebUrl =
        target.regionMatches(true, 0, "http://", 0, "http://".length())
            || target.regionMatches(true, 0, "https://", 0, "https://".length());
    if (isWebUrl) {
      url = target;
    } else {
      // Resolve to an absolute file first: a bare relative name such as "index.html" has no
      // parent until it is made absolute.
      File fileToCrawl = new File(target).getAbsoluteFile();
      File directoryToServe = fileToCrawl.getParentFile();
      if (directoryToServe == null) {
        throw new IllegalArgumentException("Cannot serve '" + target + "': no parent directory");
      }
      server = new Server(8080);
      ResourceHandler handler = new ResourceHandler();
      try {
        handler.setBaseResource(Resource.newResource(directoryToServe.getAbsolutePath()));
        server.setHandler(handler);
        server.start();
        int port = ((ServerConnector) server.getConnectors()[0]).getLocalPort();
        url = "http://localhost:" + port + "/" + fileToCrawl.getName();
      } catch (Exception e) {
        // Without a running server there is nothing to crawl; fail loudly and keep the cause
        // instead of continuing with an empty URL.
        throw new IllegalStateException("Could not start local server for '" + target + "'", e);
      }
    }

    CrawljaxConfigurationBuilder builder = CrawljaxConfiguration.builderFor(url);
    builder.addPlugin(
        new OnNewStatePlugin() {
          @Override
          public void onNewState(CrawlerContext context, StateVertex state) {
            if (cancelSupplier != null && cancelSupplier.getAsBoolean()) {
              crawljax.stop();
            }
            try {
              Document document = state.getDocument();
              documents.add(document);
              notifyObservers(document);
            } catch (IOException e) {
              // Best effort: a state whose document cannot be obtained is skipped.
              e.printStackTrace();
            }
          }
        });
    builder.setBrowserConfig(new BrowserConfiguration(BrowserType.PHANTOMJS, 1));
    builder.setOutputDirectory(
        new File(options.getOutputDirectory().getAbsolutePath() + "/crawljax"));
    builder.setMaximumDepth(options.getMaxDepth());
    builder.setMaximumStates(options.getMaxStates());

    CrawlRulesBuilder crawlRules = builder.crawlRules();
    if (options.shouldClickDefaultElements()) {
      crawlRules.clickDefaultElements();
    }
    // An empty collection simply yields no iterations, so no size check is needed.
    for (String dontClick : options.getDontClickElements()) {
      crawlRules.dontClick(dontClick);
      // TODO: .withAttribute("value", "I don't recognize");
      // TODO: .underXPath("//*[@id='pageFooter']");
    }
    for (String click : options.getClickElements()) {
      crawlRules.click(click);
      // TODO: .withAttribute("type", "submit");
    }
    for (String dontClick : options.getDontClickElementsChildrenOf()) {
      crawlRules.dontClickChildrenOf(dontClick);
    }
    crawlRules.insertRandomDataInInputForms(options.shouldPutRandomDataInForms());
    crawlRules.clickElementsInRandomOrder(options.shouldClickElementsInRandomOrder());
    crawlRules.crawlFrames(options.shouldCrawlFrames());
    crawlRules.waitAfterReloadUrl(options.getWaitTimeAferReload(), TimeUnit.MILLISECONDS);
    crawlRules.waitAfterEvent(options.getWaitTimeAfterEvent(), TimeUnit.MILLISECONDS);
    crawlRules.clickOnce(options.shouldClickOnce());
    crawlRules.crawlHiddenAnchors(options.shouldCrawlHiddenAnchorsButton());

    return builder.build();
  }