예제 #1
0
  /**
   * Starts a Firefox crawl of {@code URL} with the WebScarab proxy plugin registered and the
   * browser proxy set to {@code 127.0.0.1:8084}.
   *
   * @param args command-line arguments; not used.
   * @throws IOException when the output folder cannot be created or emptied.
   */
  public static void main(String[] args) throws IOException {
    CrawljaxConfigurationBuilder setup = CrawljaxConfiguration.builderFor(URL);
    setup.crawlRules().insertRandomDataInInputForms(false);

    // Register the WebScarab proxy plugin, extended with an add-on that is handed the
    // JavaScript file to be inserted.
    File scriptToInsert = new File("foo.js");
    WebScarabProxyPlugin webScarab = new WebScarabProxyPlugin();
    webScarab.addPlugin(new JSInjectorProxyAddon(scriptToInsert));
    setup.addPlugin(webScarab);

    // The proxy is expected on local port 8084; adjust host/port here if needed.
    ProxyConfiguration localProxy = ProxyConfiguration.manualProxyOn("127.0.0.1", 8084);
    setup.setProxyConfig(localProxy);

    // Elements to click: the defaults plus every <div class="clickable">.
    setup.crawlRules().clickDefaultElements();
    setup.crawlRules().click("div").withAttribute("class", "clickable");

    // Stop after four states.
    setup.setMaximumStates(4);

    // Pauses (in milliseconds) after a URL reload and after each fired event.
    setup.crawlRules().waitAfterReloadUrl(WAIT_TIME_AFTER_RELOAD, TimeUnit.MILLISECONDS);
    setup.crawlRules().waitAfterEvent(WAIT_TIME_AFTER_EVENT, TimeUnit.MILLISECONDS);

    BrowserConfiguration singleFirefox = new BrowserConfiguration(BrowserType.firefox, 1);
    setup.setBrowserConfig(singleFirefox);

    CrawljaxConfiguration configuration = setup.build();
    new CrawljaxRunner(configuration).call();
  }
예제 #2
0
  /**
   * Entry point: crawls {@code URL} with Firefox, limited to {@code MAX_STATES} states and a depth
   * of {@code MAX_CRAWL_DEPTH}.
   *
   * @param args command-line arguments; not used.
   */
  public static void main(String[] args) {
    // Sets the webdriver.firefox.bin system property to a fixed Windows install path.
    System.setProperty(
        "webdriver.firefox.bin", "C:\\Program Files (x86)\\Mozilla Firefox\\Firefox.exe");

    CrawljaxConfigurationBuilder setup = CrawljaxConfiguration.builderFor(URL);
    setup.setBrowserConfig(new BrowserConfiguration(BrowserType.firefox));

    // Bound the crawl by both state count and depth.
    setup.setMaximumStates(MAX_STATES);
    setup.setMaximumDepth(MAX_CRAWL_DEPTH);

    // Form input values are supplied by getInputSpecification().
    setup.crawlRules().setInputSpec(getInputSpecification());

    CrawljaxConfiguration configuration = setup.build();
    new CrawljaxController(configuration).run();
  }
  /**
   * Builds the Crawljax configuration described by {@code options}.
   *
   * <p>When the configured URL starts with {@code http://} or {@code https://} (compared
   * case-insensitively) it is crawled as-is. Otherwise it is treated as the path of a local file:
   * a server is started on port 8080 with the file's parent directory as its base resource, the
   * server is stored in the {@code server} field, and the crawl targets
   * {@code http://localhost:<port>/<file name>}.
   *
   * <p>A plugin is registered that, on every new state, stops the crawl if {@code cancelSupplier}
   * is set and returns {@code true}, then adds the state's document to {@code documents} and
   * passes it to {@code notifyObservers}.
   *
   * @return the built configuration.
   * @throws IllegalStateException if the local file cannot be served (for example because the
   *     server fails to start); the original failure is attached as the cause.
   */
  private CrawljaxConfiguration getCrawljaxBuilder() {
    String requestedUrl = options.getUrl();
    // Locale-independent, case-insensitive scheme check. https targets are remote URLs as well
    // and must not be mistaken for local file paths.
    boolean isRemote =
        requestedUrl.regionMatches(true, 0, "http://", 0, "http://".length())
            || requestedUrl.regionMatches(true, 0, "https://", 0, "https://".length());

    String url;
    if (isRemote) {
      url = requestedUrl;
    } else {
      server = new Server(8080);
      ResourceHandler handler = new ResourceHandler();
      // Resolve to an absolute file first so that a bare relative name such as "index.html"
      // still has a parent directory to serve.
      File fileToCrawl = new File(requestedUrl).getAbsoluteFile();
      try {
        handler.setBaseResource(
            Resource.newResource(fileToCrawl.getParentFile().getAbsolutePath()));
        server.setHandler(handler);
        server.start();
        int port = ((ServerConnector) server.getConnectors()[0]).getLocalPort();
        url = "http://localhost:" + port + "/" + fileToCrawl.getName();
      } catch (Exception e) {
        // Without a running server there is no URL to crawl; fail here instead of silently
        // building a configuration for an empty URL.
        throw new IllegalStateException(
            "Could not serve local file for crawling: " + fileToCrawl, e);
      }
    }

    CrawljaxConfigurationBuilder builder = CrawljaxConfiguration.builderFor(url);
    builder.addPlugin(
        new OnNewStatePlugin() {
          @Override
          public void onNewState(CrawlerContext context, StateVertex newState) {
            if (cancelSupplier != null && cancelSupplier.getAsBoolean()) {
              crawljax.stop();
            }
            try {
              Document document = newState.getDocument();
              documents.add(document);
              notifyObservers(document);
            } catch (IOException e) {
              // Best effort: a state whose document cannot be read is skipped.
              e.printStackTrace();
            }
          }
        });
    builder.setBrowserConfig(new BrowserConfiguration(BrowserType.PHANTOMJS, 1));
    builder.setOutputDirectory(
        new File(options.getOutputDirectory().getAbsolutePath() + "/crawljax"));
    builder.setMaximumDepth(options.getMaxDepth());
    builder.setMaximumStates(options.getMaxStates());

    CrawlRulesBuilder crawlRules = builder.crawlRules();
    if (options.shouldClickDefaultElements()) {
      crawlRules.clickDefaultElements();
    }
    for (String dontClick : options.getDontClickElements()) {
      crawlRules.dontClick(dontClick);
      // TODO: .withAttribute("value", "I don't recognize");
      // TODO: .underXPath("//*[@id='pageFooter']");
      //       .underXPath("//*[@id='content']/div/div[2]");
    }
    for (String click : options.getClickElements()) {
      crawlRules.click(click);
      // TODO: .withAttribute("type", "submit");
    }
    for (String dontClick : options.getDontClickElementsChildrenOf()) {
      crawlRules.dontClickChildrenOf(dontClick);
    }
    crawlRules.insertRandomDataInInputForms(options.shouldPutRandomDataInForms());
    crawlRules.clickElementsInRandomOrder(options.shouldClickElementsInRandomOrder());
    crawlRules.crawlFrames(options.shouldCrawlFrames());
    crawlRules.waitAfterReloadUrl(options.getWaitTimeAferReload(), TimeUnit.MILLISECONDS);
    crawlRules.waitAfterEvent(options.getWaitTimeAfterEvent(), TimeUnit.MILLISECONDS);
    crawlRules.clickOnce(options.shouldClickOnce());
    crawlRules.crawlHiddenAnchors(options.shouldCrawlHiddenAnchorsButton());

    return builder.build();
  }