protected void diveIn( final ScrapeContext context, final Page page, final int currentDepth, final ParentOMatic parentOMatic, final Node<Payload> currentNode) throws IOException { // entry protection if (currentDepth >= context.getScrapeDepth()) { return; } // cancelation CancelableUtil.checkInterruption(); getLogger().debug("Processing page response from URL {}", page.getUrl()); final Elements elements = page.getDocument().getElementsByTag("a"); final List<String> pathElements = currentNode.getPathElements(); final String currentPath = currentNode.getPath(); for (Element element : elements) { if (isDeeperRepoLink(context, pathElements, element)) { if (element.text().startsWith(".")) { // skip hidden paths continue; } final Node<Payload> newSibling = parentOMatic.addPath(currentPath + "/" + element.text()); if (element.absUrl("href").endsWith("/")) { // "cut" recursion preemptively to save remote fetch (and then stop recursion due to // depth) final int siblingDepth = currentDepth + 1; if (siblingDepth < context.getScrapeDepth()) { maySleepBeforeSubsequentFetch(); final String newSiblingEncodedUrl = getRemoteUrlForRepositoryPath(context, newSibling.getPathElements()) + "/"; final Page siblingPage = Page.getPageFor(context, newSiblingEncodedUrl); if (siblingPage.getHttpResponse().getStatusLine().getStatusCode() == 200) { diveIn(context, siblingPage, siblingDepth, parentOMatic, newSibling); } else { // we do expect strictly 200 here throw new UnexpectedPageResponse( page.getUrl(), page.getHttpResponse().getStatusLine()); } } } } } }
/**
 * Performs the refresh right away by loading the given URL into the window that encloses the
 * given page.
 *
 * <p>If the page has no enclosing window, nothing happens. If the target URL is identical to the
 * page's current URL and the page was requested with GET, the refresh is refused with a
 * {@link RuntimeException} rather than performed; the exception message points to
 * WaitingRefreshHandler and ThreadedRefreshHandler as alternatives.
 *
 * @param page the page being refreshed
 * @param url the URL to load in place of the page
 * @param seconds the requested delay in seconds; only used in the error message, never waited for
 * @throws IOException if loading the new page fails
 */
public void handleRefresh(final Page page, final URL url, final int seconds) throws IOException {
    final WebWindow enclosingWindow = page.getEnclosingWindow();
    if (enclosingWindow == null) {
        return;
    }

    final WebClient webClient = enclosingWindow.getWebClient();

    final String currentLocation = page.getUrl().toExternalForm();
    final String targetLocation = url.toExternalForm();
    if (currentLocation.equals(targetLocation)) {
        // Only look at the request method once the URLs are known to match.
        final HttpMethod method = page.getWebResponse().getWebRequest().getHttpMethod();
        if (method == HttpMethod.GET) {
            final StringBuilder reason = new StringBuilder();
            reason.append("Refresh to ").append(url);
            reason.append(" (").append(seconds).append("s) aborted by HtmlUnit: ");
            reason.append("Attempted to refresh a page using an ImmediateRefreshHandler ");
            reason.append("which could have caused an OutOfMemoryError ");
            reason.append("Please use WaitingRefreshHandler or ThreadedRefreshHandler instead.");
            throw new RuntimeException(reason.toString());
        }
    }

    webClient.getPage(enclosingWindow, new WebRequest(url));
}