/**
 * Recursively dumps the tree rooted at the given node into the string builder:
 * one node per line, indented by depth, with the node's path in parentheses and
 * a trailing "*" for marked nodes.
 */
protected void dump(final Node<Payload> node, final int depth, final StringBuilder sb) {
  sb.append(Strings.repeat(" ", depth));
  sb.append(node.getLabel());
  sb.append(" (").append(node.getPath()).append(")");
  if (node.getPayload().isMarked()) {
    sb.append("*");
  }
  sb.append("\n");
  for (Node<Payload> child : node.getChildren()) {
    dump(child, depth + 1, sb);
  }
}
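// For illustration only (assumed tree shape, not part of the original source): given a
// root node labeled "org" with a single marked child "apache", dump(root, 0, sb) would
// append output along the lines of:
//
//   org (/org)
//    apache (/org/apache)*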
/**
 * Recursively walks the remote repository's HTML index pages, adding directory entries
 * found on the current page to the {@link ParentOMatic} tree and diving into them, up
 * to the configured scrape depth.
 */
protected void diveIn(final ScrapeContext context, final Page page, final int currentDepth,
    final ParentOMatic parentOMatic, final Node<Payload> currentNode)
    throws IOException
{
  // entry protection: never dive deeper than the configured scrape depth
  if (currentDepth >= context.getScrapeDepth()) {
    return;
  }
  // cancellation check
  CancelableUtil.checkInterruption();
  getLogger().debug("Processing page response from URL {}", page.getUrl());
  final Elements elements = page.getDocument().getElementsByTag("a");
  final List<String> pathElements = currentNode.getPathElements();
  final String currentPath = currentNode.getPath();
  for (Element element : elements) {
    if (isDeeperRepoLink(context, pathElements, element)) {
      if (element.text().startsWith(".")) {
        // skip hidden paths
        continue;
      }
      final Node<Payload> newSibling = parentOMatic.addPath(currentPath + "/" + element.text());
      if (element.absUrl("href").endsWith("/")) {
        // "cut" recursion preemptively to save a remote fetch (and then stop recursion due to depth)
        final int siblingDepth = currentDepth + 1;
        if (siblingDepth < context.getScrapeDepth()) {
          maySleepBeforeSubsequentFetch();
          final String newSiblingEncodedUrl =
              getRemoteUrlForRepositoryPath(context, newSibling.getPathElements()) + "/";
          final Page siblingPage = Page.getPageFor(context, newSiblingEncodedUrl);
          if (siblingPage.getHttpResponse().getStatusLine().getStatusCode() == 200) {
            diveIn(context, siblingPage, siblingDepth, parentOMatic, newSibling);
          }
          else {
            // we do expect strictly 200 here
            throw new UnexpectedPageResponse(siblingPage.getUrl(),
                siblingPage.getHttpResponse().getStatusLine());
          }
        }
      }
    }
  }
}
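// For illustration only (assumed entry point, not part of the original class): the
// recursion above would typically be started from the repository root page at depth 0,
// along the lines of:
//
//   final Page rootPage = Page.getPageFor(context,
//       getRemoteUrlForRepositoryPath(context, rootNode.getPathElements()) + "/");
//   diveIn(context, rootPage, 0, parentOMatic, rootNode);
//
// where rootNode is assumed to be the root node of the parentOMatic tree.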