Exemplo n.º 1
0
 protected void dump(final Node<Payload> node, final int depth, final StringBuilder sb) {
   sb.append(Strings.repeat("  ", depth));
   sb.append(node.getLabel());
   sb.append(" (").append(node.getPath()).append(")");
   if (node.getPayload().isMarked()) {
     sb.append("*");
   }
   sb.append("\n");
   for (Node<Payload> child : node.getChildren()) {
     dump(child, depth + 1, sb);
   }
 }
 protected void diveIn(
     final ScrapeContext context,
     final Page page,
     final int currentDepth,
     final ParentOMatic parentOMatic,
     final Node<Payload> currentNode)
     throws IOException {
   // entry protection
   if (currentDepth >= context.getScrapeDepth()) {
     return;
   }
   // cancelation
   CancelableUtil.checkInterruption();
   getLogger().debug("Processing page response from URL {}", page.getUrl());
   final Elements elements = page.getDocument().getElementsByTag("a");
   final List<String> pathElements = currentNode.getPathElements();
   final String currentPath = currentNode.getPath();
   for (Element element : elements) {
     if (isDeeperRepoLink(context, pathElements, element)) {
       if (element.text().startsWith(".")) {
         // skip hidden paths
         continue;
       }
       final Node<Payload> newSibling = parentOMatic.addPath(currentPath + "/" + element.text());
       if (element.absUrl("href").endsWith("/")) {
         // "cut" recursion preemptively to save remote fetch (and then stop recursion due to
         // depth)
         final int siblingDepth = currentDepth + 1;
         if (siblingDepth < context.getScrapeDepth()) {
           maySleepBeforeSubsequentFetch();
           final String newSiblingEncodedUrl =
               getRemoteUrlForRepositoryPath(context, newSibling.getPathElements()) + "/";
           final Page siblingPage = Page.getPageFor(context, newSiblingEncodedUrl);
           if (siblingPage.getHttpResponse().getStatusLine().getStatusCode() == 200) {
             diveIn(context, siblingPage, siblingDepth, parentOMatic, newSibling);
           } else {
             // we do expect strictly 200 here
             throw new UnexpectedPageResponse(
                 page.getUrl(), page.getHttpResponse().getStatusLine());
           }
         }
       }
     }
   }
 }