Beispiel #1
0
  /**
   * Adds the given path to this ParentOMatic and marks its node. Afterwards the tree may get
   * reorganized to "optimize" the markings, so the node of the freshly added path is not
   * necessarily the one that ends up marked: one of its parents may carry the mark instead.
   *
   * @param path the path to add and mark
   */
  public void addAndMarkPath(final String path) {
    final Node<Payload> addedNode = addPath(path, false);

    if (applyRuleA) {
      // rule A: clear the mark on the added node and on every node below it
      final Function<Node<Payload>, Node<Payload>> unmark =
          new Function<Node<Payload>, Node<Payload>>() {
            @Override
            public Node<Payload> apply(Node<Payload> node) {
              node.getPayload().setMarked(false);
              return node;
            }
          };
      applyRecursively(addedNode, unmark);
    }

    // the added node is always marked before the reorganization rules are evaluated
    addedNode.getPayload().setMarked(true);

    // reorganize if needed; the returned node is the one the tree optimization starts from
    final Node<Payload> topNode = reorganizeForRecursion(addedNode);

    // cut the branches below that node if only marked nodes should be kept
    if (keepMarkedNodesOnly) {
      optimizeTreeSize(topNode);
    }
  }
Beispiel #2
0
 /**
  * Appends a textual rendering of the given node and, recursively, of all its children to the
  * passed in builder. Each node occupies one line: two spaces of indentation per depth level, the
  * node label, the node path in parentheses and a trailing "*" when the node payload is marked.
  *
  * @param node the node to render
  * @param depth the indentation level used for {@code node}; children use {@code depth + 1}
  * @param sb the builder receiving the output
  */
 protected void dump(final Node<Payload> node, final int depth, final StringBuilder sb) {
   final String indent = Strings.repeat("  ", depth);
   sb.append(indent).append(node.getLabel());
   sb.append(" (");
   sb.append(node.getPath());
   sb.append(")");

   final boolean marked = node.getPayload().isMarked();
   if (marked) {
     sb.append("*");
   }
   sb.append("\n");

   final int childDepth = depth + 1;
   for (Node<Payload> childNode : node.getChildren()) {
     dump(childNode, childDepth, sb);
   }
 }
Beispiel #3
0
  /**
   * Returns true if any node on the parent chain of the passed in node is marked, that is, if
   * {@link Payload#isMarked()} returns {@code true} for the direct parent or for any parent above
   * it. Returns false when the chain ends (a node without parent is reached) before a marked node
   * is found, which includes the case where the passed in node has no parent at all.
   */
  protected boolean isParentMarked(final Node<Payload> node) {
    final Node<Payload> parent = node.getParent();

    // no parent: nothing above this node can be marked
    if (parent == null) {
      return false;
    }

    // either the direct parent is marked, or the answer is delegated one level up
    return parent.getPayload().isMarked() || isParentMarked(parent);
  }
Beispiel #4
0
  /**
   * Applies the given function to the given node first and then, recursively, to each of its
   * children and everything below them. The value returned by the function is ignored.
   *
   * @param fromNode the node to start from; it is the first one handed to the function
   * @param modifier the function invoked once for every visited node
   */
  public void applyRecursively(
      final Node<Payload> fromNode, final Function<Node<Payload>, Node<Payload>> modifier) {
    // a node is always visited before its children
    modifier.apply(fromNode);

    for (final Node<Payload> descendant : fromNode.getChildren()) {
      applyRecursively(descendant, modifier);
    }
  }
 /**
  * Processes the links ("a" elements) of the passed in page and recurses into linked
  * sub-directories until the scrape depth of the context is reached. Every link accepted by
  * {@code isDeeperRepoLink} whose text does not start with a dot is added to the passed in
  * {@link ParentOMatic} as a path below the current node. Links whose absolute URL ends with "/"
  * are fetched and processed recursively, as long as the next depth is still below the scrape
  * depth. A fetched sub-directory page that does not answer with HTTP 200 results in an
  * {@code UnexpectedPageResponse} describing that sub-directory page.
  *
  * @param context the scrape context; supplies the maximum scrape depth
  * @param page the already fetched page that belongs to {@code currentNode}
  * @param currentDepth the depth of {@code page}; nothing happens once it reaches the scrape depth
  * @param parentOMatic the tree that collects the discovered paths
  * @param currentNode the tree node corresponding to {@code page}
  * @throws IOException declared by the signature; presumably raised by the remote page fetch
  */
 protected void diveIn(
     final ScrapeContext context,
     final Page page,
     final int currentDepth,
     final ParentOMatic parentOMatic,
     final Node<Payload> currentNode)
     throws IOException {
   // entry protection
   if (currentDepth >= context.getScrapeDepth()) {
     return;
   }
   // cancelation
   CancelableUtil.checkInterruption();
   getLogger().debug("Processing page response from URL {}", page.getUrl());
   final Elements elements = page.getDocument().getElementsByTag("a");
   final List<String> pathElements = currentNode.getPathElements();
   final String currentPath = currentNode.getPath();
   for (Element element : elements) {
     if (isDeeperRepoLink(context, pathElements, element)) {
       if (element.text().startsWith(".")) {
         // skip hidden paths
         continue;
       }
       final Node<Payload> newSibling = parentOMatic.addPath(currentPath + "/" + element.text());
       if (element.absUrl("href").endsWith("/")) {
         // "cut" recursion preemptively to save remote fetch (and then stop recursion due to
         // depth)
         final int siblingDepth = currentDepth + 1;
         if (siblingDepth < context.getScrapeDepth()) {
           maySleepBeforeSubsequentFetch();
           final String newSiblingEncodedUrl =
               getRemoteUrlForRepositoryPath(context, newSibling.getPathElements()) + "/";
           final Page siblingPage = Page.getPageFor(context, newSiblingEncodedUrl);
           if (siblingPage.getHttpResponse().getStatusLine().getStatusCode() == 200) {
             diveIn(context, siblingPage, siblingDepth, parentOMatic, newSibling);
           } else {
             // we do expect strictly 200 here; report the page that actually failed, not the
             // parent page (which was already processed successfully)
             throw new UnexpectedPageResponse(
                 siblingPage.getUrl(), siblingPage.getHttpResponse().getStatusLine());
           }
         }
       }
     }
   }
 }
Beispiel #6
0
  /**
   * Tells whether rule B applies to the parent of the passed in node. Returns {@code true} only
   * if the node has a parent, that parent has at least two children, and every one of those
   * children is marked (their {@link Payload#isMarked()} returns {@code true}). In every other
   * case {@code false} is returned.
   */
  protected boolean isParentAllChildMarkedForRuleB(final Node<Payload> node) {
    final Node<Payload> parent = node.getParent();
    if (parent == null) {
      // the node has no parent, so there is nothing rule B could mark
      return false;
    }

    final List<Node<Payload>> siblings = parent.getChildren();
    if (siblings.size() < 2) {
      // a parent with fewer than two children never qualifies
      return false;
    }

    // the first unmarked child settles the answer
    for (Node<Payload> sibling : siblings) {
      if (!sibling.getPayload().isMarked()) {
        return false;
      }
    }
    return true;
  }
Beispiel #7
0
  /**
   * Adds the given path to the tree. Starting at ROOT, the path is walked element by element; an
   * existing child with the element's label is reused, a missing one is created with a fresh
   * {@code Payload}. No node is marked by this method.
   *
   * @param path the path to add; passed through {@code Preconditions.checkNotNull}
   * @param optimize if {@code true}, {@code optimizeTreeSize} is invoked on the resulting node
   * @return the node of the last path element, or ROOT when the path yields no elements
   */
  protected Node<Payload> addPath(final String path, final boolean optimize) {
    final List<String> pathElems = getPathElements(Preconditions.checkNotNull(path));

    Node<Payload> currentNode = ROOT;

    for (String pathElem : pathElems) {
      final Node<Payload> existing = currentNode.getChildByLabel(pathElem);

      // descend into the existing child, or create it when this label is not present yet
      if (existing == null) {
        currentNode = currentNode.addChild(pathElem, new Payload());
      } else {
        currentNode = existing;
      }
    }

    if (optimize) {
      optimizeTreeSize(currentNode);
    }

    return currentNode;
  }
Beispiel #8
0
  /**
   * Reorganizes the markings around the changed node by applying the enabled rules, and returns
   * the node the caller should continue with: the parent of the changed node when one of the rules
   * fired, the changed node itself otherwise.
   *
   * <p>Rule A (if enabled): when some node on the parent chain of the changed node is already
   * marked, the changed node is unmarked and its direct parent is returned.
   *
   * <p>Rule B (if enabled and rule A did not fire): when all children of the changed node's parent
   * are marked, the parent is marked, all of its children are unmarked and the parent is returned.
   */
  protected Node<Payload> reorganizeForRecursion(final Node<Payload> changedNode) {
    if (applyRuleA && isParentMarked(changedNode)) {
      // rule A: a node above is marked already, so the mark on this node is redundant
      changedNode.getPayload().setMarked(false);
      return changedNode.getParent();
    }

    if (!applyRuleB || !isParentAllChildMarkedForRuleB(changedNode)) {
      // neither rule fired, the changed node stays as it is
      return changedNode;
    }

    // rule B: move the mark from the children up to their common parent
    final Node<Payload> parent = changedNode.getParent();
    parent.getPayload().setMarked(true);
    for (Node<Payload> sibling : parent.getChildren()) {
      sibling.getPayload().setMarked(false);
    }
    return parent;
  }
Beispiel #9
0
 /**
  * Optimizes the tree by turning the passed in node into a leaf: every child of the node is
  * removed from it, which cuts off all the branches below the node.
  *
  * @param changedNode the node whose children are removed
  */
 protected void optimizeTreeSize(final Node<Payload> changedNode) {
   // Iterate over a snapshot: removeChild() modifies the node's children, and if getChildren()
   // hands out the live list, removing while iterating it would either fail with a
   // ConcurrentModificationException or silently skip children.
   final List<Node<Payload>> snapshot =
       new java.util.ArrayList<Node<Payload>>(changedNode.getChildren());

   // simply "cut off" children if any
   for (Node<Payload> child : snapshot) {
     changedNode.removeChild(child);
   }
 }