/** * Adds a path to this ParentOMatic and marks it. This might result in changes in tree that * actually tries to "optimize" the markings, and it may result in tree where the currently added * and marked path is not marked, but it's some parent is. */ public void addAndMarkPath(final String path) { final Node<Payload> currentNode = addPath(path, false); // rule A: unmark children if any if (applyRuleA) { applyRecursively( currentNode, new Function<Node<Payload>, Node<Payload>>() { @Override public Node<Payload> apply(Node<Payload> input) { input.getPayload().setMarked(false); return input; } }); } currentNode.getPayload().setMarked(true); // reorganize if needed final Node<Payload> flippedNode = reorganizeForRecursion(currentNode); // optimize tree size if asked for if (keepMarkedNodesOnly) { optimizeTreeSize(flippedNode); } }
/**
 * Appends a textual rendering of the subtree rooted at {@code node} to {@code sb}, one node per
 * line. Each line is indented by {@code depth} spaces and shows the node label, its path in
 * parentheses and a trailing {@code *} when the node's payload is marked. Children follow their
 * parent, indented one level deeper.
 *
 * @param node the node to render, together with all of its descendants
 * @param depth indentation (number of spaces) used for {@code node}
 * @param sb the builder receiving the output
 */
protected void dump(final Node<Payload> node, final int depth, final StringBuilder sb) {
  sb.append(Strings.repeat(" ", depth))
      .append(node.getLabel())
      .append(" (")
      .append(node.getPath())
      .append(")");

  final boolean marked = node.getPayload().isMarked();
  sb.append(marked ? "*\n" : "\n");

  final int childDepth = depth + 1;
  for (Node<Payload> child : node.getChildren()) {
    dump(child, childDepth, sb);
  }
}
/**
 * Tells whether any ancestor of the given node is marked. The chain of parents is followed
 * upwards, starting with the direct parent, until a node whose {@link Payload#isMarked()}
 * returns {@code true} is found or there is no further parent.
 *
 * @param node the node whose ancestors are inspected; the node itself is not checked
 * @return {@code true} if some ancestor is marked, {@code false} if none is or the node has no
 *     parent
 */
protected boolean isParentMarked(final Node<Payload> node) {
  Node<Payload> ancestor = node.getParent();
  while (ancestor != null) {
    if (ancestor.getPayload().isMarked()) {
      return true;
    }
    ancestor = ancestor.getParent();
  }
  return false;
}
/**
 * Visits the subtree rooted at {@code fromNode} and invokes {@code modifier} on every node in it.
 * A node is handed to the modifier before its children are; the value returned by the modifier
 * is ignored.
 *
 * @param fromNode the node to start from; it is visited as well
 * @param modifier the function to invoke on each visited node
 */
public void applyRecursively(
    final Node<Payload> fromNode, final Function<Node<Payload>, Node<Payload>> modifier) {
  // the node itself goes first, its children are only looked up afterwards
  modifier.apply(fromNode);

  final List<Node<Payload>> descendants = fromNode.getChildren();
  for (Node<Payload> descendant : descendants) {
    applyRecursively(descendant, modifier);
  }
}
protected void diveIn( final ScrapeContext context, final Page page, final int currentDepth, final ParentOMatic parentOMatic, final Node<Payload> currentNode) throws IOException { // entry protection if (currentDepth >= context.getScrapeDepth()) { return; } // cancelation CancelableUtil.checkInterruption(); getLogger().debug("Processing page response from URL {}", page.getUrl()); final Elements elements = page.getDocument().getElementsByTag("a"); final List<String> pathElements = currentNode.getPathElements(); final String currentPath = currentNode.getPath(); for (Element element : elements) { if (isDeeperRepoLink(context, pathElements, element)) { if (element.text().startsWith(".")) { // skip hidden paths continue; } final Node<Payload> newSibling = parentOMatic.addPath(currentPath + "/" + element.text()); if (element.absUrl("href").endsWith("/")) { // "cut" recursion preemptively to save remote fetch (and then stop recursion due to // depth) final int siblingDepth = currentDepth + 1; if (siblingDepth < context.getScrapeDepth()) { maySleepBeforeSubsequentFetch(); final String newSiblingEncodedUrl = getRemoteUrlForRepositoryPath(context, newSibling.getPathElements()) + "/"; final Page siblingPage = Page.getPageFor(context, newSiblingEncodedUrl); if (siblingPage.getHttpResponse().getStatusLine().getStatusCode() == 200) { diveIn(context, siblingPage, siblingDepth, parentOMatic, newSibling); } else { // we do expect strictly 200 here throw new UnexpectedPageResponse( page.getUrl(), page.getHttpResponse().getStatusLine()); } } } } } }
/**
 * Checks the precondition of rule B for the given node: the node has a parent, that parent has at
 * least two children, and {@link Payload#isMarked()} is {@code true} for every one of them.
 *
 * @param node the node whose parent and siblings are inspected
 * @return {@code true} only if all of the conditions above hold
 */
protected boolean isParentAllChildMarkedForRuleB(final Node<Payload> node) {
  final Node<Payload> parent = node.getParent();
  if (parent == null) {
    return false;
  }

  final List<Node<Payload>> siblings = parent.getChildren();
  // a parent with fewer than two children never qualifies
  if (siblings.size() < 2) {
    return false;
  }

  for (Node<Payload> sibling : siblings) {
    final boolean marked = sibling.getPayload().isMarked();
    if (!marked) {
      return false;
    }
  }
  return true;
}
/**
 * Walks the tree from {@code ROOT} along the elements of the given path, creating a child with a
 * fresh {@link Payload} for every element that has no node yet, and returns the node the path
 * ends at.
 *
 * @param path the path to add; must not be {@code null}
 * @param optimize when {@code true}, {@code optimizeTreeSize} is invoked on the resulting node
 *     before it is returned
 * @return the node corresponding to the last element of the path ({@code ROOT} if the path has
 *     no elements)
 */
protected Node<Payload> addPath(final String path, final boolean optimize) {
  final List<String> pathElems = getPathElements(Preconditions.checkNotNull(path));

  Node<Payload> currentNode = ROOT;
  for (String pathElem : pathElems) {
    final Node<Payload> existing = currentNode.getChildByLabel(pathElem);
    // descend into the existing child, or create the missing one and descend into that
    currentNode = existing != null ? existing : currentNode.addChild(pathElem, new Payload());
  }

  if (optimize) {
    optimizeTreeSize(currentNode);
  }
  return currentNode;
}
/** * Reorganizes the tree by applying the rules to the tree from the changed node and returns a node * that was top most of the flipped ones. */ protected Node<Payload> reorganizeForRecursion(final Node<Payload> changedNode) { // rule a: if parent is marked already, do not mark the child if (applyRuleA && isParentMarked(changedNode)) { changedNode.getPayload().setMarked(false); return changedNode.getParent(); } // rule b: if this parent's all children are marked, mark parent, unmark children if (applyRuleB && isParentAllChildMarkedForRuleB(changedNode)) { changedNode.getParent().getPayload().setMarked(true); for (Node<Payload> child : changedNode.getParent().getChildren()) { child.getPayload().setMarked(false); } return changedNode.getParent(); } return changedNode; }
/** * Optimizes the tree by making the marked nodes as leafs, basically cutting all the branches that * are below marked node. */ protected void optimizeTreeSize(final Node<Payload> changedNode) { // simply "cut off" children if any for (Node<Payload> child : changedNode.getChildren()) { changedNode.removeChild(child); } }