@Override
public DiscoveryResult<MavenHostedRepository> discoverLocalContent(
    final MavenHostedRepository mavenRepository) throws IOException
{
  final DiscoveryResult<MavenHostedRepository> discoveryResult =
      new DiscoveryResult<MavenHostedRepository>(mavenRepository);
  final WalkerContext context = new DefaultWalkerContext(
      mavenRepository, new ResourceStoreRequest("/"), new DefaultStoreWalkerFilter(), true);
  final PrefixCollectorProcessor prefixCollectorProcessor = new PrefixCollectorProcessor();
  context.getProcessors().add(prefixCollectorProcessor);
  try {
    walker.walk(context);
    final ParentOMatic parentOMatic = prefixCollectorProcessor.getParentOMatic();
    if (parentOMatic.getRoot().isLeaf()) {
      // tree is basically empty, so return an empty prefix list too
      discoveryResult.recordSuccess(ID, "Repository crawled successfully (is empty)",
          new ArrayListPrefixSource(Collections.<String>emptyList()));
    }
    else {
      discoveryResult.recordSuccess(ID, "Repository crawled successfully",
          new ArrayListPrefixSource(getAllLeafPaths(parentOMatic, config.getLocalScrapeDepth())));
    }
  }
  catch (WalkerException e) {
    if (e.getWalkerContext().getStopCause() != null) {
      discoveryResult.recordError(ID, e.getWalkerContext().getStopCause());
    }
    else {
      discoveryResult.recordError(ID, e);
    }
  }
  return discoveryResult;
}
protected List<String> getAllLeafPaths(final ParentOMatic parentOMatic, final int maxDepth) {
  // cut the tree at the configured depth
  if (maxDepth != Integer.MAX_VALUE) {
    parentOMatic.cutNodesDeeperThan(maxDepth);
  }
  // collect the remaining leaves
  return parentOMatic.getAllLeafPaths();
}
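For illustration, a minimal sketch (not from the original source) of what the depth cut does to the resulting prefix list. It assumes, as the method above suggests, that cutNodesDeeperThan(n) prunes everything deeper than n, so the surviving depth-n nodes become the leaves:

final ParentOMatic parentOMatic = new ParentOMatic();
parentOMatic.addPath("/org/example/artifact/1.0");
parentOMatic.addPath("/org/example/artifact/2.0");
// uncut, getAllLeafPaths() would return both full paths
parentOMatic.cutNodesDeeperThan(2);
// after the cut only "/org/example" remains as a leaf, which keeps the
// published prefix list short
final List<String> prefixes = parentOMatic.getAllLeafPaths();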
@Override
public void processItem(final WalkerContext context, final StorageItem item) throws Exception {
  // check for cancellation
  CancelableUtil.checkInterruption();
  if (item instanceof StorageFileItem) {
    parentOMatic.addPath(item.getPath());
  }
}
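For context, a hedged sketch of the class around processItem(...). Only getParentOMatic() and processItem(...) are confirmed by the surrounding code; the AbstractWalkerProcessor base class and the field layout are assumptions about how such a walker processor is usually wired:

private static class PrefixCollectorProcessor extends AbstractWalkerProcessor {
  // collects the path of every file item the walker visits
  private final ParentOMatic parentOMatic = new ParentOMatic();

  public ParentOMatic getParentOMatic() {
    return parentOMatic;
  }

  @Override
  public void processItem(final WalkerContext context, final StorageItem item) throws Exception {
    // as shown above; repeated here only so the sketch is self-contained
    CancelableUtil.checkInterruption();
    if (item instanceof StorageFileItem) {
      parentOMatic.addPath(item.getPath());
    }
  }
}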
@Override
protected List<String> diveIn(final ScrapeContext context, final Page page) throws IOException {
  // we use the great and almighty ParentOMatic
  final ParentOMatic parentOMatic = new ParentOMatic();
  diveIn(context, page, 0, parentOMatic, parentOMatic.getRoot());
  // Special case: a scrape that yields 0 entries is treated as an error.
  // Is the remote repo empty? Then why are you proxying it? Or worse, did we
  // scrape some exotic index page and end up with 0 entries by mistake?
  if (parentOMatic.getRoot().isLeaf()) {
    context.stop("Remote recognized as " + getTargetedServer()
        + ", but scraped 0 entries. This is considered a failure.");
    return null;
  }
  return parentOMatic.getAllLeafPaths();
}
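A hedged sketch of how a caller might consume this contract (rootPage and publishPrefixes are hypothetical, not from the original code): a null return means the scrape context was stopped, otherwise the leaf paths become the prefix entries.

final List<String> entries = diveIn(context, rootPage); // rootPage: hypothetical starting Page
if (entries != null) {
  // one entry per leaf of the scraped directory tree, ready to publish
  publishPrefixes(entries); // hypothetical consumer
}
// a null return means the scrape was aborted via context.stop(...),
// and the context carries the stop reason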
/**
 * Simple "naive" case. Just adding a bunch of paths.
 *
 * @throws Exception
 */
@Test
public void exampleCase() throws Exception {
  stringBuilder = new StringBuilder();
  final ParentOMatic cn = new ParentOMatic();
  print("Example case");
  print("");
  cn.addAndMarkPath("/foo/bam/car2");
  cn.addAndMarkPath("/foo/baz");
  cn.addAndMarkPath("/foo/baz/foo");
  cn.addAndMarkPath("/foo/bar");
  cn.addAndMarkPath("/foo/bar/car1");
  cn.addAndMarkPath("/foo/bar/car3");
  print(cn.dump());
  print("");
  print("Maven MD recreate would run against paths:");
  printListPerLine(cn.getMarkedPaths());
  doAssert();
}
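The print, printListPerLine, and doAssert helpers are not shown in this excerpt; a plausible sketch, assuming they simply accumulate output in the stringBuilder field and that the assertion is a stand-in (the real test presumably compares against an expected baseline):

protected StringBuilder stringBuilder;

protected void print(final String line) {
  stringBuilder.append(line).append("\n");
}

protected void printListPerLine(final List<String> lines) {
  for (String line : lines) {
    print(line);
  }
}

protected void doAssert() {
  // hypothetical: stands in for the real baseline comparison
  assertThat(stringBuilder.toString(), not(isEmptyString()));
}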
protected void diveIn(final ScrapeContext context, final Page page, final int currentDepth,
                      final ParentOMatic parentOMatic, final Node<Payload> currentNode)
    throws IOException
{
  // entry protection: never dive deeper than the configured scrape depth
  if (currentDepth >= context.getScrapeDepth()) {
    return;
  }
  // check for cancellation
  CancelableUtil.checkInterruption();
  getLogger().debug("Processing page response from URL {}", page.getUrl());
  final Elements elements = page.getDocument().getElementsByTag("a");
  final List<String> pathElements = currentNode.getPathElements();
  final String currentPath = currentNode.getPath();
  for (Element element : elements) {
    if (isDeeperRepoLink(context, pathElements, element)) {
      if (element.text().startsWith(".")) {
        // skip hidden paths
        continue;
      }
      final Node<Payload> newSibling = parentOMatic.addPath(currentPath + "/" + element.text());
      if (element.absUrl("href").endsWith("/")) {
        // "cut" the recursion preemptively: skip the remote fetch for pages
        // that the depth check would reject anyway
        final int siblingDepth = currentDepth + 1;
        if (siblingDepth < context.getScrapeDepth()) {
          maySleepBeforeSubsequentFetch();
          final String newSiblingEncodedUrl =
              getRemoteUrlForRepositoryPath(context, newSibling.getPathElements()) + "/";
          final Page siblingPage = Page.getPageFor(context, newSiblingEncodedUrl);
          if (siblingPage.getHttpResponse().getStatusLine().getStatusCode() == 200) {
            diveIn(context, siblingPage, siblingDepth, parentOMatic, newSibling);
          }
          else {
            // we strictly expect 200 here
            throw new UnexpectedPageResponse(page.getUrl(), page.getHttpResponse().getStatusLine());
          }
        }
      }
    }
  }
}
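The isDeeperRepoLink(...) helper is called above but not shown. Its signature is confirmed by the call site; the body below is only an assumed sketch of such a filter (getRemoteRepositoryRootUrl() on ScrapeContext is likewise an assumption): accept only links that stay inside the scraped repository and point below the directory currently being processed.

protected boolean isDeeperRepoLink(final ScrapeContext context, final List<String> pathElements,
                                   final Element element)
{
  // assumption: the context knows the remote repository root URL
  final String repoRoot = context.getRemoteRepositoryRootUrl();
  final String linkUrl = element.absUrl("href");
  // links leaving the repository (or non-resolvable ones) are not repo content
  if (linkUrl.isEmpty() || !linkUrl.startsWith(repoRoot)) {
    return false;
  }
  // the link must be deeper than the current node, which filters out
  // "parent directory" style navigation links
  final String linkPath = linkUrl.substring(repoRoot.length());
  return linkPath.split("/").length > pathElements.size();
}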
/** * "Peter's case" as Peter did actually implement this and realized that snapshot removal (main * work) takes 3 minutes, and all the "bookkeeping" takes 20 minutes. This is kinda "generated" * repository and snapshot removals are equally spread out. * * @throws Exception */ @Test public void petersCase() throws Exception { stringBuilder = new StringBuilder(); final ParentOMatic cn = new ParentOMatic(); print("Peter's case"); print(""); cn.addAndMarkPath("/g1/a1/v1"); cn.addAndMarkPath("/g1/a1/v2"); cn.addAndMarkPath("/g1/a1/v3"); cn.addAndMarkPath("/g1/a2/v1"); cn.addAndMarkPath("/g1/a2/v2"); cn.addAndMarkPath("/g1/a2/v3"); cn.addAndMarkPath("/g1/a3/v1"); cn.addAndMarkPath("/g1/a3/v2"); cn.addAndMarkPath("/g1/a3/v3"); print(cn.dump()); print(""); print("Maven MD recreate would run against paths:"); printListPerLine(cn.getMarkedPaths()); doAssert(); }