/**
 * Records a rule decision as a single info-level line on {@code fileLogger}.
 *
 * <p>The line has the form
 * {@code <ruleNumber> <RuleSimpleClassName> <result> <uri>[ <extraInfo>]}, where the
 * trailing JSON object is only present when {@code logExtraInfo} is set. Nothing is
 * written when {@code fileLogger} is {@code null}.
 *
 * @param uri the URI the decision applies to
 * @param decisiveRule the rule that produced the decision; its simple class name is logged
 * @param decisiveRuleNumber the number of the decisive rule, logged first on the line
 * @param result the decision that was reached
 */
protected void decisionMade(
        CrawlURI uri, DecideRule decisiveRule, int decisiveRuleNumber, DecideResult result) {
    if (fileLogger == null) {
        // no log destination configured: nothing to record
        return;
    }

    JSONObject extraInfo = null;
    if (logExtraInfo) {
        CrawlHost crawlHost = getServerCache().getHostFor(uri.getUURI());
        // "-" stands in for the host name when the server cache has no entry
        String host = (crawlHost == null) ? "-" : crawlHost.fixUpName();

        extraInfo = new JSONObject();
        extraInfo.put("hopPath", uri.getPathFromSeed());
        extraInfo.put("via", uri.getVia());
        extraInfo.put("seed", uri.getSourceTag());
        extraInfo.put("host", host);
    }

    StringBuilder line = new StringBuilder();
    line.append(decisiveRuleNumber)
            .append(" ")
            .append(decisiveRule.getClass().getSimpleName())
            .append(" ")
            .append(result)
            .append(" ")
            .append(uri);
    if (extraInfo != null) {
        line.append(" ").append(extraInfo);
    }
    fileLogger.info(line.toString());
}
/** * Add constant penalties for certain features of URI (and its 'via') that make it more * delayable/skippable. * * @param curi CrawlURI to be assigned a cost * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.modules.CrawlURI) */ public int costOf(CrawlURI curi) { int cost = 1; UURI uuri = curi.getUURI(); if (uuri.hasQuery()) { // has query string cost++; int qIndex = uuri.toString().indexOf('?'); if (curi.flattenVia().startsWith(uuri.toString().substring(0, qIndex))) { // non-query-string portion of URI is same as previous cost++; } // TODO: other potential query-related cost penalties: // - more than X query-string attributes // - calendarish terms // - query-string over certain size } // TODO: other potential path-based penalties // - new path is simply extension of via path // - many path segments // TODO: other potential hops-based penalties // - more than X hops // - each speculative hop return cost; }
/** * Return a preferred String key for persisting the given CrawlURI's AList state. * * @param curi CrawlURI * @return String key */ public static String persistKeyFor(CrawlURI curi) { // use a case-sensitive SURT for uniqueness and sorting benefits return persistKeyFor(curi.getUURI().toString()); }