Esempio n. 1
0
  /**
   * Reports the outcome of a rule evaluation to the decision log.
   *
   * <p>Nothing happens when {@code fileLogger} is {@code null}. Otherwise a single message is
   * handed to {@code fileLogger.info} in the form
   * {@code <ruleNumber> <RuleSimpleClassName> <result> <uri>}; when {@code logExtraInfo} is set,
   * a space and a JSON object with the keys {@code hopPath}, {@code via}, {@code seed} and
   * {@code host} are appended.
   *
   * @param uri the URI the decision applies to
   * @param decisiveRule the rule whose verdict was taken; its simple class name is logged
   * @param decisiveRuleNumber number identifying the decisive rule, logged as-is
   * @param result the decision that was reached
   */
  protected void decisionMade(
      CrawlURI uri, DecideRule decisiveRule, int decisiveRuleNumber, DecideResult result) {
    if (fileLogger == null) {
      // No decision log configured: nothing to report.
      return;
    }

    JSONObject details = null;
    if (logExtraInfo) {
      // Fall back to "-" when the server cache has no host entry for this URI.
      CrawlHost knownHost = getServerCache().getHostFor(uri.getUURI());
      String hostName = (knownHost != null) ? knownHost.fixUpName() : "-";

      details = new JSONObject();
      details.put("hopPath", uri.getPathFromSeed());
      details.put("via", uri.getVia());
      details.put("seed", uri.getSourceTag());
      details.put("host", hostName);
    }

    StringBuilder line = new StringBuilder();
    line.append(decisiveRuleNumber)
        .append(' ')
        .append(decisiveRule.getClass().getSimpleName())
        .append(' ')
        .append(result)
        .append(' ')
        .append(uri);
    if (details != null) {
      line.append(' ').append(details);
    }
    fileLogger.info(line.toString());
  }
 /**
  * Assigns a cost to a URI by adding fixed penalties for traits of the URI (and of its 'via')
  * that make it a better candidate for delaying or skipping.
  *
  * <p>The cost starts at 1. A URI with a query string costs one more, and one more again when
  * the flattened via begins with the part of the URI that precedes the first {@code '?'}.
  *
  * @param curi CrawlURI to be assigned a cost
  * @return 1 for a URI without a query string, 2 for a URI with one, or 3 when in addition the
  *     via shares the URI's non-query prefix
  * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.modules.CrawlURI)
  */
 public int costOf(CrawlURI curi) {
   final int baseCost = 1;
   UURI target = curi.getUURI();
   if (!target.hasQuery()) {
     // TODO: other potential path-based penalties
     //  - new path is simply extension of via path
     //  - many path segments
     // TODO: other potential hops-based penalties
     //  - more than X hops
     //  - each speculative hop
     return baseCost;
   }

   // Query string present: first penalty.
   int total = baseCost + 1;

   String targetText = target.toString();
   int queryStart = targetText.indexOf('?');
   String flattenedVia = curi.flattenVia();
   String beforeQuery = targetText.substring(0, queryStart);
   if (flattenedVia.startsWith(beforeQuery)) {
     // Same non-query portion as the previous URI: second penalty.
     total += 1;
   }
   // TODO: other potential query-related cost penalties:
   //  - more than X query-string attributes
   //  - calendarish terms
   //  - query-string over certain size
   return total;
 }
Esempio n. 3
0
 /**
  * Returns the preferred String key under which the given CrawlURI's AList state is persisted.
  *
  * <p>The key is derived from the string form of the CrawlURI's UURI by delegating to the
  * String-argument overload of {@code persistKeyFor}.
  *
  * @param curi CrawlURI whose persistence key is wanted
  * @return String key
  */
 public static String persistKeyFor(CrawlURI curi) {
   // The String overload is expected to produce a case-sensitive SURT, chosen for uniqueness
   // and sorting benefits (that conversion is not visible in this method).
   String uriText = curi.getUURI().toString();
   return persistKeyFor(uriText);
 }