Exemplo n.º 1
0
 /**
  * Computes a scheduling cost for a URI, starting from a base cost of 1 and adding fixed
  * penalties for traits that make the URI a better candidate for delaying or skipping.
  *
  * <p>Penalties currently applied:
  * <ul>
  *   <li>+1 when the URI carries a query string;</li>
  *   <li>+1 more when the flattened 'via' begins with the portion of the URI that precedes
  *       the {@code '?'}.</li>
  * </ul>
  *
  * @param curi CrawlURI to be assigned a cost
  * @return the computed cost: 1, 2 or 3 with the penalties implemented so far
  * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.modules.CrawlURI)
  */
  public int costOf(CrawlURI curi) {
    final UURI target = curi.getUURI();
    int penalty = 0;

    if (target.hasQuery()) {
      final String targetText = target.toString();
      final int queryStart = targetText.indexOf('?');
      final String via = curi.flattenVia();
      final String beforeQuery = targetText.substring(0, queryStart);

      // One point for having a query string at all; a second one when the via
      // starts with this URI's non-query portion.
      // NOTE(review): this is a prefix test, so a via whose path merely extends
      // the non-query portion also matches -- confirm that is intended.
      penalty = via.startsWith(beforeQuery) ? 2 : 1;

      // TODO: other potential query-related cost penalties:
      //  - more than X query-string attributes
      //  - calendarish terms
      //  - query-string over certain size
    }

    // TODO: other potential path-based penalties
    //  - new path is simply extension of via path
    //  - many path segments
    // TODO: other potential hops-based penalties
    //  - more than X hops
    //  - each speculative hop
    return 1 + penalty;
  }