/**
  * Collects the paths of the 'http:' records listed by {@code headersInArcs()}.
  *
  * <p>Records whose URL starts with 'filedesc:' are skipped. Every other URL is parsed into a
  * UURI and a single leading '/' is stripped from its path; the path is kept only when the
  * record URL starts with 'http:'. The resulting set is logged at FINEST level.
  *
  * @return the collected paths, without their leading slash
  * @throws IOException declared by this method; presumably raised while reading the headers or
  *     parsing a record URL -- confirm against the helpers
  */
 protected Set<String> filesInArcs() throws IOException {
   HashSet<String> paths = new HashSet<String>();
   for (ArchiveRecordHeader header : headersInArcs()) {
     String url = header.getUrl();
     // The 'filedesc:' record describes the archive itself, not a fetched file.
     if (!url.startsWith("filedesc:")) {
       // Non-http URLs are still parsed here, before the scheme check below.
       String rawPath = UURIFactory.getInstance(url).getPath();
       String relativePath = rawPath.startsWith("/") ? rawPath.substring(1) : rawPath;
       if (url.startsWith("http:")) {
         paths.add(relativePath);
       }
     }
   }
   LOGGER.finest(paths.toString());
   return paths;
 }
 /**
  * Tests whether the given URI is the very URI that {@code getPrerequisite(curi)} names for it.
  *
  * <p>The value returned by {@code getPrerequisite} is resolved against the URI of {@code curi};
  * when the resolved form equals the string form of {@code curi}'s URI, the method returns true
  * and, if not already flagged, marks {@code curi} as a prerequisite. A URIException during
  * resolution is logged at SEVERE level and is not rethrown.
  *
  * @param curi the URI to examine; its prerequisite flag may be set as a side effect
  * @return true when {@code curi} resolves to its own prerequisite URI, false otherwise
  */
 public boolean isPrerequisite(final CrawlURI curi) {
   final String currentUri = curi.getUURI().toString();
   final String prerequisiteUri = getPrerequisite(curi);
   if (prerequisiteUri == null) {
     return false;
   }

   boolean matches = false;
   try {
     final UURI resolved = UURIFactory.getInstance(curi.getUURI(), prerequisiteUri);
     matches = resolved != null
         && currentUri != null
         && resolved.toString().equals(currentUri);
     // Flag the URI only once, so the log line is not repeated on later calls.
     if (matches && !curi.isPrerequisite()) {
       curi.setPrerequisite(true);
       logger.fine(curi + " is prereq.");
     }
   } catch (URIException e) {
     logger.severe("Failed to uuri: " + curi + ", " + e.getMessage());
   }
   return matches;
 }
  // Example #3
  // 0
  /**
   * Records the link carried by the given response header on {@code curi}.
   *
   * <p>The header value is stored through {@code createAndAddLocationLink}. If {@code curi}
   * holds an object under {@code URLInfo.ATTACH}, that object is additionally stored on
   * {@code curi} under the header value resolved against {@code curi}'s own URI. The extracted
   * link counter is incremented on success. A URIException is reported to the controller when
   * one exists, and logged at INFO level otherwise.
   *
   * @param curi the URI whose response carried the header
   * @param loc the header holding the link target; when null nothing is added
   */
  protected void addHeaderLink(CrawlURI curi, Header loc) {
    // Nothing to add without a header.
    if (loc == null) {
      return;
    }
    // TODO: consider possibility of multiple headers
    try {
      // 302 redirects store their link through a custom method.
      // (modified by wuliufu, since 2012-05-11)
      final String hopContext = loc.getName() + ":";
      curi.createAndAddLocationLink(curi.getVia(), loc.getValue(), hopContext, Link.REFER_HOP);

      if (curi.getObject(URLInfo.ATTACH) != null) {
        // Carry the attached object over, keyed by the resolved target URI.
        final UURI target = UURIFactory.getInstance(curi.getUURI(), loc.getValue());
        final String message =
            "ParseHTTP: curi = "
                + curi.getUURI().toString()
                + "&& "
                + loc.getName()
                + "="
                + target.toString();
        logger.debug(message);
        curi.putObject(target.toString(), curi.getObject(URLInfo.ATTACH));
      }

      numberOfLinksExtracted++;
    } catch (URIException e) {
      // There may not be a controller (e.g. if we're being run
      // by the extractor tool).
      if (getController() == null) {
        logger.info(curi + ", " + loc.getValue() + ": " + e.getMessage());
      } else {
        getController().logUriError(e, curi.getUURI(), loc.getValue());
      }
    }
  }
 /**
  * Assigns a cost to a URI: a base of 1 plus constant penalties for features of the URI (and
  * its 'via') that make it more delayable/skippable.
  *
  * <p>A URI with a query string costs 2. It costs 3 when, in addition, the flattened 'via'
  * starts with the part of the URI that precedes the first '?'.
  *
  * @param curi CrawlURI to be assigned a cost
  * @return the cost: 1, 2 or 3
  * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.modules.CrawlURI)
  */
 public int costOf(CrawlURI curi) {
   final UURI uuri = curi.getUURI();
   // TODO: other potential path-based penalties
   //  - new path is simply extension of via path
   //  - many path segments
   // TODO: other potential hops-based penalties
   //  - more than X hops
   //  - each speculative hop
   if (!uuri.hasQuery()) {
     return 1;
   }

   // TODO: other potential query-related cost penalties:
   //  - more than X query-string attributes
   //  - calendarish terms
   //  - query-string over certain size
   final String uriText = uuri.toString();
   final String beforeQuery = uriText.substring(0, uriText.indexOf('?'));
   // Same non-query-string portion as the previous URI: one further penalty.
   final boolean repeatsVia = curi.flattenVia().startsWith(beforeQuery);
   return repeatsVia ? 3 : 2;
 }
 /**
  * Tests whether the given URI matches the regular expression held in {@code regexp}.
  *
  * @param uri the URI to test; may be null
  * @return false when either {@code uri} or {@code regexp} is null, otherwise the result of
  *     {@code TextUtils.matches(regexp, uri.toString())}
  * @see org.archive.crawler.settings.refinements.Criteria#isWithinRefinementBounds
  */
 public boolean isWithinRefinementBounds(UURI uri) {
   // The previous guard tested 'uri == null' twice, leaving 'regexp' unchecked; a missing
   // expression is now treated as "no match" instead of being handed to the matcher as null.
   if (uri == null || regexp == null) {
     return false;
   }
   return TextUtils.matches(regexp, uri.toString());
 }