protected Set<String> filesInArcs() throws IOException { List<ArchiveRecordHeader> headers = headersInArcs(); HashSet<String> result = new HashSet<String>(); for (ArchiveRecordHeader arh : headers) { // ignore 'filedesc:' record if (arh.getUrl().startsWith("filedesc:")) { continue; } UURI uuri = UURIFactory.getInstance(arh.getUrl()); String path = uuri.getPath(); if (path.startsWith("/")) { path = path.substring(1); } if (arh.getUrl().startsWith("http:")) { result.add(path); } } LOGGER.finest(result.toString()); return result; }
/**
 * Reports whether the given URI is itself the prerequisite URI returned by
 * {@code getPrerequisite(curi)}.
 *
 * <p>The prerequisite string is resolved against {@code curi}'s own URI and the
 * result is compared, as a string, with {@code curi}'s URI. On a match, {@code curi}
 * is flagged as a prerequisite if it was not flagged already.
 *
 * @param curi the URI to examine
 * @return {@code true} if the resolved prerequisite equals {@code curi}'s URI;
 *         {@code false} if there is no prerequisite, no match, or resolution fails
 */
public boolean isPrerequisite(final CrawlURI curi) {
    final String ownUri = curi.getUURI().toString();
    final String prerequisite = getPrerequisite(curi);
    if (prerequisite == null) {
        return false;
    }

    boolean matches = false;
    try {
        final UURI resolved = UURIFactory.getInstance(curi.getUURI(), prerequisite);
        matches = resolved != null
                && ownUri != null
                && resolved.toString().equals(ownUri);
        if (matches && !curi.isPrerequisite()) {
            curi.setPrerequisite(true);
            logger.fine(curi + " is prereq.");
        }
    } catch (URIException e) {
        // Resolution failure is logged and treated as "no match so far".
        logger.severe("Failed to uuri: " + curi + ", " + e.getMessage());
    }
    return matches;
}
protected void addHeaderLink(CrawlURI curi, Header loc) { if (loc == null) { // If null, return without adding anything. return; } // TODO: consider possibility of multiple headers try { /** * 302重定向使用自定义的方法存储link * * @modify: wuliufu * @since : 2012-05-11 */ curi.createAndAddLocationLink( curi.getVia(), loc.getValue(), loc.getName() + ":", Link.REFER_HOP); if (curi.getObject(URLInfo.ATTACH) != null) { UURI outUURI = UURIFactory.getInstance(curi.getUURI(), loc.getValue()); logger.debug( "ParseHTTP: curi = " + curi.getUURI().toString() + "&& " + loc.getName() + "=" + outUURI.toString()); curi.putObject(outUURI.toString(), curi.getObject(URLInfo.ATTACH)); } numberOfLinksExtracted++; } catch (URIException e) { // There may not be a controller (e.g. If we're being run // by the extractor tool). if (getController() != null) { getController().logUriError(e, curi.getUURI(), loc.getValue()); } else { logger.info(curi + ", " + loc.getValue() + ": " + e.getMessage()); } } }
/** * Add constant penalties for certain features of URI (and its 'via') that make it more * delayable/skippable. * * @param curi CrawlURI to be assigned a cost * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.modules.CrawlURI) */ public int costOf(CrawlURI curi) { int cost = 1; UURI uuri = curi.getUURI(); if (uuri.hasQuery()) { // has query string cost++; int qIndex = uuri.toString().indexOf('?'); if (curi.flattenVia().startsWith(uuri.toString().substring(0, qIndex))) { // non-query-string portion of URI is same as previous cost++; } // TODO: other potential query-related cost penalties: // - more than X query-string attributes // - calendarish terms // - query-string over certain size } // TODO: other potential path-based penalties // - new path is simply extension of via path // - many path segments // TODO: other potential hops-based penalties // - more than X hops // - each speculative hop return cost; }
/**
 * Tests whether the given URI matches this criteria's regular expression.
 *
 * @param uri the URI to test; may be {@code null}
 * @return the result of {@code TextUtils.matches(regexp, uri.toString())}, or
 *         {@code false} when either {@code uri} or {@code regexp} is {@code null}
 * @see org.archive.crawler.settings.refinements.Criteria
 */
public boolean isWithinRefinementBounds(UURI uri) {
    // The previous condition tested uri for null twice; the second test was
    // presumably meant for regexp, so an unset expression now yields false
    // instead of being handed to the matcher.
    if (uri == null || regexp == null) {
        return false;
    }
    return TextUtils.matches(regexp, uri.toString());
}