/**
 * Log the rule that decided this URI's fate to the decision log, optionally
 * attaching a JSON blob of extra context (hop path, via, seed, host).
 */
protected void decisionMade(
    CrawlURI uri, DecideRule decisiveRule, int decisiveRuleNumber, DecideResult result) {
  if (fileLogger == null) {
    return; // decision logging disabled
  }
  JSONObject extraInfo = null;
  if (logExtraInfo) {
    CrawlHost crawlHost = getServerCache().getHostFor(uri.getUURI());
    String host = (crawlHost != null) ? crawlHost.fixUpName() : "-";
    extraInfo = new JSONObject();
    extraInfo.put("hopPath", uri.getPathFromSeed());
    extraInfo.put("via", uri.getVia());
    extraInfo.put("seed", uri.getSourceTag());
    extraInfo.put("host", host);
  }
  StringBuilder line = new StringBuilder();
  line.append(decisiveRuleNumber)
      .append(' ')
      .append(decisiveRule.getClass().getSimpleName())
      .append(' ')
      .append(result)
      .append(' ')
      .append(uri);
  if (extraInfo != null) {
    line.append(' ').append(extraInfo);
  }
  fileLogger.info(line.toString());
}
/**
 * Build a small machine-readable status report for this thread: serial number,
 * current URI/processor (if any), WAITING/ACTIVE state with elapsed time, and step.
 *
 * @return ordered map of report fields
 */
public Map<String, Object> shortReportMap() {
  Map<String, Object> report = new LinkedHashMap<String, Object>();
  report.put("serialNumber", serialNumber);
  // Snapshot the field once so a concurrent null-ing cannot bite mid-method.
  CrawlURI curi = currentCuri;
  if (curi == null) {
    report.put("currentURI", null);
  } else {
    report.put("currentURI", curi.toString());
    report.put("currentProcessor", currentProcessorName);
    report.put("fetchAttempts", curi.getFetchAttempts());
  }
  long now = System.currentTimeMillis();
  long elapsed = 0;
  if (lastFinishTime > lastStartTime) {
    // Finished something after last start: idle, waiting for work.
    report.put("status", "WAITING");
    elapsed = now - lastFinishTime;
  } else if (lastStartTime > 0) {
    // Started something and not yet finished: actively working.
    report.put("status", "ACTIVE");
    elapsed = now - lastStartTime;
  }
  report.put("currentStatusElapsedMilliseconds", elapsed);
  report.put("currentStatusElapsedPretty", ArchiveUtils.formatMillisecondsToConventional(elapsed));
  report.put("step", step);
  return report;
}
/** A 2xx fetch status must not match a rule scoped to the 4xx range. */
public void testOutOfBounds() throws Exception {
  MatchesStatusCodeDecideRule rule = makeDecideRule(400, 499);
  CrawlURI curi = createTestUri("http://www.archive.org");
  curi.setFetchStatus(200);
  assertFalse(rule.evaluate(curi));
}
/**
 * Compiles and prints a human-readable status report for this thread: current
 * CrawlURI and processor (if any), WAITING/ACTIVE elapsed time, current step,
 * and thread details.
 *
 * @param name Report name (currently ignored -- only one kind of report exists).
 * @param pw Where to print.
 */
public void reportTo(String name, PrintWriter pw) {
  // name is ignored for now: only one kind of report
  pw.print("[");
  pw.println(getName());
  // Make a local copy of the currentCuri reference in case it gets
  // nulled while we're using it. We're doing this because
  // alternative is synchronizing and we don't want to do this --
  // it causes hang ups as controller waits on a lock for this thread,
  // something it gets easily enough on old threading model but something
  // it can wait interminably for on NPTL threading model.
  // See [ 994946 ] Pause/Terminate ignored on 2.6 kernel 1.5 JVM.
  CrawlURI c = currentCuri;
  if (c != null) {
    pw.print(" ");
    c.shortReportLineTo(pw);
    pw.print(" ");
    pw.print(c.getFetchAttempts());
    pw.print(" attempts");
    pw.println();
    pw.print(" ");
    pw.print("in processor: ");
    pw.print(currentProcessorName);
  } else {
    pw.print(" -no CrawlURI- ");
  }
  pw.println();
  long now = System.currentTimeMillis();
  long time = 0;
  pw.print(" ");
  if (lastFinishTime > lastStartTime) {
    // That means we finished something after we last started something
    // or in other words we are not working on anything.
    pw.print("WAITING for ");
    time = now - lastFinishTime;
  } else if (lastStartTime > 0) {
    // We are working on something
    pw.print("ACTIVE for ");
    time = now - lastStartTime;
  }
  pw.print(ArchiveUtils.formatMillisecondsToConventional(time));
  pw.println();
  // How long we have been on the current step.
  pw.print(" ");
  pw.print("step: ");
  pw.print(step);
  pw.print(" for ");
  pw.print(
      ArchiveUtils.formatMillisecondsToConventional(System.currentTimeMillis() - atStepSince));
  pw.println();
  reportThread(this, pw);
  pw.print("]");
  pw.println();
  pw.flush();
}
/**
 * Delete all CrawlURIs in the given queue whose string form matches the given
 * regular expression. Scans forward from {@code headKey} and stops as soon as
 * an entry's classKey differs from {@code queue} (i.e. the cursor has rolled
 * into the next queue's key range).
 *
 * @param match regex that a CrawlURI's toString() must fully match to be deleted
 * @param queue classKey naming the queue to scan
 * @param headKey key prefix marking the start of this queue's range
 * @return count of deleted items
 * @throws DatabaseException on underlying BDB failure
 */
public long deleteMatchingFromQueue(String match, String queue, DatabaseEntry headKey)
    throws DatabaseException {
  long deletedCount = 0;
  Pattern pattern = Pattern.compile(match);
  DatabaseEntry key = headKey;
  DatabaseEntry value = new DatabaseEntry();
  Cursor cursor = null;
  try {
    cursor = pendingUrisDB.openCursor(null, null);
    // Position at the first record at-or-after the queue's head key.
    OperationStatus result = cursor.getSearchKeyRange(headKey, value, null);
    while (result == OperationStatus.SUCCESS) {
      // Zero-length data entries are markers, not serialized CrawlURIs; skip them.
      if (value.getData().length > 0) {
        CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
        if (!curi.getClassKey().equals(queue)) {
          // rolled into next queue; finished with this queue
          break;
        }
        if (pattern.matcher(curi.toString()).matches()) {
          cursor.delete();
          deletedCount++;
        }
      }
      result = cursor.getNext(key, value, null);
    }
  } finally {
    if (cursor != null) {
      cursor.close();
    }
  }
  return deletedCount;
}
/**
 * Delete the given CrawlURI from the persistent store. Requires that the key
 * under which it was stored (its holder key) be available on the item.
 *
 * @param item CrawlURI to remove
 * @throws DatabaseException on underlying BDB failure
 */
public void delete(CrawlURI item) throws DatabaseException {
  DatabaseEntry holderKey = (DatabaseEntry) item.getHolderKey();
  OperationStatus status = pendingUrisDB.delete(null, holderKey);
  if (status != OperationStatus.SUCCESS) {
    // Log the raw key in hex to aid debugging of the missing record.
    String hexKey = new BigInteger(holderKey.getData()).toString(16);
    LOGGER.severe("expected item not present: " + item + "(" + hexKey + ")");
  }
}
/**
 * Calculate the insert key that places a CrawlURI at the desired spot in the
 * pending-URIs database. Layout: UTF-8 classKey bytes (grouping by queue,
 * usually host), a zero terminator byte, then 8 ordering bytes -- high byte is
 * the scheduling directive (so 'immediate'/'soon' sort first), next byte is
 * precedence (capped at 127 to stay within signed-byte comparison), and the
 * low 48 bits are the ordinal serial number (earlier-discovered sorts first).
 *
 * <p>NOTE: Dangers here are: (1) priorities or precedences over 2^7 (signed
 * byte comparison) (2) ordinals over 2^48.
 *
 * <p>Package access &amp; static for testing purposes.
 *
 * @param curi CrawlURI to key
 * @return a DatabaseEntry key for the CrawlURI
 */
static DatabaseEntry calculateInsertKey(CrawlURI curi) {
  byte[] classKeyBytes = curi.getClassKey().getBytes(Charsets.UTF_8);
  int len = classKeyBytes.length;
  byte[] keyData = new byte[len + 9];
  System.arraycopy(classKeyBytes, 0, keyData, 0, len);
  keyData[len] = 0; // terminator between classKey and ordering bytes
  // Assemble the 8 ordering bytes as one long: directive | precedence | ordinal.
  long orderingBits = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL;
  long precedence = Math.min(curi.getPrecedence(), 127);
  orderingBits |= (precedence & 0xFFL) << 48;
  orderingBits |= ((long) curi.getSchedulingDirective()) << 56;
  ArchiveUtils.longIntoByteArray(orderingBits, keyData, len + 1);
  return new DatabaseEntry(keyData);
}
/**
 * Get the next nearest item after the given key. Relies on external discipline --
 * we'll look at the queue's count of how many items it has -- to avoid asking for
 * something from a range where there are no associated items -- otherwise could
 * get first item of next 'queue' by mistake.
 *
 * <p>Returns null (after logging SEVERE) on lookup failure or on either known
 * deserialization pathology (wrong-ClassLoader class, missing wrapped object).
 *
 * <p>TODO: hold within a queue's range
 *
 * @param headKey Key prefix that demarks the beginning of the range in
 *     <code>pendingUrisDB</code> we're interested in.
 * @return CrawlURI, or null if none could be retrieved/deserialized.
 * @throws DatabaseException on underlying BDB failure
 */
public CrawlURI get(DatabaseEntry headKey) throws DatabaseException {
  DatabaseEntry result = new DatabaseEntry();
  // From Linda Lee of sleepycat:
  // "You want to check the status returned from Cursor.getSearchKeyRange
  // to make sure that you have OperationStatus.SUCCESS. In that case,
  // you have found a valid data record, and result.getData()
  // (called by internally by the binding code, in this case) will be
  // non-null. The other possible status return is
  // OperationStatus.NOTFOUND, in which case no data record matched
  // the criteria. "
  OperationStatus status = getNextNearestItem(headKey, result);
  CrawlURI retVal = null;
  if (status != OperationStatus.SUCCESS) {
    LOGGER.severe(
        "See '1219854 NPE je-2.0 "
            + "entryToObject...'. OperationStatus "
            + " was not SUCCESS: "
            + status
            + ", headKey "
            + BdbWorkQueue.getPrefixClassKey(headKey.getData()));
    return null;
  }
  try {
    retVal = (CrawlURI) crawlUriBinding.entryToObject(result);
  } catch (ClassCastException cce) {
    // Deserialized object was not a CrawlURI; re-deserialize just to report
    // which class/ClassLoader produced it (see [#HER-1283]).
    Object obj = crawlUriBinding.entryToObject(result);
    LOGGER.log(
        Level.SEVERE,
        "see [#HER-1283]: deserialized "
            + obj.getClass()
            + " has ClassLoader "
            + obj.getClass().getClassLoader().getClass(),
        cce);
    return null;
  } catch (RuntimeExceptionWrapper rw) {
    LOGGER.log(
        Level.SEVERE,
        "expected object missing in queue " + BdbWorkQueue.getPrefixClassKey(headKey.getData()),
        rw);
    return null;
  }
  // Remember the key so the item can later be deleted/updated in place.
  retVal.setHolderKey(headKey);
  return retVal;
}
/**
 * Write a one-line status summary for this thread: serial number, current
 * processor/URI (if any), WAITING/ACTIVE elapsed time, and current step.
 *
 * @param w PrintWriter to write to.
 */
public void shortReportLineTo(PrintWriter w) {
  w.print("#");
  w.print(this.serialNumber);
  // Make a local copy of the currentCuri reference in case it gets
  // nulled while we're using it. We're doing this because
  // alternative is synchronizing and we don't want to do this --
  // it causes hang ups as controller waits on a lock for this thread,
  // something it gets easily enough on old threading model but something
  // it can wait interminably for on NPTL threading model.
  // See [ 994946 ] Pause/Terminate ignored on 2.6 kernel 1.5 JVM.
  CrawlURI c = currentCuri;
  if (c != null) {
    w.print(" ");
    w.print(currentProcessorName);
    w.print(" ");
    w.print(c.toString());
    w.print(" (");
    w.print(c.getFetchAttempts());
    w.print(") ");
  } else {
    w.print(" [no CrawlURI] ");
  }
  long now = System.currentTimeMillis();
  long time = 0;
  if (lastFinishTime > lastStartTime) {
    // That means we finished something after we last started something
    // or in other words we are not working on anything.
    w.print("WAITING for ");
    time = now - lastFinishTime;
  } else if (lastStartTime > 0) {
    // We are working on something
    w.print("ACTIVE for ");
    time = now - lastStartTime;
  }
  w.print(ArchiveUtils.formatMillisecondsToConventional(time));
  w.print(" at ");
  w.print(step);
  w.print(" for ");
  w.print(ArchiveUtils.formatMillisecondsToConventional(now - atStepSince));
  w.print("\n");
  w.flush();
}
/** * Merge any data from the Map stored in the URI-history store into the current instance. * * <p>TODO: ensure compatibility with use of PersistLoadProcessor; suppress double-loading * * @param curi CrawlURI to receive prior state data */ protected void mergePrior(CrawlURI curi) { String key = PersistProcessor.persistKeyFor(curi); @SuppressWarnings({"rawtypes", "unchecked"}) Map<String, Map> prior = (Map<String, Map>) store.get(key); if (prior != null) { // merge in keys curi.getData().putAll(prior); } }
/**
 * Terminates a thread.
 *
 * <p>Calling this method will ensure that the current thread will stop processing
 * as soon as possible (note: this may be never). Meant to 'short circuit' hung
 * threads.
 *
 * <p>The current CrawlURI, if any, has its fetch status set to
 * S_PROCESSING_THREAD_KILLED and is immediately returned to the frontier.
 *
 * <p>As noted before, this does not ensure that the thread will stop running
 * (ever). But once invoked it will not try to communicate with other parts of the
 * crawler and will terminate as soon as control is established.
 */
protected void kill() {
  this.interrupt();
  synchronized (this) {
    CrawlURI curi = currentCuri;
    if (curi != null) {
      curi.setFetchStatus(S_PROCESSING_THREAD_KILLED);
      controller.getFrontier().finished(curi);
    }
  }
}
/**
 * Combine the base policy's precedence with any precalculated value merged in
 * from the URI-history store; returns 0 when no precalculated value exists.
 *
 * @see
 *     org.archive.crawler.frontier.precedence.BaseUriPrecedencePolicy#calculatePrecedence(org.archive.crawler.datamodel.CrawlURI)
 */
@Override
protected int calculatePrecedence(CrawlURI curi) {
  mergePrior(curi);
  Object preloaded = curi.getData().get(A_PRECALC_PRECEDENCE);
  if (preloaded == null) {
    return 0; // no preloaded precedence available
  }
  return super.calculatePrecedence(curi) + (Integer) preloaded;
}
/* (non-Javadoc) * @see org.archive.crawler.frontier.precedence.BaseUriPrecedencePolicy#uriScheduled(org.archive.crawler.datamodel.CrawlURI) */ @Override public void uriScheduled(CrawlURI curi) { int precedence = calculatePrecedence(curi); if (precedence == 0) { // fall back to configured default policy getDefaultUriPrecedencePolicy().uriScheduled(curi); return; } curi.setPrecedence(precedence); }
private void seriousError(Error err) { // try to prevent timeslicing until we have a chance to deal with OOM // Note that modern-day JVM priority indifference with native threads // may make this priority-jumbling pointless setPriority(DEFAULT_PRIORITY + 1); if (controller != null) { // hold all ToeThreads from proceeding to next processor controller.freeReserveMemory(); controller.requestCrawlPause(); if (controller.getFrontier().getFrontierJournal() != null) { controller.getFrontier().getFrontierJournal().seriousError(getName() + err.getMessage()); } } // OutOfMemory etc. String extraInfo = DevUtils.extraInfo(); System.err.println("<<<"); System.err.println(ArchiveUtils.getLog17Date()); System.err.println(err); System.err.println(extraInfo); err.printStackTrace(System.err); if (controller != null) { PrintWriter pw = new PrintWriter(System.err); controller.getToePool().compactReportTo(pw); pw.flush(); } System.err.println(">>>"); // DevUtils.sigquitSelf(); String context = "unknown"; if (currentCuri != null) { // update fetch-status, saving original as annotation currentCuri.getAnnotations().add("err=" + err.getClass().getName()); currentCuri.getAnnotations().add("os" + currentCuri.getFetchStatus()); currentCuri.setFetchStatus(S_SERIOUS_ERROR); context = currentCuri.shortReportLine() + " in " + currentProcessorName; } String message = "Serious error occured trying " + "to process '" + context + "'\n" + extraInfo; logger.log(Level.SEVERE, message.toString(), err); setPriority(DEFAULT_PRIORITY); }
/** * Handling for exceptions and errors that are possibly recoverable. * * @param e */ private void recoverableProblem(Throwable e) { Object previousStep = step; setStep(Step.HANDLING_RUNTIME_EXCEPTION, null); // e.printStackTrace(System.err); currentCuri.setFetchStatus(S_RUNTIME_EXCEPTION); // store exception temporarily for logging currentCuri.getAnnotations().add("err=" + e.getClass().getName()); currentCuri.getData().put(A_RUNTIME_EXCEPTION, e); String message = "Problem " + e + " occured when trying to process '" + currentCuri.toString() + "' at step " + previousStep + " in " + currentProcessorName + "\n"; logger.log(Level.SEVERE, message.toString(), e); }
/** * Add constant penalties for certain features of URI (and its 'via') that make it more * delayable/skippable. * * @param curi CrawlURI to be assigned a cost * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.modules.CrawlURI) */ public int costOf(CrawlURI curi) { int cost = 1; UURI uuri = curi.getUURI(); if (uuri.hasQuery()) { // has query string cost++; int qIndex = uuri.toString().indexOf('?'); if (curi.flattenVia().startsWith(uuri.toString().substring(0, qIndex))) { // non-query-string portion of URI is same as previous cost++; } // TODO: other potential query-related cost penalties: // - more than X query-string attributes // - calendarish terms // - query-string over certain size } // TODO: other potential path-based penalties // - new path is simply extension of via path // - many path segments // TODO: other potential hops-based penalties // - more than X hops // - each speculative hop return cost; }
/** * Put the given CrawlURI in at the appropriate place. * * @param curi * @throws DatabaseException */ public void put(CrawlURI curi, boolean overwriteIfPresent) throws DatabaseException { DatabaseEntry insertKey = (DatabaseEntry) curi.getHolderKey(); if (insertKey == null) { insertKey = calculateInsertKey(curi); curi.setHolderKey(insertKey); } DatabaseEntry value = new DatabaseEntry(); crawlUriBinding.objectToEntry(curi, value); // Output tally on avg. size if level is FINE or greater. if (LOGGER.isLoggable(Level.FINE)) { tallyAverageEntrySize(curi, value); } OperationStatus status; if (overwriteIfPresent) { status = pendingUrisDB.put(null, insertKey, value); } else { status = pendingUrisDB.putNoOverwrite(null, insertKey, value); } if (status != OperationStatus.SUCCESS) { LOGGER.log( Level.SEVERE, "URI enqueueing failed; " + status + " " + curi, new RuntimeException()); } }
/**
 * Return a preferred String key for persisting the given CrawlURI's AList state.
 *
 * <p>NOTE(review): the comment below mentions SURT, but this method passes the
 * plain UURI string -- presumably the String overload performs the SURT
 * conversion; confirm against that overload.
 *
 * @param curi CrawlURI
 * @return String key
 */
public static String persistKeyFor(CrawlURI curi) {
  // use a case-sensitive SURT for uniqueness and sorting benefits
  return persistKeyFor(curi.getUURI().toString());
}
/** * (non-Javadoc) * * @see java.lang.Thread#run() */ public void run() { String name = controller.getMetadata().getJobName(); logger.fine(getName() + " started for order '" + name + "'"); Recorder.setHttpRecorder(httpRecorder); try { while (true) { ArchiveUtils.continueCheck(); setStep(Step.ABOUT_TO_GET_URI, null); CrawlURI curi = controller.getFrontier().next(); synchronized (this) { ArchiveUtils.continueCheck(); setCurrentCuri(curi); currentCuri.setThreadNumber(this.serialNumber); lastStartTime = System.currentTimeMillis(); currentCuri.setRecorder(httpRecorder); } try { KeyedProperties.loadOverridesFrom(curi); controller.getFetchChain().process(curi, this); controller.getFrontier().beginDisposition(curi); controller.getDispositionChain().process(curi, this); } catch (RuntimeExceptionWrapper e) { // Workaround to get cause from BDB if (e.getCause() == null) { e.initCause(e.getCause()); } recoverableProblem(e); } catch (AssertionError ae) { // This risks leaving crawl in fatally inconsistent state, // but is often reasonable for per-Processor assertion problems recoverableProblem(ae); } catch (RuntimeException e) { recoverableProblem(e); } catch (InterruptedException e) { if (currentCuri != null) { recoverableProblem(e); Thread.interrupted(); // clear interrupt status } else { throw e; } } catch (StackOverflowError err) { recoverableProblem(err); } catch (Error err) { // OutOfMemory and any others seriousError(err); } finally { KeyedProperties.clearOverridesFrom(curi); } setStep(Step.ABOUT_TO_RETURN_URI, null); ArchiveUtils.continueCheck(); synchronized (this) { controller.getFrontier().finished(currentCuri); controller.getFrontier().endDisposition(); setCurrentCuri(null); } setStep(Step.FINISHING_PROCESS, null); lastFinishTime = System.currentTimeMillis(); if (shouldRetire) { break; // from while(true) } } } catch (InterruptedException e) { if (currentCuri != null) { logger.log( Level.SEVERE, "Interrupt leaving unfinished CrawlURI " + getName() + " - job may 
hang", e); } // thread interrupted, ok to end logger.log(Level.FINE, this.getName() + " ended with Interruption"); } catch (Exception e) { // everything else (including interruption) logger.log(Level.SEVERE, "Fatal exception in " + getName(), e); } catch (OutOfMemoryError err) { seriousError(err); } finally { controller.getFrontier().endDisposition(); } setCurrentCuri(null); // Do cleanup so that objects can be GC. this.httpRecorder.closeRecorders(); this.httpRecorder = null; logger.fine(getName() + " finished for order '" + name + "'"); setStep(Step.FINISHED, null); controller = null; }
/**
 * Scan forward from a marker position, collecting up to {@code maxMatches}
 * CrawlURIs whose string form matches {@code pattern}, and return them together
 * with a resume marker (null when the scan reached the end of the database).
 *
 * @param m marker or null to start with first entry
 * @param maxMatches maximum number of matching entries to return
 * @param pattern regex a CrawlURI's toString() must fully match to be included
 * @param verbose true to include classKey and short-report detail per entry
 * @return CompositeData holding the string list and the resume marker
 * @throws DatabaseException on underlying BDB failure
 */
public CompositeData getFrom(String m, int maxMatches, Pattern pattern, boolean verbose)
    throws DatabaseException {
  int matches = 0;
  int tries = 0;
  ArrayList<String> results = new ArrayList<String>(maxMatches);
  DatabaseEntry key;
  if (m == null) {
    key = getFirstKey();
  } else {
    // NOTE(review): getBytes() / new String(byte[]) below use the platform
    // default charset -- presumably markers round-trip on one JVM only; confirm.
    byte[] marker = m.getBytes(); // = FrontierJMXTypes.fromString(m);
    key = new DatabaseEntry(marker);
  }
  DatabaseEntry value = new DatabaseEntry();
  Cursor cursor = null;
  OperationStatus result = null;
  try {
    cursor = pendingUrisDB.openCursor(null, null);
    result = cursor.getSearchKey(key, value, null);
    while (matches < maxMatches && result == OperationStatus.SUCCESS) {
      // Zero-length data entries are markers, not serialized CrawlURIs; skip.
      if (value.getData().length > 0) {
        CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
        if (pattern.matcher(curi.toString()).matches()) {
          if (verbose) {
            results.add("[" + curi.getClassKey() + "] " + curi.shortReportLine());
          } else {
            results.add(curi.toString());
          }
          matches++;
        }
        tries++;
      }
      result = cursor.getNext(key, value, null);
    }
  } finally {
    if (cursor != null) {
      cursor.close();
    }
  }
  if (result != OperationStatus.SUCCESS) {
    // end of scan
    m = null;
  } else {
    // Cursor stopped mid-scan: return current key as the resume marker.
    m = new String(key.getData()); // = FrontierJMXTypes.toString(key.getData());
  }
  String[] arr = results.toArray(new String[results.size()]);
  CompositeData cd;
  try {
    cd =
        new CompositeDataSupport(
            /*FrontierJMXTypes.URI_LIST_DATA*/ null,
            new String[] {"list", "marker"},
            new Object[] {arr, m});
  } catch (OpenDataException e) {
    throw new IllegalStateException(e);
  }
  return cd;
}