/**
   * Writes one line describing a decide-rule outcome to the file logger, when one is configured.
   *
   * <p>The line holds the rule number, the decisive rule's simple class name, the result and the
   * URI, separated by single spaces. When {@code logExtraInfo} is set, a JSON object carrying
   * the hop path, via, seed and host is appended after one more space.
   *
   * @param uri the CrawlURI the decision applies to
   * @param decisiveRule the rule that determined the outcome
   * @param decisiveRuleNumber number of the decisive rule; written first on the line
   * @param result the decision that was reached
   */
  protected void decisionMade(
      CrawlURI uri, DecideRule decisiveRule, int decisiveRuleNumber, DecideResult result) {
    if (fileLogger == null) {
      // nowhere to write to
      return;
    }

    JSONObject details = null;
    if (logExtraInfo) {
      CrawlHost hostRecord = getServerCache().getHostFor(uri.getUURI());
      // "-" stands in when the server cache has no host for this URI
      String hostName = (hostRecord != null) ? hostRecord.fixUpName() : "-";

      details = new JSONObject();
      details.put("hopPath", uri.getPathFromSeed());
      details.put("via", uri.getVia());
      details.put("seed", uri.getSourceTag());
      details.put("host", hostName);
    }

    StringBuilder line = new StringBuilder();
    line.append(decisiveRuleNumber);
    line.append(" ");
    line.append(decisiveRule.getClass().getSimpleName());
    line.append(" ");
    line.append(result);
    line.append(" ");
    line.append(uri);
    if (details != null) {
      line.append(" ").append(details);
    }
    fileLogger.info(line.toString());
  }
  /**
   * Builds an insertion-ordered map summarizing this thread's current state: serial number,
   * current URI (with processor and fetch attempts when there is one), WAITING/ACTIVE status,
   * the time spent in that status, and the current step.
   *
   * @return the report data keyed by field name
   */
  public Map<String, Object> shortReportMap() {
    Map<String, Object> report = new LinkedHashMap<String, Object>();
    report.put("serialNumber", serialNumber);

    // work from a single read of the field so the null check and the uses agree
    CrawlURI snapshot = currentCuri;
    if (snapshot == null) {
      report.put("currentURI", null);
    } else {
      report.put("currentURI", snapshot.toString());
      report.put("currentProcessor", currentProcessorName);
      report.put("fetchAttempts", snapshot.getFetchAttempts());
    }

    long now = System.currentTimeMillis();
    long elapsed;
    if (lastFinishTime > lastStartTime) {
      // last event was a finish: nothing is being worked on
      report.put("status", "WAITING");
      elapsed = now - lastFinishTime;
    } else if (lastStartTime > 0) {
      // something was started and has not finished yet
      report.put("status", "ACTIVE");
      elapsed = now - lastStartTime;
    } else {
      // neither condition holds: no status entry, zero elapsed time
      elapsed = 0;
    }
    report.put("currentStatusElapsedMilliseconds", elapsed);
    report.put("currentStatusElapsedPretty", ArchiveUtils.formatMillisecondsToConventional(elapsed));
    report.put("step", step);
    return report;
  }
  /**
   * A URI whose fetch status is 200 must not be matched by a rule built with the bounds 400 and
   * 499.
   */
  public void testOutOfBounds() throws Exception {
    MatchesStatusCodeDecideRule clientErrorRule = makeDecideRule(400, 499);

    CrawlURI okResponse = createTestUri("http://www.archive.org");
    okResponse.setFetchStatus(200);

    boolean matched = clientErrorRule.evaluate(okResponse);
    assertFalse(matched);
  }
  /**
   * Writes a bracketed, multi-line status report for this thread to the given writer.
   *
   * <p>The report holds the thread name, the current CrawlURI (or a placeholder when there is
   * none) with its fetch-attempt count and processor name, how long the thread has been WAITING
   * or ACTIVE, the current step with its duration, and finally the output of {@code
   * reportThread}. The writer is flushed before returning.
   *
   * @param name Report name; ignored for now because there is only one kind of report.
   * @param pw Where to print.
   */
  public void reportTo(String name, PrintWriter pw) {
    pw.print("[");
    pw.println(getName());

    // Work from a snapshot of the currentCuri reference, since the field
    // may be nulled while the report is being written.  Synchronizing is
    // deliberately avoided: it causes hang ups as the controller waits on
    // a lock for this thread, which it can wait for interminably on the
    // NPTL threading model.
    // See [ 994946 ] Pause/Terminate ignored on 2.6 kernel 1.5 JVM.
    CrawlURI snapshot = currentCuri;
    if (snapshot == null) {
      pw.print(" -no CrawlURI- ");
    } else {
      pw.print(" ");
      snapshot.shortReportLineTo(pw);
      pw.print("    ");
      pw.print(snapshot.getFetchAttempts());
      pw.print(" attempts");
      pw.println();
      pw.print("    ");
      pw.print("in processor: ");
      pw.print(currentProcessorName);
    }
    pw.println();

    long now = System.currentTimeMillis();
    long elapsed = 0;

    pw.print("    ");
    if (lastFinishTime > lastStartTime) {
      // The most recent event was a finish, so nothing is in progress.
      pw.print("WAITING for ");
      elapsed = now - lastFinishTime;
    } else if (lastStartTime > 0) {
      // Something was started and has not finished.
      pw.print("ACTIVE for ");
      elapsed = now - lastStartTime;
    }
    pw.print(ArchiveUtils.formatMillisecondsToConventional(elapsed));
    pw.println();

    pw.print("    ");
    pw.print("step: ");
    pw.print(step);
    pw.print(" for ");
    // the clock is read again here rather than reusing 'now'
    long inStep = System.currentTimeMillis() - atStepSince;
    pw.print(ArchiveUtils.formatMillisecondsToConventional(inStep));
    pw.println();

    reportThread(this, pw);
    pw.print("]");
    pw.println();

    pw.flush();
  }
  /**
   * Delete all CrawlURIs in one queue whose string form matches the given regular expression.
   *
   * <p>Opens a cursor on <code>pendingUrisDB</code>, positions it with <code>
   * getSearchKeyRange(headKey, ...)</code> and walks forward entry by entry. The walk ends when
   * the cursor stops returning SUCCESS or when a deserialized CrawlURI's class key differs from
   * <code>queue</code>. Entries whose value data is empty are skipped without being deserialized.
   * The cursor is always closed before returning.
   *
   * @param match regular expression; an item is deleted only when its whole <code>toString()
   *     </code> matches (<code>Matcher.matches()</code> is used)
   * @param queue class key of the queue to delete from
   * @param headKey key at which the scan is positioned
   * @return count of deleted items
   * @throws DatabaseException
   */
  public long deleteMatchingFromQueue(String match, String queue, DatabaseEntry headKey)
      throws DatabaseException {
    long deletedCount = 0;
    Pattern pattern = Pattern.compile(match);
    // NOTE(review): key is the same object as headKey, so the cursor calls below presumably
    // write each visited key into the caller's entry -- confirm callers do not reuse headKey.
    DatabaseEntry key = headKey;
    DatabaseEntry value = new DatabaseEntry();
    Cursor cursor = null;
    try {
      cursor = pendingUrisDB.openCursor(null, null);
      OperationStatus result = cursor.getSearchKeyRange(headKey, value, null);

      while (result == OperationStatus.SUCCESS) {
        // empty values are stepped over rather than deserialized
        if (value.getData().length > 0) {
          CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
          if (!curi.getClassKey().equals(queue)) {
            // rolled into next queue; finished with this queue
            break;
          }
          if (pattern.matcher(curi.toString()).matches()) {
            // removes the entry the cursor is currently positioned on
            cursor.delete();
            deletedCount++;
          }
        }
        result = cursor.getNext(key, value, null);
      }
    } finally {
      if (cursor != null) {
        cursor.close();
      }
    }

    return deletedCount;
  }
 /**
  * Removes the given CrawlURI from <code>pendingUrisDB</code>, using the holder key recorded on
  * the item as the database key. A delete that does not report SUCCESS is logged at SEVERE,
  * together with the key rendered in hexadecimal; it is not thrown.
  *
  * @param item CrawlURI to remove; its holder key must be the DatabaseEntry it was stored under
  * @throws DatabaseException
  */
 public void delete(CrawlURI item) throws DatabaseException {
   DatabaseEntry storedKey = (DatabaseEntry) item.getHolderKey();
   OperationStatus outcome = pendingUrisDB.delete(null, storedKey);
   if (outcome == OperationStatus.SUCCESS) {
     return;
   }

   StringBuilder complaint = new StringBuilder("expected item not present: ");
   complaint.append(item);
   complaint.append("(");
   // key bytes shown as one big hexadecimal number
   complaint.append(new BigInteger(((DatabaseEntry) item.getHolderKey()).getData()).toString(16));
   complaint.append(")");
   LOGGER.severe(complaint.toString());
 }
 /**
  * Builds the database key that positions a CrawlURI within its queue.
  *
  * <p>Layout: the UTF-8 bytes of the class key, one zero byte, then an 8-byte value written by
  * <code>ArchiveUtils.longIntoByteArray</code>. Within that value the scheduling directive is
  * shifted into the top byte, the precedence (capped at 127) into the next byte, and the low 48
  * bits hold the ordinal.
  *
  * <p>NOTE: Dangers here are: (1) priorities or precedences over 2^7 (signed byte comparison) (2)
  * ordinals over 2^48
  *
  * <p>Package access and static for testing purposes.
  *
  * @param curi CrawlURI supplying class key, ordinal, scheduling directive and precedence
  * @return a DatabaseEntry key for the CrawlURI
  */
 static DatabaseEntry calculateInsertKey(CrawlURI curi) {
   byte[] prefix = curi.getClassKey().getBytes(Charsets.UTF_8);
   int prefixLength = prefix.length;

   // class-key bytes + 1 terminator byte + 8 ordering bytes
   byte[] keyBytes = new byte[prefixLength + 9];
   System.arraycopy(prefix, 0, keyBytes, 0, prefixLength);
   keyBytes[prefixLength] = 0;

   long ordinalBits = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL;
   long directiveBits = (long) curi.getSchedulingDirective() << 56;
   long cappedPrecedence = Math.min(curi.getPrecedence(), 127);
   long precedenceBits = (cappedPrecedence & 0xFFL) << 48;

   long ordering = precedenceBits | (directiveBits | ordinalBits);
   ArchiveUtils.longIntoByteArray(ordering, keyBytes, prefixLength + 1);
   return new DatabaseEntry(keyBytes);
 }
  /**
   * Get the next nearest item after the given key. Relies on external discipline -- we'll look at
   * the queues count of how many items it has -- to avoid asking for something from a range where
   * there are no associated items -- otherwise could get first item of next 'queue' by mistake.
   *
   * <p>Every failure path (lookup status other than SUCCESS, a deserialized object that is not a
   * CrawlURI, or a RuntimeExceptionWrapper from the binding) is logged at SEVERE and reported to
   * the caller as a null return.
   *
   * <p>TODO: hold within a queue's range
   *
   * @param headKey Key prefix that demarks the beginning of the range in <code>pendingUrisDB</code>
   *     we're interested in. It is also recorded on the returned item as its holder key.
   * @return the CrawlURI found, or null if any of the failures above occurred
   * @throws DatabaseException
   */
  public CrawlURI get(DatabaseEntry headKey) throws DatabaseException {
    DatabaseEntry result = new DatabaseEntry();

    // From Linda Lee of sleepycat:
    // "You want to check the status returned from Cursor.getSearchKeyRange
    // to make sure that you have OperationStatus.SUCCESS. In that case,
    // you have found a valid data record, and result.getData()
    // (called by internally by the binding code, in this case) will be
    // non-null. The other possible status return is
    // OperationStatus.NOTFOUND, in which case no data record matched
    // the criteria. "
    OperationStatus status = getNextNearestItem(headKey, result);
    if (status != OperationStatus.SUCCESS) {
      LOGGER.severe(
          "See '1219854 NPE je-2.0 "
              + "entryToObject...'. OperationStatus "
              + " was not SUCCESS: "
              + status
              + ", headKey "
              + BdbWorkQueue.getPrefixClassKey(headKey.getData()));
      return null;
    }

    CrawlURI retVal;
    try {
      retVal = (CrawlURI) crawlUriBinding.entryToObject(result);
    } catch (ClassCastException cce) {
      // Deserialize again, untyped, purely to report what class actually came back.
      Object obj = crawlUriBinding.entryToObject(result);
      // getClassLoader() returns null for classes loaded by the bootstrap loader; guard it
      // so this diagnostic cannot itself fail with a NullPointerException and hide the
      // ClassCastException being reported.
      ClassLoader loader = obj.getClass().getClassLoader();
      String loaderDescription =
          (loader == null) ? "null (bootstrap)" : String.valueOf(loader.getClass());
      LOGGER.log(
          Level.SEVERE,
          "see [#HER-1283]: deserialized "
              + obj.getClass()
              + " has ClassLoader "
              + loaderDescription,
          cce);
      return null;
    } catch (RuntimeExceptionWrapper rw) {
      LOGGER.log(
          Level.SEVERE,
          "expected object missing in queue " + BdbWorkQueue.getPrefixClassKey(headKey.getData()),
          rw);
      return null;
    }
    // NOTE(review): presumably getNextNearestItem has updated headKey to the key of the record
    // it found, making it the right key to delete this item by later -- confirm.
    retVal.setHolderKey(headKey);
    return retVal;
  }
  /**
   * Writes a one-line summary of this thread to the given writer and flushes it: serial number,
   * current processor and URI with fetch attempts (or a placeholder), time spent WAITING or
   * ACTIVE, and the current step with its duration.
   *
   * @param w PrintWriter to write to.
   */
  public void shortReportLineTo(PrintWriter w) {
    w.print("#");
    w.print(this.serialNumber);

    // Work from a snapshot of the currentCuri reference, since the field
    // may be nulled while the line is being written.  Synchronizing is
    // deliberately avoided: it causes hang ups as the controller waits on
    // a lock for this thread, which it can wait for interminably on the
    // NPTL threading model.
    // See [ 994946 ] Pause/Terminate ignored on 2.6 kernel 1.5 JVM.
    CrawlURI snapshot = currentCuri;
    if (snapshot == null) {
      w.print(" [no CrawlURI] ");
    } else {
      w.print(" ");
      w.print(currentProcessorName);
      w.print(" ");
      w.print(snapshot.toString());
      w.print(" (");
      w.print(snapshot.getFetchAttempts());
      w.print(") ");
    }

    long now = System.currentTimeMillis();
    long elapsed = 0;

    if (lastFinishTime > lastStartTime) {
      // The most recent event was a finish, so nothing is in progress.
      w.print("WAITING for ");
      elapsed = now - lastFinishTime;
    } else if (lastStartTime > 0) {
      // Something was started and has not finished.
      w.print("ACTIVE for ");
      elapsed = now - lastStartTime;
    }
    w.print(ArchiveUtils.formatMillisecondsToConventional(elapsed));

    long inStep = now - atStepSince;
    w.print(" at ");
    w.print(step);
    w.print(" for ");
    w.print(ArchiveUtils.formatMillisecondsToConventional(inStep));
    w.print("\n");
    w.flush();
  }
 /**
  * Copies every entry of the Map held in the URI-history store for this URI, if there is one,
  * into the CrawlURI's data map. The store is looked up under the key given by <code>
  * PersistProcessor.persistKeyFor(curi)</code>.
  *
  * <p>TODO: ensure compatibility with use of PersistLoadProcessor; suppress double-loading
  *
  * @param curi CrawlURI to receive prior state data
  */
 protected void mergePrior(CrawlURI curi) {
   String storeKey = PersistProcessor.persistKeyFor(curi);
   @SuppressWarnings({"rawtypes", "unchecked"})
   Map<String, Map> storedState = (Map<String, Map>) store.get(storeKey);
   if (storedState == null) {
     // no history recorded for this URI
     return;
   }
   curi.getData().putAll(storedState);
 }
 /**
  * Terminates a thread.
  *
  * <p>Calling this method will ensure that the current thread will stop processing as soon as
  * possible (note: this may be never). Meant to 'short circuit' hung threads.
  *
  * <p>Current crawl uri will have its fetch status set accordingly and will be immediately
  * returned to the frontier.
  *
  * <p>As noted before, this does not ensure that the thread will stop running (ever). But once
  * evoked it will not try and communicate with other parts of crawler and will terminate as soon
  * as control is established.
  */
 protected void kill() {
   // Interrupt first, then deal with whatever URI the thread was holding.
   this.interrupt();
   // The check and the hand-back happen under this thread's own monitor, presumably so they
   // cannot interleave with other code that synchronizes on this thread -- confirm.
   synchronized (this) {
     if (currentCuri != null) {
       // mark the in-flight URI as killed and return it to the frontier
       currentCuri.setFetchStatus(S_PROCESSING_THREAD_KILLED);
       controller.getFrontier().finished(currentCuri);
     }
   }
 }
 /*
  * Merges prior state into the URI, then adds the precalculated precedence found in its data
  * map (under A_PRECALC_PRECEDENCE) to the superclass result. Returns 0 when no precalculated
  * precedence is present.
  *
  * (non-Javadoc)
  * @see org.archive.crawler.frontier.precedence.BaseUriPrecedencePolicy#calculatePrecedence(org.archive.crawler.datamodel.CrawlURI)
  */
 @Override
 protected int calculatePrecedence(CrawlURI curi) {
   mergePrior(curi);
   Integer precalculated = (Integer) curi.getData().get(A_PRECALC_PRECEDENCE);
   return (precalculated == null) ? 0 : super.calculatePrecedence(curi) + precalculated;
 }
 /*
  * Assigns the calculated precedence to the URI; when the calculation yields 0 the URI is
  * handed to the configured default policy instead.
  *
  * (non-Javadoc)
  * @see org.archive.crawler.frontier.precedence.BaseUriPrecedencePolicy#uriScheduled(org.archive.crawler.datamodel.CrawlURI)
  */
 @Override
 public void uriScheduled(CrawlURI curi) {
   int calculated = calculatePrecedence(curi);
   if (calculated != 0) {
     curi.setPrecedence(calculated);
   } else {
     // fall back to configured default policy
     getDefaultUriPrecedencePolicy().uriScheduled(curi);
   }
 }
  /**
   * Handles an Error that is treated as serious (OutOfMemory etc.): asks the controller to free
   * reserve memory and pause the crawl, notifies the frontier journal if there is one, dumps
   * diagnostics to System.err between "<<<" and ">>>" markers, marks the current CrawlURI (if
   * any) with S_SERIOUS_ERROR, and logs the problem at SEVERE.
   *
   * @param err the Error being handled
   */
  private void seriousError(Error err) {
    // try to prevent timeslicing until we have a chance to deal with OOM
    // Note that modern-day JVM priority indifference with native threads
    // may make this priority-jumbling pointless
    setPriority(DEFAULT_PRIORITY + 1);
    if (controller != null) {
      // hold all ToeThreads from proceeding to next processor
      controller.freeReserveMemory();
      controller.requestCrawlPause();
      if (controller.getFrontier().getFrontierJournal() != null) {
        // NOTE(review): thread name and error message are joined with no separator, and
        // getMessage() may be null -- confirm the journal text is as intended.
        controller.getFrontier().getFrontierJournal().seriousError(getName() + err.getMessage());
      }
    }

    // OutOfMemory etc.
    String extraInfo = DevUtils.extraInfo();
    System.err.println("<<<");
    System.err.println(ArchiveUtils.getLog17Date());
    System.err.println(err);
    System.err.println(extraInfo);
    err.printStackTrace(System.err);

    if (controller != null) {
      // append the toe pool's compact report to the same stderr dump
      PrintWriter pw = new PrintWriter(System.err);
      controller.getToePool().compactReportTo(pw);
      pw.flush();
    }
    System.err.println(">>>");
    //        DevUtils.sigquitSelf();

    String context = "unknown";
    if (currentCuri != null) {
      // update fetch-status, saving original as annotation
      currentCuri.getAnnotations().add("err=" + err.getClass().getName());
      currentCuri.getAnnotations().add("os" + currentCuri.getFetchStatus());
      currentCuri.setFetchStatus(S_SERIOUS_ERROR);
      context = currentCuri.shortReportLine() + " in " + currentProcessorName;
    }
    String message = "Serious error occured trying " + "to process '" + context + "'\n" + extraInfo;
    logger.log(Level.SEVERE, message.toString(), err);
    // restore the priority raised at the top of this method
    setPriority(DEFAULT_PRIORITY);
  }
 /**
  * Handling for exceptions and errors that are possibly recoverable: switches the thread to
  * the HANDLING_RUNTIME_EXCEPTION step, marks the current CrawlURI with S_RUNTIME_EXCEPTION,
  * records the throwable on it (as an annotation and in its data map), and logs at SEVERE.
  *
  * @param e the throwable that interrupted processing of the current CrawlURI
  */
 private void recoverableProblem(Throwable e) {
   // remember which step failed before it is overwritten below
   Object stepAtFailure = step;
   setStep(Step.HANDLING_RUNTIME_EXCEPTION, null);

   currentCuri.setFetchStatus(S_RUNTIME_EXCEPTION);
   // store exception temporarily for logging
   currentCuri.getAnnotations().add("err=" + e.getClass().getName());
   currentCuri.getData().put(A_RUNTIME_EXCEPTION, e);

   StringBuilder report = new StringBuilder();
   report.append("Problem ");
   report.append(e);
   report.append(" occured when trying to process '");
   report.append(currentCuri.toString());
   report.append("' at step ");
   report.append(stepAtFailure);
   report.append(" in ");
   report.append(currentProcessorName);
   report.append("\n");
   logger.log(Level.SEVERE, report.toString(), e);
 }
 /**
  * Assigns a cost of 1 to a URI without a query string, 2 to one with a query string, and 3
  * when, in addition, the flattened via starts with the part of the URI before the '?'.
  *
  * @param curi CrawlURI to be assigned a cost
  * @return the cost, between 1 and 3
  * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.modules.CrawlURI)
  */
 public int costOf(CrawlURI curi) {
   UURI uuri = curi.getUURI();
   if (!uuri.hasQuery()) {
     // TODO: other potential path-based penalties
     //  - new path is simply extension of via path
     //  - many path segments
     // TODO: other potential hops-based penalties
     //  - more than X hops
     //  - each speculative hop
     return 1;
   }

   // base cost plus one for carrying a query string
   int cost = 2;
   String whole = uuri.toString();
   String beforeQuery = whole.substring(0, whole.indexOf('?'));
   if (curi.flattenVia().startsWith(beforeQuery)) {
     // non-query-string portion of URI is same as previous
     cost++;
   }
   // TODO: other potential query-related cost penalties:
   //  - more than X query-string attributes
   //  - calendarish terms
   //  - query-string over certain size
   return cost;
 }
  /**
   * Stores the given CrawlURI in <code>pendingUrisDB</code>. The key is the URI's holder key;
   * when it has none, one is calculated with <code>calculateInsertKey</code> and recorded on the
   * URI. A store that does not report SUCCESS is logged at SEVERE, not thrown.
   *
   * @param curi CrawlURI to store
   * @param overwriteIfPresent if true an existing entry under the same key is replaced;
   *     otherwise <code>putNoOverwrite</code> is used
   * @throws DatabaseException
   */
  public void put(CrawlURI curi, boolean overwriteIfPresent) throws DatabaseException {
    DatabaseEntry key = (DatabaseEntry) curi.getHolderKey();
    if (key == null) {
      key = calculateInsertKey(curi);
      curi.setHolderKey(key);
    }

    DatabaseEntry serialized = new DatabaseEntry();
    crawlUriBinding.objectToEntry(curi, serialized);
    // Output tally on avg. size if level is FINE or greater.
    if (LOGGER.isLoggable(Level.FINE)) {
      tallyAverageEntrySize(curi, serialized);
    }

    OperationStatus outcome =
        overwriteIfPresent
            ? pendingUrisDB.put(null, key, serialized)
            : pendingUrisDB.putNoOverwrite(null, key, serialized);
    if (outcome == OperationStatus.SUCCESS) {
      return;
    }
    // the fresh RuntimeException is attached only to capture a stack trace in the log
    LOGGER.log(
        Level.SEVERE, "URI enqueueing failed; " + outcome + " " + curi, new RuntimeException());
  }
// Example #18
// 0
 /**
  * Return a preferred String key for persisting the given CrawlURI's AList state. Delegates to
  * the String overload, passing the string form of the CrawlURI's UURI.
  *
  * @param curi CrawlURI
  * @return String key
  */
 public static String persistKeyFor(CrawlURI curi) {
   // use a case-sensitive SURT for uniqueness and sorting benefits
   String uriText = curi.getUURI().toString();
   return persistKeyFor(uriText);
 }
  /**
   * Main loop of this thread: repeatedly takes a CrawlURI from the frontier, runs it through the
   * fetch chain and then the disposition chain, and hands it back to the frontier.
   *
   * <p>Throwables raised while processing a URI are routed to <code>recoverableProblem</code> or
   * <code>seriousError</code>, after which the URI is still returned to the frontier and the loop
   * goes on. The loop ends when <code>shouldRetire</code> is set after a URI completes, or when
   * an exception escapes to the outer try. On the way out the recorder is closed and the
   * references to it and to the controller are cleared.
   *
   * (non-Javadoc)
   *
   * @see java.lang.Thread#run()
   */
  public void run() {
    String name = controller.getMetadata().getJobName();
    logger.fine(getName() + " started for order '" + name + "'");
    Recorder.setHttpRecorder(httpRecorder);

    try {
      while (true) {
        // presumably throws InterruptedException when the thread should stop -- confirm
        ArchiveUtils.continueCheck();

        setStep(Step.ABOUT_TO_GET_URI, null);

        CrawlURI curi = controller.getFrontier().next();

        // take ownership of the URI and stamp the start time under this thread's monitor
        synchronized (this) {
          ArchiveUtils.continueCheck();
          setCurrentCuri(curi);
          currentCuri.setThreadNumber(this.serialNumber);
          lastStartTime = System.currentTimeMillis();
          currentCuri.setRecorder(httpRecorder);
        }

        try {
          KeyedProperties.loadOverridesFrom(curi);

          controller.getFetchChain().process(curi, this);

          controller.getFrontier().beginDisposition(curi);

          controller.getDispositionChain().process(curi, this);

        } catch (RuntimeExceptionWrapper e) {
          // Workaround to get cause from BDB
          // NOTE(review): the value passed to initCause is the same getCause() that was just
          // found to be null, so as written this cannot attach a cause -- confirm what the
          // workaround was meant to read the cause from.
          if (e.getCause() == null) {
            e.initCause(e.getCause());
          }
          recoverableProblem(e);
        } catch (AssertionError ae) {
          // This risks leaving crawl in fatally inconsistent state,
          // but is often reasonable for per-Processor assertion problems
          recoverableProblem(ae);
        } catch (RuntimeException e) {
          recoverableProblem(e);
        } catch (InterruptedException e) {
          // with a URI in hand, treat the interrupt as a per-URI problem and keep looping;
          // without one, let it propagate to the outer handler and end the thread
          if (currentCuri != null) {
            recoverableProblem(e);
            Thread.interrupted(); // clear interrupt status
          } else {
            throw e;
          }
        } catch (StackOverflowError err) {
          recoverableProblem(err);
        } catch (Error err) {
          // OutOfMemory and any others
          seriousError(err);
        } finally {
          // overrides loaded at the top of the try are cleared however processing ended
          KeyedProperties.clearOverridesFrom(curi);
        }

        setStep(Step.ABOUT_TO_RETURN_URI, null);
        ArchiveUtils.continueCheck();

        // return the URI to the frontier and release it under this thread's monitor
        synchronized (this) {
          controller.getFrontier().finished(currentCuri);
          controller.getFrontier().endDisposition();
          setCurrentCuri(null);
        }

        setStep(Step.FINISHING_PROCESS, null);
        lastFinishTime = System.currentTimeMillis();
        if (shouldRetire) {
          break; // from while(true)
        }
      }
    } catch (InterruptedException e) {
      if (currentCuri != null) {
        logger.log(
            Level.SEVERE,
            "Interrupt leaving unfinished CrawlURI " + getName() + " - job may hang",
            e);
      }
      // thread interrupted, ok to end
      logger.log(Level.FINE, this.getName() + " ended with Interruption");
    } catch (Exception e) {
      // everything else (including interruption)
      logger.log(Level.SEVERE, "Fatal exception in " + getName(), e);
    } catch (OutOfMemoryError err) {
      seriousError(err);
    } finally {
      // runs on every exit path, including after an iteration that already called
      // endDisposition() itself
      controller.getFrontier().endDisposition();
    }

    setCurrentCuri(null);
    // Do cleanup so that objects can be GC.
    this.httpRecorder.closeRecorders();
    this.httpRecorder = null;

    logger.fine(getName() + " finished for order '" + name + "'");
    setStep(Step.FINISHED, null);
    controller = null;
  }
  /**
   * Scans <code>pendingUrisDB</code> from a marker position and collects up to <code>maxMatches
   * </code> items whose string form fully matches the given pattern. Entries whose value data is
   * empty are skipped.
   *
   * @param m marker or null to start with first entry
   * @param maxMatches maximum number of matching items to collect
   * @param pattern pattern that an item's <code>toString()</code> must match in full to be
   *     included
   * @param verbose if true each result is the item's class key in brackets followed by its
   *     <code>shortReportLine()</code>; otherwise just its <code>toString()</code>
   * @return composite with items "list" (the matches, starting from marker position) and
   *     "marker" (the key of the first entry not yet examined, or null once the scan has ended)
   * @throws DatabaseException
   */
  public CompositeData getFrom(String m, int maxMatches, Pattern pattern, boolean verbose)
      throws DatabaseException {
    int matches = 0;
    // NOTE(review): tries is incremented below but never read in this method.
    int tries = 0;
    ArrayList<String> results = new ArrayList<String>(maxMatches);

    DatabaseEntry key;
    if (m == null) {
      key = getFirstKey();
    } else {
      // NOTE(review): getBytes() with no argument uses the platform default charset, as does
      // new String(byte[]) further down -- confirm markers survive the round trip for keys
      // that are not plain ASCII.
      byte[] marker = m.getBytes(); // = FrontierJMXTypes.fromString(m);
      key = new DatabaseEntry(marker);
    }

    DatabaseEntry value = new DatabaseEntry();

    Cursor cursor = null;
    OperationStatus result = null;
    try {
      cursor = pendingUrisDB.openCursor(null, null);
      result = cursor.getSearchKey(key, value, null);

      // stop at the requested number of matches or when the cursor stops returning SUCCESS
      while (matches < maxMatches && result == OperationStatus.SUCCESS) {
        if (value.getData().length > 0) {
          CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
          if (pattern.matcher(curi.toString()).matches()) {
            if (verbose) {
              results.add("[" + curi.getClassKey() + "] " + curi.shortReportLine());
            } else {
              results.add(curi.toString());
            }
            matches++;
          }
          tries++;
        }
        // advance before re-testing the loop condition, so that on a SUCCESS exit key holds
        // an entry that has been fetched but not yet examined
        result = cursor.getNext(key, value, null);
      }
    } finally {
      if (cursor != null) {
        cursor.close();
      }
    }

    if (result != OperationStatus.SUCCESS) {
      // end of scan
      m = null;
    } else {
      m = new String(key.getData()); // = FrontierJMXTypes.toString(key.getData());
    }

    String[] arr = results.toArray(new String[results.size()]);
    CompositeData cd;
    try {
      // NOTE(review): the composite type argument is null here (the FrontierJMXTypes constant
      // is commented out); the CompositeDataSupport documentation should be checked for
      // whether a null type is accepted -- confirm this call can succeed.
      cd =
          new CompositeDataSupport(
              /*FrontierJMXTypes.URI_LIST_DATA*/ null,
              new String[] {"list", "marker"},
              new Object[] {arr, m});
    } catch (OpenDataException e) {
      throw new IllegalStateException(e);
    }
    return cd;
  }