예제 #1
0
  /**
   * Sets up and runs one self test: verifies that the test data directory exists, builds a fresh
   * temporary job directory from the self test's profile, starts the HTTP server, copies the shared
   * configuration, writes the (globally adjusted) crawler beans file into the job directory, then
   * starts Heritrix and waits for the crawl to finish.
   *
   * @throws Exception if the test data directory is missing, or if any file operation or startup
   *     step fails
   */
  protected void open() throws Exception {
    // We expect to be run from the project directory.
    // (Both eclipse and maven run junit tests from there).
    String name = getSelfTestName();

    // Make sure the project directory contains a selftest profile
    // and content for the self test.
    File src = getTestDataDir();
    if (!src.exists()) {
      throw new Exception("No selftest directory for " + name);
    }

    // Create temporary directories for Heritrix to run in.
    File tmpDir = new File(getTmpDir(), "selftest");
    File tmpTestDir = new File(tmpDir, name);

    // If we have an old job lying around from a previous run, delete it.
    File tmpJobs = new File(tmpTestDir, "jobs");
    if (tmpJobs.exists()) {
      FileUtils.deleteDirectory(tmpJobs);
    }

    // Copy the selftest's profile in the project directory to the
    // default profile in the temporary Heritrix directory.
    File tmpDefProfile = new File(tmpJobs, "selftest-job");
    File profileTemplate = new File(src, "profile");
    if (profileTemplate.exists()) {
      org.apache.commons.io.FileUtils.copyDirectory(profileTemplate, tmpDefProfile);
    } else {
      // No profile template for this test: just make sure the job directory exists.
      org.archive.util.FileUtils.ensureWriteableDirectory(tmpDefProfile);
    }

    // Start up a Jetty that serves the selftest's content directory.
    startHttpServer();

    // Copy configuration for eg Logging over
    File tmpConfDir = new File(tmpTestDir, "conf");
    org.archive.util.FileUtils.ensureWriteableDirectory(tmpConfDir);
    File srcConf = new File(src.getParentFile(), "conf");
    FileUtils.copyDirectory(srcConf, tmpConfDir);

    // Read the shared crawler beans template, apply the global configuration changes and write
    // the result into the job directory.
    String crawlerBeansText =
        FileUtils.readFileToString(new File(srcConf, "selftest-crawler-beans.cxml"));
    crawlerBeansText = changeGlobalConfig(crawlerBeansText);
    File crawlerBeans = new File(tmpDefProfile, "selftest-crawler-beans.cxml");
    FileWriter fw = new FileWriter(crawlerBeans);
    try {
      fw.write(crawlerBeansText);
    } finally {
      // Close even when the write fails so the file handle is not leaked.
      fw.close();
    }

    startHeritrix(tmpTestDir.getAbsolutePath());

    waitForCrawlFinish();
  }
 /**
  * Creates a backup of the given file, first by trying a hard link (through the external
  * {@code ln} command), then by using a copy if hard linking is unavailable (either because it is
  * unsupported or the origin and checkpoint directories are on different volumes).
  *
  * @param file the file to back up
  * @param destination the path at which the hard link or copy is created
  * @throws IOException if the fallback copy fails; an {@code InterruptedIOException} is thrown
  *     (with the thread's interrupt flag restored) if the thread is interrupted while waiting for
  *     {@code ln} to finish
  */
 private void hardlinkOrCopy(File file, File destination) throws IOException {
   // For Linux/UNIX, try a hard link first.
   // TODO NTFS also supports hard links; add appropriate try
   Process link;
   try {
     // Pass the arguments as an array: the single-string form of exec() splits the command on
     // whitespace, which breaks paths that contain spaces.
     link =
         Runtime.getRuntime()
             .exec(new String[] {"ln", file.getAbsolutePath(), destination.getAbsolutePath()});
   } catch (IOException e) {
     // The link command could not be launched at all; fall back to a copy.
     FileUtils.copyFile(file, destination);
     return;
   }

   int exitCode;
   try {
     exitCode = link.waitFor();
   } catch (InterruptedException e) {
     // The outcome of the link attempt is unknown here, so do not copy over a destination that
     // may already be a hard link to the source. Stop the child, restore the interrupt flag and
     // report the interruption to the caller.
     link.destroy();
     Thread.currentThread().interrupt();
     java.io.InterruptedIOException interrupted =
         new java.io.InterruptedIOException(
             "Interrupted while hard-linking " + file + " to " + destination);
     interrupted.initCause(e);
     throw interrupted;
   }

   if (exitCode != 0) {
     // hard link failed
     FileUtils.copyFile(file, destination);
   }
 }
예제 #3
0
  /**
   * Populates a new environment db from an old environment db or a persist log. If path to new
   * environment is not provided, only logs the entries that would have been populated.
   *
   * <p>When a target environment is given, its history database and the environment itself are
   * closed before this method returns, including when setup or copying fails.
   *
   * @param sourcePath source of old entries: can be a path to an existing environment db, or a URL
   *     or path to a persist log. NOTE(review): the value is wrapped in a {@code java.io.File}
   *     before being handed on; confirm that URL sources are really handled downstream.
   * @param envFile path to new environment db (or null for a dry run)
   * @return number of records
   * @throws DatabaseException presumably on Berkeley DB failures (not declared in the signature)
   * @throws IOException declared by the signature; presumably raised while preparing the target
   *     directory or reading the source
   */
  public static int populatePersistEnv(String sourcePath, File envFile) throws IOException {
    int count = 0;
    StoredSortedMap<String, Map> historyMap = null;
    EnhancedEnvironment targetEnv = null;
    Database historyDB = null;

    try {
      if (envFile != null) {
        // set up target environment (inside the try so that a partial setup is still released)
        FileUtils.ensureWriteableDirectory(envFile);
        targetEnv = setupCopyEnvironment(envFile);
        StoredClassCatalog classCatalog = targetEnv.getClassCatalog();
        historyDB =
            targetEnv.openDatabase(null, URI_HISTORY_DBNAME, HISTORY_DB_CONFIG.toDatabaseConfig());
        historyMap =
            new StoredSortedMap<String, Map>(
                historyDB,
                new StringBinding(),
                new SerialBinding<Map>(classCatalog, Map.class),
                true);
      }

      count = copyPersistSourceToHistoryMap(new File(sourcePath), historyMap);
    } finally {
      // in finally block so that we unlock the target env even if we
      // failed to populate it
      if (envFile == null) {
        logger.info(count + " records found in " + sourcePath);
      } else {
        if (historyMap != null) {
          // Only report an import once the target was fully set up.
          logger.info(count + " records imported from " + sourcePath + " to BDB env " + envFile);
        }
        try {
          if (historyDB != null) {
            try {
              historyDB.sync();
            } finally {
              // Close the database even if the sync fails.
              historyDB.close();
            }
          }
        } finally {
          // Close the environment even if closing the database fails.
          if (targetEnv != null) {
            targetEnv.close();
          }
        }
      }
    }

    return count;
  }
  /**
   * Walks an {@code AdaptiveRevisitHostQueue} through its states using four CrawlURIs on one host.
   *
   * <p>The queue is created on a fresh transactional environment in the "AR" directory under the
   * temporary directory. The test then checks, in order: the empty state; add/peek/next; the busy
   * state and the {@code IllegalStateException} raised by a second {@code next()}; snoozing and
   * wake-up; add-with-override; update with valid and invalid URIs; closing and reopening the queue
   * with a valence of 2; and finally closes the queue, catalog and environment.
   *
   * @throws Exception if any queue, environment or URI operation fails
   */
  public void testHQ() throws Exception {
    EnvironmentConfig envConfig = new EnvironmentConfig();
    envConfig.setTransactional(true);
    envConfig.setAllowCreate(true);
    File envDir = new File(getTmpDir(), "AR");
    if (envDir.exists()) {
      FileUtils.deleteDir(envDir);
    }
    envDir.mkdirs();
    Environment env = new Environment(envDir, envConfig);
    // Open the class catalog database. Create it if it does not
    // already exist.
    DatabaseConfig dbConfig = new DatabaseConfig();
    dbConfig.setAllowCreate(true);
    StoredClassCatalog catalog =
        new StoredClassCatalog(env.openDatabase(null, "classes", dbConfig));
    AdaptiveRevisitHostQueue hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 1);

    // Make the CrawlUris
    CrawlURI[] curis = {null, null, null, null};

    UURI uuri = UURIFactory.getInstance("http://bok.hi.is/1.html");
    curis[0] = new CrawlURI(uuri);
    curis[0].setVia(null);

    uuri = UURIFactory.getInstance("http://bok.hi.is/2.html");
    curis[1] = new CrawlURI(uuri);
    curis[1].setVia(null);

    uuri = UURIFactory.getInstance("http://bok.hi.is/3.html");
    curis[2] = new CrawlURI(uuri);
    curis[2].setVia(null);

    uuri = UURIFactory.getInstance("http://bok.hi.is/4.html");
    curis[3] = new CrawlURI(uuri);
    curis[3].setVia(null);

    assertTrue(
        "HQ should be empty initially", hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
    assertEquals("Incorrect nextReadyTime on Empty", Long.MAX_VALUE, hq.getNextReadyTime());
    assertEquals("Initial size of HQ should be 0", 0, hq.getSize());

    assertEquals("Peek should return null when 'ready queue' is empty", null, hq.peek());

    /*
     * Set the time of next processing on three CrawlURIs (now, +5 sec and
     * +20 sec), then add the first one and ensure that it is the one
     * reported by peek().
     */

    curis[0].putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis()); // now
    curis[1].putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis() + 5000); // in 5 sec
    curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis() + 20000); // in 20 sec.

    hq.add(curis[0], false);
    assertEquals("First CrawlURI should be top", curis[0].toString(), hq.peek().toString());
    assertTrue(
        "HQ should no longer be empty", hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
    assertEquals("Size of HQ should now be 1", 1, hq.getSize());

    /*
     * Invoke next and ensure that the HQ is now busy (initial valence was
     * set to 1). Also check for proper errors for a busy HQ. Such as when
     * trying to reinvoke next().
     *
     */
    CrawlURI curi = hq.next(); // Should return curis[0]
    assertEquals("next() did not return 'top' URI", curis[0].toString(), curi.toString());
    assertTrue(
        "HQ should now be busy, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
    try {
      hq.next();
      assertTrue("next() should throw an IllegalStateException if HQ " + "not ready", false);
    } catch (IllegalStateException e) {
      // This is supposed to happen.
    }
    assertEquals("New top URI should be null", null, hq.peek());

    hq.add(curis[1], false);
    assertEquals("Second CrawlURI should be top", curis[1].toString(), hq.peek().toString());
    assertEquals("Size of HQ should now be 2", 2, hq.getSize());

    // Return it with next fetch time in the future.
    curi.putLong(
        A_TIME_OF_NEXT_PROCESSING,
        hq.peek().getLong(A_TIME_OF_NEXT_PROCESSING) + 100000); // 100 sec behind current top.
    hq.update(curi, false, 0);
    assertEquals(
        "Second CrawlURI should be still be top", curis[1].toString(), hq.peek().toString());
    assertEquals("Size of HQ should still be 2", 2, hq.getSize());

    hq.add(curis[2], false);
    assertEquals("Second CrawlURI should still be top", curis[1].toString(), hq.peek().toString());
    assertEquals("Size of HQ should now be 3", 3, hq.getSize());

    /*
     * If there are no URIs ready, the queue should snooze, even though no
     * politeness demand has been made.
     * <p>
     * Confirms this and that it wakes up.
     */
    assertTrue(
        "HQ should be snoozed, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
    // Wait past wakeup time. A clamped sleep is used instead of wait(timeout): a computed timeout
    // of exactly 0 would make wait() block indefinitely and a negative one would throw.
    Thread.sleep(Math.max(0L, hq.getNextReadyTime() - System.currentTimeMillis() + 100));
    assertTrue(
        "HQ should now be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);

    /*
     * Re-adds a URI with a lower ready time which should promote it to the
     * top of the queue. Checks if this happens correctly.
     *
     * Then tests an add override which would demote it back, ensures that
     * this fails as it should (i.e. URIs time of next processing remains
     * unchanged).
     */
    curis[2].putLong(
        A_TIME_OF_NEXT_PROCESSING,
        curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) - 1000); // 1 sec. prior to current top
    hq.add(curis[2], true);
    assertEquals("Size of HQ should still be 3", 3, hq.getSize());
    assertEquals("Third CrawlURI should be now be top", curis[2].toString(), hq.peek().toString());
    curis[2].putLong(
        A_TIME_OF_NEXT_PROCESSING,
        curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) + 10000); // 10 sec. later
    hq.add(curis[2], true);
    assertEquals("Size of HQ should still be 3", 3, hq.getSize());
    assertEquals("Third CrawlURI should still top", curis[2].toString(), hq.peek().toString());

    /*
     * Invoke next and ensure that the HQ is now busy (initial valence was
     * set to 1). Also check for proper errors for a busy HQ. Such as when
     * trying to reinvoke next().
     *
     */
    curi = hq.next(); // Should return curis[2]
    assertEquals("next() did not return 'top' URI", curis[2].toString(), curi.toString());
    assertTrue(
        "HQ should now be busy, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
    try {
      hq.next();
      assertTrue("next() should throw an IllegalStateException if HQ " + "not ready", false);
    } catch (IllegalStateException e) {
      // This is supposed to happen.
    }
    assertEquals("New top URI", curis[1].toString(), hq.peek().toString());

    /*
     * Add a URI while HQ is busy. Check if this succeeds normally.
     *
     */

    curis[3].putLong(
        A_TIME_OF_NEXT_PROCESSING,
        curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
            - 1); // 1 msec. ahead of current top (order [2] 3 1 0)
    hq.add(curis[3], false);
    assertEquals("Size of HQ should now be 4", 4, hq.getSize());

    /*
     * Invoke update, first with an invalid URI (not the one issued by
     * next() earlier), this should fail. Then with the correct one, this
     * should succeed. Then finally test update again with an invalid URI
     * (i.e. when the HQ has no outstanding URIs), that should fail.
     *
     * At each step, proper checks are made of state and that methods give
     * appropriate errors.
     *
     * Updated URI is given low time of next processing to put it 'in front'
     */

    try {
      hq.update(curis[1], false, 0);
      assertTrue("update() should not accept URI", false);
    } catch (IllegalStateException e) {
      // This is supposed to happen
    }

    // We do not change the 'time of next processing' on update
    // so curis[2] should again be at top of queue.
    long timeOfPolitenessWakeUp = System.currentTimeMillis() + 2000;
    hq.update(curi, true, timeOfPolitenessWakeUp); // Wake in 2 sec.
    assertTrue(
        "HQ should be snoozed, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);

    try {
      hq.update(curis[2], false, 0);
      assertTrue("update() should not accept URI", false);
    } catch (IllegalStateException e) {
      // This is supposed to happen
    }
    assertEquals(
        "HQs time of next ready should reflect set wait time ",
        timeOfPolitenessWakeUp,
        hq.getNextReadyTime());

    /*
     * Check if the HQ wakes up from its 'snoozing'
     *
     */
    // Wait past wakeup time (clamped sleep, see the earlier wait for the rationale).
    Thread.sleep(Math.max(0L, hq.getNextReadyTime() - System.currentTimeMillis() + 100));
    assertTrue(
        "HQ should now be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
    assertEquals(
        "HQs time of next ready should still be when it 'woken' " + "up.",
        timeOfPolitenessWakeUp,
        hq.getNextReadyTime());

    /*
     * Invoke next so that the HQ has a URI being processed. Then
     * close the HQ and reopen it to ensure that this happens normally, i.e.
     * state is recovered properly, including the restoration of the URI
     * being processed, back to the regular queue (where it should be
     * first).
     *
     * On recreating the HQ, set valence to 2.
     */
    curi = hq.next(); // Should return curis[2]
    assertEquals("next() did not return 'top' URI", curis[2].toString(), curi.toString());
    assertTrue(
        "HQ should now be busy, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
    hq.close();

    hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 2);

    assertEquals("Size of HQ after reopening should now be 4", 4, hq.getSize());
    assertTrue(
        "HQ should be ready on reopen, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
    assertEquals(
        "CrawlURI 'in processing' before should be top", curi.toString(), hq.peek().toString());

    /* Check if valence higher than 1 is properly handled.
     *
     * Invoke next(), check if still ready and new top URI.
     */
    curi = hq.next(); // Should return curis[2]
    assertEquals("next() did not return 'top' URI", curis[2].toString(), curi.toString());
    assertTrue(
        "HQ should still be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);

    /* Invoke next() again, check if now busy.
     */
    curi = hq.next(); // Should return curis[3]
    assertEquals("next() did not return 'top' URI", curis[3].toString(), curi.toString());
    assertTrue(
        "HQ should be busy, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
    assertEquals("Size of HQ should still be 4", 4, hq.getSize());

    /* Update() second URI issued. Confirm HQ is now ready again. URI is
     * given same time of next processing to put it 'in front'. (no snooze)
     */
    hq.update(curi, false, 0);
    assertTrue(
        "HQ should now be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
    assertEquals("'updated' CrawlURI before should be top", curi.toString(), hq.peek().toString());

    /* Update() again, ensure proper state. URI is NOT placed at front of
     * queue and snooze time is given. But the HQ should not enter a
     * snoozed state because the 'other' slot is free.
     */

    hq.update(curis[2], true, System.currentTimeMillis() + 1000000); // 1,000,000 ms (~1000 sec)
    // NOTE(review): no add()/update() call for curis[3] follows this putLong; confirm whether it
    // is meant to influence the queue.
    curis[3].putLong(
        A_TIME_OF_NEXT_PROCESSING,
        curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) + 1000); // 1 sec. behind of current top
    assertTrue(
        "HQ should still be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
    assertEquals("Top CrawlURI before should be unchanged", curi.toString(), hq.peek().toString());

    // TODO: Test sorting with scheduling directives.

    /*
     * Close the ARHostQueue and the Environment
     */
    hq.close();
    catalog.close();
    env.close();
    cleanUpOldFiles("AR");
  }