/**
  * Tells whether the given CrawlURI is itself the prerequisite (login) URI configured for it.
  *
  * <p>The prerequisite string obtained from {@code getPrerequisite(curi)} is resolved against
  * the CrawlURI's own UURI and compared, as a string, with that UURI. On a match the CrawlURI
  * is flagged as a prerequisite if it was not flagged already.
  *
  * @param curi the CrawlURI to examine
  * @return {@code true} if the resolved prerequisite URI equals the CrawlURI's own URI;
  *     {@code false} when there is no prerequisite, no match, or resolution fails
  */
public boolean isPrerequisite(final CrawlURI curi) {
   final String ownUri = curi.getUURI().toString();
   final String prereqUri = getPrerequisite(curi);
   if (prereqUri == null) {
     // Nothing configured, so this URI cannot be a prerequisite.
     return false;
   }

   boolean matched = false;
   try {
     final UURI resolved = UURIFactory.getInstance(curi.getUURI(), prereqUri);
     matched = resolved != null && ownUri != null && resolved.toString().equals(ownUri);
     // Only flag (and log) the first time the match is seen.
     if (matched && !curi.isPrerequisite()) {
       curi.setPrerequisite(true);
       logger.fine(curi + " is prereq.");
     }
   } catch (URIException e) {
     logger.severe("Failed to uuri: " + curi + ", " + e.getMessage());
   }
   return matched;
 }
 /**
  * Builds the set of URL paths for the records returned by {@code headersInArcs()}.
  *
  * <p>Records whose URL starts with {@code filedesc:} are skipped. For every other record the
  * URL is parsed and a single leading slash is dropped from its path; the path is kept only
  * when the URL starts with {@code http:}. The resulting set is logged at FINEST level.
  *
  * @return the collected paths, without their leading slash
  * @throws IOException declared by this method; presumably raised while reading headers or
  *     parsing a URL (confirm against {@code headersInArcs()} and {@code UURIFactory})
  */
 protected Set<String> filesInArcs() throws IOException {
   Set<String> paths = new HashSet<String>();
   for (ArchiveRecordHeader header : headersInArcs()) {
     String url = header.getUrl();
     // ignore 'filedesc:' record
     if (!url.startsWith("filedesc:")) {
       // The URL is parsed before the scheme filter below, so every non-filedesc URL is parsed.
       String rawPath = UURIFactory.getInstance(url).getPath();
       String relativePath = rawPath.startsWith("/") ? rawPath.substring(1) : rawPath;
       if (url.startsWith("http:")) {
         paths.add(relativePath);
       }
     }
   }
   LOGGER.finest(paths.toString());
   return paths;
 }
Example #3
0
  /**
   * Adds the link carried by a response header to the given CrawlURI.
   *
   * <p>The header value is stored through {@code createAndAddLocationLink}. When the CrawlURI
   * has an object under {@code URLInfo.ATTACH}, that object is also stored on the CrawlURI
   * under the resolved target URI as key. The extracted-link counter is incremented on success.
   * A {@code URIException} is reported to the controller when there is one, otherwise logged.
   *
   * @param curi the CrawlURI the header belongs to
   * @param loc the header holding the link; when {@code null} nothing is done
   */
  protected void addHeaderLink(CrawlURI curi, Header loc) {
    // TODO: consider possibility of multiple headers
    if (loc != null) {
      try {
        // 302 redirects store the link through a custom method.
        // @modify: wuliufu
        // @since : 2012-05-11
        curi.createAndAddLocationLink(
            curi.getVia(), loc.getValue(), loc.getName() + ":", Link.REFER_HOP);

        if (curi.getObject(URLInfo.ATTACH) != null) {
          // Resolve the header value against the current URI and use it as the storage key.
          String target = UURIFactory.getInstance(curi.getUURI(), loc.getValue()).toString();
          String message =
              "ParseHTTP: curi = " + curi.getUURI().toString() + "&& " + loc.getName() + "=" + target;
          logger.debug(message);
          curi.putObject(target, curi.getObject(URLInfo.ATTACH));
        }

        numberOfLinksExtracted++;
      } catch (URIException e) {
        if (getController() == null) {
          // There may not be a controller (e.g. if we're being run by the extractor tool).
          logger.info(curi + ", " + loc.getValue() + ": " + e.getMessage());
        } else {
          getController().logUriError(e, curi.getUURI(), loc.getValue());
        }
      }
    }
  }
  /**
   * Builds a CrawlURI for the given URL string, for use in tests.
   *
   * @param urlStr the URL to wrap
   * @return a CrawlURI created with {@code null} for its second and third constructor arguments
   *     and {@code LinkContext.NAVLINK_MISC} as its link context
   * @throws URIException if {@code UURIFactory} cannot build a UURI from {@code urlStr}
   */
  private CrawlURI createTestUri(String urlStr) throws URIException {
    return new CrawlURI(UURIFactory.getInstance(urlStr), null, null, LinkContext.NAVLINK_MISC);
  }
  /**
   * Walks an {@code AdaptiveRevisitHostQueue} through its states and checks each transition.
   *
   * <p>Covered, in order: the empty queue; add/peek/next with valence 1 (busy state and the
   * IllegalStateException on a second next()); update with a later processing time; snoozing
   * and waking when no URI is ready; re-adding with the override flag; adding while busy;
   * update with wrong and right URIs plus a politeness snooze; close and reopen with valence 2;
   * and next()/update() with two slots.
   *
   * <p>The test sleeps until the queue's reported wake-up times, so it takes several seconds.
   *
   * @throws Exception if environment setup, queue operations or waiting fail
   */
  public void testHQ() throws Exception {
    EnvironmentConfig envConfig = new EnvironmentConfig();
    envConfig.setTransactional(true);
    envConfig.setAllowCreate(true);
    File envDir = new File(getTmpDir(), "AR");
    // Start from a clean directory so state from an earlier run cannot leak in.
    if (envDir.exists()) {
      FileUtils.deleteDir(envDir);
    }
    envDir.mkdirs();
    Environment env = new Environment(envDir, envConfig);
    // Open the class catalog database. Create it if it does not
    // already exist.
    DatabaseConfig dbConfig = new DatabaseConfig();
    dbConfig.setAllowCreate(true);
    StoredClassCatalog catalog =
        new StoredClassCatalog(env.openDatabase(null, "classes", dbConfig));
    AdaptiveRevisitHostQueue hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 1);

    // Make the CrawlUris
    CrawlURI[] curis = {null, null, null, null};

    UURI uuri = UURIFactory.getInstance("http://bok.hi.is/1.html");
    curis[0] = new CrawlURI(uuri);
    curis[0].setVia(null);

    uuri = UURIFactory.getInstance("http://bok.hi.is/2.html");
    curis[1] = new CrawlURI(uuri);
    curis[1].setVia(null);

    uuri = UURIFactory.getInstance("http://bok.hi.is/3.html");
    curis[2] = new CrawlURI(uuri);
    curis[2].setVia(null);

    uuri = UURIFactory.getInstance("http://bok.hi.is/4.html");
    curis[3] = new CrawlURI(uuri);
    curis[3].setVia(null);

    assertTrue(
        "HQ should be empty initially", hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
    assertEquals("Incorrect nextReadyTime on Empty", Long.MAX_VALUE, hq.getNextReadyTime());
    assertEquals("Initial size of HQ should be 0", 0, hq.getSize());

    assertEquals("Peek should return null when 'ready queue' is empty", null, hq.peek());

    /*
     * Schedule three CrawlURIs (now, in 5 sec. and in 20 sec.), add the
     * first one and ensure that it is the one reported by peek().
     */

    curis[0].putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis()); // now
    curis[1].putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis() + 5000); // in 5 sec
    curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis() + 20000); // in 20 sec.

    hq.add(curis[0], false);
    assertEquals("First CrawlURI should be top", curis[0].toString(), hq.peek().toString());
    assertTrue(
        "HQ should no longer be empty", hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
    assertEquals("Size of HQ should now be 1", 1, hq.getSize());

    /*
     * Invoke next and ensure that the HQ is now busy (initial valence was
     * set to 1). Also check for proper errors for a busy HQ. Such as when
     * trying to reinvoke next().
     *
     */
    CrawlURI curi = hq.next(); // Should return curis[0]
    assertEquals("next() did not return 'top' URI", curis[0].toString(), curi.toString());
    assertTrue(
        "HQ should now be busy, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
    try {
      hq.next();
      assertTrue("next() should throw an IllegalStateException if HQ " + "not ready", false);
    } catch (IllegalStateException e) {
      // This is supposed to happen.
    }
    assertEquals("New top URI should be null", null, hq.peek());

    hq.add(curis[1], false);
    assertEquals("Second CrawlURI should be top", curis[1].toString(), hq.peek().toString());
    assertEquals("Size of HQ should now be 2", 2, hq.getSize());

    // Return it with next fetch time in the future.
    curi.putLong(
        A_TIME_OF_NEXT_PROCESSING,
        hq.peek().getLong(A_TIME_OF_NEXT_PROCESSING) + 100000); // 100 sec behind current top.
    hq.update(curi, false, 0);
    assertEquals(
        "Second CrawlURI should be still be top", curis[1].toString(), hq.peek().toString());
    assertEquals("Size of HQ should still be 2", 2, hq.getSize());

    hq.add(curis[2], false);
    assertEquals("Second CrawlURI should still be top", curis[1].toString(), hq.peek().toString());
    assertEquals("Size of HQ should now be 3", 3, hq.getSize());

    /*
     * If there are no URIs ready, the queue should snooze, even though no
     * politeness demand has been made.
     * <p>
     * Confirms this and that it wakes up.
     */
    assertTrue(
        "HQ should be snoozed, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
    // Wait past wakeup time. The timeout is clamped to at least 1 ms because
    // wait(0) blocks until notified and a negative timeout throws
    // IllegalArgumentException.
    synchronized (this) {
      wait(Math.max(1L, hq.getNextReadyTime() - System.currentTimeMillis() + 100));
    }
    assertTrue(
        "HQ should now be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);

    /*
     * Re-adds a URI with a lower ready time which should promote it to the
     * top of the queue. Checks if this happens correctly.
     *
     * Then tests an add override which would demote it back, ensures that
     * this fails as it should (i.e. URIs time of next processing remains
     * unchanged).
     */
    curis[2].putLong(
        A_TIME_OF_NEXT_PROCESSING,
        curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) - 1000); // 1 sec. prior to current top
    hq.add(curis[2], true);
    assertEquals("Size of HQ should still be 3", 3, hq.getSize());
    assertEquals("Third CrawlURI should be now be top", curis[2].toString(), hq.peek().toString());
    curis[2].putLong(
        A_TIME_OF_NEXT_PROCESSING,
        curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) + 10000); // 10 sec. later
    hq.add(curis[2], true);
    assertEquals("Size of HQ should still be 3", 3, hq.getSize());
    assertEquals("Third CrawlURI should still top", curis[2].toString(), hq.peek().toString());

    /*
     * Invoke next and ensure that the HQ is now busy (initial valence was
     * set to 1). Also check for proper errors for a busy HQ. Such as when
     * trying to reinvoke next().
     *
     */
    curi = hq.next(); // Should return curis[2]
    assertEquals("next() did not return 'top' URI", curis[2].toString(), curi.toString());
    assertTrue(
        "HQ should now be busy, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
    try {
      hq.next();
      assertTrue("next() should throw an IllegalStateException if HQ " + "not ready", false);
    } catch (IllegalStateException e) {
      // This is supposed to happen.
    }
    assertEquals("New top URI", curis[1].toString(), hq.peek().toString());

    /*
     * Add a URI while HQ is busy. Check if this succeeds normally.
     *
     */

    curis[3].putLong(
        A_TIME_OF_NEXT_PROCESSING,
        curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
            - 1); // 1 msec. ahead of current top (order [2] 3 1 0)
    hq.add(curis[3], false);
    assertEquals("Size of HQ should now be 4", 4, hq.getSize());

    /*
     * Invoke update, first with an invalid URI (not the one issued by
     * next() earlier), this should fail. Then with the correct one, this
     * should succeed. Then finally test update again with an invalid URI
     * (i.e. when the HQ has no outstanding URIs), that should fail.
     *
     * At each step, proper checks are made of state and that methods give
     * appropriate errors.
     *
     * Updated URI is given low time of next processing to put it 'in front'
     */

    try {
      hq.update(curis[1], false, 0);
      assertTrue("update() should not accept URI", false);
    } catch (IllegalStateException e) {
      // This is supposed to happen
    }

    // We do not change the 'time of next processing' on update
    // so curis[2] should again be at top of queue.
    long timeOfPolitenessWakeUp = System.currentTimeMillis() + 2000;
    hq.update(curi, true, timeOfPolitenessWakeUp); // Wake in 2 sec.
    assertTrue(
        "HQ should be snoozed, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);

    try {
      hq.update(curis[2], false, 0);
      assertTrue("update() should not accept URI", false);
    } catch (IllegalStateException e) {
      // This is supposed to happen
    }
    assertEquals(
        "HQs time of next ready should reflect set wait time ",
        timeOfPolitenessWakeUp,
        hq.getNextReadyTime());

    /*
     * Check if the HQ wakes up from its 'snoozing'
     *
     */
    // Wait past wakeup time (timeout clamped to at least 1 ms, see the
    // earlier wait in this test).
    synchronized (this) {
      wait(Math.max(1L, hq.getNextReadyTime() - System.currentTimeMillis() + 100));
    }
    assertTrue(
        "HQ should now be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
    assertEquals(
        "HQs time of next ready should still be when it 'woken' " + "up.",
        timeOfPolitenessWakeUp,
        hq.getNextReadyTime());

    /*
     * Invoke next so that the HQ has a URI being processed. Then
     * close the HQ and reopen it to ensure that this happens normally, i.e.
     * state is recovered properly, including the restoration of the URI
     * being processed, back to the regular queue (where it should be
     * first).
     *
     * On recreating the HQ, set valence to 2.
     */
    curi = hq.next(); // Should return curis[2]
    assertEquals("next() did not return 'top' URI", curis[2].toString(), curi.toString());
    assertTrue(
        "HQ should now be busy, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
    hq.close();

    hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 2);

    assertEquals("Size of HQ after reopening should now be 4", 4, hq.getSize());
    assertTrue(
        "HQ should be ready on reopen, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
    assertEquals(
        "CrawlURI 'in processing' before should be top", curi.toString(), hq.peek().toString());

    /* Check if valence higher than 1 is properly handled.
     *
     * Invoke next(), check if still ready and new top URI.
     */
    curi = hq.next(); // Should return curis[2]
    assertEquals("next() did not return 'top' URI", curis[2].toString(), curi.toString());
    assertTrue(
        "HQ should still be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);

    /* Invoke next() again, check if now busy.
     */
    curi = hq.next(); // Should return curis[3]
    assertEquals("next() did not return 'top' URI", curis[3].toString(), curi.toString());
    assertTrue(
        "HQ should be busy, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
    assertEquals("Size of HQ should still be 4", 4, hq.getSize());

    /* Update() second URI issued. Confirm HQ is now ready again. URI is
     * given same time of next processing to put it 'in front'. (no snooze)
     */
    hq.update(curi, false, 0);
    assertTrue(
        "HQ should now be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
    assertEquals("'updated' CrawlURI before should be top", curi.toString(), hq.peek().toString());

    /* Update() again, ensure proper state. URI is NOT placed at front of
     * queue and snooze time is given. But the HQ should not enter a
     * snoozed state because the 'other' slot is free.
     */

    hq.update(curis[2], true, System.currentTimeMillis() + 1000000); // 1000 sec. from now
    curis[3].putLong(
        A_TIME_OF_NEXT_PROCESSING,
        curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) + 1000); // 1 sec. behind of current top
    assertTrue(
        "HQ should still be ready, is " + hq.getStateByName(),
        hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
    assertEquals("Top CrawlURI before should be unchanged", curi.toString(), hq.peek().toString());

    // TODO: Test sorting with scheduling directives.

    /*
     * Close the ARHostQueue and the Environment
     */
    hq.close();
    catalog.close();
    env.close();
    cleanUpOldFiles("AR");
  }