protected void checkUrlContent(
     SimulatedArchivalUnit sau,
     String path,
     int fileNum,
     int depth,
     int branchNum,
     boolean isAbnormal,
     boolean isDamaged)
     throws IOException {
   String file = sau.getUrlRoot() + path;
   CachedUrl url = sau.makeCachedUrl(file);
   String content = getUrlContent(url);
   String expectedContent;
   if (path.endsWith(".html")) {
     String fn = path.substring(path.lastIndexOf("/") + 1);
     expectedContent = scgen.getHtmlFileContent(fn, fileNum, depth, branchNum, isAbnormal);
   } else {
     expectedContent = scgen.getTxtContent(fileNum, depth, branchNum, isAbnormal);
   }
   if (isDamaged) {
     assertNotEquals(expectedContent, content);
   } else {
     assertEquals(expectedContent, content);
   }
 }
  protected void checkRoot(SimulatedArchivalUnit sau) {
    log.debug("checkRoot()");
    CachedUrlSet set = sau.getAuCachedUrlSet();
    Iterator setIt = set.flatSetIterator();
    ArrayList childL = new ArrayList(1);
    CachedUrlSet cus = null;
    while (setIt.hasNext()) {
      cus = (CachedUrlSet) setIt.next();
      childL.add(cus.getUrl());
    }

    String urlRoot = sau.getUrlRoot();

    String[] expectedA = new String[1];
    expectedA[0] = urlRoot;
    assertIsomorphic(expectedA, childL);

    setIt = cus.flatSetIterator();
    childL = new ArrayList(7);
    while (setIt.hasNext()) {
      childL.add(((CachedUrlSetNode) setIt.next()).getUrl());
    }

    expectedA =
        new String[] {
          urlRoot + "/001file.html",
          urlRoot + "/001file.txt",
          urlRoot + "/002file.html",
          urlRoot + "/002file.txt",
          urlRoot + "/branch1",
          urlRoot + "/branch2",
          urlRoot + "/index.html"
        };
    assertIsomorphic(expectedA, childL);
  }
 protected void checkLeaf(SimulatedArchivalUnit sau) {
   log.debug("checkLeaf()");
   String parent = sau.getUrlRoot() + "/branch1";
   CachedUrlSetSpec spec = new RangeCachedUrlSetSpec(parent);
   CachedUrlSet set = sau.makeCachedUrlSet(spec);
   Iterator setIt = set.contentHashIterator();
   ArrayList childL = new ArrayList(16);
   while (setIt.hasNext()) {
     childL.add(((CachedUrlSetNode) setIt.next()).getUrl());
   }
   String[] expectedA =
       new String[] {
         parent,
         parent + "/001file.html",
         parent + "/001file.txt",
         parent + "/002file.html",
         parent + "/002file.txt",
         parent + "/branch1",
         parent + "/branch1/001file.html",
         parent + "/branch1/001file.txt",
         parent + "/branch1/002file.html",
         parent + "/branch1/002file.txt",
         parent + "/branch1/index.html",
         parent + "/branch2",
         parent + "/branch2/001file.html",
         parent + "/branch2/001file.txt",
         parent + "/branch2/002file.html",
         parent + "/branch2/002file.txt",
         parent + "/branch2/index.html",
         parent + "/index.html",
       };
   assertIsomorphic(expectedA, childL);
 }
  public void testBaseUrlPath() throws Exception {
    sau1 = setupSimAu(simAuConfig(tempDirPath));
    createContent(sau1);
    crawlContent(sau1);
    CachedUrlSet cus1 = sau1.getAuCachedUrlSet();

    tempDirPath2 = getTempDir().getAbsolutePath() + File.separator;
    Configuration config2 = simAuConfig(tempDirPath2);
    config2.put("base_url", "http://anotherhost.org/some/path/");
    SimulatedArchivalUnit sau2 = setupSimAu(config2);
    createContent(sau2);
    crawlContent(sau2);
    CachedUrlSet cus2 = sau1.getAuCachedUrlSet();
    List urls1 = auUrls(sau1);
    List urls2 = auUrls(sau2);

    Pattern pat1 = Pattern.compile("http://www\\.example\\.com(/.*)$");
    Pattern pat2 = Pattern.compile("http://anotherhost\\.org/some/path(/.*)$");
    List<String> l1 = auUrls(sau1);
    List<String> l2 = auUrls(sau2);
    assertEquals(l1.size(), l2.size());
    for (int ix = 0; ix < l1.size(); ix++) {
      Matcher m1 = pat1.matcher(l1.get(ix));
      assertTrue(m1.matches());
      Matcher m2 = pat2.matcher(l2.get(ix));
      assertTrue(m2.matches());
      assertEquals(m1.group(1), m2.group(1));
    }
  }
 protected void doDamageRemoveTest(SimulatedArchivalUnit sau) throws Exception {
   /* Cache the file again; this time the damage should be gone */
   String file = sau.getUrlRoot() + DAMAGED_CACHED_URL;
   UrlCacher uc = sau.makeUrlCacher(file);
   BitSet fetchFlags = new BitSet();
   fetchFlags.set(UrlCacher.REFETCH_FLAG);
   uc.setFetchFlags(fetchFlags);
   uc.cache();
   checkUrlContent(sau, DAMAGED_CACHED_URL, 2, 2, 2, false, false);
 }
  protected void createContent(SimulatedArchivalUnit sau) {
    log.debug("createContent()");
    scgen = sau.getContentGenerator();
    scgen.setFileTypes(
        SimulatedContentGenerator.FILE_TYPE_HTML + SimulatedContentGenerator.FILE_TYPE_TXT);
    scgen.setAbnormalFile("1,1", 1);
    scgen.setOddBranchesHaveContent(true);

    sau.deleteContentTree();
    sau.generateContentTree();
    assertTrue(scgen.isContentTree());
  }
 void enableFilter(SimulatedArchivalUnit sau, boolean enable)
     throws ArchivalUnit.ConfigurationException {
   Configuration auConfig = sau.getConfiguration().copy();
   // no bad file when playing with filtering
   auConfig.remove("badCachedFileLoc");
   auConfig.remove("badCachedFileNum");
   if (enable) {
     auConfig.put(SimulatedPlugin.AU_PARAM_HASH_FILTER_SPEC, "true");
   } else {
     auConfig.remove(SimulatedPlugin.AU_PARAM_HASH_FILTER_SPEC);
   }
   sau.setConfiguration(auConfig);
 }
  private void checkHashSet(
      SimulatedArchivalUnit sau, boolean namesOnly, boolean filter, byte[] expected)
      throws Exception {
    enableFilter(sau, filter);
    CachedUrlSet set = sau.getAuCachedUrlSet();
    byte[] hash = getHash(set, namesOnly);
    assertEquals(expected, hash);

    String parent = sau.getUrlRoot() + "/branch1";
    CachedUrlSetSpec spec = new RangeCachedUrlSetSpec(parent);
    set = sau.makeCachedUrlSet(spec);
    byte[] hash2 = getHash(set, namesOnly);
    assertFalse(Arrays.equals(hash, hash2));
  }
  public void testDualContentHash() throws Exception {
    sau1 = setupSimAu(simAuConfig(tempDirPath));
    createContent(sau1);
    crawlContent(sau1);
    CachedUrlSet set = sau1.getAuCachedUrlSet();
    byte[] nameH = getHash(set, true);
    byte[] contentH = getHash(set, false);

    tempDirPath2 = getTempDir().getAbsolutePath() + File.separator;
    SimulatedArchivalUnit sau2 = setupSimAu(simAuConfig(tempDirPath2));

    createContent(sau2);
    crawlContent(sau2);
    set = sau2.getAuCachedUrlSet();
    byte[] nameH2 = getHash(set, true);
    byte[] contentH2 = getHash(set, false);
    assertEquals(nameH, nameH2);
    assertEquals(contentH, contentH2);
  }
 protected void checkDepth(SimulatedArchivalUnit sau) {
   log.debug("checkDepth()");
   String URL_ROOT = sau.getUrlRoot();
   assertEquals(0, sau.getLinkDepth(URL_ROOT + "/index.html"));
   assertEquals(0, sau.getLinkDepth(URL_ROOT + "/"));
   assertEquals(1, sau.getLinkDepth(URL_ROOT + "/001file.html"));
   assertEquals(1, sau.getLinkDepth(URL_ROOT + "/branch1/index.html"));
   assertEquals(1, sau.getLinkDepth(URL_ROOT + "/branch1/"));
   assertEquals(2, sau.getLinkDepth(URL_ROOT + "/branch1/001file.html"));
 }
  protected void checkFilter(SimulatedArchivalUnit sau) throws Exception {
    log.debug("checkFilter()");
    CachedUrl cu = sau.makeCachedUrl(sau.getUrlRoot() + "/001file.html");

    enableFilter(sau, true);
    InputStream is = cu.openForHashing();
    String expected = "001file.html This is file 1, depth 0, branch 0. foobar ";
    assertEquals(expected, StringUtil.fromInputStream(is));
    is.close();
    enableFilter(sau, false);
    cu = sau.makeCachedUrl(sau.getUrlRoot() + "/001file.html");
    is = cu.openForHashing();
    expected =
        "<HTML><HEAD><TITLE>001file.html</TITLE></HEAD><BODY>\n"
            + "This is file 1, depth 0, branch 0.<br><!-- comment -->    "
            + "Citation String   foobar<br><script>"
            + "(defun fact (n) (cond ((= n 0) 1) (t (fact (sub1 n)))))</script>\n"
            + "</BODY></HTML>";
    assertEquals(expected, StringUtil.fromInputStream(is));
    is.close();
  }
 private void measureHashSpeed(SimulatedArchivalUnit sau) throws Exception {
   MessageDigest dig = null;
   try {
     dig = MessageDigest.getInstance("SHA-1");
   } catch (NoSuchAlgorithmException ex) {
     fail("No algorithm.");
   }
   CachedUrlSet set = sau.getAuCachedUrlSet();
   CachedUrlSetHasher hasher = set.getContentHasher(dig);
   SystemMetrics metrics = theDaemon.getSystemMetrics();
   int estimate = metrics.getBytesPerMsHashEstimate(hasher, dig);
   // should be protected against this being zero by MyMockSystemMetrics,
   // but otherwise use the proper calculation.  This avoids test failure
   // due to really slow machines
   assertTrue(estimate > 0);
   long estimatedTime = set.estimatedHashDuration();
   long size = ((Long) PrivilegedAccessor.getValue(set, "totalNodeSize")).longValue();
   assertTrue(size > 0);
   System.out.println("b/ms: " + estimate);
   System.out.println("size: " + size);
   System.out.println("estimate: " + estimatedTime);
   assertEquals(estimatedTime, theDaemon.getHashService().padHashEstimate(size / estimate));
 }
 public void tearDown() throws Exception {
   sau.deleteContentTree();
   theDaemon.stopDaemon();
   super.tearDown();
 }
 protected void crawlContent(SimulatedArchivalUnit sau) {
   log.debug("crawlContent()");
   CrawlSpec spec = new SpiderCrawlSpec(sau.getNewContentCrawlUrls(), null);
   Crawler crawler = new NoCrawlEndActionsNewContentCrawler(sau, spec, new MockAuState());
   crawler.doCrawl();
 }