/**
 * Checks that overriding the {@code base_url} configuration parameter changes the host of the
 * URLs in a simulated AU while leaving the path portion of each URL unchanged.
 *
 * <p>Two simulated AUs are set up, populated and crawled: one from the default configuration
 * and one whose {@code base_url} is {@code http://anotherhost.org/}. Their URL lists are then
 * compared element by element.
 *
 * @throws Exception if AU setup, content creation or crawling fails
 */
public void testBaseUrl() throws Exception {
    // First AU: default simulated configuration.
    sau1 = setupSimAu(simAuConfig(tempDirPath));
    createContent(sau1);
    crawlContent(sau1);

    // Second AU: same configuration in its own temp dir, but with a different base URL.
    tempDirPath2 = getTempDir().getAbsolutePath() + File.separator;
    Configuration config2 = simAuConfig(tempDirPath2);
    config2.put("base_url", "http://anotherhost.org/");
    SimulatedArchivalUnit sau2 = setupSimAu(config2);
    createContent(sau2);
    crawlContent(sau2);

    // group(1) is the host, group(2) is the path starting at the first '/'.
    Pattern pat = Pattern.compile("http://([^/]+)(/.*)$");
    List<String> l1 = auUrls(sau1);
    List<String> l2 = auUrls(sau2);
    assertEquals(l1.size(), l2.size());
    // NOTE(review): the pairwise comparison assumes auUrls() returns URLs in the same
    // relative order for both AUs - confirm against the helper.
    for (int ix = 0; ix < l1.size(); ix++) {
      Matcher m1 = pat.matcher(l1.get(ix));
      assertTrue(m1.matches());
      Matcher m2 = pat.matcher(l2.get(ix));
      assertTrue(m2.matches());
      assertEquals("www.example.com", m1.group(1));
      assertEquals("anotherhost.org", m2.group(1));
      // Only the host may differ; the paths must be identical.
      assertEquals(m1.group(2), m2.group(2));
    }
  }
 /**
  * Builds the article and TOC URL patterns for one AU from its configuration.
  *
  * <p>The journal abbreviation and volume number are read from the AU configuration and
  * embedded in the patterns as literals (via {@link Pattern#quote(String)}), so that any regex
  * metacharacter in a configured value cannot alter what the patterns match.
  *
  * @param au the archival unit whose configuration supplies the journal abbreviation, the
  *     volume number and (for sections) the {@code journal_section} value
  * @param spec the iterator spec, passed through to the superclass
  * @param isSection when true, the journal path matched is
  *     {@code <journal_abbr>/<journal_section>} rather than just {@code <journal_abbr>}
  */
 public BePressArticleIterator(
     ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) {
   super(au, spec);
   Configuration auConfig = au.getConfiguration();
   String volumeAsString = auConfig.get(ConfigParamDescr.VOLUME_NUMBER.getKey());
   String journalAbbr = auConfig.get(ConfigParamDescr.JOURNAL_ABBR.getKey());
   if (isSection) {
     journalAbbr = journalAbbr + "/" + auConfig.get("journal_section");
   }
   // Quote the configured values so they are matched literally inside the regexes; the
   // CASE_INSENSITIVE flag still applies to quoted text.
   String journalLiteral = Pattern.quote(journalAbbr);
   String volumeLiteral = Pattern.quote(volumeAsString);
   // Pick up issue level and lower. The (art)?[0-9]+ component is optional because a few AUs
   // have the article at issue level.
   this.pattern =
       Pattern.compile(
           String.format(
               "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$",
               journalLiteral, volumeLiteral, volumeLiteral),
           Pattern.CASE_INSENSITIVE);
   // Issue-level URLs only: these may be either a TOC or a single-article abstract.
   this.TOC_pattern =
       Pattern.compile(
           String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", journalLiteral, volumeLiteral),
           Pattern.CASE_INSENSITIVE);
 }
 /*
  * This is complicated. MOST AUs have articles that live below an issue level TOC
  * that is,
  * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata
  * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata
  * (eg Economist Voice V1)
  * BUT
  * in some AUs there are issues with only 1 article, in which case
  * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata
  * (eg Rhodes Cook V4)
  * and a few AUs with a mixture
  * (eg Forum for Health Economics V5)
  * So to identify ALL articles, we'll also have to capture issue level items and then look
  * at the html and if it has article metadata in it, count it as an article.
  *
  */
 /**
  * Decides whether the given cached URL is an article and, if so, builds its ArticleFiles.
  *
  * <p>A URL that matches the article pattern but not the TOC pattern is handed to
  * {@code processUrl} without further checks. A URL that also matches the TOC pattern is only
  * treated as an article when {@code hasArticleMetadata} reports true for it.
  *
  * @param cu the cached URL under consideration
  * @return the result of {@code processUrl} for an article; {@code null} for a TOC without
  *     article metadata or for a URL that does not match the article pattern
  */
 @Override
 protected ArticleFiles createArticleFiles(CachedUrl cu) {
   String url = cu.getUrl();
   Matcher articleMatch = pattern.matcher(url);

   // The iterator should only ever be handed URLs that the article pattern accepts.
   if (!articleMatch.find()) {
     log.warning("Mismatch between article iterator factory and article iterator: " + url);
     return null;
   }

   // An issue-level URL might be a plain TOC; it only counts as an article when the page
   // carries article metadata.
   boolean couldBeToc = TOC_pattern.matcher(url).find();
   if (couldBeToc && !hasArticleMetadata(cu)) {
     return null; // this was a TOC, not an article
   }

   // Either not a potential TOC, or an issue-level page that carries article metadata.
   return processUrl(cu, articleMatch);
 }