/* Remove a single cached node from the AU; used below to delete selected URLs. */
private void deleteBlock(CachedUrl cu) throws IOException {
   log.info("deleting " + cu.getUrl());
   CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
   ArchivalUnit au = cu.getArchivalUnit();
   CachedUrlSet cus = au.makeCachedUrlSet(cuss);
   NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
   nm.deleteNode(cus);
 }
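
  /*
   * Functional test: store two real tar files (realTARFile_A and realTARFile_B,
   * test resources defined elsewhere in this class) into the tar AU, then run
   * the deferred Elsevier ArticleMetadataExtractor over every ArticleFiles the
   * AU yields and validate each emitted metadata record.
   */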
  public void testFunctionalFromTarHierarchy() throws Exception {
    log.debug3("in testFromTarHierarchy");
    // load the tarballs
    InputStream file_input = null;
    try {
      file_input = getResourceAsStream(realTARFile_A);
      UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

      file_input = getResourceAsStream(realTARFile_B);
      uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

    } catch (IOException e) {
      // The stored tar content is a precondition for the rest of the test,
      // so fail loudly instead of just logging the stack trace.
      fail("Unable to store tar file content: " + e);
    } finally {
      IOUtil.safeClose(file_input);
    }

    CachedUrlSet cus = tarAu.getAuCachedUrlSet();
    for (CachedUrl cu : cus.getCuIterable()) {
      log.debug3("AU - cu is: " + cu.getUrl());
      cu.release();
    }

    // We need to start from the level of the ArticleMetadataExtractor
    MyListEmitter emitter = new MyListEmitter();
    ArticleMetadataExtractor amEx =
        new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

    Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.debug3("Metadata test - articlefiles " + af.toString());
      // CachedUrl cu = af.getFullTextCu();
      CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
      log.debug3("metadata cu is " + cu.getUrl());
      // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu);
      amEx.extract(MetadataTarget.Any(), af, emitter);
      List<ArticleMetadata> returnList = emitter.getAmList();

      assertNotNull(returnList);
      log.debug3("size of returnList is " + returnList.size());
      for (ArticleMetadata mdRecord : returnList) {
        validateCompleteMetadataRecord(mdRecord);
      }
    }
  }
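
  /*
   * The loop above collects results through MyListEmitter, which is defined
   * elsewhere in this test class. A minimal sketch, assuming it implements
   * LOCKSS's ArticleMetadataExtractor.Emitter interface (an assumption, not
   * the verbatim class), would be:
   *
   *   class MyListEmitter implements ArticleMetadataExtractor.Emitter {
   *     private final List<ArticleMetadata> amList = new ArrayList<ArticleMetadata>();
   *
   *     // Collect every non-null record so the test can inspect it later.
   *     public void emitMetadata(ArticleFiles af, ArticleMetadata md) {
   *       if (md != null) {
   *         amList.add(md);
   *       }
   *     }
   *
   *     public List<ArticleMetadata> getAmList() {
   *       return amList;
   *     }
   *   }
   */

  /*
   * Crawl a simulated AU (sau), copy its files into a journal-shaped AU (nau),
   * delete a few of the HTML and PDF files, and check that the article
   * iterator still finds the expected number of articles and correctly
   * classifies the HTML-only and PDF-only ones.
   */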
  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs so that some articles end up HTML-only and others PDF-only
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      assertNotNull(cu);
      String url = cu.getUrl();
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      // No PDF role at all: this article is HTML-only.
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      // The full-text CU is the PDF, so count the article as PDF-only.
      // (Compare with equals(), not ==; these are Strings.)
      if (url.equals(af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF))) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }
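
 /*
  * Extract and cook the HTML meta tags, then make sure access_url points at
  * content that actually exists in this AU: fall back to the metadata CU's own
  * URL when the cooked value is missing or has no content, and normalize
  * http/https against the AU's base_url before emitting.
  */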
 @Override
 public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException {
   ArticleMetadata am = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
   am.cook(tagMap);
   String url = am.get(MetadataField.FIELD_ACCESS_URL);
   ArchivalUnit au = cu.getArchivalUnit();
   if (url == null || url.isEmpty() || !au.makeCachedUrl(url).hasContent()) {
     url = cu.getUrl();
   }
   am.replace(
       MetadataField.FIELD_ACCESS_URL,
       HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, url));
   emitter.emitMetadata(cu, am);
 }
  /* private support methods */
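
  /*
   * Store the given content string in the AU at the given URL (as HTML or RIS,
   * depending on isHtmlExtractor), run the matching Atypon FileMetadataExtractor
   * over the resulting CachedUrl, and return the list of extracted
   * ArticleMetadata records.
   */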
  private List<ArticleMetadata> setupContentForAU(
      ArchivalUnit au, String url, String content, boolean isHtmlExtractor)
      throws IOException, PluginException {
    FileMetadataExtractor me;
    CIProperties props;

    InputStream input = IOUtils.toInputStream(content, "utf-8");
    if (isHtmlExtractor) {
      props = getContentHtmlProperties();
      me = new BaseAtyponHtmlMetadataExtractorFactory()
               .createFileMetadataExtractor(MetadataTarget.Any(), "text/html");
    } else {
      props = getContentRisProperties();
      me = new BaseAtyponRisMetadataExtractorFactory()
               .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain");
    }
    UrlData ud = new UrlData(input, props, url);
    UrlCacher uc = au.makeUrlCacher(ud);
    uc.storeContent();
    CachedUrl cu = uc.getCachedUrl();
    FileMetadataListExtractor mle = new FileMetadataListExtractor(me);
    return mle.extract(MetadataTarget.Any(), cu);
  }
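
 /*
  * Build the article and TOC URL patterns from the AU's journal abbreviation
  * and volume. With a hypothetical journal_abbr of "jid" and volume "3", the
  * article pattern matches paths such as "/jid/vol3/iss2/art4",
  * "/jid/vol3/iss2" (article at the issue level), "/jid/special/vol3/iss2/art1"
  * (non-numeric section prefix), or "/jid/vol3/E4"; the TOC pattern matches
  * issue tables of contents such as "/jid/vol3/iss2".
  */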
 public BePressArticleIterator(
     ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) {
   super(au, spec);
   String volumeAsString = au.getConfiguration().get(ConfigParamDescr.VOLUME_NUMBER.getKey());
   String journalAbbr = au.getConfiguration().get(ConfigParamDescr.JOURNAL_ABBR.getKey());
   if (isSection) {
     journalAbbr = journalAbbr + "/" + au.getConfiguration().get("journal_section");
   }
    // Pick up the issue level and below; the trailing (art)?[0-9]+ segment is
    // optional because a few AUs put the article at the issue level.
   this.pattern =
       Pattern.compile(
           String.format(
               "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$",
               journalAbbr, volumeAsString, volumeAsString),
           Pattern.CASE_INSENSITIVE);
   this.TOC_pattern =
       Pattern.compile(
           String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", journalAbbr, volumeAsString),
           Pattern.CASE_INSENSITIVE);
 }
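
  /*
   * Bring up the mock LOCKSS daemon with disk space and a started plugin
   * manager, load the titledb configuration, and create and start the AU under
   * test (bau1) from its TdbAu entry.
   */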
  public void setUp() throws Exception {
    super.setUp();
    setUpDiskSpace(); // you need this to have startService work properly...

    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    // The source file in this directory is "test_baseatypon.tdb"; it is
    // converted to XML and loaded here as "test_baseatypon.xml".
    ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml"));
    Tdb tdb = ConfigManager.getCurrentConfig().getTdb();

    List<TdbAu> tdbAus = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume);
    assertFalse("Didn't find named TdbAu", tdbAus.isEmpty());
    TdbAu tdbau1 = tdbAus.get(0);
    bau1 = PluginTestUtil.createAndStartAu(tdbau1);
    assertNotNull(bau1);
    TypedEntryMap auConfig = bau1.getProperties();
    assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY));
  }