public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      String url = cu.getUrl();
      assertNotNull(cu);
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }
 private void deleteBlock(CachedUrl cu) throws IOException {
   log.info("deleting " + cu.getUrl());
   CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
   ArchivalUnit au = cu.getArchivalUnit();
   CachedUrlSet cus = au.makeCachedUrlSet(cuss);
   NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
   nm.deleteNode(cus);
 }
public class TestElsevierXmlLinkExtractorFactory extends LinkExtractorTestCase {

  private static Logger logger = Logger.getLogger("TestElsevierXmlLinkExtractorFactory");

  String srcUrl = "http://www.example.com/";

  private static final String withLinks =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
          + "<!DOCTYPE dataset SYSTEM \"http://support.sciencedirect.com/xml/sdosftp10.dtd\">\n"
          + "<dataset identifier=\"OXM10160\" customer=\"OHL\""
          + " status=\"Announcement\""
          + " version=\"Network Dataset Announcement/Confirmation v1.0\">"
          + " <date year=\"2007\" month=\"May\" day=\"1\"/>\n"
          + "<file name=\"01407007.tar\" size=\"21780480\""
          + " md5=\"6c7266e0e246bf3e8cf1cd8b659a7a73\"/>\n"
          + "<file name=\"03064530.tar\" size=\"12748800\""
          + " md5=\"df9519d3075e164d22f5dd4988a693c3\"/>\n"
          + "<file name=\"dataset.toc\" size=\"2216587\""
          + " md5=\"cd21741eb91fa0fdfef2fa36485e21a0\"/>\n"
          + "</dataset>\n";

  private static final String withoutLinks =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
          + "<!DOCTYPE dataset SYSTEM \"http://support.sciencedirect.com/xml/sdosftp10.dtd\">\n"
          + "<dataset identifier=\"OXM10160\" customer=\"OHL\""
          + " status=\"Announcement\""
          + " version=\"Network Dataset Announcement/Confirmation v1.0\">"
          + " <date year=\"2007\" month=\"May\" day=\"1\"/>\n"
          + "</dataset>\n";

  private static final String[] links = {
    "01407007.tar", "03064530.tar", "dataset.toc",
  };

  public String getMimeType() {
    return "text/xml";
  }

  public LinkExtractorFactory getFactory() {
    return new ElsevierXmlLinkExtractorFactory();
  }

  public void testFindCorrectEntries() throws Exception {
    Set expected = new HashSet();
    for (String link : links) {
      expected.add(srcUrl + link);
    }
    assertEquals(expected, extractUrls(withLinks));
  }

  public void testFindNoEntries() throws Exception {
    assertEmpty(extractUrls(withoutLinks));
  }
}
public class TestNatureArticleIteratorFactory extends LockssTestCase {
  static Logger log = Logger.getLogger("TestNatureArticleIteratorFactory");

  private SimulatedArchivalUnit sau; // Simulated AU to generate content
  private ArchivalUnit nau; // Nature AU
  private MockLockssDaemon theDaemon;
  private static final int DEFAULT_FILESIZE = 3000;
  private static int fileSize = DEFAULT_FILESIZE;

  private static String PLUGIN_NAME = "org.lockss.plugin.nature.ClockssNaturePublishingGroupPlugin";

  private static String BASE_URL = "http://www.nature.com/";

  public void setUp() throws Exception {
    super.setUp();
    String tempDirPath = getTempDir().getAbsolutePath() + File.separator;
    ConfigurationUtil.setFromArgs(LockssRepositoryImpl.PARAM_CACHE_LOCATION, tempDirPath);
    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath));
    nau = PluginTestUtil.createAndStartAu(PLUGIN_NAME, natureAuConfig());
  }

  public void tearDown() throws Exception {
    sau.deleteContentTree();
    theDaemon.stopDaemon();
    super.tearDown();
  }

  Configuration simAuConfig(String rootPath) {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("root", rootPath);
    conf.put("base_url", BASE_URL);
    conf.put("depth", "1");
    conf.put("branch", "4");
    conf.put("numFiles", "7");
    conf.put(
        "fileTypes",
        "" + (SimulatedContentGenerator.FILE_TYPE_HTML | SimulatedContentGenerator.FILE_TYPE_PDF));
    conf.put("binFileSize", "" + fileSize);
    return conf;
  }

  Configuration natureAuConfig() {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("base_url", BASE_URL);
    conf.put("journal_id", "aps");
    conf.put("volume_name", "123");
    conf.put("year", "2008");
    return conf;
  }

  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      String url = cu.getUrl();
      assertNotNull(cu);
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }

  //  public void testArticleCountAndDefaultType() throws Exception {
  //    testArticleCountAndType("text/html", true, 24);
  //  }
  //
  //  public void testArticleCountAndPdf() throws Exception {
  //    testArticleCountAndType("application/pdf", false, 0);
  //  }

  private void deleteBlock(CachedUrl cu) throws IOException {
    log.info("deleting " + cu.getUrl());
    CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
    ArchivalUnit au = cu.getArchivalUnit();
    CachedUrlSet cus = au.makeCachedUrlSet(cuss);
    NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
    nm.deleteNode(cus);
  }
}
public class TestHighWireArticleIteratorFactory extends ArticleIteratorTestCase {
  static Logger log = Logger.getLogger(TestHighWireArticleIteratorFactory.class);

  private SimulatedArchivalUnit sau; // Simulated AU to generate content

  private static String PLUGIN_NAME = "org.lockss.plugin.highwire.HighWirePressPlugin";

  private static String BASE_URL = "http://pediatrics.aappublications.org/";
  private static String SIM_ROOT = BASE_URL + "cgi/reprint/";

  public void setUp() throws Exception {
    super.setUp();
    String tempDirPath = setUpDiskSpace();
    au = createAu();
    sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath));
  }

  public void tearDown() throws Exception {
    sau.deleteContentTree();
    // theDaemon.stopDaemon();
    super.tearDown();
  }

  Configuration simAuConfig(String rootPath) {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("root", rootPath);
    conf.put("base_url", SIM_ROOT);
    conf.put("depth", "0");
    conf.put("branch", "0");
    conf.put("numFiles", "2");
    conf.put(
        "fileTypes",
        "" + (SimulatedContentGenerator.FILE_TYPE_PDF | SimulatedContentGenerator.FILE_TYPE_HTML));
    conf.put("binFileSize", "7");
    return conf;
  }

  protected ArchivalUnit createAu() throws ArchivalUnit.ConfigurationException {
    return PluginTestUtil.createAndStartAu(
        PLUGIN_NAME,
        ConfigurationUtil.fromArgs(
            "base_url",
            "http://pediatrics.aappublications.org/",
            "volume_name",
            "52",
            "journal_issn",
            "1098-4275"));
  }

  public void testRoots() throws Exception {
    SubTreeArticleIterator artIter = createSubTreeIter();
    System.out.println("Root Urls::" + getRootUrls(artIter));
    assertEquals(
        ListUtil.list(
            "http://pediatrics.aappublications.org/cgi/content/full/52/",
            "http://pediatrics.aappublications.org/cgi/reprint/52/"),
        getRootUrls(artIter));
  }

  public void testUrlsWithPrefixes() throws Exception {
    SubTreeArticleIterator artIter = createSubTreeIter();
    Pattern pat = getPattern(artIter);
    assertMatchesRE(
        pat, "http://pediatrics.aappublications.org/cgi/reprint/foo;52/Supplement_3/S69.pdf");
    assertMatchesRE(
        pat, "http://pediatrics.aappublications.org/cgi/reprint/52/supplement_3/S69.pdf");
    assertNotMatchesRE(
        pat, "http://pediatrics.aappublications.org/cgi/reprin/1014174823t49006/j0143.pdfwrong");
    assertNotMatchesRE(
        pat, "http://pediatrics.aappublications.org/cgi/reprintt/1014174823t49006/j0143.pdfwrong");
    assertNotMatchesRE(pat, "http://www.example.com/content/");
    assertNotMatchesRE(pat, "http://www.example.com/content/j");
    assertNotMatchesRE(pat, "http://www.example.com/content/j0123/j383.pdfwrong");
  }

  public void testCreateArticleFiles() throws Exception {
    PluginTestUtil.crawlSimAu(sau);

    String pat0 = "001file[.]html";
    String rep0 = "52/1/S1";
    PluginTestUtil.copyAu(sau, au, ".*[.]html$", pat0, rep0);
    String pat1 = "001file[.]pdf";
    String rep1 = "52/1/S1.pdf";
    PluginTestUtil.copyAu(sau, au, ".*[.]pdf$", pat1, rep1);

    String pdfurl = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1.pdf";
    String url = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1";

    au.makeCachedUrl(url);
    CachedUrl cu = au.makeCachedUrl(pdfurl);
    assertNotNull(cu);
    SubTreeArticleIterator artIter = createSubTreeIter();
    assertNotNull(artIter);
    ArticleFiles af = artIter.next();
    assertNotNull(af);
    System.out.println("article files::" + af);
    assertEquals(url, af.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE).getUrl());
    assertEquals(pdfurl, af.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF).getUrl());
  }
}