public void testArticleCountAndType() throws Exception { int expCount = 28; PluginTestUtil.crawlSimAu(sau); String pat1 = "branch(\\d+)/(\\d+file\\.html)"; String rep1 = "aps/journal/v123/n$1/full/$2"; PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1); String pat2 = "branch(\\d+)/(\\d+file\\.pdf)"; String rep2 = "aps/journal/v123/n$1/pdf/$2"; PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2); // Remove some URLs int deleted = 0; for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) { CachedUrlSetNode cusn = (CachedUrlSetNode) it.next(); if (cusn instanceof CachedUrl) { CachedUrl cu = (CachedUrl) cusn; String url = cu.getUrl(); if (url.contains("/journal/") && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) { deleteBlock(cu); ++deleted; } } } assertEquals(8, deleted); Iterator<ArticleFiles> it = nau.getArticleIterator(); int count = 0; int countHtmlOnly = 0; int countPdfOnly = 0; while (it.hasNext()) { ArticleFiles af = it.next(); log.info(af.toString()); CachedUrl cu = af.getFullTextCu(); String url = cu.getUrl(); assertNotNull(cu); String contentType = cu.getContentType(); log.debug("count " + count + " url " + url + " " + contentType); count++; if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) { ++countHtmlOnly; } if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) { ++countPdfOnly; } } log.debug("Article count is " + count); assertEquals(expCount, count); assertEquals(4, countHtmlOnly); assertEquals(4, countPdfOnly); }
public void testCreateArticleFiles() throws Exception { // create urls to store in UrlCacher String[] au_urls = { BASE_URL + "pc/doifinder/10.1057/9780123456789", BASE_URL + "pc/browse/citationExport?doi=10.1057/9780123456789", BASE_URL + "pc/doifinder/download/10.1057/9780123456789", BASE_URL + "pc/doifinder/download/10.1057/9780123456789.epub", BASE_URL + "pc/doifinder/10.1057/9781234567890", BASE_URL + "pc/browse/citationExport?doi=10.1057/9781234567890", BASE_URL + "pc/doifinder/download/10.1057/9781234567890", BASE_URL + "pc/doifinder/download/10.1057/9781234567890.epub", BASE_URL + "pc/doifinder/10.1057/9782345678901", BASE_URL + "pc/browse/citationExport?doi=10.1057/9782345678901", BASE_URL + "pc/doifinder/download/10.1057/9782345678901", BASE_URL + "pc/doifinder/download/10.1057/9782345678901.epub" }; /* // get cached url content type and properties from simulated contents // for UrclCacher.storeContent() CachedUrl cuPdf = null; CachedUrl cuHtml = null; CachedUrl cuEpub = null; for (CachedUrl cu : AuUtil.getCuIterable(sau)) { if (cuPdf == null && cu.getContentType().toLowerCase().startsWith(Constants.MIME_TYPE_PDF)) { log.info("pdf contenttype: " + cu.getContentType()); cuPdf = cu; } else if (cuHtml == null && cu.getContentType().toLowerCase().startsWith(Constants.MIME_TYPE_HTML)) { log.info("html contenttype: " + cu.getContentType()); cuHtml = cu; } else if (cuEpub == null && cu.getContentType().toLowerCase().startsWith("application/epub")) { log.info("epub contenttype: " + cu.getContentType()); cuEpub = cu; } if (cuPdf != null && cuHtml != null && cuEpub != null) { break; } } */ CachedUrl cu; // store content using cached url content type and properties for (String url : au_urls) { if (url.contains("download") && !url.endsWith(".epub")) { storeContent(random_content_stream, pdfHeader, url); } else if (url.contains("download")) { // epub storeContent(random_content_stream, epubHeader, url); } else { storeContent(random_content_stream, textHeader, url); } } // book 9780123456789 ArticleFiles af1 = new ArticleFiles(); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9780123456789"); af1.setRoleString( ArticleFiles.ROLE_ARTICLE_METADATA, BASE_URL + "pc/browse/citationExport?doi=10.1057/9780123456789"); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_EPUB, BASE_URL + "pc/doifinder/download/10.1057/9780123456789.epub"); // book 9780123456789 ArticleFiles af2 = new ArticleFiles(); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9781234567890"); af1.setRoleString( ArticleFiles.ROLE_ARTICLE_METADATA, BASE_URL + "pc/browse/citationExport?doi=10.1057/9781234567890"); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_EPUB, BASE_URL + "pc/doifinder/download/10.1057/9781234567890.epub"); // book 9780123456789 ArticleFiles af3 = new ArticleFiles(); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9782345678901"); af1.setRoleString( ArticleFiles.ROLE_ARTICLE_METADATA, BASE_URL + "pc/browse/citationExport?doi=10.1057/9782345678901"); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_EPUB, BASE_URL + "pc/doifinder/download/10.1057/9782345678901.epub"); // key the expected content to the fullTextUrl for the ArticleFiles HashMap<String, ArticleFiles> fullUrlToAF = new HashMap<String, ArticleFiles>(); fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9780123456789", af1); fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9781234567890", af2); fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9782345678901", af3); // get article iterator, get article files and the appropriate urls according // to their roles. String[] expectedUrls = { EXPECTED_FULL_TEXT_URL, EXPECTED_PDF_URL, }; for (SubTreeArticleIterator artIter = createSubTreeIter(); artIter.hasNext(); ) { ArticleFiles af = artIter.next(); String[] actualUrls = { af.getFullTextUrl(), af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF), // af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE) }; log.info("actualUrls: " + actualUrls.length); for (int i = 0; i < actualUrls.length; i++) { log.info("e_url: " + expectedUrls[i]); log.info("url: " + actualUrls[i]); // assertEquals(expectedUrls[i], actualUrls[i]); } } }