/**
 * Deletes the repository node that corresponds to the given cached URL.
 *
 * <p>A single-node CachedUrlSet is built for the URL within the URL's own
 * archival unit and handed to that unit's NodeManager for deletion.
 *
 * @param cu the cached URL whose node is to be deleted
 * @throws IOException declared by this method; presumably raised by the
 *     underlying node deletion (not visible from here)
 */
private void deleteBlock(CachedUrl cu) throws IOException {
  log.info("deleting " + cu.getUrl());
  CachedUrlSetSpec singleNodeSpec = new SingleNodeCachedUrlSetSpec(cu.getUrl());
  ArchivalUnit owningAu = cu.getArchivalUnit();
  CachedUrlSet singleNodeSet = owningAu.makeCachedUrlSet(singleNodeSpec);
  owningAu.getPlugin().getDaemon().getNodeManager(owningAu).deleteNode(singleNodeSet);
}
public void testFunctionalFromTarHierarchy() throws Exception { log.debug3("in testFromTarHierarchy"); // load the tarballs InputStream file_input = null; try { file_input = getResourceAsStream(realTARFile_A); // UrlCacher uc = au.makeUrlCacher(TAR_A_BASE); // uc.storeContent(file_input, tarHeader); UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE)); uc.storeContent(); IOUtil.safeClose(file_input); file_input = getResourceAsStream(realTARFile_B); // uc = au.makeUrlCacher(TAR_B_BASE); // uc.storeContent(file_input, tarHeader); uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE)); uc.storeContent(); IOUtil.safeClose(file_input); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { IOUtil.safeClose(file_input); } CachedUrlSet cus = tarAu.getAuCachedUrlSet(); for (CachedUrl cu : cus.getCuIterable()) { log.debug3("AU - cu is: " + cu.getUrl()); cu.release(); } // We need to start from the level of the ArticleMetadataExtractor MyListEmitter emitter = new MyListEmitter(); ArticleMetadataExtractor amEx = new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA); Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any()); while (it.hasNext()) { ArticleFiles af = it.next(); log.debug3("Metadata test - articlefiles " + af.toString()); // CachedUrl cu = af.getFullTextCu(); CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA); log.debug3("metadata cu is " + cu.getUrl()); // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu); amEx.extract(MetadataTarget.Any(), af, emitter); List<ArticleMetadata> returnList = emitter.getAmList(); assertNotNull(returnList); log.debug3("size of returnList is " + returnList.size()); Iterator<ArticleMetadata> mdIt = returnList.iterator(); ArticleMetadata mdRecord = null; while (mdIt.hasNext()) { mdRecord = (ArticleMetadata) mdIt.next(); validateCompleteMetadataRecord(mdRecord); } } }
public void testArticleCountAndType() throws Exception { int expCount = 28; PluginTestUtil.crawlSimAu(sau); String pat1 = "branch(\\d+)/(\\d+file\\.html)"; String rep1 = "aps/journal/v123/n$1/full/$2"; PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1); String pat2 = "branch(\\d+)/(\\d+file\\.pdf)"; String rep2 = "aps/journal/v123/n$1/pdf/$2"; PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2); // Remove some URLs int deleted = 0; for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) { CachedUrlSetNode cusn = (CachedUrlSetNode) it.next(); if (cusn instanceof CachedUrl) { CachedUrl cu = (CachedUrl) cusn; String url = cu.getUrl(); if (url.contains("/journal/") && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) { deleteBlock(cu); ++deleted; } } } assertEquals(8, deleted); Iterator<ArticleFiles> it = nau.getArticleIterator(); int count = 0; int countHtmlOnly = 0; int countPdfOnly = 0; while (it.hasNext()) { ArticleFiles af = it.next(); log.info(af.toString()); CachedUrl cu = af.getFullTextCu(); String url = cu.getUrl(); assertNotNull(cu); String contentType = cu.getContentType(); log.debug("count " + count + " url " + url + " " + contentType); count++; if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) { ++countHtmlOnly; } if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) { ++countPdfOnly; } } log.debug("Article count is " + count); assertEquals(expCount, count); assertEquals(4, countHtmlOnly); assertEquals(4, countPdfOnly); }
/**
 * Builds the ArticleFiles for a full-text PDF URL.
 *
 * <p>The PDF is always recorded under ROLE_FULL_TEXT_PDF. The full-text CU is
 * the sibling URL obtained by rewriting the match to "$1aspx" when that URL
 * has content; otherwise it is the PDF itself.
 *
 * @param cu the cached URL of the PDF
 * @param mat the matcher that matched the PDF URL
 * @return the populated ArticleFiles
 */
protected ArticleFiles processFullTextPdf(CachedUrl cu, Matcher mat) {
  ArticleFiles articleFiles = new ArticleFiles();
  articleFiles.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, cu);

  String aspxUrl = mat.replaceAll("$1aspx");
  CachedUrl aspxCu = cu.getArchivalUnit().makeCachedUrl(aspxUrl);
  articleFiles.setFullTextCu(aspxCu.hasContent() ? aspxCu : cu);

  log.debug3("returning full text: " + articleFiles.getFullTextUrl());
  return articleFiles;
}
/**
 * Extracts HTML meta-tag metadata from the cached URL, cooks it with the tag
 * map, and emits it.
 *
 * <p>The access URL taken from the metadata is kept only if it is non-empty
 * and the AU has content for it; otherwise the URL of {@code cu} is used.
 * The chosen URL is passed through the AU's http/https normalization before
 * being stored.
 *
 * @param target the metadata target passed to the underlying extractor
 * @param cu the cached URL to extract from
 * @param emitter receives the resulting metadata
 * @throws IOException if the underlying extraction fails
 */
@Override
public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter)
    throws IOException {
  ArticleMetadata metadata = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
  metadata.cook(tagMap);

  String declaredUrl = metadata.get(MetadataField.FIELD_ACCESS_URL);
  ArchivalUnit au = cu.getArchivalUnit();
  boolean declaredUrlUsable =
      declaredUrl != null
          && !declaredUrl.isEmpty()
          && au.makeCachedUrl(declaredUrl).hasContent();
  String accessUrl = declaredUrlUsable ? declaredUrl : cu.getUrl();

  metadata.replace(
      MetadataField.FIELD_ACCESS_URL,
      HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, accessUrl));
  emitter.emitMetadata(cu, metadata);
}
/**
 * Extracts metadata via the superclass, cooks it with the tag map, and makes
 * sure the access URL points at something the AU has content for.
 *
 * <p>If the extracted access URL is missing, empty, or has no content in the
 * AU, it is replaced with the URL of {@code cu}.
 *
 * @param target the metadata target passed to the superclass extractor
 * @param cu the cached URL to extract from
 * @return the cooked metadata
 * @throws IOException if the superclass extraction fails
 */
@Override
public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
  ArticleMetadata metadata = super.extract(target, cu);
  metadata.cook(tagMap);

  String declaredUrl = metadata.get(MetadataField.FIELD_ACCESS_URL);
  boolean declaredUrlUsable =
      declaredUrl != null
          && !declaredUrl.isEmpty()
          && cu.getArchivalUnit().makeCachedUrl(declaredUrl).hasContent();
  if (!declaredUrlUsable) {
    metadata.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
  }
  return metadata;
}
/**
 * Extracts metadata via the superclass and cooks it with the tag map.
 *
 * @param target the metadata target passed to the superclass extractor
 * @param cu the cached URL to extract from
 * @return the cooked metadata
 * @throws IOException if the superclass extraction fails
 */
@Override
public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
  log.debug3("Metadata - cachedurl cu:" + cu.getUrl());
  final ArticleMetadata cooked = super.extract(target, cu);
  cooked.cook(tagMap);
  return cooked;
}
/**
 * Creates the ArticleFiles for a cached URL that matches the abstract pattern.
 *
 * @param cu the cached URL offered by the iterator
 * @return the ArticleFiles built by processAbstract, or null (after logging a
 *     warning) when the URL does not match the abstract pattern
 */
@Override
protected ArticleFiles createArticleFiles(CachedUrl cu) {
  String url = cu.getUrl();
  Matcher abstractMatcher = ABSTRACT_PATTERN.matcher(url);
  if (!abstractMatcher.find()) {
    log.warning("Mismatch between article iterator factory and article iterator: " + url);
    return null;
  }
  return processAbstract(cu, abstractMatcher);
}
/**
 * Creates the ArticleFiles for a cached URL that matches the full-text PDF
 * pattern.
 *
 * @param cu the cached URL offered by the iterator
 * @return the ArticleFiles built by processFullTextPdf, or null (after logging
 *     a warning) when the URL does not match the pattern
 */
@Override
protected ArticleFiles createArticleFiles(CachedUrl cu) {
  String url = cu.getUrl();
  log.debug3("iterating url: " + url);
  Matcher pdfMatcher = PATTERN.matcher(url);
  if (!pdfMatcher.find()) {
    log.warning("Mismatch between article iterator factory and article iterator: " + url);
    return null;
  }
  return processFullTextPdf(cu, pdfMatcher);
}
/* * hasArticleMetadata(CachedUrl cu) * Given the CachedUrl for the potential abstract file, using the existing * SimpleHtmlMetaTagMetadataExtractor to parse the file and * retrieve any contained metadata. If a doi or author exists, it's an article * NOT defining the Metadata Extractor here! */ private boolean hasArticleMetadata(CachedUrl cu) { MetadataTarget at = new MetadataTarget(MetadataTarget.PURPOSE_ARTICLE); ArticleMetadata am; SimpleHtmlMetaTagMetadataExtractor ext = new SimpleHtmlMetaTagMetadataExtractor(); if (cu != null && cu.hasContent()) { try { at.setFormat("text/html"); am = ext.extract(at, cu); if ((am.containsRawKey("bepress_citation_journal_title")) || (am.containsRawKey("bepress_citation_abstract_html_url")) || (am.containsRawKey("bepress_citation_doi")) || (am.containsRawKey("bepress_citation_author"))) { return true; } } catch (IOException e) { e.printStackTrace(); } } return false; // no reasonable metadata, probably a toc }
/* * This is comlicated. MOST AUs have articles that live below and issue level TOC * that is, * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata * (eg Economist Voice V1) * BUT * in some AUs there are issues with only 1 article, in which case * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata * (eg Rhodes Cook V4) * and a few AUs with a mixture * (eg Forum for Health Economics V5) * So to identify ALL articles, we'll also have to capture issue level items and then look * at the html and if it has article metadata in it, count it as an article. * */ @Override protected ArticleFiles createArticleFiles(CachedUrl cu) { String url = cu.getUrl(); Matcher mat = pattern.matcher(url); if (mat.find()) { // we matched, but could this pattern potentially be a toc? Matcher tocmat = TOC_pattern.matcher(url); // if we could be a TOC then we must have metadata to be considered an article if (tocmat.find()) { if (hasArticleMetadata(cu)) { return processUrl(cu, mat); } } else { // we're not a potential TOC, so treat this as an article without checking return processUrl(cu, mat); } return null; // this was a TOC, not an article } log.warning("Mismatch between article iterator factory and article iterator: " + url); return null; }
/* * In order to find full text PDF you need to find the citation_pdf_url meta tag in the * abstract html pull out the pdf url normalize it (reorder params...) and find the matching * cached URL */ protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) { NodeList nl = null; ArticleFiles af = new ArticleFiles(); if (absCu != null && absCu.hasContent()) { // TEMPORARY: set absCU as default full text CU in case there is // no PDF CU with content; the current metadata manager currently // uses only the full text CU, but this will change with the new // metadata schema that can have multiple CUs for an article. af.setFullTextCu(absCu); af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu); try { InputStreamSource is = new InputStreamSource(new Stream(absCu.getUnfilteredInputStream())); Page pg = new Page(is); Lexer lx = new Lexer(pg); Parser parser = new Parser(lx); Lexer.STRICT_REMARKS = false; NodeFilter nf = new NodeFilter() { public boolean accept(Node node) { if (!(node instanceof MetaTag)) return false; MetaTag meta = (MetaTag) node; if (!"citation_pdf_url".equalsIgnoreCase(meta.getMetaTagName())) return false; return true; } }; nl = parser.extractAllNodesThatMatch(nf); } catch (ParserException e) { log.debug("Unable to parse abstract page html", e); } catch (UnsupportedEncodingException e) { log.debug("Bad encoding in abstact page html", e); } finally { absCu.release(); } } try { if (nl != null) { if (nl.size() > 0) { // minimally encode URL to prevent URL constructor // from stripping trailing spaces String pdfUrlStr = ((MetaTag) nl.elementAt(0)).getMetaContent(); URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(pdfUrlStr)); List<String> paramList = new ArrayList<String>(); paramList.add("fileType"); paramList.add("fileId"); paramList.add("fileName"); pdfUrl = reArrangeUrlParams(pdfUrl, paramList); if (!pdfUrl.getHost().startsWith("www.")) { pdfUrl = new URL(pdfUrl.getProtocol(), "www." 
+ pdfUrl.getHost(), pdfUrl.getFile()); } // note: must leave URL encoded because that's how we store URLs CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString()); if (pdfCu != null && pdfCu.hasContent()) { // replace absCU with pdfCU if exists and has content af.setFullTextCu(pdfCu); af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu); } } } } catch (MalformedURLException e) { log.debug("Badly formatted pdf url link", e); } catch (IllegalArgumentException e) { log.debug("Badly formatted pdf url link", e); } return af; }