/**
 * Builds the {@code ArticleFiles} for a URL selected by the article iterator.
 *
 * <p>The URL of {@code cu} is matched against {@code ABSTRACT_PATTERN}; on a
 * match the work is delegated to {@code processAbstract}. A URL that does not
 * match is logged as a factory/iterator mismatch.
 *
 * @param cu the cached URL handed over by the iterator
 * @return the result of {@code processAbstract}, or {@code null} when the URL
 *         does not match {@code ABSTRACT_PATTERN}
 */
@Override
protected ArticleFiles createArticleFiles(CachedUrl cu) {
  final String articleUrl = cu.getUrl();
  final Matcher abstractMatcher = ABSTRACT_PATTERN.matcher(articleUrl);

  // Guard: the iterator gave us something the pattern does not recognise.
  if (!abstractMatcher.find()) {
    log.warning("Mismatch between article iterator factory and article iterator: " + articleUrl);
    return null;
  }

  return processAbstract(cu, abstractMatcher);
}
/**
 * Builds the {@code ArticleFiles} for a URL selected by the article iterator.
 *
 * <p>The URL of {@code cu} is traced at debug3 level and then matched against
 * {@code PATTERN}; on a match the work is delegated to
 * {@code processFullTextPdf}. A URL that does not match is logged as a
 * factory/iterator mismatch.
 *
 * @param cu the cached URL handed over by the iterator
 * @return the result of {@code processFullTextPdf}, or {@code null} when the
 *         URL does not match {@code PATTERN}
 */
@Override
protected ArticleFiles createArticleFiles(CachedUrl cu) {
  final String pdfUrl = cu.getUrl();
  log.debug3("iterating url: " + pdfUrl);

  final Matcher pdfMatcher = PATTERN.matcher(pdfUrl);

  // Guard: the iterator gave us something the pattern does not recognise.
  if (!pdfMatcher.find()) {
    log.warning("Mismatch between article iterator factory and article iterator: " + pdfUrl);
    return null;
  }

  return processFullTextPdf(cu, pdfMatcher);
}
/* * This is comlicated. MOST AUs have articles that live below and issue level TOC * that is, * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata * (eg Economist Voice V1) * BUT * in some AUs there are issues with only 1 article, in which case * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata * (eg Rhodes Cook V4) * and a few AUs with a mixture * (eg Forum for Health Economics V5) * So to identify ALL articles, we'll also have to capture issue level items and then look * at the html and if it has article metadata in it, count it as an article. * */ @Override protected ArticleFiles createArticleFiles(CachedUrl cu) { String url = cu.getUrl(); Matcher mat = pattern.matcher(url); if (mat.find()) { // we matched, but could this pattern potentially be a toc? Matcher tocmat = TOC_pattern.matcher(url); // if we could be a TOC then we must have metadata to be considered an article if (tocmat.find()) { if (hasArticleMetadata(cu)) { return processUrl(cu, mat); } } else { // we're not a potential TOC, so treat this as an article without checking return processUrl(cu, mat); } return null; // this was a TOC, not an article } log.warning("Mismatch between article iterator factory and article iterator: " + url); return null; }
/**
 * Creates the {@code ArticleFiles} for a full-text PDF URL.
 *
 * <p>{@code cu} is always registered under {@code ROLE_FULL_TEXT_PDF}. A
 * companion URL (called the abstract URL here) is derived by replacing every
 * match of {@code mat} with its first capture group followed by {@code aspx}.
 * If that companion URL has content in the archival unit it becomes the
 * full-text CU; otherwise {@code cu} itself is used.
 *
 * @param cu  the cached URL of the PDF
 * @param mat a matcher over the URL of {@code cu}; it must expose capture group 1
 * @return the populated {@code ArticleFiles}
 */
protected ArticleFiles processFullTextPdf(CachedUrl cu, Matcher mat) {
  final ArticleFiles articleFiles = new ArticleFiles();
  articleFiles.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, cu);

  // Derive the companion ".aspx" URL from the matched PDF URL.
  final String abstractUrl = mat.replaceAll("$1aspx");
  final CachedUrl abstractCu = cu.getArchivalUnit().makeCachedUrl(abstractUrl);

  // Prefer the companion page when it has content; fall back to the PDF.
  articleFiles.setFullTextCu(abstractCu.hasContent() ? abstractCu : cu);

  log.debug3("returning full text: " + articleFiles.getFullTextUrl());
  return articleFiles;
}