protected static class AshdinArticleIterator extends SubTreeArticleIterator { protected static Pattern PATTERN = Pattern.compile("(/journals/[^/]+/[^/]+\\.)(pdf)$", Pattern.CASE_INSENSITIVE); public AshdinArticleIterator(ArchivalUnit au, SubTreeArticleIterator.Spec spec) { super(au, spec); } @Override protected ArticleFiles createArticleFiles(CachedUrl cu) { String url = cu.getUrl(); log.debug3("iterating url: " + url); Matcher mat = PATTERN.matcher(url); if (mat.find()) { return processFullTextPdf(cu, mat); } log.warning("Mismatch between article iterator factory and article iterator: " + url); return null; } protected ArticleFiles processFullTextPdf(CachedUrl cu, Matcher mat) { ArticleFiles af = new ArticleFiles(); af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, cu); String abstractUrl = mat.replaceAll("$1aspx"); CachedUrl abstractCu = cu.getArchivalUnit().makeCachedUrl(abstractUrl); if (abstractCu.hasContent()) af.setFullTextCu(abstractCu); else af.setFullTextCu(cu); log.debug3("returning full text: " + af.getFullTextUrl()); return af; } }
/**
 * Dispatches each abstract-page URL to processAbstract(); a URL that reaches
 * this iterator without matching ABSTRACT_PATTERN indicates the iterator
 * factory and this iterator disagree, so it is logged and skipped.
 */
@Override
protected ArticleFiles createArticleFiles(CachedUrl cu) {
  final String candidate = cu.getUrl();
  Matcher abstractMatch = ABSTRACT_PATTERN.matcher(candidate);
  if (!abstractMatch.find()) {
    log.warning("Mismatch between article iterator factory and article iterator: " + candidate);
    return null;
  }
  return processAbstract(cu, abstractMatch);
}
/**
 * Dispatches each full-text-PDF URL to processFullTextPdf(); a URL that does
 * not match PATTERN indicates the iterator factory and this iterator
 * disagree, so it is logged and skipped.
 */
@Override
protected ArticleFiles createArticleFiles(CachedUrl cu) {
  final String candidate = cu.getUrl();
  log.debug3("iterating url: " + candidate);
  Matcher pdfMatch = PATTERN.matcher(candidate);
  if (!pdfMatch.find()) {
    log.warning("Mismatch between article iterator factory and article iterator: " + candidate);
    return null;
  }
  return processFullTextPdf(cu, pdfMatch);
}
/**
 * Builds the article and TOC URL patterns for one BePress AU from its
 * configured journal abbreviation and volume. For section AUs the
 * journal_section parameter is appended to the abbreviation first. The
 * article pattern matches issue level and below — "(art)?[0-9]+" is optional
 * because a few AUs publish the article directly at the issue level — plus a
 * second "vol<vol>/<Letter><digits>" alternative.
 */
public BePressArticleIterator(
    ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) {
  super(au, spec);
  String vol = au.getConfiguration().get(ConfigParamDescr.VOLUME_NUMBER.getKey());
  String abbr = au.getConfiguration().get(ConfigParamDescr.JOURNAL_ABBR.getKey());
  if (isSection) {
    abbr = abbr + "/" + au.getConfiguration().get("journal_section");
  }
  String articleRegex =
      String.format(
          "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$",
          abbr, vol, vol);
  this.pattern = Pattern.compile(articleRegex, Pattern.CASE_INSENSITIVE);
  String tocRegex =
      String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", abbr, vol);
  this.TOC_pattern = Pattern.compile(tocRegex, Pattern.CASE_INSENSITIVE);
}
/* * This is comlicated. MOST AUs have articles that live below and issue level TOC * that is, * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata * (eg Economist Voice V1) * BUT * in some AUs there are issues with only 1 article, in which case * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata * (eg Rhodes Cook V4) * and a few AUs with a mixture * (eg Forum for Health Economics V5) * So to identify ALL articles, we'll also have to capture issue level items and then look * at the html and if it has article metadata in it, count it as an article. * */ @Override protected ArticleFiles createArticleFiles(CachedUrl cu) { String url = cu.getUrl(); Matcher mat = pattern.matcher(url); if (mat.find()) { // we matched, but could this pattern potentially be a toc? Matcher tocmat = TOC_pattern.matcher(url); // if we could be a TOC then we must have metadata to be considered an article if (tocmat.find()) { if (hasArticleMetadata(cu)) { return processUrl(cu, mat); } } else { // we're not a potential TOC, so treat this as an article without checking return processUrl(cu, mat); } return null; // this was a TOC, not an article } log.warning("Mismatch between article iterator factory and article iterator: " + url); return null; }
protected static class MaffeyArticleIterator extends SubTreeArticleIterator { protected Pattern ABSTRACT_PATTERN = Pattern.compile("([^/]+)-article-a[0-9]+$", Pattern.CASE_INSENSITIVE); public MaffeyArticleIterator(ArchivalUnit au, SubTreeArticleIterator.Spec spec) { super(au, spec); } @Override protected ArticleFiles createArticleFiles(CachedUrl cu) { String url = cu.getUrl(); Matcher mat = ABSTRACT_PATTERN.matcher(url); if (mat.find()) { return processAbstract(cu, mat); } log.warning("Mismatch between article iterator factory and article iterator: " + url); return null; } /* * In order to find full text PDF you need to find the citation_pdf_url meta tag in the * abstract html pull out the pdf url normalize it (reorder params...) and find the matching * cached URL */ protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) { NodeList nl = null; ArticleFiles af = new ArticleFiles(); if (absCu != null && absCu.hasContent()) { // TEMPORARY: set absCU as default full text CU in case there is // no PDF CU with content; the current metadata manager currently // uses only the full text CU, but this will change with the new // metadata schema that can have multiple CUs for an article. 
af.setFullTextCu(absCu); af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu); try { InputStreamSource is = new InputStreamSource(new Stream(absCu.getUnfilteredInputStream())); Page pg = new Page(is); Lexer lx = new Lexer(pg); Parser parser = new Parser(lx); Lexer.STRICT_REMARKS = false; NodeFilter nf = new NodeFilter() { public boolean accept(Node node) { if (!(node instanceof MetaTag)) return false; MetaTag meta = (MetaTag) node; if (!"citation_pdf_url".equalsIgnoreCase(meta.getMetaTagName())) return false; return true; } }; nl = parser.extractAllNodesThatMatch(nf); } catch (ParserException e) { log.debug("Unable to parse abstract page html", e); } catch (UnsupportedEncodingException e) { log.debug("Bad encoding in abstact page html", e); } finally { absCu.release(); } } try { if (nl != null) { if (nl.size() > 0) { // minimally encode URL to prevent URL constructor // from stripping trailing spaces String pdfUrlStr = ((MetaTag) nl.elementAt(0)).getMetaContent(); URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(pdfUrlStr)); List<String> paramList = new ArrayList<String>(); paramList.add("fileType"); paramList.add("fileId"); paramList.add("fileName"); pdfUrl = reArrangeUrlParams(pdfUrl, paramList); if (!pdfUrl.getHost().startsWith("www.")) { pdfUrl = new URL(pdfUrl.getProtocol(), "www." 
+ pdfUrl.getHost(), pdfUrl.getFile()); } // note: must leave URL encoded because that's how we store URLs CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString()); if (pdfCu != null && pdfCu.hasContent()) { // replace absCU with pdfCU if exists and has content af.setFullTextCu(pdfCu); af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu); } } } } catch (MalformedURLException e) { log.debug("Badly formatted pdf url link", e); } catch (IllegalArgumentException e) { log.debug("Badly formatted pdf url link", e); } return af; } private URL reArrangeUrlParams(URL url, List<String> paramList) throws MalformedURLException { return reArrangeUrlParams(url, paramList, "&"); } // Pulls parameters in paramList and arranges them in the same order as they // are in paramList. private URL reArrangeUrlParams(URL url, List<String> paramList, String paramDelim) throws MalformedURLException { String[] urlTokens = url.getQuery().split(paramDelim); String newQuery = ""; if (urlTokens.length > 1) { for (String param : paramList) { for (int i = 0; i < urlTokens.length; i++) { if (urlTokens[i].toLowerCase().startsWith(param.toLowerCase())) { if (newQuery != "") newQuery = newQuery + "&"; newQuery = newQuery + urlTokens[i]; break; } } } } return new URL(url.getProtocol(), url.getHost(), url.getPath() + "?" + newQuery); } }