コード例 #1
0
  /**
   * Article iterator keyed on full-text PDF URLs of the form
   * {@code /journals/<dir>/<name>.pdf} (matched case-insensitively).
   *
   * <p>For every PDF it also probes the sibling URL with the extension swapped to {@code aspx}
   * and prefers that as the full-text CU when it has content.
   */
  protected static class AshdinArticleIterator extends SubTreeArticleIterator {
    /** Group 1 is everything up to and including the dot; group 2 is the {@code pdf} extension. */
    protected static Pattern PATTERN =
        Pattern.compile("(/journals/[^/]+/[^/]+\\.)(pdf)$", Pattern.CASE_INSENSITIVE);

    public AshdinArticleIterator(ArchivalUnit au, SubTreeArticleIterator.Spec spec) {
      super(au, spec);
    }

    /**
     * Builds the article for a candidate URL.
     *
     * @param cu the cached URL handed over by the iterator
     * @return the article files, or {@code null} (with a warning logged) when the URL does not
     *     match {@link #PATTERN}
     */
    @Override
    protected ArticleFiles createArticleFiles(CachedUrl cu) {
      final String candidateUrl = cu.getUrl();
      log.debug3("iterating url: " + candidateUrl);
      final Matcher pdfMatcher = PATTERN.matcher(candidateUrl);
      if (!pdfMatcher.find()) {
        log.warning(
            "Mismatch between article iterator factory and article iterator: " + candidateUrl);
        return null;
      }
      return processFullTextPdf(cu, pdfMatcher);
    }

    /**
     * Registers {@code cu} under the full-text PDF role and picks the full-text CU.
     *
     * @param cu the PDF cached URL
     * @param mat a matcher for {@link #PATTERN} that has already matched {@code cu}'s URL
     * @return the populated article files
     */
    protected ArticleFiles processFullTextPdf(CachedUrl cu, Matcher mat) {
      final ArticleFiles articleFiles = new ArticleFiles();
      articleFiles.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, cu);

      // Swap the trailing "pdf" for "aspx" to get the companion page
      // (presumably the abstract/landing page -- confirm against the publisher's site layout).
      final CachedUrl aspxCu = cu.getArchivalUnit().makeCachedUrl(mat.replaceAll("$1aspx"));
      articleFiles.setFullTextCu(aspxCu.hasContent() ? aspxCu : cu);

      log.debug3("returning full text: " + articleFiles.getFullTextUrl());

      return articleFiles;
    }
  }
コード例 #2
0
    /**
     * Builds the article for a candidate URL.
     *
     * @param cu the cached URL handed over by the iterator
     * @return the result of {@code processAbstract}, or {@code null} (with a warning logged)
     *     when the URL does not match {@code ABSTRACT_PATTERN}
     */
    @Override
    protected ArticleFiles createArticleFiles(CachedUrl cu) {
      final String candidateUrl = cu.getUrl();
      final Matcher abstractMatcher = ABSTRACT_PATTERN.matcher(candidateUrl);
      if (!abstractMatcher.find()) {
        // The iterator handed us a URL that this class does not recognise.
        log.warning(
            "Mismatch between article iterator factory and article iterator: " + candidateUrl);
        return null;
      }
      return processAbstract(cu, abstractMatcher);
    }
コード例 #3
0
 /**
  * Builds the article for a candidate URL.
  *
  * @param cu the cached URL handed over by the iterator
  * @return the result of {@code processFullTextPdf}, or {@code null} (with a warning logged)
  *     when the URL does not match {@code PATTERN}
  */
 @Override
 protected ArticleFiles createArticleFiles(CachedUrl cu) {
   final String candidateUrl = cu.getUrl();
   log.debug3("iterating url: " + candidateUrl);
   final Matcher pdfMatcher = PATTERN.matcher(candidateUrl);
   if (!pdfMatcher.find()) {
     // The iterator handed us a URL that this class does not recognise.
     log.warning(
         "Mismatch between article iterator factory and article iterator: " + candidateUrl);
     return null;
   }
   return processFullTextPdf(cu, pdfMatcher);
 }
コード例 #4
0
 /**
  * Builds the two URL patterns used by this iterator from the AU's configuration.
  *
  * @param au the archival unit; its volume number and journal abbreviation are read from the
  *     configuration
  * @param spec the sub-tree iterator specification, passed to the superclass
  * @param isSection when {@code true}, the configured {@code journal_section} value is
  *     appended to the journal abbreviation as an extra path segment
  */
 public BePressArticleIterator(
     ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) {
   super(au, spec);
   final String volume = au.getConfiguration().get(ConfigParamDescr.VOLUME_NUMBER.getKey());
   final String baseAbbr = au.getConfiguration().get(ConfigParamDescr.JOURNAL_ABBR.getKey());
   final String journalPath =
       isSection ? baseAbbr + "/" + au.getConfiguration().get("journal_section") : baseAbbr;

   // Issue level and lower. The trailing (art)?[0-9]+ segment is optional because a few AUs
   // have the article at issue level.
   final String articleRegex =
       String.format(
           "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$",
           journalPath, volume, volume);
   // Issue level only: URLs of this shape may be a table of contents rather than an article.
   final String tocRegex =
       String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", journalPath, volume);

   this.pattern = Pattern.compile(articleRegex, Pattern.CASE_INSENSITIVE);
   this.TOC_pattern = Pattern.compile(tocRegex, Pattern.CASE_INSENSITIVE);
 }
コード例 #5
0
 /*
  * This is complicated. MOST AUs have articles that live below an issue level TOC
  * that is,
  * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata
  * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata
  * (eg Economist Voice V1)
  * BUT
  * in some AUs there are issues with only 1 article, in which case
  * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata
  * (eg Rhodes Cook V4)
  * and a few AUs with a mixture
  * (eg Forum for Health Economics V5)
  * So to identify ALL articles, we'll also have to capture issue level items and then look
  * at the html and if it has article metadata in it, count it as an article.
  *
  */
 @Override
 protected ArticleFiles createArticleFiles(CachedUrl cu) {
   final String candidateUrl = cu.getUrl();
   final Matcher articleMatcher = pattern.matcher(candidateUrl);
   if (!articleMatcher.find()) {
     log.warning(
         "Mismatch between article iterator factory and article iterator: " + candidateUrl);
     return null;
   }

   // A URL that also fits the issue-level shape might be a TOC; it only counts as an article
   // when the page carries article metadata. Anything else is accepted without inspection.
   final boolean couldBeToc = TOC_pattern.matcher(candidateUrl).find();
   if (couldBeToc && !hasArticleMetadata(cu)) {
     return null; // this was a TOC, not an article
   }
   return processUrl(cu, articleMatcher);
 }
コード例 #6
0
  /**
   * Article iterator keyed on abstract pages whose URL ends in {@code -article-a<digits>}
   * (matched case-insensitively).
   *
   * <p>Each abstract becomes an {@link ArticleFiles}. When the abstract's
   * {@code citation_pdf_url} meta tag resolves to a cached URL with content, that PDF replaces
   * the abstract as the full-text CU.
   */
  protected static class MaffeyArticleIterator extends SubTreeArticleIterator {

    /** Group 1 is the last path segment up to the {@code -article-a<digits>} suffix. */
    protected Pattern ABSTRACT_PATTERN =
        Pattern.compile("([^/]+)-article-a[0-9]+$", Pattern.CASE_INSENSITIVE);

    public MaffeyArticleIterator(ArchivalUnit au, SubTreeArticleIterator.Spec spec) {
      super(au, spec);
    }

    /**
     * Builds the article for a candidate URL.
     *
     * @param cu the cached URL handed over by the iterator
     * @return the article files, or {@code null} (with a warning logged) when the URL does not
     *     match {@link #ABSTRACT_PATTERN}
     */
    @Override
    protected ArticleFiles createArticleFiles(CachedUrl cu) {
      String url = cu.getUrl();
      Matcher mat = ABSTRACT_PATTERN.matcher(url);
      if (mat.find()) {
        return processAbstract(cu, mat);
      }

      log.warning("Mismatch between article iterator factory and article iterator: " + url);
      return null;
    }

    /**
     * Builds the article files for an abstract page.
     *
     * <p>In order to find the full text PDF you need to find the citation_pdf_url meta tag in
     * the abstract html, pull out the pdf url, normalize it (reorder params...) and find the
     * matching cached URL.
     *
     * @param absCu the abstract's cached URL; when it is {@code null} or has no content the
     *     returned article files are left empty
     * @param absMat the matcher that matched the abstract URL (not used here)
     * @return the article files; never {@code null}
     */
    protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) {
      NodeList nl = null;
      ArticleFiles af = new ArticleFiles();
      if (absCu != null && absCu.hasContent()) {
        // TEMPORARY: set absCU as default full text CU in case there is
        // no PDF CU with content; the current metadata manager currently
        // uses only the full text CU, but this will change with the new
        // metadata schema that can have multiple CUs for an article.
        af.setFullTextCu(absCu);
        af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu);
        try {
          InputStreamSource is =
              new InputStreamSource(new Stream(absCu.getUnfilteredInputStream()));
          Page pg = new Page(is);
          Lexer lx = new Lexer(pg);
          Parser parser = new Parser(lx);
          // NOTE(review): this writes a static field on Lexer, so it affects every Lexer in
          // the JVM, not just this one -- confirm that is intended.
          Lexer.STRICT_REMARKS = false;
          // Keep only <meta name="citation_pdf_url" ...> tags.
          NodeFilter nf =
              new NodeFilter() {
                public boolean accept(Node node) {
                  if (!(node instanceof MetaTag)) return false;
                  MetaTag meta = (MetaTag) node;
                  if (!"citation_pdf_url".equalsIgnoreCase(meta.getMetaTagName())) return false;
                  return true;
                }
              };
          nl = parser.extractAllNodesThatMatch(nf);
        } catch (ParserException e) {
          log.debug("Unable to parse abstract page html", e);
        } catch (UnsupportedEncodingException e) {
          log.debug("Bad encoding in abstact page html", e);
        } finally {
          absCu.release();
        }
      }
      try {
        if (nl != null && nl.size() > 0) {
          // Only the first citation_pdf_url tag is consulted.
          String pdfUrlStr = ((MetaTag) nl.elementAt(0)).getMetaContent();
          // Guard against a tag with no content value, so the abstract stays the full-text
          // CU instead of failing on a null URL string.
          if (pdfUrlStr != null) {
            // minimally encode URL to prevent URL constructor
            // from stripping trailing spaces
            URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(pdfUrlStr));
            List<String> paramList = new ArrayList<String>();
            paramList.add("fileType");
            paramList.add("fileId");
            paramList.add("fileName");
            pdfUrl = reArrangeUrlParams(pdfUrl, paramList);

            // Force a "www." host prefix before looking the URL up in the AU.
            if (!pdfUrl.getHost().startsWith("www.")) {
              pdfUrl = new URL(pdfUrl.getProtocol(), "www." + pdfUrl.getHost(), pdfUrl.getFile());
            }

            // note: must leave URL encoded because that's how we store URLs
            CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString());
            if (pdfCu != null && pdfCu.hasContent()) {
              // replace absCU with pdfCU if exists and has content
              af.setFullTextCu(pdfCu);
              af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu);
            }
          }
        }
      } catch (MalformedURLException e) {
        log.debug("Badly formatted pdf url link", e);
      } catch (IllegalArgumentException e) {
        log.debug("Badly formatted pdf url link", e);
      }

      return af;
    }

    /** Same as the three-argument overload, splitting the query on {@code "&"}. */
    private URL reArrangeUrlParams(URL url, List<String> paramList) throws MalformedURLException {
      return reArrangeUrlParams(url, paramList, "&");
    }

    /**
     * Pulls the parameters named in {@code paramList} out of the URL's query and arranges them
     * in the same order as they are in {@code paramList}. Query parameters not named in the
     * list are dropped. Names are matched case-insensitively as a prefix of each query token.
     *
     * @param url the URL whose query should be normalized
     * @param paramList the parameter names to keep, in the desired order
     * @param paramDelim regular expression used to split the query into tokens
     * @return {@code url} itself when it has no query string; otherwise a new URL built from
     *     the original protocol, host and path plus the reordered query (parameters are always
     *     re-joined with {@code "&"})
     * @throws MalformedURLException if the rebuilt URL cannot be constructed
     */
    private URL reArrangeUrlParams(URL url, List<String> paramList, String paramDelim)
        throws MalformedURLException {
      String query = url.getQuery();
      if (query == null) {
        // URL.getQuery() returns null when there is no query part; there is nothing to
        // reorder, and splitting null would throw a NullPointerException.
        return url;
      }
      String[] urlTokens = query.split(paramDelim);
      StringBuilder newQuery = new StringBuilder();
      // A query with a single token is deliberately left out of the rebuilt URL, as before.
      if (urlTokens.length > 1) {
        for (String param : paramList) {
          String wanted = param.toLowerCase();
          for (int i = 0; i < urlTokens.length; i++) {
            if (urlTokens[i].toLowerCase().startsWith(wanted)) {
              // Separator goes between entries only; test the length rather than comparing
              // String references with != "".
              if (newQuery.length() > 0) {
                newQuery.append("&");
              }
              newQuery.append(urlTokens[i]);
              break; // first matching token wins for this parameter
            }
          }
        }
      }
      return new URL(url.getProtocol(), url.getHost(), url.getPath() + "?" + newQuery);
    }
  }