Java CachedUrl.hasContent Exemples, org.lockss.extractor.CachedUrl.hasContent Java Exemples

Exemple #1

0

Afficher le fichier

Fichier : AshdinArticleIteratorFactory.java Projet : edina/lockss-daemon

    protected ArticleFiles processFullTextPdf(CachedUrl cu, Matcher mat) {
      ArticleFiles af = new ArticleFiles();
      af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, cu);

      String abstractUrl = mat.replaceAll("$1aspx");
      CachedUrl abstractCu = cu.getArchivalUnit().makeCachedUrl(abstractUrl);

      if (abstractCu.hasContent()) af.setFullTextCu(abstractCu);
      else af.setFullTextCu(cu);

      log.debug3("returning full text: " + af.getFullTextUrl());

      return af;
    }

Exemple #2

0

Afficher le fichier

Fichier : IOPScienceHtmlMetadataExtractorFactory.java Projet : lockss/lockss-daemon

 @Override
 public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
   ArticleMetadata am = super.extract(target, cu);
   am.cook(tagMap);
   String url = am.get(MetadataField.FIELD_ACCESS_URL);
   if (url != null && !url.isEmpty()) {
     CachedUrl val = cu.getArchivalUnit().makeCachedUrl(url);
     if (!val.hasContent()) {
       am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
     }
   } else {
     am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
   }
   return am;
 }

Exemple #3

0

Afficher le fichier

Fichier : BePressArticleIteratorFactory.java Projet : lockss/lockss-daemon

 /*
  * hasArticleMetadata(CachedUrl cu)
  *   Given the CachedUrl for the potential abstract file, using the existing
  *   SimpleHtmlMetaTagMetadataExtractor to parse the file and
  *   retrieve any contained metadata. If a doi or author exists, it's an article
  *   NOT defining the Metadata Extractor here!
  */
 private boolean hasArticleMetadata(CachedUrl cu) {
   MetadataTarget at = new MetadataTarget(MetadataTarget.PURPOSE_ARTICLE);
   ArticleMetadata am;
   SimpleHtmlMetaTagMetadataExtractor ext = new SimpleHtmlMetaTagMetadataExtractor();
   if (cu != null && cu.hasContent()) {
     try {
       at.setFormat("text/html");
       am = ext.extract(at, cu);
       if ((am.containsRawKey("bepress_citation_journal_title"))
           || (am.containsRawKey("bepress_citation_abstract_html_url"))
           || (am.containsRawKey("bepress_citation_doi"))
           || (am.containsRawKey("bepress_citation_author"))) {
         return true;
       }
     } catch (IOException e) {
       e.printStackTrace();
     }
   }
   return false; // no reasonable metadata, probably a toc
 }

Exemple #4

0

Afficher le fichier

Fichier : MaffeyArticleIteratorFactory.java Projet : edina/lockss-daemon

    /*
     * In order to find full text PDF you need to find the citation_pdf_url meta tag in the
     * abstract html pull out the pdf url normalize it (reorder params...) and find the matching
     * cached URL
     */
    protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) {
      NodeList nl = null;
      ArticleFiles af = new ArticleFiles();
      if (absCu != null && absCu.hasContent()) {
        // TEMPORARY: set absCU as default full text CU in case there is
        // no PDF CU with content; the current metadata manager currently
        // uses only the full text CU, but this will change with the new
        // metadata schema that can have multiple CUs for an article.
        af.setFullTextCu(absCu);
        af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu);
        try {
          InputStreamSource is =
              new InputStreamSource(new Stream(absCu.getUnfilteredInputStream()));
          Page pg = new Page(is);
          Lexer lx = new Lexer(pg);
          Parser parser = new Parser(lx);
          Lexer.STRICT_REMARKS = false;
          NodeFilter nf =
              new NodeFilter() {
                public boolean accept(Node node) {
                  if (!(node instanceof MetaTag)) return false;
                  MetaTag meta = (MetaTag) node;
                  if (!"citation_pdf_url".equalsIgnoreCase(meta.getMetaTagName())) return false;
                  return true;
                }
              };
          nl = parser.extractAllNodesThatMatch(nf);
        } catch (ParserException e) {
          log.debug("Unable to parse abstract page html", e);
        } catch (UnsupportedEncodingException e) {
          log.debug("Bad encoding in abstact page html", e);
        } finally {
          absCu.release();
        }
      }
      try {
        if (nl != null) {
          if (nl.size() > 0) {
            // minimally encode URL to prevent URL constructor
            // from stripping trailing spaces
            String pdfUrlStr = ((MetaTag) nl.elementAt(0)).getMetaContent();
            URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(pdfUrlStr));
            List<String> paramList = new ArrayList<String>();
            paramList.add("fileType");
            paramList.add("fileId");
            paramList.add("fileName");
            pdfUrl = reArrangeUrlParams(pdfUrl, paramList);

            if (!pdfUrl.getHost().startsWith("www.")) {
              pdfUrl = new URL(pdfUrl.getProtocol(), "www." + pdfUrl.getHost(), pdfUrl.getFile());
            }

            // note: must leave URL encoded because that's how we store URLs
            CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString());
            if (pdfCu != null && pdfCu.hasContent()) {
              // replace absCU with pdfCU if exists and has content
              af.setFullTextCu(pdfCu);
              af.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu);
            }
          }
        }
      } catch (MalformedURLException e) {
        log.debug("Badly formatted pdf url link", e);
      } catch (IllegalArgumentException e) {
        log.debug("Badly formatted pdf url link", e);
      }

      return af;
    }