コード例 #1
0
    @Override
    public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {

      log.debug3("Metadata - cachedurl cu:" + cu.getUrl());

      ArticleMetadata am = super.extract(target, cu);
      am.cook(tagMap);
      return am;
    } // extract
コード例 #2
0
  public void testDOIExtraction() throws Exception {

    List<ArticleMetadata> mdlist =
        setupContentForAU(bau1, ABS_URL, goodHtmlContentNoDOIorPublisher, true);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);
    // gets pulled from the URL if not set in the metadata
    assertEquals("10.1175/2010WCAS1063.1", md.get(MetadataField.FIELD_DOI));
    // gets set manually if not in the metadata
    // first it would try the TDB
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }
 @Override
 public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException {
   ArticleMetadata am = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
   am.cook(tagMap);
   String url = am.get(MetadataField.FIELD_ACCESS_URL);
   ArchivalUnit au = cu.getArchivalUnit();
   if (url == null || url.isEmpty() || !au.makeCachedUrl(url).hasContent()) {
     url = cu.getUrl();
   }
   am.replace(
       MetadataField.FIELD_ACCESS_URL,
       HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, url));
   emitter.emitMetadata(cu, am);
 }
コード例 #4
0
  /*
   * When testing no-pdf-check basic XML parsing, you will get partial MD records
   * depending on whether the info comes from dataset.xml or from main.xml
   */
  private void validateDatasetMetadataRecord(ArticleMetadata am) {
    log.debug3("valideDatasetMetadatRecord");
    String doi_val = am.get(MetadataField.FIELD_DOI);
    assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN));

    log.debug3("doi val is: " + doi_val);
    // The dataset doesn't set this value, it'll fail over the main.xml value
    if (doi_val.equals("10.1016/S0140-1111(14)61865-1")) {
      assertEquals(null, am.get(MetadataField.FIELD_DATE));
    } else {
      assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE));
    }
    assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE));
  }
コード例 #5
0
 @Override
 public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
   ArticleMetadata am = super.extract(target, cu);
   am.cook(tagMap);
   String url = am.get(MetadataField.FIELD_ACCESS_URL);
   if (url != null && !url.isEmpty()) {
     CachedUrl val = cu.getArchivalUnit().makeCachedUrl(url);
     if (!val.hasContent()) {
       am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
     }
   } else {
     am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
   }
   return am;
 }
コード例 #6
0
 /*
  * hasArticleMetadata(CachedUrl cu)
  *   Given the CachedUrl for the potential abstract file, using the existing
  *   SimpleHtmlMetaTagMetadataExtractor to parse the file and
  *   retrieve any contained metadata. If a doi or author exists, it's an article
  *   NOT defining the Metadata Extractor here!
  */
 private boolean hasArticleMetadata(CachedUrl cu) {
   MetadataTarget at = new MetadataTarget(MetadataTarget.PURPOSE_ARTICLE);
   ArticleMetadata am;
   SimpleHtmlMetaTagMetadataExtractor ext = new SimpleHtmlMetaTagMetadataExtractor();
   if (cu != null && cu.hasContent()) {
     try {
       at.setFormat("text/html");
       am = ext.extract(at, cu);
       if ((am.containsRawKey("bepress_citation_journal_title"))
           || (am.containsRawKey("bepress_citation_abstract_html_url"))
           || (am.containsRawKey("bepress_citation_doi"))
           || (am.containsRawKey("bepress_citation_author"))) {
         return true;
       }
     } catch (IOException e) {
       e.printStackTrace();
     }
   }
   return false; // no reasonable metadata, probably a toc
 }
コード例 #7
0
  public void testExtractGoodHtmlContent() throws Exception {

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContent, true);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodTitle, md.get(MetadataField.DC_FIELD_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodFormat, md.get(MetadataField.DC_FIELD_FORMAT));
    assertEquals(goodType, md.get(MetadataField.DC_FIELD_TYPE));
    assertEquals(Arrays.asList(goodAuthors), md.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(goodAuthors[0], md.get(MetadataField.DC_FIELD_CREATOR));
  }
コード例 #8
0
  /*
   * You will have to tell it the DOI and the schema because those normally come from dataset
   */
  private void validateSingleMainMetadataRecord(ArticleMetadata am, String doi_val, String schema) {
    log.debug3("valideSingleMainMetadatRecord");
    if ("simple-article".equals(schema)) {
      assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    } else {
      assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    }

    log.debug3("doi val is: " + doi_val);
    assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
    assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
    assertEquals("Comment", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_dochead));
    assertEquals(doi_val, am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_doi));
    assertEquals("2014", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_copyright));
  }
コード例 #9
0
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractAlternateRisContent() throws Exception {
    String goodContent = createAlternateRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }
コード例 #10
0
    /*
     * The filename is the ProducID with either ".pdf" or ".epub" suffix.
     * Tje content files live in a parallel directory
     *     <base>/<year>/Content/
     * The XML file represented by the current cu would be something like:
     *   <base>/<year>/DataFeed/EBSCOhostGKB_20160205_DELTA.zip!/EBSCOhostGKB_20160205_DELTA.xml
     * and the pdf would be
     *   <base>/<year>/Content/123456.pdf
     */
    @Override
    protected List<String> getFilenamesAssociatedWithRecord(
        SourceXmlSchemaHelper helper, CachedUrl cu, ArticleMetadata oneAM) {

      // this has been set to be the "ProductID" value
      String filenameValue = oneAM.getRaw(helper.getFilenameXPathKey());

      String cuBase = FilenameUtils.getFullPath(cu.getUrl());
      int datafeed_dir_start = cuBase.lastIndexOf("/DataFeed/");
      // This will leave the "/", so just add back on the sibling_dir and filename
      String contentPath;
      if (datafeed_dir_start < 0) {
        // can't return null because that would make it okay to emit
        // this will fail to emit, as it should - we don't know how to verify the PDF existence
        log.siteWarning("The XML file lives at an unexpected location: " + cuBase);
        contentPath = CONTENT_DIR; // invalid but will force failure
      } else {
        contentPath = cuBase.substring(0, datafeed_dir_start) + CONTENT_DIR;
      }
      List<String> returnList = new ArrayList<String>();
      returnList.add(contentPath + filenameValue + ".pdf");
      returnList.add(contentPath + filenameValue + ".epub");
      return returnList;
    }
コード例 #11
0
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractGoodRisContent() throws Exception {
    String goodContent = createGoodRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME));
    assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE));
    assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE));
    assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE));
    assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN));
    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));

    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI));
    // This shouldn't get set. It will default later to fuill_text_cu
    assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL));
  }
コード例 #12
0
    // override this to do some additional attempts to get valid data before emitting
    @Override
    public void extract(MetadataTarget target, CachedUrl cu, FileMetadataExtractor.Emitter emitter)
        throws IOException, PluginException {
      ArticleMetadata am = extract(target, cu);

      /*
       * if, due to overcrawl, we got to a page that didn't have anything
       * valid, eg "this page not found" html page
       * don't emit empty metadata (because defaults would get put in
       * Must do this after cooking, because it checks size of cooked info
       */
      if (am.isEmpty()) {
        return;
      }

      /*
       * RIS data can be variable.  We don't have any way to add priority to
       * the cooking of data, so fallback to alternate values manually
       *
       * There are differences between books and journals, so fork for titles
       * and metadata check
       */
      if (am.get(MetadataField.FIELD_DATE) == null) {
        if (am.getRaw("Y1") != null) { // if DA wasn't there, use Y1
          am.put(MetadataField.FIELD_DATE, am.getRaw("Y1"));
        }
      }

      /*
       * Determine if this is a book item or a journal item.
       * set the appropriate article type once the daemon passes along the TY
       */
      String ris_type = am.getRaw("TY");
      if (ris_type == null) {
        // pre 1.69, do an alternate check because TY wasn't passed through
        ris_type = "JOUR"; // set a default
        if (am.get(MetadataField.FIELD_ISBN) != null) {
          // it is a bad value, but it was recognized as an isbn because of TY type
          ris_type = "BOOK"; // it could be a chapter but until TY is passed through...
        }
      }

      // Modify or try alternate RIS tag values based after cooking
      postCookProcess(cu, am, ris_type);

      // Only emit if this item is likely to be from this AU
      // protect against counting overcrawled articles by checking against
      // values from the TDB file - differentiate between book items and journal itesm
      ArchivalUnit au = cu.getArchivalUnit();
      if (ris_type.contains("BOOK") || ris_type.contains("CHAP")) {
        if (!BaseAtyponMetadataUtil.metadataMatchesBookTdb(au, am)) {
          return;
        }
      } else {
        // JOURNAL default is to assume it's a journal for backwards compatibility
        if (!BaseAtyponMetadataUtil.metadataMatchesTdb(au, am)) {
          return;
        }
      }

      /*
       * Fill in DOI, publisher, other information available from
       * the URL or TDB
       * CORRECT the access.url if it is not in the AU
       */
      BaseAtyponMetadataUtil.completeMetadata(cu, am);
      emitter.emitMetadata(cu, am);
    }
コード例 #13
0
    /*
     * isolate the modifications done on the AM after the initial extraction
     * in order to allow child plugins to do override this and do
     * additional work before calling the pre-emit checking...
     * ArticleMetadata - passed in information from extract/cook
     * ris_type - the TY value or its inferred type (basically, book or journal)
     */
    protected void postCookProcess(CachedUrl cu, ArticleMetadata am, String ris_type) {
      /*
       * RIS data can be variable.  We don't have any way to add priority to
       * the cooking of data, so fallback to alternate values manually
       */
      if (am.get(MetadataField.FIELD_DATE) == null) {
        if (am.getRaw("Y1") != null) { // if DA wasn't there, use Y1
          am.put(MetadataField.FIELD_DATE, am.getRaw("Y1"));
        }
      }

      /*
       * There are differences between books and journals, so fork for titles
       * and metadata check
       */
      if (ris_type.contains("BOOK") || ris_type.contains("CHAP")) {
        // BOOK in some form
        // T1 is the primary title - of the chapter for a book chapter, or book for a complete book
        // T2 is the next title up - of the book for a chapter, of the series for a book
        // T3 is the uppermost - of the series for a chapter
        // sometimes they use TI instead of T1...
        if (am.get(MetadataField.FIELD_ARTICLE_TITLE) == null) {
          if (am.getRaw("TI") != null) { // if T1 wasn't there, use TI
            am.put(MetadataField.FIELD_ARTICLE_TITLE, am.getRaw("TI"));
          }
        }

        if (ris_type.contains("CHAP")) {
          // just one chapter - set the article type correctly
          am.put(MetadataField.FIELD_ARTICLE_TYPE, MetadataField.ARTICLE_TYPE_BOOKCHAPTER);
          if ((am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null)
              && (am.getRaw("T2") != null)) {
            // the publication and the article titles are just the name of the book
            am.put(MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("T2"));
          }
          if ((am.get(MetadataField.FIELD_SERIES_TITLE) == null) && (am.getRaw("T3") != null)) {
            // the publication and the article titles are just the name of the book
            am.put(MetadataField.FIELD_SERIES_TITLE, am.getRaw("T3"));
          }
        } else {
          // We're a full book volume - articletitle = publicationtitle
          am.put(MetadataField.FIELD_ARTICLE_TYPE, MetadataField.ARTICLE_TYPE_BOOKVOLUME);
          if (am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) {
            // the publication and the article titles are just the name of the book
            am.put(
                MetadataField.FIELD_PUBLICATION_TITLE, am.get(MetadataField.FIELD_ARTICLE_TITLE));
          }
          // series title can be from T2
          if ((am.get(MetadataField.FIELD_SERIES_TITLE) == null) && (am.getRaw("T2") != null)) {
            // the publication and the article titles are just the name of the book
            am.put(MetadataField.FIELD_SERIES_TITLE, am.getRaw("T2"));
          }
        }
      } else {
        // JOURNAL default is to assume it's a journal for backwards compatibility
        if (am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) {
          if (am.getRaw("T2") != null) {
            am.put(MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("T2"));
          } else if (am.getRaw("JO") != null) {
            am.put(
                MetadataField.FIELD_PUBLICATION_TITLE,
                am.getRaw("JO")); // might be unabbreviated version
          }
        }
      }
    }
コード例 #14
0
  /*
   * When testing a complete extraction out of the tarset, the MD record will be completely filled in
   * and pdf-existence will get established
   */
  private void validateCompleteMetadataRecord(ArticleMetadata am) {
    log.debug3("valideCompleteMetadatRecord");
    String doi_val = am.get(MetadataField.FIELD_DOI);
    /* make sure we can pick up both types of xml article data */
    log.debug3("doi val is: " + doi_val);

    if ("JA 5.2.0 SIMPLE-ARTICLE"
        .equals(am.getRaw(ElsevierDatasetXmlSchemaHelper.dataset_dtd_metadata))) {
      log.debug3("simple-article");
      assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    } else {
      assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    }
    assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN));
    assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE));
    assertEquals(accessUrlMap.get(doi_val), am.get(MetadataField.FIELD_ACCESS_URL));
    assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
    assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
    assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals("Elsevier", am.get(MetadataField.FIELD_PROVIDER));
    assertEquals("Elsevier", am.get(MetadataField.FIELD_PUBLISHER));
    log.debug3(am.ppString(2));
  }