@Override
public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
  // Log the URL under extraction, delegate the raw parse to the parent class,
  // then cook the raw values into standard metadata fields via tagMap.
  log.debug3("Metadata - cachedurl cu:" + cu.getUrl());
  ArticleMetadata metadata = super.extract(target, cu);
  metadata.cook(tagMap);
  return metadata;
} // extract
public void testDOIExtraction() throws Exception { List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContentNoDOIorPublisher, true); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); // gets pulled from the URL if not set in the metadata assertEquals("10.1175/2010WCAS1063.1", md.get(MetadataField.FIELD_DOI)); // gets set manually if not in the metadata // first it would try the TDB assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); }
@Override
public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException {
  ArticleMetadata md = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
  md.cook(tagMap);
  ArchivalUnit au = cu.getArchivalUnit();
  // Prefer the cooked access URL, but only when it resolves to real content in
  // this AU; otherwise fall back to the URL of the file being extracted.
  String accessUrl = md.get(MetadataField.FIELD_ACCESS_URL);
  boolean resolvable =
      accessUrl != null && !accessUrl.isEmpty() && au.makeCachedUrl(accessUrl).hasContent();
  if (!resolvable) {
    accessUrl = cu.getUrl();
  }
  // Normalize http vs https to match the AU's base URL before emitting.
  md.replace(
      MetadataField.FIELD_ACCESS_URL,
      HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, accessUrl));
  emitter.emitMetadata(cu, md);
}
/* * When testing no-pdf-check basic XML parsing, you will get partial MD records * depending on whether the info comes from dataset.xml or from main.xml */ private void validateDatasetMetadataRecord(ArticleMetadata am) { log.debug3("valideDatasetMetadatRecord"); String doi_val = am.get(MetadataField.FIELD_DOI); assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN)); log.debug3("doi val is: " + doi_val); // The dataset doesn't set this value, it'll fail over the main.xml value if (doi_val.equals("10.1016/S0140-1111(14)61865-1")) { assertEquals(null, am.get(MetadataField.FIELD_DATE)); } else { assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE)); } assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE)); }
@Override
public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
  ArticleMetadata md = super.extract(target, cu);
  md.cook(tagMap);
  // If the cooked access URL is absent, blank, or points at a URL with no
  // content in this AU, substitute the URL of the file we extracted from.
  String accessUrl = md.get(MetadataField.FIELD_ACCESS_URL);
  boolean keepDeclaredUrl = false;
  if (accessUrl != null && !accessUrl.isEmpty()) {
    keepDeclaredUrl = cu.getArchivalUnit().makeCachedUrl(accessUrl).hasContent();
  }
  if (!keepDeclaredUrl) {
    md.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
  }
  return md;
}
/* * hasArticleMetadata(CachedUrl cu) * Given the CachedUrl for the potential abstract file, using the existing * SimpleHtmlMetaTagMetadataExtractor to parse the file and * retrieve any contained metadata. If a doi or author exists, it's an article * NOT defining the Metadata Extractor here! */ private boolean hasArticleMetadata(CachedUrl cu) { MetadataTarget at = new MetadataTarget(MetadataTarget.PURPOSE_ARTICLE); ArticleMetadata am; SimpleHtmlMetaTagMetadataExtractor ext = new SimpleHtmlMetaTagMetadataExtractor(); if (cu != null && cu.hasContent()) { try { at.setFormat("text/html"); am = ext.extract(at, cu); if ((am.containsRawKey("bepress_citation_journal_title")) || (am.containsRawKey("bepress_citation_abstract_html_url")) || (am.containsRawKey("bepress_citation_doi")) || (am.containsRawKey("bepress_citation_author"))) { return true; } } catch (IOException e) { e.printStackTrace(); } } return false; // no reasonable metadata, probably a toc }
public void testExtractGoodHtmlContent() throws Exception {
  List<ArticleMetadata> extracted = setupContentForAU(bau1, ABS_URL, goodHtmlContent, true);
  assertNotEmpty(extracted);
  ArticleMetadata record = extracted.get(0);
  assertNotNull(record);
  // Check both the standard fields and the Dublin Core views of the cooked metadata.
  assertEquals(goodPublisher, record.get(MetadataField.FIELD_PUBLISHER));
  assertEquals(goodTitle, record.get(MetadataField.DC_FIELD_TITLE));
  assertEquals(goodDate, record.get(MetadataField.FIELD_DATE));
  assertEquals(goodFormat, record.get(MetadataField.DC_FIELD_FORMAT));
  assertEquals(goodType, record.get(MetadataField.DC_FIELD_TYPE));
  assertEquals(Arrays.asList(goodAuthors), record.getList(MetadataField.FIELD_AUTHOR));
  assertEquals(goodAuthors[0], record.get(MetadataField.DC_FIELD_CREATOR));
}
/*
 * You will have to tell it the DOI and the schema because those normally come from dataset
 */
private void validateSingleMainMetadataRecord(ArticleMetadata am, String doi_val, String schema) {
  log.debug3("valideSingleMainMetadatRecord");
  // The expected title depends on which article schema produced this record.
  String expectedTitle =
      "simple-article".equals(schema) ? common_simple_article_title : common_article_title;
  assertEquals(expectedTitle, am.get(MetadataField.FIELD_ARTICLE_TITLE));
  log.debug3("doi val is: " + doi_val);
  // Per-DOI expectations come from the maps built by the test fixture.
  assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
  assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
  assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
  // Raw values are read straight from main.xml rather than the cooked map.
  assertEquals("Comment", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_dochead));
  assertEquals(doi_val, am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_doi));
  assertEquals("2014", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_copyright));
}
/**
 * Creates a simulated Cached URL from the source code produced by
 * createAlternateRisContent(), then asserts that the metadata extracted by the
 * MetaPressRisMetadataExtractorFactory matches the metadata in that source.
 *
 * @throws Exception
 */
public void testExtractAlternateRisContent() throws Exception {
  String risSource = createAlternateRisContent();
  log.debug3(risSource);
  List<ArticleMetadata> extracted = setupContentForAU(bau1, RIS_URL, risSource, false);
  assertNotEmpty(extracted);
  ArticleMetadata record = extracted.get(0);
  assertNotNull(record);
  // Authors must come back in the same order they appear in the RIS source.
  Iterator<String> authors = record.getList(MetadataField.FIELD_AUTHOR).iterator();
  for (String expected : goodAuthors) {
    assertEquals(expected, authors.next());
  }
  assertEquals(goodTitle, record.get(MetadataField.FIELD_ARTICLE_TITLE));
  assertEquals(goodJournal, record.get(MetadataField.FIELD_PUBLICATION_TITLE));
  assertEquals(goodDate, record.get(MetadataField.FIELD_DATE));
  assertEquals(goodPublisher, record.get(MetadataField.FIELD_PUBLISHER));
}
/* * The filename is the ProducID with either ".pdf" or ".epub" suffix. * Tje content files live in a parallel directory * <base>/<year>/Content/ * The XML file represented by the current cu would be something like: * <base>/<year>/DataFeed/EBSCOhostGKB_20160205_DELTA.zip!/EBSCOhostGKB_20160205_DELTA.xml * and the pdf would be * <base>/<year>/Content/123456.pdf */ @Override protected List<String> getFilenamesAssociatedWithRecord( SourceXmlSchemaHelper helper, CachedUrl cu, ArticleMetadata oneAM) { // this has been set to be the "ProductID" value String filenameValue = oneAM.getRaw(helper.getFilenameXPathKey()); String cuBase = FilenameUtils.getFullPath(cu.getUrl()); int datafeed_dir_start = cuBase.lastIndexOf("/DataFeed/"); // This will leave the "/", so just add back on the sibling_dir and filename String contentPath; if (datafeed_dir_start < 0) { // can't return null because that would make it okay to emit // this will fail to emit, as it should - we don't know how to verify the PDF existence log.siteWarning("The XML file lives at an unexpected location: " + cuBase); contentPath = CONTENT_DIR; // invalid but will force failure } else { contentPath = cuBase.substring(0, datafeed_dir_start) + CONTENT_DIR; } List<String> returnList = new ArrayList<String>(); returnList.add(contentPath + filenameValue + ".pdf"); returnList.add(contentPath + filenameValue + ".epub"); return returnList; }
/** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractGoodRisContent() throws Exception { String goodContent = createGoodRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME)); assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE)); assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE)); assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE)); assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN)); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth, actAuthIter.next()); } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI)); // This shouldn't get set. It will default later to fuill_text_cu assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL)); }
// override this to do some additional attempts to get valid data before emitting
@Override
public void extract(MetadataTarget target, CachedUrl cu, FileMetadataExtractor.Emitter emitter)
    throws IOException, PluginException {
  // Extract and cook first; all the checks below operate on the cooked record.
  ArticleMetadata am = extract(target, cu);
  /*
   * if, due to overcrawl, we got to a page that didn't have anything
   * valid, eg "this page not found" html page
   * don't emit empty metadata (because defaults would get put in
   * Must do this after cooking, because it checks size of cooked info
   */
  if (am.isEmpty()) {
    return;
  }
  /*
   * RIS data can be variable. We don't have any way to add priority to
   * the cooking of data, so fallback to alternate values manually
   *
   * There are differences between books and journals, so fork for titles
   * and metadata check
   */
  if (am.get(MetadataField.FIELD_DATE) == null) {
    if (am.getRaw("Y1") != null) { // if DA wasn't there, use Y1
      am.put(MetadataField.FIELD_DATE, am.getRaw("Y1"));
    }
  }
  /*
   * Determine if this is a book item or a journal item.
   * set the appropriate article type once the daemon passes along the TY
   */
  String ris_type = am.getRaw("TY");
  if (ris_type == null) {
    // pre 1.69, do an alternate check because TY wasn't passed through
    ris_type = "JOUR"; // set a default
    if (am.get(MetadataField.FIELD_ISBN) != null) {
      // it is a bad value, but it was recognized as an isbn because of TY type
      ris_type = "BOOK"; // it could be a chapter but until TY is passed through...
    }
  }
  // Modify or try alternate RIS tag values based after cooking
  postCookProcess(cu, am, ris_type);
  // Only emit if this item is likely to be from this AU
  // protect against counting overcrawled articles by checking against
  // values from the TDB file - differentiate between book items and journal itesm
  ArchivalUnit au = cu.getArchivalUnit();
  if (ris_type.contains("BOOK") || ris_type.contains("CHAP")) {
    if (!BaseAtyponMetadataUtil.metadataMatchesBookTdb(au, am)) {
      return;
    }
  } else {
    // JOURNAL default is to assume it's a journal for backwards compatibility
    if (!BaseAtyponMetadataUtil.metadataMatchesTdb(au, am)) {
      return;
    }
  }
  /*
   * Fill in DOI, publisher, other information available from
   * the URL or TDB
   * CORRECT the access.url if it is not in the AU
   */
  BaseAtyponMetadataUtil.completeMetadata(cu, am);
  emitter.emitMetadata(cu, am);
}
/*
 * isolate the modifications done on the AM after the initial extraction
 * in order to allow child plugins to do override this and do
 * additional work before calling the pre-emit checking...
 * cu - the CachedUrl being extracted (unused here; available for overrides)
 * ArticleMetadata - passed in information from extract/cook
 * ris_type - the TY value or its inferred type (basically, book or journal)
 */
protected void postCookProcess(CachedUrl cu, ArticleMetadata am, String ris_type) {
  /*
   * RIS data can be variable. We don't have any way to add priority to
   * the cooking of data, so fallback to alternate values manually
   */
  if (am.get(MetadataField.FIELD_DATE) == null) {
    if (am.getRaw("Y1") != null) { // if DA wasn't there, use Y1
      am.put(MetadataField.FIELD_DATE, am.getRaw("Y1"));
    }
  }
  /*
   * There are differences between books and journals, so fork for titles
   * and metadata check
   */
  if (ris_type.contains("BOOK") || ris_type.contains("CHAP")) {
    // BOOK in some form
    // T1 is the primary title - of the chapter for a book chapter, or book for a complete book
    // T2 is the next title up - of the book for a chapter, of the series for a book
    // T3 is the uppermost - of the series for a chapter
    // sometimes they use TI instead of T1...
    if (am.get(MetadataField.FIELD_ARTICLE_TITLE) == null) {
      if (am.getRaw("TI") != null) { // if T1 wasn't there, use TI
        am.put(MetadataField.FIELD_ARTICLE_TITLE, am.getRaw("TI"));
      }
    }
    if (ris_type.contains("CHAP")) {
      // just one chapter - set the article type correctly
      am.put(MetadataField.FIELD_ARTICLE_TYPE, MetadataField.ARTICLE_TYPE_BOOKCHAPTER);
      if ((am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) && (am.getRaw("T2") != null)) {
        // for a chapter, T2 is the name of the containing book
        am.put(MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("T2"));
      }
      if ((am.get(MetadataField.FIELD_SERIES_TITLE) == null) && (am.getRaw("T3") != null)) {
        // for a chapter, T3 is the name of the series
        am.put(MetadataField.FIELD_SERIES_TITLE, am.getRaw("T3"));
      }
    } else {
      // We're a full book volume - articletitle = publicationtitle
      am.put(MetadataField.FIELD_ARTICLE_TYPE, MetadataField.ARTICLE_TYPE_BOOKVOLUME);
      if (am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) {
        // the publication and the article titles are just the name of the book
        am.put(
            MetadataField.FIELD_PUBLICATION_TITLE, am.get(MetadataField.FIELD_ARTICLE_TITLE));
      }
      // series title can be from T2
      if ((am.get(MetadataField.FIELD_SERIES_TITLE) == null) && (am.getRaw("T2") != null)) {
        am.put(MetadataField.FIELD_SERIES_TITLE, am.getRaw("T2"));
      }
    }
  } else {
    // JOURNAL default is to assume it's a journal for backwards compatibility
    if (am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) {
      if (am.getRaw("T2") != null) {
        am.put(MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("T2"));
      } else if (am.getRaw("JO") != null) {
        am.put(
            MetadataField.FIELD_PUBLICATION_TITLE,
            am.getRaw("JO")); // might be unabbreviated version
      }
    }
  }
}
/*
 * When testing a complete extraction out of the tarset, the MD record will be
 * completely filled in and pdf-existence will get established
 */
private void validateCompleteMetadataRecord(ArticleMetadata am) {
  log.debug3("valideCompleteMetadatRecord");
  String doi_val = am.get(MetadataField.FIELD_DOI);
  /* make sure we can pick up both types of xml article data */
  log.debug3("doi val is: " + doi_val);
  String dtdMetadata = am.getRaw(ElsevierDatasetXmlSchemaHelper.dataset_dtd_metadata);
  if ("JA 5.2.0 SIMPLE-ARTICLE".equals(dtdMetadata)) {
    log.debug3("simple-article");
    assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
  } else {
    assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
  }
  assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN));
  // Per-DOI expectations are keyed off the maps built by the test fixture.
  assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
  assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE));
  assertEquals(accessUrlMap.get(doi_val), am.get(MetadataField.FIELD_ACCESS_URL));
  assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
  assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
  assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE));
  assertEquals("Elsevier", am.get(MetadataField.FIELD_PROVIDER));
  assertEquals("Elsevier", am.get(MetadataField.FIELD_PUBLISHER));
  log.debug3(am.ppString(2));
}