/* * The filename is the ProducID with either ".pdf" or ".epub" suffix. * Tje content files live in a parallel directory * <base>/<year>/Content/ * The XML file represented by the current cu would be something like: * <base>/<year>/DataFeed/EBSCOhostGKB_20160205_DELTA.zip!/EBSCOhostGKB_20160205_DELTA.xml * and the pdf would be * <base>/<year>/Content/123456.pdf */ @Override protected List<String> getFilenamesAssociatedWithRecord( SourceXmlSchemaHelper helper, CachedUrl cu, ArticleMetadata oneAM) { // this has been set to be the "ProductID" value String filenameValue = oneAM.getRaw(helper.getFilenameXPathKey()); String cuBase = FilenameUtils.getFullPath(cu.getUrl()); int datafeed_dir_start = cuBase.lastIndexOf("/DataFeed/"); // This will leave the "/", so just add back on the sibling_dir and filename String contentPath; if (datafeed_dir_start < 0) { // can't return null because that would make it okay to emit // this will fail to emit, as it should - we don't know how to verify the PDF existence log.siteWarning("The XML file lives at an unexpected location: " + cuBase); contentPath = CONTENT_DIR; // invalid but will force failure } else { contentPath = cuBase.substring(0, datafeed_dir_start) + CONTENT_DIR; } List<String> returnList = new ArrayList<String>(); returnList.add(contentPath + filenameValue + ".pdf"); returnList.add(contentPath + filenameValue + ".epub"); return returnList; }
// override this to do some additional attempts to get valid data before emitting @Override public void extract(MetadataTarget target, CachedUrl cu, FileMetadataExtractor.Emitter emitter) throws IOException, PluginException { ArticleMetadata am = extract(target, cu); /* * if, due to overcrawl, we got to a page that didn't have anything * valid, eg "this page not found" html page * don't emit empty metadata (because defaults would get put in * Must do this after cooking, because it checks size of cooked info */ if (am.isEmpty()) { return; } /* * RIS data can be variable. We don't have any way to add priority to * the cooking of data, so fallback to alternate values manually * * There are differences between books and journals, so fork for titles * and metadata check */ if (am.get(MetadataField.FIELD_DATE) == null) { if (am.getRaw("Y1") != null) { // if DA wasn't there, use Y1 am.put(MetadataField.FIELD_DATE, am.getRaw("Y1")); } } /* * Determine if this is a book item or a journal item. * set the appropriate article type once the daemon passes along the TY */ String ris_type = am.getRaw("TY"); if (ris_type == null) { // pre 1.69, do an alternate check because TY wasn't passed through ris_type = "JOUR"; // set a default if (am.get(MetadataField.FIELD_ISBN) != null) { // it is a bad value, but it was recognized as an isbn because of TY type ris_type = "BOOK"; // it could be a chapter but until TY is passed through... } } // Modify or try alternate RIS tag values based after cooking postCookProcess(cu, am, ris_type); // Only emit if this item is likely to be from this AU // protect against counting overcrawled articles by checking against // values from the TDB file - differentiate between book items and journal itesm ArchivalUnit au = cu.getArchivalUnit(); if (ris_type.contains("BOOK") || ris_type.contains("CHAP")) { if (!BaseAtyponMetadataUtil.metadataMatchesBookTdb(au, am)) { return; } } else { // JOURNAL default is to assume it's a journal for backwards compatibility if (!BaseAtyponMetadataUtil.metadataMatchesTdb(au, am)) { return; } } /* * Fill in DOI, publisher, other information available from * the URL or TDB * CORRECT the access.url if it is not in the AU */ BaseAtyponMetadataUtil.completeMetadata(cu, am); emitter.emitMetadata(cu, am); }
/* * isolate the modifications done on the AM after the initial extraction * in order to allow child plugins to do override this and do * additional work before calling the pre-emit checking... * ArticleMetadata - passed in information from extract/cook * ris_type - the TY value or its inferred type (basically, book or journal) */ protected void postCookProcess(CachedUrl cu, ArticleMetadata am, String ris_type) { /* * RIS data can be variable. We don't have any way to add priority to * the cooking of data, so fallback to alternate values manually */ if (am.get(MetadataField.FIELD_DATE) == null) { if (am.getRaw("Y1") != null) { // if DA wasn't there, use Y1 am.put(MetadataField.FIELD_DATE, am.getRaw("Y1")); } } /* * There are differences between books and journals, so fork for titles * and metadata check */ if (ris_type.contains("BOOK") || ris_type.contains("CHAP")) { // BOOK in some form // T1 is the primary title - of the chapter for a book chapter, or book for a complete book // T2 is the next title up - of the book for a chapter, of the series for a book // T3 is the uppermost - of the series for a chapter // sometimes they use TI instead of T1... if (am.get(MetadataField.FIELD_ARTICLE_TITLE) == null) { if (am.getRaw("TI") != null) { // if T1 wasn't there, use TI am.put(MetadataField.FIELD_ARTICLE_TITLE, am.getRaw("TI")); } } if (ris_type.contains("CHAP")) { // just one chapter - set the article type correctly am.put(MetadataField.FIELD_ARTICLE_TYPE, MetadataField.ARTICLE_TYPE_BOOKCHAPTER); if ((am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) && (am.getRaw("T2") != null)) { // the publication and the article titles are just the name of the book am.put(MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("T2")); } if ((am.get(MetadataField.FIELD_SERIES_TITLE) == null) && (am.getRaw("T3") != null)) { // the publication and the article titles are just the name of the book am.put(MetadataField.FIELD_SERIES_TITLE, am.getRaw("T3")); } } else { // We're a full book volume - articletitle = publicationtitle am.put(MetadataField.FIELD_ARTICLE_TYPE, MetadataField.ARTICLE_TYPE_BOOKVOLUME); if (am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) { // the publication and the article titles are just the name of the book am.put( MetadataField.FIELD_PUBLICATION_TITLE, am.get(MetadataField.FIELD_ARTICLE_TITLE)); } // series title can be from T2 if ((am.get(MetadataField.FIELD_SERIES_TITLE) == null) && (am.getRaw("T2") != null)) { // the publication and the article titles are just the name of the book am.put(MetadataField.FIELD_SERIES_TITLE, am.getRaw("T2")); } } } else { // JOURNAL default is to assume it's a journal for backwards compatibility if (am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) { if (am.getRaw("T2") != null) { am.put(MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("T2")); } else if (am.getRaw("JO") != null) { am.put( MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("JO")); // might be unabbreviated version } } } }