/**
 * Extracts HTML meta-tag metadata from {@code cu}, cooks it with {@code tagMap}, settles on an
 * access URL and emits the result.
 *
 * <p>The cooked access URL is kept only when it is non-empty and the AU reports content for it;
 * otherwise the URL of {@code cu} itself is used. Whichever URL is chosen is passed through
 * {@code HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl} before being stored.
 *
 * @param target the metadata target, handed to the underlying meta-tag extractor
 * @param cu the cached URL whose content is being examined
 * @param emitter receives the single cooked metadata record
 * @throws IOException if the underlying extraction fails
 */
@Override
public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException {
  ArticleMetadata metadata = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
  metadata.cook(tagMap);

  String cookedUrl = metadata.get(MetadataField.FIELD_ACCESS_URL);
  ArchivalUnit au = cu.getArchivalUnit();

  // Short-circuit order matters: only ask the AU about URLs that are actually present.
  boolean cookedUrlUsable =
      cookedUrl != null && !cookedUrl.isEmpty() && au.makeCachedUrl(cookedUrl).hasContent();
  String accessUrl = cookedUrlUsable ? cookedUrl : cu.getUrl();

  String normalizedUrl = HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, accessUrl);
  metadata.replace(MetadataField.FIELD_ACCESS_URL, normalizedUrl);
  emitter.emitMetadata(cu, metadata);
}
/**
 * Runs the superclass extraction, cooks the result with {@code tagMap}, and makes sure the
 * access URL refers to something the AU has content for.
 *
 * <p>If the cooked access URL is missing, empty, or has no content in the AU, it is replaced by
 * the URL of {@code cu}.
 *
 * @param target the metadata target, forwarded to the superclass
 * @param cu the cached URL whose content is being examined
 * @return the cooked metadata, with its access URL possibly replaced
 * @throws IOException if the superclass extraction fails
 */
@Override
public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
  ArticleMetadata cooked = super.extract(target, cu);
  cooked.cook(tagMap);

  String declaredUrl = cooked.get(MetadataField.FIELD_ACCESS_URL);

  // The AU is consulted only when a non-empty URL was actually declared.
  boolean declaredUrlHasContent =
      declaredUrl != null
          && !declaredUrl.isEmpty()
          && cu.getArchivalUnit().makeCachedUrl(declaredUrl).hasContent();

  if (!declaredUrlHasContent) {
    cooked.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
  }
  return cooked;
}
// override this to do some additional attempts to get valid data before emitting @Override public void extract(MetadataTarget target, CachedUrl cu, FileMetadataExtractor.Emitter emitter) throws IOException, PluginException { ArticleMetadata am = extract(target, cu); /* * if, due to overcrawl, we got to a page that didn't have anything * valid, eg "this page not found" html page * don't emit empty metadata (because defaults would get put in * Must do this after cooking, because it checks size of cooked info */ if (am.isEmpty()) { return; } /* * RIS data can be variable. We don't have any way to add priority to * the cooking of data, so fallback to alternate values manually * * There are differences between books and journals, so fork for titles * and metadata check */ if (am.get(MetadataField.FIELD_DATE) == null) { if (am.getRaw("Y1") != null) { // if DA wasn't there, use Y1 am.put(MetadataField.FIELD_DATE, am.getRaw("Y1")); } } /* * Determine if this is a book item or a journal item. * set the appropriate article type once the daemon passes along the TY */ String ris_type = am.getRaw("TY"); if (ris_type == null) { // pre 1.69, do an alternate check because TY wasn't passed through ris_type = "JOUR"; // set a default if (am.get(MetadataField.FIELD_ISBN) != null) { // it is a bad value, but it was recognized as an isbn because of TY type ris_type = "BOOK"; // it could be a chapter but until TY is passed through... 
} } // Modify or try alternate RIS tag values based after cooking postCookProcess(cu, am, ris_type); // Only emit if this item is likely to be from this AU // protect against counting overcrawled articles by checking against // values from the TDB file - differentiate between book items and journal itesm ArchivalUnit au = cu.getArchivalUnit(); if (ris_type.contains("BOOK") || ris_type.contains("CHAP")) { if (!BaseAtyponMetadataUtil.metadataMatchesBookTdb(au, am)) { return; } } else { // JOURNAL default is to assume it's a journal for backwards compatibility if (!BaseAtyponMetadataUtil.metadataMatchesTdb(au, am)) { return; } } /* * Fill in DOI, publisher, other information available from * the URL or TDB * CORRECT the access.url if it is not in the AU */ BaseAtyponMetadataUtil.completeMetadata(cu, am); emitter.emitMetadata(cu, am); }
/* * isolate the modifications done on the AM after the initial extraction * in order to allow child plugins to do override this and do * additional work before calling the pre-emit checking... * ArticleMetadata - passed in information from extract/cook * ris_type - the TY value or its inferred type (basically, book or journal) */ protected void postCookProcess(CachedUrl cu, ArticleMetadata am, String ris_type) { /* * RIS data can be variable. We don't have any way to add priority to * the cooking of data, so fallback to alternate values manually */ if (am.get(MetadataField.FIELD_DATE) == null) { if (am.getRaw("Y1") != null) { // if DA wasn't there, use Y1 am.put(MetadataField.FIELD_DATE, am.getRaw("Y1")); } } /* * There are differences between books and journals, so fork for titles * and metadata check */ if (ris_type.contains("BOOK") || ris_type.contains("CHAP")) { // BOOK in some form // T1 is the primary title - of the chapter for a book chapter, or book for a complete book // T2 is the next title up - of the book for a chapter, of the series for a book // T3 is the uppermost - of the series for a chapter // sometimes they use TI instead of T1... 
if (am.get(MetadataField.FIELD_ARTICLE_TITLE) == null) { if (am.getRaw("TI") != null) { // if T1 wasn't there, use TI am.put(MetadataField.FIELD_ARTICLE_TITLE, am.getRaw("TI")); } } if (ris_type.contains("CHAP")) { // just one chapter - set the article type correctly am.put(MetadataField.FIELD_ARTICLE_TYPE, MetadataField.ARTICLE_TYPE_BOOKCHAPTER); if ((am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) && (am.getRaw("T2") != null)) { // the publication and the article titles are just the name of the book am.put(MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("T2")); } if ((am.get(MetadataField.FIELD_SERIES_TITLE) == null) && (am.getRaw("T3") != null)) { // the publication and the article titles are just the name of the book am.put(MetadataField.FIELD_SERIES_TITLE, am.getRaw("T3")); } } else { // We're a full book volume - articletitle = publicationtitle am.put(MetadataField.FIELD_ARTICLE_TYPE, MetadataField.ARTICLE_TYPE_BOOKVOLUME); if (am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) { // the publication and the article titles are just the name of the book am.put( MetadataField.FIELD_PUBLICATION_TITLE, am.get(MetadataField.FIELD_ARTICLE_TITLE)); } // series title can be from T2 if ((am.get(MetadataField.FIELD_SERIES_TITLE) == null) && (am.getRaw("T2") != null)) { // the publication and the article titles are just the name of the book am.put(MetadataField.FIELD_SERIES_TITLE, am.getRaw("T2")); } } } else { // JOURNAL default is to assume it's a journal for backwards compatibility if (am.get(MetadataField.FIELD_PUBLICATION_TITLE) == null) { if (am.getRaw("T2") != null) { am.put(MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("T2")); } else if (am.getRaw("JO") != null) { am.put( MetadataField.FIELD_PUBLICATION_TITLE, am.getRaw("JO")); // might be unabbreviated version } } } }