protected void initFeatureVersions() throws PluginException.InvalidDefinition { if (definitionMap.containsKey(KEY_PLUGIN_FEATURE_VERSION_MAP)) { Map<Plugin.Feature, String> map = new HashMap<Plugin.Feature, String>(); Map<String, String> spec = (Map<String, String>) definitionMap.getMap(KEY_PLUGIN_FEATURE_VERSION_MAP); log.debug2("features: " + spec); for (Map.Entry<String, String> ent : spec.entrySet()) { try { // Prefix version string with feature name to create separate // namespace for each feature String key = ent.getKey(); map.put(Plugin.Feature.valueOf(key), key + "_" + ent.getValue()); } catch (RuntimeException e) { log.warning( getPluginName() + " set unknown feature: " + ent.getKey() + " to version " + ent.getValue(), e); throw new PluginException.InvalidDefinition("Unknown feature: " + ent.getKey(), e); } } featureVersion = map; } else { featureVersion = null; } }
void setConfig(Configuration config) { log.debug("config: " + config); proxyHost = config.get(PARAM_PROXY_HOST); proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT); if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { String http_proxy = System.getenv("http_proxy"); if (!StringUtil.isNullString(http_proxy)) { try { HostPortParser hpp = new HostPortParser(http_proxy); proxyHost = hpp.getHost(); proxyPort = hpp.getPort(); } catch (HostPortParser.InvalidSpec e) { log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e); } } } if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { proxyHost = null; } else {"Proxying through " + proxyHost + ":" + proxyPort); } userAgent = config.get(PARAM_USER_AGENT); if (StringUtil.isNullString(userAgent)) { userAgent = null; } else { log.debug("Setting User-Agent to " + userAgent); } }
/**
 * Collects an emitted metadata record into {@code amlst}.
 *
 * @param af the article files the record belongs to (only used for logging here)
 * @param md the record to collect; {@code null} is ignored
 */
public void emitMetadata(ArticleFiles af, ArticleMetadata md) {
  if (log.isDebug3()) {
    log.debug3("emit(" + af + ", " + md + ")");
  }
  if (md == null) {
    return;
  }
  log.debug3("add " + md + " to amlist");
  amlst.add(md);
}
public void testFunctionalFromTarHierarchy() throws Exception { log.debug3("in testFromTarHierarchy"); // load the tarballs InputStream file_input = null; try { file_input = getResourceAsStream(realTARFile_A); // UrlCacher uc = au.makeUrlCacher(TAR_A_BASE); // uc.storeContent(file_input, tarHeader); UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE)); uc.storeContent(); IOUtil.safeClose(file_input); file_input = getResourceAsStream(realTARFile_B); // uc = au.makeUrlCacher(TAR_B_BASE); // uc.storeContent(file_input, tarHeader); uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE)); uc.storeContent(); IOUtil.safeClose(file_input); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { IOUtil.safeClose(file_input); } CachedUrlSet cus = tarAu.getAuCachedUrlSet(); for (CachedUrl cu : cus.getCuIterable()) { log.debug3("AU - cu is: " + cu.getUrl()); cu.release(); } // We need to start from the level of the ArticleMetadataExtractor MyListEmitter emitter = new MyListEmitter(); ArticleMetadataExtractor amEx = new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA); Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any()); while (it.hasNext()) { ArticleFiles af =; log.debug3("Metadata test - articlefiles " + af.toString()); // CachedUrl cu = af.getFullTextCu(); CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA); log.debug3("metadata cu is " + cu.getUrl()); // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu); amEx.extract(MetadataTarget.Any(), af, emitter); List<ArticleMetadata> returnList = emitter.getAmList(); assertNotNull(returnList); log.debug3("size of returnList is " + returnList.size()); Iterator<ArticleMetadata> mdIt = returnList.iterator(); ArticleMetadata mdRecord = null; while (mdIt.hasNext()) { mdRecord = (ArticleMetadata); validateCompleteMetadataRecord(mdRecord); } } }
/**
 * Returns the default article MIME type for this plugin.
 *
 * @return the value stored under {@code KEY_DEFAULT_ARTICLE_MIME_TYPE} in the definition map,
 *     or the superclass's default when the definition does not supply one
 */
public String getDefaultArticleMimeType() {
  String declared = definitionMap.getString(KEY_DEFAULT_ARTICLE_MIME_TYPE, null);
  log.debug3("DefaultArticleMimeType " + declared);
  if (declared != null) {
    return declared;
  }
  String inherited = super.getDefaultArticleMimeType();
  log.debug3("DefaultArticleMimeType from super " + inherited);
  return inherited;
}
public void testArticleCountAndType() throws Exception { int expCount = 28; PluginTestUtil.crawlSimAu(sau); String pat1 = "branch(\\d+)/(\\d+file\\.html)"; String rep1 = "aps/journal/v123/n$1/full/$2"; PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1); String pat2 = "branch(\\d+)/(\\d+file\\.pdf)"; String rep2 = "aps/journal/v123/n$1/pdf/$2"; PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2); // Remove some URLs int deleted = 0; for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) { CachedUrlSetNode cusn = (CachedUrlSetNode); if (cusn instanceof CachedUrl) { CachedUrl cu = (CachedUrl) cusn; String url = cu.getUrl(); if (url.contains("/journal/") && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) { deleteBlock(cu); ++deleted; } } } assertEquals(8, deleted); Iterator<ArticleFiles> it = nau.getArticleIterator(); int count = 0; int countHtmlOnly = 0; int countPdfOnly = 0; while (it.hasNext()) { ArticleFiles af =;; CachedUrl cu = af.getFullTextCu(); String url = cu.getUrl(); assertNotNull(cu); String contentType = cu.getContentType(); log.debug("count " + count + " url " + url + " " + contentType); count++; if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) { ++countHtmlOnly; } if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) { ++countPdfOnly; } } log.debug("Article count is " + count); assertEquals(expCount, count); assertEquals(4, countHtmlOnly); assertEquals(4, countPdfOnly); }
/* * When testing no-pdf-check basic XML parsing, you will get partial MD records * depending on whether the info comes from dataset.xml or from main.xml */ private void validateDatasetMetadataRecord(ArticleMetadata am) { log.debug3("valideDatasetMetadatRecord"); String doi_val = am.get(MetadataField.FIELD_DOI); assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN)); log.debug3("doi val is: " + doi_val); // The dataset doesn't set this value, it'll fail over the main.xml value if (doi_val.equals("10.1016/S0140-1111(14)61865-1")) { assertEquals(null, am.get(MetadataField.FIELD_DATE)); } else { assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE)); } assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE)); }
/** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractGoodRisContent() throws Exception { String goodContent = createGoodRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME)); assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE)); assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE)); assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE)); assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN)); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth,; } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI)); // This shouldn't get set. It will default later to fuill_text_cu assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL)); }
/*
 * Validates a record extracted from a single main.xml. The DOI and the schema must
 * be passed in by the caller because those normally come from the dataset file.
 */
private void validateSingleMainMetadataRecord(ArticleMetadata am, String doi_val, String schema) {
  log.debug3("valideSingleMainMetadatRecord");

  String actualTitle = am.get(MetadataField.FIELD_ARTICLE_TITLE);
  if (!"simple-article".equals(schema)) {
    assertEquals(common_article_title, actualTitle);
  } else {
    assertEquals(common_simple_article_title, actualTitle);
  }

  log.debug3("doi val is: " + doi_val);
  // Cooked fields, looked up per DOI
  assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
  assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
  assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
  // Raw values taken straight from main.xml
  assertEquals("Comment", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_dochead));
  assertEquals(doi_val, am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_doi));
  assertEquals("2014", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_copyright));
}
private void deleteBlock(CachedUrl cu) throws IOException {"deleting " + cu.getUrl()); CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl()); ArchivalUnit au = cu.getArchivalUnit(); CachedUrlSet cus = au.makeCachedUrlSet(cuss); NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au); nm.deleteNode(cus); }
/**
 * Extracts metadata from the {@code simpleMain} XML content as if it were a main.xml inside
 * the first tarball, and validates the single resulting record.
 *
 * @throws Exception if extraction fails
 */
public void testSimpleMainXML() throws Exception {
  log.debug3("testSimpleMainXML");
  final String mainXmlUrl =
      TAR_A_BASE + SUBDIR + "01420615/v64sC/S0142061514004608/main.xml";

  List<ArticleMetadata> extracted =
      extractFromContent(mainXmlUrl, "text/xml", simpleMain, nocheck_mle, null);

  assertEquals(1, extracted.size());
  ArticleMetadata onlyRecord = extracted.get(0);
  validateSingleMainMetadataRecord(onlyRecord, "10.1016/j.jidx.2014.07.028", "article");
}
public class HighWireDrupalHtmlMetadataExtractorFactory implements FileMetadataExtractorFactory { private static final Logger log = Logger.getLogger(HighWireDrupalHtmlMetadataExtractorFactory.class); @Override public FileMetadataExtractor createFileMetadataExtractor( MetadataTarget target, String contentType) throws PluginException { return new HighWireDrupalHtmlMetadataExtractor(); } public static class HighWireDrupalHtmlMetadataExtractor implements FileMetadataExtractor { // Map HighWire HTML meta tag names to cooked metadata fields private static MultiMap tagMap = new MultiValueMap(); static { tagMap.put("DC.Format", MetadataField.FIELD_FORMAT); tagMap.put("DC.Language", MetadataField.FIELD_LANGUAGE); tagMap.put("citation_publisher", MetadataField.FIELD_PUBLISHER); tagMap.put("citation_journal_title", MetadataField.FIELD_PUBLICATION_TITLE); tagMap.put("citation_title", MetadataField.FIELD_ARTICLE_TITLE); tagMap.put("citation_date", MetadataField.FIELD_DATE); tagMap.put("citation_publication_date", MetadataField.FIELD_DATE); tagMap.put( "citation_authors", new MetadataField(MetadataField.FIELD_AUTHOR, MetadataField.splitAt(";"))); tagMap.put("citation_author", MetadataField.FIELD_AUTHOR); tagMap.put("citation_issn", MetadataField.FIELD_ISSN); tagMap.put("citation_volume", MetadataField.FIELD_VOLUME); tagMap.put("citation_issue", MetadataField.FIELD_ISSUE); tagMap.put("citation_firstpage", MetadataField.FIELD_START_PAGE); tagMap.put("citation_lastpage", MetadataField.FIELD_END_PAGE); tagMap.put("citation_doi", MetadataField.FIELD_DOI); tagMap.put("citation_public_url", MetadataField.FIELD_ACCESS_URL); // typical field value: "acupmed;30/1/8": extract "acupmed" tagMap.put( "citation_mjid", new MetadataField( MetadataField.FIELD_PROPRIETARY_IDENTIFIER, MetadataField.extract("^([^;]+);", 1))); } @Override public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException { ArticleMetadata am = new 
SimpleHtmlMetaTagMetadataExtractor().extract(target, cu); am.cook(tagMap); String url = am.get(MetadataField.FIELD_ACCESS_URL); ArchivalUnit au = cu.getArchivalUnit(); if (url == null || url.isEmpty() || !au.makeCachedUrl(url).hasContent()) { url = cu.getUrl(); } am.replace( MetadataField.FIELD_ACCESS_URL, HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, url)); emitter.emitMetadata(cu, am); } } }
/**
 * Runs the superclass extraction on {@code cu} and cooks the result with {@code tagMap}.
 *
 * @return the cooked metadata record
 * @throws IOException if the superclass extraction fails to read the content
 */
@Override
public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
  String cuUrl = cu.getUrl();
  log.debug3("Metadata - cachedurl cu:" + cuUrl);

  ArticleMetadata cooked = super.extract(target, cu);
  cooked.cook(tagMap);
  return cooked;
}
public class EbscoXmlMetadataExtractorFactory extends SourceXmlMetadataExtractorFactory { private static final Logger log = Logger.getLogger(EbscoXmlMetadataExtractorFactory.class); private static final String CONTENT_DIR = "/Content/"; private static SourceXmlSchemaHelper EbscoSchemaHelper = null; @Override public FileMetadataExtractor createFileMetadataExtractor( MetadataTarget target, String contentType) throws PluginException { return new EbscoSourceXmlMetadataExtractor(); } public class EbscoSourceXmlMetadataExtractor extends SourceXmlMetadataExtractor { @Override protected SourceXmlSchemaHelper setUpSchema(CachedUrl cu) { // Once you have it, just keep returning the same one. It won't change. if (EbscoSchemaHelper == null) { EbscoSchemaHelper = new EbscoSchemaHelper(); } return EbscoSchemaHelper; } /* * The filename is the ProducID with either ".pdf" or ".epub" suffix. * Tje content files live in a parallel directory * <base>/<year>/Content/ * The XML file represented by the current cu would be something like: * <base>/<year>/DataFeed/!/EBSCOhostGKB_20160205_DELTA.xml * and the pdf would be * <base>/<year>/Content/123456.pdf */ @Override protected List<String> getFilenamesAssociatedWithRecord( SourceXmlSchemaHelper helper, CachedUrl cu, ArticleMetadata oneAM) { // this has been set to be the "ProductID" value String filenameValue = oneAM.getRaw(helper.getFilenameXPathKey()); String cuBase = FilenameUtils.getFullPath(cu.getUrl()); int datafeed_dir_start = cuBase.lastIndexOf("/DataFeed/"); // This will leave the "/", so just add back on the sibling_dir and filename String contentPath; if (datafeed_dir_start < 0) { // can't return null because that would make it okay to emit // this will fail to emit, as it should - we don't know how to verify the PDF existence log.siteWarning("The XML file lives at an unexpected location: " + cuBase); contentPath = CONTENT_DIR; // invalid but will force failure } else { contentPath = cuBase.substring(0, datafeed_dir_start) + 
CONTENT_DIR; } List<String> returnList = new ArrayList<String>(); returnList.add(contentPath + filenameValue + ".pdf"); returnList.add(contentPath + filenameValue + ".epub"); return returnList; } } }
/**
 * Tests for {@code ElsevierXmlLinkExtractorFactory}: a dataset announcement with {@code <file>}
 * elements should yield one URL per file name, and one without them should yield none.
 */
public class TestElsevierXmlLinkExtractorFactory extends LinkExtractorTestCase {

  private static Logger logger = Logger.getLogger("TestElsevierXmlLinkExtractorFactory");

  String srcUrl = "";

  /** XML prologue and opening part of the announcement, shared by both fixtures. */
  private static final String DATASET_OPEN =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
          + "<!DOCTYPE dataset SYSTEM \"\">\n"
          + "<dataset identifier=\"OXM10160\" customer=\"OHL\""
          + " status=\"Announcement\""
          + " version=\"Network Dataset Announcement/Confirmation v1.0\">"
          + " <date year=\"2007\" month=\"May\" day=\"1\"/>\n";

  private static final String DATASET_CLOSE = "</dataset>\n";

  /** Announcement listing three files. */
  private static final String withLinks =
      DATASET_OPEN
          + "<file name=\"01407007.tar\" size=\"21780480\""
          + " md5=\"6c7266e0e246bf3e8cf1cd8b659a7a73\"/>\n"
          + "<file name=\"03064530.tar\" size=\"12748800\""
          + " md5=\"df9519d3075e164d22f5dd4988a693c3\"/>\n"
          + "<file name=\"dataset.toc\" size=\"2216587\""
          + " md5=\"cd21741eb91fa0fdfef2fa36485e21a0\"/>\n"
          + DATASET_CLOSE;

  /** Announcement listing no files. */
  private static final String withoutLinks = DATASET_OPEN + DATASET_CLOSE;

  /** File names expected to be extracted from {@code withLinks}. */
  private static final String[] links = {
    "01407007.tar", "03064530.tar", "dataset.toc",
  };

  public String getMimeType() {
    return "text/xml";
  }

  public LinkExtractorFactory getFactory() {
    return new ElsevierXmlLinkExtractorFactory();
  }

  /** Every listed file name, prefixed with {@code srcUrl}, must be extracted. */
  public void testFindCorrectEntries() throws Exception {
    Set<String> expected = new HashSet<String>();
    for (int i = 0; i < links.length; i++) {
      expected.add(srcUrl + links[i]);
    }
    assertEquals(expected, extractUrls(withLinks));
  }

  /** An announcement without file elements must produce no URLs. */
  public void testFindNoEntries() throws Exception {
    assertEmpty(extractUrls(withoutLinks));
  }
}
/* * OJS2HtmlMetadataExtractorFactory extracts metadata from each article. */ public class OJS2HtmlMetadataExtractorFactory implements FileMetadataExtractorFactory { static Logger log = Logger.getLogger(OJS2HtmlMetadataExtractorFactory.class); public FileMetadataExtractor createFileMetadataExtractor( MetadataTarget target, String contentType) throws PluginException { return new OJS2HtmlMetadataExtractor(); } // createFileMetadataExtractor public static class OJS2HtmlMetadataExtractor extends SimpleHtmlMetaTagMetadataExtractor { // Map OJS2-specific HTML meta tag names to cooked metadata fields private static MultiMap tagMap = new MultiValueMap(); static { tagMap.put("DC.Format", MetadataField.DC_FIELD_FORMAT); tagMap.put("DC.Language", MetadataField.DC_FIELD_LANGUAGE); tagMap.put("DC.Title", MetadataField.DC_FIELD_TITLE); tagMap.put("DC.Identifier", MetadataField.DC_FIELD_IDENTIFIER); tagMap.put("DC.Date", MetadataField.DC_FIELD_DATE); tagMap.put("DC.Publisher", MetadataField.DC_FIELD_PUBLISHER); tagMap.put("DC.Publisher", MetadataField.FIELD_PUBLISHER); tagMap.put("DC.Contributor", MetadataField.DC_FIELD_CONTRIBUTOR); tagMap.put("citation_journal_title", MetadataField.FIELD_PUBLICATION_TITLE); tagMap.put("citation_title", MetadataField.FIELD_ARTICLE_TITLE); tagMap.put("citation_date", MetadataField.FIELD_DATE); tagMap.put("citation_author", MetadataField.FIELD_AUTHOR); tagMap.put( "citation_authors", new MetadataField(MetadataField.FIELD_AUTHOR, MetadataField.splitAt(";"))); tagMap.put("citation_issn", MetadataField.FIELD_ISSN); tagMap.put("citation_volume", MetadataField.FIELD_VOLUME); tagMap.put("citation_volume", MetadataField.DC_FIELD_CITATION_VOLUME); tagMap.put("citation_issue", MetadataField.FIELD_ISSUE); tagMap.put("citation_issue", MetadataField.DC_FIELD_CITATION_ISSUE); tagMap.put("citation_firstpage", MetadataField.FIELD_START_PAGE); tagMap.put("citation_lastpage", MetadataField.FIELD_END_PAGE); tagMap.put("citation_doi", MetadataField.FIELD_DOI); 
tagMap.put("citation_public_url", MetadataField.FIELD_ACCESS_URL); } // static @Override public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException { log.debug3("Metadata - cachedurl cu:" + cu.getUrl()); ArticleMetadata am = super.extract(target, cu); am.cook(tagMap); return am; } // extract } // OJS2HtmlMetadataExtractor } // OJS2HtmlMetadataExtractorFactory
/**
 * Returns the plugin's permission checker factory, loading it on first use.
 *
 * <p>The class name is read from the definition map under
 * {@code DefinableArchivalUnit.KEY_AU_PERMISSION_CHECKER_FACTORY}. Once an instance has been
 * created it is kept in {@code permissionCheckerFact} and returned on later calls.
 *
 * @return the factory, or {@code null} if the definition names none
 */
protected PermissionCheckerFactory getPermissionCheckerFactory() {
  if (permissionCheckerFact != null) {
    return permissionCheckerFact;
  }

  String factoryClassName =
      definitionMap.getString(DefinableArchivalUnit.KEY_AU_PERMISSION_CHECKER_FACTORY, null);
  if (factoryClassName == null) {
    return permissionCheckerFact;
  }

  Object loaded = newAuxClass(factoryClassName, PermissionCheckerFactory.class);
  permissionCheckerFact = (PermissionCheckerFactory) loaded;
  log.debug2("Loaded PermissionCheckerFactory: " + permissionCheckerFact);
  return permissionCheckerFact;
}
/*
 * Validates a record from a complete extraction out of the tarset. In that case
 * the MD record is completely filled in and pdf-existence gets established.
 */
private void validateCompleteMetadataRecord(ArticleMetadata am) {
  log.debug3("valideCompleteMetadatRecord");
  String doi = am.get(MetadataField.FIELD_DOI);
  log.debug3("doi val is: " + doi);

  /* make sure we can pick up both types of xml article data */
  boolean simpleArticle =
      "JA 5.2.0 SIMPLE-ARTICLE"
          .equals(am.getRaw(ElsevierDatasetXmlSchemaHelper.dataset_dtd_metadata));
  if (!simpleArticle) {
    assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
  } else {
    log.debug3("simple-article");
    assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
  }

  assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN));

  // Values that differ per article, keyed by DOI
  assertEquals(authorMap.get(doi), am.getList(MetadataField.FIELD_AUTHOR));
  assertEquals(dateMap.get(doi), am.get(MetadataField.FIELD_DATE));
  assertEquals(accessUrlMap.get(doi), am.get(MetadataField.FIELD_ACCESS_URL));
  assertEquals(volMap.get(doi), am.get(MetadataField.FIELD_VOLUME));
  assertEquals(issueMap.get(doi), am.get(MetadataField.FIELD_ISSUE));
  assertEquals(pubTitleMap.get(doi), am.get(MetadataField.FIELD_PUBLICATION_TITLE));

  // Values that are the same for every record
  assertEquals("Elsevier", am.get(MetadataField.FIELD_PROVIDER));
  assertEquals("Elsevier", am.get(MetadataField.FIELD_PUBLISHER));

  log.debug3(am.ppString(2));
}
/**
 * Extracts metadata from the test dataset.xml resource with the no-pdf-check extractor and
 * validates each of the six expected records.
 *
 * @throws Exception if the resource cannot be read or extraction fails
 */
public void testSimpleDatasetXML() throws Exception {
  log.debug3("testSimpleDatasetXML");
  String file_input = StringUtil.fromInputStream(getResourceAsStream(testDatasetFile));
  String xml_url = TAR_A_BASE + SUBDIR + "dataset.xml";

  List<ArticleMetadata> mdList =
      extractFromContent(xml_url, "text/xml", file_input, nocheck_mle, null);
  assertEquals(6, mdList.size());

  // The original loop had lost the call that fetched the next record; iterating
  // the list directly also removes the need for a cast.
  for (ArticleMetadata mdRecord : mdList) {
    validateDatasetMetadataRecord(mdRecord);
  }
}
/**
 * Builds the filter rule the plugin definition declares for a content type.
 *
 * <p>The definition is looked up under the MIME type plus
 * {@code DefinableArchivalUnit.SUFFIX_FILTER_RULE}. A {@code String} value is treated as the
 * name of a {@code FilterRule} class to load; a non-empty {@code List} is wrapped in a
 * {@code DefinableFilterRule}. Anything else falls back to the superclass, which is given the
 * MIME type rather than the full content type.
 *
 * @param contentType the content type whose MIME type selects the rule
 * @return the rule from the definition, or whatever the superclass returns
 */
protected FilterRule constructFilterRule(String contentType) {
  String mimeType = HeaderUtil.getMimeTypeFromContentType(contentType);
  Object ruleSpec =
      definitionMap.getMapElement(mimeType + DefinableArchivalUnit.SUFFIX_FILTER_RULE);

  if (ruleSpec instanceof String) {
    log.debug("Loading filter " + ruleSpec);
    return (FilterRule) newAuxClass((String) ruleSpec, FilterRule.class);
  }

  if (ruleSpec instanceof List) {
    List ruleList = (List) ruleSpec;
    if (ruleList.size() > 0) {
      return new DefinableFilterRule(ruleList);
    }
  }

  return super.constructFilterRule(mimeType);
}
/** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractAlternateRisContent() throws Exception { String goodContent = createAlternateRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth,; } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); }
void checkParamAgreement(String key, PrintfContext context) { List<String> printfList = getElementList(key); if (printfList == null) { return; } for (String printf : printfList) { if (StringUtil.isNullString(printf)) { log.warning("Null printf string in " + key); continue; } PrintfUtil.PrintfData p_data = PrintfUtil.stringToPrintf(printf); Collection<String> p_args = p_data.getArguments(); for (String arg : p_args) { ConfigParamDescr descr = findAuConfigDescr(arg); if (descr == null) { throw new PluginException.InvalidDefinition( "Not a declared parameter: " + arg + " in " + printf + " in " + getPluginName()); } // ensure range and set params used only in legal context switch (context) { case Regexp: case Display: // everything is legal in a regexp or a display string break; case URL: // NUM_RANGE and SET legal because can enumerate. Can't // enumerate RANGE switch (descr.getType()) { case ConfigParamDescr.TYPE_RANGE: throw new PluginException.InvalidDefinition( "Range parameter (" + arg + ") used in illegal context in " + getPluginName() + ": " + key + ": " + printf); default: } } } } }
/**
 * If in testing mode FOO, copies every entry of the FOO override map (the element stored under
 * FOO plus {@code DefinableArchivalUnit.SUFFIX_OVERRIDE}) into the main map. Does nothing when
 * no testing mode is set or that element is missing or is not a {@code Map}.
 *
 * @param map the definition map to read the overrides from and apply them to
 */
void processOverrides(TypedEntryMap map) {
  String mode = getTestingMode();
  if (StringUtil.isNullString(mode)) {
    return;
  }

  Object candidate = map.getMapElement(mode + DefinableArchivalUnit.SUFFIX_OVERRIDE);
  if (!(candidate instanceof Map)) {
    // Covers both "no override element" and "element of the wrong type".
    return;
  }

  Map overrides = (Map) candidate;
  for (Object element : overrides.entrySet()) {
    Map.Entry override = (Map.Entry) element;
    String key = (String) override.getKey();
    Object val = override.getValue();
    log.debug(getDefaultPluginName() + ": Overriding " + key + " with " + val);
    map.setMapElement(key, val);
  }
}
/* * This is comlicated. MOST AUs have articles that live below and issue level TOC * that is, * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata * (eg Economist Voice V1) * BUT * in some AUs there are issues with only 1 article, in which case * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata * (eg Rhodes Cook V4) * and a few AUs with a mixture * (eg Forum for Health Economics V5) * So to identify ALL articles, we'll also have to capture issue level items and then look * at the html and if it has article metadata in it, count it as an article. * */ @Override protected ArticleFiles createArticleFiles(CachedUrl cu) { String url = cu.getUrl(); Matcher mat = pattern.matcher(url); if (mat.find()) { // we matched, but could this pattern potentially be a toc? Matcher tocmat = TOC_pattern.matcher(url); // if we could be a TOC then we must have metadata to be considered an article if (tocmat.find()) { if (hasArticleMetadata(cu)) { return processUrl(cu, mat); } } else { // we're not a potential TOC, so treat this as an article without checking return processUrl(cu, mat); } return null; // this was a TOC, not an article } log.warning("Mismatch between article iterator factory and article iterator: " + url); return null; }
/** * One of the articles used to get the html source for this plugin is: * */ public class IOPScienceHtmlMetadataExtractorFactory implements FileMetadataExtractorFactory { static Logger log = Logger.getLogger(IOPScienceHtmlMetadataExtractorFactory.class); public static MetadataField IOP_ACCESS_URL = new MetadataField( MetadataField.KEY_ACCESS_URL, Cardinality.Single, new Validator() { public String validate(ArticleMetadata am, MetadataField field, String val) throws MetadataException.ValidationException { // trim trailing '/' from urls like // if ((val != null) && !val.isEmpty() && (val.endsWith("/"))) { val = val.substring(0, val.length() - 1); } return val; } }); public FileMetadataExtractor createFileMetadataExtractor( MetadataTarget target, String contentType) throws PluginException { return new IOPScienceHtmlMetadataExtractor(); } public static class IOPScienceHtmlMetadataExtractor extends SimpleHtmlMetaTagMetadataExtractor { private static MultiMap tagMap = new MultiValueMap(); static { // <meta name="citation_doi" content="10.1088/2043-6262/1/4/043003" /> tagMap.put("citation_doi", MetadataField.FIELD_DOI); // <meta name="citation_publication_date" content="2011-01-25" /> tagMap.put("citation_publication_date", MetadataField.FIELD_DATE); // <meta name="citation_title" content="Polymer materials with spatially..." 
/> tagMap.put("citation_title", MetadataField.FIELD_ARTICLE_TITLE); // <meta name="citation_issn" content="2043-6262"/> tagMap.put("citation_issn", MetadataField.FIELD_ISSN); // <meta name="citation_volume" content="1" /> tagMap.put("citation_volume", MetadataField.FIELD_VOLUME); // <meta name="citation_issue" content="4"/> tagMap.put("citation_issue", MetadataField.FIELD_ISSUE); // <meta name="citation_firstpage" content="043003"/> tagMap.put("citation_firstpage", MetadataField.FIELD_START_PAGE); // <meta name="citation_author" content="Daisuke Fujiki"/> tagMap.put("citation_author", MetadataField.FIELD_AUTHOR); // <meta name="citation_journal_title" content="Advances in Natural Sciences: // Nanoscience and Nanotechnology" /> tagMap.put("citation_journal_title", MetadataField.FIELD_PUBLICATION_TITLE); // <meta name="citation_publisher" content="IOP Publishing" /> tagMap.put("citation_publisher", MetadataField.FIELD_PUBLISHER); // XXX this map is so that the metadata url is not always the access_url // <meta name="citation_fulltext_html_url" content=" tagMap.put("citation_fulltext_html_url", IOP_ACCESS_URL); } @Override public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException { ArticleMetadata am = super.extract(target, cu); am.cook(tagMap); String url = am.get(MetadataField.FIELD_ACCESS_URL); if (url != null && !url.isEmpty()) { CachedUrl val = cu.getArchivalUnit().makeCachedUrl(url); if (!val.hasContent()) { am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl()); } } else { am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl()); } return am; } } }
public class TestBaseAtyponMetadataExtractor extends LockssTestCase { static Logger log = Logger.getLogger("TestBaseAtyponMetadataExtractor"); private MockLockssDaemon theDaemon; private ArchivalUnit bau; private ArchivalUnit bau1; private static String PLUGIN_NAME = "org.lockss.plugin.atypon.BaseAtyponPlugin"; static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey(); private static String BASE_URL = ""; // the metadata that should be extracted static String goodDate = "2012-07-05"; static String[] goodAuthors = new String[] {"D. Author", "S. Author2"}; static String goodFormat = "text/HTML"; static String goodTitle = "Title of Article"; static String goodType = "research-article"; static String goodPublisher = "Base Atypon"; static String goodPublishingPlatform = "Atypon"; static String goodDOI = "10.1137/10081839X"; static String goodJID = "xxx"; static String goodJournal = "Journal Name"; static String goodStartPage = "22"; static String goodEndPage = "44"; static String goodVolume = "13"; static String goodIssue = "3"; static String goodIssn = "1540-3459"; static String doiURL = "" + goodDOI; private static final String ABS_URL = BASE_URL + "doi/abs/10.1175/2010WCAS1063.1"; private static final String RIS_URL = BASE_URL + "action/downloadCitation?doi=" + goodDOI + "&format=ris&include=cit"; public void setUp() throws Exception { super.setUp(); setUpDiskSpace(); // you need this to have startService work properly... 
theDaemon = getMockLockssDaemon(); theDaemon.getAlertManager(); theDaemon.getPluginManager().setLoadablePluginsReady(true); theDaemon.setDaemonInited(true); theDaemon.getPluginManager().startService(); theDaemon.getCrawlManager(); // in this directory this is file "test_baseatypon.tdb" but it becomes xml ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml")); Tdb tdb = ConfigManager.getCurrentConfig().getTdb(); TdbAu tdbau1 = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume).get(0); assertNotNull("Didn't find named TdbAu", tdbau1); bau1 = PluginTestUtil.createAndStartAu(tdbau1); assertNotNull(bau1); TypedEntryMap auConfig = bau1.getProperties(); assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY)); } public void tearDown() throws Exception { theDaemon.stopDaemon(); super.tearDown(); } /* * Test the functionality of the MetadataUtilities * */ public void testNormalizeTitleValue() throws Exception { assertEquals( BaseAtyponMetadataUtil.normalizeTitle("The title goes here"), BaseAtyponMetadataUtil.normalizeTitle("Title Goes Here")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces"), BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Normalize -- hyphen"), BaseAtyponMetadataUtil.normalizeTitle("normalize \u2013\u2013 hyphen")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Title and title"), BaseAtyponMetadataUtil.normalizeTitle("Title & title")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle(" leading spaces"), BaseAtyponMetadataUtil.normalizeTitle("leading spaces")); // now checking the fall-back last ditch attempt assertEquals( BaseAtyponMetadataUtil.generateRawTitle("leading spaces:colon?"), BaseAtyponMetadataUtil.generateRawTitle("leadingspacescolon")); assertEquals( BaseAtyponMetadataUtil.generateRawTitle("relapsing-remitting"), BaseAtyponMetadataUtil.generateRawTitle("relapsing?remitting")); assertEquals( 
BaseAtyponMetadataUtil.generateRawTitle("foo\"blah"), BaseAtyponMetadataUtil.generateRawTitle("foo-blah")); } /** * Configuration method. * * @return */ /* "<meta name="dc.Title" content="Title of Article"></meta> "<meta name="dc.Creator" content="D. Author"></meta> "<meta name="dc.Creator" content="S. Author2"></meta> "<meta name="dc.Subject" content="weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57"></meta> "<meta name="dc.Description" content="Long test summary of article, probably taken directly from the adstract..."></meta> "<meta name="dc.Publisher" content="Name of Publisher"></meta> "<meta name="dc.Date" scheme="WTN8601" content="2012-07-05"></meta> "<meta name="dc.Type" content="research-article"></meta> "<meta name="dc.Format" content="text/HTML"></meta> "<meta name="dc.Identifier" scheme="publisher" content="81839"></meta> "<meta name="dc.Identifier" scheme="doi" content="10.1137/10081839X"></meta> "<meta name="dc.Source" content=""></meta> "<meta name="dc.Language" content="en"></meta> "<meta name="dc.Coverage" content="world"></meta> "<meta name="keywords" content="weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57"></meta> */ // a chunk of html source code from the publisher's site from where the // metadata should be extracted String goodHtmlContent = "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>" + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>" + "<meta name=\"dc.Creator\" content=\"S. 
Author2\"></meta>" + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>" + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>" + "<meta name=\"dc.Publisher\" content=\"Base Atypon\"></meta>" + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>" + "<meta name=\"dc.Type\" content=\"research-article\"></meta>" + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"doi\" content=\"10.1137/10081839X\"></meta>" + "<meta name=\"dc.Source\" content=\"\"></meta>" + "<meta name=\"dc.Language\" content=\"en\"></meta>" + "<meta name=\"dc.Coverage\" content=\"world\"></meta>" + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>"; public void testExtractGoodHtmlContent() throws Exception { List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContent, true); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodTitle, md.get(MetadataField.DC_FIELD_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodFormat, md.get(MetadataField.DC_FIELD_FORMAT)); assertEquals(goodType, md.get(MetadataField.DC_FIELD_TYPE)); assertEquals(Arrays.asList(goodAuthors), md.getList(MetadataField.FIELD_AUTHOR)); assertEquals(goodAuthors[0], md.get(MetadataField.DC_FIELD_CREATOR)); } String goodHtmlContentNoDOIorPublisher = "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>" + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>" + "<meta name=\"dc.Creator\" content=\"S. 
Author2\"></meta>" + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>" + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>" + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>" + "<meta name=\"dc.Type\" content=\"research-article\"></meta>" + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>" + "<meta name=\"dc.Language\" content=\"en\"></meta>" + "<meta name=\"dc.Coverage\" content=\"world\"></meta>" + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>"; public void testDOIExtraction() throws Exception { List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContentNoDOIorPublisher, true); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); // gets pulled from the URL if not set in the metadata assertEquals("10.1175/2010WCAS1063.1", md.get(MetadataField.FIELD_DOI)); // gets set manually if not in the metadata // first it would try the TDB assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); } private String createGoodRisContent() { StringBuilder sb = new StringBuilder(); sb.append("TY - JOUR"); for (String auth : goodAuthors) { sb.append("\nA1 - "); sb.append(auth); } sb.append("\nDA - "); sb.append(goodDate); sb.append("\nJF - "); sb.append(goodJournal); sb.append("\nSP - "); sb.append(goodStartPage); sb.append("\nEP - "); sb.append(goodEndPage); sb.append("\nVL - "); sb.append(goodVolume); sb.append("\nIS - "); sb.append(goodIssue); sb.append("\nSN - "); sb.append(goodIssn); sb.append("\nT1 - "); sb.append(goodTitle); sb.append("\nPB - "); sb.append(goodPublisher); sb.append("\nDO - "); sb.append(goodDOI); sb.append("\nUR - 
"); sb.append(doiURL); sb.append("\nER -"); return sb.toString(); } /** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractGoodRisContent() throws Exception { String goodContent = createGoodRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME)); assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE)); assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE)); assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE)); assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN)); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth,; } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI)); // This shouldn't get set. 
It will default later to fuill_text_cu assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL)); } /* the extractor checks if data is missing it uses possible alternate RIS tags */ private String createAlternateRisContent() { StringBuilder sb = new StringBuilder(); sb.append("TY - JOUR"); for (String auth : goodAuthors) { sb.append("\nAU - "); sb.append(auth); } sb.append("\nY1 - "); sb.append(goodDate); sb.append("\nT2 - "); sb.append(goodJournal); sb.append("\nT1 - "); sb.append(goodTitle); sb.append("\nPB - "); sb.append(goodPublisher); sb.append("\nER -"); return sb.toString(); } /** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractAlternateRisContent() throws Exception { String goodContent = createAlternateRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth,; } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); } /* private support methods */ private List<ArticleMetadata> setupContentForAU( ArchivalUnit au, String url, String content, boolean isHtmlExtractor) throws IOException, PluginException { FileMetadataExtractor me; InputStream input = null; CIProperties props = null; if (isHtmlExtractor) { input = IOUtils.toInputStream(content, "utf-8"); props = getContentHtmlProperties(); me = new 
BaseAtyponHtmlMetadataExtractorFactory() .createFileMetadataExtractor(MetadataTarget.Any(), "text/html"); } else { input = IOUtils.toInputStream(content, "utf-8"); props = getContentRisProperties(); me = new BaseAtyponRisMetadataExtractorFactory() .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain"); } UrlData ud = new UrlData(input, props, url); UrlCacher uc = au.makeUrlCacher(ud); uc.storeContent(); CachedUrl cu = uc.getCachedUrl(); FileMetadataListExtractor mle = new FileMetadataListExtractor(me); return mle.extract(MetadataTarget.Any(), cu); } private CIProperties getContentHtmlProperties() { CIProperties cProps = new CIProperties(); // the CU checks the X-Lockss-content-type, not the content-type to determine encoding cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/html; charset=UTF-8"); cProps.put("Content-type", "text/html; charset=UTF-8"); return cProps; } private CIProperties getContentRisProperties() { CIProperties cProps = new CIProperties(); // the CU checks the X-Lockss-content-type, not the content-type to determine encoding cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/plain; charset=UTF-8"); cProps.put("Content-type", "text/plain; charset=UTF-8"); return cProps; } }
public class CrawlRuleTester extends Thread { protected static Logger log = Logger.getLogger(CrawlRuleTester.class); /** Proxy host */ public static final String PARAM_PROXY_HOST = Configuration.PREFIX + ""; /** Proxy port */ public static final String PARAM_PROXY_PORT = Configuration.PREFIX + "crawltest.proxy.port"; public static final int DEFAULT_PROXY_PORT = -1; /** User-Agent */ public static final String PARAM_USER_AGENT = Configuration.PREFIX + "crawltest.userAgent"; /* Message Types */ public static final int ERROR_MESSAGE = 0; public static final int WARNING_MESSAGE = 1; public static final int PLAIN_MESSAGE = 2; public static final int URL_SUMMARY_MESSAGE = 3; public static final int TEST_SUMMARY_MESSAGE = 4; private String m_baseUrl; private int m_crawlDepth; private long m_crawlDelay; private int m_curDepth; private ArchivalUnit m_au; private String m_outputFile = null; private BufferedWriter m_outWriter = null; private Deadline fetchDeadline =; private boolean useLocalWriter = true; private MessageHandler m_msgHandler; private LockssUrlConnectionPool connectionPool = new LockssUrlConnectionPool(); private String proxyHost; private String userAgent; private int proxyPort; // our storage for extracted urls private TreeSet m_extracted = new TreeSet(); private TreeSet m_incls = new TreeSet(); private TreeSet m_excls = new TreeSet(); private TreeSet m_reported = new TreeSet(); public CrawlRuleTester(int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { super("crawlrule tester"); m_crawlDepth = crawlDepth; long minFetchDelay = CurrentConfig.getLongParam( BaseArchivalUnit.PARAM_MIN_FETCH_DELAY, BaseArchivalUnit.DEFAULT_MIN_FETCH_DELAY); m_crawlDelay = Math.max(crawlDelay, minFetchDelay); m_baseUrl = baseUrl; m_au = au; } /** * RuleTest * * @param outFile String * @param crawlDepth int * @param crawlDelay long * @param baseUrl String * @param crawlSpec CrawlSpec */ public CrawlRuleTester( String outFile, int crawlDepth, long crawlDelay, String 
baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_outputFile = outFile; } /** * RuleTest * * @param outWriter BufferedWriter * @param crawlDepth int * @param crawlDelay long * @param baseUrl String * @param crawlSpec CrawlSpec */ public CrawlRuleTester( BufferedWriter outWriter, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_outWriter = outWriter; } /** * RuleTest * * @param msgHandler MessageHandler to take all output * @param crawlDepth the crawl depth to use * @param crawlDelay the type to wait between fetches * @param baseUrl the url to start from * @param crawlSpec a CrawlSpec to use for url checking. */ public CrawlRuleTester( MessageHandler msgHandler, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_msgHandler = msgHandler; } public void run() { try { setConfig(ConfigManager.getCurrentConfig()); if (m_outWriter == null && m_msgHandler == null) { useLocalWriter = true; } else { useLocalWriter = false; } if (useLocalWriter) { openOutputFile(); } checkRules(); if (useLocalWriter) { closeOutputFile(); } } finally { if (m_msgHandler != null) { m_msgHandler.close(); } } } void setConfig(Configuration config) { log.debug("config: " + config); proxyHost = config.get(PARAM_PROXY_HOST); proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT); if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { String http_proxy = System.getenv("http_proxy"); if (!StringUtil.isNullString(http_proxy)) { try { HostPortParser hpp = new HostPortParser(http_proxy); proxyHost = hpp.getHost(); proxyPort = hpp.getPort(); } catch (HostPortParser.InvalidSpec e) { log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e); } } } if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { proxyHost = null; } else {"Proxying through " + proxyHost + ":" + proxyPort); } userAgent = 
config.get(PARAM_USER_AGENT); if (StringUtil.isNullString(userAgent)) { userAgent = null; } else { log.debug("Setting User-Agent to " + userAgent); } } private void openOutputFile() { if (m_outputFile != null) { try { m_outWriter = new BufferedWriter(new FileWriter(m_outputFile, false)); return; } catch (Exception ex) { System.err.println("Error opening output file, writing to stdout: " + ex); } } m_outWriter = new BufferedWriter(new OutputStreamWriter(System.out)); } private void closeOutputFile() { try { if (m_outWriter != null) { m_outWriter.close(); } } catch (IOException ex) { System.err.println("Error closing output file."); } } int[] depth_incl; int[] depth_fetched; int[] depth_parsed; private void checkRules() { outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE); outputMessage( "crawl depth: " + m_crawlDepth + " crawl delay: " + m_crawlDelay + " ms.", PLAIN_MESSAGE); TreeSet crawlList = new TreeSet(); TreeSet fetched = new TreeSet(); // inialize with the baseUrl crawlList.add(m_baseUrl); depth_incl = new int[m_crawlDepth]; depth_fetched = new int[m_crawlDepth]; depth_parsed = new int[m_crawlDepth]; long start_time = TimeBase.nowMs(); for (int depth = 1; depth <= m_crawlDepth; depth++) { if (isInterrupted()) { return; } m_curDepth = depth; if (crawlList.isEmpty() && depth <= m_crawlDepth) { outputMessage("\nNothing left to crawl, exiting after depth " + (depth - 1), PLAIN_MESSAGE); break; } String[] urls = (String[]) crawlList.toArray(new String[0]); crawlList.clear(); outputMessage("\nDepth " + depth, PLAIN_MESSAGE); for (int ix = 0; ix < urls.length; ix++) { if (isInterrupted()) { return; } pauseBeforeFetch(); String urlstr = urls[ix]; m_incls.clear(); m_excls.clear(); // crawl the page buildUrlSets(urlstr); fetched.add(urlstr); // output incl/excl results, // add the new_incls to the crawlList for next crawl depth loop crawlList.addAll(outputUrlResults(urlstr, m_incls, m_excls)); } } long elapsed_time = TimeBase.nowMs() - start_time; 
outputSummary(m_baseUrl, fetched, crawlList, elapsed_time); } private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL:" + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } } private void pauseBeforeFetch() { if (!fetchDeadline.expired()) { try { fetchDeadline.sleep(); } catch (InterruptedException ie) { // no action } 
} fetchDeadline.expireIn(m_crawlDelay); } private void outputMessage(String msg, int msgType) { if (isInterrupted()) { return; } if (m_msgHandler != null) { m_msgHandler.outputMessage(msg + "\n", msgType); } else { try { m_outWriter.write(msg); m_outWriter.newLine(); } catch (Exception ex) { System.err.println(msg); } } } private void outputErrResults(String url, String errMsg) { outputMessage("Error: " + errMsg + " occured while processing " + url, ERROR_MESSAGE); } private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) { Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported)); Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported)); if (!m_inclset.isEmpty()) { outputMessage( "\nIncluded Urls: (" + new_incls.size() + " new, " + (m_inclset.size() - new_incls.size()) + " old)", URL_SUMMARY_MESSAGE); depth_incl[m_curDepth - 1] += new_incls.size(); } for (Iterator it = new_incls.iterator(); it.hasNext(); ) { outputMessage(, PLAIN_MESSAGE); } if (!m_exclset.isEmpty()) { outputMessage( "\nExcluded Urls: (" + new_excls.size() + " new, " + (m_exclset.size() - new_excls.size()) + " old)", URL_SUMMARY_MESSAGE); } for (Iterator it = new_excls.iterator(); it.hasNext(); ) { outputMessage(, PLAIN_MESSAGE); } m_reported.addAll(new_incls); m_reported.addAll(new_excls); if (m_outWriter != null) { try { m_outWriter.flush(); } catch (IOException ex) { } } return new_incls; } private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) { int fetchCount = fetched.size(); outputMessage( "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth, TEST_SUMMARY_MESSAGE); outputMessage( "\nUrls fetched: " + fetchCount + " Urls extracted: " + m_extracted.size(), PLAIN_MESSAGE); outputMessage("\nDepth Fetched Parsed New URLs", PLAIN_MESSAGE); for (int depth = 1; depth <= m_crawlDepth; depth++) { PrintfFormat pf = new PrintfFormat("%5d %7d %6d %8d"); Integer[] args = new Integer[] { new 
Integer(depth), new Integer(depth_fetched[depth - 1]), new Integer(depth_parsed[depth - 1]), new Integer(depth_incl[depth - 1]), }; String s = pf.sprintf(args); outputMessage(s, PLAIN_MESSAGE); } outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE); if (false) { for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) { String url = (String); outputMessage(url, PLAIN_MESSAGE); } } long secs = elapsedTime / Constants.SECOND; long fetchRate = 0; if (secs > 0) { fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime; } outputMessage( "\nElapsed Time: " + secs + " secs." + " Fetch Rate: " + fetchRate + " p/m", PLAIN_MESSAGE); } public interface MessageHandler { void outputMessage(String message, int messageType); void close(); } private class MyLinkExtractorCallback implements LinkExtractor.Callback { MyLinkExtractorCallback() {} public void foundLink(String url) { m_extracted.add(url); try { String normUrl = UrlUtil.normalizeUrl(url); if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) { m_incls.add(normUrl); } else { m_excls.add(normUrl); } } catch (MalformedURLException e) { m_excls.add(url); } } } class MyMockCachedUrl implements CachedUrl { private String url; private boolean doesExist = false; private Reader reader = null; public MyMockCachedUrl(String url, Reader reader) { this.url = url; this.reader = reader; } public ArchivalUnit getArchivalUnit() { throw new UnsupportedOperationException("Not implemented"); } public String getUrl() { return url; } public CachedUrl getCuVersion(int version) { throw new UnsupportedOperationException("Not implemented"); } public CachedUrl[] getCuVersions() { throw new UnsupportedOperationException("Not implemented"); } public CachedUrl[] getCuVersions(int maxVersions) { throw new UnsupportedOperationException("Not implemented"); } public int getVersion() { return 1; } public Reader openForReading() { return reader; } public LinkRewriterFactory getLinkRewriterFactory() { 
throw new UnsupportedOperationException("Not implemented"); } public String getEncoding() { return Constants.DEFAULT_ENCODING; } /** * getUnfilteredInputStream * * @return InputStream */ public InputStream getUnfilteredInputStream() { throw new UnsupportedOperationException("Not implemented"); } /** * getUnfilteredInputStream * * @return InputStream */ public InputStream getUnfilteredInputStream(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } public InputStream getUncompressedInputStream() { throw new UnsupportedOperationException("Not implemented"); } public InputStream getUncompressedInputStream(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } /** * openForHashing * * @return InputStream */ public InputStream openForHashing() { throw new UnsupportedOperationException("Not implemented"); } /** * openForHashing * * @param hasher HashedInputStream.Hasher for unfiltered content * @return InputStream */ public InputStream openForHashing(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } /** * getContentSize * * @return long */ public long getContentSize() { throw new UnsupportedOperationException("Not implemented"); } public String getContentType() { throw new UnsupportedOperationException("Not implemented"); } public void setOption(String option, String val) {} public boolean hasContent() { return doesExist; } public boolean isLeaf() { return true; } public int getType() { return CachedUrlSetNode.TYPE_CACHED_URL; } public CIProperties getProperties() { return null; } public void addProperty(String key, String value) {} public void release() {} public String toString() { StringBuffer sb = new StringBuffer(url.length() + 17); sb.append("[MyMockCachedUrl: "); sb.append(url); sb.append("]"); return sb.toString(); } @Override public FileMetadataExtractor getFileMetadataExtractor(MetadataTarget target) { return null; } public 
CachedUrl getArchiveMemberCu(ArchiveMemberSpec ams) { throw new UnsupportedOperationException("Not implemented"); } @Override public boolean isArchiveMember() { return false; } } }
public class TestNatureArticleIteratorFactory extends LockssTestCase { static Logger log = Logger.getLogger("TestNatureArticleIteratorFactory"); private SimulatedArchivalUnit sau; // Simulated AU to generate content private ArchivalUnit nau; // Nature AU private MockLockssDaemon theDaemon; private static final int DEFAULT_FILESIZE = 3000; private static int fileSize = DEFAULT_FILESIZE; private static String PLUGIN_NAME = "org.lockss.plugin.nature.ClockssNaturePublishingGroupPlugin"; private static String BASE_URL = ""; public void setUp() throws Exception { super.setUp(); String tempDirPath = getTempDir().getAbsolutePath() + File.separator; ConfigurationUtil.setFromArgs(LockssRepositoryImpl.PARAM_CACHE_LOCATION, tempDirPath); theDaemon = getMockLockssDaemon(); theDaemon.getAlertManager(); theDaemon.getPluginManager().setLoadablePluginsReady(true); theDaemon.setDaemonInited(true); theDaemon.getPluginManager().startService(); theDaemon.getCrawlManager(); sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath)); nau = PluginTestUtil.createAndStartAu(PLUGIN_NAME, natureAuConfig()); } public void tearDown() throws Exception { sau.deleteContentTree(); theDaemon.stopDaemon(); super.tearDown(); } Configuration simAuConfig(String rootPath) { Configuration conf = ConfigManager.newConfiguration(); conf.put("root", rootPath); conf.put("base_url", BASE_URL); conf.put("depth", "1"); conf.put("branch", "4"); conf.put("numFiles", "7"); conf.put( "fileTypes", "" + (SimulatedContentGenerator.FILE_TYPE_HTML | SimulatedContentGenerator.FILE_TYPE_PDF)); conf.put("binFileSize", "" + fileSize); return conf; } Configuration natureAuConfig() { Configuration conf = ConfigManager.newConfiguration(); conf.put("base_url", BASE_URL); conf.put("journal_id", "aps"); conf.put("volume_name", "123"); conf.put("year", "2008"); return conf; } public void testArticleCountAndType() throws Exception { int expCount = 28; PluginTestUtil.crawlSimAu(sau); String pat1 = 
"branch(\\d+)/(\\d+file\\.html)"; String rep1 = "aps/journal/v123/n$1/full/$2"; PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1); String pat2 = "branch(\\d+)/(\\d+file\\.pdf)"; String rep2 = "aps/journal/v123/n$1/pdf/$2"; PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2); // Remove some URLs int deleted = 0; for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) { CachedUrlSetNode cusn = (CachedUrlSetNode); if (cusn instanceof CachedUrl) { CachedUrl cu = (CachedUrl) cusn; String url = cu.getUrl(); if (url.contains("/journal/") && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) { deleteBlock(cu); ++deleted; } } } assertEquals(8, deleted); Iterator<ArticleFiles> it = nau.getArticleIterator(); int count = 0; int countHtmlOnly = 0; int countPdfOnly = 0; while (it.hasNext()) { ArticleFiles af =;; CachedUrl cu = af.getFullTextCu(); String url = cu.getUrl(); assertNotNull(cu); String contentType = cu.getContentType(); log.debug("count " + count + " url " + url + " " + contentType); count++; if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) { ++countHtmlOnly; } if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) { ++countPdfOnly; } } log.debug("Article count is " + count); assertEquals(expCount, count); assertEquals(4, countHtmlOnly); assertEquals(4, countPdfOnly); } // public void testArticleCountAndDefaultType() throws Exception { // testArticleCountAndType("text/html", true, 24); // } // // public void testArticleCountAndPdf() throws Exception { // testArticleCountAndType("application/pdf", false, 0); // } private void deleteBlock(CachedUrl cu) throws IOException {"deleting " + cu.getUrl()); CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl()); ArchivalUnit au = cu.getArchivalUnit(); CachedUrlSet cus = au.makeCachedUrlSet(cuss); NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au); nm.deleteNode(cus); } }
/** * DefinablePlugin: a plugin which uses the data stored in an ExternalizableMap to configure itself. * * @author Claire Griffin * @version 1.0 */ public class DefinablePlugin extends BasePlugin { // configuration map keys public static final String KEY_PLUGIN_IDENTIFIER = "plugin_identifier"; public static final String KEY_PLUGIN_NAME = "plugin_name"; public static final String KEY_PLUGIN_VERSION = "plugin_version"; public static final String KEY_PLUGIN_FEATURE_VERSION_MAP = "plugin_feature_version_map"; public static final String KEY_REQUIRED_DAEMON_VERSION = "required_daemon_version"; public static final String KEY_PUBLISHING_PLATFORM = "plugin_publishing_platform"; public static final String KEY_PLUGIN_CONFIG_PROPS = "plugin_config_props"; public static final String KEY_EXCEPTION_HANDLER = "plugin_cache_result_handler"; public static final String KEY_EXCEPTION_LIST = "plugin_cache_result_list"; public static final String KEY_PLUGIN_NOTES = "plugin_notes"; public static final String KEY_CRAWL_TYPE = "plugin_crawl_type"; public static final String KEY_FOLLOW_LINKS = "plugin_follow_link"; /** Message to be displayed when user configures an AU with this plugin */ public static final String KEY_PLUGIN_AU_CONFIG_USER_MSG = "plugin_au_config_user_msg"; public static final String KEY_PER_HOST_PERMISSION_PATH = "plugin_per_host_permission_path"; public static final String KEY_PLUGIN_PARENT = "plugin_parent"; public static final String KEY_PLUGIN_PARENT_VERSION = "plugin_parent_version"; public static final String KEY_PLUGIN_CRAWL_URL_COMPARATOR_FACTORY = "plugin_crawl_url_comparator_factory"; public static final String KEY_PLUGIN_FETCH_RATE_LIMITER_SOURCE = "plugin_fetch_rate_limiter_source"; public static final String KEY_PLUGIN_ARTICLE_ITERATOR_FACTORY = "plugin_article_iterator_factory"; public static final String KEY_PLUGIN_ARTICLE_METADATA_EXTRACTOR_FACTORY = "plugin_article_metadata_extractor_factory"; public static final String KEY_DEFAULT_ARTICLE_MIME_TYPE = 
"plugin_default_article_mime_type"; public static final String KEY_ARTICLE_ITERATOR_ROOT = "plugin_article_iterator_root"; public static final String KEY_ARTICLE_ITERATOR_PATTERN = "plugin_article_iterator_pattern"; public static final String DEFAULT_PLUGIN_VERSION = "1"; public static final String DEFAULT_REQUIRED_DAEMON_VERSION = "0.0.0"; public static final String MAP_SUFFIX = ".xml"; public static final String CRAWL_TYPE_HTML_LINKS = "HTML Links"; public static final String CRAWL_TYPE_OAI = "OAI"; public static final String[] CRAWL_TYPES = { CRAWL_TYPE_HTML_LINKS, CRAWL_TYPE_OAI, }; public static final String DEFAULT_CRAWL_TYPE = CRAWL_TYPE_HTML_LINKS; protected String mapName = null; static Logger log = Logger.getLogger("DefinablePlugin"); protected ExternalizableMap definitionMap = new ExternalizableMap(); protected CacheResultHandler resultHandler = null; protected List<String> loadedFromUrls; protected CrawlWindow crawlWindow; protected Map<Plugin.Feature, String> featureVersion; public void initPlugin(LockssDaemon daemon, String extMapName) throws FileNotFoundException { initPlugin(daemon, extMapName, this.getClass().getClassLoader()); } public void initPlugin(LockssDaemon daemon, String extMapName, ClassLoader loader) throws FileNotFoundException { // convert the plugin class name to an xml file name // load the configuration map from jar file ExternalizableMap defMap = loadMap(extMapName, loader); this.initPlugin(daemon, extMapName, defMap, loader); } public void initPlugin( LockssDaemon daemon, String extMapName, ExternalizableMap defMap, ClassLoader loader) { mapName = extMapName; this.classLoader = loader; this.definitionMap = defMap; super.initPlugin(daemon); initMimeMap(); initFeatureVersions(); initAuFeatureMap(); checkParamAgreement(); } private ExternalizableMap loadMap(String extMapName, ClassLoader loader) throws FileNotFoundException { String first = null; String next = extMapName; List<String> urls = new ArrayList<String>(); ExternalizableMap 
res = null; while (next != null) { // convert the plugin class name to an xml file name String mapFile = next.replace('.', '/') + MAP_SUFFIX; URL url = loader.getResource(mapFile); if (url != null && urls.contains(url.toString())) { throw new PluginException.InvalidDefinition("Plugin inheritance loop: " + next); } // load into map ExternalizableMap oneMap = new ExternalizableMap(); oneMap.loadMapFromResource(mapFile, loader); urls.add(url.toString()); // apply overrides one plugin at a time in inheritance chain processOverrides(oneMap); if (res == null) { res = oneMap; } else { for (Map.Entry ent : oneMap.entrySet()) { String key = (String) ent.getKey(); Object val = ent.getValue(); if (!res.containsKey(key)) { res.setMapElement(key, val); } } } if (oneMap.containsKey(KEY_PLUGIN_PARENT)) { next = oneMap.getString(KEY_PLUGIN_PARENT); } else { next = null; } } loadedFromUrls = urls; return res; } /** If in testing mode FOO, copy values from FOO_override map, if any, to main map */ void processOverrides(TypedEntryMap map) { String testMode = getTestingMode(); if (StringUtil.isNullString(testMode)) { return; } Object o = map.getMapElement(testMode + DefinableArchivalUnit.SUFFIX_OVERRIDE); if (o == null) { return; } if (o instanceof Map) { Map overrideMap = (Map) o; for (Map.Entry entry : (Set<Map.Entry>) overrideMap.entrySet()) { String key = (String) entry.getKey(); Object val = entry.getValue(); log.debug(getDefaultPluginName() + ": Overriding " + key + " with " + val); map.setMapElement(key, val); } } } String getTestingMode() { return theDaemon == null ? 
null : theDaemon.getTestingMode(); } // Used by tests public void initPlugin(LockssDaemon daemon, File file) throws PluginException { ExternalizableMap oneMap = new ExternalizableMap(); oneMap.loadMap(file); if (oneMap.getErrorString() != null) { throw new PluginException(oneMap.getErrorString()); } initPlugin(daemon, file.getPath(), oneMap, null); } void initPlugin(LockssDaemon daemon, ExternalizableMap defMap) { initPlugin(daemon, defMap, this.getClass().getClassLoader()); } void initPlugin(LockssDaemon daemon, ExternalizableMap defMap, ClassLoader loader) { initPlugin(daemon, "Internal", defMap, loader); } enum PrintfContext { Regexp, URL, Display }; void checkParamAgreement() { for (Map.Entry<String, PrintfContext> ent : DefinableArchivalUnit.printfKeysContext.entrySet()) { checkParamAgreement(ent.getKey(), ent.getValue()); } } void checkParamAgreement(String key, PrintfContext context) { List<String> printfList = getElementList(key); if (printfList == null) { return; } for (String printf : printfList) { if (StringUtil.isNullString(printf)) { log.warning("Null printf string in " + key); continue; } PrintfUtil.PrintfData p_data = PrintfUtil.stringToPrintf(printf); Collection<String> p_args = p_data.getArguments(); for (String arg : p_args) { ConfigParamDescr descr = findAuConfigDescr(arg); if (descr == null) { throw new PluginException.InvalidDefinition( "Not a declared parameter: " + arg + " in " + printf + " in " + getPluginName()); } // ensure range and set params used only in legal context switch (context) { case Regexp: case Display: // everything is legal in a regexp or a display string break; case URL: // NUM_RANGE and SET legal because can enumerate. 
Can't // enumerate RANGE switch (descr.getType()) { case ConfigParamDescr.TYPE_RANGE: throw new PluginException.InvalidDefinition( "Range parameter (" + arg + ") used in illegal context in " + getPluginName() + ": " + key + ": " + printf); default: } } } } } public List<String> getLoadedFromUrls() { return loadedFromUrls; } public String getPluginName() { if (definitionMap.containsKey(KEY_PLUGIN_NAME)) { return definitionMap.getString(KEY_PLUGIN_NAME); } else { return getDefaultPluginName(); } } protected String getDefaultPluginName() { return StringUtil.shortName(getPluginId()); } public String getVersion() { return definitionMap.getString(KEY_PLUGIN_VERSION, DEFAULT_PLUGIN_VERSION); } public String getFeatureVersion(Plugin.Feature feat) { if (featureVersion == null) { return null; } return featureVersion.get(feat); } public String getRequiredDaemonVersion() { return definitionMap.getString(KEY_REQUIRED_DAEMON_VERSION, DEFAULT_REQUIRED_DAEMON_VERSION); } public String getPublishingPlatform() { return definitionMap.getString(KEY_PUBLISHING_PLATFORM, null); } public String getPluginNotes() { return definitionMap.getString(KEY_PLUGIN_NOTES, null); } public String getDefaultArticleMimeType() { String ret = definitionMap.getString(KEY_DEFAULT_ARTICLE_MIME_TYPE, null); log.debug3("DefaultArticleMimeType " + ret); if (ret == null) { ret = super.getDefaultArticleMimeType(); log.debug3("DefaultArticleMimeType from super " + ret); } return ret; } public List<String> getElementList(String key) { Object element = definitionMap.getMapElement(key); List<String> lst; if (element instanceof String) { return Collections.singletonList((String) element); } else if (element instanceof List) { return (List) element; } else { return null; } } public List getLocalAuConfigDescrs() throws PluginException.InvalidDefinition { List auConfigDescrs = (List) definitionMap.getCollection(KEY_PLUGIN_CONFIG_PROPS, null); if (auConfigDescrs == null) { throw new 
PluginException.InvalidDefinition(mapName + " missing ConfigParamDescrs"); } return auConfigDescrs; } protected ArchivalUnit createAu0(Configuration auConfig) throws ArchivalUnit.ConfigurationException { DefinableArchivalUnit au = new DefinableArchivalUnit(this, definitionMap); au.setConfiguration(auConfig); return au; } public ExternalizableMap getDefinitionMap() { return definitionMap; } CacheResultHandler getCacheResultHandler() { return resultHandler; } String stripSuffix(String str, String suffix) { return str.substring(0, str.length() - suffix.length()); } protected void initMimeMap() throws PluginException.InvalidDefinition { for (Iterator iter = definitionMap.entrySet().iterator(); iter.hasNext(); ) { Map.Entry ent = (Map.Entry); String key = (String) ent.getKey(); Object val = ent.getValue(); if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY); if (val instanceof String) { String factName = (String) val; log.debug(mime + " link extractor: " + factName); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); LinkExtractorFactory fact = (LinkExtractorFactory) newAuxClass(factName, LinkExtractorFactory.class); mti.setLinkExtractorFactory(fact); } } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY)) { // XXX This clause must precede the one for SUFFIX_HASH_FILTER_FACTORY // XXX unless/until that key is changed to not be a terminal substring // XXX of this one String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY); if (val instanceof String) { String factName = (String) val; log.debug(mime + " crawl filter: " + factName); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class); mti.setCrawlFilterFactory(fact); } } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY)) { String mime = stripSuffix(key, 
DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY); if (val instanceof String) { String factName = (String) val; log.debug(mime + " filter: " + factName); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class); mti.setHashFilterFactory(fact); } } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT); if (val instanceof String) { String rate = (String) val; log.debug(mime + " fetch rate: " + rate); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); RateLimiter limit = mti.getFetchRateLimiter(); if (limit != null) { limit.setRate(rate); } else { mti.setFetchRateLimiter(new RateLimiter(rate)); } } } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY); String factName = (String) val; log.debug(mime + " link rewriter: " + factName); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); LinkRewriterFactory fact = (LinkRewriterFactory) newAuxClass(factName, LinkRewriterFactory.class); mti.setLinkRewriterFactory(fact); } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP); Map factNameMap = (Map) val; Map factClassMap = new HashMap(); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); for (Iterator it = factNameMap.keySet().iterator(); it.hasNext(); ) { String mdTypes = (String); String factName = (String) factNameMap.get(mdTypes); log.debug(mime + " (" + mdTypes + ") metadata extractor: " + factName); for (String mdType : (List<String>) StringUtil.breakAt(mdTypes, ";")) { setMdTypeFact(factClassMap, mdType, factName); } } mti.setFileMetadataExtractorFactoryMap(factClassMap); } } } private void setMdTypeFact(Map factClassMap, String 
mdType, String factName) { log.debug3("Metadata type: " + mdType + " factory " + factName); FileMetadataExtractorFactory fact = (FileMetadataExtractorFactory) newAuxClass(factName, FileMetadataExtractorFactory.class); factClassMap.put(mdType, fact); } protected void initResultMap() throws PluginException.InvalidDefinition { HttpResultMap hResultMap = new HttpResultMap(); // XXX Currently this only allows a CacheResultHandler class to // initialize the result map. Instead, don't use a CacheResultMap // directly, use either the plugin's CacheResultHandler, if specified, // or a default one that wraps the CacheResultMap String handler_class = null; handler_class = definitionMap.getString(KEY_EXCEPTION_HANDLER, null); if (handler_class != null) { try { resultHandler = (CacheResultHandler) newAuxClass(handler_class, CacheResultHandler.class); resultHandler.init(hResultMap); } catch (Exception ex) { throw new PluginException.InvalidDefinition( mapName + " has invalid Exception handler: " + handler_class, ex); } catch (LinkageError le) { throw new PluginException.InvalidDefinition( mapName + " has invalid Exception handler: " + handler_class, le); } } else { // Expect a list of mappings from either result code or exception // name to CacheException name Collection<String> mappings = definitionMap.getCollection(KEY_EXCEPTION_LIST, null); if (mappings != null) { // add each entry for (String entry : mappings) { if (log.isDebug2()) { log.debug2("initMap(" + entry + ")"); } String first; String ceName; try { List<String> pair = StringUtil.breakAt(entry, '=', 2, true, true); first = pair.get(0); ceName = pair.get(1); } catch (Exception ex) { throw new PluginException.InvalidDefinition( "Invalid syntax: " + entry + "in " + mapName); } Object val; // Value should be either a CacheException or CacheResultHandler // class name. 
PluginFetchEventResponse resp = (PluginFetchEventResponse) newAuxClass(ceName, PluginFetchEventResponse.class, null); if (resp instanceof CacheException) { val = resp.getClass(); } else if (resp instanceof CacheResultHandler) { val = WrapperUtil.wrap((CacheResultHandler) resp, CacheResultHandler.class); } else { throw new PluginException.InvalidDefinition( "Second arg not a " + "CacheException or " + "CacheResultHandler class: " + entry + ", in " + mapName); } try { int code = Integer.parseInt(first); // If parseable as an integer, it's a result code. hResultMap.storeMapEntry(code, val); } catch (NumberFormatException e) { try { Class eClass = Class.forName(first); // If a class name, it should be an exception class if (Exception.class.isAssignableFrom(eClass)) { hResultMap.storeMapEntry(eClass, val); } else { throw new PluginException.InvalidDefinition( "First arg not an " + "Exception class: " + entry + ", in " + mapName); } } catch (Exception ex) { throw new PluginException.InvalidDefinition( "First arg not a " + "number or class: " + entry + ", in " + mapName); } catch (LinkageError le) { throw new PluginException.InvalidDefinition("Can't load " + first, le); } } } } } resultMap = hResultMap; } protected void initFeatureVersions() throws PluginException.InvalidDefinition { if (definitionMap.containsKey(KEY_PLUGIN_FEATURE_VERSION_MAP)) { Map<Plugin.Feature, String> map = new HashMap<Plugin.Feature, String>(); Map<String, String> spec = (Map<String, String>) definitionMap.getMap(KEY_PLUGIN_FEATURE_VERSION_MAP); log.debug2("features: " + spec); for (Map.Entry<String, String> ent : spec.entrySet()) { try { // Prefix version string with feature name to create separate // namespace for each feature String key = ent.getKey(); map.put(Plugin.Feature.valueOf(key), key + "_" + ent.getValue()); } catch (RuntimeException e) { log.warning( getPluginName() + " set unknown feature: " + ent.getKey() + " to version " + ent.getValue(), e); throw new 
PluginException.InvalidDefinition("Unknown feature: " + ent.getKey(), e); } } featureVersion = map; } else { featureVersion = null; } } protected void initAuFeatureMap() { if (definitionMap.containsKey(DefinableArchivalUnit.KEY_AU_FEATURE_URL_MAP)) { Map<String, ?> featMap = definitionMap.getMap(DefinableArchivalUnit.KEY_AU_FEATURE_URL_MAP); for (Map.Entry ent : featMap.entrySet()) { Object val = ent.getValue(); if (val instanceof Map) { ent.setValue(MapUtil.expandAlternativeKeyLists((Map) val)); } } } } /** Create a CrawlWindow if necessary and return it. The CrawlWindow must be thread-safe. */ protected CrawlWindow makeCrawlWindow() { if (crawlWindow != null) { return crawlWindow; } CrawlWindow window = (CrawlWindow) definitionMap.getMapElement(DefinableArchivalUnit.KEY_AU_CRAWL_WINDOW_SER); if (window == null) { String window_class = definitionMap.getString(DefinableArchivalUnit.KEY_AU_CRAWL_WINDOW, null); if (window_class != null) { ConfigurableCrawlWindow ccw = (ConfigurableCrawlWindow) newAuxClass(window_class, ConfigurableCrawlWindow.class); try { window = ccw.makeCrawlWindow(); } catch (PluginException e) { throw new RuntimeException(e); } } } crawlWindow = window; return window; } LoginPageChecker loginChecker; protected LoginPageChecker makeLoginPageChecker() { if (loginChecker == null) { String loginPageCheckerClass = definitionMap.getString(DefinableArchivalUnit.KEY_AU_LOGIN_PAGE_CHECKER, null); if (loginPageCheckerClass != null) { loginChecker = (LoginPageChecker) newAuxClass(loginPageCheckerClass, LoginPageChecker.class); } } return loginChecker; } PermissionCheckerFactory permissionCheckerFact; protected PermissionCheckerFactory getPermissionCheckerFactory() { if (permissionCheckerFact == null) { String permissionCheckerFactoryClass = definitionMap.getString(DefinableArchivalUnit.KEY_AU_PERMISSION_CHECKER_FACTORY, null); if (permissionCheckerFactoryClass != null) { permissionCheckerFact = (PermissionCheckerFactory) 
newAuxClass(permissionCheckerFactoryClass, PermissionCheckerFactory.class); log.debug2("Loaded PermissionCheckerFactory: " + permissionCheckerFact); } } return permissionCheckerFact; } protected UrlNormalizer urlNorm; protected UrlNormalizer getUrlNormalizer() { if (urlNorm == null) { String normalizerClass = definitionMap.getString(DefinableArchivalUnit.KEY_AU_URL_NORMALIZER, null); if (normalizerClass != null) { urlNorm = (UrlNormalizer) newAuxClass(normalizerClass, UrlNormalizer.class); } else { urlNorm = NullUrlNormalizer.INSTANCE; } } return urlNorm; } protected ExploderHelper exploderHelper = null; protected ExploderHelper getExploderHelper() { if (exploderHelper == null) { String helperClass = definitionMap.getString(DefinableArchivalUnit.KEY_AU_EXPLODER_HELPER, null); if (helperClass != null) { exploderHelper = (ExploderHelper) newAuxClass(helperClass, ExploderHelper.class); } } return exploderHelper; } protected CrawlUrlComparatorFactory crawlUrlComparatorFactory = null; protected CrawlUrlComparatorFactory getCrawlUrlComparatorFactory() { if (crawlUrlComparatorFactory == null) { String factClass = definitionMap.getString(DefinablePlugin.KEY_PLUGIN_CRAWL_URL_COMPARATOR_FACTORY, null); if (factClass != null) { crawlUrlComparatorFactory = (CrawlUrlComparatorFactory) newAuxClass(factClass, CrawlUrlComparatorFactory.class); } } return crawlUrlComparatorFactory; } protected Comparator<CrawlUrl> getCrawlUrlComparator(ArchivalUnit au) throws PluginException.LinkageError { CrawlUrlComparatorFactory fact = getCrawlUrlComparatorFactory(); if (fact == null) { return null; } return fact.createCrawlUrlComparator(au); } protected FilterRule constructFilterRule(String contentType) { String mimeType = HeaderUtil.getMimeTypeFromContentType(contentType); Object filter_el = definitionMap.getMapElement(mimeType + DefinableArchivalUnit.SUFFIX_FILTER_RULE); if (filter_el instanceof String) { log.debug("Loading filter " + filter_el); return (FilterRule) newAuxClass((String) 
filter_el, FilterRule.class); } else if (filter_el instanceof List) { if (((List) filter_el).size() > 0) { return new DefinableFilterRule((List) filter_el); } } return super.constructFilterRule(mimeType); } protected ArticleIteratorFactory articleIteratorFact = null; protected ArticleMetadataExtractorFactory articleMetadataFact = null; /** * Returns the plugin's article iterator factory, if any * * @return the ArticleIteratorFactory */ public ArticleIteratorFactory getArticleIteratorFactory() { if (articleIteratorFact == null) { String factClass = definitionMap.getString(KEY_PLUGIN_ARTICLE_ITERATOR_FACTORY, null); if (factClass != null) { articleIteratorFact = (ArticleIteratorFactory) newAuxClass(factClass, ArticleIteratorFactory.class); } } return articleIteratorFact; } /** * Returns the article iterator factory for the content type, if any * * @param contentType the content type * @return the ArticleIteratorFactory */ public ArticleMetadataExtractorFactory getArticleMetadataExtractorFactory(MetadataTarget target) { if (articleMetadataFact == null) { String factClass = definitionMap.getString(KEY_PLUGIN_ARTICLE_METADATA_EXTRACTOR_FACTORY, null); if (factClass != null) { articleMetadataFact = (ArticleMetadataExtractorFactory) newAuxClass(factClass, ArticleMetadataExtractorFactory.class); } } return articleMetadataFact; } public String getPluginId() { String className; if (mapName != null) { className = mapName; } else { // @TODO: eliminate this when we eliminate subclasses className = this.getClass().getName(); } return className; } }
public class BePressArticleIteratorFactory implements ArticleIteratorFactory, ArticleMetadataExtractorFactory { /** * An article iterator factory, for the Section plugin variant * * @author Thib Guicherd-Callin */ public static class Section implements ArticleIteratorFactory { protected static final String ROOT_TEMPLATE = "\"%s%s/%s\", base_url, journal_abbr, journal_section"; protected static final String PATTERN_TEMPLATE = "\"^%s%s/%s/((([^0-9]+/)?(vol)?%d/(iss)?[0-9]+/(art|editorial)?[0-9]+)|(vol%d/(?-i:[A-Z])[0-9]+))$\", base_url, journal_abbr, journal_section, volume, volume"; public Iterator<ArticleFiles> createArticleIterator(ArchivalUnit au, MetadataTarget target) throws PluginException { return new BePressArticleIterator( au, new SubTreeArticleIterator.Spec() .setTarget(target) .setRootTemplate(ROOT_TEMPLATE) .setPatternTemplate(PATTERN_TEMPLATE), true); } } protected static Logger log = Logger.getLogger("BePressArticleIteratorFactory"); protected static final String ROOT_TEMPLATE = "\"%s%s\", base_url, journal_abbr"; // Make the final "art or editorial + number" chunk in the first half optional // because a few AUs have issues with single articles which sit at the issue level // We'll do a content check in the createArticleFiles() // Finally figured out the 2nd half matching group using ?-i: to mandate case sensitivity protected static final String PATTERN_TEMPLATE = "\"^%s%s/((([^0-9]+/)?(vol)?%d/(iss)?[0-9]+(/(art|editorial)?[0-9]+)?)|(vol%d/(?-i:[A-Z])[0-9]+))$\", base_url, journal_abbr, volume, volume"; public Iterator<ArticleFiles> createArticleIterator(ArchivalUnit au, MetadataTarget target) throws PluginException { return new BePressArticleIterator( au, new SubTreeArticleIterator.Spec() .setTarget(target) .setRootTemplate(ROOT_TEMPLATE) .setPatternTemplate(PATTERN_TEMPLATE), false); } protected static class BePressArticleIterator extends SubTreeArticleIterator { protected Pattern pattern; protected Pattern TOC_pattern; public 
BePressArticleIterator( ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) { super(au, spec); String volumeAsString = au.getConfiguration().get(ConfigParamDescr.VOLUME_NUMBER.getKey()); String journalAbbr = au.getConfiguration().get(ConfigParamDescr.JOURNAL_ABBR.getKey()); if (isSection) { journalAbbr = journalAbbr + "/" + au.getConfiguration().get("journal_section"); } // pick up issue level and lower (make (art)?[0-9]+ optional because a few au's have article // at issue level this.pattern = Pattern.compile( String.format( "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$", journalAbbr, volumeAsString, volumeAsString), Pattern.CASE_INSENSITIVE); this.TOC_pattern = Pattern.compile( String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", journalAbbr, volumeAsString), Pattern.CASE_INSENSITIVE); } /* * This is comlicated. MOST AUs have articles that live below and issue level TOC * that is, * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata * (eg Economist Voice V1) * BUT * in some AUs there are issues with only 1 article, in which case * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata * (eg Rhodes Cook V4) * and a few AUs with a mixture * (eg Forum for Health Economics V5) * So to identify ALL articles, we'll also have to capture issue level items and then look * at the html and if it has article metadata in it, count it as an article. * */ @Override protected ArticleFiles createArticleFiles(CachedUrl cu) { String url = cu.getUrl(); Matcher mat = pattern.matcher(url); if (mat.find()) { // we matched, but could this pattern potentially be a toc? 
Matcher tocmat = TOC_pattern.matcher(url); // if we could be a TOC then we must have metadata to be considered an article if (tocmat.find()) { if (hasArticleMetadata(cu)) { return processUrl(cu, mat); } } else { // we're not a potential TOC, so treat this as an article without checking return processUrl(cu, mat); } return null; // this was a TOC, not an article } log.warning("Mismatch between article iterator factory and article iterator: " + url); return null; } protected ArticleFiles processUrl(CachedUrl cu, Matcher mat) { ArticleFiles af = new ArticleFiles(); af.setFullTextCu(cu); af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, cu); // XXX Full text PDF link embedded in page, cannot guess URL return af; } /* * hasArticleMetadata(CachedUrl cu) * Given the CachedUrl for the potential abstract file, using the existing * SimpleHtmlMetaTagMetadataExtractor to parse the file and * retrieve any contained metadata. If a doi or author exists, it's an article * NOT defining the Metadata Extractor here! */ private boolean hasArticleMetadata(CachedUrl cu) { MetadataTarget at = new MetadataTarget(MetadataTarget.PURPOSE_ARTICLE); ArticleMetadata am; SimpleHtmlMetaTagMetadataExtractor ext = new SimpleHtmlMetaTagMetadataExtractor(); if (cu != null && cu.hasContent()) { try { at.setFormat("text/html"); am = ext.extract(at, cu); if ((am.containsRawKey("bepress_citation_journal_title")) || (am.containsRawKey("bepress_citation_abstract_html_url")) || (am.containsRawKey("bepress_citation_doi")) || (am.containsRawKey("bepress_citation_author"))) { return true; } } catch (IOException e) { e.printStackTrace(); } } return false; // no reasonable metadata, probably a toc } } public ArticleMetadataExtractor createArticleMetadataExtractor(MetadataTarget target) throws PluginException { return new BaseArticleMetadataExtractor(null); } }