protected void initFeatureVersions() throws PluginException.InvalidDefinition {
   if (definitionMap.containsKey(KEY_PLUGIN_FEATURE_VERSION_MAP)) {
     Map<Plugin.Feature, String> map = new HashMap<Plugin.Feature, String>();
     Map<String, String> spec =
         (Map<String, String>) definitionMap.getMap(KEY_PLUGIN_FEATURE_VERSION_MAP);
     log.debug2("features: " + spec);
     for (Map.Entry<String, String> ent : spec.entrySet()) {
       try {
         // Prefix version string with feature name to create separate
         // namespace for each feature
         String key = ent.getKey();
         map.put(Plugin.Feature.valueOf(key), key + "_" + ent.getValue());
       } catch (RuntimeException e) {
         log.warning(
             getPluginName()
                 + " set unknown feature: "
                 + ent.getKey()
                 + " to version "
                 + ent.getValue(),
             e);
         throw new PluginException.InvalidDefinition("Unknown feature: " + ent.getKey(), e);
       }
     }
     featureVersion = map;
   } else {
     featureVersion = null;
   }
 }
Esempio n. 2
0
 void setConfig(Configuration config) {
   log.debug("config: " + config);
   proxyHost = config.get(PARAM_PROXY_HOST);
   proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT);
   if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
     String http_proxy = System.getenv("http_proxy");
     if (!StringUtil.isNullString(http_proxy)) {
       try {
         HostPortParser hpp = new HostPortParser(http_proxy);
         proxyHost = hpp.getHost();
         proxyPort = hpp.getPort();
       } catch (HostPortParser.InvalidSpec e) {
         log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e);
       }
     }
   }
   if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
     proxyHost = null;
   } else {
     log.info("Proxying through " + proxyHost + ":" + proxyPort);
   }
   userAgent = config.get(PARAM_USER_AGENT);
   if (StringUtil.isNullString(userAgent)) {
     userAgent = null;
   } else {
     log.debug("Setting User-Agent to " + userAgent);
   }
 }
 public void emitMetadata(ArticleFiles af, ArticleMetadata md) {
   if (log.isDebug3()) log.debug3("emit(" + af + ", " + md + ")");
   if (md != null) {
     log.debug3("add " + md + " to amlist");
     amlst.add(md);
   }
   ;
 }
  public void testFunctionalFromTarHierarchy() throws Exception {
    log.debug3("in testFromTarHierarchy");
    // load the tarballs
    InputStream file_input = null;
    try {
      file_input = getResourceAsStream(realTARFile_A);
      // UrlCacher uc = au.makeUrlCacher(TAR_A_BASE);
      // uc.storeContent(file_input, tarHeader);
      UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

      file_input = getResourceAsStream(realTARFile_B);
      // uc = au.makeUrlCacher(TAR_B_BASE);
      // uc.storeContent(file_input, tarHeader);
      uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } finally {
      IOUtil.safeClose(file_input);
    }

    CachedUrlSet cus = tarAu.getAuCachedUrlSet();
    for (CachedUrl cu : cus.getCuIterable()) {
      log.debug3("AU - cu is: " + cu.getUrl());
      cu.release();
    }

    // We need to start from the level of the ArticleMetadataExtractor
    MyListEmitter emitter = new MyListEmitter();
    ArticleMetadataExtractor amEx =
        new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

    Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.debug3("Metadata test - articlefiles " + af.toString());
      // CachedUrl cu = af.getFullTextCu();
      CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
      log.debug3("metadata cu is " + cu.getUrl());
      // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu);
      amEx.extract(MetadataTarget.Any(), af, emitter);
      List<ArticleMetadata> returnList = emitter.getAmList();

      assertNotNull(returnList);
      log.debug3("size of returnList is " + returnList.size());
      Iterator<ArticleMetadata> mdIt = returnList.iterator();
      ArticleMetadata mdRecord = null;
      while (mdIt.hasNext()) {
        mdRecord = (ArticleMetadata) mdIt.next();
        validateCompleteMetadataRecord(mdRecord);
      }
    }
  }
 public String getDefaultArticleMimeType() {
   String ret = definitionMap.getString(KEY_DEFAULT_ARTICLE_MIME_TYPE, null);
   log.debug3("DefaultArticleMimeType " + ret);
   if (ret == null) {
     ret = super.getDefaultArticleMimeType();
     log.debug3("DefaultArticleMimeType from super " + ret);
   }
   return ret;
 }
  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      String url = cu.getUrl();
      assertNotNull(cu);
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }
  /*
   * When testing no-pdf-check basic XML parsing, you will get partial MD records
   * depending on whether the info comes from dataset.xml or from main.xml
   */
  private void validateDatasetMetadataRecord(ArticleMetadata am) {
    log.debug3("valideDatasetMetadatRecord");
    String doi_val = am.get(MetadataField.FIELD_DOI);
    assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN));

    log.debug3("doi val is: " + doi_val);
    // The dataset doesn't set this value, it'll fail over the main.xml value
    if (doi_val.equals("10.1016/S0140-1111(14)61865-1")) {
      assertEquals(null, am.get(MetadataField.FIELD_DATE));
    } else {
      assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE));
    }
    assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE));
  }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractGoodRisContent() throws Exception {
    String goodContent = createGoodRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME));
    assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE));
    assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE));
    assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE));
    assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN));
    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));

    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI));
    // This shouldn't get set. It will default later to fuill_text_cu
    assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL));
  }
  /*
   * You will have to tell it the DOI and the schema because those normally come from dataset
   */
  private void validateSingleMainMetadataRecord(ArticleMetadata am, String doi_val, String schema) {
    log.debug3("valideSingleMainMetadatRecord");
    if ("simple-article".equals(schema)) {
      assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    } else {
      assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    }

    log.debug3("doi val is: " + doi_val);
    assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
    assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
    assertEquals("Comment", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_dochead));
    assertEquals(doi_val, am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_doi));
    assertEquals("2014", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_copyright));
  }
 private void deleteBlock(CachedUrl cu) throws IOException {
   log.info("deleting " + cu.getUrl());
   CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
   ArchivalUnit au = cu.getArchivalUnit();
   CachedUrlSet cus = au.makeCachedUrlSet(cuss);
   NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
   nm.deleteNode(cus);
 }
 public void testSimpleMainXML() throws Exception {
   log.debug3("testSimpleMainXML");
   String xml_url = TAR_A_BASE + SUBDIR + "01420615/v64sC/S0142061514004608/main.xml";
   List<ArticleMetadata> mdList =
       extractFromContent(xml_url, "text/xml", simpleMain, nocheck_mle, null);
   assertEquals(1, mdList.size());
   validateSingleMainMetadataRecord(mdList.get(0), "10.1016/j.jidx.2014.07.028", "article");
 }
public class HighWireDrupalHtmlMetadataExtractorFactory implements FileMetadataExtractorFactory {

  private static final Logger log =
      Logger.getLogger(HighWireDrupalHtmlMetadataExtractorFactory.class);

  @Override
  public FileMetadataExtractor createFileMetadataExtractor(
      MetadataTarget target, String contentType) throws PluginException {
    return new HighWireDrupalHtmlMetadataExtractor();
  }

  public static class HighWireDrupalHtmlMetadataExtractor implements FileMetadataExtractor {

    // Map HighWire HTML meta tag names to cooked metadata fields
    private static MultiMap tagMap = new MultiValueMap();

    static {
      tagMap.put("DC.Format", MetadataField.FIELD_FORMAT);
      tagMap.put("DC.Language", MetadataField.FIELD_LANGUAGE);
      tagMap.put("citation_publisher", MetadataField.FIELD_PUBLISHER);
      tagMap.put("citation_journal_title", MetadataField.FIELD_PUBLICATION_TITLE);
      tagMap.put("citation_title", MetadataField.FIELD_ARTICLE_TITLE);
      tagMap.put("citation_date", MetadataField.FIELD_DATE);
      tagMap.put("citation_publication_date", MetadataField.FIELD_DATE);
      tagMap.put(
          "citation_authors",
          new MetadataField(MetadataField.FIELD_AUTHOR, MetadataField.splitAt(";")));
      tagMap.put("citation_author", MetadataField.FIELD_AUTHOR);
      tagMap.put("citation_issn", MetadataField.FIELD_ISSN);
      tagMap.put("citation_volume", MetadataField.FIELD_VOLUME);
      tagMap.put("citation_issue", MetadataField.FIELD_ISSUE);
      tagMap.put("citation_firstpage", MetadataField.FIELD_START_PAGE);
      tagMap.put("citation_lastpage", MetadataField.FIELD_END_PAGE);
      tagMap.put("citation_doi", MetadataField.FIELD_DOI);
      tagMap.put("citation_public_url", MetadataField.FIELD_ACCESS_URL);
      // typical field value: "acupmed;30/1/8": extract "acupmed"
      tagMap.put(
          "citation_mjid",
          new MetadataField(
              MetadataField.FIELD_PROPRIETARY_IDENTIFIER, MetadataField.extract("^([^;]+);", 1)));
    }

    @Override
    public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException {
      ArticleMetadata am = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
      am.cook(tagMap);
      String url = am.get(MetadataField.FIELD_ACCESS_URL);
      ArchivalUnit au = cu.getArchivalUnit();
      if (url == null || url.isEmpty() || !au.makeCachedUrl(url).hasContent()) {
        url = cu.getUrl();
      }
      am.replace(
          MetadataField.FIELD_ACCESS_URL,
          HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, url));
      emitter.emitMetadata(cu, am);
    }
  }
}
    @Override
    public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {

      log.debug3("Metadata - cachedurl cu:" + cu.getUrl());

      ArticleMetadata am = super.extract(target, cu);
      am.cook(tagMap);
      return am;
    } // extract
public class EbscoXmlMetadataExtractorFactory extends SourceXmlMetadataExtractorFactory {
  private static final Logger log = Logger.getLogger(EbscoXmlMetadataExtractorFactory.class);
  private static final String CONTENT_DIR = "/Content/";
  private static SourceXmlSchemaHelper EbscoSchemaHelper = null;

  @Override
  public FileMetadataExtractor createFileMetadataExtractor(
      MetadataTarget target, String contentType) throws PluginException {
    return new EbscoSourceXmlMetadataExtractor();
  }

  public class EbscoSourceXmlMetadataExtractor extends SourceXmlMetadataExtractor {

    @Override
    protected SourceXmlSchemaHelper setUpSchema(CachedUrl cu) {
      // Once you have it, just keep returning the same one. It won't change.
      if (EbscoSchemaHelper == null) {
        EbscoSchemaHelper = new EbscoSchemaHelper();
      }
      return EbscoSchemaHelper;
    }

    /*
     * The filename is the ProducID with either ".pdf" or ".epub" suffix.
     * Tje content files live in a parallel directory
     *     <base>/<year>/Content/
     * The XML file represented by the current cu would be something like:
     *   <base>/<year>/DataFeed/EBSCOhostGKB_20160205_DELTA.zip!/EBSCOhostGKB_20160205_DELTA.xml
     * and the pdf would be
     *   <base>/<year>/Content/123456.pdf
     */
    @Override
    protected List<String> getFilenamesAssociatedWithRecord(
        SourceXmlSchemaHelper helper, CachedUrl cu, ArticleMetadata oneAM) {

      // this has been set to be the "ProductID" value
      String filenameValue = oneAM.getRaw(helper.getFilenameXPathKey());

      String cuBase = FilenameUtils.getFullPath(cu.getUrl());
      int datafeed_dir_start = cuBase.lastIndexOf("/DataFeed/");
      // This will leave the "/", so just add back on the sibling_dir and filename
      String contentPath;
      if (datafeed_dir_start < 0) {
        // can't return null because that would make it okay to emit
        // this will fail to emit, as it should - we don't know how to verify the PDF existence
        log.siteWarning("The XML file lives at an unexpected location: " + cuBase);
        contentPath = CONTENT_DIR; // invalid but will force failure
      } else {
        contentPath = cuBase.substring(0, datafeed_dir_start) + CONTENT_DIR;
      }
      List<String> returnList = new ArrayList<String>();
      returnList.add(contentPath + filenameValue + ".pdf");
      returnList.add(contentPath + filenameValue + ".epub");
      return returnList;
    }
  }
}
public class TestElsevierXmlLinkExtractorFactory extends LinkExtractorTestCase {

  private static Logger logger = Logger.getLogger("TestElsevierXmlLinkExtractorFactory");

  String srcUrl = "http://www.example.com/";

  private static final String withLinks =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
          + "<!DOCTYPE dataset SYSTEM \"http://support.sciencedirect.com/xml/sdosftp10.dtd\">\n"
          + "<dataset identifier=\"OXM10160\" customer=\"OHL\""
          + " status=\"Announcement\""
          + " version=\"Network Dataset Announcement/Confirmation v1.0\">"
          + " <date year=\"2007\" month=\"May\" day=\"1\"/>\n"
          + "<file name=\"01407007.tar\" size=\"21780480\""
          + " md5=\"6c7266e0e246bf3e8cf1cd8b659a7a73\"/>\n"
          + "<file name=\"03064530.tar\" size=\"12748800\""
          + " md5=\"df9519d3075e164d22f5dd4988a693c3\"/>\n"
          + "<file name=\"dataset.toc\" size=\"2216587\""
          + " md5=\"cd21741eb91fa0fdfef2fa36485e21a0\"/>\n"
          + "</dataset>\n";

  private static final String withoutLinks =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
          + "<!DOCTYPE dataset SYSTEM \"http://support.sciencedirect.com/xml/sdosftp10.dtd\">\n"
          + "<dataset identifier=\"OXM10160\" customer=\"OHL\""
          + " status=\"Announcement\""
          + " version=\"Network Dataset Announcement/Confirmation v1.0\">"
          + " <date year=\"2007\" month=\"May\" day=\"1\"/>\n"
          + "</dataset>\n";

  private static final String[] links = {
    "01407007.tar", "03064530.tar", "dataset.toc",
  };

  public String getMimeType() {
    return "text/xml";
  }

  public LinkExtractorFactory getFactory() {
    return new ElsevierXmlLinkExtractorFactory();
  }

  public void testFindCorrectEntries() throws Exception {
    Set expected = new HashSet();
    for (String link : links) {
      expected.add(srcUrl + link);
    }
    assertEquals(expected, extractUrls(withLinks));
  }

  public void testFindNoEntries() throws Exception {
    assertEmpty(extractUrls(withoutLinks));
  }
}
/*
 * OJS2HtmlMetadataExtractorFactory extracts metadata from each article.
 */
public class OJS2HtmlMetadataExtractorFactory implements FileMetadataExtractorFactory {

  static Logger log = Logger.getLogger(OJS2HtmlMetadataExtractorFactory.class);

  public FileMetadataExtractor createFileMetadataExtractor(
      MetadataTarget target, String contentType) throws PluginException {

    return new OJS2HtmlMetadataExtractor();
  } // createFileMetadataExtractor

  public static class OJS2HtmlMetadataExtractor extends SimpleHtmlMetaTagMetadataExtractor {

    // Map OJS2-specific HTML meta tag names to cooked metadata fields
    private static MultiMap tagMap = new MultiValueMap();

    static {
      tagMap.put("DC.Format", MetadataField.DC_FIELD_FORMAT);
      tagMap.put("DC.Language", MetadataField.DC_FIELD_LANGUAGE);
      tagMap.put("DC.Title", MetadataField.DC_FIELD_TITLE);
      tagMap.put("DC.Identifier", MetadataField.DC_FIELD_IDENTIFIER);
      tagMap.put("DC.Date", MetadataField.DC_FIELD_DATE);
      tagMap.put("DC.Publisher", MetadataField.DC_FIELD_PUBLISHER);
      tagMap.put("DC.Publisher", MetadataField.FIELD_PUBLISHER);
      tagMap.put("DC.Contributor", MetadataField.DC_FIELD_CONTRIBUTOR);
      tagMap.put("citation_journal_title", MetadataField.FIELD_PUBLICATION_TITLE);
      tagMap.put("citation_title", MetadataField.FIELD_ARTICLE_TITLE);
      tagMap.put("citation_date", MetadataField.FIELD_DATE);
      tagMap.put("citation_author", MetadataField.FIELD_AUTHOR);
      tagMap.put(
          "citation_authors",
          new MetadataField(MetadataField.FIELD_AUTHOR, MetadataField.splitAt(";")));
      tagMap.put("citation_issn", MetadataField.FIELD_ISSN);
      tagMap.put("citation_volume", MetadataField.FIELD_VOLUME);
      tagMap.put("citation_volume", MetadataField.DC_FIELD_CITATION_VOLUME);
      tagMap.put("citation_issue", MetadataField.FIELD_ISSUE);
      tagMap.put("citation_issue", MetadataField.DC_FIELD_CITATION_ISSUE);
      tagMap.put("citation_firstpage", MetadataField.FIELD_START_PAGE);
      tagMap.put("citation_lastpage", MetadataField.FIELD_END_PAGE);
      tagMap.put("citation_doi", MetadataField.FIELD_DOI);
      tagMap.put("citation_public_url", MetadataField.FIELD_ACCESS_URL);
    } // static

    @Override
    public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {

      log.debug3("Metadata - cachedurl cu:" + cu.getUrl());

      ArticleMetadata am = super.extract(target, cu);
      am.cook(tagMap);
      return am;
    } // extract
  } // OJS2HtmlMetadataExtractor
} // OJS2HtmlMetadataExtractorFactory
Esempio n. 17
0
 protected PermissionCheckerFactory getPermissionCheckerFactory() {
   if (permissionCheckerFact == null) {
     String permissionCheckerFactoryClass =
         definitionMap.getString(DefinableArchivalUnit.KEY_AU_PERMISSION_CHECKER_FACTORY, null);
     if (permissionCheckerFactoryClass != null) {
       permissionCheckerFact =
           (PermissionCheckerFactory)
               newAuxClass(permissionCheckerFactoryClass, PermissionCheckerFactory.class);
       log.debug2("Loaded PermissionCheckerFactory: " + permissionCheckerFact);
     }
   }
   return permissionCheckerFact;
 }
  /*
   * When testing a complete extraction out of the tarset, the MD record will be completely filled in
   * and pdf-existence will get established
   */
  private void validateCompleteMetadataRecord(ArticleMetadata am) {
    log.debug3("valideCompleteMetadatRecord");
    String doi_val = am.get(MetadataField.FIELD_DOI);
    /* make sure we can pick up both types of xml article data */
    log.debug3("doi val is: " + doi_val);

    if ("JA 5.2.0 SIMPLE-ARTICLE"
        .equals(am.getRaw(ElsevierDatasetXmlSchemaHelper.dataset_dtd_metadata))) {
      log.debug3("simple-article");
      assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    } else {
      assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    }
    assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN));
    assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE));
    assertEquals(accessUrlMap.get(doi_val), am.get(MetadataField.FIELD_ACCESS_URL));
    assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
    assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
    assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals("Elsevier", am.get(MetadataField.FIELD_PROVIDER));
    assertEquals("Elsevier", am.get(MetadataField.FIELD_PUBLISHER));
    log.debug3(am.ppString(2));
  }
  public void testSimpleDatasetXML() throws Exception {
    log.debug3("testSimpleDatasetXML");
    String file_input = StringUtil.fromInputStream(getResourceAsStream(testDatasetFile));
    String xml_url = TAR_A_BASE + SUBDIR + "dataset.xml";

    List<ArticleMetadata> mdList =
        extractFromContent(xml_url, "text/xml", file_input, nocheck_mle, null);
    assertEquals(6, mdList.size());
    Iterator<ArticleMetadata> mdIt = mdList.iterator();
    ArticleMetadata mdRecord = null;
    while (mdIt.hasNext()) {
      mdRecord = (ArticleMetadata) mdIt.next();
      validateDatasetMetadataRecord(mdRecord);
    }
  }
Esempio n. 20
0
  protected FilterRule constructFilterRule(String contentType) {
    String mimeType = HeaderUtil.getMimeTypeFromContentType(contentType);

    Object filter_el =
        definitionMap.getMapElement(mimeType + DefinableArchivalUnit.SUFFIX_FILTER_RULE);

    if (filter_el instanceof String) {
      log.debug("Loading filter " + filter_el);
      return (FilterRule) newAuxClass((String) filter_el, FilterRule.class);
    } else if (filter_el instanceof List) {
      if (((List) filter_el).size() > 0) {
        return new DefinableFilterRule((List) filter_el);
      }
    }
    return super.constructFilterRule(mimeType);
  }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractAlternateRisContent() throws Exception {
    String goodContent = createAlternateRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }
Esempio n. 22
0
 void checkParamAgreement(String key, PrintfContext context) {
   List<String> printfList = getElementList(key);
   if (printfList == null) {
     return;
   }
   for (String printf : printfList) {
     if (StringUtil.isNullString(printf)) {
       log.warning("Null printf string in " + key);
       continue;
     }
     PrintfUtil.PrintfData p_data = PrintfUtil.stringToPrintf(printf);
     Collection<String> p_args = p_data.getArguments();
     for (String arg : p_args) {
       ConfigParamDescr descr = findAuConfigDescr(arg);
       if (descr == null) {
         throw new PluginException.InvalidDefinition(
             "Not a declared parameter: " + arg + " in " + printf + " in " + getPluginName());
       }
       // ensure range and set params used only in legal context
       switch (context) {
         case Regexp:
         case Display:
           // everything is legal in a regexp or a display string
           break;
         case URL:
           // NUM_RANGE and SET legal because can enumerate.  Can't
           // enumerate RANGE
           switch (descr.getType()) {
             case ConfigParamDescr.TYPE_RANGE:
               throw new PluginException.InvalidDefinition(
                   "Range parameter ("
                       + arg
                       + ") used in illegal context in "
                       + getPluginName()
                       + ": "
                       + key
                       + ": "
                       + printf);
             default:
           }
       }
     }
   }
 }
Esempio n. 23
0
 /** If in testing mode FOO, copy values from FOO_override map, if any, to main map */
 void processOverrides(TypedEntryMap map) {
   String testMode = getTestingMode();
   if (StringUtil.isNullString(testMode)) {
     return;
   }
   Object o = map.getMapElement(testMode + DefinableArchivalUnit.SUFFIX_OVERRIDE);
   if (o == null) {
     return;
   }
   if (o instanceof Map) {
     Map overrideMap = (Map) o;
     for (Map.Entry entry : (Set<Map.Entry>) overrideMap.entrySet()) {
       String key = (String) entry.getKey();
       Object val = entry.getValue();
       log.debug(getDefaultPluginName() + ": Overriding " + key + " with " + val);
       map.setMapElement(key, val);
     }
   }
 }
 /*
  * This is comlicated. MOST AUs have articles that live below and issue level TOC
  * that is,
  * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata
  * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata
  * (eg Economist Voice V1)
  * BUT
  * in some AUs there are issues with only 1 article, in which case
  * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata
  * (eg Rhodes Cook V4)
  * and a few AUs with a mixture
  * (eg Forum for Health Economics V5)
  * So to identify ALL articles, we'll also have to capture issue level items and then look
  * at the html and if it has article metadata in it, count it as an article.
  *
  */
 @Override
 protected ArticleFiles createArticleFiles(CachedUrl cu) {
   String url = cu.getUrl();
   Matcher mat = pattern.matcher(url);
   if (mat.find()) {
     // we matched, but could this pattern potentially be a toc?
     Matcher tocmat = TOC_pattern.matcher(url);
     // if we could be a TOC then we must have metadata to be considered an article
     if (tocmat.find()) {
       if (hasArticleMetadata(cu)) {
         return processUrl(cu, mat);
       }
     } else {
       // we're not a potential TOC, so treat this as an article without checking
       return processUrl(cu, mat);
     }
     return null; // this was a TOC, not an article
   }
   log.warning("Mismatch between article iterator factory and article iterator: " + url);
   return null;
 }
/**
 * One of the articles used to get the html source for this plugin is:
 * http://iopscience.iop.org/2043-6262/1/4/043003
 */
public class IOPScienceHtmlMetadataExtractorFactory implements FileMetadataExtractorFactory {
  static Logger log = Logger.getLogger(IOPScienceHtmlMetadataExtractorFactory.class);

  public static MetadataField IOP_ACCESS_URL =
      new MetadataField(
          MetadataField.KEY_ACCESS_URL,
          Cardinality.Single,
          new Validator() {
            public String validate(ArticleMetadata am, MetadataField field, String val)
                throws MetadataException.ValidationException {
              // trim trailing '/' from urls like
              // http://iopscience.iop.org/0264-9381/29/9/097002/article/
              if ((val != null) && !val.isEmpty() && (val.endsWith("/"))) {
                val = val.substring(0, val.length() - 1);
              }
              return val;
            }
          });

  public FileMetadataExtractor createFileMetadataExtractor(
      MetadataTarget target, String contentType) throws PluginException {
    return new IOPScienceHtmlMetadataExtractor();
  }

  public static class IOPScienceHtmlMetadataExtractor extends SimpleHtmlMetaTagMetadataExtractor {

    private static MultiMap tagMap = new MultiValueMap();

    static {
      // <meta name="citation_doi" content="10.1088/2043-6262/1/4/043003" />
      tagMap.put("citation_doi", MetadataField.FIELD_DOI);
      //  <meta name="citation_publication_date" content="2011-01-25" />
      tagMap.put("citation_publication_date", MetadataField.FIELD_DATE);
      // <meta name="citation_title" content="Polymer materials with spatially..." />
      tagMap.put("citation_title", MetadataField.FIELD_ARTICLE_TITLE);
      // <meta name="citation_issn" content="2043-6262"/>
      tagMap.put("citation_issn", MetadataField.FIELD_ISSN);
      // <meta name="citation_volume" content="1" />
      tagMap.put("citation_volume", MetadataField.FIELD_VOLUME);
      // <meta name="citation_issue" content="4"/>
      tagMap.put("citation_issue", MetadataField.FIELD_ISSUE);
      // <meta name="citation_firstpage" content="043003"/>
      tagMap.put("citation_firstpage", MetadataField.FIELD_START_PAGE);
      //  <meta name="citation_author" content="Daisuke Fujiki"/>
      tagMap.put("citation_author", MetadataField.FIELD_AUTHOR);
      // <meta name="citation_journal_title" content="Advances in Natural Sciences:
      // Nanoscience and Nanotechnology" />
      tagMap.put("citation_journal_title", MetadataField.FIELD_PUBLICATION_TITLE);
      // <meta name="citation_publisher" content="IOP Publishing" />
      tagMap.put("citation_publisher", MetadataField.FIELD_PUBLISHER);
      // XXX this map is so that the metadata url is not always the access_url
      // <meta name="citation_fulltext_html_url" content="http://iopscience.iop.org/...
      tagMap.put("citation_fulltext_html_url", IOP_ACCESS_URL);
    }

    @Override
    public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
      ArticleMetadata am = super.extract(target, cu);
      am.cook(tagMap);
      String url = am.get(MetadataField.FIELD_ACCESS_URL);
      if (url != null && !url.isEmpty()) {
        CachedUrl val = cu.getArchivalUnit().makeCachedUrl(url);
        if (!val.hasContent()) {
          am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
        }
      } else {
        am.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
      }
      return am;
    }
  }
}
public class TestBaseAtyponMetadataExtractor extends LockssTestCase {

  static Logger log = Logger.getLogger("TestBaseAtyponMetadataExtractor");

  private MockLockssDaemon theDaemon;
  private ArchivalUnit bau;
  private ArchivalUnit bau1;
  private static String PLUGIN_NAME = "org.lockss.plugin.atypon.BaseAtyponPlugin";
  static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey();
  private static String BASE_URL = "http://www.baseatypon.org/";

  // the metadata that should be extracted
  static String goodDate = "2012-07-05";
  static String[] goodAuthors = new String[] {"D. Author", "S. Author2"};
  static String goodFormat = "text/HTML";
  static String goodTitle = "Title of Article";
  static String goodType = "research-article";
  static String goodPublisher = "Base Atypon";
  static String goodPublishingPlatform = "Atypon";
  static String goodDOI = "10.1137/10081839X";
  static String goodJID = "xxx";

  static String goodJournal = "Journal Name";
  static String goodStartPage = "22";
  static String goodEndPage = "44";
  static String goodVolume = "13";
  static String goodIssue = "3";
  static String goodIssn = "1540-3459";
  static String doiURL = "http://dx.doi.org/" + goodDOI;
  private static final String ABS_URL = BASE_URL + "doi/abs/10.1175/2010WCAS1063.1";
  private static final String RIS_URL =
      BASE_URL + "action/downloadCitation?doi=" + goodDOI + "&format=ris&include=cit";

  public void setUp() throws Exception {
    super.setUp();
    setUpDiskSpace(); // you need this to have startService work properly...

    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    // in this directory this is file "test_baseatypon.tdb" but it becomes xml
    ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml"));
    Tdb tdb = ConfigManager.getCurrentConfig().getTdb();

    TdbAu tdbau1 = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume).get(0);
    assertNotNull("Didn't find named TdbAu", tdbau1);
    bau1 = PluginTestUtil.createAndStartAu(tdbau1);
    assertNotNull(bau1);
    TypedEntryMap auConfig = bau1.getProperties();
    assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY));
  }

  public void tearDown() throws Exception {
    theDaemon.stopDaemon();
    super.tearDown();
  }

  /*
   * Test the functionality of the MetadataUtilities
   *
   */
  public void testNormalizeTitleValue() throws Exception {

    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("The title goes here"),
        BaseAtyponMetadataUtil.normalizeTitle("Title Goes Here"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Title    with     random spaces"),
        BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Normalize -- hyphen"),
        BaseAtyponMetadataUtil.normalizeTitle("normalize \u2013\u2013 hyphen"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Title and title"),
        BaseAtyponMetadataUtil.normalizeTitle("Title & title"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("   leading spaces"),
        BaseAtyponMetadataUtil.normalizeTitle("leading spaces"));

    // now checking the fall-back last ditch attempt
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("leading spaces:colon?"),
        BaseAtyponMetadataUtil.generateRawTitle("leadingspacescolon"));
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("relapsing-remitting"),
        BaseAtyponMetadataUtil.generateRawTitle("relapsing?remitting"));
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("foo\"blah"),
        BaseAtyponMetadataUtil.generateRawTitle("foo-blah"));
  }

  /**
   * Configuration method.
   *
   * @return
   */

  /*
  "<meta name="dc.Title" content="Title of Article"></meta>
  "<meta name="dc.Creator" content="D. Author"></meta>
  "<meta name="dc.Creator" content="S. Author2"></meta>
  "<meta name="dc.Subject" content="weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57"></meta>
  "<meta name="dc.Description" content="Long test summary of article, probably taken directly from the adstract..."></meta>
  "<meta name="dc.Publisher" content="Name of Publisher"></meta>
  "<meta name="dc.Date" scheme="WTN8601" content="2012-07-05"></meta>
  "<meta name="dc.Type" content="research-article"></meta>
  "<meta name="dc.Format" content="text/HTML"></meta>
  "<meta name="dc.Identifier" scheme="publisher" content="81839"></meta>
  "<meta name="dc.Identifier" scheme="doi" content="10.1137/10081839X"></meta>
  "<meta name="dc.Source" content="http://dx.doi.org/10.1137/10081839X"></meta>
  "<meta name="dc.Language" content="en"></meta>
  "<meta name="dc.Coverage" content="world"></meta>
  "<meta name="keywords" content="weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57"></meta>
  */

  // a chunk of html source code from the publisher's site from where the
  // metadata should be extracted

  String goodHtmlContent =
      "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>"
          + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>"
          + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>"
          + "<meta name=\"dc.Publisher\" content=\"Base Atypon\"></meta>"
          + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>"
          + "<meta name=\"dc.Type\" content=\"research-article\"></meta>"
          + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"doi\" content=\"10.1137/10081839X\"></meta>"
          + "<meta name=\"dc.Source\" content=\"http://dx.doi.org/10.1137/10081839X\"></meta>"
          + "<meta name=\"dc.Language\" content=\"en\"></meta>"
          + "<meta name=\"dc.Coverage\" content=\"world\"></meta>"
          + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>";

  public void testExtractGoodHtmlContent() throws Exception {

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContent, true);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodTitle, md.get(MetadataField.DC_FIELD_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodFormat, md.get(MetadataField.DC_FIELD_FORMAT));
    assertEquals(goodType, md.get(MetadataField.DC_FIELD_TYPE));
    assertEquals(Arrays.asList(goodAuthors), md.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(goodAuthors[0], md.get(MetadataField.DC_FIELD_CREATOR));
  }

  String goodHtmlContentNoDOIorPublisher =
      "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>"
          + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>"
          + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>"
          + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>"
          + "<meta name=\"dc.Type\" content=\"research-article\"></meta>"
          + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>"
          + "<meta name=\"dc.Language\" content=\"en\"></meta>"
          + "<meta name=\"dc.Coverage\" content=\"world\"></meta>"
          + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>";

  public void testDOIExtraction() throws Exception {

    List<ArticleMetadata> mdlist =
        setupContentForAU(bau1, ABS_URL, goodHtmlContentNoDOIorPublisher, true);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);
    // gets pulled from the URL if not set in the metadata
    assertEquals("10.1175/2010WCAS1063.1", md.get(MetadataField.FIELD_DOI));
    // gets set manually if not in the metadata
    // first it would try the TDB
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }

  private String createGoodRisContent() {
    StringBuilder sb = new StringBuilder();
    sb.append("TY  - JOUR");
    for (String auth : goodAuthors) {
      sb.append("\nA1  - ");
      sb.append(auth);
    }
    sb.append("\nDA  - ");
    sb.append(goodDate);
    sb.append("\nJF  - ");
    sb.append(goodJournal);
    sb.append("\nSP  - ");
    sb.append(goodStartPage);
    sb.append("\nEP  - ");
    sb.append(goodEndPage);
    sb.append("\nVL  - ");
    sb.append(goodVolume);
    sb.append("\nIS  - ");
    sb.append(goodIssue);
    sb.append("\nSN  - ");
    sb.append(goodIssn);
    sb.append("\nT1  - ");
    sb.append(goodTitle);
    sb.append("\nPB  - ");
    sb.append(goodPublisher);
    sb.append("\nDO  - ");
    sb.append(goodDOI);
    sb.append("\nUR  - ");
    sb.append(doiURL);
    sb.append("\nER  -");
    return sb.toString();
  }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractGoodRisContent() throws Exception {
    String goodContent = createGoodRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME));
    assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE));
    assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE));
    assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE));
    assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN));
    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));

    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI));
    // This shouldn't get set. It will default later to fuill_text_cu
    assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL));
  }

  /* the extractor checks if data is missing it uses possible alternate RIS tags */
  private String createAlternateRisContent() {
    StringBuilder sb = new StringBuilder();
    sb.append("TY  - JOUR");
    for (String auth : goodAuthors) {
      sb.append("\nAU  - ");
      sb.append(auth);
    }
    sb.append("\nY1  - ");
    sb.append(goodDate);
    sb.append("\nT2  - ");
    sb.append(goodJournal);
    sb.append("\nT1  - ");
    sb.append(goodTitle);
    sb.append("\nPB  - ");
    sb.append(goodPublisher);
    sb.append("\nER  -");
    return sb.toString();
  }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractAlternateRisContent() throws Exception {
    String goodContent = createAlternateRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }

  /* private support methods */
  private List<ArticleMetadata> setupContentForAU(
      ArchivalUnit au, String url, String content, boolean isHtmlExtractor)
      throws IOException, PluginException {
    FileMetadataExtractor me;

    InputStream input = null;
    CIProperties props = null;
    if (isHtmlExtractor) {
      input = IOUtils.toInputStream(content, "utf-8");
      props = getContentHtmlProperties();
      me =
          new BaseAtyponHtmlMetadataExtractorFactory()
              .createFileMetadataExtractor(MetadataTarget.Any(), "text/html");
    } else {
      input = IOUtils.toInputStream(content, "utf-8");
      props = getContentRisProperties();
      me =
          new BaseAtyponRisMetadataExtractorFactory()
              .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain");
    }
    UrlData ud = new UrlData(input, props, url);
    UrlCacher uc = au.makeUrlCacher(ud);
    uc.storeContent();
    CachedUrl cu = uc.getCachedUrl();
    FileMetadataListExtractor mle = new FileMetadataListExtractor(me);
    return mle.extract(MetadataTarget.Any(), cu);
  }

  private CIProperties getContentHtmlProperties() {
    CIProperties cProps = new CIProperties();
    // the CU checks the X-Lockss-content-type, not the content-type to determine encoding
    cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/html; charset=UTF-8");
    cProps.put("Content-type", "text/html; charset=UTF-8");
    return cProps;
  }

  private CIProperties getContentRisProperties() {
    CIProperties cProps = new CIProperties();
    // the CU checks the X-Lockss-content-type, not the content-type to determine encoding
    cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/plain; charset=UTF-8");
    cProps.put("Content-type", "text/plain; charset=UTF-8");
    return cProps;
  }
}
Esempio n. 27
0
public class CrawlRuleTester extends Thread {
  protected static Logger log = Logger.getLogger(CrawlRuleTester.class);

  /** Proxy host */
  public static final String PARAM_PROXY_HOST = Configuration.PREFIX + "crawltest.proxy.host";

  /** Proxy port */
  public static final String PARAM_PROXY_PORT = Configuration.PREFIX + "crawltest.proxy.port";

  public static final int DEFAULT_PROXY_PORT = -1;

  /** User-Agent */
  public static final String PARAM_USER_AGENT = Configuration.PREFIX + "crawltest.userAgent";

  /* Message Types */
  public static final int ERROR_MESSAGE = 0;
  public static final int WARNING_MESSAGE = 1;
  public static final int PLAIN_MESSAGE = 2;
  public static final int URL_SUMMARY_MESSAGE = 3;
  public static final int TEST_SUMMARY_MESSAGE = 4;

  private String m_baseUrl;
  private int m_crawlDepth;
  private long m_crawlDelay;
  private int m_curDepth;
  private ArchivalUnit m_au;
  private String m_outputFile = null;
  private BufferedWriter m_outWriter = null;
  private Deadline fetchDeadline = Deadline.in(0);
  private boolean useLocalWriter = true;
  private MessageHandler m_msgHandler;
  private LockssUrlConnectionPool connectionPool = new LockssUrlConnectionPool();
  private String proxyHost;
  private String userAgent;
  private int proxyPort;

  // our storage for extracted urls
  private TreeSet m_extracted = new TreeSet();
  private TreeSet m_incls = new TreeSet();
  private TreeSet m_excls = new TreeSet();
  private TreeSet m_reported = new TreeSet();

  public CrawlRuleTester(int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    super("crawlrule tester");
    m_crawlDepth = crawlDepth;
    long minFetchDelay =
        CurrentConfig.getLongParam(
            BaseArchivalUnit.PARAM_MIN_FETCH_DELAY, BaseArchivalUnit.DEFAULT_MIN_FETCH_DELAY);
    m_crawlDelay = Math.max(crawlDelay, minFetchDelay);
    m_baseUrl = baseUrl;
    m_au = au;
  }
  /**
   * RuleTest
   *
   * @param outFile String
   * @param crawlDepth int
   * @param crawlDelay long
   * @param baseUrl String
   * @param crawlSpec CrawlSpec
   */
  public CrawlRuleTester(
      String outFile, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {

    this(crawlDepth, crawlDelay, baseUrl, au);
    m_outputFile = outFile;
  }

  /**
   * RuleTest
   *
   * @param outWriter BufferedWriter
   * @param crawlDepth int
   * @param crawlDelay long
   * @param baseUrl String
   * @param crawlSpec CrawlSpec
   */
  public CrawlRuleTester(
      BufferedWriter outWriter, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_outWriter = outWriter;
  }

  /**
   * RuleTest
   *
   * @param msgHandler MessageHandler to take all output
   * @param crawlDepth the crawl depth to use
   * @param crawlDelay the type to wait between fetches
   * @param baseUrl the url to start from
   * @param crawlSpec a CrawlSpec to use for url checking.
   */
  public CrawlRuleTester(
      MessageHandler msgHandler, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_msgHandler = msgHandler;
  }

  public void run() {
    try {
      setConfig(ConfigManager.getCurrentConfig());
      if (m_outWriter == null && m_msgHandler == null) {
        useLocalWriter = true;
      } else {
        useLocalWriter = false;
      }
      if (useLocalWriter) {
        openOutputFile();
      }
      checkRules();
      if (useLocalWriter) {
        closeOutputFile();
      }
    } finally {
      if (m_msgHandler != null) {
        m_msgHandler.close();
      }
    }
  }

  void setConfig(Configuration config) {
    log.debug("config: " + config);
    proxyHost = config.get(PARAM_PROXY_HOST);
    proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT);
    if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
      String http_proxy = System.getenv("http_proxy");
      if (!StringUtil.isNullString(http_proxy)) {
        try {
          HostPortParser hpp = new HostPortParser(http_proxy);
          proxyHost = hpp.getHost();
          proxyPort = hpp.getPort();
        } catch (HostPortParser.InvalidSpec e) {
          log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e);
        }
      }
    }
    if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
      proxyHost = null;
    } else {
      log.info("Proxying through " + proxyHost + ":" + proxyPort);
    }
    userAgent = config.get(PARAM_USER_AGENT);
    if (StringUtil.isNullString(userAgent)) {
      userAgent = null;
    } else {
      log.debug("Setting User-Agent to " + userAgent);
    }
  }

  private void openOutputFile() {
    if (m_outputFile != null) {
      try {
        m_outWriter = new BufferedWriter(new FileWriter(m_outputFile, false));
        return;
      } catch (Exception ex) {
        System.err.println("Error opening output file, writing to stdout: " + ex);
      }
    }
    m_outWriter = new BufferedWriter(new OutputStreamWriter(System.out));
  }

  private void closeOutputFile() {
    try {
      if (m_outWriter != null) {
        m_outWriter.close();
      }
    } catch (IOException ex) {
      System.err.println("Error closing output file.");
    }
  }

  int[] depth_incl;
  int[] depth_fetched;
  int[] depth_parsed;

  private void checkRules() {
    outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE);
    outputMessage(
        "crawl depth: " + m_crawlDepth + "     crawl delay: " + m_crawlDelay + " ms.",
        PLAIN_MESSAGE);

    TreeSet crawlList = new TreeSet();
    TreeSet fetched = new TreeSet();
    // inialize with the baseUrl
    crawlList.add(m_baseUrl);
    depth_incl = new int[m_crawlDepth];
    depth_fetched = new int[m_crawlDepth];
    depth_parsed = new int[m_crawlDepth];
    long start_time = TimeBase.nowMs();
    for (int depth = 1; depth <= m_crawlDepth; depth++) {
      if (isInterrupted()) {
        return;
      }
      m_curDepth = depth;
      if (crawlList.isEmpty() && depth <= m_crawlDepth) {
        outputMessage("\nNothing left to crawl, exiting after depth " + (depth - 1), PLAIN_MESSAGE);
        break;
      }
      String[] urls = (String[]) crawlList.toArray(new String[0]);
      crawlList.clear();
      outputMessage("\nDepth " + depth, PLAIN_MESSAGE);
      for (int ix = 0; ix < urls.length; ix++) {
        if (isInterrupted()) {
          return;
        }
        pauseBeforeFetch();
        String urlstr = urls[ix];

        m_incls.clear();
        m_excls.clear();

        // crawl the page
        buildUrlSets(urlstr);
        fetched.add(urlstr);
        // output incl/excl results,
        // add the new_incls to the crawlList for next crawl depth loop
        crawlList.addAll(outputUrlResults(urlstr, m_incls, m_excls));
      }
    }
    long elapsed_time = TimeBase.nowMs() - start_time;
    outputSummary(m_baseUrl, fetched, crawlList, elapsed_time);
  }

  private void buildUrlSets(String url) {

    try {
      outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE);
      URL srcUrl = new URL(url);
      //       URLConnection conn = srcUrl.openConnection();
      //       String type = conn.getContentType();
      //       type = conn.getHeaderField("content-type");
      //       InputStream istr = conn.getInputStream();

      LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool);
      if (proxyHost != null) {
        conn.setProxy(proxyHost, proxyPort);
      }
      if (userAgent != null) {
        conn.setRequestProperty("user-agent", userAgent);
      }
      try {
        conn.execute();
        int resp = conn.getResponseCode();
        if (resp != 200) {
          outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE);
          return;
        }
        depth_fetched[m_curDepth - 1]++;
        String cookies = conn.getResponseHeaderValue("Set-Cookie");
        if (cookies != null) {
          outputMessage("Cookies: " + cookies, PLAIN_MESSAGE);
        }
        String type = conn.getResponseContentType();
        if (type == null || !type.toLowerCase().startsWith("text/html")) {
          outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE);
          return;
        }
        outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE);
        InputStream istr = conn.getResponseInputStream();
        InputStreamReader reader = new InputStreamReader(istr);
        //       MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader);
        GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor();
        extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback());
        istr.close();
        depth_parsed[m_curDepth - 1]++;
      } finally {
        conn.release();
      }
    } catch (MalformedURLException murle) {
      murle.printStackTrace();
      outputErrResults(url, "Malformed URL:" + murle.getMessage());
    } catch (IOException ex) {
      ex.printStackTrace();
      outputErrResults(url, "IOException: " + ex.getMessage());
    }
  }

  private void pauseBeforeFetch() {
    if (!fetchDeadline.expired()) {
      try {
        fetchDeadline.sleep();
      } catch (InterruptedException ie) {
        // no action
      }
    }
    fetchDeadline.expireIn(m_crawlDelay);
  }

  private void outputMessage(String msg, int msgType) {
    if (isInterrupted()) {
      return;
    }

    if (m_msgHandler != null) {
      m_msgHandler.outputMessage(msg + "\n", msgType);
    } else {
      try {
        m_outWriter.write(msg);
        m_outWriter.newLine();
      } catch (Exception ex) {
        System.err.println(msg);
      }
    }
  }

  private void outputErrResults(String url, String errMsg) {
    outputMessage("Error: " + errMsg + " occured while processing " + url, ERROR_MESSAGE);
  }

  private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) {
    Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported));
    Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported));
    if (!m_inclset.isEmpty()) {
      outputMessage(
          "\nIncluded Urls: ("
              + new_incls.size()
              + " new, "
              + (m_inclset.size() - new_incls.size())
              + " old)",
          URL_SUMMARY_MESSAGE);
      depth_incl[m_curDepth - 1] += new_incls.size();
    }
    for (Iterator it = new_incls.iterator(); it.hasNext(); ) {
      outputMessage(it.next().toString(), PLAIN_MESSAGE);
    }

    if (!m_exclset.isEmpty()) {
      outputMessage(
          "\nExcluded Urls: ("
              + new_excls.size()
              + " new, "
              + (m_exclset.size() - new_excls.size())
              + " old)",
          URL_SUMMARY_MESSAGE);
    }
    for (Iterator it = new_excls.iterator(); it.hasNext(); ) {
      outputMessage(it.next().toString(), PLAIN_MESSAGE);
    }
    m_reported.addAll(new_incls);
    m_reported.addAll(new_excls);

    if (m_outWriter != null) {
      try {
        m_outWriter.flush();
      } catch (IOException ex) {
      }
    }
    return new_incls;
  }

  private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) {
    int fetchCount = fetched.size();
    outputMessage(
        "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth,
        TEST_SUMMARY_MESSAGE);
    outputMessage(
        "\nUrls fetched: " + fetchCount + "    Urls extracted: " + m_extracted.size(),
        PLAIN_MESSAGE);

    outputMessage("\nDepth  Fetched  Parsed  New URLs", PLAIN_MESSAGE);
    for (int depth = 1; depth <= m_crawlDepth; depth++) {
      PrintfFormat pf = new PrintfFormat("%5d  %7d  %6d  %8d");
      Integer[] args =
          new Integer[] {
            new Integer(depth),
            new Integer(depth_fetched[depth - 1]),
            new Integer(depth_parsed[depth - 1]),
            new Integer(depth_incl[depth - 1]),
          };
      String s = pf.sprintf(args);
      outputMessage(s, PLAIN_MESSAGE);
    }

    outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE);
    if (false) {
      for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) {
        String url = (String) iter.next();
        outputMessage(url, PLAIN_MESSAGE);
      }
    }
    long secs = elapsedTime / Constants.SECOND;
    long fetchRate = 0;
    if (secs > 0) {
      fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime;
    }
    outputMessage(
        "\nElapsed Time: " + secs + " secs." + "    Fetch Rate: " + fetchRate + " p/m",
        PLAIN_MESSAGE);
  }

  public interface MessageHandler {
    void outputMessage(String message, int messageType);

    void close();
  }

  private class MyLinkExtractorCallback implements LinkExtractor.Callback {

    MyLinkExtractorCallback() {}

    public void foundLink(String url) {

      m_extracted.add(url);
      try {
        String normUrl = UrlUtil.normalizeUrl(url);
        if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) {
          m_incls.add(normUrl);
        } else {
          m_excls.add(normUrl);
        }
      } catch (MalformedURLException e) {
        m_excls.add(url);
      }
    }
  }

  class MyMockCachedUrl implements CachedUrl {
    private String url;
    private boolean doesExist = false;
    private Reader reader = null;

    public MyMockCachedUrl(String url, Reader reader) {
      this.url = url;

      this.reader = reader;
    }

    public ArchivalUnit getArchivalUnit() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getUrl() {
      return url;
    }

    public CachedUrl getCuVersion(int version) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public CachedUrl[] getCuVersions() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public CachedUrl[] getCuVersions(int maxVersions) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public int getVersion() {
      return 1;
    }

    public Reader openForReading() {
      return reader;
    }

    public LinkRewriterFactory getLinkRewriterFactory() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getEncoding() {
      return Constants.DEFAULT_ENCODING;
    }

    /**
     * getUnfilteredInputStream
     *
     * @return InputStream
     */
    public InputStream getUnfilteredInputStream() {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * getUnfilteredInputStream
     *
     * @return InputStream
     */
    public InputStream getUnfilteredInputStream(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public InputStream getUncompressedInputStream() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public InputStream getUncompressedInputStream(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * openForHashing
     *
     * @return InputStream
     */
    public InputStream openForHashing() {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * openForHashing
     *
     * @param hasher HashedInputStream.Hasher for unfiltered content
     * @return InputStream
     */
    public InputStream openForHashing(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * getContentSize
     *
     * @return long
     */
    public long getContentSize() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getContentType() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public void setOption(String option, String val) {}

    public boolean hasContent() {
      return doesExist;
    }

    public boolean isLeaf() {
      return true;
    }

    public int getType() {
      return CachedUrlSetNode.TYPE_CACHED_URL;
    }

    public CIProperties getProperties() {
      return null;
    }

    public void addProperty(String key, String value) {}

    public void release() {}

    public String toString() {
      StringBuffer sb = new StringBuffer(url.length() + 17);
      sb.append("[MyMockCachedUrl: ");
      sb.append(url);
      sb.append("]");
      return sb.toString();
    }

    @Override
    public FileMetadataExtractor getFileMetadataExtractor(MetadataTarget target) {
      return null;
    }

    public CachedUrl getArchiveMemberCu(ArchiveMemberSpec ams) {
      throw new UnsupportedOperationException("Not implemented");
    }

    @Override
    public boolean isArchiveMember() {
      return false;
    }
  }
}
public class TestNatureArticleIteratorFactory extends LockssTestCase {
  static Logger log = Logger.getLogger("TestNatureArticleIteratorFactory");

  private SimulatedArchivalUnit sau; // Simulated AU to generate content
  private ArchivalUnit nau; // Nature AU
  private MockLockssDaemon theDaemon;
  private static final int DEFAULT_FILESIZE = 3000;
  private static int fileSize = DEFAULT_FILESIZE;

  private static String PLUGIN_NAME = "org.lockss.plugin.nature.ClockssNaturePublishingGroupPlugin";

  private static String BASE_URL = "http://www.nature.com/";

  public void setUp() throws Exception {
    super.setUp();
    String tempDirPath = getTempDir().getAbsolutePath() + File.separator;
    ConfigurationUtil.setFromArgs(LockssRepositoryImpl.PARAM_CACHE_LOCATION, tempDirPath);
    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath));
    nau = PluginTestUtil.createAndStartAu(PLUGIN_NAME, natureAuConfig());
  }

  public void tearDown() throws Exception {
    sau.deleteContentTree();
    theDaemon.stopDaemon();
    super.tearDown();
  }

  Configuration simAuConfig(String rootPath) {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("root", rootPath);
    conf.put("base_url", BASE_URL);
    conf.put("depth", "1");
    conf.put("branch", "4");
    conf.put("numFiles", "7");
    conf.put(
        "fileTypes",
        "" + (SimulatedContentGenerator.FILE_TYPE_HTML | SimulatedContentGenerator.FILE_TYPE_PDF));
    conf.put("binFileSize", "" + fileSize);
    return conf;
  }

  Configuration natureAuConfig() {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("base_url", BASE_URL);
    conf.put("journal_id", "aps");
    conf.put("volume_name", "123");
    conf.put("year", "2008");
    return conf;
  }

  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      String url = cu.getUrl();
      assertNotNull(cu);
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }

  //  public void testArticleCountAndDefaultType() throws Exception {
  //    testArticleCountAndType("text/html", true, 24);
  //  }
  //
  //  public void testArticleCountAndPdf() throws Exception {
  //    testArticleCountAndType("application/pdf", false, 0);
  //  }

  private void deleteBlock(CachedUrl cu) throws IOException {
    log.info("deleting " + cu.getUrl());
    CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
    ArchivalUnit au = cu.getArchivalUnit();
    CachedUrlSet cus = au.makeCachedUrlSet(cuss);
    NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
    nm.deleteNode(cus);
  }
}
Esempio n. 29
0
/**
 * DefinablePlugin: a plugin which uses the data stored in an ExternalizableMap to configure itself.
 *
 * @author Claire Griffin
 * @version 1.0
 */
public class DefinablePlugin extends BasePlugin {
  // configuration map keys
  public static final String KEY_PLUGIN_IDENTIFIER = "plugin_identifier";
  public static final String KEY_PLUGIN_NAME = "plugin_name";
  public static final String KEY_PLUGIN_VERSION = "plugin_version";
  public static final String KEY_PLUGIN_FEATURE_VERSION_MAP = "plugin_feature_version_map";
  public static final String KEY_REQUIRED_DAEMON_VERSION = "required_daemon_version";
  public static final String KEY_PUBLISHING_PLATFORM = "plugin_publishing_platform";
  public static final String KEY_PLUGIN_CONFIG_PROPS = "plugin_config_props";
  public static final String KEY_EXCEPTION_HANDLER = "plugin_cache_result_handler";
  public static final String KEY_EXCEPTION_LIST = "plugin_cache_result_list";
  public static final String KEY_PLUGIN_NOTES = "plugin_notes";
  public static final String KEY_CRAWL_TYPE = "plugin_crawl_type";
  public static final String KEY_FOLLOW_LINKS = "plugin_follow_link";
  /** Message to be displayed when user configures an AU with this plugin */
  public static final String KEY_PLUGIN_AU_CONFIG_USER_MSG = "plugin_au_config_user_msg";

  public static final String KEY_PER_HOST_PERMISSION_PATH = "plugin_per_host_permission_path";
  public static final String KEY_PLUGIN_PARENT = "plugin_parent";
  public static final String KEY_PLUGIN_PARENT_VERSION = "plugin_parent_version";
  public static final String KEY_PLUGIN_CRAWL_URL_COMPARATOR_FACTORY =
      "plugin_crawl_url_comparator_factory";
  public static final String KEY_PLUGIN_FETCH_RATE_LIMITER_SOURCE =
      "plugin_fetch_rate_limiter_source";

  public static final String KEY_PLUGIN_ARTICLE_ITERATOR_FACTORY =
      "plugin_article_iterator_factory";

  public static final String KEY_PLUGIN_ARTICLE_METADATA_EXTRACTOR_FACTORY =
      "plugin_article_metadata_extractor_factory";

  public static final String KEY_DEFAULT_ARTICLE_MIME_TYPE = "plugin_default_article_mime_type";

  public static final String KEY_ARTICLE_ITERATOR_ROOT = "plugin_article_iterator_root";

  public static final String KEY_ARTICLE_ITERATOR_PATTERN = "plugin_article_iterator_pattern";

  public static final String DEFAULT_PLUGIN_VERSION = "1";
  public static final String DEFAULT_REQUIRED_DAEMON_VERSION = "0.0.0";

  public static final String MAP_SUFFIX = ".xml";

  public static final String CRAWL_TYPE_HTML_LINKS = "HTML Links";
  public static final String CRAWL_TYPE_OAI = "OAI";
  public static final String[] CRAWL_TYPES = {
    CRAWL_TYPE_HTML_LINKS, CRAWL_TYPE_OAI,
  };
  public static final String DEFAULT_CRAWL_TYPE = CRAWL_TYPE_HTML_LINKS;

  protected String mapName = null;

  static Logger log = Logger.getLogger("DefinablePlugin");

  protected ExternalizableMap definitionMap = new ExternalizableMap();
  protected CacheResultHandler resultHandler = null;
  protected List<String> loadedFromUrls;
  protected CrawlWindow crawlWindow;
  protected Map<Plugin.Feature, String> featureVersion;

  public void initPlugin(LockssDaemon daemon, String extMapName) throws FileNotFoundException {
    initPlugin(daemon, extMapName, this.getClass().getClassLoader());
  }

  public void initPlugin(LockssDaemon daemon, String extMapName, ClassLoader loader)
      throws FileNotFoundException {
    // convert the plugin class name to an xml file name
    // load the configuration map from jar file
    ExternalizableMap defMap = loadMap(extMapName, loader);
    this.initPlugin(daemon, extMapName, defMap, loader);
  }

  public void initPlugin(
      LockssDaemon daemon, String extMapName, ExternalizableMap defMap, ClassLoader loader) {
    mapName = extMapName;
    this.classLoader = loader;
    this.definitionMap = defMap;
    super.initPlugin(daemon);
    initMimeMap();
    initFeatureVersions();
    initAuFeatureMap();
    checkParamAgreement();
  }

  private ExternalizableMap loadMap(String extMapName, ClassLoader loader)
      throws FileNotFoundException {
    String first = null;
    String next = extMapName;
    List<String> urls = new ArrayList<String>();
    ExternalizableMap res = null;
    while (next != null) {
      // convert the plugin class name to an xml file name
      String mapFile = next.replace('.', '/') + MAP_SUFFIX;
      URL url = loader.getResource(mapFile);
      if (url != null && urls.contains(url.toString())) {
        throw new PluginException.InvalidDefinition("Plugin inheritance loop: " + next);
      }
      // load into map
      ExternalizableMap oneMap = new ExternalizableMap();
      oneMap.loadMapFromResource(mapFile, loader);
      urls.add(url.toString());
      // apply overrides one plugin at a time in inheritance chain
      processOverrides(oneMap);
      if (res == null) {
        res = oneMap;
      } else {
        for (Map.Entry ent : oneMap.entrySet()) {
          String key = (String) ent.getKey();
          Object val = ent.getValue();
          if (!res.containsKey(key)) {
            res.setMapElement(key, val);
          }
        }
      }
      if (oneMap.containsKey(KEY_PLUGIN_PARENT)) {
        next = oneMap.getString(KEY_PLUGIN_PARENT);
      } else {
        next = null;
      }
    }
    loadedFromUrls = urls;
    return res;
  }

  /** If in testing mode FOO, copy values from FOO_override map, if any, to main map */
  void processOverrides(TypedEntryMap map) {
    String testMode = getTestingMode();
    if (StringUtil.isNullString(testMode)) {
      return;
    }
    Object o = map.getMapElement(testMode + DefinableArchivalUnit.SUFFIX_OVERRIDE);
    if (o == null) {
      return;
    }
    if (o instanceof Map) {
      Map overrideMap = (Map) o;
      for (Map.Entry entry : (Set<Map.Entry>) overrideMap.entrySet()) {
        String key = (String) entry.getKey();
        Object val = entry.getValue();
        log.debug(getDefaultPluginName() + ": Overriding " + key + " with " + val);
        map.setMapElement(key, val);
      }
    }
  }

  String getTestingMode() {
    return theDaemon == null ? null : theDaemon.getTestingMode();
  }

  // Used by tests

  public void initPlugin(LockssDaemon daemon, File file) throws PluginException {
    ExternalizableMap oneMap = new ExternalizableMap();
    oneMap.loadMap(file);
    if (oneMap.getErrorString() != null) {
      throw new PluginException(oneMap.getErrorString());
    }
    initPlugin(daemon, file.getPath(), oneMap, null);
  }

  void initPlugin(LockssDaemon daemon, ExternalizableMap defMap) {
    initPlugin(daemon, defMap, this.getClass().getClassLoader());
  }

  void initPlugin(LockssDaemon daemon, ExternalizableMap defMap, ClassLoader loader) {
    initPlugin(daemon, "Internal", defMap, loader);
  }

  enum PrintfContext {
    Regexp,
    URL,
    Display
  };

  void checkParamAgreement() {
    for (Map.Entry<String, PrintfContext> ent :
        DefinableArchivalUnit.printfKeysContext.entrySet()) {
      checkParamAgreement(ent.getKey(), ent.getValue());
    }
  }

  void checkParamAgreement(String key, PrintfContext context) {
    List<String> printfList = getElementList(key);
    if (printfList == null) {
      return;
    }
    for (String printf : printfList) {
      if (StringUtil.isNullString(printf)) {
        log.warning("Null printf string in " + key);
        continue;
      }
      PrintfUtil.PrintfData p_data = PrintfUtil.stringToPrintf(printf);
      Collection<String> p_args = p_data.getArguments();
      for (String arg : p_args) {
        ConfigParamDescr descr = findAuConfigDescr(arg);
        if (descr == null) {
          throw new PluginException.InvalidDefinition(
              "Not a declared parameter: " + arg + " in " + printf + " in " + getPluginName());
        }
        // ensure range and set params used only in legal context
        switch (context) {
          case Regexp:
          case Display:
            // everything is legal in a regexp or a display string
            break;
          case URL:
            // NUM_RANGE and SET legal because can enumerate.  Can't
            // enumerate RANGE
            switch (descr.getType()) {
              case ConfigParamDescr.TYPE_RANGE:
                throw new PluginException.InvalidDefinition(
                    "Range parameter ("
                        + arg
                        + ") used in illegal context in "
                        + getPluginName()
                        + ": "
                        + key
                        + ": "
                        + printf);
              default:
            }
        }
      }
    }
  }

  public List<String> getLoadedFromUrls() {
    return loadedFromUrls;
  }

  public String getPluginName() {
    if (definitionMap.containsKey(KEY_PLUGIN_NAME)) {
      return definitionMap.getString(KEY_PLUGIN_NAME);
    } else {
      return getDefaultPluginName();
    }
  }

  protected String getDefaultPluginName() {
    return StringUtil.shortName(getPluginId());
  }

  public String getVersion() {
    return definitionMap.getString(KEY_PLUGIN_VERSION, DEFAULT_PLUGIN_VERSION);
  }

  public String getFeatureVersion(Plugin.Feature feat) {
    if (featureVersion == null) {
      return null;
    }
    return featureVersion.get(feat);
  }

  public String getRequiredDaemonVersion() {
    return definitionMap.getString(KEY_REQUIRED_DAEMON_VERSION, DEFAULT_REQUIRED_DAEMON_VERSION);
  }

  public String getPublishingPlatform() {
    return definitionMap.getString(KEY_PUBLISHING_PLATFORM, null);
  }

  public String getPluginNotes() {
    return definitionMap.getString(KEY_PLUGIN_NOTES, null);
  }

  public String getDefaultArticleMimeType() {
    String ret = definitionMap.getString(KEY_DEFAULT_ARTICLE_MIME_TYPE, null);
    log.debug3("DefaultArticleMimeType " + ret);
    if (ret == null) {
      ret = super.getDefaultArticleMimeType();
      log.debug3("DefaultArticleMimeType from super " + ret);
    }
    return ret;
  }

  public List<String> getElementList(String key) {
    Object element = definitionMap.getMapElement(key);
    List<String> lst;

    if (element instanceof String) {
      return Collections.singletonList((String) element);
    } else if (element instanceof List) {
      return (List) element;
    } else {
      return null;
    }
  }

  public List getLocalAuConfigDescrs() throws PluginException.InvalidDefinition {
    List auConfigDescrs = (List) definitionMap.getCollection(KEY_PLUGIN_CONFIG_PROPS, null);
    if (auConfigDescrs == null) {
      throw new PluginException.InvalidDefinition(mapName + " missing ConfigParamDescrs");
    }
    return auConfigDescrs;
  }

  protected ArchivalUnit createAu0(Configuration auConfig)
      throws ArchivalUnit.ConfigurationException {
    DefinableArchivalUnit au = new DefinableArchivalUnit(this, definitionMap);
    au.setConfiguration(auConfig);
    return au;
  }

  public ExternalizableMap getDefinitionMap() {
    return definitionMap;
  }

  CacheResultHandler getCacheResultHandler() {
    return resultHandler;
  }

  String stripSuffix(String str, String suffix) {
    return str.substring(0, str.length() - suffix.length());
  }

  protected void initMimeMap() throws PluginException.InvalidDefinition {
    for (Iterator iter = definitionMap.entrySet().iterator(); iter.hasNext(); ) {
      Map.Entry ent = (Map.Entry) iter.next();
      String key = (String) ent.getKey();
      Object val = ent.getValue();
      if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY)) {
        String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY);
        if (val instanceof String) {
          String factName = (String) val;
          log.debug(mime + " link extractor: " + factName);
          MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
          LinkExtractorFactory fact =
              (LinkExtractorFactory) newAuxClass(factName, LinkExtractorFactory.class);
          mti.setLinkExtractorFactory(fact);
        }
      } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY)) {
        // XXX This clause must precede the one for SUFFIX_HASH_FILTER_FACTORY
        // XXX unless/until that key is changed to not be a terminal substring
        // XXX of this one
        String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY);
        if (val instanceof String) {
          String factName = (String) val;
          log.debug(mime + " crawl filter: " + factName);
          MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
          FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class);
          mti.setCrawlFilterFactory(fact);
        }
      } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY)) {
        String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY);
        if (val instanceof String) {
          String factName = (String) val;
          log.debug(mime + " filter: " + factName);
          MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
          FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class);
          mti.setHashFilterFactory(fact);
        }
      } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT)) {
        String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT);
        if (val instanceof String) {
          String rate = (String) val;
          log.debug(mime + " fetch rate: " + rate);
          MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
          RateLimiter limit = mti.getFetchRateLimiter();
          if (limit != null) {
            limit.setRate(rate);
          } else {
            mti.setFetchRateLimiter(new RateLimiter(rate));
          }
        }
      } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY)) {
        String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY);
        String factName = (String) val;
        log.debug(mime + " link rewriter: " + factName);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        LinkRewriterFactory fact =
            (LinkRewriterFactory) newAuxClass(factName, LinkRewriterFactory.class);
        mti.setLinkRewriterFactory(fact);
      } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP)) {
        String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP);
        Map factNameMap = (Map) val;
        Map factClassMap = new HashMap();
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        for (Iterator it = factNameMap.keySet().iterator(); it.hasNext(); ) {
          String mdTypes = (String) it.next();
          String factName = (String) factNameMap.get(mdTypes);
          log.debug(mime + " (" + mdTypes + ") metadata extractor: " + factName);
          for (String mdType : (List<String>) StringUtil.breakAt(mdTypes, ";")) {
            setMdTypeFact(factClassMap, mdType, factName);
          }
        }
        mti.setFileMetadataExtractorFactoryMap(factClassMap);
      }
    }
  }

  private void setMdTypeFact(Map factClassMap, String mdType, String factName) {
    log.debug3("Metadata type: " + mdType + " factory " + factName);
    FileMetadataExtractorFactory fact =
        (FileMetadataExtractorFactory) newAuxClass(factName, FileMetadataExtractorFactory.class);
    factClassMap.put(mdType, fact);
  }

  protected void initResultMap() throws PluginException.InvalidDefinition {
    HttpResultMap hResultMap = new HttpResultMap();
    // XXX Currently this only allows a CacheResultHandler class to
    // initialize the result map.  Instead, don't use a CacheResultMap
    // directly, use either the plugin's CacheResultHandler, if specified,
    // or a default one that wraps the CacheResultMap

    String handler_class = null;
    handler_class = definitionMap.getString(KEY_EXCEPTION_HANDLER, null);
    if (handler_class != null) {
      try {
        resultHandler = (CacheResultHandler) newAuxClass(handler_class, CacheResultHandler.class);
        resultHandler.init(hResultMap);
      } catch (Exception ex) {
        throw new PluginException.InvalidDefinition(
            mapName + " has invalid Exception handler: " + handler_class, ex);
      } catch (LinkageError le) {
        throw new PluginException.InvalidDefinition(
            mapName + " has invalid Exception handler: " + handler_class, le);
      }
    } else {
      // Expect a list of mappings from either result code or exception
      // name to CacheException name
      Collection<String> mappings = definitionMap.getCollection(KEY_EXCEPTION_LIST, null);
      if (mappings != null) {
        // add each entry
        for (String entry : mappings) {
          if (log.isDebug2()) {
            log.debug2("initMap(" + entry + ")");
          }
          String first;
          String ceName;
          try {
            List<String> pair = StringUtil.breakAt(entry, '=', 2, true, true);
            first = pair.get(0);
            ceName = pair.get(1);
          } catch (Exception ex) {
            throw new PluginException.InvalidDefinition(
                "Invalid syntax: " + entry + "in " + mapName);
          }
          Object val;

          // Value should be either a CacheException or CacheResultHandler
          // class name.
          PluginFetchEventResponse resp =
              (PluginFetchEventResponse) newAuxClass(ceName, PluginFetchEventResponse.class, null);
          if (resp instanceof CacheException) {
            val = resp.getClass();
          } else if (resp instanceof CacheResultHandler) {
            val = WrapperUtil.wrap((CacheResultHandler) resp, CacheResultHandler.class);
          } else {
            throw new PluginException.InvalidDefinition(
                "Second arg not a "
                    + "CacheException or "
                    + "CacheResultHandler class: "
                    + entry
                    + ", in "
                    + mapName);
          }
          try {
            int code = Integer.parseInt(first);
            // If parseable as an integer, it's a result code.
            hResultMap.storeMapEntry(code, val);
          } catch (NumberFormatException e) {
            try {
              Class eClass = Class.forName(first);
              // If a class name, it should be an exception class
              if (Exception.class.isAssignableFrom(eClass)) {
                hResultMap.storeMapEntry(eClass, val);
              } else {
                throw new PluginException.InvalidDefinition(
                    "First arg not an " + "Exception class: " + entry + ", in " + mapName);
              }
            } catch (Exception ex) {
              throw new PluginException.InvalidDefinition(
                  "First arg not a " + "number or class: " + entry + ", in " + mapName);
            } catch (LinkageError le) {
              throw new PluginException.InvalidDefinition("Can't load " + first, le);
            }
          }
        }
      }
    }
    resultMap = hResultMap;
  }

  protected void initFeatureVersions() throws PluginException.InvalidDefinition {
    if (definitionMap.containsKey(KEY_PLUGIN_FEATURE_VERSION_MAP)) {
      Map<Plugin.Feature, String> map = new HashMap<Plugin.Feature, String>();
      Map<String, String> spec =
          (Map<String, String>) definitionMap.getMap(KEY_PLUGIN_FEATURE_VERSION_MAP);
      log.debug2("features: " + spec);
      for (Map.Entry<String, String> ent : spec.entrySet()) {
        try {
          // Prefix version string with feature name to create separate
          // namespace for each feature
          String key = ent.getKey();
          map.put(Plugin.Feature.valueOf(key), key + "_" + ent.getValue());
        } catch (RuntimeException e) {
          log.warning(
              getPluginName()
                  + " set unknown feature: "
                  + ent.getKey()
                  + " to version "
                  + ent.getValue(),
              e);
          throw new PluginException.InvalidDefinition("Unknown feature: " + ent.getKey(), e);
        }
      }
      featureVersion = map;
    } else {
      featureVersion = null;
    }
  }

  protected void initAuFeatureMap() {
    if (definitionMap.containsKey(DefinableArchivalUnit.KEY_AU_FEATURE_URL_MAP)) {
      Map<String, ?> featMap = definitionMap.getMap(DefinableArchivalUnit.KEY_AU_FEATURE_URL_MAP);
      for (Map.Entry ent : featMap.entrySet()) {
        Object val = ent.getValue();
        if (val instanceof Map) {
          ent.setValue(MapUtil.expandAlternativeKeyLists((Map) val));
        }
      }
    }
  }

  /** Create a CrawlWindow if necessary and return it. The CrawlWindow must be thread-safe. */
  protected CrawlWindow makeCrawlWindow() {
    if (crawlWindow != null) {
      return crawlWindow;
    }
    CrawlWindow window =
        (CrawlWindow) definitionMap.getMapElement(DefinableArchivalUnit.KEY_AU_CRAWL_WINDOW_SER);
    if (window == null) {
      String window_class =
          definitionMap.getString(DefinableArchivalUnit.KEY_AU_CRAWL_WINDOW, null);
      if (window_class != null) {
        ConfigurableCrawlWindow ccw =
            (ConfigurableCrawlWindow) newAuxClass(window_class, ConfigurableCrawlWindow.class);
        try {
          window = ccw.makeCrawlWindow();
        } catch (PluginException e) {
          throw new RuntimeException(e);
        }
      }
    }
    crawlWindow = window;
    return window;
  }

  LoginPageChecker loginChecker;

  protected LoginPageChecker makeLoginPageChecker() {
    if (loginChecker == null) {
      String loginPageCheckerClass =
          definitionMap.getString(DefinableArchivalUnit.KEY_AU_LOGIN_PAGE_CHECKER, null);
      if (loginPageCheckerClass != null) {
        loginChecker =
            (LoginPageChecker) newAuxClass(loginPageCheckerClass, LoginPageChecker.class);
      }
    }
    return loginChecker;
  }

  PermissionCheckerFactory permissionCheckerFact;

  protected PermissionCheckerFactory getPermissionCheckerFactory() {
    if (permissionCheckerFact == null) {
      String permissionCheckerFactoryClass =
          definitionMap.getString(DefinableArchivalUnit.KEY_AU_PERMISSION_CHECKER_FACTORY, null);
      if (permissionCheckerFactoryClass != null) {
        permissionCheckerFact =
            (PermissionCheckerFactory)
                newAuxClass(permissionCheckerFactoryClass, PermissionCheckerFactory.class);
        log.debug2("Loaded PermissionCheckerFactory: " + permissionCheckerFact);
      }
    }
    return permissionCheckerFact;
  }

  protected UrlNormalizer urlNorm;

  protected UrlNormalizer getUrlNormalizer() {
    if (urlNorm == null) {
      String normalizerClass =
          definitionMap.getString(DefinableArchivalUnit.KEY_AU_URL_NORMALIZER, null);
      if (normalizerClass != null) {
        urlNorm = (UrlNormalizer) newAuxClass(normalizerClass, UrlNormalizer.class);
      } else {
        urlNorm = NullUrlNormalizer.INSTANCE;
      }
    }
    return urlNorm;
  }

  protected ExploderHelper exploderHelper = null;

  protected ExploderHelper getExploderHelper() {
    if (exploderHelper == null) {
      String helperClass =
          definitionMap.getString(DefinableArchivalUnit.KEY_AU_EXPLODER_HELPER, null);
      if (helperClass != null) {
        exploderHelper = (ExploderHelper) newAuxClass(helperClass, ExploderHelper.class);
      }
    }
    return exploderHelper;
  }

  protected CrawlUrlComparatorFactory crawlUrlComparatorFactory = null;

  protected CrawlUrlComparatorFactory getCrawlUrlComparatorFactory() {
    if (crawlUrlComparatorFactory == null) {
      String factClass =
          definitionMap.getString(DefinablePlugin.KEY_PLUGIN_CRAWL_URL_COMPARATOR_FACTORY, null);
      if (factClass != null) {
        crawlUrlComparatorFactory =
            (CrawlUrlComparatorFactory) newAuxClass(factClass, CrawlUrlComparatorFactory.class);
      }
    }
    return crawlUrlComparatorFactory;
  }

  protected Comparator<CrawlUrl> getCrawlUrlComparator(ArchivalUnit au)
      throws PluginException.LinkageError {
    CrawlUrlComparatorFactory fact = getCrawlUrlComparatorFactory();
    if (fact == null) {
      return null;
    }
    return fact.createCrawlUrlComparator(au);
  }

  protected FilterRule constructFilterRule(String contentType) {
    String mimeType = HeaderUtil.getMimeTypeFromContentType(contentType);

    Object filter_el =
        definitionMap.getMapElement(mimeType + DefinableArchivalUnit.SUFFIX_FILTER_RULE);

    if (filter_el instanceof String) {
      log.debug("Loading filter " + filter_el);
      return (FilterRule) newAuxClass((String) filter_el, FilterRule.class);
    } else if (filter_el instanceof List) {
      if (((List) filter_el).size() > 0) {
        return new DefinableFilterRule((List) filter_el);
      }
    }
    return super.constructFilterRule(mimeType);
  }

  protected ArticleIteratorFactory articleIteratorFact = null;
  protected ArticleMetadataExtractorFactory articleMetadataFact = null;

  /**
   * Returns the plugin's article iterator factory, if any
   *
   * @return the ArticleIteratorFactory
   */
  public ArticleIteratorFactory getArticleIteratorFactory() {
    if (articleIteratorFact == null) {
      String factClass = definitionMap.getString(KEY_PLUGIN_ARTICLE_ITERATOR_FACTORY, null);
      if (factClass != null) {
        articleIteratorFact =
            (ArticleIteratorFactory) newAuxClass(factClass, ArticleIteratorFactory.class);
      }
    }
    return articleIteratorFact;
  }

  /**
   * Returns the article iterator factory for the content type, if any
   *
   * @param contentType the content type
   * @return the ArticleIteratorFactory
   */
  public ArticleMetadataExtractorFactory getArticleMetadataExtractorFactory(MetadataTarget target) {
    if (articleMetadataFact == null) {
      String factClass =
          definitionMap.getString(KEY_PLUGIN_ARTICLE_METADATA_EXTRACTOR_FACTORY, null);
      if (factClass != null) {
        articleMetadataFact =
            (ArticleMetadataExtractorFactory)
                newAuxClass(factClass, ArticleMetadataExtractorFactory.class);
      }
    }
    return articleMetadataFact;
  }

  public String getPluginId() {
    String className;
    if (mapName != null) {
      className = mapName;
    } else {
      // @TODO: eliminate this when we eliminate subclasses
      className = this.getClass().getName();
    }
    return className;
  }
}
public class BePressArticleIteratorFactory
    implements ArticleIteratorFactory, ArticleMetadataExtractorFactory {

  /**
   * An article iterator factory, for the Section plugin variant
   *
   * @author Thib Guicherd-Callin
   */
  public static class Section implements ArticleIteratorFactory {

    protected static final String ROOT_TEMPLATE =
        "\"%s%s/%s\", base_url, journal_abbr, journal_section";

    protected static final String PATTERN_TEMPLATE =
        "\"^%s%s/%s/((([^0-9]+/)?(vol)?%d/(iss)?[0-9]+/(art|editorial)?[0-9]+)|(vol%d/(?-i:[A-Z])[0-9]+))$\", base_url, journal_abbr, journal_section, volume, volume";

    public Iterator<ArticleFiles> createArticleIterator(ArchivalUnit au, MetadataTarget target)
        throws PluginException {
      return new BePressArticleIterator(
          au,
          new SubTreeArticleIterator.Spec()
              .setTarget(target)
              .setRootTemplate(ROOT_TEMPLATE)
              .setPatternTemplate(PATTERN_TEMPLATE),
          true);
    }
  }

  protected static Logger log = Logger.getLogger("BePressArticleIteratorFactory");

  protected static final String ROOT_TEMPLATE = "\"%s%s\", base_url, journal_abbr";

  // Make the final "art or editorial + number" chunk in the first half optional
  // because a few AUs have issues with single articles which sit at the issue level
  // We'll do a content check in the createArticleFiles()
  // Finally figured out the 2nd half matching group using ?-i: to mandate case sensitivity
  protected static final String PATTERN_TEMPLATE =
      "\"^%s%s/((([^0-9]+/)?(vol)?%d/(iss)?[0-9]+(/(art|editorial)?[0-9]+)?)|(vol%d/(?-i:[A-Z])[0-9]+))$\", base_url, journal_abbr, volume, volume";

  public Iterator<ArticleFiles> createArticleIterator(ArchivalUnit au, MetadataTarget target)
      throws PluginException {
    return new BePressArticleIterator(
        au,
        new SubTreeArticleIterator.Spec()
            .setTarget(target)
            .setRootTemplate(ROOT_TEMPLATE)
            .setPatternTemplate(PATTERN_TEMPLATE),
        false);
  }

  protected static class BePressArticleIterator extends SubTreeArticleIterator {

    protected Pattern pattern;
    protected Pattern TOC_pattern;

    public BePressArticleIterator(
        ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) {
      super(au, spec);
      String volumeAsString = au.getConfiguration().get(ConfigParamDescr.VOLUME_NUMBER.getKey());
      String journalAbbr = au.getConfiguration().get(ConfigParamDescr.JOURNAL_ABBR.getKey());
      if (isSection) {
        journalAbbr = journalAbbr + "/" + au.getConfiguration().get("journal_section");
      }
      // pick up issue level and lower (make (art)?[0-9]+ optional because a few au's have article
      // at issue level
      this.pattern =
          Pattern.compile(
              String.format(
                  "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$",
                  journalAbbr, volumeAsString, volumeAsString),
              Pattern.CASE_INSENSITIVE);
      this.TOC_pattern =
          Pattern.compile(
              String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", journalAbbr, volumeAsString),
              Pattern.CASE_INSENSITIVE);
    }

    /*
     * This is comlicated. MOST AUs have articles that live below and issue level TOC
     * that is,
     * <blah>/<journal_id>/vol#/iss#/ is a toc with no relevant metadata
     * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata
     * (eg Economist Voice V1)
     * BUT
     * in some AUs there are issues with only 1 article, in which case
     * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata
     * (eg Rhodes Cook V4)
     * and a few AUs with a mixture
     * (eg Forum for Health Economics V5)
     * So to identify ALL articles, we'll also have to capture issue level items and then look
     * at the html and if it has article metadata in it, count it as an article.
     *
     */
    @Override
    protected ArticleFiles createArticleFiles(CachedUrl cu) {
      String url = cu.getUrl();
      Matcher mat = pattern.matcher(url);
      if (mat.find()) {
        // we matched, but could this pattern potentially be a toc?
        Matcher tocmat = TOC_pattern.matcher(url);
        // if we could be a TOC then we must have metadata to be considered an article
        if (tocmat.find()) {
          if (hasArticleMetadata(cu)) {
            return processUrl(cu, mat);
          }
        } else {
          // we're not a potential TOC, so treat this as an article without checking
          return processUrl(cu, mat);
        }
        return null; // this was a TOC, not an article
      }
      log.warning("Mismatch between article iterator factory and article iterator: " + url);
      return null;
    }

    protected ArticleFiles processUrl(CachedUrl cu, Matcher mat) {
      ArticleFiles af = new ArticleFiles();
      af.setFullTextCu(cu);
      af.setRoleCu(ArticleFiles.ROLE_ABSTRACT, cu);
      // XXX Full text PDF link embedded in page, cannot guess URL
      return af;
    }

    /*
     * hasArticleMetadata(CachedUrl cu)
     *   Given the CachedUrl for the potential abstract file, using the existing
     *   SimpleHtmlMetaTagMetadataExtractor to parse the file and
     *   retrieve any contained metadata. If a doi or author exists, it's an article
     *   NOT defining the Metadata Extractor here!
     */
    private boolean hasArticleMetadata(CachedUrl cu) {
      MetadataTarget at = new MetadataTarget(MetadataTarget.PURPOSE_ARTICLE);
      ArticleMetadata am;
      SimpleHtmlMetaTagMetadataExtractor ext = new SimpleHtmlMetaTagMetadataExtractor();
      if (cu != null && cu.hasContent()) {
        try {
          at.setFormat("text/html");
          am = ext.extract(at, cu);
          if ((am.containsRawKey("bepress_citation_journal_title"))
              || (am.containsRawKey("bepress_citation_abstract_html_url"))
              || (am.containsRawKey("bepress_citation_doi"))
              || (am.containsRawKey("bepress_citation_author"))) {
            return true;
          }
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
      return false; // no reasonable metadata, probably a toc
    }
  }

  public ArticleMetadataExtractor createArticleMetadataExtractor(MetadataTarget target)
      throws PluginException {
    return new BaseArticleMetadataExtractor(null);
  }
}