Esempio n. 1
0
    /**
     * Fills in an archive entry with default values.
     *
     * <p>The base URL is the {@code BASE_URL} configuration parameter of the crawl facade's AU,
     * the rest of the URL is the entry's own name, and the header fields are an empty
     * {@code CIProperties}.
     *
     * @param ae the archive entry to populate
     */
    public void process(ArchiveEntry ae) {
      // Files go into the crawler's AU, and paths start at that AU's base url.
      String baseUrl =
          crawlFacade.getAu().getConfiguration().get(ConfigParamDescr.BASE_URL.getKey());
      ae.setBaseUrl(baseUrl);
      ae.setRestOfUrl(ae.getName());
      // No header values are supplied by default; hand over an empty property set.
      ae.setHeaderFields(new CIProperties());
    }
  /**
   * Loads the AU configuration, then derives the registry-specific settings from it.
   *
   * <p>After delegating to the superclass this records the {@code BASE_URL} parameter as the
   * registry URL, installs a Creative Commons permission checker, and stores the configured
   * registry crawl interval as the AU's new-content crawl interval.
   *
   * @param config the AU configuration to read
   * @throws ConfigurationException propagated from the superclass implementation
   */
  public void loadAuConfigDescrs(Configuration config) throws ConfigurationException {
    super.loadAuConfigDescrs(config);

    String baseUrlKey = ConfigParamDescr.BASE_URL.getKey();
    this.m_registryUrl = config.get(baseUrlKey);

    // With the registry URL known, the CC permission checker can be set up. The checker is
    // built with its no-argument constructor; the registry URL is not passed to it.
    m_permissionCheckers = ListUtil.list(new CreativeCommonsPermissionChecker());

    long crawlInterval =
        CurrentConfig.getTimeIntervalParam(
            PARAM_REGISTRY_CRAWL_INTERVAL, DEFAULT_REGISTRY_CRAWL_INTERVAL);
    paramMap.putLong(KEY_AU_NEW_CONTENT_CRAWL_INTERVAL, crawlInterval);

    if (log.isDebug2()) {
      String intervalText =
          StringUtil.timeIntervalToString(paramMap.getLong(KEY_AU_NEW_CONTENT_CRAWL_INTERVAL));
      log.debug2("Setting Registry AU recrawl interval to " + intervalText);
    }
  }
public class TestBaseAtyponMetadataExtractor extends LockssTestCase {

  static Logger log = Logger.getLogger("TestBaseAtyponMetadataExtractor");

  private MockLockssDaemon theDaemon;
  private ArchivalUnit bau;
  private ArchivalUnit bau1;
  private static String PLUGIN_NAME = "org.lockss.plugin.atypon.BaseAtyponPlugin";
  static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey();
  private static String BASE_URL = "http://www.baseatypon.org/";

  // the metadata that should be extracted
  static String goodDate = "2012-07-05";
  static String[] goodAuthors = new String[] {"D. Author", "S. Author2"};
  static String goodFormat = "text/HTML";
  static String goodTitle = "Title of Article";
  static String goodType = "research-article";
  static String goodPublisher = "Base Atypon";
  static String goodPublishingPlatform = "Atypon";
  static String goodDOI = "10.1137/10081839X";
  static String goodJID = "xxx";

  static String goodJournal = "Journal Name";
  static String goodStartPage = "22";
  static String goodEndPage = "44";
  static String goodVolume = "13";
  static String goodIssue = "3";
  static String goodIssn = "1540-3459";
  static String doiURL = "http://dx.doi.org/" + goodDOI;
  private static final String ABS_URL = BASE_URL + "doi/abs/10.1175/2010WCAS1063.1";
  private static final String RIS_URL =
      BASE_URL + "action/downloadCitation?doi=" + goodDOI + "&format=ris&include=cit";

  public void setUp() throws Exception {
    super.setUp();
    setUpDiskSpace(); // you need this to have startService work properly...

    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    // in this directory this is file "test_baseatypon.tdb" but it becomes xml
    ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml"));
    Tdb tdb = ConfigManager.getCurrentConfig().getTdb();

    TdbAu tdbau1 = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume).get(0);
    assertNotNull("Didn't find named TdbAu", tdbau1);
    bau1 = PluginTestUtil.createAndStartAu(tdbau1);
    assertNotNull(bau1);
    TypedEntryMap auConfig = bau1.getProperties();
    assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY));
  }

  public void tearDown() throws Exception {
    theDaemon.stopDaemon();
    super.tearDown();
  }

  /*
   * Test the functionality of the MetadataUtilities
   *
   */
  public void testNormalizeTitleValue() throws Exception {

    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("The title goes here"),
        BaseAtyponMetadataUtil.normalizeTitle("Title Goes Here"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Title    with     random spaces"),
        BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Normalize -- hyphen"),
        BaseAtyponMetadataUtil.normalizeTitle("normalize \u2013\u2013 hyphen"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Title and title"),
        BaseAtyponMetadataUtil.normalizeTitle("Title & title"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("   leading spaces"),
        BaseAtyponMetadataUtil.normalizeTitle("leading spaces"));

    // now checking the fall-back last ditch attempt
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("leading spaces:colon?"),
        BaseAtyponMetadataUtil.generateRawTitle("leadingspacescolon"));
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("relapsing-remitting"),
        BaseAtyponMetadataUtil.generateRawTitle("relapsing?remitting"));
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("foo\"blah"),
        BaseAtyponMetadataUtil.generateRawTitle("foo-blah"));
  }

  /**
   * Configuration method.
   *
   * @return
   */

  /*
  "<meta name="dc.Title" content="Title of Article"></meta>
  "<meta name="dc.Creator" content="D. Author"></meta>
  "<meta name="dc.Creator" content="S. Author2"></meta>
  "<meta name="dc.Subject" content="weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57"></meta>
  "<meta name="dc.Description" content="Long test summary of article, probably taken directly from the adstract..."></meta>
  "<meta name="dc.Publisher" content="Name of Publisher"></meta>
  "<meta name="dc.Date" scheme="WTN8601" content="2012-07-05"></meta>
  "<meta name="dc.Type" content="research-article"></meta>
  "<meta name="dc.Format" content="text/HTML"></meta>
  "<meta name="dc.Identifier" scheme="publisher" content="81839"></meta>
  "<meta name="dc.Identifier" scheme="doi" content="10.1137/10081839X"></meta>
  "<meta name="dc.Source" content="http://dx.doi.org/10.1137/10081839X"></meta>
  "<meta name="dc.Language" content="en"></meta>
  "<meta name="dc.Coverage" content="world"></meta>
  "<meta name="keywords" content="weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57"></meta>
  */

  // a chunk of html source code from the publisher's site from where the
  // metadata should be extracted

  String goodHtmlContent =
      "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>"
          + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>"
          + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>"
          + "<meta name=\"dc.Publisher\" content=\"Base Atypon\"></meta>"
          + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>"
          + "<meta name=\"dc.Type\" content=\"research-article\"></meta>"
          + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"doi\" content=\"10.1137/10081839X\"></meta>"
          + "<meta name=\"dc.Source\" content=\"http://dx.doi.org/10.1137/10081839X\"></meta>"
          + "<meta name=\"dc.Language\" content=\"en\"></meta>"
          + "<meta name=\"dc.Coverage\" content=\"world\"></meta>"
          + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>";

  public void testExtractGoodHtmlContent() throws Exception {

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContent, true);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodTitle, md.get(MetadataField.DC_FIELD_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodFormat, md.get(MetadataField.DC_FIELD_FORMAT));
    assertEquals(goodType, md.get(MetadataField.DC_FIELD_TYPE));
    assertEquals(Arrays.asList(goodAuthors), md.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(goodAuthors[0], md.get(MetadataField.DC_FIELD_CREATOR));
  }

  String goodHtmlContentNoDOIorPublisher =
      "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>"
          + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>"
          + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>"
          + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>"
          + "<meta name=\"dc.Type\" content=\"research-article\"></meta>"
          + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>"
          + "<meta name=\"dc.Language\" content=\"en\"></meta>"
          + "<meta name=\"dc.Coverage\" content=\"world\"></meta>"
          + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>";

  public void testDOIExtraction() throws Exception {

    List<ArticleMetadata> mdlist =
        setupContentForAU(bau1, ABS_URL, goodHtmlContentNoDOIorPublisher, true);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);
    // gets pulled from the URL if not set in the metadata
    assertEquals("10.1175/2010WCAS1063.1", md.get(MetadataField.FIELD_DOI));
    // gets set manually if not in the metadata
    // first it would try the TDB
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }

  private String createGoodRisContent() {
    StringBuilder sb = new StringBuilder();
    sb.append("TY  - JOUR");
    for (String auth : goodAuthors) {
      sb.append("\nA1  - ");
      sb.append(auth);
    }
    sb.append("\nDA  - ");
    sb.append(goodDate);
    sb.append("\nJF  - ");
    sb.append(goodJournal);
    sb.append("\nSP  - ");
    sb.append(goodStartPage);
    sb.append("\nEP  - ");
    sb.append(goodEndPage);
    sb.append("\nVL  - ");
    sb.append(goodVolume);
    sb.append("\nIS  - ");
    sb.append(goodIssue);
    sb.append("\nSN  - ");
    sb.append(goodIssn);
    sb.append("\nT1  - ");
    sb.append(goodTitle);
    sb.append("\nPB  - ");
    sb.append(goodPublisher);
    sb.append("\nDO  - ");
    sb.append(goodDOI);
    sb.append("\nUR  - ");
    sb.append(doiURL);
    sb.append("\nER  -");
    return sb.toString();
  }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractGoodRisContent() throws Exception {
    String goodContent = createGoodRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME));
    assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE));
    assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE));
    assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE));
    assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN));
    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));

    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI));
    // This shouldn't get set. It will default later to fuill_text_cu
    assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL));
  }

  /* the extractor checks if data is missing it uses possible alternate RIS tags */
  private String createAlternateRisContent() {
    StringBuilder sb = new StringBuilder();
    sb.append("TY  - JOUR");
    for (String auth : goodAuthors) {
      sb.append("\nAU  - ");
      sb.append(auth);
    }
    sb.append("\nY1  - ");
    sb.append(goodDate);
    sb.append("\nT2  - ");
    sb.append(goodJournal);
    sb.append("\nT1  - ");
    sb.append(goodTitle);
    sb.append("\nPB  - ");
    sb.append(goodPublisher);
    sb.append("\nER  -");
    return sb.toString();
  }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractAlternateRisContent() throws Exception {
    String goodContent = createAlternateRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }

  /* private support methods */
  private List<ArticleMetadata> setupContentForAU(
      ArchivalUnit au, String url, String content, boolean isHtmlExtractor)
      throws IOException, PluginException {
    FileMetadataExtractor me;

    InputStream input = null;
    CIProperties props = null;
    if (isHtmlExtractor) {
      input = IOUtils.toInputStream(content, "utf-8");
      props = getContentHtmlProperties();
      me =
          new BaseAtyponHtmlMetadataExtractorFactory()
              .createFileMetadataExtractor(MetadataTarget.Any(), "text/html");
    } else {
      input = IOUtils.toInputStream(content, "utf-8");
      props = getContentRisProperties();
      me =
          new BaseAtyponRisMetadataExtractorFactory()
              .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain");
    }
    UrlData ud = new UrlData(input, props, url);
    UrlCacher uc = au.makeUrlCacher(ud);
    uc.storeContent();
    CachedUrl cu = uc.getCachedUrl();
    FileMetadataListExtractor mle = new FileMetadataListExtractor(me);
    return mle.extract(MetadataTarget.Any(), cu);
  }

  private CIProperties getContentHtmlProperties() {
    CIProperties cProps = new CIProperties();
    // the CU checks the X-Lockss-content-type, not the content-type to determine encoding
    cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/html; charset=UTF-8");
    cProps.put("Content-type", "text/html; charset=UTF-8");
    return cProps;
  }

  private CIProperties getContentRisProperties() {
    CIProperties cProps = new CIProperties();
    // the CU checks the X-Lockss-content-type, not the content-type to determine encoding
    cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/plain; charset=UTF-8");
    cProps.put("Content-type", "text/plain; charset=UTF-8");
    return cProps;
  }
}
/**
 * Tests for the Palgrave book article iterator: the iterator's root URLs, the URL pattern it
 * matches, and a walk over an AU populated with landing, citation, PDF and EPUB URLs for three
 * books.
 */
public class TestPalgraveBookArticleIteratorFactory extends ArticleIteratorTestCase {

  private static final String PLUGIN_NAME = "org.lockss.plugin.palgrave.ClockssPalgraveBookPlugin";
  private static final String BASE_URL = "http://www.palgraveconnect.com/";
  private static final String BOOK_ISBN = "9781137024497";
  private static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey();
  private static final String BOOK_ISBN_KEY = "book_isbn";
  private static final int DEFAULT_FILESIZE = 3000;

  private final String EXPECTED_PDF_LANDING_PAGE =
      "http://www.palgraveconnect.com/pc/doifinder/10.1057/9781137024497";
  private final String EXPECTED_PDF_URL =
      "http://www.palgraveconnect.com/pc/busman2013/browse/inside/download/9781137024497.pdf";
  private final String EXPECTED_FULL_TEXT_URL = EXPECTED_PDF_URL;
  private CIProperties pdfHeader = new CIProperties();
  private CIProperties textHeader = new CIProperties();
  private CIProperties epubHeader = new CIProperties();
  private static final String ContentString = "foo blah";
  InputStream random_content_stream;

  /** Creates the AU and prepares one header set per content type used by the tests. */
  @Override
  public void setUp() throws Exception {
    super.setUp();
    setUpDiskSpace();
    au = createAu();
    // set up headers for creating mock CU's of the appropriate type
    pdfHeader.put(CachedUrl.PROPERTY_CONTENT_TYPE, "application/pdf");
    textHeader.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/html");
    epubHeader.put(CachedUrl.PROPERTY_CONTENT_TYPE, "application/epub");
    // the content in the urls doesn't really matter for the test
    random_content_stream = newContentStream();
  }

  @Override
  public void tearDown() throws Exception {
    super.tearDown();
  }

  /**
   * Returns a new stream over the dummy content. A stream can only be consumed once, so every
   * stored URL needs its own instance.
   */
  private InputStream newContentStream() throws Exception {
    return new ByteArrayInputStream(ContentString.getBytes(Constants.ENCODING_UTF_8));
  }

  // Set configuration attributes to create plugin AU (archival unit)
  Configuration palgraveBookAuConfig() {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put(BASE_URL_KEY, BASE_URL);
    conf.put(BOOK_ISBN_KEY, BOOK_ISBN);
    return conf;
  }

  protected ArchivalUnit createAu() throws ArchivalUnit.ConfigurationException {
    return PluginTestUtil.createAndStartAu(PLUGIN_NAME, palgraveBookAuConfig());
  }

  /** The iterator is expected to be rooted at {@code <base_url>pc/}. */
  public void testRoots() throws Exception {
    SubTreeArticleIterator artIter = createSubTreeIter();
    assertEquals(ListUtil.list(BASE_URL + "pc/"), getRootUrls(artIter));
  }

  /**
   * Checks the iterator pattern: old-style browse/inside URLs and malformed doifinder URLs must
   * not match, doifinder download URLs (with or without .epub) must match.
   */
  public void testUrlsWithPrefixes() throws Exception {
    SubTreeArticleIterator artIter = createSubTreeIter();
    Pattern pat = getPattern(artIter);
    // PATTERN_TEMPLATE = "\"%spc/.+/browse/inside/(download|epub)?/[0-9]+\\.(html|pdf|epub)$\",
    // base_url";
    // NEW PATTERN_TEMPLATE = "\"%spc/doifinder/download/10.1057/([0-9]+)(\\.epub)?$\", base_url";
    assertNotMatchesRE(
        pat,
        "http://www.palgraveconnect.com/pc/busman2013/browsee/inside/download/9781137024497.pdfbad");
    assertNotMatchesRE(
        pat, "http://www.palgraveconnect.com/pc/doifinder/download-this/10.1057/9781137289520");
    assertNotMatchesRE(
        pat,
        "http://www.palgraveconnect.com/pc/busman2013/browse/inside/download/9781137024497.pdf");
    assertNotMatchesRE(
        pat, "http://www.palgraveconnect.com/pc/busman2013/browse/inside/epub/9781137024497.epub");
    //
    assertMatchesRE(
        pat, "http://www.palgraveconnect.com/pc/doifinder/download/10.1057/9781137024497");
    assertMatchesRE(
        pat, "http://www.palgraveconnect.com/pc/doifinder/download/10.1057/9781137024497.epub");
    assertMatchesRE(
        pat, "http://www.palgraveconnect.com/pc/doifinder/download/10.1057/9781137289520");
  }

  /** Returns the PDF download URL for the book with the given ISBN. */
  private static String downloadUrl(String isbn) {
    return BASE_URL + "pc/doifinder/download/10.1057/" + isbn;
  }

  /** Returns the citation export URL for the book with the given ISBN. */
  private static String citationUrl(String isbn) {
    return BASE_URL + "pc/browse/citationExport?doi=10.1057/" + isbn;
  }

  /** Builds the ArticleFiles expected for one book: PDF, citation metadata and EPUB roles. */
  private static ArticleFiles expectedArticleFiles(String isbn) {
    ArticleFiles af = new ArticleFiles();
    af.setRoleString(ArticleFiles.ROLE_FULL_TEXT_PDF, downloadUrl(isbn));
    af.setRoleString(ArticleFiles.ROLE_ARTICLE_METADATA, citationUrl(isbn));
    af.setRoleString(ArticleFiles.ROLE_FULL_TEXT_EPUB, downloadUrl(isbn) + ".epub");
    return af;
  }

  /**
   * Stores landing, citation, PDF and EPUB URLs for three books, then walks the article iterator
   * and logs the full-text and PDF URLs it reports. The URL comparison itself is currently
   * disabled (see the commented-out assertion in the loop).
   */
  public void testCreateArticleFiles() throws Exception {
    String[] isbns = {"9780123456789", "9781234567890", "9782345678901"};

    // Store every URL with the headers of its content type. Each call gets its own stream:
    // a shared stream would be exhausted after the first store.
    for (String isbn : isbns) {
      storeContent(newContentStream(), textHeader, BASE_URL + "pc/doifinder/10.1057/" + isbn);
      storeContent(newContentStream(), textHeader, citationUrl(isbn));
      storeContent(newContentStream(), pdfHeader, downloadUrl(isbn));
      storeContent(newContentStream(), epubHeader, downloadUrl(isbn) + ".epub");
    }

    // key the expected content to the fullTextUrl for the ArticleFiles; each book gets its own
    // ArticleFiles carrying that book's URLs
    HashMap<String, ArticleFiles> fullUrlToAF = new HashMap<String, ArticleFiles>();
    for (String isbn : isbns) {
      fullUrlToAF.put(downloadUrl(isbn), expectedArticleFiles(isbn));
    }

    // get article iterator, get article files and the appropriate urls according
    // to their roles.
    String[] expectedUrls = {
      EXPECTED_FULL_TEXT_URL, EXPECTED_PDF_URL,
    };
    for (SubTreeArticleIterator artIter = createSubTreeIter(); artIter.hasNext(); ) {
      ArticleFiles af = artIter.next();
      String[] actualUrls = {
        af.getFullTextUrl(), af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF),
        // af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE)
      };
      log.info("actualUrls: " + actualUrls.length);
      for (int i = 0; i < actualUrls.length; i++) {
        log.info("e_url: " + expectedUrls[i]);

        log.info("url: " + actualUrls[i]);
        // NOTE(review): comparison is disabled; expectedUrls still holds old-pattern URLs and
        // fullUrlToAF is not consulted yet.
        // assertEquals(expectedUrls[i], actualUrls[i]);
      }
    }
  }
}
Esempio n. 5
0
public class TestEmlsPlugin extends LockssTestCase {
  private DefinablePlugin plugin;
  static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey();
  static final String VOL_KEY = ConfigParamDescr.VOLUME_NUMBER.getKey();

  public void setUp() throws Exception {
    super.setUp();
    plugin = new DefinablePlugin();
    plugin.initPlugin(getMockLockssDaemon(), "org.lockss.plugin.emls.EmlsPlugin");
  }

  public void testGetAuNullConfig() throws ArchivalUnit.ConfigurationException {
    try {
      plugin.configureAu(null, null);
      fail("Didn't throw ArchivalUnit.ConfigurationException");
    } catch (ArchivalUnit.ConfigurationException e) {
    }
  }

  private DefinableArchivalUnit makeAuFromProps(Properties props)
      throws ArchivalUnit.ConfigurationException {
    Configuration config = ConfigurationUtil.fromProps(props);
    return (DefinableArchivalUnit) plugin.configureAu(config, null);
  }

  public void testGetAuHandlesBadUrl()
      throws ArchivalUnit.ConfigurationException, MalformedURLException {
    Properties props = new Properties();
    props.setProperty(BASE_URL_KEY, "blah");
    props.setProperty(VOL_KEY, "3");

    try {
      DefinableArchivalUnit au = makeAuFromProps(props);
      fail("Didn't throw InstantiationException when given a bad url");
    } catch (ArchivalUnit.ConfigurationException auie) {
      ConfigParamDescr.InvalidFormatException murle =
          (ConfigParamDescr.InvalidFormatException) auie.getCause();
      assertNotNull(auie.getCause());
    }
  }

  public void testGetAuConstructsProperAu()
      throws ArchivalUnit.ConfigurationException, MalformedURLException {
    Properties props = new Properties();
    props.setProperty(BASE_URL_KEY, "http://extra.shu.ac.uk/emls/");
    props.setProperty(VOL_KEY, "3");

    DefinableArchivalUnit au = makeAuFromProps(props);
    assertEquals(
        "Early Modern Literary Studies Plugin, Base URL http://extra.shu.ac.uk/emls/, Volume 3",
        au.getName());
  }

  public void testGetPluginId() {
    assertEquals("org.lockss.plugin.emls.EmlsPlugin", plugin.getPluginId());
  }

  public void testGetAuConfigProperties() {
    for (Iterator iter = plugin.getLocalAuConfigDescrs().iterator(); iter.hasNext(); ) {
      ConfigParamDescr desc = (ConfigParamDescr) iter.next();
      if (desc.equals(ConfigParamDescr.BASE_URL)) {
        continue;
      }
      if (desc.equals(ConfigParamDescr.VOLUME_NUMBER)) {
        continue;
      }
      if ("issues".equals(desc.getKey())) {
        assertEquals(ConfigParamDescr.TYPE_SET, desc.getType());
        assertFalse(desc.isDefinitional());
        continue;
      }
      fail("Unexpected config param: " + desc.getKey());
    }
  }
}
Esempio n. 6
0
/**
 * Tests for the HighWire Drupal plugin definition: AU creation, naming and start URLs, the
 * configuration parameters, the mapping of HTTP 500 responses, and the crawl rules.
 */
public class TestHighWireDrupalPlugin extends LockssTestCase {

  static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey();
  static final String VOL_KEY = ConfigParamDescr.VOLUME_NAME.getKey();

  private MockLockssDaemon theDaemon;
  private DefinablePlugin plugin;

  public TestHighWireDrupalPlugin(String msg) {
    super(msg);
  }

  /** Sets up disk space and the mock daemon, then initialises the plugin under test. */
  @Override
  public void setUp() throws Exception {
    super.setUp();
    setUpDiskSpace();
    theDaemon = getMockLockssDaemon();
    plugin = new DefinablePlugin();
    plugin.initPlugin(getMockLockssDaemon(), "org.lockss.plugin.highwire.HighWireDrupalPlugin");
  }

  /** Configuring an AU with a null configuration must be rejected. */
  public void testGetAuNullConfig() throws ArchivalUnit.ConfigurationException {
    try {
      plugin.configureAu(null, null);
      fail("Didn't throw ArchivalUnit.ConfigurationException");
    } catch (ArchivalUnit.ConfigurationException e) {
      // expected: a null configuration is invalid
    }
  }

  /** A base URL plus volume name must be enough to create an AU without an exception. */
  public void testCreateAu() throws ConfigurationException {
    Properties props = new Properties();
    props.setProperty(BASE_URL_KEY, "http://www.example.com/");
    props.setProperty(VOL_KEY, "32");
    makeAuFromProps(props);
  }

  /** Builds an AU from the given properties through the plugin under test. */
  private DefinableArchivalUnit makeAuFromProps(Properties props)
      throws ArchivalUnit.ConfigurationException {
    Configuration config = ConfigurationUtil.fromProps(props);
    return (DefinableArchivalUnit) plugin.configureAu(config, null);
  }

  /** Checks the AU name and the manifest start URL derived from the configuration. */
  public void testGetAuConstructsProperAu()
      throws ArchivalUnit.ConfigurationException, MalformedURLException {
    Properties props = new Properties();
    props.setProperty(VOL_KEY, "303");
    props.setProperty(BASE_URL_KEY, "http://www.example.com/");

    String starturl = "http://www.example.com/lockss-manifest/vol_303_manifest.html";
    DefinableArchivalUnit au = makeAuFromProps(props);
    assertEquals(
        "HighWire Drupal Plugin, Base URL http://www.example.com/, Volume 303", au.getName());
    assertEquals(ListUtil.list(starturl), au.getStartUrls());
  }

  public void testGetPluginId() {
    assertEquals("org.lockss.plugin.highwire.HighWireDrupalPlugin", plugin.getPluginId());
  }

  public void testGetAuConfigProperties() {
    assertEquals(
        ListUtil.list(ConfigParamDescr.BASE_URL, ConfigParamDescr.VOLUME_NAME),
        plugin.getLocalAuConfigDescrs());
  }

  /**
   * A 500 response must map to a RetryDeadLinkException for an arbitrary URL and to a
   * RetrySameUrlException for the AU's start URL.
   */
  public void testHandles500Result() throws Exception {
    Properties props = new Properties();
    props.setProperty(VOL_KEY, "322");
    props.setProperty(BASE_URL_KEY, "http://www.example.com/");

    String starturl = "http://www.example.com/lockss-manifest/vol_322_manifest.html";
    DefinableArchivalUnit au = makeAuFromProps(props);
    MockLockssUrlConnection conn = new MockLockssUrlConnection();
    conn.setURL("http://uuu17/");
    CacheException exc =
        ((HttpResultMap) plugin.getCacheResultMap()).mapException(au, conn, 500, "foo");
    assertClass(CacheException.RetryDeadLinkException.class, exc);

    conn.setURL(starturl);
    exc = ((HttpResultMap) plugin.getCacheResultMap()).mapException(au, conn, 500, "foo");
    assertClass(CacheException.RetrySameUrlException.class, exc);
  }

  /** Tests the crawl rules: which URLs the AU should and should not cache. */
  public void testShouldCacheProperPages() throws Exception {
    String ROOT_URL = "http://highwire.org/";
    Properties props = new Properties();
    props.setProperty(BASE_URL_KEY, ROOT_URL);
    props.setProperty(VOL_KEY, "2015");
    // Let a configuration failure propagate: swallowing it would leave the AU null and make
    // the test fail later with an unrelated NullPointerException.
    DefinableArchivalUnit au = makeAuFromProps(props);
    theDaemon.getLockssRepository(au);

    // Test for pages that should get crawled or not
    // permission page/start url
    shouldCacheTest(ROOT_URL + "lockss-manifest/vol_2015_manifest.html", true, au);
    shouldCacheTest(ROOT_URL + "clockss-manifest/vol_2015_manifest.html", false, au);
    shouldCacheTest(ROOT_URL + "manifest/year=2015", false, au);
    // toc page for a volume, issue
    shouldCacheTest(ROOT_URL + "content/2015", false, au);
    shouldCacheTest(ROOT_URL + "content/2015/1", true, au);
    shouldCacheTest(ROOT_URL + "content/2015/2.toc", true, au);
    // article files
    shouldCacheTest(ROOT_URL + "content/2015/1/2", true, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2.abstract", true, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2.extract", true, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2.full", true, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2.full.pdf", true, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2.full.pdf+html", true, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2.full-text.pdf+html", true, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2/DC1", true, au);

    shouldCacheTest(ROOT_URL + "content/2015/1/2.print", false, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2.explore", false, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2/article-info", false, au);
    shouldCacheTest(ROOT_URL + "content/2015/1/2/submit?param=12", false, au);

    shouldCacheTest(ROOT_URL + "panels_ajax_tab/hw_tab_data/node:80746/1", true, au);
    shouldCacheTest(ROOT_URL + "panels_ajax_tab/hw_tab_art/node:80746/1", false, au);

    shouldCacheTest(ROOT_URL + "highwire/citation/12/ris", true, au);
    shouldCacheTest(ROOT_URL + "highwire/citation/9/1/ris", false, au);
    shouldCacheTest(ROOT_URL + "highwire/markup/113/expansion", true, au);

    shouldCacheTest(ROOT_URL + "sites/all/libraries/modernizr/modernizr.min.js", true, au);
    shouldCacheTest(ROOT_URL + "sites/default/files/js/js_0j8_f76rvZ212f4rg.js", true, au);
    shouldCacheTest(ROOT_URL + "sites/default/themes/hw/font/fontawesome-webfont.eot", true, au);
    shouldCacheTest(ROOT_URL + "sites/default/themes/font/fontawesome-webfont.eot", true, au);

    shouldCacheTest(
        ROOT_URL + "content/hw/suppl/2014/04/23/hw.02130.DC1/hw02130_Supplemental_files.zip",
        true,
        au);
    shouldCacheTest("http://cdn.cloudfront.net/content/2015/1/3/F1.medium.gif", true, au);
    shouldCacheTest("http://cdn.mathjax.org/mathjax/latest/MathJax.js", true, au);
    shouldCacheTest("https://ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js", true, au);
    shouldCacheTest("", false, au);

    // should not get crawled - LOCKSS
    shouldCacheTest("http://lockss.stanford.edu", false, au);
  }

  /**
   * Asserts that the AU's crawl rules give the expected answer for one URL.
   *
   * @param url the URL to check
   * @param shouldCache the expected result of {@code au.shouldBeCached(url)}
   * @param au the AU whose rules are consulted
   */
  private void shouldCacheTest(String url, boolean shouldCache, ArchivalUnit au) {
    log.info("shouldCacheTest url: " + url);
    // Include the URL in the failure message so the offending rule can be identified.
    assertEquals("shouldBeCached(" + url + ")", shouldCache, au.shouldBeCached(url));
  }
}