public class TestBaseAtyponMetadataExtractor extends LockssTestCase {

  static Logger log = Logger.getLogger("TestBaseAtyponMetadataExtractor");

  private MockLockssDaemon theDaemon;
  private ArchivalUnit bau;
  private ArchivalUnit bau1;
  private static String PLUGIN_NAME = "org.lockss.plugin.atypon.BaseAtyponPlugin";
  static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey();
  private static String BASE_URL = "http://www.baseatypon.org/";

  // the metadata that should be extracted
  static String goodDate = "2012-07-05";
  static String[] goodAuthors = new String[] {"D. Author", "S. Author2"};
  static String goodFormat = "text/HTML";
  static String goodTitle = "Title of Article";
  static String goodType = "research-article";
  static String goodPublisher = "Base Atypon";
  static String goodPublishingPlatform = "Atypon";
  static String goodDOI = "10.1137/10081839X";
  static String goodJID = "xxx";

  static String goodJournal = "Journal Name";
  static String goodStartPage = "22";
  static String goodEndPage = "44";
  static String goodVolume = "13";
  static String goodIssue = "3";
  static String goodIssn = "1540-3459";
  static String doiURL = "http://dx.doi.org/" + goodDOI;
  private static final String ABS_URL = BASE_URL + "doi/abs/10.1175/2010WCAS1063.1";
  private static final String RIS_URL =
      BASE_URL + "action/downloadCitation?doi=" + goodDOI + "&format=ris&include=cit";

  public void setUp() throws Exception {
    super.setUp();
    setUpDiskSpace(); // you need this to have startService work properly...

    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    // in this directory this is file "test_baseatypon.tdb" but it becomes xml
    ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml"));
    Tdb tdb = ConfigManager.getCurrentConfig().getTdb();

    TdbAu tdbau1 = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume).get(0);
    assertNotNull("Didn't find named TdbAu", tdbau1);
    bau1 = PluginTestUtil.createAndStartAu(tdbau1);
    assertNotNull(bau1);
    TypedEntryMap auConfig = bau1.getProperties();
    assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY));
  }

  public void tearDown() throws Exception {
    theDaemon.stopDaemon();
    super.tearDown();
  }

  /*
   * Test the functionality of the MetadataUtilities
   *
   */
  public void testNormalizeTitleValue() throws Exception {

    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("The title goes here"),
        BaseAtyponMetadataUtil.normalizeTitle("Title Goes Here"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Title    with     random spaces"),
        BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Normalize -- hyphen"),
        BaseAtyponMetadataUtil.normalizeTitle("normalize \u2013\u2013 hyphen"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("Title and title"),
        BaseAtyponMetadataUtil.normalizeTitle("Title & title"));
    assertEquals(
        BaseAtyponMetadataUtil.normalizeTitle("   leading spaces"),
        BaseAtyponMetadataUtil.normalizeTitle("leading spaces"));

    // now checking the fall-back last ditch attempt
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("leading spaces:colon?"),
        BaseAtyponMetadataUtil.generateRawTitle("leadingspacescolon"));
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("relapsing-remitting"),
        BaseAtyponMetadataUtil.generateRawTitle("relapsing?remitting"));
    assertEquals(
        BaseAtyponMetadataUtil.generateRawTitle("foo\"blah"),
        BaseAtyponMetadataUtil.generateRawTitle("foo-blah"));
  }

  /**
   * Configuration method.
   *
   * @return
   */

  /*
  "<meta name="dc.Title" content="Title of Article"></meta>
  "<meta name="dc.Creator" content="D. Author"></meta>
  "<meta name="dc.Creator" content="S. Author2"></meta>
  "<meta name="dc.Subject" content="weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57"></meta>
  "<meta name="dc.Description" content="Long test summary of article, probably taken directly from the adstract..."></meta>
  "<meta name="dc.Publisher" content="Name of Publisher"></meta>
  "<meta name="dc.Date" scheme="WTN8601" content="2012-07-05"></meta>
  "<meta name="dc.Type" content="research-article"></meta>
  "<meta name="dc.Format" content="text/HTML"></meta>
  "<meta name="dc.Identifier" scheme="publisher" content="81839"></meta>
  "<meta name="dc.Identifier" scheme="doi" content="10.1137/10081839X"></meta>
  "<meta name="dc.Source" content="http://dx.doi.org/10.1137/10081839X"></meta>
  "<meta name="dc.Language" content="en"></meta>
  "<meta name="dc.Coverage" content="world"></meta>
  "<meta name="keywords" content="weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57"></meta>
  */

  // a chunk of html source code from the publisher's site from where the
  // metadata should be extracted

  String goodHtmlContent =
      "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>"
          + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>"
          + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>"
          + "<meta name=\"dc.Publisher\" content=\"Base Atypon\"></meta>"
          + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>"
          + "<meta name=\"dc.Type\" content=\"research-article\"></meta>"
          + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"doi\" content=\"10.1137/10081839X\"></meta>"
          + "<meta name=\"dc.Source\" content=\"http://dx.doi.org/10.1137/10081839X\"></meta>"
          + "<meta name=\"dc.Language\" content=\"en\"></meta>"
          + "<meta name=\"dc.Coverage\" content=\"world\"></meta>"
          + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>";

  public void testExtractGoodHtmlContent() throws Exception {

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContent, true);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodTitle, md.get(MetadataField.DC_FIELD_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodFormat, md.get(MetadataField.DC_FIELD_FORMAT));
    assertEquals(goodType, md.get(MetadataField.DC_FIELD_TYPE));
    assertEquals(Arrays.asList(goodAuthors), md.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(goodAuthors[0], md.get(MetadataField.DC_FIELD_CREATOR));
  }

  String goodHtmlContentNoDOIorPublisher =
      "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>"
          + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>"
          + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>"
          + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>"
          + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>"
          + "<meta name=\"dc.Type\" content=\"research-article\"></meta>"
          + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>"
          + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>"
          + "<meta name=\"dc.Language\" content=\"en\"></meta>"
          + "<meta name=\"dc.Coverage\" content=\"world\"></meta>"
          + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>";

  public void testDOIExtraction() throws Exception {

    List<ArticleMetadata> mdlist =
        setupContentForAU(bau1, ABS_URL, goodHtmlContentNoDOIorPublisher, true);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);
    // gets pulled from the URL if not set in the metadata
    assertEquals("10.1175/2010WCAS1063.1", md.get(MetadataField.FIELD_DOI));
    // gets set manually if not in the metadata
    // first it would try the TDB
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }

  private String createGoodRisContent() {
    StringBuilder sb = new StringBuilder();
    sb.append("TY  - JOUR");
    for (String auth : goodAuthors) {
      sb.append("\nA1  - ");
      sb.append(auth);
    }
    sb.append("\nDA  - ");
    sb.append(goodDate);
    sb.append("\nJF  - ");
    sb.append(goodJournal);
    sb.append("\nSP  - ");
    sb.append(goodStartPage);
    sb.append("\nEP  - ");
    sb.append(goodEndPage);
    sb.append("\nVL  - ");
    sb.append(goodVolume);
    sb.append("\nIS  - ");
    sb.append(goodIssue);
    sb.append("\nSN  - ");
    sb.append(goodIssn);
    sb.append("\nT1  - ");
    sb.append(goodTitle);
    sb.append("\nPB  - ");
    sb.append(goodPublisher);
    sb.append("\nDO  - ");
    sb.append(goodDOI);
    sb.append("\nUR  - ");
    sb.append(doiURL);
    sb.append("\nER  -");
    return sb.toString();
  }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractGoodRisContent() throws Exception {
    String goodContent = createGoodRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME));
    assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE));
    assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE));
    assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE));
    assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN));
    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));

    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI));
    // This shouldn't get set. It will default later to fuill_text_cu
    assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL));
  }

  /* the extractor checks if data is missing it uses possible alternate RIS tags */
  private String createAlternateRisContent() {
    StringBuilder sb = new StringBuilder();
    sb.append("TY  - JOUR");
    for (String auth : goodAuthors) {
      sb.append("\nAU  - ");
      sb.append(auth);
    }
    sb.append("\nY1  - ");
    sb.append(goodDate);
    sb.append("\nT2  - ");
    sb.append(goodJournal);
    sb.append("\nT1  - ");
    sb.append(goodTitle);
    sb.append("\nPB  - ");
    sb.append(goodPublisher);
    sb.append("\nER  -");
    return sb.toString();
  }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractAlternateRisContent() throws Exception {
    String goodContent = createAlternateRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }

  /* private support methods */
  private List<ArticleMetadata> setupContentForAU(
      ArchivalUnit au, String url, String content, boolean isHtmlExtractor)
      throws IOException, PluginException {
    FileMetadataExtractor me;

    InputStream input = null;
    CIProperties props = null;
    if (isHtmlExtractor) {
      input = IOUtils.toInputStream(content, "utf-8");
      props = getContentHtmlProperties();
      me =
          new BaseAtyponHtmlMetadataExtractorFactory()
              .createFileMetadataExtractor(MetadataTarget.Any(), "text/html");
    } else {
      input = IOUtils.toInputStream(content, "utf-8");
      props = getContentRisProperties();
      me =
          new BaseAtyponRisMetadataExtractorFactory()
              .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain");
    }
    UrlData ud = new UrlData(input, props, url);
    UrlCacher uc = au.makeUrlCacher(ud);
    uc.storeContent();
    CachedUrl cu = uc.getCachedUrl();
    FileMetadataListExtractor mle = new FileMetadataListExtractor(me);
    return mle.extract(MetadataTarget.Any(), cu);
  }

  private CIProperties getContentHtmlProperties() {
    CIProperties cProps = new CIProperties();
    // the CU checks the X-Lockss-content-type, not the content-type to determine encoding
    cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/html; charset=UTF-8");
    cProps.put("Content-type", "text/html; charset=UTF-8");
    return cProps;
  }

  private CIProperties getContentRisProperties() {
    CIProperties cProps = new CIProperties();
    // the CU checks the X-Lockss-content-type, not the content-type to determine encoding
    cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/plain; charset=UTF-8");
    cProps.put("Content-type", "text/plain; charset=UTF-8");
    return cProps;
  }
}
public class TestNatureArticleIteratorFactory extends LockssTestCase {
  static Logger log = Logger.getLogger("TestNatureArticleIteratorFactory");

  private SimulatedArchivalUnit sau; // Simulated AU to generate content
  private ArchivalUnit nau; // Nature AU
  private MockLockssDaemon theDaemon;
  private static final int DEFAULT_FILESIZE = 3000;
  private static int fileSize = DEFAULT_FILESIZE;

  private static String PLUGIN_NAME = "org.lockss.plugin.nature.ClockssNaturePublishingGroupPlugin";

  private static String BASE_URL = "http://www.nature.com/";

  public void setUp() throws Exception {
    super.setUp();
    String tempDirPath = getTempDir().getAbsolutePath() + File.separator;
    ConfigurationUtil.setFromArgs(LockssRepositoryImpl.PARAM_CACHE_LOCATION, tempDirPath);
    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath));
    nau = PluginTestUtil.createAndStartAu(PLUGIN_NAME, natureAuConfig());
  }

  public void tearDown() throws Exception {
    sau.deleteContentTree();
    theDaemon.stopDaemon();
    super.tearDown();
  }

  Configuration simAuConfig(String rootPath) {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("root", rootPath);
    conf.put("base_url", BASE_URL);
    conf.put("depth", "1");
    conf.put("branch", "4");
    conf.put("numFiles", "7");
    conf.put(
        "fileTypes",
        "" + (SimulatedContentGenerator.FILE_TYPE_HTML | SimulatedContentGenerator.FILE_TYPE_PDF));
    conf.put("binFileSize", "" + fileSize);
    return conf;
  }

  Configuration natureAuConfig() {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("base_url", BASE_URL);
    conf.put("journal_id", "aps");
    conf.put("volume_name", "123");
    conf.put("year", "2008");
    return conf;
  }

  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      String url = cu.getUrl();
      assertNotNull(cu);
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }

  //  public void testArticleCountAndDefaultType() throws Exception {
  //    testArticleCountAndType("text/html", true, 24);
  //  }
  //
  //  public void testArticleCountAndPdf() throws Exception {
  //    testArticleCountAndType("application/pdf", false, 0);
  //  }

  private void deleteBlock(CachedUrl cu) throws IOException {
    log.info("deleting " + cu.getUrl());
    CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
    ArchivalUnit au = cu.getArchivalUnit();
    CachedUrlSet cus = au.makeCachedUrlSet(cuss);
    NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
    nm.deleteNode(cus);
  }
}
Esempio n. 3
0
public class CrawlRuleTester extends Thread {
  protected static Logger log = Logger.getLogger(CrawlRuleTester.class);

  /** Proxy host */
  public static final String PARAM_PROXY_HOST = Configuration.PREFIX + "crawltest.proxy.host";

  /** Proxy port */
  public static final String PARAM_PROXY_PORT = Configuration.PREFIX + "crawltest.proxy.port";

  public static final int DEFAULT_PROXY_PORT = -1;

  /** User-Agent */
  public static final String PARAM_USER_AGENT = Configuration.PREFIX + "crawltest.userAgent";

  /* Message Types */
  public static final int ERROR_MESSAGE = 0;
  public static final int WARNING_MESSAGE = 1;
  public static final int PLAIN_MESSAGE = 2;
  public static final int URL_SUMMARY_MESSAGE = 3;
  public static final int TEST_SUMMARY_MESSAGE = 4;

  private String m_baseUrl;
  private int m_crawlDepth;
  private long m_crawlDelay;
  private int m_curDepth;
  private ArchivalUnit m_au;
  private String m_outputFile = null;
  private BufferedWriter m_outWriter = null;
  private Deadline fetchDeadline = Deadline.in(0);
  private boolean useLocalWriter = true;
  private MessageHandler m_msgHandler;
  private LockssUrlConnectionPool connectionPool = new LockssUrlConnectionPool();
  private String proxyHost;
  private String userAgent;
  private int proxyPort;

  // our storage for extracted urls
  private TreeSet m_extracted = new TreeSet();
  private TreeSet m_incls = new TreeSet();
  private TreeSet m_excls = new TreeSet();
  private TreeSet m_reported = new TreeSet();

  public CrawlRuleTester(int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    super("crawlrule tester");
    m_crawlDepth = crawlDepth;
    long minFetchDelay =
        CurrentConfig.getLongParam(
            BaseArchivalUnit.PARAM_MIN_FETCH_DELAY, BaseArchivalUnit.DEFAULT_MIN_FETCH_DELAY);
    m_crawlDelay = Math.max(crawlDelay, minFetchDelay);
    m_baseUrl = baseUrl;
    m_au = au;
  }
  /**
   * RuleTest
   *
   * @param outFile String
   * @param crawlDepth int
   * @param crawlDelay long
   * @param baseUrl String
   * @param crawlSpec CrawlSpec
   */
  public CrawlRuleTester(
      String outFile, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {

    this(crawlDepth, crawlDelay, baseUrl, au);
    m_outputFile = outFile;
  }

  /**
   * RuleTest
   *
   * @param outWriter BufferedWriter
   * @param crawlDepth int
   * @param crawlDelay long
   * @param baseUrl String
   * @param crawlSpec CrawlSpec
   */
  public CrawlRuleTester(
      BufferedWriter outWriter, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_outWriter = outWriter;
  }

  /**
   * RuleTest
   *
   * @param msgHandler MessageHandler to take all output
   * @param crawlDepth the crawl depth to use
   * @param crawlDelay the type to wait between fetches
   * @param baseUrl the url to start from
   * @param crawlSpec a CrawlSpec to use for url checking.
   */
  public CrawlRuleTester(
      MessageHandler msgHandler, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_msgHandler = msgHandler;
  }

  public void run() {
    try {
      setConfig(ConfigManager.getCurrentConfig());
      if (m_outWriter == null && m_msgHandler == null) {
        useLocalWriter = true;
      } else {
        useLocalWriter = false;
      }
      if (useLocalWriter) {
        openOutputFile();
      }
      checkRules();
      if (useLocalWriter) {
        closeOutputFile();
      }
    } finally {
      if (m_msgHandler != null) {
        m_msgHandler.close();
      }
    }
  }

  void setConfig(Configuration config) {
    log.debug("config: " + config);
    proxyHost = config.get(PARAM_PROXY_HOST);
    proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT);
    if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
      String http_proxy = System.getenv("http_proxy");
      if (!StringUtil.isNullString(http_proxy)) {
        try {
          HostPortParser hpp = new HostPortParser(http_proxy);
          proxyHost = hpp.getHost();
          proxyPort = hpp.getPort();
        } catch (HostPortParser.InvalidSpec e) {
          log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e);
        }
      }
    }
    if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
      proxyHost = null;
    } else {
      log.info("Proxying through " + proxyHost + ":" + proxyPort);
    }
    userAgent = config.get(PARAM_USER_AGENT);
    if (StringUtil.isNullString(userAgent)) {
      userAgent = null;
    } else {
      log.debug("Setting User-Agent to " + userAgent);
    }
  }

  private void openOutputFile() {
    if (m_outputFile != null) {
      try {
        m_outWriter = new BufferedWriter(new FileWriter(m_outputFile, false));
        return;
      } catch (Exception ex) {
        System.err.println("Error opening output file, writing to stdout: " + ex);
      }
    }
    m_outWriter = new BufferedWriter(new OutputStreamWriter(System.out));
  }

  private void closeOutputFile() {
    try {
      if (m_outWriter != null) {
        m_outWriter.close();
      }
    } catch (IOException ex) {
      System.err.println("Error closing output file.");
    }
  }

  int[] depth_incl;
  int[] depth_fetched;
  int[] depth_parsed;

  private void checkRules() {
    outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE);
    outputMessage(
        "crawl depth: " + m_crawlDepth + "     crawl delay: " + m_crawlDelay + " ms.",
        PLAIN_MESSAGE);

    TreeSet crawlList = new TreeSet();
    TreeSet fetched = new TreeSet();
    // inialize with the baseUrl
    crawlList.add(m_baseUrl);
    depth_incl = new int[m_crawlDepth];
    depth_fetched = new int[m_crawlDepth];
    depth_parsed = new int[m_crawlDepth];
    long start_time = TimeBase.nowMs();
    for (int depth = 1; depth <= m_crawlDepth; depth++) {
      if (isInterrupted()) {
        return;
      }
      m_curDepth = depth;
      if (crawlList.isEmpty() && depth <= m_crawlDepth) {
        outputMessage("\nNothing left to crawl, exiting after depth " + (depth - 1), PLAIN_MESSAGE);
        break;
      }
      String[] urls = (String[]) crawlList.toArray(new String[0]);
      crawlList.clear();
      outputMessage("\nDepth " + depth, PLAIN_MESSAGE);
      for (int ix = 0; ix < urls.length; ix++) {
        if (isInterrupted()) {
          return;
        }
        pauseBeforeFetch();
        String urlstr = urls[ix];

        m_incls.clear();
        m_excls.clear();

        // crawl the page
        buildUrlSets(urlstr);
        fetched.add(urlstr);
        // output incl/excl results,
        // add the new_incls to the crawlList for next crawl depth loop
        crawlList.addAll(outputUrlResults(urlstr, m_incls, m_excls));
      }
    }
    long elapsed_time = TimeBase.nowMs() - start_time;
    outputSummary(m_baseUrl, fetched, crawlList, elapsed_time);
  }

  private void buildUrlSets(String url) {

    try {
      outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE);
      URL srcUrl = new URL(url);
      //       URLConnection conn = srcUrl.openConnection();
      //       String type = conn.getContentType();
      //       type = conn.getHeaderField("content-type");
      //       InputStream istr = conn.getInputStream();

      LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool);
      if (proxyHost != null) {
        conn.setProxy(proxyHost, proxyPort);
      }
      if (userAgent != null) {
        conn.setRequestProperty("user-agent", userAgent);
      }
      try {
        conn.execute();
        int resp = conn.getResponseCode();
        if (resp != 200) {
          outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE);
          return;
        }
        depth_fetched[m_curDepth - 1]++;
        String cookies = conn.getResponseHeaderValue("Set-Cookie");
        if (cookies != null) {
          outputMessage("Cookies: " + cookies, PLAIN_MESSAGE);
        }
        String type = conn.getResponseContentType();
        if (type == null || !type.toLowerCase().startsWith("text/html")) {
          outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE);
          return;
        }
        outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE);
        InputStream istr = conn.getResponseInputStream();
        InputStreamReader reader = new InputStreamReader(istr);
        //       MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader);
        GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor();
        extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback());
        istr.close();
        depth_parsed[m_curDepth - 1]++;
      } finally {
        conn.release();
      }
    } catch (MalformedURLException murle) {
      murle.printStackTrace();
      outputErrResults(url, "Malformed URL:" + murle.getMessage());
    } catch (IOException ex) {
      ex.printStackTrace();
      outputErrResults(url, "IOException: " + ex.getMessage());
    }
  }

  private void pauseBeforeFetch() {
    if (!fetchDeadline.expired()) {
      try {
        fetchDeadline.sleep();
      } catch (InterruptedException ie) {
        // no action
      }
    }
    fetchDeadline.expireIn(m_crawlDelay);
  }

  private void outputMessage(String msg, int msgType) {
    if (isInterrupted()) {
      return;
    }

    if (m_msgHandler != null) {
      m_msgHandler.outputMessage(msg + "\n", msgType);
    } else {
      try {
        m_outWriter.write(msg);
        m_outWriter.newLine();
      } catch (Exception ex) {
        System.err.println(msg);
      }
    }
  }

  private void outputErrResults(String url, String errMsg) {
    outputMessage("Error: " + errMsg + " occured while processing " + url, ERROR_MESSAGE);
  }

  private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) {
    Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported));
    Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported));
    if (!m_inclset.isEmpty()) {
      outputMessage(
          "\nIncluded Urls: ("
              + new_incls.size()
              + " new, "
              + (m_inclset.size() - new_incls.size())
              + " old)",
          URL_SUMMARY_MESSAGE);
      depth_incl[m_curDepth - 1] += new_incls.size();
    }
    for (Iterator it = new_incls.iterator(); it.hasNext(); ) {
      outputMessage(it.next().toString(), PLAIN_MESSAGE);
    }

    if (!m_exclset.isEmpty()) {
      outputMessage(
          "\nExcluded Urls: ("
              + new_excls.size()
              + " new, "
              + (m_exclset.size() - new_excls.size())
              + " old)",
          URL_SUMMARY_MESSAGE);
    }
    for (Iterator it = new_excls.iterator(); it.hasNext(); ) {
      outputMessage(it.next().toString(), PLAIN_MESSAGE);
    }
    m_reported.addAll(new_incls);
    m_reported.addAll(new_excls);

    if (m_outWriter != null) {
      try {
        m_outWriter.flush();
      } catch (IOException ex) {
      }
    }
    return new_incls;
  }

  private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) {
    int fetchCount = fetched.size();
    outputMessage(
        "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth,
        TEST_SUMMARY_MESSAGE);
    outputMessage(
        "\nUrls fetched: " + fetchCount + "    Urls extracted: " + m_extracted.size(),
        PLAIN_MESSAGE);

    outputMessage("\nDepth  Fetched  Parsed  New URLs", PLAIN_MESSAGE);
    for (int depth = 1; depth <= m_crawlDepth; depth++) {
      PrintfFormat pf = new PrintfFormat("%5d  %7d  %6d  %8d");
      Integer[] args =
          new Integer[] {
            new Integer(depth),
            new Integer(depth_fetched[depth - 1]),
            new Integer(depth_parsed[depth - 1]),
            new Integer(depth_incl[depth - 1]),
          };
      String s = pf.sprintf(args);
      outputMessage(s, PLAIN_MESSAGE);
    }

    outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE);
    if (false) {
      for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) {
        String url = (String) iter.next();
        outputMessage(url, PLAIN_MESSAGE);
      }
    }
    long secs = elapsedTime / Constants.SECOND;
    long fetchRate = 0;
    if (secs > 0) {
      fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime;
    }
    outputMessage(
        "\nElapsed Time: " + secs + " secs." + "    Fetch Rate: " + fetchRate + " p/m",
        PLAIN_MESSAGE);
  }

  public interface MessageHandler {
    void outputMessage(String message, int messageType);

    void close();
  }

  private class MyLinkExtractorCallback implements LinkExtractor.Callback {

    MyLinkExtractorCallback() {}

    public void foundLink(String url) {

      m_extracted.add(url);
      try {
        String normUrl = UrlUtil.normalizeUrl(url);
        if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) {
          m_incls.add(normUrl);
        } else {
          m_excls.add(normUrl);
        }
      } catch (MalformedURLException e) {
        m_excls.add(url);
      }
    }
  }

  class MyMockCachedUrl implements CachedUrl {
    private String url;
    private boolean doesExist = false;
    private Reader reader = null;

    public MyMockCachedUrl(String url, Reader reader) {
      this.url = url;

      this.reader = reader;
    }

    public ArchivalUnit getArchivalUnit() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getUrl() {
      return url;
    }

    public CachedUrl getCuVersion(int version) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public CachedUrl[] getCuVersions() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public CachedUrl[] getCuVersions(int maxVersions) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public int getVersion() {
      return 1;
    }

    public Reader openForReading() {
      return reader;
    }

    public LinkRewriterFactory getLinkRewriterFactory() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getEncoding() {
      return Constants.DEFAULT_ENCODING;
    }

    /**
     * getUnfilteredInputStream
     *
     * @return InputStream
     */
    public InputStream getUnfilteredInputStream() {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * getUnfilteredInputStream
     *
     * @return InputStream
     */
    public InputStream getUnfilteredInputStream(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public InputStream getUncompressedInputStream() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public InputStream getUncompressedInputStream(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * openForHashing
     *
     * @return InputStream
     */
    public InputStream openForHashing() {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * openForHashing
     *
     * @param hasher HashedInputStream.Hasher for unfiltered content
     * @return InputStream
     */
    public InputStream openForHashing(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * getContentSize
     *
     * @return long
     */
    public long getContentSize() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getContentType() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public void setOption(String option, String val) {}

    public boolean hasContent() {
      return doesExist;
    }

    public boolean isLeaf() {
      return true;
    }

    public int getType() {
      return CachedUrlSetNode.TYPE_CACHED_URL;
    }

    public CIProperties getProperties() {
      return null;
    }

    public void addProperty(String key, String value) {}

    public void release() {}

    public String toString() {
      StringBuffer sb = new StringBuffer(url.length() + 17);
      sb.append("[MyMockCachedUrl: ");
      sb.append(url);
      sb.append("]");
      return sb.toString();
    }

    @Override
    public FileMetadataExtractor getFileMetadataExtractor(MetadataTarget target) {
      return null;
    }

    public CachedUrl getArchiveMemberCu(ArchiveMemberSpec ams) {
      throw new UnsupportedOperationException("Not implemented");
    }

    @Override
    public boolean isArchiveMember() {
      return false;
    }
  }
}
Esempio n. 4
0
// SingleThreadModel causes servlet instances to be assigned to only a
// single thread (request) at a time.
public abstract class LockssServlet extends HttpServlet implements SingleThreadModel {
  protected static Logger log = Logger.getLogger("LockssServlet");

  // Constants
  static final String PARAM_LOCAL_IP = Configuration.PREFIX + "localIPAddress";

  static final String PARAM_PLATFORM_VERSION = Configuration.PREFIX + "platform.version";

  /** Inactive HTTP session (cookie) timeout */
  static final String PARAM_UI_SESSION_TIMEOUT = Configuration.PREFIX + "ui.sessionTimeout";

  static final long DEFAULT_UI_SESSION_TIMEOUT = 2 * Constants.DAY;

  /** Maximum size of uploaded file accepted */
  static final String PARAM_MAX_UPLOAD_FILE_SIZE = Configuration.PREFIX + "ui.maxUploadFileSize";

  static final int DEFAULT_MAX_UPLOAD_FILE_SIZE = 500000;

  /** The warning string to display when the UI is disabled. */
  static final String PARAM_UI_WARNING = Configuration.PREFIX + "ui.warning";

  // session keys
  static final String SESSION_KEY_OBJECT_ID = "obj_id";
  static final String SESSION_KEY_OBJ_MAP = "obj_map";
  public static final String SESSION_KEY_RUNNING_SERVLET = "running_servlet";
  public static final String SESSION_KEY_REQUEST_HOST = "request_host";

  // Name given to form element whose value is the action that should be
  // performed when the form is submitted.  (Not always the submit button.)
  public static final String ACTION_TAG = "lockssAction";

  public static final String JAVASCRIPT_RESOURCE = "org/lockss/htdocs/admin.js";

  public static final String ATTR_INCLUDE_SCRIPT = "IncludeScript";
  public static final String ATTR_ALLOW_ROLES = "AllowRoles";

  /** User may configure admin access (add/delete/modify users, set admin access list) */
  public static final String ROLE_USER_ADMIN = "userAdminRole";

  /** User may configure content access (set content access list) */
  public static final String ROLE_CONTENT_ADMIN = "contentAdminRole";

  /** User may change AU configuration (add/delete content) */
  public static final String ROLE_AU_ADMIN = "auAdminRole";

  public static final String ROLE_DEBUG = "debugRole";

  protected ServletContext context;

  private LockssApp theApp = null;
  private ServletManager servletMgr;
  private AccountManager acctMgr;

  // Request-local storage.  Convenient, but requires servlet instances
  // to be single threaded, and must ensure reset them to avoid carrying
  // over state between requests.
  protected HttpServletRequest req;
  protected HttpServletResponse resp;
  protected URL reqURL;
  protected HttpSession session;
  private String adminDir = null;
  protected String clientAddr; // client addr, even if no param
  protected String localAddr;
  protected MultiPartRequest multiReq;

  private Vector footnotes;
  private int footNumber;
  private int tabindex;
  ServletDescr _myServletDescr = null;
  private String myName = null;

  // number submit buttons sequentially so unit tests can find them
  protected int submitButtonNumber = 0;

  /** Run once when servlet loaded. */
  public void init(ServletConfig config) throws ServletException {
    super.init(config);
    context = config.getServletContext();
    theApp = (LockssApp) context.getAttribute(ServletManager.CONTEXT_ATTR_LOCKSS_APP);
    servletMgr = (ServletManager) context.getAttribute(ServletManager.CONTEXT_ATTR_SERVLET_MGR);
    if (theApp instanceof LockssDaemon) {
      acctMgr = getLockssDaemon().getAccountManager();
    }
  }

  public ServletManager getServletManager() {
    return servletMgr;
  }

  protected ServletDescr[] getServletDescrs() {
    return servletMgr.getServletDescrs();
  }

  /** Servlets must implement this method. */
  protected abstract void lockssHandleRequest() throws ServletException, IOException;

  /** Common request handling. */
  public void service(HttpServletRequest req, HttpServletResponse resp)
      throws ServletException, IOException {
    resetState();
    boolean success = false;
    HttpSession session = req.getSession(false);
    try {
      this.req = req;
      this.resp = resp;
      if (log.isDebug()) {
        logParams();
      }
      resp.setContentType("text/html");

      if (!mayPageBeCached()) {
        resp.setHeader("pragma", "no-cache");
        resp.setHeader("Cache-control", "no-cache");
      }

      reqURL = new URL(UrlUtil.getRequestURL(req));
      clientAddr = getLocalIPAddr();

      // check that current user has permission to run this servlet
      if (!isServletAllowed(myServletDescr())) {
        displayWarningInLieuOfPage("You are not authorized to use " + myServletDescr().heading);
        return;
      }

      // check whether servlet is disabled
      String reason = ServletUtil.servletDisabledReason(myServletDescr().getServletName());
      if (reason != null) {
        displayWarningInLieuOfPage("This function is disabled. " + reason);
        return;
      }
      if (session != null) {
        session.setAttribute(SESSION_KEY_RUNNING_SERVLET, getHeading());
        String reqHost = req.getRemoteHost();
        String forw = req.getHeader(HttpFields.__XForwardedFor);
        if (!StringUtil.isNullString(forw)) {
          reqHost += " (proxies for " + forw + ")";
        }
        session.setAttribute(SESSION_KEY_REQUEST_HOST, reqHost);
      }
      lockssHandleRequest();
      success = (errMsg == null);
    } catch (ServletException e) {
      log.error("Servlet threw", e);
      throw e;
    } catch (IOException e) {
      log.error("Servlet threw", e);
      throw e;
    } catch (RuntimeException e) {
      log.error("Servlet threw", e);
      throw e;
    } finally {
      if (session != null) {
        session.setAttribute(SESSION_KEY_RUNNING_SERVLET, null);
        session.setAttribute(LockssFormAuthenticator.__J_AUTH_ACTIVITY, TimeBase.nowMs());
      }
      if ("please".equalsIgnoreCase(req.getHeader("X-Lockss-Result"))) {
        log.debug3("X-Lockss-Result: " + (success ? "Ok" : "Fail"));
        resp.setHeader("X-Lockss-Result", success ? "Ok" : "Fail");
      }
      resetMyLocals();
      resetLocals();
    }
  }

  protected void resetState() {
    multiReq = null;
    footNumber = 0;
    submitButtonNumber = 0;
    tabindex = 1;
    statusMsg = null;
    errMsg = null;
    isFramed = false;
  }

  protected void resetLocals() {}

  protected void resetMyLocals() {
    // Don't hold on to stuff forever
    req = null;
    resp = null;
    session = null;
    reqURL = null;
    adminDir = null;
    localAddr = null;
    footnotes = null;
    _myServletDescr = null;
    myName = null;
    multiReq = null;
  }

  /**
   * Return true if generated page may be cached (e.g., by browser). Default is false as most
   * servlets generate dynamic results
   */
  protected boolean mayPageBeCached() {
    return false;
  }

  /** Set the session timeout to the configured value */
  protected void setSessionTimeout(HttpSession session) {
    Configuration config = CurrentConfig.getCurrentConfig();
    setSessionTimeout(
        session, config.getTimeInterval(PARAM_UI_SESSION_TIMEOUT, DEFAULT_UI_SESSION_TIMEOUT));
  }

  /** Set the session timeout */
  protected void setSessionTimeout(HttpSession session, long time) {
    session.setMaxInactiveInterval((int) (time / Constants.SECOND));
  }

  /** Get the current session, creating it if necessary (and set the timeout if so) */
  protected HttpSession getSession() {
    if (session == null) {
      session = req.getSession(true);
      if (session.isNew()) {
        setSessionTimeout(session);
      }
    }
    return session;
  }

  /** Return true iff a session has already been established */
  protected boolean hasSession() {
    return req.getSession(false) != null;
  }

  /** Get an unused ID string for storing an object in the session */
  protected String getNewSessionObjectId() {
    HttpSession session = getSession();
    synchronized (session) {
      Integer id = (Integer) getSession().getAttribute(SESSION_KEY_OBJECT_ID);
      if (id == null) {
        id = new Integer(1);
      }
      session.setAttribute(SESSION_KEY_OBJECT_ID, new Integer(id.intValue() + 1));
      return id.toString();
    }
  }

  /** Get the object associated with the ID in the session */
  protected Object getSessionIdObject(String id) {
    HttpSession session = getSession();
    synchronized (session) {
      BidiMap map = (BidiMap) session.getAttribute(SESSION_KEY_OBJ_MAP);
      if (map == null) {
        return null;
      }
      return map.getKey(id);
    }
  }

  /** Get the String associated with the ID in the session */
  protected String getSessionIdString(String id) {
    return (String) getSessionIdObject(id);
  }

  /** Get the ID with which the object is associated with the session, if any */
  protected String getSessionObjectId(Object obj) {
    HttpSession session = getSession();
    BidiMap map;
    synchronized (session) {
      map = (BidiMap) session.getAttribute(SESSION_KEY_OBJ_MAP);
      if (map == null) {
        map = new DualHashBidiMap();
        session.setAttribute(SESSION_KEY_OBJ_MAP, map);
      }
    }
    synchronized (map) {
      String id = (String) map.get(obj);
      if (id == null) {
        id = getNewSessionObjectId();
        map.put(obj, id);
      }
      return id;
    }
  }

  // Return descriptor of running servlet
  protected ServletDescr myServletDescr() {
    if (_myServletDescr == null) {
      _myServletDescr = servletMgr.findServletDescr(this);
    }
    return _myServletDescr;
  }

  // By default, servlet heading is in descr.  Override method to
  // compute other heading
  protected String getHeading(ServletDescr d) {
    if (d == null) return "Unknown Servlet";
    return d.heading;
  }

  protected String getHeading() {
    return getHeading(myServletDescr());
  }

  String getLocalIPAddr() {
    if (localAddr == null) {
      try {
        IPAddr localHost = IPAddr.getLocalHost();
        localAddr = localHost.getHostAddress();
      } catch (UnknownHostException e) {
        // shouldn't happen
        log.error("LockssServlet: getLocalHost: " + e.toString());
        return "???";
      }
    }
    return localAddr;
  }

  // Return IP addr used by LCAP.  If specified by (misleadingly named)
  // localIPAddress prop, might not really be our address (if we are
  // behind NAT).
  String getLcapIPAddr() {
    String ip = CurrentConfig.getParam(PARAM_LOCAL_IP);
    if (ip == null || ip.length() <= 0) {
      return getLocalIPAddr();
    }
    return ip;
  }

  String getRequestHost() {
    return reqURL.getHost();
  }

  String getMachineName() {
    return PlatformUtil.getLocalHostname();
  }

  //   String getMachineName0() {
  //     if (myName == null) {
  //       // Return the canonical name of the interface the request was aimed
  //       // at.  (localIPAddress prop isn't necessarily right here, as it
  //       // might be the address of a NAT that we're behind.)
  //       String host = reqURL.getHost();
  //       try {
  // 	IPAddr localHost = IPAddr.getByName(host);
  // 	String ip = localHost.getHostAddress();
  // 	myName = getMachineName(ip);
  //       } catch (UnknownHostException e) {
  // 	// shouldn't happen
  // 	log.error("getMachineName", e);
  // 	return host;
  //       }
  //     }
  //     return myName;
  //   }

  //   String getMachineName(String ip) {
  //     try {
  //       IPAddr inet = IPAddr.getByName(ip);
  //       return inet.getHostName();
  //     } catch (UnknownHostException e) {
  //       log.warning("getMachineName", e);
  //     }
  //     return ip;
  //   }

  // return IP given name or IP
  String getMachineIP(String name) {
    try {
      IPAddr inet = IPAddr.getByName(name);
      return inet.getHostAddress();
    } catch (UnknownHostException e) {
      return null;
    }
  }

  boolean isServletLinkInNav(ServletDescr d) {
    return !isThisServlet(d) || linkMeInNav();
  }

  boolean isThisServlet(ServletDescr d) {
    return d == myServletDescr();
  }

  /** servlets may override this to determine whether they should be a link in nav table */
  protected boolean linkMeInNav() {
    return false;
  }

  boolean isLargeLogo() {
    return myServletDescr().isLargeLogo();
  }

  // user predicates
  String getUsername() {
    Principal user = req.getUserPrincipal();
    return user != null ? user.toString() : null;
  }

  protected UserAccount getUserAccount() {
    if (acctMgr != null) {
      return acctMgr.getUser(getUsername());
    }
    return AccountManager.NOBODY_ACCOUNT;
  }

  protected boolean isDebugUser() {
    return doesUserHaveRole(ROLE_DEBUG);
  }

  protected boolean doesUserHaveRole(String role) {
    if ((req.isUserInRole(role) || req.isUserInRole(ROLE_USER_ADMIN)) && !hasNoRoleParsm(role)) {
      return true;
    }
    return hasTestRole(role);
  }

  static Map<String, String> noRoleParams = new HashMap<String, String>();

  static {
    noRoleParams.put(ROLE_USER_ADMIN, "noadmin");
    noRoleParams.put(ROLE_CONTENT_ADMIN, "nocontent");
    noRoleParams.put(ROLE_AU_ADMIN, "noau");
    noRoleParams.put(ROLE_DEBUG, "nodebug");
  }

  protected boolean hasNoRoleParsm(String roleName) {
    String noRoleParam = noRoleParams.get(roleName);
    return (noRoleParam != null && !StringUtil.isNullString(req.getParameter(noRoleParam)));
  }

  protected boolean hasTestRole(String role) {
    // Servlet test harness puts roles in context
    List roles = (List) context.getAttribute(ATTR_ALLOW_ROLES);
    return roles != null && (roles.contains(role) || roles.contains(ROLE_USER_ADMIN));
  }

  protected boolean isServletAllowed(ServletDescr d) {
    if (d.needsUserAdminRole() && !doesUserHaveRole(ROLE_USER_ADMIN)) return false;
    if (d.needsContentAdminRole() && !doesUserHaveRole(ROLE_CONTENT_ADMIN)) return false;
    if (d.needsAuAdminRole() && !doesUserHaveRole(ROLE_AU_ADMIN)) return false;

    return d.isEnabled(getLockssDaemon());
  }

  protected boolean isServletDisplayed(ServletDescr d) {
    if (!isServletAllowed(d)) return false;
    if (d.needsDebugRole() && !doesUserHaveRole(ROLE_DEBUG)) return false;
    return true;
  }

  protected boolean isServletInNav(ServletDescr d) {
    if (d.cls == ServletDescr.UNAVAILABLE_SERVLET_MARKER) return false;
    return d.isInNav(this) && isServletDisplayed(d);
  }

  // Called when a servlet doesn't get the parameters it expects/needs
  protected void paramError() throws IOException {
    // FIXME: As of 2006-03-15 this method and its only caller checkParam() are not called from
    // anywhere
    PrintWriter wrtr = resp.getWriter();
    Page page = new Page();
    // add referer, params, msg to contact lockss unless from old bookmark
    // or manually entered url
    page.add("Parameter error");
    page.write(wrtr);
  }

  // return true iff error
  protected boolean checkParam(boolean ok, String msg) throws IOException {
    if (ok) return false;
    log.error(myServletDescr().getPath() + ": " + msg);
    paramError();
    return true;
  }

  /** Construct servlet URL */
  String srvURL(ServletDescr d) {
    return srvURL((String) null, d, null);
  }

  /** Construct servlet URL with params */
  String srvURL(ServletDescr d, String params) {
    return srvURL((String) null, d, params);
  }

  /** Construct servlet URL with params */
  String srvURL(ServletDescr d, Properties params) {
    return srvURL(d, concatParams(params));
  }

  /** Construct servlet absolute URL, with params as necessary. */
  String srvAbsURL(ServletDescr d, String params) {
    return srvURL(getRequestHost(), d, params);
  }

  /**
   * Construct servlet URL, with params as necessary. Avoid generating a hostname different from
   * that used in the original request, or browsers will prompt again for login
   */
  String srvURL(String host, ServletDescr d, String params) {
    return srvURLFromStem(srvUrlStem(host), d, params);
  }

  String srvURL(PeerIdentity peer, ServletDescr d, String params) {
    return srvURLFromStem(peer.getUiUrlStem(reqURL.getPort()), d, params);
  }

  /**
   * Construct servlet URL, with params as necessary. Avoid generating a hostname different from
   * that used in the original request, or browsers will prompt again for login
   */
  String srvURLFromStem(String stem, ServletDescr d, String params) {
    if (d.isPathIsUrl()) {
      return d.getPath();
    }
    StringBuilder sb = new StringBuilder(80);
    if (stem != null) {
      sb.append(stem);
      if (stem.charAt(stem.length() - 1) != '/') {
        sb.append('/');
      }
    } else {
      // ensure absolute path even if no scheme/host/port
      sb.append('/');
    }
    sb.append(d.getPath());
    if (params != null) {
      sb.append('?');
      sb.append(params);
    }
    return sb.toString();
  }

  String srvUrlStem(String host) {
    if (host == null) {
      return null;
    }
    StringBuilder sb = new StringBuilder();
    sb.append(reqURL.getProtocol());
    sb.append("://");
    sb.append(host);
    sb.append(':');
    sb.append(reqURL.getPort());
    return sb.toString();
  }

  /** Return a link to a servlet */
  String srvLink(ServletDescr d, String text) {
    return srvLink(d, text, (String) null);
  }

  /** Return a link to a servlet with params */
  String srvLink(ServletDescr d, String text, String params) {
    return new Link(srvURL(d, params), (text != null ? text : d.heading)).toString();
  }

  /** Return a link to a servlet with params */
  String srvLink(ServletDescr d, String text, Properties params) {
    return new Link(srvURL(d, params), text).toString();
  }

  /** Return an absolute link to a servlet with params */
  String srvAbsLink(ServletDescr d, String text, Properties params) {
    return srvAbsLink(d, text, concatParams(params));
  }

  /** Return an absolute link to a servlet with params */
  String srvAbsLink(ServletDescr d, String text, String params) {
    return new Link(srvAbsURL(d, params), (text != null ? text : d.heading)).toString();
  }

  /** Return an absolute link to a servlet with params */
  String srvAbsLink(String host, ServletDescr d, String text, String params) {
    return new Link(srvURL(host, d, params), (text != null ? text : d.heading)).toString();
  }

  /** Return an absolute link to a servlet with params */
  String srvAbsLink(PeerIdentity peer, ServletDescr d, String text, String params) {
    return new Link(srvURL(peer, d, params), (text != null ? text : d.heading)).toString();
  }

  /** Return text as a link iff isLink */
  String conditionalSrvLink(ServletDescr d, String text, String params, boolean isLink) {
    if (isLink) {
      return srvLink(d, text, params);
    } else {
      return text;
    }
  }

  /** Return text as a link iff isLink */
  String conditionalSrvLink(ServletDescr d, String text, boolean isLink) {
    return conditionalSrvLink(d, text, null, isLink);
  }

  /** Concatenate params for URL string */
  static String concatParams(String p1, String p2) {
    if (StringUtil.isNullString(p1)) {
      return p2;
    }
    if (StringUtil.isNullString(p2)) {
      return p1;
    }
    return p1 + "&" + p2;
  }

  /** Concatenate params for URL string */
  String concatParams(Properties props) {
    if (props == null) {
      return null;
    }
    java.util.List list = new ArrayList();
    for (Iterator iter = props.keySet().iterator(); iter.hasNext(); ) {
      String key = (String) iter.next();
      String val = props.getProperty(key);
      if (!StringUtil.isNullString(val)) {
        list.add(key + "=" + urlEncode(val));
      }
    }
    return StringUtil.separatedString(list, "&");
  }

  String modifyParams(String key, String val) {
    Properties props = getParamsAsProps();
    props.setProperty(key, val);
    return concatParams(props);
  }

  /**
   * Return the request parameters as a Properties. Only the first value of multivalued parameters
   * is included.
   */
  Properties getParamsAsProps() {
    Properties props = new Properties();
    for (Enumeration en = req.getParameterNames(); en.hasMoreElements(); ) {
      String name = (String) en.nextElement();
      props.setProperty(name, req.getParameter(name));
    }
    return props;
  }

  /**
   * Return the request parameters as a Map<String,String>. Only the first value of multivalued
   * parameters is included.
   */
  Map<String, String> getParamsAsMap() {
    Map<String, String> map = new HashMap<String, String>();
    for (Enumeration en = req.getParameterNames(); en.hasMoreElements(); ) {
      String name = (String) en.nextElement();
      map.put(name, req.getParameter(name));
    }
    return map;
  }

  protected String urlEncode(String param) {
    return UrlUtil.encodeUrl(param);
  }

  protected String getRequestKey() {
    String key = req.getPathInfo();
    if (key != null && key.startsWith("/")) {
      return key.substring(1);
    }
    return key;
  }

  /** Common page setup. */
  protected Page newPage() {
    // Compute heading
    String heading = getHeading();
    if (heading == null) {
      heading = "Box Administration";
    }

    // Create page and layout header
    Page page = ServletUtil.doNewPage(getPageTitle(), isFramed());
    Iterator inNavIterator;
    if (myServletDescr().hasNoNavTable()) {
      inNavIterator = CollectionUtil.EMPTY_ITERATOR;
    } else {
      inNavIterator =
          new FilterIterator(
              new ObjectArrayIterator(getServletDescrs()),
              new Predicate() {
                public boolean evaluate(Object obj) {
                  return isServletInNav((ServletDescr) obj);
                }
              });
    }
    ServletUtil.layoutHeader(
        this,
        page,
        heading,
        isLargeLogo(),
        getMachineName(),
        getLockssApp().getStartDate(),
        inNavIterator);
    String warnMsg = CurrentConfig.getParam(PARAM_UI_WARNING);
    if (warnMsg != null) {
      Composite warning = new Composite();
      warning.add("<center><font color=red size=+1>");
      warning.add(warnMsg);
      warning.add("</font></center><br>");
      page.add(warning);
    }
    return page;
  }

  protected Page addBarePageHeading(Page page) {
    // FIXME: Move the following fragment elsewhere
    // It causes the doctype statement to appear in the middle,
    // after the <body> tag.
    page.add("<!doctype html public \"-//w3c//dtd html 4.0 transitional//en\">");
    page.addHeader("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
    page.addHeader("<meta http-equiv=\"content-type\" content=\"text/html;charset=ISO-8859-1\">");
    page.addHeader("<link rel=\"shortcut icon\" href=\"/favicon.ico\" type=\"image/x-icon\" />");
    return page;
  }

  private boolean isFramed = false;

  protected String errMsg;

  protected String statusMsg;

  protected boolean isFramed() {
    return isFramed;
  }

  protected void setFramed(boolean v) {
    isFramed = v;
  }

  protected String getPageTitle() {
    String heading = getHeading();
    if (heading != null) {
      return "LOCKSS: " + heading;
    } else {
      return "LOCKSS";
    }
  }

  /** Return a button that invokes the javascript submit routine with the specified action */
  protected Element submitButton(String label, String action) {
    return submitButton(label, action, null, null);
  }

  /** Return a button that invokes javascript when clicked. */
  Input jsButton(String label, String js) {
    Input btn = new Input("button", null);
    btn.attribute("value", label);
    setTabOrder(btn);
    btn.attribute("onClick", js);
    return btn;
  }

  /**
   * Return a button that invokes the javascript submit routine with the specified action, first
   * storing the value in the specified form prop.
   */
  protected Element submitButton(String label, String action, String prop, String value) {
    StringBuilder sb = new StringBuilder(40);
    sb.append("lockssButton(this, '");
    sb.append(action);
    sb.append("'");
    if (prop != null && value != null) {
      sb.append(", '");
      sb.append(prop);
      sb.append("', '");
      sb.append(value);
      sb.append("'");
    }
    sb.append(")");
    Input btn = jsButton(label, sb.toString());
    btn.attribute("id", "lsb." + (++submitButtonNumber));
    return btn;
  }

  /**
   * Return a (possibly labelled) checkbox.
   *
   * @param label appears to right of checkbox if non null
   * @param value value included in result set if box checked
   * @param key form key to which result set is assigned
   * @param checked if true, box is initially checked
   * @return a checkbox Element
   */
  Element checkBox(String label, String value, String key, boolean checked) {
    Input in = new Input(Input.Checkbox, key, value);
    if (checked) {
      in.check();
    }
    setTabOrder(in);
    if (StringUtil.isNullString(label)) {
      return in;
    } else {
      Composite c = new Composite();
      c.add(in);
      c.add(" ");
      c.add(label);
      return c;
    }
  }

  /**
   * Return a labelled rasio button
   *
   * @param label label to right of circle, and form value if checked
   * @param key form key to which value is assigned
   * @param checked if true, is initially checked
   * @return a readio button Element
   */
  protected Element radioButton(String label, String key, boolean checked) {
    return radioButton(label, label, key, checked);
  }

  /**
   * Return a labelled rasio button
   *
   * @param label appears to right of circle if non null
   * @param value value assigned to key if box checked
   * @param key form key to which value is assigned
   * @param checked if true, is initially checked
   * @return a readio button Element
   */
  protected Element radioButton(String label, String value, String key, boolean checked) {
    Composite c = new Composite();
    Input in = new Input(Input.Radio, key, value);
    if (checked) {
      in.check();
    }
    setTabOrder(in);
    c.add(in);
    c.add(label);
    return c;
  }

  /** Add html tags to grey the text if isGrey is true */
  protected String greyText(String txt, boolean isGrey) {
    if (!isGrey) {
      return txt;
    }
    return "<font color=gray>" + txt + "</font>";
  }

  /**
   * Set this element next in the tab order. Returns the element for easier nesting in expressions.
   */
  protected Element setTabOrder(Element ele) {
    ele.attribute("tabindex", tabindex++);
    return ele;
  }

  /**
   * Store a footnote, assign it a number, return html for footnote reference. If footnote in null
   * or empty, no footnote is added and an empty string is returned. Footnote numbers get turned
   * into links; <b>Do not put the result of addFootnote inside a link!</b>.
   */
  protected String addFootnote(String s) {
    if (s == null || s.length() == 0) {
      return "";
    }
    if (footNumber == 0) {
      if (footnotes == null) {
        footnotes = new Vector(10, 10);
      } else {
        footnotes.removeAllElements();
      }
    }
    int n = footnotes.indexOf(s);
    if (n < 0) {
      n = footNumber++;
      footnotes.addElement(s);
    }
    return "<sup><font size=-1><a href=#foottag" + (n + 1) + ">" + (n + 1) + "</a></font></sup>";
  }

  /**
   * Add javascript to page. Normally adds a link to the script file, but can be told to include the
   * script directly in the page, to accomodate unit testing of individual servlets, when other
   * fetches won't work.
   */
  protected void addJavaScript(Composite comp) {
    String include = (String) context.getAttribute(ATTR_INCLUDE_SCRIPT);
    if (StringUtil.isNullString(include)) {
      linkToJavaScript(comp);
    } else {
      includeJavaScript0(comp);
    }
  }

  private void includeJavaScript0(Composite comp) {
    Script script = new Script(getJavascript());
    comp.add(script);
  }

  private void linkToJavaScript(Composite comp) {
    Script script = new Script("");
    script.attribute("src", "admin.js");
    comp.add(script);
  }

  private static String jstext = null;

  private static synchronized String getJavascript() {
    if (jstext == null) {
      InputStream istr = null;
      try {
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        istr = loader.getResourceAsStream(JAVASCRIPT_RESOURCE);
        jstext = StringUtil.fromInputStream(istr);
        istr.close();
      } catch (Exception e) {
        log.error("Can't load javascript", e);
      } finally {
        IOUtil.safeClose(istr);
      }
    }
    return jstext;
  }

  /** Display a message in lieu of the normal page */
  protected void displayMsgInLieuOfPage(String msg) throws IOException {
    // TODO: Look at HTML
    Page page = newPage();
    Composite warning = new Composite();
    warning.add(msg);
    warning.add("<br>");
    page.add(warning);
    layoutFooter(page);
    page.write(resp.getWriter());
  }

  /** Display a warning in red, in lieu of the normal page */
  protected void displayWarningInLieuOfPage(String msg) throws IOException {
    displayMsgInLieuOfPage("<center><font color=red size=+1>" + msg + "</font></center>");
  }

  /** Display "The cache isn't ready yet, come back later" */
  protected void displayNotStarted() throws IOException {
    displayWarningInLieuOfPage(
        "This LOCKSS box is still starting.  Please "
            + srvLink(myServletDescr(), "try again", getParamsAsProps())
            + " in a moment.");
  }

  public MultiPartRequest getMultiPartRequest() throws FormDataTooLongException, IOException {
    int maxUpload =
        CurrentConfig.getIntParam(PARAM_MAX_UPLOAD_FILE_SIZE, DEFAULT_MAX_UPLOAD_FILE_SIZE);
    return getMultiPartRequest(maxUpload);
  }

  public MultiPartRequest getMultiPartRequest(int maxLen)
      throws FormDataTooLongException, IOException {
    if (req.getContentType() == null || !req.getContentType().startsWith("multipart/form-data")) {
      return null;
    }
    if (req.getContentLength() > maxLen) {
      throw new FormDataTooLongException(req.getContentLength() + " bytes, " + maxLen + " allowed");
    }
    MultiPartRequest multi = new MultiPartRequest(req);
    if (log.isDebug2()) {
      String[] parts = multi.getPartNames();
      log.debug3("Multipart request, " + parts.length + " parts");
      if (log.isDebug3()) {
        for (int p = 0; p < parts.length; p++) {
          String name = parts[p];
          String cont = multi.getString(parts[p]);
          log.debug3(name + ": " + cont);
        }
      }
    }
    multiReq = multi;
    return multi;
  }

  public String getParameter(String name) {
    String val = req.getParameter(name);
    if (val == null && multiReq != null) {
      val = multiReq.getString(name);
    }
    if (val == null) {
      return null;
    }
    val = StringUtils.strip(val, " \t");
    //     if (StringUtil.isNullString(val)) {
    if ("".equals(val)) {
      return null;
    }
    return val;
  }

  protected void layoutFooter(Page page) {
    ServletUtil.doLayoutFooter(
        page, (footnotes == null ? null : footnotes.iterator()), getLockssApp().getVersionInfo());
    if (footnotes != null) {
      footnotes.removeAllElements();
    }
  }

  /** Return the app instance. */
  protected LockssApp getLockssApp() {
    return theApp;
  }

  /**
   * Return the daemon instance, assumes that the servlet is running in the daemon.
   *
   * @throws ClassCastException if the servlet is running in an app other than the daemon
   */
  protected LockssDaemon getLockssDaemon() {
    return (LockssDaemon) theApp;
  }

  protected void logParams() {
    Enumeration en = req.getParameterNames();
    while (en.hasMoreElements()) {
      String name = (String) en.nextElement();
      String vals[];
      String dispval;
      if (StringUtil.indexOfIgnoreCase(name, "passw") >= 0) {
        dispval = req.getParameter(name).length() == 0 ? "" : "********";
      } else if (log.isDebug2() && (vals = req.getParameterValues(name)).length > 1) {
        dispval = StringUtil.separatedString(vals, ", ");
      } else {
        dispval = req.getParameter(name);
      }
      log.debug(name + " = " + dispval);
    }
  }

  /** Convenience method */
  protected String encodeText(String s) {
    return HtmlUtil.encode(s, HtmlUtil.ENCODE_TEXT);
  }

  /** Convenience method */
  protected String encodeTextArea(String s) {
    return HtmlUtil.encode(s, HtmlUtil.ENCODE_TEXTAREA);
  }

  /** Convenience method */
  protected String encodeAttr(String s) {
    return HtmlUtil.encode(s, HtmlUtil.ENCODE_ATTR);
  }

  /**
   * Create message and error message block
   *
   * @param composite TODO
   */
  protected void layoutErrorBlock(Composite composite) {
    if (errMsg != null || statusMsg != null) {
      ServletUtil.layoutErrorBlock(composite, errMsg, statusMsg);
    }
  }

  /** Exception thrown if multipart form data is longer than the caller-supplied max */
  public static class FormDataTooLongException extends Exception {
    public FormDataTooLongException(String message) {
      super(message);
    }
  }
}
/** Central repository of loaded keystores */
public class LockssKeyStoreManager extends BaseLockssDaemonManager implements ConfigurableManager {

  protected static Logger log = Logger.getLogger("LockssKeyStoreManager");

  static final String PREFIX = Configuration.PREFIX + "keyMgr.";

  /** Default type for newly created keystores. */
  public static final String PARAM_DEFAULT_KEYSTORE_TYPE = PREFIX + "defaultKeyStoreType";

  public static final String DEFAULT_DEFAULT_KEYSTORE_TYPE = "JCEKS";

  /** Default keystore provider. */
  public static final String PARAM_DEFAULT_KEYSTORE_PROVIDER = PREFIX + "defaultKeyStoreProvider";

  public static final String DEFAULT_DEFAULT_KEYSTORE_PROVIDER = null;

  /**
   * Root of keystore definitions. For each keystore, pick a unique identifier and use it in place
   * of &lt;id&gt; in the following
   */
  public static final String PARAM_KEYSTORE = PREFIX + "keystore";

  /** If true the daemon will exit if a critical keystore is missing. */
  public static final String PARAM_EXIT_IF_MISSING_KEYSTORE = PREFIX + "exitIfMissingKeystore";

  public static final boolean DEFAULT_EXIT_IF_MISSING_KEYSTORE = true;

  /** keystore name, used by clients to refer to it */
  public static final String KEYSTORE_PARAM_NAME = "name";
  /** keystore file. Only one of file, resource or url should be set */
  public static final String KEYSTORE_PARAM_FILE = "file";
  /** keystore resource. Only one of file, resource or url should be set */
  public static final String KEYSTORE_PARAM_RESOURCE = "resource";
  /** keystore url. Only one of file, resource or url should be set */
  public static final String KEYSTORE_PARAM_URL = "url";
  /** keystore type */
  public static final String KEYSTORE_PARAM_TYPE = "type";
  /** keystore provider */
  public static final String KEYSTORE_PARAM_PROVIDER = "provider";
  /** keystore password */
  public static final String KEYSTORE_PARAM_PASSWORD = "******";
  /** private key password */
  public static final String KEYSTORE_PARAM_KEY_PASSWORD = "******";
  /** private key password file */
  public static final String KEYSTORE_PARAM_KEY_PASSWORD_FILE = "keyPasswordFile";
  /**
   * If true, and the keystore doesn't exist, a keystore with a self-signed certificate will be be
   * created.
   */
  public static final String KEYSTORE_PARAM_CREATE = "create";

  protected String defaultKeyStoreType = DEFAULT_DEFAULT_KEYSTORE_TYPE;
  protected String defaultKeyStoreProvider = DEFAULT_DEFAULT_KEYSTORE_PROVIDER;
  protected boolean paramExitIfMissingKeyStore = DEFAULT_EXIT_IF_MISSING_KEYSTORE;

  // Pseudo params for param doc
  public static final String DOC_PREFIX = PARAM_KEYSTORE + ".<id>.";

  /** Name by which daemon component(s) refer to this keystore */
  public static final String PARAM_KEYSTORE_NAME = DOC_PREFIX + KEYSTORE_PARAM_NAME;
  /** Keystore filename. Only one of file, resource or url should be set */
  public static final String PARAM_KEYSTORE_FILE = DOC_PREFIX + KEYSTORE_PARAM_FILE;
  /** Keystore resource. Only one of file, resource or url should be set */
  public static final String PARAM_KEYSTORE_RESOURCE = DOC_PREFIX + KEYSTORE_PARAM_RESOURCE;
  /** Keystore url. Only one of file, resource or url should be set */
  public static final String PARAM_KEYSTORE_URL = DOC_PREFIX + KEYSTORE_PARAM_URL;
  /** Keystore type (JKS, JCEKS, etc.) */
  public static final String PARAM_KEYSTORE_TYPE = DOC_PREFIX + KEYSTORE_PARAM_TYPE;
  /** Keystore provider (SunJCE, etc.) */
  public static final String PARAM_KEYSTORE_PROVIDER = DOC_PREFIX + KEYSTORE_PARAM_PROVIDER;
  /** Keystore password. Default is machine's fqdn */
  public static final String PARAM_KEYSTORE_PASSWORD = DOC_PREFIX + KEYSTORE_PARAM_PASSWORD;
  /** Private key password */
  public static final String PARAM_KEYSTORE_KEY_PASSWORD = DOC_PREFIX + KEYSTORE_PARAM_KEY_PASSWORD;
  /** private key password file */
  public static final String PARAM_KEYSTORE_KEY_PASSWORD_FILE =
      DOC_PREFIX + KEYSTORE_PARAM_KEY_PASSWORD_FILE;
  /**
   * If true, and the keystore doesn't exist, a keystore with a self-signed certificate will be be
   * created.
   */
  public static final String PARAM_KEYSTORE_CREATE = DOC_PREFIX + KEYSTORE_PARAM_CREATE;

  public static boolean DEFAULT_CREATE = false;

  protected Map<String, LockssKeyStore> keystoreMap = new HashMap<String, LockssKeyStore>();

  public void startService() {
    super.startService();
    loadKeyStores();
  }

  public synchronized void setConfig(
      Configuration config, Configuration prevConfig, Configuration.Differences changedKeys) {
    if (changedKeys.contains(PREFIX)) {
      defaultKeyStoreType = config.get(PARAM_DEFAULT_KEYSTORE_TYPE, DEFAULT_DEFAULT_KEYSTORE_TYPE);
      defaultKeyStoreProvider =
          config.get(PARAM_DEFAULT_KEYSTORE_PROVIDER, DEFAULT_DEFAULT_KEYSTORE_PROVIDER);
      paramExitIfMissingKeyStore =
          config.getBoolean(PARAM_EXIT_IF_MISSING_KEYSTORE, DEFAULT_EXIT_IF_MISSING_KEYSTORE);

      if (changedKeys.contains(PARAM_KEYSTORE)) {
        configureKeyStores(config);
        // defer initial set of keystore loading until startService
        if (isInited()) {
          // load any newly added keystores
          loadKeyStores();
        }
      }
    }
  }

  /**
   * Return the named LockssKeyStore or null
   *
   * @param name the keystore name
   */
  public LockssKeyStore getLockssKeyStore(String name) {
    return getLockssKeyStore(name, null);
  }

  /**
   * Return the named LockssKeyStore
   *
   * @param name the keystore name
   * @param criticalServiceName if non-null, this is a criticial keystore whose unavailability
   *     should cause the daemon to exit (if org.lockss.keyMgr.exitIfMissingKeystore is true)
   */
  public LockssKeyStore getLockssKeyStore(String name, String criticalServiceName) {
    LockssKeyStore res = keystoreMap.get(name);
    checkFact(res, name, criticalServiceName, null);
    return res;
  }

  /**
   * Convenience method to return the KeyManagerFactory from the named LockssKeyStore, or null
   *
   * @param name the keystore name
   */
  public KeyManagerFactory getKeyManagerFactory(String name) {
    return getKeyManagerFactory(name, null);
  }

  /**
   * Convenience method to return the KeyManagerFactory from the named LockssKeyStore.
   *
   * @param name the keystore name
   * @param criticalServiceName if non-null, this is a criticial keystore whose unavailability
   *     should cause the daemon to exit (if org.lockss.keyMgr.exitIfMissingKeystore is true)
   */
  public KeyManagerFactory getKeyManagerFactory(String name, String criticalServiceName) {
    LockssKeyStore lk = getLockssKeyStore(name, criticalServiceName);
    if (lk != null) {
      KeyManagerFactory fact = lk.getKeyManagerFactory();
      checkFact(fact, name, criticalServiceName, "found but contains no private keys");

      return fact;
    }
    return null;
  }

  /** Convenience method to return the TrustManagerFactory from the named LockssKeyStore, or null */
  public TrustManagerFactory getTrustManagerFactory(String name) {
    return getTrustManagerFactory(name, null);
  }

  /**
   * Convenience method to return the TrustManagerFactory from the named LockssKeyStore.
   *
   * @param name the keystore name
   * @param criticalServiceName if non-null, this is a criticial keystore whose unavailability
   *     should cause the daemon to exit (if org.lockss.keyMgr.exitIfMissingKeystore is true)
   */
  public TrustManagerFactory getTrustManagerFactory(String name, String criticalServiceName) {
    LockssKeyStore lk = getLockssKeyStore(name, criticalServiceName);
    if (lk != null) {
      TrustManagerFactory fact = lk.getTrustManagerFactory();
      checkFact(fact, name, criticalServiceName, "found but contains no trusted certificates");
      return fact;
    }
    return null;
  }

  private void checkFact(Object fact, String name, String criticalServiceName, String message) {
    if (fact == null && criticalServiceName != null) {
      String msg =
          StringUtil.isNullString(name)
              ? ("No keystore name given for critical keystore"
                  + " needed for service "
                  + criticalServiceName
                  + ", daemon exiting")
              : ("Critical keystore "
                  + name
                  + " "
                  + ((message != null) ? message : "not found or not loadable")
                  + " for service "
                  + criticalServiceName
                  + ", daemon exiting");
      log.critical(msg);

      if (paramExitIfMissingKeyStore) {
        System.exit(Constants.EXIT_CODE_KEYSTORE_MISSING);
      } else {
        throw new IllegalArgumentException(msg);
      }
    }
  }

  /** Create LockssKeystores from config subtree below {@link #PARAM_KEYSTORE} */
  void configureKeyStores(Configuration config) {
    Configuration allKs = config.getConfigTree(PARAM_KEYSTORE);
    for (Iterator iter = allKs.nodeIterator(); iter.hasNext(); ) {
      String id = (String) iter.next();
      Configuration oneKs = allKs.getConfigTree(id);
      try {
        LockssKeyStore lk = createLockssKeyStore(oneKs);
        String name = lk.getName();
        if (name == null) {
          log.error("KeyStore definition missing name: " + oneKs);
          continue;
        }
        LockssKeyStore old = keystoreMap.get(name);
        if (old != null && !lk.equals(old)) {
          log.warning(
              "Keystore "
                  + name
                  + " redefined.  "
                  + "New definition may not take effect until daemon restart");
        }

        log.debug("Adding keystore " + name);
        keystoreMap.put(name, lk);

      } catch (Exception e) {
        log.error("Couldn't create keystore: " + oneKs, e);
      }
    }
  }

  /** Create LockssKeystore from a config subtree */
  LockssKeyStore createLockssKeyStore(Configuration config) {
    log.debug2("Creating LockssKeyStore from config: " + config);
    String name = config.get(KEYSTORE_PARAM_NAME);
    LockssKeyStore lk = new LockssKeyStore(name);

    String file = config.get(KEYSTORE_PARAM_FILE);
    String resource = config.get(KEYSTORE_PARAM_RESOURCE);
    String url = config.get(KEYSTORE_PARAM_URL);

    if (!StringUtil.isNullString(file)) {
      lk.setLocation(file, LocationType.File);
    } else if (!StringUtil.isNullString(resource)) {
      lk.setLocation(resource, LocationType.Resource);
    } else if (!StringUtil.isNullString(url)) {
      lk.setLocation(url, LocationType.Url);
    }

    lk.setType(config.get(KEYSTORE_PARAM_TYPE, defaultKeyStoreType));
    lk.setProvider(config.get(KEYSTORE_PARAM_PROVIDER, defaultKeyStoreProvider));
    lk.setPassword(config.get(KEYSTORE_PARAM_PASSWORD));
    lk.setKeyPassword(config.get(KEYSTORE_PARAM_KEY_PASSWORD));
    lk.setKeyPasswordFile(config.get(KEYSTORE_PARAM_KEY_PASSWORD_FILE));
    lk.setMayCreate(config.getBoolean(KEYSTORE_PARAM_CREATE, DEFAULT_CREATE));
    return lk;
  }

  void loadKeyStores() {
    List<LockssKeyStore> lst = new ArrayList<LockssKeyStore>(keystoreMap.values());
    for (LockssKeyStore lk : lst) {
      try {
        lk.load();
      } catch (Exception e) {
        log.error("Can't load keystore " + lk.getName(), e);
        keystoreMap.remove(lk.getName());
      }
    }
  }
}
/**
 * LockssRepository is used to organize the urls being cached. It keeps a memory cache of the most
 * recently used nodes as a least-recently-used map, and also caches weak references to the
 * instances as they're doled out. This ensures that two instances of the same node are never
 * created, as the weak references only disappear when the object is finalized (they go to null when
 * the last hard reference is gone, then are removed from the cache on finalize()).
 */
public class LockssRepositoryImpl extends BaseLockssDaemonManager implements LockssRepository {
  private static Logger logger = Logger.getLogger("LockssRepository");

  /** Configuration parameter name for Lockss cache location. */
  public static final String PARAM_CACHE_LOCATION = Configuration.PREFIX + "cache.location";

  /** Restores pre repo rescanning bugfix behavior (4050) */
  public static final String PARAM_CLEAR_DIR_MAP = RepositoryManager.PREFIX + "clearDirMapOnAuStop";

  public static final boolean DEFAULT_CLEAR_DIR_MAP = false;

  /** Name of top directory in which the urls are cached. */
  public static final String CACHE_ROOT_NAME = "cache";

  // XXX This is a remnant from the single-disk days, and should go away.
  // It is used only by unit tests, which want it set to the (last)
  // individual repository dir created.
  private static String staticCacheLocation = null;

  // Maps local repository name (disk path) to LocalRepository instance
  static Map localRepositories = new HashMap();

  // starts with a '#' so no possibility of clashing with a URL
  public static final String AU_ID_FILE = "#au_id_file";
  static final String AU_ID_PROP = "au.id";
  static final String PLUGIN_ID_PROP = "plugin.id";
  static final char ESCAPE_CHAR = '#';
  static final String ESCAPE_STR = "#";
  static final char ENCODED_SEPARATOR_CHAR = 's';
  static final String INITIAL_PLUGIN_DIR = String.valueOf((char) ('a' - 1));
  static String lastPluginDir = INITIAL_PLUGIN_DIR;
  // PJG: Windows prohibits use of ':' in file name -- replace with '~' for development
  static final String PORT_SEPARATOR = SystemUtils.IS_OS_WINDOWS ? "%" : ":";

  // this contains a '#' so that it's not defeatable by strings which
  // match the prefix in a url (like '../tmp/')
  private static final String TEST_PREFIX = "/#tmp";

  private RepositoryManager repoMgr;
  private String rootLocation;
  UniqueRefLruCache nodeCache;
  private boolean isGlobalNodeCache = RepositoryManager.DEFAULT_GLOBAL_CACHE_ENABLED;

  LockssRepositoryImpl(String rootPath) {
    if (rootPath.endsWith(File.separator)) {
      rootLocation = rootPath;
    } else {
      // shouldn't happen
      StringBuilder sb = new StringBuilder(rootPath.length() + File.separator.length());
      sb.append(rootPath);
      sb.append(File.separator);
      rootLocation = sb.toString();
    }
    // Test code still needs this.
    nodeCache = new UniqueRefLruCache(RepositoryManager.DEFAULT_MAX_PER_AU_CACHE_SIZE);
    rootLocation =
        rootLocation
            .replace("?", "")
            .replace("COM8", "COMEIGHT")
            .replace("%5c", "/"); // //windows folder structure fix
  }

  public void startService() {
    super.startService();
    repoMgr = getDaemon().getRepositoryManager();
    isGlobalNodeCache = repoMgr.isGlobalNodeCache();
    if (isGlobalNodeCache) {
      nodeCache = repoMgr.getGlobalNodeCache();
    } else {
      //       nodeCache =
      // 	new UniqueRefLruCache(repoMgr.paramNodeCacheSize);
      setNodeCacheSize(repoMgr.paramNodeCacheSize);
    }
  }

  public void stopService() {
    // mainly important in testing to blank this
    lastPluginDir = INITIAL_PLUGIN_DIR;
    if (CurrentConfig.getBooleanParam(PARAM_CLEAR_DIR_MAP, DEFAULT_CLEAR_DIR_MAP)) {
      // This should be in RepositoryManager.stopService()
      localRepositories = new HashMap();
    }
    super.stopService();
  }

  public void setNodeCacheSize(int size) {
    if (nodeCache != null && !isGlobalNodeCache && nodeCache.getMaxSize() != size) {
      nodeCache.setMaxSize(size);
    }
  }

  /**
   * Called between initService() and startService(), then whenever the AU's config changes.
   *
   * @param auConfig the new configuration
   */
  public void setAuConfig(Configuration auConfig) {}

  void queueSizeCalc(RepositoryNode node) {
    repoMgr.queueSizeCalc(node);
  }

  public RepositoryNode getNode(String url) throws MalformedURLException {
    return getNode(url, false);
  }

  public RepositoryNode createNewNode(String url) throws MalformedURLException {
    return getNode(url, true);
  }

  public void deleteNode(String url) throws MalformedURLException {
    RepositoryNode node = getNode(url, false);
    if (node != null) {
      node.markAsDeleted();
    }
  }

  public void deactivateNode(String url) throws MalformedURLException {
    RepositoryNode node = getNode(url, false);
    if (node != null) {
      node.deactivateContent();
    }
  }

  /**
   * This function returns a RepositoryNode with a canonicalized path.
   *
   * @param url the url in String form
   * @param create true iff the node should be created if absent
   * @return RepositoryNode the node
   * @throws MalformedURLException
   */
  private synchronized RepositoryNode getNode(String url, boolean create)
      throws MalformedURLException {
    String canonUrl;
    boolean isAuUrl = false;
    if (AuUrl.isAuUrl(url)) {
      // path information is lost here, but is unimportant if it's an AuUrl
      canonUrl = AuUrl.PROTOCOL;
      isAuUrl = true;
    } else {
      // create a canonical path, handling all illegal path traversal
      canonUrl = canonicalizePath(url);
    }

    // check LRUMap cache for node
    RepositoryNode node = (RepositoryNode) nodeCache.get(nodeCacheKey(canonUrl));
    if (node != null) {
      return node;
    }

    String nodeLocation;
    if (isAuUrl) {
      // base directory of ArchivalUnit
      nodeLocation = rootLocation;
      node = new AuNodeImpl(canonUrl, nodeLocation, this);
    } else {
      // determine proper node location
      nodeLocation =
          LockssRepositoryImpl.mapUrlToFileLocation(rootLocation, canonUrl)
              .replace("?", "")
              .replace("COM8", "COMEIGHT")
              .replace("%5c", "/"); // //windows folder structure fix
      node = new RepositoryNodeImpl(canonUrl, nodeLocation, this);
    }

    if (!create) {
      // if not creating, check for existence
      File nodeDir = new File(nodeLocation);
      if (!nodeDir.exists()) {
        // return null if the node doesn't exist and shouldn't be created
        return null;
      }
      if (!nodeDir.isDirectory()) {
        logger.error("Cache file not a directory: " + nodeLocation);
        throw new LockssRepository.RepositoryStateException("Invalid cache file.");
      }
    }

    // add to node cache
    nodeCache.put(nodeCacheKey(canonUrl), node);
    return node;
  }

  Object nodeCacheKey(String canonUrl) {
    if (isGlobalNodeCache) {
      return new KeyPair(this, canonUrl);
    }
    return canonUrl;
  }

  // functions for testing
  int getCacheHits() {
    return nodeCache.getCacheHits();
  }

  int getCacheMisses() {
    return nodeCache.getCacheMisses();
  }

  int getRefHits() {
    return nodeCache.getRefHits();
  }

  int getRefMisses() {
    return nodeCache.getRefMisses();
  }

  public void nodeConsistencyCheck() {
    // traverse the node tree from the top
    RepositoryNode topNode;
    try {
      topNode = getNode(AuUrl.PROTOCOL_COLON);
      recurseConsistencyCheck((RepositoryNodeImpl) topNode);
    } catch (MalformedURLException ignore) {
    }
  }

  /**
   * Checks the consistency of the node, and continues with its children if it's consistent.
   *
   * @param node RepositoryNodeImpl the node to check
   */
  private void recurseConsistencyCheck(RepositoryNodeImpl node) {
    logger.debug2("Checking node '" + node.getNodeUrl() + "'...");
    // check consistency at each node
    // correct/deactivate as necessary
    // 'checkNodeConsistency()' will repair if possible
    if (node.checkNodeConsistency()) {
      logger.debug3("Node consistent; recursing on children...");
      List children = node.getNodeList(null, false);
      Iterator iter = children.iterator();
      while (iter.hasNext()) {
        RepositoryNodeImpl child = (RepositoryNodeImpl) iter.next();
        recurseConsistencyCheck(child);
      }
    } else {
      logger.debug3("Node inconsistent; deactivating...");
      deactivateInconsistentNode(node);
    }
  }

  /**
   * This is called when a node is in an inconsistent state. It simply creates some necessary
   * directories and deactivates the node. Future polls should restore it properly.
   *
   * @param node the inconsistent node
   */
  void deactivateInconsistentNode(RepositoryNodeImpl node) {
    logger.warning("Deactivating inconsistent node.");
    FileUtil.ensureDirExists(node.contentDir);
    // manually deactivate
    node.deactivateContent();
  }

  /**
   * A method to remove any non-canonical '..' or '.' elements in the path, as well as protecting
   * against illegal path traversal.
   *
   * @param url the raw url
   * @return String the canonicalized url
   * @throws MalformedURLException
   */
  public String canonicalizePath(String url) throws MalformedURLException {
    String canonUrl = UrlUtil.normalizeUrl(url, UrlUtil.PATH_TRAVERSAL_ACTION_THROW);
    // canonicalize "dir" and "dir/"
    // XXX if these are ever two separate nodes, this is wrong
    if (canonUrl.endsWith(UrlUtil.URL_PATH_SEPARATOR)) {
      canonUrl = canonUrl.substring(0, canonUrl.length() - 1);
    }

    return canonUrl;
  }

  // static calls

  /**
   * Factory method to create new LockssRepository instances.
   *
   * @param au the {@link ArchivalUnit}
   * @return the new LockssRepository instance
   */
  public static LockssRepository createNewLockssRepository(ArchivalUnit au) {
    String root = getRepositoryRoot(au);
    if (root == null || root.equals("null")) {
      logger.error("No repository dir set in config");
      throw new LockssRepository.RepositoryStateException("No repository dir set in config");
    }
    String auDir = LockssRepositoryImpl.mapAuToFileLocation(root, au);
    if (logger.isDebug2()) {
      logger.debug2("repo: " + auDir + ", au: " + au.getName());
    }
    staticCacheLocation = extendCacheLocation(root);
    LockssRepositoryImpl repo = new LockssRepositoryImpl(auDir);
    Plugin plugin = au.getPlugin();
    if (plugin != null) {
      LockssDaemon daemon = plugin.getDaemon();
      if (daemon != null) {
        RepositoryManager mgr = daemon.getRepositoryManager();
        if (mgr != null) {
          mgr.setRepositoryForPath(auDir, repo);
        }
      }
    }
    return repo;
  }

  public static String getRepositorySpec(ArchivalUnit au) {
    Configuration auConfig = au.getConfiguration();
    if (auConfig != null) { // can be null in unit tests
      String repoSpec = auConfig.get(PluginManager.AU_PARAM_REPOSITORY);
      if (repoSpec != null && repoSpec.startsWith("local:")) {
        return repoSpec;
      }
    }
    return "local:" + CurrentConfig.getParam(PARAM_CACHE_LOCATION);
  }

  public static String getRepositoryRoot(ArchivalUnit au) {
    return getLocalRepositoryPath(getRepositorySpec(au));
  }

  public static String getLocalRepositoryPath(String repoSpec) {
    if (repoSpec != null) {
      if (repoSpec.startsWith("local:")) {
        return repoSpec.substring(6);
      }
    }
    return null;
  }

  // The OpenBSD platform has renamed the first disk from /cache to
  // /cache.wd0, leaving behind a symbolic link in /cache .  This is
  // transparent everywhere except the repository status table, which needs
  // to match AU configs with AUs it finds when enumerating the repository.
  // Existing AU configs have repository=local:/cache, so the relative link
  // needs to be resolved to detect that that's the same as
  // local:/cache.wd0

  private static Map canonicalRoots = new HashMap();

  public static boolean isDirInRepository(String dir, String repoRoot) {
    if (dir.startsWith(repoRoot)) {
      return true;
    }
    return canonRoot(dir).startsWith(canonRoot(repoRoot));
  }

  static String canonRoot(String root) {
    synchronized (canonicalRoots) {
      String canon = (String) canonicalRoots.get(root);
      if (canon == null) {
        try {
          canon = new File(root).getCanonicalPath();
          canonicalRoots.put(root, canon);
        } catch (IOException e) {
          logger.warning("Can't canonicalize: " + root, e);
          return root;
        }
      }
      return canon;
    }
  }

  static String getCacheLocation() {
    return staticCacheLocation;
  }

  /**
   * Adds the 'cache' directory to the HD location.
   *
   * @param cacheDir the root location.
   * @return String the extended location
   */
  static String extendCacheLocation(String cacheDir) {
    StringBuilder buffer = new StringBuilder(cacheDir);
    if (!cacheDir.endsWith(File.separator)) {
      buffer.append(File.separator);
    }
    buffer.append(CACHE_ROOT_NAME);
    buffer.append(File.separator);
    return buffer.toString();
  }

  /**
   * mapAuToFileLocation() is the method used to resolve {@link ArchivalUnit}s into directory names.
   * This maps a given au to directories, using the cache root as the base. Given an au with
   * PluginId of 'plugin' and AuId of 'au', it would return the string '<rootLocation>/plugin/au/'.
   *
   * @param repoRoot the root of a LOCKSS repository
   * @param au the ArchivalUnit to resolve
   * @return the directory location
   */
  public static String mapAuToFileLocation(String repoRoot, ArchivalUnit au) {
    return getAuDir(au, repoRoot, true);
  }

  /**
   * mapUrlToFileLocation() is the method used to resolve urls into file names. This maps a given
   * url to a file location, using the au top directory as the base. It creates directories which
   * mirror the html string, so 'http://www.journal.org/issue1/index.html' would be cached in the
   * file: <rootLocation>/www.journal.org/http/issue1/index.html
   *
   * @param rootLocation the top directory for ArchivalUnit this URL is in
   * @param urlStr the url to translate
   * @return the url file location
   * @throws java.net.MalformedURLException
   */
  public static String mapUrlToFileLocation(String rootLocation, String urlStr)
      throws MalformedURLException {
    int totalLength = rootLocation.length() + urlStr.length();
    URL url = new URL(urlStr);
    StringBuilder buffer = new StringBuilder(totalLength);
    buffer.append(rootLocation);
    if (!rootLocation.endsWith(File.separator)) {
      buffer.append(File.separator);
    }
    buffer.append(url.getHost().toLowerCase());
    int port = url.getPort();
    if (port != -1) {
      buffer.append(PORT_SEPARATOR);
      buffer.append(port);
    }
    buffer.append(File.separator);
    buffer.append(url.getProtocol());
    if (RepositoryManager.isEnableLongComponents()) {
      String escapedPath =
          escapePath(
              StringUtil.replaceString(url.getPath(), UrlUtil.URL_PATH_SEPARATOR, File.separator));
      String query = url.getQuery();
      if (query != null) {
        escapedPath = escapedPath + "?" + escapeQuery(query);
      }
      String encodedPath = RepositoryNodeImpl.encodeUrl(escapedPath);
      // encodeUrl strips leading / from path
      buffer.append(File.separator);
      buffer.append(encodedPath);
    } else {
      buffer.append(
          escapePath(
              StringUtil.replaceString(url.getPath(), UrlUtil.URL_PATH_SEPARATOR, File.separator)));
      String query = url.getQuery();
      if (query != null) {
        buffer.append("?");
        buffer.append(escapeQuery(query));
      }
    }
    return buffer.toString();
  }

  // name mapping functions

  /**
   * Return true iff a repository for the auid exists under the root
   *
   * @param auid
   * @param repoRoot the repository root
   * @return true iff a repository for the auid exists
   */
  static boolean doesAuDirExist(String auid, String repoRoot) {
    return null != getAuDir(auid, repoRoot, false);
  }

  /**
   * Finds the directory for this AU. If none found in the map, designates a new dir for it.
   *
   * @param au the AU
   * @param repoRoot root of the repository
   * @return the dir {@link String}
   */
  static String getAuDir(ArchivalUnit au, String repoRoot, boolean create) {
    return getAuDir(au.getAuId(), repoRoot, create);
  }

  /**
   * Finds the directory for this AU. If none found in the map, designates a new dir for it.
   *
   * @param auid AU id representing the au
   * @param repoRoot path to the root of the repository
   * @return the dir String
   */
  static String getAuDir(String auid, String repoRoot, boolean create) {
    String repoCachePath = extendCacheLocation(repoRoot);
    LocalRepository localRepo = getLocalRepository(repoRoot);
    synchronized (localRepo) {
      Map aumap = localRepo.getAuMap();
      String auPathSlash = (String) aumap.get(auid);
      if (auPathSlash != null) {
        return auPathSlash;
      }
      if (!create) {
        return null;
      }
      logger.debug3("Creating new au directory for '" + auid + "'.");
      String auDir = localRepo.getPrevAuDir();
      for (int cnt = RepositoryManager.getMaxUnusedDirSearch(); cnt > 0; cnt--) {
        // loop through looking for an available dir
        auDir = getNextDirName(auDir);
        File testDir = new File(repoCachePath, auDir);
        if (logger.isDebug3()) logger.debug3("Probe for unused: " + testDir);
        if (!testDir.exists()) {
          if (RepositoryManager.isStatefulUnusedDirSearch()) {
            localRepo.setPrevAuDir(auDir);
          }
          String auPath = testDir.toString();
          logger.debug3("New au directory: " + auPath);
          auPathSlash = auPath + File.separator;
          // write the new au property file to the new dir
          // XXX this data should be backed up elsewhere to avoid single-point
          // corruption
          Properties idProps = new Properties();
          idProps.setProperty(AU_ID_PROP, auid);
          saveAuIdProperties(auPath, idProps);
          aumap.put(auid, auPathSlash);
          return auPathSlash;
        } else {
          if (logger.isDebug3()) {
            logger.debug3("Existing directory found at '" + auDir + "'.  Checking next...");
          }
        }
      }
    }
    throw new RuntimeException(
        "Can't find unused repository dir after "
            + RepositoryManager.getMaxUnusedDirSearch()
            + " tries in "
            + repoCachePath);
  }

  static LocalRepository getLocalRepository(ArchivalUnit au) {
    return getLocalRepository(getRepositoryRoot(au));
  }

  static LocalRepository getLocalRepository(String repoRoot) {
    synchronized (localRepositories) {
      LocalRepository localRepo = (LocalRepository) localRepositories.get(repoRoot);
      if (localRepo == null) {
        logger.debug2("Creating LocalRepository(" + repoRoot + ")");
        localRepo = new LocalRepository(repoRoot);
        localRepositories.put(repoRoot, localRepo);
      }
      return localRepo;
    }
  }

  /** Return next string in the sequence "a", "b", ... "z", "aa", "ab", ... */
  static String getNextDirName(String old) {
    StringBuilder sb = new StringBuilder(old);
    // go through and increment the first non-'z' char
    // counts back from the last char, so 'aa'->'ab', not 'ba'
    for (int ii = sb.length() - 1; ii >= 0; ii--) {
      char curChar = sb.charAt(ii);
      if (curChar < 'z') {
        sb.setCharAt(ii, (char) (curChar + 1));
        return sb.toString();
      }
      sb.setCharAt(ii, 'a');
    }
    sb.insert(0, 'a');
    return sb.toString();
  }

  public static File getAuIdFile(String location) {
    return new File(location + File.separator + AU_ID_FILE);
  }

  static Properties getAuIdProperties(String location) {
    File propFile = new File(location + File.separator + AU_ID_FILE);
    try {
      InputStream is = new BufferedInputStream(new FileInputStream(propFile));
      Properties idProps = new Properties();
      idProps.load(is);
      is.close();
      return idProps;
    } catch (Exception e) {
      logger.warning("Error loading au id from " + propFile.getPath() + ".");
      return null;
    }
  }

  static void saveAuIdProperties(String location, Properties props) {
    // XXX these AU_ID_FILE entries need to be backed up elsewhere to avoid
    // single-point corruption
    File propDir = new File(location);
    if (!propDir.exists()) {
      logger.debug("Creating directory '" + propDir.getAbsolutePath() + "'");
      propDir.mkdirs();
    }
    File propFile = new File(propDir, AU_ID_FILE);
    try {
      logger.debug3("Saving au id properties at '" + location + "'.");
      OutputStream os = new BufferedOutputStream(new FileOutputStream(propFile));
      props.store(os, "ArchivalUnit id info");
      os.close();
      propFile.setReadOnly();
    } catch (IOException ioe) {
      logger.error("Couldn't write properties for " + propFile.getPath() + ".", ioe);
      throw new LockssRepository.RepositoryStateException("Couldn't write au id properties file.");
    }
  }

  // lockss filename-specific encoding methods

  /**
   * Escapes instances of the ESCAPE_CHAR from the path. This avoids name conflicts with the
   * repository files, such as '#nodestate.xml'.
   *
   * @param path the path
   * @return the escaped path
   */
  static String escapePath(String path) {
    // XXX escaping disabled because of URL encoding
    if (false && path.indexOf(ESCAPE_CHAR) >= 0) {
      return StringUtil.replaceString(path, ESCAPE_STR, ESCAPE_STR + ESCAPE_STR);
    } else {
      return path;
    }
  }

  /**
   * Escapes instances of File.separator from the query. These are safe from filename overlap, but
   * can't convert into extended paths and directories.
   *
   * @param query the query
   * @return the escaped query
   */
  static String escapeQuery(String query) {
    if (query.indexOf(File.separator) >= 0) {
      return StringUtil.replaceString(query, File.separator, ESCAPE_STR + ENCODED_SEPARATOR_CHAR);
    } else {
      return query;
    }
  }

  /**
   * Extracts '#x' encoding and converts back to 'x'.
   *
   * @param orig the original
   * @return the unescaped version.
   */
  static String unescape(String orig) {
    if (orig.indexOf(ESCAPE_CHAR) < 0) {
      // fast treatment of non-escaped strings
      return orig;
    }
    int index = -1;
    StringBuilder buffer = new StringBuilder(orig.length());
    String oldStr = orig;
    while ((index = oldStr.indexOf(ESCAPE_CHAR)) >= 0) {
      buffer.append(oldStr.substring(0, index));
      buffer.append(convertCode(oldStr.substring(index, index + 2)));
      if (oldStr.length() > 2) {
        oldStr = oldStr.substring(index + 2);
      } else {
        oldStr = "";
      }
    }
    buffer.append(oldStr);
    return buffer.toString();
  }

  /**
   * Returns the second char in the escaped segment, unless it is 's', which is a stand-in for the
   * File.separatorChar.
   *
   * @param code the code segment (length 2)
   * @return the encoded char
   */
  static char convertCode(String code) {
    char encodedChar = code.charAt(1);
    if (encodedChar == ENCODED_SEPARATOR_CHAR) {
      return File.separatorChar;
    } else {
      return encodedChar;
    }
  }

  public static class Factory implements LockssAuManager.Factory {
    public LockssAuManager createAuManager(ArchivalUnit au) {
      return createNewLockssRepository(au);
    }
  }

  /** Maintains state for a local repository root dir (<i>eg</i>, auid of each au subdir). */
  static class LocalRepository {
    String repoPath;
    File repoCacheFile;
    Map auMap;
    String prevAuDir;

    LocalRepository(String repoPath) {
      this.repoPath = repoPath;
      repoCacheFile = new File(repoPath, CACHE_ROOT_NAME);
    }

    public String getRepositoryPath() {
      return repoPath;
    }

    public String getPrevAuDir() {
      if (prevAuDir == null) {
        prevAuDir = lastPluginDir;
      }
      return prevAuDir;
    }

    public void setPrevAuDir(String dir) {
      prevAuDir = dir;
    }

    /**
     * Return the auid -> au-subdir-path mapping. Enumerating the directories if necessary to
     * initialize the map
     */
    Map getAuMap() {
      if (auMap == null) {
        logger.debug3("Loading name map for '" + repoCacheFile + "'.");
        auMap = new HashMap();
        if (!repoCacheFile.exists()) {
          logger.debug3("Creating cache dir:" + repoCacheFile + "'.");
          if (!repoCacheFile.mkdirs()) {
            logger.critical("Couldn't create directory, check owner/permissions: " + repoCacheFile);
            // return empty map
            return auMap;
          }
        } else {
          // read each dir's property file and store mapping auid -> dir
          File[] auDirs = repoCacheFile.listFiles();
          for (int ii = 0; ii < auDirs.length; ii++) {
            String dirName = auDirs[ii].getName();
            //       if (dirName.compareTo(lastPluginDir) == 1) {
            //         // adjust the 'lastPluginDir' upwards if necessary
            //         lastPluginDir = dirName;
            //       }

            String path = auDirs[ii].getAbsolutePath();
            Properties idProps = getAuIdProperties(path);
            if (idProps != null) {
              String auid = idProps.getProperty(AU_ID_PROP);
              StringBuilder sb = new StringBuilder(path.length() + File.separator.length());
              sb.append(path);
              sb.append(File.separator);
              auMap.put(auid, sb.toString());
              logger.debug3("Mapping to: " + auMap.get(auid) + ": " + auid);
            } else {
              logger.debug3("Not mapping " + path + ", no auid file.");
            }
          }
        }
      }
      return auMap;
    }

    public String toString() {
      return "[LR: " + repoPath + "]";
    }
  }

  String getRootLocation() {
    return rootLocation;
  }
}
Esempio n. 7
0
/** Export the contents and metadata from an AU */
public abstract class Exporter {

  private static final Logger log = Logger.getLogger(Exporter.class);

  static final String PREFIX = Configuration.PREFIX + "exporter.";

  /** Abort export after this many errors */
  public static final String PARAM_MAX_ERRORS = PREFIX + "maxErrors";

  public static final int DEFAULT_MAX_ERRORS = 5;

  protected static int maxErrors = DEFAULT_MAX_ERRORS;

  protected LockssDaemon daemon;
  protected ArchivalUnit au;
  protected File dir;
  protected String prefix;
  protected long maxSize = -1;
  protected int maxVersions = 1;
  protected boolean compress = false;
  protected boolean excludeDirNodes = false;
  protected FilenameTranslation xlate = FilenameTranslation.XLATE_NONE;
  protected List errors = new ArrayList();
  protected boolean isDiskFull = false;

  protected abstract void start() throws IOException;

  protected abstract void finish() throws IOException;

  protected abstract void writeCu(CachedUrl cu) throws IOException;

  protected Exporter(LockssDaemon daemon, ArchivalUnit au) {
    this.daemon = daemon;
    this.au = au;
  }

  /** Called by org.lockss.config.MiscConfig */
  public static void setConfig(
      Configuration config, Configuration oldConfig, Configuration.Differences diffs) {
    if (diffs.contains(PREFIX)) {
      maxErrors = config.getInt(PARAM_MAX_ERRORS, DEFAULT_MAX_ERRORS);
    }
  }

  public void setCompress(boolean val) {
    compress = val;
  }

  public boolean getCompress() {
    return compress;
  }

  public void setExcludeDirNodes(boolean val) {
    excludeDirNodes = val;
  }

  public boolean getExcludeDirNodes() {
    return excludeDirNodes;
  }

  public void setFilenameTranslation(FilenameTranslation val) {
    xlate = val;
  }

  public FilenameTranslation getFilenameTranslation() {
    return xlate;
  }

  public void setDir(File val) {
    dir = val;
  }

  public File getDir() {
    return dir;
  }

  public void setPrefix(String val) {
    prefix = val;
  }

  public String getPrefix() {
    return prefix;
  }

  public void setMaxSize(long val) {
    maxSize = val;
  }

  public long getMaxSize() {
    return maxSize;
  }

  public void setMaxVersions(int val) {
    maxVersions = val;
  }

  public int getMaxVersions() {
    return maxVersions;
  }

  public List getErrors() {
    return errors;
  }

  protected void checkArgs() {
    if (getDir() == null) {
      throw new IllegalArgumentException("Must supply output directory");
    }
    if (getPrefix() == null) {
      throw new IllegalArgumentException("Must supply file name/prefix");
    }
  }

  public void export() {
    log.debug("export(" + au.getName() + ")");
    log.debug(
        "dir: "
            + dir
            + ", pref: "
            + prefix
            + ", size: "
            + maxSize
            + ", ver: "
            + maxVersions
            + (compress ? ", (C)" : ""));
    checkArgs();
    try {
      start();
    } catch (IOException e) {
      recordError("Error opening file", e);
      return;
    }
    writeFiles();
    try {
      finish();
    } catch (IOException e) {
      if (!isDiskFull) {
        // If we already knew (and reported) disk full, also reporting it
        // as a close error is misleading.
        recordError("Error closing file", e);
      }
    }
  }

  protected String xlateFilename(String url) {
    return xlate.xlate(url);
  }

  protected void recordError(String msg, Throwable t) {
    log.error(msg, t);
    errors.add(msg + ": " + t.toString());
  }

  protected void recordError(String msg) {
    log.error(msg);
    errors.add(msg);
  }

  protected String getSoftwareVersion() {
    String releaseName = BuildInfo.getBuildProperty(BuildInfo.BUILD_RELEASENAME);
    StringBuilder sb = new StringBuilder();
    sb.append("LOCKSS Daemon ");
    if (releaseName != null) {
      sb.append(releaseName);
    }
    return sb.toString();
  }

  protected String getHostIp() {
    try {
      IPAddr localHost = IPAddr.getLocalHost();
      return localHost.getHostAddress();
    } catch (UnknownHostException e) {
      log.error("getHostIp()", e);
      return "1.1.1.1";
    }
  }

  protected String getHostName() {
    String res = ConfigManager.getPlatformHostname();
    if (res == null) {
      try {
        InetAddress inet = InetAddress.getLocalHost();
        return inet.getHostName();
      } catch (UnknownHostException e) {
        log.warning("Can't get hostname", e);
        return "unknown";
      }
    }
    return res;
  }

  protected Properties filterResponseProps(Properties props) {
    Properties res = new Properties();
    for (Map.Entry ent : props.entrySet()) {
      String key = (String) ent.getKey();
      if (StringUtil.startsWithIgnoreCase(key, "x-lockss")
          || StringUtil.startsWithIgnoreCase(key, "x_lockss")
          || key.equalsIgnoreCase("org.lockss.version.number")) {
        continue;
      }
      // We've lost the original case - capitalize them the way most people
      // expect
      res.put(StringUtil.titleCase(key, '-'), (String) ent.getValue());
    }
    return res;
  }

  protected String getHttpResponseString(CachedUrl cu) {
    Properties cuProps = cu.getProperties();
    Properties filteredProps = filterResponseProps(cuProps);
    String hdrString = PropUtil.toHeaderString(filteredProps);
    StringBuilder sb = new StringBuilder(hdrString.length() + 30);
    String line1 = inferHttpResponseCode(cu, cuProps);
    sb.append(line1);
    sb.append(Constants.CRLF);
    sb.append(hdrString);
    sb.append(Constants.CRLF);
    return sb.toString();
  }

  String inferHttpResponseCode(CachedUrl cu, Properties cuProps) {
    if (cuProps.get("location") == null) {
      return "HTTP/1.1 200 OK";
    } else {
      return "HTTP/1.1 302 Found";
    }
  }

  // return the next CU with content
  private CachedUrl getNextCu(CuIterator iter) {
    return iter.hasNext() ? iter.next() : null;
  }

  /**
   * Return true if, interpreting URLs as filenames, dirCu is a directory containing fileCu. Used to
   * exclude directory content from output files, so they can be unpacked by standard utilities
   * (e.g., unzip). Shouldn't be called with equal URLs, but return false in that case, as we
   * wouldn't want to exclude the URL
   */
  boolean isDirOf(CachedUrl dirCu, CachedUrl fileCu) {
    String dir = dirCu.getUrl();
    String file = fileCu.getUrl();
    if (!dir.endsWith("/")) {
      dir = dir + "/";
    }
    return file.startsWith(dir) && !file.equals(dir);
  }

  private void writeFiles() {
    PlatformUtil platutil = PlatformUtil.getInstance();
    CuIterator iter = AuUtil.getCuIterator(au);
    int errs = 0;
    CachedUrl curCu = null;
    CachedUrl nextCu = getNextCu(iter);
    while (nextCu != null) {
      curCu = nextCu;
      nextCu = getNextCu(iter);
      if (excludeDirNodes && nextCu != null && isDirOf(curCu, nextCu)) {
        continue;
      }
      CachedUrl[] cuVersions =
          curCu.getCuVersions(maxVersions > 0 ? maxVersions : Integer.MAX_VALUE);
      for (CachedUrl cu : cuVersions) {
        try {
          log.debug2("Exporting " + cu.getUrl());
          writeCu(cu);
        } catch (IOException e) {
          if (platutil.isDiskFullError(e)) {
            recordError("Disk full, can't write export file.");
            isDiskFull = true;
            return;
          }
        } catch (Exception e) {
          // XXX Would like to differentiate between errors opening or
          // reading CU, which shouldn't cause abort, and errors writing
          // to export file, which should.
          recordError("Unable to copy " + cu.getUrl(), e);
          if (errs++ >= maxErrors) {
            recordError("Aborting after " + errs + " errors");
            return;
          }
        }
      }
    }
  }

  private Set<File> fileSet = new HashSet<File>();
  private List<File> fileList = new ArrayList<File>();

  protected void recordExportFile(File file) {
    if (fileSet.add(file)) {
      fileList.add(file);
    }
  }

  /**
   * Return the list of export files written
   *
   * @return List of File written
   */
  public List<File> getExportFiles() {
    return fileList;
  }

  public interface Factory {
    public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au);
  }

  static final String WINDOWS_FROM = "?<>:*|\\";
  static final String WINDOWS_TO = "_______";
  static final String MAC_FROM = ":";
  static final String MAC_TO = "_";

  /** Enum of filename translation types */
  public static enum FilenameTranslation {
    XLATE_NONE("None") {
      public String xlate(String s) {
        return s;
      }
    },
    XLATE_WINDOWS("Windows") {
      public String xlate(String s) {
        return StringUtils.replaceChars(s, WINDOWS_FROM, WINDOWS_TO);
      }
    },
    XLATE_MAC("MacOS") {
      public String xlate(String s) {
        return StringUtils.replaceChars(s, MAC_FROM, MAC_TO);
      }
    };

    private final String label;

    FilenameTranslation(String label) {
      this.label = label;
    }

    public String getLabel() {
      return label;
    }

    public abstract String xlate(String s);
  };

  /** Enum of Exporter types, and factories */
  public static enum Type implements Factory {
    ARC_RESOURCE("ARC (content only)") {
      public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) {
        return new ArcExporter(daemon, au, false);
      }
    },
    ARC_RESPONSE("ARC (response and content)") {
      public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) {
        return new ArcExporter(daemon, au, true);
      }
    },
    WARC_RESOURCE("WARC (content only)") {
      public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) {
        return new WarcExporter(daemon, au, false);
      }
    },
    WARC_RESPONSE("WARC (response and content)") {
      public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) {
        return new WarcExporter(daemon, au, true);
      }
    },
    ZIP("ZIP") {
      public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) {
        return new ZipExporter(daemon, au);
      }
    };

    private final String label;

    Type(String label) {
      this.label = label;
    }

    public abstract Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au);

    public String getLabel() {
      return label;
    }
  };
}
/**
 * RepositoryManager is the center of the per AU repositories. It manages the repository config
 * parameters.
 */
public class RepositoryManager extends BaseLockssDaemonManager implements ConfigurableManager {

  private static Logger log = Logger.getLogger("RepositoryManager");

  public static final String PREFIX = Configuration.PREFIX + "repository.";

  /** Maximum size of per-AU repository node cache */
  public static final String PARAM_MAX_PER_AU_CACHE_SIZE = PREFIX + "nodeCache.size";

  public static final int DEFAULT_MAX_PER_AU_CACHE_SIZE = 10;

  /*
   * This needs to be a small multiple of the number of simultaneous
   * polls (poller and voter), as there is a cache entry per active AU.
   * Each poll will have one active AU at a time.
   */
  public static final String PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE =
      PREFIX + "suspectVersionsCache.size";
  public static final int DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE = 10;

  static final String GLOBAL_CACHE_PREFIX = PREFIX + "globalNodeCache.";
  public static final String PARAM_MAX_GLOBAL_CACHE_SIZE = GLOBAL_CACHE_PREFIX + "size";
  public static final int DEFAULT_MAX_GLOBAL_CACHE_SIZE = 500;

  public static final String PARAM_GLOBAL_CACHE_ENABLED = GLOBAL_CACHE_PREFIX + "enabled";
  public static final boolean DEFAULT_GLOBAL_CACHE_ENABLED = false;

  /** Max times to loop looking for unused AU directory. */
  public static final String PARAM_MAX_UNUSED_DIR_SEARCH = PREFIX + "maxUnusedDirSearch";

  public static final int DEFAULT_MAX_UNUSED_DIR_SEARCH = 30000;

  /** If true, LocalRepository keeps track of next subdir name to probe. */
  public static final String PARAM_IS_STATEFUL_UNUSED_DIR_SEARCH =
      PREFIX + "enableStatefulUnusedDirSearch";

  public static final boolean DEFAULT_IS_STATEFUL_UNUSED_DIR_SEARCH = true;

  /** Max percent of time size calculation thread may run. */
  public static final String PARAM_SIZE_CALC_MAX_LOAD = PREFIX + "sizeCalcMaxLoad";

  public static final float DEFAULT_SIZE_CALC_MAX_LOAD = 0.5F;

  /**
   * If true, path components longer than the maximum filesystem path component are encodes are
   * multiple levels of directories
   */
  public static final String PARAM_ENABLE_LONG_COMPONENTS = PREFIX + "enableLongComponents";

  public static final boolean DEFAULT_ENABLE_LONG_COMPONENTS = true;

  /**
   * Prior to 1.61.6, when long component support is enabled, backslahes were normalized to %5c
   * instead of %5C. This is harmless if checkUnnormalized is set to Fix, as they'll be normalized.
   * Otherwise, this can be set true for compatibility with old repositories. Setting
   * checkUnnormalized to Fix is preferred.
   */
  public static final String PARAM_ENABLE_LONG_COMPONENTS_COMPATIBILITY =
      PREFIX + "enableLongComponentsCompatibility";

  public static final boolean DEFAULT_ENABLE_LONG_COMPONENTS_COMPATIBILITY = false;

  /** Maximum length of a filesystem path component. */
  public static final String PARAM_MAX_COMPONENT_LENGTH = PREFIX + "maxComponentLength";

  public static final int DEFAULT_MAX_COMPONENT_LENGTH = 255;

  /** @see #PARAM_CHECK_UNNORMALIZED */
  public enum CheckUnnormalizedMode {
    No,
    Log,
    Fix
  };

  /**
   * Check for existing nodes with unnormalized names (created by very old daemon that didn't
   * normalize): None, Log, Fix
   */
  public static final String PARAM_CHECK_UNNORMALIZED = PREFIX + "checkUnnormalized";

  public static final CheckUnnormalizedMode DEFAULT_CHECK_UNNORMALIZED = CheckUnnormalizedMode.Log;

  static final String WDOG_PARAM_SIZE_CALC = "SizeCalc";
  static final long WDOG_DEFAULT_SIZE_CALC = Constants.DAY;

  static final String PRIORITY_PARAM_SIZE_CALC = "SizeCalc";
  static final int PRIORITY_DEFAULT_SIZE_CALC = Thread.NORM_PRIORITY - 1;

  static final String DISK_PREFIX = PREFIX + "diskSpace.";

  static final String PARAM_DISK_WARN_FRRE_MB = DISK_PREFIX + "warn.freeMB";
  static final int DEFAULT_DISK_WARN_FRRE_MB = 5000;
  static final String PARAM_DISK_FULL_FRRE_MB = DISK_PREFIX + "full.freeMB";
  static final int DEFAULT_DISK_FULL_FRRE_MB = 100;
  static final String PARAM_DISK_WARN_FRRE_PERCENT = DISK_PREFIX + "warn.freePercent";
  static final double DEFAULT_DISK_WARN_FRRE_PERCENT = .02;
  static final String PARAM_DISK_FULL_FRRE_PERCENT = DISK_PREFIX + "full.freePercent";
  static final double DEFAULT_DISK_FULL_FRRE_PERCENT = .01;

  private PlatformUtil platInfo = PlatformUtil.getInstance();
  private List repoList = Collections.EMPTY_LIST;
  int paramNodeCacheSize = DEFAULT_MAX_PER_AU_CACHE_SIZE;
  boolean paramIsGlobalNodeCache = DEFAULT_GLOBAL_CACHE_ENABLED;
  int paramGlobalNodeCacheSize = DEFAULT_MAX_GLOBAL_CACHE_SIZE;
  int paramSuspectVersionsCacheSize = DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE;
  UniqueRefLruCache globalNodeCache = new UniqueRefLruCache(DEFAULT_MAX_GLOBAL_CACHE_SIZE);
  UniqueRefLruCache suspectVersionsCache =
      new UniqueRefLruCache(DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE);
  Map localRepos = new HashMap();
  private static int maxUnusedDirSearch = DEFAULT_MAX_UNUSED_DIR_SEARCH;
  private static boolean isStatefulUnusedDirSearch = DEFAULT_IS_STATEFUL_UNUSED_DIR_SEARCH;
  private static boolean enableLongComponents = DEFAULT_ENABLE_LONG_COMPONENTS;
  private static boolean enableLongComponentsCompatibility =
      DEFAULT_ENABLE_LONG_COMPONENTS_COMPATIBILITY;
  private static int maxComponentLength = DEFAULT_MAX_COMPONENT_LENGTH;
  private static CheckUnnormalizedMode checkUnnormalized = DEFAULT_CHECK_UNNORMALIZED;

  PlatformUtil.DF paramDFWarn =
      PlatformUtil.DF.makeThreshold(DEFAULT_DISK_WARN_FRRE_MB, DEFAULT_DISK_WARN_FRRE_PERCENT);
  PlatformUtil.DF paramDFFull =
      PlatformUtil.DF.makeThreshold(DEFAULT_DISK_FULL_FRRE_MB, DEFAULT_DISK_FULL_FRRE_PERCENT);

  private float sizeCalcMaxLoad = DEFAULT_SIZE_CALC_MAX_LOAD;

  public void startService() {
    super.startService();
    localRepos = new HashMap();
  }

  public void setConfig(
      Configuration config, Configuration oldConfig, Configuration.Differences changedKeys) {
    //  Build list of repositories from list of disk (fs) paths).  Needs to
    //  be generalized if ever another repository implementation.
    if (changedKeys.contains(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST)) {
      List lst = new ArrayList();
      String dspace = config.get(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST, "");
      List paths = StringUtil.breakAt(dspace, ';');
      if (paths != null) {
        for (Iterator iter = paths.iterator(); iter.hasNext(); ) {
          lst.add("local:" + (String) iter.next());
        }
      }
      repoList = lst;
    }
    if (changedKeys.contains(PARAM_MAX_PER_AU_CACHE_SIZE)) {
      paramNodeCacheSize =
          config.getInt(PARAM_MAX_PER_AU_CACHE_SIZE, DEFAULT_MAX_PER_AU_CACHE_SIZE);
      for (Iterator iter = getDaemon().getAllLockssRepositories().iterator(); iter.hasNext(); ) {
        LockssRepository repo = (LockssRepository) iter.next();
        if (repo instanceof LockssRepositoryImpl) {
          LockssRepositoryImpl repoImpl = (LockssRepositoryImpl) repo;
          repoImpl.setNodeCacheSize(paramNodeCacheSize);
        }
      }
    }
    if (changedKeys.contains(PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE)) {
      paramSuspectVersionsCacheSize =
          config.getInt(
              PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE, DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE);
      suspectVersionsCache.setMaxSize(paramSuspectVersionsCacheSize);
    }
    if (changedKeys.contains(GLOBAL_CACHE_PREFIX)) {
      paramIsGlobalNodeCache =
          config.getBoolean(PARAM_GLOBAL_CACHE_ENABLED, DEFAULT_GLOBAL_CACHE_ENABLED);
      if (paramIsGlobalNodeCache) {
        paramGlobalNodeCacheSize =
            config.getInt(PARAM_MAX_GLOBAL_CACHE_SIZE, DEFAULT_MAX_GLOBAL_CACHE_SIZE);
        log.debug("global node cache size: " + paramGlobalNodeCacheSize);
        globalNodeCache.setMaxSize(paramGlobalNodeCacheSize);
      }
    }
    if (changedKeys.contains(DISK_PREFIX)) {
      int minMB = config.getInt(PARAM_DISK_WARN_FRRE_MB, DEFAULT_DISK_WARN_FRRE_MB);
      double minPer =
          config.getPercentage(PARAM_DISK_WARN_FRRE_PERCENT, DEFAULT_DISK_WARN_FRRE_PERCENT);
      paramDFWarn = PlatformUtil.DF.makeThreshold(minMB, minPer);
      minMB = config.getInt(PARAM_DISK_FULL_FRRE_MB, DEFAULT_DISK_FULL_FRRE_MB);
      minPer = config.getPercentage(PARAM_DISK_FULL_FRRE_PERCENT, DEFAULT_DISK_FULL_FRRE_PERCENT);
      paramDFFull = PlatformUtil.DF.makeThreshold(minMB, minPer);
    }
    if (changedKeys.contains(PARAM_SIZE_CALC_MAX_LOAD)) {
      sizeCalcMaxLoad = config.getPercentage(PARAM_SIZE_CALC_MAX_LOAD, DEFAULT_SIZE_CALC_MAX_LOAD);
    }
    if (changedKeys.contains(PREFIX)) {
      maxUnusedDirSearch =
          config.getInt(PARAM_MAX_UNUSED_DIR_SEARCH, DEFAULT_MAX_UNUSED_DIR_SEARCH);
      isStatefulUnusedDirSearch =
          config.getBoolean(
              PARAM_IS_STATEFUL_UNUSED_DIR_SEARCH, DEFAULT_IS_STATEFUL_UNUSED_DIR_SEARCH);
      enableLongComponents =
          config.getBoolean(PARAM_ENABLE_LONG_COMPONENTS, DEFAULT_ENABLE_LONG_COMPONENTS);
      enableLongComponentsCompatibility =
          config.getBoolean(
              PARAM_ENABLE_LONG_COMPONENTS_COMPATIBILITY,
              DEFAULT_ENABLE_LONG_COMPONENTS_COMPATIBILITY);
      maxComponentLength = config.getInt(PARAM_MAX_COMPONENT_LENGTH, DEFAULT_MAX_COMPONENT_LENGTH);
      checkUnnormalized =
          (CheckUnnormalizedMode)
              config.getEnum(
                  CheckUnnormalizedMode.class,
                  PARAM_CHECK_UNNORMALIZED,
                  DEFAULT_CHECK_UNNORMALIZED);
    }
  }

  public static boolean isEnableLongComponents() {
    return enableLongComponents;
  }

  public static boolean isEnableLongComponentsCompatibility() {
    return enableLongComponentsCompatibility;
  }

  public static int getMaxComponentLength() {
    return maxComponentLength;
  }

  public static CheckUnnormalizedMode getCheckUnnormalizedMode() {
    return checkUnnormalized;
  }

  /**
   * Return list of known repository names. Needs a registration mechanism if ever another
   * repository implementation.
   */
  public List<String> getRepositoryList() {
    return repoList;
  }

  public PlatformUtil.DF getRepositoryDF(String repoName) {
    String path = LockssRepositoryImpl.getLocalRepositoryPath(repoName);
    log.debug("path: " + path);
    //    try {
    return platInfo.getJavaDF(path);
    //    } catch (PlatformUtil.UnsupportedException e) {
    //      return null;
    //    }
  }

  public Map<String, PlatformUtil.DF> getRepositoryMap() {
    Map<String, PlatformUtil.DF> repoMap = new LinkedMap();
    for (String repo : getRepositoryList()) {
      repoMap.put(repo, getRepositoryDF(repo));
    }
    return repoMap;
  }

  public String findLeastFullRepository() {
    return findLeastFullRepository(getRepositoryMap());
  }

  public String findLeastFullRepository(Map<String, PlatformUtil.DF> repoMap) {
    String mostFree = null;
    for (String repo : repoMap.keySet()) {
      PlatformUtil.DF df = repoMap.get(repo);
      if (df != null) {
        if (mostFree == null || (repoMap.get(mostFree)).getAvail() < df.getAvail()) {
          mostFree = repo;
        }
      }
    }
    return mostFree;
  }

  public PlatformUtil.DF getDiskWarnThreshold() {
    return paramDFWarn;
  }

  public PlatformUtil.DF getDiskFullThreshold() {
    return paramDFFull;
  }

  public static int getMaxUnusedDirSearch() {
    return maxUnusedDirSearch;
  }

  public static boolean isStatefulUnusedDirSearch() {
    return isStatefulUnusedDirSearch;
  }

  public List findExistingRepositoriesFor(String auid) {
    List res = null;
    for (Iterator iter = getRepositoryList().iterator(); iter.hasNext(); ) {
      String repoName = (String) iter.next();
      String path = LockssRepositoryImpl.getLocalRepositoryPath(repoName);
      if (LockssRepositoryImpl.doesAuDirExist(auid, path)) {
        if (res == null) {
          res = new ArrayList();
        }
        res.add(repoName);
      }
    }
    return res == null ? Collections.EMPTY_LIST : res;
  }

  // hack only local
  public synchronized LockssRepositoryImpl getRepositoryFromPath(String path) {
    LockssRepositoryImpl repo = (LockssRepositoryImpl) localRepos.get(path);
    if (repo == null) {
      repo = new LockssRepositoryImpl(path);
      repo.initService(getDaemon());
      repo.startService();
      localRepos.put(path, repo);
    }
    return repo;
  }

  /**
   * Return the disk space used by the AU, including all overhead, optionally calculating it if
   * necessary.
   *
   * @param repoAuPath the full path to an AU dir in a LockssRepositoryImpl
   * @param calcIfUnknown if true, size will calculated if unknown (time consumeing)
   * @return the AU's disk usage in bytes, or -1 if unknown
   */
  public long getRepoDiskUsage(String repoAuPath, boolean calcIfUnknown) {
    LockssRepository repo = getRepositoryFromPath(repoAuPath);
    if (repo != null) {
      try {
        RepositoryNode repoNode = repo.getNode(AuCachedUrlSetSpec.URL);
        if (repoNode instanceof AuNodeImpl) {
          return ((AuNodeImpl) repoNode).getDiskUsage(calcIfUnknown);
        }
      } catch (MalformedURLException ignore) {
      }
    }
    return -1;
  }

  public synchronized void setRepositoryForPath(String path, LockssRepositoryImpl repo) {
    localRepos.put(path, repo);
  }

  public boolean isGlobalNodeCache() {
    return paramIsGlobalNodeCache;
  }

  public UniqueRefLruCache getGlobalNodeCache() {
    return globalNodeCache;
  }

  public UniqueRefLruCache getSuspectVersionsCache() {
    return suspectVersionsCache;
  }

  // Background thread to (re)calculate AU size and disk usage.

  private Set sizeCalcQueue = new HashSet();
  private BinarySemaphore sizeCalcSem = new BinarySemaphore();
  private SizeCalcThread sizeCalcThread;

  /** engqueue a size calculation for the AU */
  public void queueSizeCalc(ArchivalUnit au) {
    queueSizeCalc(AuUtil.getAuRepoNode(au));
  }

  /** engqueue a size calculation for the node */
  public void queueSizeCalc(RepositoryNode node) {
    synchronized (sizeCalcQueue) {
      if (sizeCalcQueue.add(node)) {
        log.debug2("Queue size calc: " + node);
        startOrKickThread();
      }
    }
  }

  public int sizeCalcQueueLen() {
    synchronized (sizeCalcQueue) {
      return sizeCalcQueue.size();
    }
  }

  void startOrKickThread() {
    if (sizeCalcThread == null) {
      log.debug2("Starting thread");
      sizeCalcThread = new SizeCalcThread();
      sizeCalcThread.start();
      sizeCalcThread.waitRunning();
    }
    sizeCalcSem.give();
  }

  void stopThread() {
    if (sizeCalcThread != null) {
      log.debug2("Stopping thread");
      sizeCalcThread.stopSizeCalc();
      sizeCalcThread = null;
    }
  }

  void doSizeCalc(RepositoryNode node) {
    node.getTreeContentSize(null, true);
    if (node instanceof AuNodeImpl) {
      ((AuNodeImpl) node).getDiskUsage(true);
    }
  }

  long sleepTimeToAchieveLoad(long runDuration, float maxLoad) {
    return Math.round(((double) runDuration / maxLoad) - runDuration);
  }

  private class SizeCalcThread extends LockssThread {
    private volatile boolean goOn = true;

    private SizeCalcThread() {
      super("SizeCalc");
    }

    public void lockssRun() {
      setPriority(PRIORITY_PARAM_SIZE_CALC, PRIORITY_DEFAULT_SIZE_CALC);
      startWDog(WDOG_PARAM_SIZE_CALC, WDOG_DEFAULT_SIZE_CALC);
      triggerWDogOnExit(true);
      nowRunning();

      while (goOn) {
        try {
          pokeWDog();
          if (sizeCalcQueue.isEmpty()) {
            Deadline timeout = Deadline.in(Constants.HOUR);
            sizeCalcSem.take(timeout);
          }
          RepositoryNode node;
          synchronized (sizeCalcQueue) {
            node = (RepositoryNode) CollectionUtil.getAnElement(sizeCalcQueue);
          }
          if (node != null) {
            long start = TimeBase.nowMs();
            log.debug2("CalcSize start: " + node);
            long dur = 0;
            try {
              doSizeCalc(node);
              dur = TimeBase.nowMs() - start;
              log.debug2("CalcSize finish (" + StringUtil.timeIntervalToString(dur) + "): " + node);
            } catch (RuntimeException e) {
              log.warning("doSizeCalc: " + node, e);
            }
            synchronized (sizeCalcQueue) {
              sizeCalcQueue.remove(node);
            }
            pokeWDog();
            long sleep = sleepTimeToAchieveLoad(dur, sizeCalcMaxLoad);
            Deadline.in(sleep).sleep();
          }
        } catch (InterruptedException e) {
          // just wakeup and check for exit
        }
      }
      if (!goOn) {
        triggerWDogOnExit(false);
      }
    }

    private void stopSizeCalc() {
      goOn = false;
      interrupt();
    }
  }
}
Esempio n. 9
0
public abstract class V3Serializer {

  private static final String PREFIX = Configuration.PREFIX + "poll.v3.";

  protected File pollDir;
  protected LockssDaemon daemon;

  static final Logger log = Logger.getLogger("V3Serializer");

  public V3Serializer(LockssDaemon daemon) throws PollSerializerException {
    this.daemon = daemon;
    Configuration config = CurrentConfig.getCurrentConfig();
    File stateDir = PollUtil.ensurePollStateRoot();
    if (!FileUtil.ensureDirExists(stateDir)) {
      throw new PollSerializerException("Could not create state directory " + stateDir);
    }
    try {
      this.pollDir = FileUtil.createTempDir("pollstate-", "", stateDir);
    } catch (IOException ex) {
      throw new PollSerializerException("Cannot create temp dir in state directory" + stateDir, ex);
    }
  }

  /**
   * Create a new PollSerializer. The parameter pollDir is optional. If it is specified, it must be
   * a poll serialization directory that already exists. If it is null, a new poll serialization
   * directory will be created.
   *
   * @param dir Optionally, a pre-existing serialization directory to use.
   * @throws PollSerializerException
   */
  public V3Serializer(LockssDaemon daemon, File dir) throws PollSerializerException {
    if (dir == null) {
      throw new NullPointerException("Poll serialization directory must not " + "be null");
    }
    this.daemon = daemon;
    this.pollDir = dir;
    if (!pollDir.exists()) {
      throw new IllegalArgumentException(
          "Poll directories passed as " + "arguments must already exist");
    }
  }

  /** Make an XStreamSerializer */
  protected ObjectSerializer getSerializer() {
    return new XStreamSerializer(daemon);
  }

  /** Clean up all resources used by this poll. Removes the poll directory. */
  public void closePoll() {
    if (pollDir != null && pollDir.isDirectory() && !FileUtil.delTree(pollDir))
      log.warning("Unable to delete poll state directory: " + pollDir);
  }

  /**
   * PollSerializerException. Simply exception handling by wrapping IOException and
   * ObjectSerializer.SerializationException.
   */
  public static class PollSerializerException extends Exception {
    public PollSerializerException() {
      super();
    }

    public PollSerializerException(String msg) {
      super(msg);
    }

    public PollSerializerException(String msg, Throwable cause) {
      super(msg, cause);
    }

    public PollSerializerException(Throwable cause) {
      super(cause);
    }
  }
}
/** Functional tests on the simulated content generator. */
public class FuncSimulatedContent extends LockssTestCase {
  static final Logger log = Logger.getLogger("FuncSimulatedContent");

  private PluginManager pluginMgr;
  private Plugin simPlugin;
  private SimulatedArchivalUnit sau1;
  private SimulatedContentGenerator scgen = null;
  private MockLockssDaemon theDaemon;
  String tempDirPath;
  String tempDirPath2;

  private static String DAMAGED_CACHED_URL = "/branch2/branch2/002file.txt";

  public FuncSimulatedContent(String msg) {
    super(msg);
  }

  public void setUp() throws Exception {
    super.setUp();
    tempDirPath = getTempDir().getAbsolutePath() + File.separator;

    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.getHashService();
    MockSystemMetrics metrics = new MyMockSystemMetrics();
    metrics.initService(theDaemon);
    theDaemon.setSystemMetrics(metrics);

    theDaemon.setDaemonInited(true);

    Properties props = new Properties();
    props.setProperty(SystemMetrics.PARAM_HASH_TEST_DURATION, "1000");
    props.setProperty(SystemMetrics.PARAM_HASH_TEST_BYTE_STEP, "1024");
    props.setProperty(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST, tempDirPath);
    ConfigurationUtil.setCurrentConfigFromProps(props);

    pluginMgr = theDaemon.getPluginManager();
    pluginMgr.startService();
    theDaemon.getHashService().startService();
    metrics.startService();
    metrics.setHashSpeed(100);

    simPlugin = PluginTestUtil.findPlugin(SimulatedPlugin.class);
  }

  public void tearDown() throws Exception {
    theDaemon.getLockssRepository(sau1).stopService();
    theDaemon.getNodeManager(sau1).stopService();
    theDaemon.getPluginManager().stopService();
    theDaemon.getHashService().stopService();
    theDaemon.getSystemMetrics().stopService();
    theDaemon.stopDaemon();
    super.tearDown();
  }

  SimulatedArchivalUnit setupSimAu(Configuration auConfig)
      throws ArchivalUnit.ConfigurationException {
    ArchivalUnit au = PluginTestUtil.createAndStartAu(simPlugin, auConfig);
    return (SimulatedArchivalUnit) au;
  }

  Configuration simAuConfig(String rootPath) {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("root", rootPath);
    conf.put("depth", "2");
    conf.put("branch", "2");
    conf.put("numFiles", "2");
    conf.put("badCachedFileLoc", "2,2");
    conf.put("badCachedFileNum", "2");
    return conf;
  }

  void enableFilter(SimulatedArchivalUnit sau, boolean enable)
      throws ArchivalUnit.ConfigurationException {
    Configuration auConfig = sau.getConfiguration().copy();
    // no bad file when playing with filtering
    auConfig.remove("badCachedFileLoc");
    auConfig.remove("badCachedFileNum");
    if (enable) {
      auConfig.put(SimulatedPlugin.AU_PARAM_HASH_FILTER_SPEC, "true");
    } else {
      auConfig.remove(SimulatedPlugin.AU_PARAM_HASH_FILTER_SPEC);
    }
    sau.setConfiguration(auConfig);
  }

  public void testSimulatedContent() throws Exception {
    sau1 = setupSimAu(simAuConfig(tempDirPath));
    createContent(sau1);
    crawlContent(sau1);
    checkContent(sau1);
    doDamageRemoveTest(sau1); // must be before content read again
    checkFilter(sau1);
    hashContent(sau1);

    // this resets AU's config, do last to avoid messing up toBeDamaged set
  }

  public void testDualContentHash() throws Exception {
    sau1 = setupSimAu(simAuConfig(tempDirPath));
    createContent(sau1);
    crawlContent(sau1);
    CachedUrlSet set = sau1.getAuCachedUrlSet();
    byte[] nameH = getHash(set, true);
    byte[] contentH = getHash(set, false);

    tempDirPath2 = getTempDir().getAbsolutePath() + File.separator;
    SimulatedArchivalUnit sau2 = setupSimAu(simAuConfig(tempDirPath2));

    createContent(sau2);
    crawlContent(sau2);
    set = sau2.getAuCachedUrlSet();
    byte[] nameH2 = getHash(set, true);
    byte[] contentH2 = getHash(set, false);
    assertEquals(nameH, nameH2);
    assertEquals(contentH, contentH2);
  }

  public void testBaseUrl() throws Exception {
    sau1 = setupSimAu(simAuConfig(tempDirPath));
    createContent(sau1);
    crawlContent(sau1);
    CachedUrlSet cus1 = sau1.getAuCachedUrlSet();

    tempDirPath2 = getTempDir().getAbsolutePath() + File.separator;
    Configuration config2 = simAuConfig(tempDirPath2);
    config2.put("base_url", "http://anotherhost.org/");
    SimulatedArchivalUnit sau2 = setupSimAu(config2);
    createContent(sau2);
    crawlContent(sau2);
    CachedUrlSet cus2 = sau1.getAuCachedUrlSet();
    List urls1 = auUrls(sau1);
    List urls2 = auUrls(sau2);

    Pattern pat = Pattern.compile("http://([^/]+)(/.*)$");
    List<String> l1 = auUrls(sau1);
    List<String> l2 = auUrls(sau2);
    assertEquals(l1.size(), l2.size());
    for (int ix = 0; ix < l1.size(); ix++) {
      Matcher m1 = pat.matcher(l1.get(ix));
      assertTrue(m1.matches());
      Matcher m2 = pat.matcher(l2.get(ix));
      assertTrue(m2.matches());
      assertEquals("www.example.com", m1.group(1));
      assertEquals("anotherhost.org", m2.group(1));
      assertEquals(m1.group(2), m2.group(2));
    }
  }

  public void testBaseUrlPath() throws Exception {
    sau1 = setupSimAu(simAuConfig(tempDirPath));
    createContent(sau1);
    crawlContent(sau1);
    CachedUrlSet cus1 = sau1.getAuCachedUrlSet();

    tempDirPath2 = getTempDir().getAbsolutePath() + File.separator;
    Configuration config2 = simAuConfig(tempDirPath2);
    config2.put("base_url", "http://anotherhost.org/some/path/");
    SimulatedArchivalUnit sau2 = setupSimAu(config2);
    createContent(sau2);
    crawlContent(sau2);
    CachedUrlSet cus2 = sau1.getAuCachedUrlSet();
    List urls1 = auUrls(sau1);
    List urls2 = auUrls(sau2);

    Pattern pat1 = Pattern.compile("http://www\\.example\\.com(/.*)$");
    Pattern pat2 = Pattern.compile("http://anotherhost\\.org/some/path(/.*)$");
    List<String> l1 = auUrls(sau1);
    List<String> l2 = auUrls(sau2);
    assertEquals(l1.size(), l2.size());
    for (int ix = 0; ix < l1.size(); ix++) {
      Matcher m1 = pat1.matcher(l1.get(ix));
      assertTrue(m1.matches());
      Matcher m2 = pat2.matcher(l2.get(ix));
      assertTrue(m2.matches());
      assertEquals(m1.group(1), m2.group(1));
    }
  }

  List<String> auUrls(ArchivalUnit au) {
    List<String> res = new ArrayList<String>();
    for (Iterator iter = au.getAuCachedUrlSet().contentHashIterator(); iter.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) iter.next();
      if (cusn.hasContent()) {
        res.add(cusn.getUrl());
      }
    }
    return res;
  }

  protected void createContent(SimulatedArchivalUnit sau) {
    log.debug("createContent()");
    scgen = sau.getContentGenerator();
    scgen.setFileTypes(
        SimulatedContentGenerator.FILE_TYPE_HTML + SimulatedContentGenerator.FILE_TYPE_TXT);
    scgen.setAbnormalFile("1,1", 1);
    scgen.setOddBranchesHaveContent(true);

    sau.deleteContentTree();
    sau.generateContentTree();
    assertTrue(scgen.isContentTree());
  }

  protected void crawlContent(SimulatedArchivalUnit sau) {
    log.debug("crawlContent()");
    CrawlSpec spec = new SpiderCrawlSpec(sau.getNewContentCrawlUrls(), null);
    Crawler crawler = new NoCrawlEndActionsNewContentCrawler(sau, spec, new MockAuState());
    crawler.doCrawl();
  }

  protected void checkContent(SimulatedArchivalUnit sau) throws IOException {
    log.debug("checkContent()");
    checkRoot(sau);
    checkLeaf(sau);
    checkStoredContent(sau);
    checkDepth(sau);
  }

  protected void checkFilter(SimulatedArchivalUnit sau) throws Exception {
    log.debug("checkFilter()");
    CachedUrl cu = sau.makeCachedUrl(sau.getUrlRoot() + "/001file.html");

    enableFilter(sau, true);
    InputStream is = cu.openForHashing();
    String expected = "001file.html This is file 1, depth 0, branch 0. foobar ";
    assertEquals(expected, StringUtil.fromInputStream(is));
    is.close();
    enableFilter(sau, false);
    cu = sau.makeCachedUrl(sau.getUrlRoot() + "/001file.html");
    is = cu.openForHashing();
    expected =
        "<HTML><HEAD><TITLE>001file.html</TITLE></HEAD><BODY>\n"
            + "This is file 1, depth 0, branch 0.<br><!-- comment -->    "
            + "Citation String   foobar<br><script>"
            + "(defun fact (n) (cond ((= n 0) 1) (t (fact (sub1 n)))))</script>\n"
            + "</BODY></HTML>";
    assertEquals(expected, StringUtil.fromInputStream(is));
    is.close();
  }

  private byte[] fromHex(String hex) {
    return ByteArray.fromHexString(hex);
  }

  protected void hashContent(SimulatedArchivalUnit sau) throws Exception {
    log.debug("hashContent()");
    measureHashSpeed(sau);

    // If any changes are made to the contents or shape of the simulated
    // content tree, these hash values will have to be changed
    checkHashSet(sau, true, false, fromHex("6AB258B4E1FFD9F9B45316B4F54111FF5E5948D2"));
    checkHashSet(sau, true, true, fromHex("6AB258B4E1FFD9F9B45316B4F54111FF5E5948D2"));
    checkHashSet(sau, false, false, fromHex("409893F1A603F4C276632694DB1621B639BD5164"));
    checkHashSet(sau, false, true, fromHex("85E6213C3771BEAC5A4602CAF7982C6C222800D5"));
  }

  protected void checkDepth(SimulatedArchivalUnit sau) {
    log.debug("checkDepth()");
    String URL_ROOT = sau.getUrlRoot();
    assertEquals(0, sau.getLinkDepth(URL_ROOT + "/index.html"));
    assertEquals(0, sau.getLinkDepth(URL_ROOT + "/"));
    assertEquals(1, sau.getLinkDepth(URL_ROOT + "/001file.html"));
    assertEquals(1, sau.getLinkDepth(URL_ROOT + "/branch1/index.html"));
    assertEquals(1, sau.getLinkDepth(URL_ROOT + "/branch1/"));
    assertEquals(2, sau.getLinkDepth(URL_ROOT + "/branch1/001file.html"));
  }

  protected void checkRoot(SimulatedArchivalUnit sau) {
    log.debug("checkRoot()");
    CachedUrlSet set = sau.getAuCachedUrlSet();
    Iterator setIt = set.flatSetIterator();
    ArrayList childL = new ArrayList(1);
    CachedUrlSet cus = null;
    while (setIt.hasNext()) {
      cus = (CachedUrlSet) setIt.next();
      childL.add(cus.getUrl());
    }

    String urlRoot = sau.getUrlRoot();

    String[] expectedA = new String[1];
    expectedA[0] = urlRoot;
    assertIsomorphic(expectedA, childL);

    setIt = cus.flatSetIterator();
    childL = new ArrayList(7);
    while (setIt.hasNext()) {
      childL.add(((CachedUrlSetNode) setIt.next()).getUrl());
    }

    expectedA =
        new String[] {
          urlRoot + "/001file.html",
          urlRoot + "/001file.txt",
          urlRoot + "/002file.html",
          urlRoot + "/002file.txt",
          urlRoot + "/branch1",
          urlRoot + "/branch2",
          urlRoot + "/index.html"
        };
    assertIsomorphic(expectedA, childL);
  }

  protected void checkLeaf(SimulatedArchivalUnit sau) {
    log.debug("checkLeaf()");
    String parent = sau.getUrlRoot() + "/branch1";
    CachedUrlSetSpec spec = new RangeCachedUrlSetSpec(parent);
    CachedUrlSet set = sau.makeCachedUrlSet(spec);
    Iterator setIt = set.contentHashIterator();
    ArrayList childL = new ArrayList(16);
    while (setIt.hasNext()) {
      childL.add(((CachedUrlSetNode) setIt.next()).getUrl());
    }
    String[] expectedA =
        new String[] {
          parent,
          parent + "/001file.html",
          parent + "/001file.txt",
          parent + "/002file.html",
          parent + "/002file.txt",
          parent + "/branch1",
          parent + "/branch1/001file.html",
          parent + "/branch1/001file.txt",
          parent + "/branch1/002file.html",
          parent + "/branch1/002file.txt",
          parent + "/branch1/index.html",
          parent + "/branch2",
          parent + "/branch2/001file.html",
          parent + "/branch2/001file.txt",
          parent + "/branch2/002file.html",
          parent + "/branch2/002file.txt",
          parent + "/branch2/index.html",
          parent + "/index.html",
        };
    assertIsomorphic(expectedA, childL);
  }

  protected void checkUrlContent(
      SimulatedArchivalUnit sau,
      String path,
      int fileNum,
      int depth,
      int branchNum,
      boolean isAbnormal,
      boolean isDamaged)
      throws IOException {
    String file = sau.getUrlRoot() + path;
    CachedUrl url = sau.makeCachedUrl(file);
    String content = getUrlContent(url);
    String expectedContent;
    if (path.endsWith(".html")) {
      String fn = path.substring(path.lastIndexOf("/") + 1);
      expectedContent = scgen.getHtmlFileContent(fn, fileNum, depth, branchNum, isAbnormal);
    } else {
      expectedContent = scgen.getTxtContent(fileNum, depth, branchNum, isAbnormal);
    }
    if (isDamaged) {
      assertNotEquals(expectedContent, content);
    } else {
      assertEquals(expectedContent, content);
    }
  }

  protected void checkStoredContent(SimulatedArchivalUnit sau) throws IOException {
    checkUrlContent(sau, "/001file.txt", 1, 0, 0, false, false);
    checkUrlContent(sau, "/branch1/branch1/001file.txt", 1, 2, 1, true, false);
    checkUrlContent(sau, DAMAGED_CACHED_URL, 2, 2, 2, false, true);
  }

  protected void doDamageRemoveTest(SimulatedArchivalUnit sau) throws Exception {
    /* Cache the file again; this time the damage should be gone */
    String file = sau.getUrlRoot() + DAMAGED_CACHED_URL;
    UrlCacher uc = sau.makeUrlCacher(file);
    BitSet fetchFlags = new BitSet();
    fetchFlags.set(UrlCacher.REFETCH_FLAG);
    uc.setFetchFlags(fetchFlags);
    uc.cache();
    checkUrlContent(sau, DAMAGED_CACHED_URL, 2, 2, 2, false, false);
  }

  private void measureHashSpeed(SimulatedArchivalUnit sau) throws Exception {
    MessageDigest dig = null;
    try {
      dig = MessageDigest.getInstance("SHA-1");
    } catch (NoSuchAlgorithmException ex) {
      fail("No algorithm.");
    }
    CachedUrlSet set = sau.getAuCachedUrlSet();
    CachedUrlSetHasher hasher = set.getContentHasher(dig);
    SystemMetrics metrics = theDaemon.getSystemMetrics();
    int estimate = metrics.getBytesPerMsHashEstimate(hasher, dig);
    // should be protected against this being zero by MyMockSystemMetrics,
    // but otherwise use the proper calculation.  This avoids test failure
    // due to really slow machines
    assertTrue(estimate > 0);
    long estimatedTime = set.estimatedHashDuration();
    long size = ((Long) PrivilegedAccessor.getValue(set, "totalNodeSize")).longValue();
    assertTrue(size > 0);
    System.out.println("b/ms: " + estimate);
    System.out.println("size: " + size);
    System.out.println("estimate: " + estimatedTime);
    assertEquals(estimatedTime, theDaemon.getHashService().padHashEstimate(size / estimate));
  }

  private void checkHashSet(
      SimulatedArchivalUnit sau, boolean namesOnly, boolean filter, byte[] expected)
      throws Exception {
    enableFilter(sau, filter);
    CachedUrlSet set = sau.getAuCachedUrlSet();
    byte[] hash = getHash(set, namesOnly);
    assertEquals(expected, hash);

    String parent = sau.getUrlRoot() + "/branch1";
    CachedUrlSetSpec spec = new RangeCachedUrlSetSpec(parent);
    set = sau.makeCachedUrlSet(spec);
    byte[] hash2 = getHash(set, namesOnly);
    assertFalse(Arrays.equals(hash, hash2));
  }

  private byte[] getHash(CachedUrlSet set, boolean namesOnly) throws IOException {
    MessageDigest dig = null;
    try {
      dig = MessageDigest.getInstance("SHA-1");
    } catch (NoSuchAlgorithmException ex) {
      fail("No algorithm.");
    }
    hash(set, dig, namesOnly);
    return dig.digest();
  }

  private void hash(CachedUrlSet set, MessageDigest dig, boolean namesOnly) throws IOException {
    CachedUrlSetHasher hasher = null;
    if (namesOnly) {
      hasher = set.getNameHasher(dig);
    } else {
      hasher = set.getContentHasher(dig);
    }
    int bytesHashed = 0;
    long timeTaken = System.currentTimeMillis();
    while (!hasher.finished()) {
      bytesHashed += hasher.hashStep(256);
    }
    timeTaken = System.currentTimeMillis() - timeTaken;
    if ((timeTaken > 0) && (bytesHashed > 500)) {
      System.out.println("Bytes hashed: " + bytesHashed);
      System.out.println("Time taken: " + timeTaken + "ms");
      System.out.println("Bytes/sec: " + (bytesHashed * 1000 / timeTaken));
    } else {
      System.out.println("No time taken, or insufficient bytes hashed.");
      System.out.println("Bytes hashed: " + bytesHashed);
      System.out.println("Time taken: " + timeTaken + "ms");
    }
  }

  private String getUrlContent(CachedUrl url) throws IOException {
    InputStream content = url.getUnfilteredInputStream();
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    StreamUtil.copy(content, baos);
    content.close();
    String contentStr = new String(baos.toByteArray());
    baos.close();
    return contentStr;
  }

  // this version doesn't fully override the 'measureHashSpeed()' function, but
  // protects against it returning '0' by returning the set speed
  private class MyMockSystemMetrics extends MockSystemMetrics {
    public int measureHashSpeed(CachedUrlSetHasher hasher, MessageDigest digest)
        throws IOException {
      int speed = super.measureHashSpeed(hasher, digest);
      if (speed == 0) {
        speed = getHashSpeed();
        if (speed <= 0) {
          throw new RuntimeException("No hash speed set.");
        }
      }
      return speed;
    }
  }

  public static void main(String[] argv) {
    String[] testCaseList = {FuncSimulatedContent.class.getName()};
    junit.swingui.TestRunner.main(testCaseList);
  }

  public static Test suite() {
    return new TestSuite(FuncSimulatedContent.class);
  }
}
Esempio n. 11
0
/** UI to invoke various daemon actions */
@SuppressWarnings("serial")
public class DebugPanel extends LockssServlet {

  public static final String PREFIX = Configuration.PREFIX + "debugPanel.";

  /** Priority for crawls started from the debug panel */
  public static final String PARAM_CRAWL_PRIORITY = PREFIX + "crawlPriority";

  public static final int DEFAULT_CRAWL_PRIORITY = 10;

  /** Priority for crawls started from the debug panel */
  public static final String PARAM_ENABLE_DEEP_CRAWL = PREFIX + "deepCrawlEnabled";

  private static final boolean DEFAULT_ENABLE_DEEP_CRAWL = false;

  static final String KEY_ACTION = "action";
  static final String KEY_MSG = "msg";
  static final String KEY_NAME_SEL = "name_sel";
  static final String KEY_NAME_TYPE = "name_type";
  static final String KEY_AUID = "auid";
  static final String KEY_URL = "url";
  static final String KEY_REFETCH_DEPTH = "depth";
  static final String KEY_TIME = "time";

  static final String ACTION_MAIL_BACKUP = "Mail Backup File";
  static final String ACTION_THROW_IOEXCEPTION = "Throw IOException";
  static final String ACTION_FIND_URL = "Find Preserved URL";

  public static final String ACTION_REINDEX_METADATA = "Reindex Metadata";
  public static final String ACTION_FORCE_REINDEX_METADATA = "Force Reindex Metadata";
  public static final String ACTION_START_V3_POLL = "Start V3 Poll";
  static final String ACTION_FORCE_START_V3_POLL = "Force V3 Poll";
  public static final String ACTION_START_CRAWL = "Start Crawl";
  public static final String ACTION_FORCE_START_CRAWL = "Force Start Crawl";
  public static final String ACTION_START_DEEP_CRAWL = "Deep Crawl";
  public static final String ACTION_FORCE_START_DEEP_CRAWL = "Force Deep Crawl";
  public static final String ACTION_CHECK_SUBSTANCE = "Check Substance";
  static final String ACTION_CRAWL_PLUGINS = "Crawl Plugins";
  static final String ACTION_RELOAD_CONFIG = "Reload Config";
  static final String ACTION_SLEEP = "Sleep";
  public static final String ACTION_DISABLE_METADATA_INDEXING = "Disable Indexing";

  /** Set of actions for which audit alerts shouldn't be generated */
  public static final Set noAuditActions = SetUtil.set(ACTION_FIND_URL);

  static final String COL2 = "colspan=2";
  static final String COL2CENTER = COL2 + " align=center";

  static Logger log = Logger.getLogger("DebugPanel");

  private LockssDaemon daemon;
  private PluginManager pluginMgr;
  private PollManager pollManager;
  private CrawlManager crawlMgr;
  private ConfigManager cfgMgr;
  private DbManager dbMgr;
  private MetadataManager metadataMgr;
  private RemoteApi rmtApi;

  boolean showResult;
  boolean showForcePoll;
  boolean showForceCrawl;
  boolean showForceReindexMetadata;

  String formAuid;
  String formDepth = "100";

  protected void resetLocals() {
    resetVars();
    super.resetLocals();
  }

  void resetVars() {
    formAuid = null;
    errMsg = null;
    statusMsg = null;
    showForcePoll = false;
    showForceCrawl = false;
    showForceReindexMetadata = false;
  }

  public void init(ServletConfig config) throws ServletException {
    super.init(config);
    daemon = getLockssDaemon();
    pluginMgr = daemon.getPluginManager();
    pollManager = daemon.getPollManager();
    crawlMgr = daemon.getCrawlManager();
    cfgMgr = daemon.getConfigManager();
    rmtApi = daemon.getRemoteApi();
    try {
      dbMgr = daemon.getDbManager();
      metadataMgr = daemon.getMetadataManager();
    } catch (IllegalArgumentException ex) {
    }
  }

  public void lockssHandleRequest() throws IOException {
    resetVars();
    boolean showForm = true;
    String action = getParameter(KEY_ACTION);

    if (!StringUtil.isNullString(action)) {

      formAuid = getParameter(KEY_AUID);
      formDepth = getParameter(KEY_REFETCH_DEPTH);

      UserAccount acct = getUserAccount();
      if (acct != null && !noAuditActions.contains(action)) {
        acct.auditableEvent("used debug panel action: " + action + " AU ID: " + formAuid);
      }
    }

    if (ACTION_MAIL_BACKUP.equals(action)) {
      doMailBackup();
    }
    if (ACTION_RELOAD_CONFIG.equals(action)) {
      doReloadConfig();
    }
    if (ACTION_SLEEP.equals(action)) {
      doSleep();
    }
    if (ACTION_THROW_IOEXCEPTION.equals(action)) {
      doThrow();
    }
    if (ACTION_START_V3_POLL.equals(action)) {
      doV3Poll();
    }
    if (ACTION_FORCE_START_V3_POLL.equals(action)) {
      forceV3Poll();
    }
    if (ACTION_START_CRAWL.equals(action)) {
      doCrawl(false, false);
    }
    if (ACTION_FORCE_START_CRAWL.equals(action)) {
      doCrawl(true, false);
    }
    if (ACTION_START_DEEP_CRAWL.equals(action)) {
      doCrawl(false, true);
    }
    if (ACTION_FORCE_START_DEEP_CRAWL.equals(action)) {
      doCrawl(true, true);
    }
    if (ACTION_CHECK_SUBSTANCE.equals(action)) {
      doCheckSubstance();
    }
    if (ACTION_CRAWL_PLUGINS.equals(action)) {
      crawlPluginRegistries();
    }
    if (ACTION_FIND_URL.equals(action)) {
      showForm = doFindUrl();
    }
    if (ACTION_REINDEX_METADATA.equals(action)) {
      doReindexMetadata();
    }
    if (ACTION_FORCE_REINDEX_METADATA.equals(action)) {
      forceReindexMetadata();
    }
    if (ACTION_DISABLE_METADATA_INDEXING.equals(action)) {
      doDisableMetadataIndexing();
    }
    if (showForm) {
      displayPage();
    }
  }

  private void doMailBackup() {
    try {
      rmtApi.createConfigBackupFile(RemoteApi.BackupFileDisposition.Mail);
    } catch (Exception e) {
      errMsg = "Error: " + e.getMessage();
    }
  }

  private void doReloadConfig() {
    cfgMgr.requestReload();
  }

  private void doThrow() throws IOException {
    String msg = getParameter(KEY_MSG);
    throw new IOException(msg != null ? msg : "Test message");
  }

  private void doSleep() throws IOException {
    String timestr = getParameter(KEY_TIME);
    try {
      long time = StringUtil.parseTimeInterval(timestr);
      Deadline.in(time).sleep();
      statusMsg = "Slept for " + StringUtil.timeIntervalToString(time);
    } catch (NumberFormatException e) {
      errMsg = "Illegal duration: " + e;
    } catch (InterruptedException e) {
      errMsg = "Interrupted: " + e;
    }
  }

  private void doReindexMetadata() {
    ArchivalUnit au = getAu();
    if (au == null) return;
    try {
      startReindexingMetadata(au, false);
    } catch (RuntimeException e) {
      log.error("Can't reindex metadata", e);
      errMsg = "Error: " + e.toString();
    }
  }

  private void forceReindexMetadata() {
    ArchivalUnit au = getAu();
    if (au == null) return;
    try {
      startReindexingMetadata(au, true);
    } catch (RuntimeException e) {
      log.error("Can't reindex metadata", e);
      errMsg = "Error: " + e.toString();
    }
  }

  private void doDisableMetadataIndexing() {
    ArchivalUnit au = getAu();
    if (au == null) return;
    try {
      disableMetadataIndexing(au, false);
    } catch (RuntimeException e) {
      log.error("Can't disable metadata indexing", e);
      errMsg = "Error: " + e.toString();
    }
  }

  private void doCrawl(boolean force, boolean deep) {
    ArchivalUnit au = getAu();
    if (au == null) return;
    try {
      startCrawl(au, force, deep);
    } catch (CrawlManagerImpl.NotEligibleException.RateLimiter e) {
      errMsg = "AU has crawled recently (" + e.getMessage() + ").  Click again to override.";
      showForceCrawl = true;
      return;
    } catch (CrawlManagerImpl.NotEligibleException e) {
      errMsg = "Can't enqueue crawl: " + e.getMessage();
    }
  }

  private void crawlPluginRegistries() {
    StringBuilder sb = new StringBuilder();
    for (ArchivalUnit au : pluginMgr.getAllRegistryAus()) {
      sb.append(au.getName());
      sb.append(": ");
      try {
        startCrawl(au, true, false);
        sb.append("Queued.");
      } catch (CrawlManagerImpl.NotEligibleException e) {
        sb.append("Failed: ");
        sb.append(e.getMessage());
      }
      sb.append("\n");
    }
    statusMsg = sb.toString();
  }

  private boolean startCrawl(ArchivalUnit au, boolean force, boolean deep)
      throws CrawlManagerImpl.NotEligibleException {
    CrawlManagerImpl cmi = (CrawlManagerImpl) crawlMgr;
    if (force) {
      RateLimiter limit = cmi.getNewContentRateLimiter(au);
      if (!limit.isEventOk()) {
        limit.unevent();
      }
    }
    cmi.checkEligibleToQueueNewContentCrawl(au);
    String delayMsg = "";
    String deepMsg = "";
    try {
      cmi.checkEligibleForNewContentCrawl(au);
    } catch (CrawlManagerImpl.NotEligibleException e) {
      delayMsg = ", Start delayed due to: " + e.getMessage();
    }
    Configuration config = ConfigManager.getCurrentConfig();
    int pri = config.getInt(PARAM_CRAWL_PRIORITY, DEFAULT_CRAWL_PRIORITY);

    CrawlReq req;
    try {
      req = new CrawlReq(au);
      req.setPriority(pri);
      if (deep) {
        int d = Integer.parseInt(formDepth);
        if (d < 0) {
          errMsg = "Illegal refetch depth: " + d;
          return false;
        }
        req.setRefetchDepth(d);
        deepMsg = "Deep (" + req.getRefetchDepth() + ") ";
      }
    } catch (NumberFormatException e) {
      errMsg = "Illegal refetch depth: " + formDepth;
      return false;
    } catch (RuntimeException e) {
      log.error("Couldn't create CrawlReq: " + au, e);
      errMsg = "Couldn't create CrawlReq: " + e.toString();
      return false;
    }
    cmi.startNewContentCrawl(req, null);
    statusMsg = deepMsg + "Crawl requested for " + au.getName() + delayMsg;
    return true;
  }

  private void doCheckSubstance() {
    ArchivalUnit au = getAu();
    if (au == null) return;
    try {
      checkSubstance(au);
    } catch (RuntimeException e) {
      log.error("Error in SubstanceChecker", e);
      errMsg = "Error in SubstanceChecker; see log.";
    }
  }

  private void checkSubstance(ArchivalUnit au) {
    SubstanceChecker subChecker = new SubstanceChecker(au);
    if (!subChecker.isEnabled()) {
      errMsg = "No substance patterns defined for plugin.";
      return;
    }
    AuState auState = AuUtil.getAuState(au);
    SubstanceChecker.State oldState = auState.getSubstanceState();
    SubstanceChecker.State newState = subChecker.findSubstance();
    String chtxt = (newState == oldState ? "(unchanged)" : "(was " + oldState.toString() + ")");
    switch (newState) {
      case Unknown:
        log.error("Shouldn't happen: SubstanceChecker returned Unknown");
        errMsg = "Error in SubstanceChecker; see log.";
        break;
      case Yes:
        statusMsg = "AU has substance " + chtxt + ": " + au.getName();
        auState.setSubstanceState(SubstanceChecker.State.Yes);
        break;
      case No:
        statusMsg = "AU has no substance " + chtxt + ": " + au.getName();
        auState.setSubstanceState(SubstanceChecker.State.No);
        break;
    }
  }

  private boolean startReindexingMetadata(ArchivalUnit au, boolean force) {
    if (metadataMgr == null) {
      errMsg = "Metadata processing is not enabled.";
      return false;
    }

    if (!force) {
      if (!AuUtil.hasCrawled(au)) {
        errMsg = "Au has never crawled. Click again to reindex metadata";
        showForceReindexMetadata = true;
        return false;
      }

      AuState auState = AuUtil.getAuState(au);
      switch (auState.getSubstanceState()) {
        case No:
          errMsg = "Au has no substance. Click again to reindex metadata";
          showForceReindexMetadata = true;
          return false;
        case Unknown:
          errMsg = "Unknown substance for Au. Click again to reindex metadata.";
          showForceReindexMetadata = true;
          return false;
        case Yes:
          // fall through
      }
    }

    // Fully reindex metadata with the highest priority.
    Connection conn = null;
    PreparedStatement insertPendingAuBatchStatement = null;

    try {
      conn = dbMgr.getConnection();
      insertPendingAuBatchStatement = metadataMgr.getPrioritizedInsertPendingAuBatchStatement(conn);

      if (metadataMgr.enableAndAddAuToReindex(
          au, conn, insertPendingAuBatchStatement, false, true)) {
        statusMsg = "Reindexing metadata for " + au.getName();
        return true;
      }
    } catch (DbException dbe) {
      log.error("Cannot reindex metadata for " + au.getName(), dbe);
    } finally {
      DbManager.safeCloseStatement(insertPendingAuBatchStatement);
      DbManager.safeRollbackAndClose(conn);
    }

    if (force) {
      errMsg = "Still cannot reindex metadata for " + au.getName();
    } else {
      errMsg = "Cannot reindex metadata for " + au.getName();
    }
    return false;
  }

  private boolean disableMetadataIndexing(ArchivalUnit au, boolean force) {
    if (metadataMgr == null) {
      errMsg = "Metadata processing is not enabled.";
      return false;
    }

    try {
      metadataMgr.disableAuIndexing(au);
      statusMsg = "Disabled metadata indexing for " + au.getName();
      return true;
    } catch (Exception e) {
      errMsg = "Cannot reindex metadata for " + au.getName() + ": " + e.getMessage();
      return false;
    }
  }

  private void doV3Poll() {
    ArchivalUnit au = getAu();
    if (au == null) return;
    try {
      callV3ContentPoll(au);
    } catch (PollManager.NotEligibleException e) {
      errMsg = "AU is not eligible for poll: " + e.getMessage();
      //       errMsg = "Ineligible: " + e.getMessage() +
      // 	"<br>Click again to force new poll.";
      //       showForcePoll = true;
      return;
    } catch (Exception e) {
      log.error("Can't start poll", e);
      errMsg = "Error: " + e.toString();
    }
  }

  private void forceV3Poll() {
    ArchivalUnit au = getAu();
    if (au == null) return;
    try {
      callV3ContentPoll(au);
    } catch (Exception e) {
      log.error("Can't start poll", e);
      errMsg = "Error: " + e.toString();
    }
  }

  private void callV3ContentPoll(ArchivalUnit au) throws PollManager.NotEligibleException {
    log.debug("Enqueuing a V3 Content Poll on " + au.getName());
    PollSpec spec = new PollSpec(au.getAuCachedUrlSet(), Poll.V3_POLL);
    pollManager.enqueueHighPriorityPoll(au, spec);
    statusMsg = "Enqueued V3 poll for " + au.getName();
  }

  private boolean doFindUrl() throws IOException {

    String url = getParameter(KEY_URL);

    String redir =
        srvURL(
            AdminServletManager.SERVLET_DAEMON_STATUS,
            PropUtil.fromArgs("table", ArchivalUnitStatus.AUS_WITH_URL_TABLE_NAME, "key", url));

    resp.setContentLength(0);
    //     resp.sendRedirect(resp.encodeRedirectURL(redir));
    resp.sendRedirect(redir);
    return false;
  }

  ArchivalUnit getAu() {
    if (StringUtil.isNullString(formAuid)) {
      errMsg = "Select an AU";
      return null;
    }
    ArchivalUnit au = pluginMgr.getAuFromId(formAuid);
    if (au == null) {
      errMsg = "No such AU.  Select an AU";
      return null;
    }
    return au;
  }

  private void displayPage() throws IOException {
    Page page = newPage();
    layoutErrorBlock(page);
    ServletUtil.layoutExplanationBlock(page, "Debug Actions");
    page.add(makeForm());
    page.add("<br>");
    endPage(page);
  }

  private Element makeForm() {
    Composite comp = new Composite();
    Form frm = new Form(srvURL(myServletDescr()));
    frm.method("POST");

    frm.add("<br><center>");
    Input reload = new Input(Input.Submit, KEY_ACTION, ACTION_RELOAD_CONFIG);
    setTabOrder(reload);
    frm.add(reload);
    frm.add(" ");
    Input backup = new Input(Input.Submit, KEY_ACTION, ACTION_MAIL_BACKUP);
    setTabOrder(backup);
    frm.add(backup);
    frm.add(" ");
    Input crawlplug = new Input(Input.Submit, KEY_ACTION, ACTION_CRAWL_PLUGINS);
    setTabOrder(crawlplug);
    frm.add(crawlplug);
    frm.add("</center>");
    ServletDescr d1 = AdminServletManager.SERVLET_HASH_CUS;
    if (isServletRunnable(d1)) {
      frm.add("<br><center>" + srvLink(d1, d1.heading) + "</center>");
    }
    Input findUrl = new Input(Input.Submit, KEY_ACTION, ACTION_FIND_URL);
    Input findUrlText = new Input(Input.Text, KEY_URL);
    findUrlText.setSize(50);
    setTabOrder(findUrl);
    setTabOrder(findUrlText);
    frm.add("<br><center>" + findUrl + " " + findUrlText + "</center>");

    Input thrw = new Input(Input.Submit, KEY_ACTION, ACTION_THROW_IOEXCEPTION);
    Input thmsg = new Input(Input.Text, KEY_MSG);
    setTabOrder(thrw);
    setTabOrder(thmsg);
    frm.add("<br><center>" + thrw + " " + thmsg + "</center>");

    frm.add("<br><center>AU Actions: select AU</center>");
    Composite ausel = ServletUtil.layoutSelectAu(this, KEY_AUID, formAuid);
    frm.add("<br><center>" + ausel + "</center>");
    setTabOrder(ausel);

    Input v3Poll =
        new Input(
            Input.Submit,
            KEY_ACTION,
            (showForcePoll ? ACTION_FORCE_START_V3_POLL : ACTION_START_V3_POLL));
    Input crawl =
        new Input(
            Input.Submit,
            KEY_ACTION,
            (showForceCrawl ? ACTION_FORCE_START_CRAWL : ACTION_START_CRAWL));

    frm.add("<br><center>");
    frm.add(v3Poll);
    frm.add(" ");
    frm.add(crawl);
    if (CurrentConfig.getBooleanParam(PARAM_ENABLE_DEEP_CRAWL, DEFAULT_ENABLE_DEEP_CRAWL)) {
      Input deepCrawl =
          new Input(
              Input.Submit,
              KEY_ACTION,
              (showForceCrawl ? ACTION_FORCE_START_DEEP_CRAWL : ACTION_START_DEEP_CRAWL));
      Input depthText = new Input(Input.Text, KEY_REFETCH_DEPTH, formDepth);
      depthText.setSize(4);
      setTabOrder(depthText);
      frm.add(" ");
      frm.add(deepCrawl);
      frm.add(depthText);
    }
    Input checkSubstance = new Input(Input.Submit, KEY_ACTION, ACTION_CHECK_SUBSTANCE);
    frm.add("<br>");
    frm.add(checkSubstance);
    if (metadataMgr != null) {
      Input reindex =
          new Input(
              Input.Submit,
              KEY_ACTION,
              (showForceReindexMetadata ? ACTION_FORCE_REINDEX_METADATA : ACTION_REINDEX_METADATA));
      frm.add(" ");
      frm.add(reindex);
      Input disableIndexing = new Input(Input.Submit, KEY_ACTION, ACTION_DISABLE_METADATA_INDEXING);
      frm.add(" ");
      frm.add(disableIndexing);
    }
    frm.add("</center>");

    comp.add(frm);
    return comp;
  }
}
/**
 * PluginArchivalUnit: The Archival Unit Class for PluginPlugin. This archival unit uses a base url
 * to define an archival unit.
 *
 * @author Seth Morabito
 * @version 1.0
 */
public class RegistryArchivalUnit extends BaseArchivalUnit {
  protected static final Logger log = Logger.getLogger("RegistryArchivalUnit");

  /** The interval between recrawls of the loadable plugin registry AUs. */
  static final String PARAM_REGISTRY_CRAWL_INTERVAL = RegistryPlugin.PREFIX + "crawlInterval";

  static final long DEFAULT_REGISTRY_CRAWL_INTERVAL = Constants.DAY;

  /**
   * If "au", registry AUs will crawl in parallel using individual rate limiters; if "plugin"
   * they'll crawl sequentially using a shared rate limiter
   */
  static final String PARAM_REGISTRY_FETCH_RATE_LIMITER_SOURCE =
      RegistryPlugin.PREFIX + "fetchRateLimiterSource";

  static final String DEFAULT_REGISTRY_FETCH_RATE_LIMITER_SOURCE = "au";

  /** Limits fetch rate of registry crawls */
  static final String PARAM_REGISTRY_FETCH_RATE = RegistryPlugin.PREFIX + "fetchRate";

  static final String DEFAULT_REGISTRY_FETCH_RATE = "20/10s";

  /** Run polls on Plugin registry AUs */
  static final String PARAM_ENABLE_REGISTRY_POLLS = RegistryPlugin.PREFIX + "enablePolls";

  static final boolean DEFAULT_ENABLE_REGISTRY_POLLS = true;

  private String m_registryUrl = null;
  private int m_maxRefetchDepth = NewContentCrawler.DEFAULT_MAX_CRAWL_DEPTH;
  private List m_permissionCheckers = null;
  private boolean recomputeRegName = true;
  private boolean enablePolls = DEFAULT_ENABLE_REGISTRY_POLLS;
  private String regName = null;

  public RegistryArchivalUnit(RegistryPlugin plugin) {
    super(plugin);
  }

  // Called by RegistryPlugin iff any config below RegistryPlugin.PREFIX
  // has changed
  protected void setConfig(
      Configuration config, Configuration prevConfig, Configuration.Differences changedKeys) {
    m_maxRefetchDepth =
        config.getInt(
            NewContentCrawler.PARAM_MAX_CRAWL_DEPTH, NewContentCrawler.DEFAULT_MAX_CRAWL_DEPTH);
    fetchRateLimiter = recomputeFetchRateLimiter(fetchRateLimiter);
    enablePolls = config.getBoolean(PARAM_ENABLE_REGISTRY_POLLS, DEFAULT_ENABLE_REGISTRY_POLLS);
  }

  public void loadAuConfigDescrs(Configuration config) throws ConfigurationException {
    super.loadAuConfigDescrs(config);
    this.m_registryUrl = config.get(ConfigParamDescr.BASE_URL.getKey());
    // Now we can construct a valid CC permission checker.
    m_permissionCheckers =
        //       ListUtil.list(new CreativeCommonsPermissionChecker(m_registryUrl));
        ListUtil.list(new CreativeCommonsPermissionChecker());

    paramMap.putLong(
        KEY_AU_NEW_CONTENT_CRAWL_INTERVAL,
        CurrentConfig.getTimeIntervalParam(
            PARAM_REGISTRY_CRAWL_INTERVAL, DEFAULT_REGISTRY_CRAWL_INTERVAL));
    if (log.isDebug2()) {
      log.debug2(
          "Setting Registry AU recrawl interval to "
              + StringUtil.timeIntervalToString(
                  paramMap.getLong(KEY_AU_NEW_CONTENT_CRAWL_INTERVAL)));
    }
  }

  /**
   * return a string that represents the plugin registry. This is just the base URL.
   *
   * @return The base URL.
   */
  protected String makeName() {
    return "Plugin registry at '" + m_registryUrl + "'";
  }

  public String getName() {
    if (recomputeRegName) {
      regName = recomputeRegName();
    }
    if (regName != null) {
      return regName;
    } else {
      return super.getName();
    }
  }

  // If there is a <title> element on the start page, use that as our AU
  // name.
  String recomputeRegName() {
    if (!isStarted()) {
      // This can get invoked (seveeral times, mostly from logging) before
      // enough mechanism has started to make it possible to resolve the CuUrl
      // below.
      return null;
    }
    try {
      CachedUrl cu = makeCachedUrl(m_registryUrl);
      if (cu == null) return null;
      URL cuUrl = CuUrl.fromCu(cu);
      Parser parser = new Parser(cuUrl.toString());
      NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class));
      Node nodes[] = nodelst.toNodeArray();
      recomputeRegName = false;
      if (nodes.length < 1) return null;
      // Get the first title found
      TitleTag tag = (TitleTag) nodes[0];
      if (tag == null) return null;
      return tag.getTitle();
    } catch (MalformedURLException e) {
      log.warning("recomputeRegName", e);
      return null;
    } catch (ParserException e) {
      if (e.getThrowable() instanceof FileNotFoundException) {
        log.warning("recomputeRegName: " + e.getThrowable().toString());
      } else {
        log.warning("recomputeRegName", e);
      }
      return null;
    }
  }

  boolean isStarted() {
    return getPlugin().getDaemon().getPluginManager().getAuFromId(getAuId()) != null;
  }

  /**
   * return a string that points to the plugin registry page.
   *
   * @return a string that points to the plugin registry page for this registry. This is just the
   *     base URL.
   */
  protected String makeStartUrl() {
    return m_registryUrl;
  }

  /** Call top level polls iff configured to do so. */
  public boolean shouldCallTopLevelPoll(AuState aus) {
    if (!enablePolls) {
      return false;
    }
    return super.shouldCallTopLevelPoll(aus);
  }

  /**
   * Return a new CrawlSpec with the appropriate collect AND redistribute permissions, and with the
   * maximum refetch depth.
   *
   * @return CrawlSpec
   */
  protected CrawlSpec makeCrawlSpec() throws LockssRegexpException {
    CrawlRule rule = makeRules();
    List startUrls = getNewContentCrawlUrls();
    return new SpiderCrawlSpec(startUrls, startUrls, rule, m_maxRefetchDepth, null, null);
  }

  /**
   * return the collection of crawl rules used to crawl and cache a list of Plugin JAR files.
   *
   * @return CrawlRule
   */
  protected CrawlRule makeRules() {
    return new RegistryRule();
  }

  // Might need to recompute name if refetch start page
  public UrlCacher makeUrlCacher(String url) {
    if (url.equals(m_registryUrl)) {
      recomputeRegName = true;
    }
    return super.makeUrlCacher(url);
  }

  protected RateLimiter recomputeFetchRateLimiter(RateLimiter oldLimiter) {
    String rate = CurrentConfig.getParam(PARAM_REGISTRY_FETCH_RATE, DEFAULT_REGISTRY_FETCH_RATE);
    Object limiterKey = getFetchRateLimiterKey();

    if (limiterKey == null) {
      return RateLimiter.getRateLimiter(oldLimiter, rate, DEFAULT_REGISTRY_FETCH_RATE);
    } else {
      RateLimiter.Pool pool = RateLimiter.getPool();
      return pool.findNamedRateLimiter(limiterKey, rate, DEFAULT_REGISTRY_FETCH_RATE);
    }
  }

  protected String getFetchRateLimiterSource() {
    return CurrentConfig.getParam(
        PARAM_REGISTRY_FETCH_RATE_LIMITER_SOURCE, DEFAULT_REGISTRY_FETCH_RATE_LIMITER_SOURCE);
  }

  // Registry AU crawl rule implementation
  private class RegistryRule implements CrawlRule {
    public int match(String url) {
      if (StringUtil.equalStringsIgnoreCase(url, m_registryUrl)
          || StringUtil.endsWithIgnoreCase(url, ".jar")) {
        return CrawlRule.INCLUDE;
      } else {
        return CrawlRule.EXCLUDE;
      }
    }
  }
}
Esempio n. 13
0
/**
 * Class implementing the concept of the set of URLs covered by a poll.
 *
 * @author Claire Griffin
 * @version 1.0
 */
public class PollSpec {
  /**
   * A lower bound value which indicates the poll should use a {@link SingleNodeCachedUrlSetSpec}
   * instead of a {@link RangeCachedUrlSetSpec}.
   */
  public static final String SINGLE_NODE_LWRBOUND = ".";

  public static final String DEFAULT_PLUGIN_VERSION = "1";

  private static Logger theLog = Logger.getLogger("PollSpec");

  private String auId;
  private String pluginVersion;
  private String url;
  private String uprBound = null;
  private String lwrBound = null;
  private CachedUrlSet cus = null;
  private PluginManager pluginMgr = null;
  private int protocolVersion; // poll protocol version
  private int pollType; // One of the types defined by Poll
  private V3Poller.PollVariant variant = V3Poller.PollVariant.PoR;

  /**
   * Construct a PollSpec from a CachedUrlSet and an upper and lower bound
   *
   * @param cus the CachedUrlSet
   * @param lwrBound the lower boundary
   * @param uprBound the upper boundary
   * @param pollType one of the types defined by Poll
   */
  public PollSpec(CachedUrlSet cus, String lwrBound, String uprBound, int pollType) {
    commonSetup(cus, lwrBound, uprBound, pollType);
  }

  /**
   * Construct a PollSpec from a CachedUrlSet.
   *
   * @param cus the CachedUrlSpec which defines the range of interest
   * @param pollType one of the types defined by Poll
   */
  public PollSpec(CachedUrlSet cus, int pollType) {
    CachedUrlSetSpec cuss = cus.getSpec();
    if (cuss instanceof RangeCachedUrlSetSpec) {
      RangeCachedUrlSetSpec rcuss = (RangeCachedUrlSetSpec) cuss;
      commonSetup(cus, rcuss.getLowerBound(), rcuss.getUpperBound(), pollType);
    } else if (cuss.isSingleNode()) {
      commonSetup(cus, SINGLE_NODE_LWRBOUND, null, pollType);
    } else {
      commonSetup(cus, null, null, pollType);
    }
  }

  /**
   * Construct a PollSpec from a V1 LcapMessage
   *
   * @param msg the LcapMessage which defines the range of interest
   */
  public PollSpec(V1LcapMessage msg) {
    auId = msg.getArchivalId();
    pluginVersion = msg.getPluginVersion();
    url = msg.getTargetUrl();
    uprBound = msg.getUprBound();
    lwrBound = msg.getLwrBound();
    protocolVersion = msg.getProtocolVersion();
    if (msg.isContentPoll()) {
      pollType = Poll.V1_CONTENT_POLL;
    } else if (msg.isNamePoll()) {
      pollType = Poll.V1_NAME_POLL;
    } else if (msg.isVerifyPoll()) {
      pollType = Poll.V1_VERIFY_POLL;
    } else {
      pollType = -1;
    }
    cus = getPluginManager().findCachedUrlSet(this);
  }

  public PollSpec(V3LcapMessage msg) {
    this(
        msg.getArchivalId(),
        (msg.getTargetUrl() == null) ? "lockssau:" : msg.getTargetUrl(),
        null,
        null,
        Poll.V3_POLL);
    protocolVersion = msg.getProtocolVersion();
    pluginVersion = msg.getPluginVersion();
  }

  /** Construct a PollSpec from explicit args */
  public PollSpec(String auId, String url, String lower, String upper, int pollType) {
    this.auId = auId;
    this.url = url;
    uprBound = upper;
    lwrBound = lower;
    this.pollType = pollType;
    cus = getPluginManager().findCachedUrlSet(this);
    this.protocolVersion = protocolVersionFromPollType(pollType);
  }

  /**
   * Construct a PollSpec from another PollSpec and a poll type XXX it seems that other constructors
   * are not setting all fields
   */
  public PollSpec(PollSpec ps, int pollType) {
    this.auId = ps.auId;
    this.pluginVersion = ps.pluginVersion;
    this.url = ps.url;
    this.uprBound = ps.uprBound;
    this.lwrBound = ps.lwrBound;
    this.cus = ps.cus;
    this.pluginMgr = ps.pluginMgr;
    this.protocolVersion = ps.protocolVersion;
    this.pollType = pollType;
  }

  /** Setup common to most constructors */
  private void commonSetup(CachedUrlSet cus, String lwrBound, String uprBound, int pollType) {
    CachedUrlSetSpec cuss = cus.getSpec();
    if (cuss instanceof PrunedCachedUrlSetSpec) {
      throw new IllegalArgumentException("Polls do not support PrunedCachedUrlSetSpec");
    }
    this.cus = cus;
    ArchivalUnit au = cus.getArchivalUnit();
    auId = au.getAuId();
    this.pluginVersion = AuUtil.getPollVersion(au);
    url = cuss.getUrl();
    this.lwrBound = lwrBound;
    this.uprBound = uprBound;
    this.protocolVersion = protocolVersionFromPollType(pollType);
    this.pollType = pollType;
  }

  public CachedUrlSet getCachedUrlSet() {
    return cus;
  }

  public String getAuId() {
    return auId;
  }

  public String getPluginVersion() {
    return (pluginVersion != null) ? pluginVersion : DEFAULT_PLUGIN_VERSION;
  }

  public String getUrl() {
    return url;
  }

  public String getLwrBound() {
    return lwrBound;
  }

  public String getUprBound() {
    return uprBound;
  }

  public String getRangeString() {
    if (StringUtil.equalStrings(lwrBound, SINGLE_NODE_LWRBOUND)) {
      return "single node";
    }
    if (lwrBound != null || uprBound != null) {
      String lwrDisplay = lwrBound;
      String uprDisplay = uprBound;
      if (lwrBound != null && lwrBound.startsWith("/")) {
        lwrDisplay = lwrBound.substring(1);
      }
      if (uprBound != null && uprBound.startsWith("/")) {
        uprDisplay = uprBound.substring(1);
      }
      return lwrDisplay + " - " + uprDisplay;
    }
    return null;
  }

  public int getProtocolVersion() {
    return protocolVersion;
  }

  public int getPollType() {
    return pollType;
  }

  public V3Poller.PollVariant getPollVariant() {
    return variant;
  }

  public void setPollVariant(V3Poller.PollVariant v) {
    variant = v;
  }

  private PluginManager getPluginManager() {
    if (pluginMgr == null) {
      pluginMgr = (PluginManager) LockssDaemon.getManager(LockssDaemon.PLUGIN_MANAGER);
    }
    return pluginMgr;
  }

  /**
   * Given a poll type, return the correct version of the protocol to use.
   *
   * @param pollType
   * @return The protocol version to use
   */
  private int protocolVersionFromPollType(int pollType) {
    switch (pollType) {
      case Poll.V1_CONTENT_POLL:
      case Poll.V1_NAME_POLL:
      case Poll.V1_VERIFY_POLL:
        return Poll.V1_PROTOCOL;
      case Poll.V3_POLL:
        return Poll.V3_PROTOCOL;
      default:
        return Poll.UNDEFINED_PROTOCOL;
    }
  }

  public String toString() {
    return "[PS: "
        + Poll.POLL_NAME[pollType]
        + " auid="
        + auId
        + ", url="
        + url
        + ", l="
        + lwrBound
        + ", u="
        + uprBound
        + ", type="
        + pollType
        + ", plugVer="
        + getPluginVersion()
        + ", protocol="
        + protocolVersion
        + "]";
  }
}
public class TestElsevierDTD5XmlMetadataExtractor extends SourceXmlMetadataExtractorTest {

  private static final Logger log = Logger.getLogger(TestElsevierDTD5XmlMetadataExtractor.class);

  private MockLockssDaemon theDaemon;
  protected ArchivalUnit tarAu;

  private static String PLUGIN_NAME = "org.lockss.plugin.elsevier.ClockssElsevierDTD5SourcePlugin";
  private static String BASE_URL = "http://www.source.org/";
  private static String YEAR_NAME = "2014";
  private static String TAR_A_BASE = BASE_URL + YEAR_NAME + "/CLKS003A.tar";
  private static String TAR_B_BASE = BASE_URL + YEAR_NAME + "/CLKS003B.tar";
  private static String SUBDIR = "!/CLKS003/";

  CIProperties tarHeader;

  /* for testing validation */
  private static Map<String, String> pubTitleMap;
  private static Map<String, String> dateMap;
  private static Map<String, String> accessUrlMap;
  private static Map<String, String> volMap;
  private static Map<String, String> issueMap;
  private static Map<String, List<String>> authorMap;

  static FileMetadataListExtractor els_mle;
  static FileMetadataListExtractor nocheck_mle;

  private static final String testDatasetFile = "testDataset.xml";
  private static final String realTARFile_A = "CLKS003A.tar";
  private static final String realTARFile_B = "CLKS003B.tar";

  public void setUp() throws Exception {
    super.setUp();

    tarHeader = new CIProperties();
    tarHeader.put(CachedUrl.PROPERTY_CONTENT_TYPE, "application/tar");
    tarAu = createTarAu();

    // for tests that also check for content
    els_mle =
        new FileMetadataListExtractor(
            new ElsevierDTD5XmlSourceMetadataExtractorFactory
                .ElsevierDTD5XmlSourceMetadataExtractor());
    // for tests that use a no-check-for-pdf version of the extractor
    nocheck_mle = new FileMetadataListExtractor(new TestElsevierDTD5MetadataExtractor());
    setUpExpectedTarContent();
  }

  protected ArchivalUnit createTarAu() throws ArchivalUnit.ConfigurationException {
    // in this directory this is file "test_elsevierdtd5.tdb" but it becomes xml
    try {
      ConfigurationUtil.addFromUrl(getResource("test_elsevierdtd5.xml"));
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    Tdb tdb = ConfigManager.getCurrentConfig().getTdb();
    TdbAu tdbau1 = tdb.getTdbAusLikeName("Elsevier Source Content 2014").get(0);
    assertNotNull("Didn't find named TdbAu", tdbau1);
    return PluginTestUtil.createAndStartAu(tdbau1);
  }

  /*
   * The tests to run for this class
   */

  public void testSimpleMainXML() throws Exception {
    log.debug3("testSimpleMainXML");
    String xml_url = TAR_A_BASE + SUBDIR + "01420615/v64sC/S0142061514004608/main.xml";
    List<ArticleMetadata> mdList =
        extractFromContent(xml_url, "text/xml", simpleMain, nocheck_mle, null);
    assertEquals(1, mdList.size());
    validateSingleMainMetadataRecord(mdList.get(0), "10.1016/j.jidx.2014.07.028", "article");
  }

  public void testSimpleDatasetXML() throws Exception {
    log.debug3("testSimpleDatasetXML");
    String file_input = StringUtil.fromInputStream(getResourceAsStream(testDatasetFile));
    String xml_url = TAR_A_BASE + SUBDIR + "dataset.xml";

    List<ArticleMetadata> mdList =
        extractFromContent(xml_url, "text/xml", file_input, nocheck_mle, null);
    assertEquals(6, mdList.size());
    Iterator<ArticleMetadata> mdIt = mdList.iterator();
    ArticleMetadata mdRecord = null;
    while (mdIt.hasNext()) {
      mdRecord = (ArticleMetadata) mdIt.next();
      validateDatasetMetadataRecord(mdRecord);
    }
  }

  public void testFunctionalFromTarHierarchy() throws Exception {
    log.debug3("in testFromTarHierarchy");
    // load the tarballs
    InputStream file_input = null;
    try {
      file_input = getResourceAsStream(realTARFile_A);
      // UrlCacher uc = au.makeUrlCacher(TAR_A_BASE);
      // uc.storeContent(file_input, tarHeader);
      UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

      file_input = getResourceAsStream(realTARFile_B);
      // uc = au.makeUrlCacher(TAR_B_BASE);
      // uc.storeContent(file_input, tarHeader);
      uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } finally {
      IOUtil.safeClose(file_input);
    }

    CachedUrlSet cus = tarAu.getAuCachedUrlSet();
    for (CachedUrl cu : cus.getCuIterable()) {
      log.debug3("AU - cu is: " + cu.getUrl());
      cu.release();
    }

    // We need to start from the level of the ArticleMetadataExtractor
    MyListEmitter emitter = new MyListEmitter();
    ArticleMetadataExtractor amEx =
        new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

    Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.debug3("Metadata test - articlefiles " + af.toString());
      // CachedUrl cu = af.getFullTextCu();
      CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
      log.debug3("metadata cu is " + cu.getUrl());
      // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu);
      amEx.extract(MetadataTarget.Any(), af, emitter);
      List<ArticleMetadata> returnList = emitter.getAmList();

      assertNotNull(returnList);
      log.debug3("size of returnList is " + returnList.size());
      Iterator<ArticleMetadata> mdIt = returnList.iterator();
      ArticleMetadata mdRecord = null;
      while (mdIt.hasNext()) {
        mdRecord = (ArticleMetadata) mdIt.next();
        validateCompleteMetadataRecord(mdRecord);
      }
    }
  }

  /*
   *  The supporting methods
   */
  private void setUpExpectedTarContent() {
    /* maps the DOIs in the metadata to the expected values */
    log.debug3("setUpExpectedTarContent");
    pubTitleMap = new HashMap<String, String>();
    {
      pubTitleMap.put("10.1016/j.jidx.2014.07.028", "International Journal of XXX");
      pubTitleMap.put("10.1016/j.jidx2.2014.05.013", "Revista");
      pubTitleMap.put("10.1016/S1473-1111(14)70840-0", "The Journal");
      pubTitleMap.put("10.1016/S0140-1111(14)61865-1", "The Other Journal");
      pubTitleMap.put("10.1016/j.foo.2014.08.001", "Foo");
      pubTitleMap.put("10.1016/j.foo.2014.08.123", "Foo");
    }
    ;

    dateMap = new HashMap<String, String>();
    {
      dateMap.put("10.1016/j.jidx.2014.07.028", "2014-07-30");
      dateMap.put("10.1016/j.jidx2.2014.05.013", "2014-07-09");
      dateMap.put("10.1016/S1473-1111(14)70840-0", "2014-09-01");
      dateMap.put("10.1016/S0140-1111(14)61865-1", "2014"); // will get from main.xml as backup
      dateMap.put("10.1016/j.foo.2014.08.001", "2014-08-20");
      dateMap.put("10.1016/j.foo.2014.08.123", "2014-08-20");
    }
    ;

    accessUrlMap = new HashMap<String, String>();
    {
      accessUrlMap.put(
          "10.1016/j.jidx.2014.07.028",
          TAR_A_BASE + SUBDIR + "01420615/v64sC/S0142061514004608/main.pdf");
      accessUrlMap.put(
          "10.1016/j.jidx2.2014.05.013",
          TAR_A_BASE + SUBDIR + "00349356/v61i9/S0034935614001819/main.pdf");
      accessUrlMap.put(
          "10.1016/S1473-1111(14)70840-0",
          TAR_A_BASE + SUBDIR + "14733099/v14i10/S1473309914708400/main.pdf");
      accessUrlMap.put(
          "10.1016/S0140-1111(14)61865-1",
          TAR_B_BASE + SUBDIR + "01406736/v384sS1/S0140673614618651/main.pdf");
      accessUrlMap.put(
          "10.1016/j.foo.2014.08.001",
          TAR_B_BASE + SUBDIR + "00191035/v242sC/S0019103514004151/main.pdf");
      accessUrlMap.put(
          "10.1016/j.foo.2014.08.123",
          TAR_B_BASE + SUBDIR + "00191035/v242sC/S0019103514003856/main.pdf");
    }
    ;

    ArrayList<String> goodAuthors = new ArrayList<String>();
    {
      goodAuthors.add("Writer, Bob");
      goodAuthors.add("Q. Text, Samantha");
    }
    ArrayList<String> simpleAuthors = new ArrayList<String>();
    {
      simpleAuthors.add("Simple, Josh");
    }
    ArrayList<String> extendedAuthors = new ArrayList<String>();
    {
      extendedAuthors.add("Writer, Bob");
      extendedAuthors.add("Q. Text, Samantha");
      extendedAuthors.add("The COLLABORATIVE Investigators");
    }

    authorMap = new HashMap<String, List<String>>();
    {
      authorMap.put("10.1016/j.jidx.2014.07.028", goodAuthors);
      authorMap.put("10.1016/j.jidx2.2014.05.013", goodAuthors);
      authorMap.put("10.1016/S1473-1111(14)70840-0", extendedAuthors);
      authorMap.put("10.1016/S0140-1111(14)61865-1", simpleAuthors);
      authorMap.put("10.1016/j.foo.2014.08.001", goodAuthors);
      authorMap.put("10.1016/j.foo.2014.08.123", goodAuthors);
    }
    ;

    volMap = new HashMap<String, String>();
    {
      volMap.put("10.1016/j.jidx.2014.07.028", "64");
      volMap.put("10.1016/j.jidx2.2014.05.013", "61");
      volMap.put("10.1016/S1473-1111(14)70840-0", "14");
      volMap.put("10.1016/S0140-1111(14)61865-1", "384");
      volMap.put("10.1016/j.foo.2014.08.001", "242");
      volMap.put("10.1016/j.foo.2014.08.123", "242");
    }
    ;

    issueMap = new HashMap<String, String>();
    {
      issueMap.put("10.1016/j.jidx.2014.07.028", "C");
      issueMap.put("10.1016/j.jidx2.2014.05.013", "9");
      issueMap.put("10.1016/S1473-1111(14)70840-0", "10");
      issueMap.put("10.1016/S0140-1111(14)61865-1", "S1");
      issueMap.put("10.1016/j.foo.2014.08.001", "C");
      issueMap.put("10.1016/j.foo.2014.08.123", "C");
    }
    ;
  }

  private String common_issn = "1111-1111";
  private String common_article_title = "Article about Important Things";
  // private String common_simple_article_title = "Simple Article Title for Update";
  private String common_simple_article_title = "Newsdesk Simple Dochead";

  /*
   * When testing a complete extraction out of the tarset, the MD record will be completely filled in
   * and pdf-existence will get established
   */
  private void validateCompleteMetadataRecord(ArticleMetadata am) {
    log.debug3("valideCompleteMetadatRecord");
    String doi_val = am.get(MetadataField.FIELD_DOI);
    /* make sure we can pick up both types of xml article data */
    log.debug3("doi val is: " + doi_val);

    if ("JA 5.2.0 SIMPLE-ARTICLE"
        .equals(am.getRaw(ElsevierDatasetXmlSchemaHelper.dataset_dtd_metadata))) {
      log.debug3("simple-article");
      assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    } else {
      assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    }
    assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN));
    assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE));
    assertEquals(accessUrlMap.get(doi_val), am.get(MetadataField.FIELD_ACCESS_URL));
    assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
    assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
    assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals("Elsevier", am.get(MetadataField.FIELD_PROVIDER));
    assertEquals("Elsevier", am.get(MetadataField.FIELD_PUBLISHER));
    log.debug3(am.ppString(2));
  }

  /*
   * When testing no-pdf-check basic XML parsing, you will get partial MD records
   * depending on whether the info comes from dataset.xml or from main.xml
   */
  private void validateDatasetMetadataRecord(ArticleMetadata am) {
    log.debug3("valideDatasetMetadatRecord");
    String doi_val = am.get(MetadataField.FIELD_DOI);
    assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN));

    log.debug3("doi val is: " + doi_val);
    // The dataset doesn't set this value, it'll fail over the main.xml value
    if (doi_val.equals("10.1016/S0140-1111(14)61865-1")) {
      assertEquals(null, am.get(MetadataField.FIELD_DATE));
    } else {
      assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE));
    }
    assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE));
  }

  /*
   * You will have to tell it the DOI and the schema because those normally come from dataset
   */
  private void validateSingleMainMetadataRecord(ArticleMetadata am, String doi_val, String schema) {
    log.debug3("valideSingleMainMetadatRecord");
    if ("simple-article".equals(schema)) {
      assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    } else {
      assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE));
    }

    log.debug3("doi val is: " + doi_val);
    assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR));
    assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME));
    assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE));
    assertEquals("Comment", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_dochead));
    assertEquals(doi_val, am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_doi));
    assertEquals("2014", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_copyright));
  }

  private static final String simpleMain =
      "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
          + "<!DOCTYPE article PUBLIC \"-//ES//DTD journal article DTD version 5.2.0//EN//XML\" \"art520.dtd\">"
          + "<article docsubtype=\"fla\" xml:lang=\"en\">"
          + "<item-info><jid>TEST</jid>"
          + "<aid>9906</aid>"
          + "<ce:article-number>e09906</ce:article-number>"
          + "<ce:pii>S9999-9994(15)00010-0</ce:pii>"
          + "<ce:doi>10.1016/j.jidx.2014.07.028</ce:doi>"
          + "<ce:copyright type=\"full-transfer\" year=\"2014\">Elsevier GmbH</ce:copyright>"
          + "</item-info>"
          + "<head>"
          + "<ce:dochead id=\"cedoch10\"><ce:textfn>Comment</ce:textfn></ce:dochead>"
          + "<ce:title id=\"tm005\">Article about Important Things</ce:title>"
          + "<ce:author-group id=\"ag005\">"
          + "<ce:author id=\"au005\">"
          + "<ce:given-name>Bob</ce:given-name><ce:surname>Writer</ce:surname>"
          + "<ce:cross-ref id=\"ar005\" refid=\"af005\"><ce:sup>a</ce:sup></ce:cross-ref>"
          + "<ce:cross-ref id=\"ar010\" refid=\"cor1\"><ce:sup>⁎</ce:sup></ce:cross-ref>"
          + "<ce:e-address id=\"em005\" type=\"email\">[email protected]</ce:e-address>"
          + "</ce:author>"
          + "<ce:author id=\"au001\">"
          + "<ce:given-name>Samantha</ce:given-name><ce:surname>Q. Text</ce:surname>"
          + "<ce:cross-ref id=\"ar001\" refid=\"af001\"><ce:sup>a</ce:sup></ce:cross-ref>"
          + "<ce:cross-ref id=\"ar010\" refid=\"cor1\"><ce:sup>⁎</ce:sup></ce:cross-ref>"
          + "<ce:e-address id=\"em005\" type=\"email\">[email protected]</ce:e-address>"
          + "</ce:author>"
          + "</ce:author-group>"
          + "<ce:date-received day=\"1\" month=\"1\" year=\"2014\"/>"
          + "<ce:date-revised day=\"26\" month=\"7\" year=\"2014\"/>"
          + "<ce:date-accepted day=\"3\" month=\"8\" year=\"2014\"/>"
          + "<ce:abstract class=\"author\" xml:lang=\"en\" id=\"ab005\"><ce:section-title id=\"st050\">Abstract</ce:section-title>"
          + "<ce:abstract-sec id=\"as005\"><ce:simple-para id=\"sp005\">Abstract goes here.</ce:simple-para></ce:abstract-sec>"
          + "</ce:abstract>"
          + "</head>"
          + "<body>"
          + "</body>"
          + "<tail>"
          + "</tail>"
          + "</article>";

  static class MyListEmitter implements ArticleMetadataExtractor.Emitter {

    List<ArticleMetadata> amlst = new ArrayList<ArticleMetadata>();

    public void emitMetadata(ArticleFiles af, ArticleMetadata md) {
      if (log.isDebug3()) log.debug3("emit(" + af + ", " + md + ")");
      if (md != null) {
        log.debug3("add " + md + " to amlist");
        amlst.add(md);
      }
      ;
    }

    public List<ArticleMetadata> getAmList() {
      return amlst;
    }
  }

  /*
   *  A test version of the extractor that allows for suppression of the file check
   *  this allows for basic XML parsing tests without having to provide the actual file content
   */
  public class TestElsevierDTD5MetadataExtractor
      extends ElsevierDTD5XmlSourceMetadataExtractorFactory.ElsevierDTD5XmlSourceMetadataExtractor {
    //
    // Override implementation of getFilenamesAssociatedWithRecord to force
    // emit for testing purposes - while allowing use of Elsevier extractor.
    // If a null list is returned, preEmitCheck returns "true"
    // allowing emit.
    //
    protected ArrayList<String> getFilenamesAssociatedWithRecord(
        SourceXmlSchemaHelper helper, CachedUrl cu, ArticleMetadata oneAM) {
      return null;
    }
  }
}
Esempio n. 15
0
/** Implements the "get status table" command */
public class AddAuConfigure extends AuActivityBase {

  private static String NAME = "AddAuConfigure";
  private static Logger log = Logger.getLogger(NAME);

  public AddAuConfigure() {
    super();
  }

  /**
   * Populate the response body
   *
   * @return true on success
   */
  public boolean doRemoteSetupAndVerification() throws IOException {

    /*
     * Stop if any required parameters are missing (error)
     */
    if (!verifyMinimumParameters()) {
      throw new ResponseException("Missing required parameters");
    }
    /*
     * Initial page setup
     */
    return commandSetup();
  }

  /**
   * Populate the response body
   *
   * @return true on success
   */
  public boolean doCommand() throws IOException {
    Element infoElement;

    /*
     * Return disk space
     */
    infoElement = getXmlUtils().createElement(getResponseRoot(), AP_E_INFO);
    renderDiskXml(infoElement);

    /*
     * No further action if this isn't a create command (success)
     */
    if (!isCreateCommand()) {
      return true;
    }
    /*
     * Stop if any required parameters are missing (error)
     */
    if (!verifyTarget() || !verifyMinimumParameters() || !verifyDefiningParameters()) {
      throw new ResponseException("Missing required parameters");
    }
    /*
     * Create the AU
     */
    if (!commandSetup()) {
      return false;
    }
    return createAu();
  }

  /*
   * "Helpers"
   */

  /**
   * Did the client provide the minimal parameters required?
   *
   * @return true If so
   */
  private boolean verifyMinimumParameters() {
    int count = 0;

    if (!StringUtil.isNullString(getParameter(AP_E_PUBLICATION))) count++;
    if (!StringUtil.isNullString(getParameter(AP_E_CLASSNAME))) count++;
    if (!StringUtil.isNullString(getParameter(AP_E_PLUGIN))) count++;

    return (count > 0);
  }

  /**
   * A target system is required to create an AU - was it provided?
   *
   * @return true If at least one target was specified
   */
  private boolean verifyTarget() {
    if (!isCreateCommand()) {
      return true;
    }

    return !StringUtil.isNullString(getParameter(AP_E_TARGET));
  }

  /**
   * Are all of the "defining parameters" required to create an AU available?
   *
   * @return true If so
   */
  private boolean verifyDefiningParameters() {
    KeyedList parameters;
    int size;

    if (!isCreateCommand()) {
      return true;
    }

    parameters = ParseUtils.getDynamicFields(getXmlUtils(), getRequestDocument(), AP_MD_AUDEFINING);
    size = parameters.size();

    for (int i = 0; i < size; i++) {
      if (StringUtil.isNullString((String) parameters.getValue(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * "Create" command?
   *
   * @return true If so...
   */
  private boolean isCreateCommand() {
    return "create".equalsIgnoreCase(getParameter(AP_E_ACTION));
  }

  /** Query the daemon for information required to set up this command */
  private boolean commandSetup() {

    Configuration configuration = null;
    Collection noEditKeys = null;
    String key;
    String value;

    /*
     * Configure a well known publication?
     */
    if ((value = getParameter(AP_E_PUBLICATION)) != null) {
      PluginProxy plugin = getTitlePlugin(value);

      /*
       * Set plugin and Title configuration information
       */
      if (plugin == null) {
        String message = "Unknown Publication:" + value;

        log.warning(message);
        return error(message);
      }

      setPlugin(plugin);
      setTitleConfig(plugin.getTitleConfig(value));

      configuration = getTitleConfig().getConfig();
      noEditKeys = getNoEditKeys();

    } else {
      /*
       * Lookup by Plugin or Class name - set the plugin
       *
       * NB: As of 23-Feb-04, this is not supported from AddAuPage.java.  See
       *     AddAuWithCompleteFunctionalityPage.java for full support.
       */
      if ((value = getParameter(AP_E_PLUGIN)) != null) {
        key = RemoteApi.pluginKeyFromId(value);

      } else if ((value = getParameter(AP_E_CLASSNAME)) != null) {
        key = RemoteApi.pluginKeyFromId(value);

      } else {
        return error("Supply a Publication, Plugin, or Class name");
      }

      if (StringUtil.isNullString(key)) {
        return error("Supply a valid Publication, Plugin, or Class name");
      }

      if (!pluginLoaded(key)) {
        return error("Plugin is not loaded: " + key);
      }

      setPlugin(getPluginProxy(key));
    }

    /*
     * Finally, return an XML rendition of the Plugin and AU key set up
     */
    generateSetupXml(configuration, noEditKeys);
    return true;
  }

  /**
   * Create an Archival Unit
   *
   * @return true If successful
   */
  private boolean createAu() {

    Configuration config = getAuConfigFromForm();

    AuProxy au;
    Element element;

    try {
      au = getRemoteApi().createAndSaveAuConfiguration(getPlugin(), config);

    } catch (ArchivalUnit.ConfigurationException exception) {
      return error("Configuration failed: " + exception.getMessage());

    } catch (IOException exception) {
      return error("Unable to save configuration: " + exception.getMessage());
    }
    /*
     * Successful creation - add the AU name and ID to the response document
     */
    element = getXmlUtils().createElement(getResponseRoot(), AP_E_AU);
    XmlUtils.addText(element, au.getName());

    element = getXmlUtils().createElement(getResponseRoot(), AP_E_AUID);
    XmlUtils.addText(element, au.getAuId());

    return true;
  }
}
public class TestHighWireArticleIteratorFactory extends ArticleIteratorTestCase {
  static Logger log = Logger.getLogger(TestHighWireArticleIteratorFactory.class);

  private SimulatedArchivalUnit sau; // Simulated AU to generate content

  private static String PLUGIN_NAME = "org.lockss.plugin.highwire.HighWirePressPlugin";

  private static String BASE_URL = "http://pediatrics.aappublications.org/";
  private static String SIM_ROOT = BASE_URL + "cgi/reprint/";

  public void setUp() throws Exception {
    super.setUp();
    String tempDirPath = setUpDiskSpace();
    au = createAu();
    sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath));
  }

  public void tearDown() throws Exception {
    sau.deleteContentTree();
    // theDaemon.stopDaemon();
    super.tearDown();
  }

  Configuration simAuConfig(String rootPath) {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("root", rootPath);
    conf.put("base_url", SIM_ROOT);
    conf.put("depth", "0");
    conf.put("branch", "0");
    conf.put("numFiles", "2");
    conf.put(
        "fileTypes",
        "" + (SimulatedContentGenerator.FILE_TYPE_PDF | SimulatedContentGenerator.FILE_TYPE_HTML));
    conf.put("binFileSize", "7");
    return conf;
  }

  protected ArchivalUnit createAu() throws ArchivalUnit.ConfigurationException {
    return PluginTestUtil.createAndStartAu(
        PLUGIN_NAME,
        ConfigurationUtil.fromArgs(
            "base_url",
            "http://pediatrics.aappublications.org/",
            "volume_name",
            "52",
            "journal_issn",
            "1098-4275"));
  }

  public void testRoots() throws Exception {
    SubTreeArticleIterator artIter = createSubTreeIter();
    System.out.println("Root Urls::" + getRootUrls(artIter));
    assertEquals(
        ListUtil.list(
            "http://pediatrics.aappublications.org/cgi/content/full/52/",
            "http://pediatrics.aappublications.org/cgi/reprint/52/"),
        getRootUrls(artIter));
  }

  public void testUrlsWithPrefixes() throws Exception {
    SubTreeArticleIterator artIter = createSubTreeIter();
    Pattern pat = getPattern(artIter);
    assertMatchesRE(
        pat, "http://pediatrics.aappublications.org/cgi/reprint/foo;52/Supplement_3/S69.pdf");
    assertMatchesRE(
        pat, "http://pediatrics.aappublications.org/cgi/reprint/52/supplement_3/S69.pdf");
    assertNotMatchesRE(
        pat, "http://pediatrics.aappublications.org/cgi/reprin/1014174823t49006/j0143.pdfwrong");
    assertNotMatchesRE(
        pat, "http://pediatrics.aappublications.org/cgi/reprintt/1014174823t49006/j0143.pdfwrong");
    assertNotMatchesRE(pat, "http://www.example.com/content/");
    assertNotMatchesRE(pat, "http://www.example.com/content/j");
    assertNotMatchesRE(pat, "http://www.example.com/content/j0123/j383.pdfwrong");
  }

  public void testCreateArticleFiles() throws Exception {
    PluginTestUtil.crawlSimAu(sau);

    String pat0 = "001file[.]html";
    String rep0 = "52/1/S1";
    PluginTestUtil.copyAu(sau, au, ".*[.]html$", pat0, rep0);
    String pat1 = "001file[.]pdf";
    String rep1 = "52/1/S1.pdf";
    PluginTestUtil.copyAu(sau, au, ".*[.]pdf$", pat1, rep1);

    String pdfurl = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1.pdf";
    String url = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1";

    au.makeCachedUrl(url);
    CachedUrl cu = au.makeCachedUrl(pdfurl);
    assertNotNull(cu);
    SubTreeArticleIterator artIter = createSubTreeIter();
    assertNotNull(artIter);
    ArticleFiles af = artIter.next();
    assertNotNull(af);
    System.out.println("article files::" + af);
    assertEquals(url, af.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE).getUrl());
    assertEquals(pdfurl, af.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF).getUrl());
  }
}
Esempio n. 17
0
/**
 * @author Thomas S. Robertson
 * @version 0.0
 */
public class PluginUtil {
  static Logger log = Logger.getLogger("PluginUtil");

  static final String PREFIX = Configuration.PREFIX + "PluginUtil.";

  /**
   * If true, the logic in getBaseUrl() that is responsible for turning the url <code>foo</code>
   * into the base url <code>foo/</code> (if that is the name the page was actually collected under)
   * will do that only if the url doesn't end with slash, and the nodeUrl in the props does. This
   * matches the logic in LockssResourceHandler.handleLockssRedirect(). But it's expensive, and I
   * don't think the nodeUrl prop should ever be different from the url in any other situation, so I
   * don't think it's necessary.
   */
  public static final String PARAM_DIR_NODE_CHECK_SLASH = PREFIX + "dirNodeCheckSlash";

  public static final boolean DEFAULT_DIR_NODE_CHECK_SLASH = false;

  private static boolean dirNodeCheckSlash = DEFAULT_DIR_NODE_CHECK_SLASH;

  /** Called by org.lockss.config.MiscConfig */
  public static void setConfig(
      Configuration config, Configuration oldConfig, Configuration.Differences diffs) {
    if (diffs.contains(PREFIX)) {
      dirNodeCheckSlash =
          config.getBoolean(PARAM_DIR_NODE_CHECK_SLASH, DEFAULT_DIR_NODE_CHECK_SLASH);
    }
  }

  /**
   * Returns the base url of the provided CachedUrl, checking to see if it's the result of a
   * redirect.
   */
  public static String getBaseUrl(CachedUrl cu) {
    // See the comments in LockssResourceHandler.handleLockssRedirect();
    // this is the same logic.
    CIProperties props = cu.getProperties();
    if (props != null) {
      String redir = props.getProperty(CachedUrl.PROPERTY_CONTENT_URL);
      if (redir != null) {
        return redir;
      } else {
        String url = cu.getUrl();
        String nodeUrl = props.getProperty(CachedUrl.PROPERTY_NODE_URL);
        if (nodeUrl != null && !nodeUrl.equals(url)) {
          log.debug("getBaseUrl(" + url + "), nodeUrl: " + nodeUrl);
          if (dirNodeCheckSlash) {
            URI uri = new URI(url);
            if (!uri.getPath().endsWith("/")) {
              URI nodeUri = new URI(nodeUrl);
              if (nodeUri.getPath().endsWith("/")) {
                return nodeUrl;
              }
            }
          } else {
            return nodeUrl;
          }
        }
      }
    }
    return cu.getUrl();
  }
}
Esempio n. 18
0
/**
 * An Exploder that ingests Internet Archive WARC files, and behaves as if it had ingested each file
 * in the WARC file directly from its original source.
 *
 * @author David S. H. Rosenthal
 * @author Felix Ostrowski
 * @version 0.0
 */
public class WarcExploder extends Exploder {

  private static Logger logger = Logger.getLogger("WarcExploder");
  protected InputStream arcStream;
  protected CIProperties arcProps;

  /**
   * Constructor
   *
   * @param uc UrlCacher for the archive
   * @param maxRetries
   * @param crawlSpec the CrawlSpec for the crawl that foudn the archive
   * @param crawler the crawler that found the archive
   * @param explode true to explode the archives
   * @param store true to store the archive as well
   */
  public WarcExploder(FetchedUrlData toExplode, CrawlerFacade crawlFacade, ExploderHelper helper) {
    super(toExplode, crawlFacade, helper);
    arcStream = toExplode.input;
    arcProps = toExplode.headers;
  }

  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    CachedUrl cachedUrl = null;
    int goodEntries = 0;
    int badEntries = 0;
    int ignoredEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.info(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      if (storeArchive) {
        UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
        BitSet bs = new BitSet();
        bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
        uc.setFetchFlags(bs);
        uc.storeContent();
        archiveData.resetInputStream();
        arcStream = archiveData.input;
      }
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      ArchivalUnit au = crawlFacade.getAu();
      Set stemSet = new HashSet();
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator i = arcReader.iterator();
      // Skip first record
      for (i.next(); i.hasNext(); ) {
        // XXX probably not necessary
        helper.pokeWDog();
        if ((++entriesBetweenSleep % sleepAfter) == 0) {
          long pauseTime =
              CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
          Deadline pause = Deadline.in(pauseTime);
          logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
          while (!pause.expired()) {
            try {
              pause.sleep();
            } catch (InterruptedException ie) {
              // no action
            }
          }
        }
        ArchiveRecord element = (ArchiveRecord) i.next();
        // Each element is a URL to be cached in a suitable AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        if (elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  0, // XXX need to convert getDate string to long
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            } else {
              ignoredEntries++;
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null)
        try {
          arcReader.close();
          arcReader = null;
        } catch (IOException ex) {
          throw new CacheException.ExploderException(ex);
        }
      if (cachedUrl != null) {
        cachedUrl.release();
      }
      IOUtil.safeClose(arcStream);
    }
    if (badEntries == 0 && goodEntries > 0) {
      // Make it look like a new crawl finished on each AU to which
      // URLs were added.
      for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
        ArchivalUnit au = (ArchivalUnit) it.next();
        logger.debug3(archiveUrl + " touching " + au.toString());
        AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
      }
    } else {
      ArchivalUnit au = crawlFacade.getAu();
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }

  protected CIProperties makeCIProperties(ArchiveRecordHeader elementHeader) throws IOException {
    CIProperties ret = new CIProperties();
    Set elementHeaderFieldKeys = elementHeader.getHeaderFieldKeys();
    for (Iterator i = elementHeaderFieldKeys.iterator(); i.hasNext(); ) {
      String key = (String) i.next();
      try {

        Object valueObject = elementHeader.getHeaderValue(key);

        if (valueObject == null) {
          logger.warning("Ignoring null value for key '" + key + "'.");
        } else {
          String value = valueObject.toString();
          logger.debug3(key + ": " + value);
          ret.put(key, value);
        }

      } catch (ClassCastException ex) {
        logger.error("makeCIProperties: " + key + " threw ", ex);
        throw new CacheException.ExploderException(ex);
      }
    }
    return (ret);
  }

  protected ArchiveReader wrapStream(String url, InputStream arcStream) throws IOException {
    ArchiveReader ret = null;
    logger.debug3("Getting an ArchiveReader");
    ret = ArchiveReaderFactory.get(url, arcStream, true);
    return (ret);
  }
}