/** Create LockssKeystores from config subtree below {@link #PARAM_KEYSTORE} */
  void configureKeyStores(Configuration config) {
    Configuration allKs = config.getConfigTree(PARAM_KEYSTORE);
    for (Iterator iter = allKs.nodeIterator(); iter.hasNext(); ) {
      String id = (String) iter.next();
      Configuration oneKs = allKs.getConfigTree(id);
      try {
        LockssKeyStore lk = createLockssKeyStore(oneKs);
        String name = lk.getName();
        if (name == null) {
          log.error("KeyStore definition missing name: " + oneKs);
          continue;
        }
        LockssKeyStore old = keystoreMap.get(name);
        if (old != null && !lk.equals(old)) {
          log.warning(
              "Keystore "
                  + name
                  + " redefined.  "
                  + "New definition may not take effect until daemon restart");
        }

        log.debug("Adding keystore " + name);
        keystoreMap.put(name, lk);

      } catch (Exception e) {
        log.error("Couldn't create keystore: " + oneKs, e);
      }
    }
  }
  /**
   * Return a TitleConfig for the AU. Returns matching entry from the title db if found, else
   * creates one
   */
  TitleConfig titleConfigFromAu(InactiveAuProxy au) {
    PluginProxy plugin = au.getPlugin();
    String auname = au.getName();
    Configuration auConfig = au.getConfiguration();
    TitleConfig tc = AuUtil.findTitleConfig(auConfig, plugin.getPlugin());
    if (tc != null) {
      return tc;
    }

    tc = new TitleConfig(auname, plugin.getPluginId());
    ArrayList<ConfigParamAssignment> params = new ArrayList();
    for (Iterator iter = auConfig.keyIterator(); iter.hasNext(); ) {
      String key = (String) iter.next();
      if (!ConfigParamDescr.isReservedParam(key)) {
        String val = auConfig.get(key);
        ConfigParamDescr descr = plugin.findAuConfigDescr(key);
        if (descr != null) {
          ConfigParamAssignment cpa = new ConfigParamAssignment(descr, val);
          params.add(cpa);
        } else {
          log.warning("Unknown parameter key: " + key + " in au: " + auname);
        }
      }
    }
    params.trimToSize();
    tc.setParams(params);
    return tc;
  }
 protected void checkLeaf(SimulatedArchivalUnit sau) {
   log.debug("checkLeaf()");
   String parent = sau.getUrlRoot() + "/branch1";
   CachedUrlSetSpec spec = new RangeCachedUrlSetSpec(parent);
   CachedUrlSet set = sau.makeCachedUrlSet(spec);
   Iterator setIt = set.contentHashIterator();
   ArrayList childL = new ArrayList(16);
   while (setIt.hasNext()) {
     childL.add(((CachedUrlSetNode) setIt.next()).getUrl());
   }
   String[] expectedA =
       new String[] {
         parent,
         parent + "/001file.html",
         parent + "/001file.txt",
         parent + "/002file.html",
         parent + "/002file.txt",
         parent + "/branch1",
         parent + "/branch1/001file.html",
         parent + "/branch1/001file.txt",
         parent + "/branch1/002file.html",
         parent + "/branch1/002file.txt",
         parent + "/branch1/index.html",
         parent + "/branch2",
         parent + "/branch2/001file.html",
         parent + "/branch2/001file.txt",
         parent + "/branch2/002file.html",
         parent + "/branch2/002file.txt",
         parent + "/branch2/index.html",
         parent + "/index.html",
       };
   assertIsomorphic(expectedA, childL);
 }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractGoodRisContent() throws Exception {
    String goodContent = createGoodRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME));
    assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE));
    assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE));
    assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE));
    assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN));
    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));

    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
    assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI));
    // This shouldn't get set. It will default later to fuill_text_cu
    assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL));
  }
Example #5
0
  private V3LcapMessage makeTestVoteMessage(Collection voteBlocks) throws IOException {
    mPollMgr.setStateDir("key", tempDir);
    V3LcapMessage msg =
        new V3LcapMessage(
            "ArchivalID_2",
            "key",
            "Plug42",
            m_testBytes,
            m_testBytes,
            V3LcapMessage.MSG_VOTE,
            987654321,
            m_testID,
            tempDir,
            theDaemon);

    // Set msg vote blocks.
    for (Iterator ix = voteBlocks.iterator(); ix.hasNext(); ) {
      msg.addVoteBlock((VoteBlock) ix.next());
    }

    msg.setHashAlgorithm(LcapMessage.getDefaultHashAlgorithm());
    msg.setArchivalId(m_archivalID);
    msg.setPluginVersion("PlugVer42");
    return msg;
  }
  protected void checkRoot(SimulatedArchivalUnit sau) {
    log.debug("checkRoot()");
    CachedUrlSet set = sau.getAuCachedUrlSet();
    Iterator setIt = set.flatSetIterator();
    ArrayList childL = new ArrayList(1);
    CachedUrlSet cus = null;
    while (setIt.hasNext()) {
      cus = (CachedUrlSet) setIt.next();
      childL.add(cus.getUrl());
    }

    String urlRoot = sau.getUrlRoot();

    String[] expectedA = new String[1];
    expectedA[0] = urlRoot;
    assertIsomorphic(expectedA, childL);

    setIt = cus.flatSetIterator();
    childL = new ArrayList(7);
    while (setIt.hasNext()) {
      childL.add(((CachedUrlSetNode) setIt.next()).getUrl());
    }

    expectedA =
        new String[] {
          urlRoot + "/001file.html",
          urlRoot + "/001file.txt",
          urlRoot + "/002file.html",
          urlRoot + "/002file.txt",
          urlRoot + "/branch1",
          urlRoot + "/branch2",
          urlRoot + "/index.html"
        };
    assertIsomorphic(expectedA, childL);
  }
 /**
  * Return the titles in the set.
  *
  * @return a collection of TitleConfig
  */
 public Collection<TitleConfig> getTitles() {
   Collection aus = daemon.getRemoteApi().getInactiveAus();
   ArrayList<TitleConfig> res = new ArrayList<TitleConfig>(aus.size());
   for (Iterator iter = aus.iterator(); iter.hasNext(); ) {
     InactiveAuProxy aup = (InactiveAuProxy) iter.next();
     res.add(titleConfigFromAu(aup));
   }
   res.trimToSize();
   return res;
 }
 List<String> auUrls(ArchivalUnit au) {
   List<String> res = new ArrayList<String>();
   for (Iterator iter = au.getAuCachedUrlSet().contentHashIterator(); iter.hasNext(); ) {
     CachedUrlSetNode cusn = (CachedUrlSetNode) iter.next();
     if (cusn.hasContent()) {
       res.add(cusn.getUrl());
     }
   }
   return res;
 }
 /** Concatenate params for URL string */
 String concatParams(Properties props) {
   if (props == null) {
     return null;
   }
   java.util.List list = new ArrayList();
   for (Iterator iter = props.keySet().iterator(); iter.hasNext(); ) {
     String key = (String) iter.next();
     String val = props.getProperty(key);
     if (!StringUtil.isNullString(val)) {
       list.add(key + "=" + urlEncode(val));
     }
   }
   return StringUtil.separatedString(list, "&");
 }
 public List findExistingRepositoriesFor(String auid) {
   List res = null;
   for (Iterator iter = getRepositoryList().iterator(); iter.hasNext(); ) {
     String repoName = (String) iter.next();
     String path = LockssRepositoryImpl.getLocalRepositoryPath(repoName);
     if (LockssRepositoryImpl.doesAuDirExist(auid, path)) {
       if (res == null) {
         res = new ArrayList();
       }
       res.add(repoName);
     }
   }
   return res == null ? Collections.EMPTY_LIST : res;
 }
  public void testSimpleDatasetXML() throws Exception {
    log.debug3("testSimpleDatasetXML");
    String file_input = StringUtil.fromInputStream(getResourceAsStream(testDatasetFile));
    String xml_url = TAR_A_BASE + SUBDIR + "dataset.xml";

    List<ArticleMetadata> mdList =
        extractFromContent(xml_url, "text/xml", file_input, nocheck_mle, null);
    assertEquals(6, mdList.size());
    Iterator<ArticleMetadata> mdIt = mdList.iterator();
    ArticleMetadata mdRecord = null;
    while (mdIt.hasNext()) {
      mdRecord = (ArticleMetadata) mdIt.next();
      validateDatasetMetadataRecord(mdRecord);
    }
  }
 /**
  * Checks the consistency of the node, and continues with its children if it's consistent.
  *
  * @param node RepositoryNodeImpl the node to check
  */
 private void recurseConsistencyCheck(RepositoryNodeImpl node) {
   logger.debug2("Checking node '" + node.getNodeUrl() + "'...");
   // check consistency at each node
   // correct/deactivate as necessary
   // 'checkNodeConsistency()' will repair if possible
   if (node.checkNodeConsistency()) {
     logger.debug3("Node consistent; recursing on children...");
     List children = node.getNodeList(null, false);
     Iterator iter = children.iterator();
     while (iter.hasNext()) {
       RepositoryNodeImpl child = (RepositoryNodeImpl) iter.next();
       recurseConsistencyCheck(child);
     }
   } else {
     logger.debug3("Node inconsistent; deactivating...");
     deactivateInconsistentNode(node);
   }
 }
  /**
   * Method that creates a simulated Cached URL from the source code provided by the goodContent
   * String. It then asserts that the metadata extracted, by using the
   * MetaPressRisMetadataExtractorFactory, match the metadata in the source code.
   *
   * @throws Exception
   */
  public void testExtractAlternateRisContent() throws Exception {
    String goodContent = createAlternateRisContent();
    log.debug3(goodContent);

    List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false);
    assertNotEmpty(mdlist);
    ArticleMetadata md = mdlist.get(0);
    assertNotNull(md);

    Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator();
    for (String expAuth : goodAuthors) {
      assertEquals(expAuth, actAuthIter.next());
    }
    assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE));
    assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE));
    assertEquals(goodDate, md.get(MetadataField.FIELD_DATE));
    assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER));
  }
Example #14
0
  public void testRequestMessageCreation() throws Exception {
    V3LcapMessage reqMsg =
        new V3LcapMessage(
            "ArchivalID_2",
            "key",
            "Plug42",
            m_testBytes,
            m_testBytes,
            V3LcapMessage.MSG_REPAIR_REQ,
            987654321,
            m_testID,
            tempDir,
            theDaemon);
    reqMsg.setTargetUrl("http://foo.com/");

    for (Iterator ix = m_testVoteBlocks.iterator(); ix.hasNext(); ) {
      reqMsg.addVoteBlock((VoteBlock) ix.next());
    }

    assertEquals(3, reqMsg.getProtocolVersion());
    assertEquals("Plug42", reqMsg.getPluginVersion());
    assertTrue(m_testID == reqMsg.getOriginatorId());
    assertEquals(V3LcapMessage.MSG_REPAIR_REQ, reqMsg.getOpcode());
    assertEquals("ArchivalID_2", reqMsg.getArchivalId());
    assertEquals("http://foo.com/", reqMsg.getTargetUrl());
    assertEquals(m_testBytes, reqMsg.getPollerNonce());
    assertEquals(m_testBytes, reqMsg.getVoterNonce());
    assertEquals(null, reqMsg.getVoterNonce2());
    List aBlocks = new ArrayList();
    List bBlocks = new ArrayList();
    for (VoteBlocksIterator iter = m_testMsg.getVoteBlockIterator(); iter.hasNext(); ) {
      aBlocks.add(iter.next());
    }
    for (VoteBlocksIterator iter = reqMsg.getVoteBlockIterator(); iter.hasNext(); ) {
      bBlocks.add(iter.next());
    }
    assertEquals(aBlocks, bBlocks);

    // Actual size of test vote blocks is unpredictable
    assertTrue(reqMsg.getEstimatedEncodedLength() > V3LcapMessage.EST_ENCODED_HEADER_LENGTH);
  }
Example #15
0
  private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) {
    int fetchCount = fetched.size();
    outputMessage(
        "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth,
        TEST_SUMMARY_MESSAGE);
    outputMessage(
        "\nUrls fetched: " + fetchCount + "    Urls extracted: " + m_extracted.size(),
        PLAIN_MESSAGE);

    outputMessage("\nDepth  Fetched  Parsed  New URLs", PLAIN_MESSAGE);
    for (int depth = 1; depth <= m_crawlDepth; depth++) {
      PrintfFormat pf = new PrintfFormat("%5d  %7d  %6d  %8d");
      Integer[] args =
          new Integer[] {
            new Integer(depth),
            new Integer(depth_fetched[depth - 1]),
            new Integer(depth_parsed[depth - 1]),
            new Integer(depth_incl[depth - 1]),
          };
      String s = pf.sprintf(args);
      outputMessage(s, PLAIN_MESSAGE);
    }

    outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE);
    if (false) {
      for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) {
        String url = (String) iter.next();
        outputMessage(url, PLAIN_MESSAGE);
      }
    }
    long secs = elapsedTime / Constants.SECOND;
    long fetchRate = 0;
    if (secs > 0) {
      fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime;
    }
    outputMessage(
        "\nElapsed Time: " + secs + " secs." + "    Fetch Rate: " + fetchRate + " p/m",
        PLAIN_MESSAGE);
  }
  public void testFunctionalFromTarHierarchy() throws Exception {
    log.debug3("in testFromTarHierarchy");
    // load the tarballs
    InputStream file_input = null;
    try {
      file_input = getResourceAsStream(realTARFile_A);
      // UrlCacher uc = au.makeUrlCacher(TAR_A_BASE);
      // uc.storeContent(file_input, tarHeader);
      UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

      file_input = getResourceAsStream(realTARFile_B);
      // uc = au.makeUrlCacher(TAR_B_BASE);
      // uc.storeContent(file_input, tarHeader);
      uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } finally {
      IOUtil.safeClose(file_input);
    }

    CachedUrlSet cus = tarAu.getAuCachedUrlSet();
    for (CachedUrl cu : cus.getCuIterable()) {
      log.debug3("AU - cu is: " + cu.getUrl());
      cu.release();
    }

    // We need to start from the level of the ArticleMetadataExtractor
    MyListEmitter emitter = new MyListEmitter();
    ArticleMetadataExtractor amEx =
        new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

    Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.debug3("Metadata test - articlefiles " + af.toString());
      // CachedUrl cu = af.getFullTextCu();
      CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
      log.debug3("metadata cu is " + cu.getUrl());
      // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu);
      amEx.extract(MetadataTarget.Any(), af, emitter);
      List<ArticleMetadata> returnList = emitter.getAmList();

      assertNotNull(returnList);
      log.debug3("size of returnList is " + returnList.size());
      Iterator<ArticleMetadata> mdIt = returnList.iterator();
      ArticleMetadata mdRecord = null;
      while (mdIt.hasNext()) {
        mdRecord = (ArticleMetadata) mdIt.next();
        validateCompleteMetadataRecord(mdRecord);
      }
    }
  }
Example #17
0
  protected CIProperties makeCIProperties(ArchiveRecordHeader elementHeader) throws IOException {
    CIProperties ret = new CIProperties();
    Set elementHeaderFieldKeys = elementHeader.getHeaderFieldKeys();
    for (Iterator i = elementHeaderFieldKeys.iterator(); i.hasNext(); ) {
      String key = (String) i.next();
      try {

        Object valueObject = elementHeader.getHeaderValue(key);

        if (valueObject == null) {
          logger.warning("Ignoring null value for key '" + key + "'.");
        } else {
          String value = valueObject.toString();
          logger.debug3(key + ": " + value);
          ret.put(key, value);
        }

      } catch (ClassCastException ex) {
        logger.error("makeCIProperties: " + key + " threw ", ex);
        throw new CacheException.ExploderException(ex);
      }
    }
    return (ret);
  }
  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      String url = cu.getUrl();
      assertNotNull(cu);
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }
Example #19
0
  private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) {
    Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported));
    Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported));
    if (!m_inclset.isEmpty()) {
      outputMessage(
          "\nIncluded Urls: ("
              + new_incls.size()
              + " new, "
              + (m_inclset.size() - new_incls.size())
              + " old)",
          URL_SUMMARY_MESSAGE);
      depth_incl[m_curDepth - 1] += new_incls.size();
    }
    for (Iterator it = new_incls.iterator(); it.hasNext(); ) {
      outputMessage(it.next().toString(), PLAIN_MESSAGE);
    }

    if (!m_exclset.isEmpty()) {
      outputMessage(
          "\nExcluded Urls: ("
              + new_excls.size()
              + " new, "
              + (m_exclset.size() - new_excls.size())
              + " old)",
          URL_SUMMARY_MESSAGE);
    }
    for (Iterator it = new_excls.iterator(); it.hasNext(); ) {
      outputMessage(it.next().toString(), PLAIN_MESSAGE);
    }
    m_reported.addAll(new_incls);
    m_reported.addAll(new_excls);

    if (m_outWriter != null) {
      try {
        m_outWriter.flush();
      } catch (IOException ex) {
      }
    }
    return new_incls;
  }
 public void setConfig(
     Configuration config, Configuration oldConfig, Configuration.Differences changedKeys) {
   //  Build list of repositories from list of disk (fs) paths).  Needs to
   //  be generalized if ever another repository implementation.
   if (changedKeys.contains(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST)) {
     List lst = new ArrayList();
     String dspace = config.get(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST, "");
     List paths = StringUtil.breakAt(dspace, ';');
     if (paths != null) {
       for (Iterator iter = paths.iterator(); iter.hasNext(); ) {
         lst.add("local:" + (String) iter.next());
       }
     }
     repoList = lst;
   }
   if (changedKeys.contains(PARAM_MAX_PER_AU_CACHE_SIZE)) {
     paramNodeCacheSize =
         config.getInt(PARAM_MAX_PER_AU_CACHE_SIZE, DEFAULT_MAX_PER_AU_CACHE_SIZE);
     for (Iterator iter = getDaemon().getAllLockssRepositories().iterator(); iter.hasNext(); ) {
       LockssRepository repo = (LockssRepository) iter.next();
       if (repo instanceof LockssRepositoryImpl) {
         LockssRepositoryImpl repoImpl = (LockssRepositoryImpl) repo;
         repoImpl.setNodeCacheSize(paramNodeCacheSize);
       }
     }
   }
   if (changedKeys.contains(PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE)) {
     paramSuspectVersionsCacheSize =
         config.getInt(
             PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE, DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE);
     suspectVersionsCache.setMaxSize(paramSuspectVersionsCacheSize);
   }
   if (changedKeys.contains(GLOBAL_CACHE_PREFIX)) {
     paramIsGlobalNodeCache =
         config.getBoolean(PARAM_GLOBAL_CACHE_ENABLED, DEFAULT_GLOBAL_CACHE_ENABLED);
     if (paramIsGlobalNodeCache) {
       paramGlobalNodeCacheSize =
           config.getInt(PARAM_MAX_GLOBAL_CACHE_SIZE, DEFAULT_MAX_GLOBAL_CACHE_SIZE);
       log.debug("global node cache size: " + paramGlobalNodeCacheSize);
       globalNodeCache.setMaxSize(paramGlobalNodeCacheSize);
     }
   }
   if (changedKeys.contains(DISK_PREFIX)) {
     int minMB = config.getInt(PARAM_DISK_WARN_FRRE_MB, DEFAULT_DISK_WARN_FRRE_MB);
     double minPer =
         config.getPercentage(PARAM_DISK_WARN_FRRE_PERCENT, DEFAULT_DISK_WARN_FRRE_PERCENT);
     paramDFWarn = PlatformUtil.DF.makeThreshold(minMB, minPer);
     minMB = config.getInt(PARAM_DISK_FULL_FRRE_MB, DEFAULT_DISK_FULL_FRRE_MB);
     minPer = config.getPercentage(PARAM_DISK_FULL_FRRE_PERCENT, DEFAULT_DISK_FULL_FRRE_PERCENT);
     paramDFFull = PlatformUtil.DF.makeThreshold(minMB, minPer);
   }
   if (changedKeys.contains(PARAM_SIZE_CALC_MAX_LOAD)) {
     sizeCalcMaxLoad = config.getPercentage(PARAM_SIZE_CALC_MAX_LOAD, DEFAULT_SIZE_CALC_MAX_LOAD);
   }
   if (changedKeys.contains(PREFIX)) {
     maxUnusedDirSearch =
         config.getInt(PARAM_MAX_UNUSED_DIR_SEARCH, DEFAULT_MAX_UNUSED_DIR_SEARCH);
     isStatefulUnusedDirSearch =
         config.getBoolean(
             PARAM_IS_STATEFUL_UNUSED_DIR_SEARCH, DEFAULT_IS_STATEFUL_UNUSED_DIR_SEARCH);
     enableLongComponents =
         config.getBoolean(PARAM_ENABLE_LONG_COMPONENTS, DEFAULT_ENABLE_LONG_COMPONENTS);
     enableLongComponentsCompatibility =
         config.getBoolean(
             PARAM_ENABLE_LONG_COMPONENTS_COMPATIBILITY,
             DEFAULT_ENABLE_LONG_COMPONENTS_COMPATIBILITY);
     maxComponentLength = config.getInt(PARAM_MAX_COMPONENT_LENGTH, DEFAULT_MAX_COMPONENT_LENGTH);
     checkUnnormalized =
         (CheckUnnormalizedMode)
             config.getEnum(
                 CheckUnnormalizedMode.class,
                 PARAM_CHECK_UNNORMALIZED,
                 DEFAULT_CHECK_UNNORMALIZED);
   }
 }
Example #21
0
  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    CachedUrl cachedUrl = null;
    int goodEntries = 0;
    int badEntries = 0;
    int ignoredEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.info(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      if (storeArchive) {
        UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
        BitSet bs = new BitSet();
        bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
        uc.setFetchFlags(bs);
        uc.storeContent();
        archiveData.resetInputStream();
        arcStream = archiveData.input;
      }
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      ArchivalUnit au = crawlFacade.getAu();
      Set stemSet = new HashSet();
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator i = arcReader.iterator();
      // Skip first record
      for (i.next(); i.hasNext(); ) {
        // XXX probably not necessary
        helper.pokeWDog();
        if ((++entriesBetweenSleep % sleepAfter) == 0) {
          long pauseTime =
              CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
          Deadline pause = Deadline.in(pauseTime);
          logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
          while (!pause.expired()) {
            try {
              pause.sleep();
            } catch (InterruptedException ie) {
              // no action
            }
          }
        }
        ArchiveRecord element = (ArchiveRecord) i.next();
        // Each element is a URL to be cached in a suitable AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        if (elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  0, // XXX need to convert getDate string to long
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            } else {
              ignoredEntries++;
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null)
        try {
          arcReader.close();
          arcReader = null;
        } catch (IOException ex) {
          throw new CacheException.ExploderException(ex);
        }
      if (cachedUrl != null) {
        cachedUrl.release();
      }
      IOUtil.safeClose(arcStream);
    }
    if (badEntries == 0 && goodEntries > 0) {
      // Make it look like a new crawl finished on each AU to which
      // URLs were added.
      for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
        ArchivalUnit au = (ArchivalUnit) it.next();
        logger.debug3(archiveUrl + " touching " + au.toString());
        AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
      }
    } else {
      ArchivalUnit au = crawlFacade.getAu();
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }