Пример #1
0
 private void checkSubstance(ArchivalUnit au) {
   SubstanceChecker subChecker = new SubstanceChecker(au);
   if (!subChecker.isEnabled()) {
     errMsg = "No substance patterns defined for plugin.";
     return;
   }
   AuState auState = AuUtil.getAuState(au);
   SubstanceChecker.State oldState = auState.getSubstanceState();
   SubstanceChecker.State newState = subChecker.findSubstance();
   String chtxt = (newState == oldState ? "(unchanged)" : "(was " + oldState.toString() + ")");
   switch (newState) {
     case Unknown:
       log.error("Shouldn't happen: SubstanceChecker returned Unknown");
       errMsg = "Error in SubstanceChecker; see log.";
       break;
     case Yes:
       statusMsg = "AU has substance " + chtxt + ": " + au.getName();
       auState.setSubstanceState(SubstanceChecker.State.Yes);
       break;
     case No:
       statusMsg = "AU has no substance " + chtxt + ": " + au.getName();
       auState.setSubstanceState(SubstanceChecker.State.No);
       break;
   }
 }
 private void deleteBlock(CachedUrl cu) throws IOException {
   log.info("deleting " + cu.getUrl());
   CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
   ArchivalUnit au = cu.getArchivalUnit();
   CachedUrlSet cus = au.makeCachedUrlSet(cuss);
   NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
   nm.deleteNode(cus);
 }
  public void testFunctionalFromTarHierarchy() throws Exception {
    log.debug3("in testFromTarHierarchy");
    // load the tarballs
    InputStream file_input = null;
    try {
      file_input = getResourceAsStream(realTARFile_A);
      // UrlCacher uc = au.makeUrlCacher(TAR_A_BASE);
      // uc.storeContent(file_input, tarHeader);
      UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

      file_input = getResourceAsStream(realTARFile_B);
      // uc = au.makeUrlCacher(TAR_B_BASE);
      // uc.storeContent(file_input, tarHeader);
      uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } finally {
      IOUtil.safeClose(file_input);
    }

    CachedUrlSet cus = tarAu.getAuCachedUrlSet();
    for (CachedUrl cu : cus.getCuIterable()) {
      log.debug3("AU - cu is: " + cu.getUrl());
      cu.release();
    }

    // We need to start from the level of the ArticleMetadataExtractor
    MyListEmitter emitter = new MyListEmitter();
    ArticleMetadataExtractor amEx =
        new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

    Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.debug3("Metadata test - articlefiles " + af.toString());
      // CachedUrl cu = af.getFullTextCu();
      CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
      log.debug3("metadata cu is " + cu.getUrl());
      // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu);
      amEx.extract(MetadataTarget.Any(), af, emitter);
      List<ArticleMetadata> returnList = emitter.getAmList();

      assertNotNull(returnList);
      log.debug3("size of returnList is " + returnList.size());
      Iterator<ArticleMetadata> mdIt = returnList.iterator();
      ArticleMetadata mdRecord = null;
      while (mdIt.hasNext()) {
        mdRecord = (ArticleMetadata) mdIt.next();
        validateCompleteMetadataRecord(mdRecord);
      }
    }
  }
Пример #4
0
  private boolean startReindexingMetadata(ArchivalUnit au, boolean force) {
    if (metadataMgr == null) {
      errMsg = "Metadata processing is not enabled.";
      return false;
    }

    if (!force) {
      if (!AuUtil.hasCrawled(au)) {
        errMsg = "Au has never crawled. Click again to reindex metadata";
        showForceReindexMetadata = true;
        return false;
      }

      AuState auState = AuUtil.getAuState(au);
      switch (auState.getSubstanceState()) {
        case No:
          errMsg = "Au has no substance. Click again to reindex metadata";
          showForceReindexMetadata = true;
          return false;
        case Unknown:
          errMsg = "Unknown substance for Au. Click again to reindex metadata.";
          showForceReindexMetadata = true;
          return false;
        case Yes:
          // fall through
      }
    }

    // Fully reindex metadata with the highest priority.
    Connection conn = null;
    PreparedStatement insertPendingAuBatchStatement = null;

    try {
      conn = dbMgr.getConnection();
      insertPendingAuBatchStatement = metadataMgr.getPrioritizedInsertPendingAuBatchStatement(conn);

      if (metadataMgr.enableAndAddAuToReindex(
          au, conn, insertPendingAuBatchStatement, false, true)) {
        statusMsg = "Reindexing metadata for " + au.getName();
        return true;
      }
    } catch (DbException dbe) {
      log.error("Cannot reindex metadata for " + au.getName(), dbe);
    } finally {
      DbManager.safeCloseStatement(insertPendingAuBatchStatement);
      DbManager.safeRollbackAndClose(conn);
    }

    if (force) {
      errMsg = "Still cannot reindex metadata for " + au.getName();
    } else {
      errMsg = "Cannot reindex metadata for " + au.getName();
    }
    return false;
  }
  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      String url = cu.getUrl();
      assertNotNull(cu);
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }
Пример #6
0
  private boolean disableMetadataIndexing(ArchivalUnit au, boolean force) {
    if (metadataMgr == null) {
      errMsg = "Metadata processing is not enabled.";
      return false;
    }

    try {
      metadataMgr.disableAuIndexing(au);
      statusMsg = "Disabled metadata indexing for " + au.getName();
      return true;
    } catch (Exception e) {
      errMsg = "Cannot reindex metadata for " + au.getName() + ": " + e.getMessage();
      return false;
    }
  }
Пример #7
0
 private void crawlPluginRegistries() {
   StringBuilder sb = new StringBuilder();
   for (ArchivalUnit au : pluginMgr.getAllRegistryAus()) {
     sb.append(au.getName());
     sb.append(": ");
     try {
       startCrawl(au, true, false);
       sb.append("Queued.");
     } catch (CrawlManagerImpl.NotEligibleException e) {
       sb.append("Failed: ");
       sb.append(e.getMessage());
     }
     sb.append("\n");
   }
   statusMsg = sb.toString();
 }
  /* private support methods */
  private List<ArticleMetadata> setupContentForAU(
      ArchivalUnit au, String url, String content, boolean isHtmlExtractor)
      throws IOException, PluginException {
    FileMetadataExtractor me;

    InputStream input = null;
    CIProperties props = null;
    if (isHtmlExtractor) {
      input = IOUtils.toInputStream(content, "utf-8");
      props = getContentHtmlProperties();
      me =
          new BaseAtyponHtmlMetadataExtractorFactory()
              .createFileMetadataExtractor(MetadataTarget.Any(), "text/html");
    } else {
      input = IOUtils.toInputStream(content, "utf-8");
      props = getContentRisProperties();
      me =
          new BaseAtyponRisMetadataExtractorFactory()
              .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain");
    }
    UrlData ud = new UrlData(input, props, url);
    UrlCacher uc = au.makeUrlCacher(ud);
    uc.storeContent();
    CachedUrl cu = uc.getCachedUrl();
    FileMetadataListExtractor mle = new FileMetadataListExtractor(me);
    return mle.extract(MetadataTarget.Any(), cu);
  }
Пример #9
0
 public void export() {
   log.debug("export(" + au.getName() + ")");
   log.debug(
       "dir: "
           + dir
           + ", pref: "
           + prefix
           + ", size: "
           + maxSize
           + ", ver: "
           + maxVersions
           + (compress ? ", (C)" : ""));
   checkArgs();
   try {
     start();
   } catch (IOException e) {
     recordError("Error opening file", e);
     return;
   }
   writeFiles();
   try {
     finish();
   } catch (IOException e) {
     if (!isDiskFull) {
       // If we already knew (and reported) disk full, also reporting it
       // as a close error is misleading.
       recordError("Error closing file", e);
     }
   }
 }
 public static String getRepositorySpec(ArchivalUnit au) {
   Configuration auConfig = au.getConfiguration();
   if (auConfig != null) { // can be null in unit tests
     String repoSpec = auConfig.get(PluginManager.AU_PARAM_REPOSITORY);
     if (repoSpec != null && repoSpec.startsWith("local:")) {
       return repoSpec;
     }
   }
   return "local:" + CurrentConfig.getParam(PARAM_CACHE_LOCATION);
 }
 List<String> auUrls(ArchivalUnit au) {
   List<String> res = new ArrayList<String>();
   for (Iterator iter = au.getAuCachedUrlSet().contentHashIterator(); iter.hasNext(); ) {
     CachedUrlSetNode cusn = (CachedUrlSetNode) iter.next();
     if (cusn.hasContent()) {
       res.add(cusn.getUrl());
     }
   }
   return res;
 }
 public BePressArticleIterator(
     ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) {
   super(au, spec);
   String volumeAsString = au.getConfiguration().get(ConfigParamDescr.VOLUME_NUMBER.getKey());
   String journalAbbr = au.getConfiguration().get(ConfigParamDescr.JOURNAL_ABBR.getKey());
   if (isSection) {
     journalAbbr = journalAbbr + "/" + au.getConfiguration().get("journal_section");
   }
   // pick up issue level and lower (make (art)?[0-9]+ optional because a few au's have article
   // at issue level
   this.pattern =
       Pattern.compile(
           String.format(
               "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$",
               journalAbbr, volumeAsString, volumeAsString),
           Pattern.CASE_INSENSITIVE);
   this.TOC_pattern =
       Pattern.compile(
           String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", journalAbbr, volumeAsString),
           Pattern.CASE_INSENSITIVE);
 }
 /**
  * Factory method to create new LockssRepository instances.
  *
  * @param au the {@link ArchivalUnit}
  * @return the new LockssRepository instance
  */
 public static LockssRepository createNewLockssRepository(ArchivalUnit au) {
   String root = getRepositoryRoot(au);
   if (root == null || root.equals("null")) {
     logger.error("No repository dir set in config");
     throw new LockssRepository.RepositoryStateException("No repository dir set in config");
   }
   String auDir = LockssRepositoryImpl.mapAuToFileLocation(root, au);
   if (logger.isDebug2()) {
     logger.debug2("repo: " + auDir + ", au: " + au.getName());
   }
   staticCacheLocation = extendCacheLocation(root);
   LockssRepositoryImpl repo = new LockssRepositoryImpl(auDir);
   Plugin plugin = au.getPlugin();
   if (plugin != null) {
     LockssDaemon daemon = plugin.getDaemon();
     if (daemon != null) {
       RepositoryManager mgr = daemon.getRepositoryManager();
       if (mgr != null) {
         mgr.setRepositoryForPath(auDir, repo);
       }
     }
   }
   return repo;
 }
Пример #14
0
  private boolean startCrawl(ArchivalUnit au, boolean force, boolean deep)
      throws CrawlManagerImpl.NotEligibleException {
    CrawlManagerImpl cmi = (CrawlManagerImpl) crawlMgr;
    if (force) {
      RateLimiter limit = cmi.getNewContentRateLimiter(au);
      if (!limit.isEventOk()) {
        limit.unevent();
      }
    }
    cmi.checkEligibleToQueueNewContentCrawl(au);
    String delayMsg = "";
    String deepMsg = "";
    try {
      cmi.checkEligibleForNewContentCrawl(au);
    } catch (CrawlManagerImpl.NotEligibleException e) {
      delayMsg = ", Start delayed due to: " + e.getMessage();
    }
    Configuration config = ConfigManager.getCurrentConfig();
    int pri = config.getInt(PARAM_CRAWL_PRIORITY, DEFAULT_CRAWL_PRIORITY);

    CrawlReq req;
    try {
      req = new CrawlReq(au);
      req.setPriority(pri);
      if (deep) {
        int d = Integer.parseInt(formDepth);
        if (d < 0) {
          errMsg = "Illegal refetch depth: " + d;
          return false;
        }
        req.setRefetchDepth(d);
        deepMsg = "Deep (" + req.getRefetchDepth() + ") ";
      }
    } catch (NumberFormatException e) {
      errMsg = "Illegal refetch depth: " + formDepth;
      return false;
    } catch (RuntimeException e) {
      log.error("Couldn't create CrawlReq: " + au, e);
      errMsg = "Couldn't create CrawlReq: " + e.toString();
      return false;
    }
    cmi.startNewContentCrawl(req, null);
    statusMsg = deepMsg + "Crawl requested for " + au.getName() + delayMsg;
    return true;
  }
  public void setUp() throws Exception {
    super.setUp();
    setUpDiskSpace(); // you need this to have startService work properly...

    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    // in this directory this is file "test_baseatypon.tdb" but it becomes xml
    ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml"));
    Tdb tdb = ConfigManager.getCurrentConfig().getTdb();

    TdbAu tdbau1 = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume).get(0);
    assertNotNull("Didn't find named TdbAu", tdbau1);
    bau1 = PluginTestUtil.createAndStartAu(tdbau1);
    assertNotNull(bau1);
    TypedEntryMap auConfig = bau1.getProperties();
    assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY));
  }
 /**
  * Finds the directory for this AU. If none found in the map, designates a new dir for it.
  *
  * @param au the AU
  * @param repoRoot root of the repository
  * @return the dir {@link String}
  */
 static String getAuDir(ArchivalUnit au, String repoRoot, boolean create) {
   return getAuDir(au.getAuId(), repoRoot, create);
 }
Пример #17
0
 private void callV3ContentPoll(ArchivalUnit au) throws PollManager.NotEligibleException {
   log.debug("Enqueuing a V3 Content Poll on " + au.getName());
   PollSpec spec = new PollSpec(au.getAuCachedUrlSet(), Poll.V3_POLL);
   pollManager.enqueueHighPriorityPoll(au, spec);
   statusMsg = "Enqueued V3 poll for " + au.getName();
 }
Пример #18
0
 public static V1Poll createCompletedPoll(
     LockssDaemon daemon,
     ArchivalUnit au,
     V1LcapMessage testmsg,
     int numAgree,
     int numDisagree,
     PollManager pollmanager)
     throws Exception {
   log.debug(
       "createCompletedPoll: au: "
           + au.toString()
           + " peer "
           + testmsg.getOriginatorId()
           + " votes "
           + numAgree
           + "/"
           + numDisagree);
   CachedUrlSetSpec cusSpec = null;
   if ((testmsg.getLwrBound() != null)
       && (testmsg.getLwrBound().equals(PollSpec.SINGLE_NODE_LWRBOUND))) {
     cusSpec = new SingleNodeCachedUrlSetSpec(testmsg.getTargetUrl());
   } else {
     cusSpec =
         new RangeCachedUrlSetSpec(
             testmsg.getTargetUrl(), testmsg.getLwrBound(), testmsg.getUprBound());
   }
   CachedUrlSet cus = au.makeCachedUrlSet(cusSpec);
   PollSpec spec = new PollSpec(cus, Poll.V1_CONTENT_POLL);
   ((MockCachedUrlSet) spec.getCachedUrlSet()).setHasContent(false);
   V1Poll p = null;
   if (testmsg.isContentPoll()) {
     p =
         new V1ContentPoll(
             spec,
             pollmanager,
             testmsg.getOriginatorId(),
             testmsg.getChallenge(),
             testmsg.getDuration(),
             testmsg.getHashAlgorithm());
   } else if (testmsg.isNamePoll()) {
     p =
         new V1NamePoll(
             spec,
             pollmanager,
             testmsg.getOriginatorId(),
             testmsg.getChallenge(),
             testmsg.getDuration(),
             testmsg.getHashAlgorithm());
   } else if (testmsg.isVerifyPoll()) {
     p =
         new V1VerifyPoll(
             spec,
             pollmanager,
             testmsg.getOriginatorId(),
             testmsg.getChallenge(),
             testmsg.getDuration(),
             testmsg.getHashAlgorithm(),
             testmsg.getVerifier());
   }
   assertNotNull(p);
   p.setMessage(testmsg);
   p.m_tally.quorum = numAgree + numDisagree;
   p.m_tally.numAgree = numAgree;
   p.m_tally.numDisagree = numDisagree;
   p.m_tally.wtAgree = 2000;
   p.m_tally.wtDisagree = 200;
   p.m_tally.localEntries = makeEntries(1, 3);
   p.m_tally.votedEntries = makeEntries(1, 5);
   p.m_tally.votedEntries.remove(1);
   p.m_pollstate = V1Poll.PS_COMPLETE;
   p.m_callerID = testmsg.getOriginatorId();
   log.debug3("poll " + p.toString());
   p.m_tally.tallyVotes();
   return p;
 }
Пример #19
0
  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    CachedUrl cachedUrl = null;
    int goodEntries = 0;
    int badEntries = 0;
    int ignoredEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.info(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      if (storeArchive) {
        UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
        BitSet bs = new BitSet();
        bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
        uc.setFetchFlags(bs);
        uc.storeContent();
        archiveData.resetInputStream();
        arcStream = archiveData.input;
      }
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      ArchivalUnit au = crawlFacade.getAu();
      Set stemSet = new HashSet();
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator i = arcReader.iterator();
      // Skip first record
      for (i.next(); i.hasNext(); ) {
        // XXX probably not necessary
        helper.pokeWDog();
        if ((++entriesBetweenSleep % sleepAfter) == 0) {
          long pauseTime =
              CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
          Deadline pause = Deadline.in(pauseTime);
          logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
          while (!pause.expired()) {
            try {
              pause.sleep();
            } catch (InterruptedException ie) {
              // no action
            }
          }
        }
        ArchiveRecord element = (ArchiveRecord) i.next();
        // Each element is a URL to be cached in a suitable AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        if (elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  0, // XXX need to convert getDate string to long
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            } else {
              ignoredEntries++;
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null)
        try {
          arcReader.close();
          arcReader = null;
        } catch (IOException ex) {
          throw new CacheException.ExploderException(ex);
        }
      if (cachedUrl != null) {
        cachedUrl.release();
      }
      IOUtil.safeClose(arcStream);
    }
    if (badEntries == 0 && goodEntries > 0) {
      // Make it look like a new crawl finished on each AU to which
      // URLs were added.
      for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
        ArchivalUnit au = (ArchivalUnit) it.next();
        logger.debug3(archiveUrl + " touching " + au.toString());
        AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
      }
    } else {
      ArchivalUnit au = crawlFacade.getAu();
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }