private void deleteBlock(CachedUrl cu) throws IOException {
   log.info("deleting " + cu.getUrl());
   CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
   ArchivalUnit au = cu.getArchivalUnit();
   CachedUrlSet cus = au.makeCachedUrlSet(cuss);
   NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
   nm.deleteNode(cus);
 }
  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      String url = cu.getUrl();
      assertNotNull(cu);
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }
 @Override
 public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException {
   ArticleMetadata am = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
   am.cook(tagMap);
   String url = am.get(MetadataField.FIELD_ACCESS_URL);
   ArchivalUnit au = cu.getArchivalUnit();
   if (url == null || url.isEmpty() || !au.makeCachedUrl(url).hasContent()) {
     url = cu.getUrl();
   }
   am.replace(
       MetadataField.FIELD_ACCESS_URL,
       HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(au, url));
   emitter.emitMetadata(cu, am);
 }
Пример #4
0
 /** Setup common to most constructors */
 private void commonSetup(CachedUrlSet cus, String lwrBound, String uprBound, int pollType) {
   CachedUrlSetSpec cuss = cus.getSpec();
   if (cuss instanceof PrunedCachedUrlSetSpec) {
     throw new IllegalArgumentException("Polls do not support PrunedCachedUrlSetSpec");
   }
   this.cus = cus;
   ArchivalUnit au = cus.getArchivalUnit();
   auId = au.getAuId();
   this.pluginVersion = AuUtil.getPollVersion(au);
   url = cuss.getUrl();
   this.lwrBound = lwrBound;
   this.uprBound = uprBound;
   this.protocolVersion = protocolVersionFromPollType(pollType);
   this.pollType = pollType;
 }
 List<String> auUrls(ArchivalUnit au) {
   List<String> res = new ArrayList<String>();
   for (Iterator iter = au.getAuCachedUrlSet().contentHashIterator(); iter.hasNext(); ) {
     CachedUrlSetNode cusn = (CachedUrlSetNode) iter.next();
     if (cusn.hasContent()) {
       res.add(cusn.getUrl());
     }
   }
   return res;
 }
 public BePressArticleIterator(
     ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) {
   super(au, spec);
   String volumeAsString = au.getConfiguration().get(ConfigParamDescr.VOLUME_NUMBER.getKey());
   String journalAbbr = au.getConfiguration().get(ConfigParamDescr.JOURNAL_ABBR.getKey());
   if (isSection) {
     journalAbbr = journalAbbr + "/" + au.getConfiguration().get("journal_section");
   }
   // pick up issue level and lower (make (art)?[0-9]+ optional because a few au's have article
   // at issue level
   this.pattern =
       Pattern.compile(
           String.format(
               "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$",
               journalAbbr, volumeAsString, volumeAsString),
           Pattern.CASE_INSENSITIVE);
   this.TOC_pattern =
       Pattern.compile(
           String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", journalAbbr, volumeAsString),
           Pattern.CASE_INSENSITIVE);
 }
Пример #7
0
 public static V1Poll createCompletedPoll(
     LockssDaemon daemon,
     ArchivalUnit au,
     V1LcapMessage testmsg,
     int numAgree,
     int numDisagree,
     PollManager pollmanager)
     throws Exception {
   log.debug(
       "createCompletedPoll: au: "
           + au.toString()
           + " peer "
           + testmsg.getOriginatorId()
           + " votes "
           + numAgree
           + "/"
           + numDisagree);
   CachedUrlSetSpec cusSpec = null;
   if ((testmsg.getLwrBound() != null)
       && (testmsg.getLwrBound().equals(PollSpec.SINGLE_NODE_LWRBOUND))) {
     cusSpec = new SingleNodeCachedUrlSetSpec(testmsg.getTargetUrl());
   } else {
     cusSpec =
         new RangeCachedUrlSetSpec(
             testmsg.getTargetUrl(), testmsg.getLwrBound(), testmsg.getUprBound());
   }
   CachedUrlSet cus = au.makeCachedUrlSet(cusSpec);
   PollSpec spec = new PollSpec(cus, Poll.V1_CONTENT_POLL);
   ((MockCachedUrlSet) spec.getCachedUrlSet()).setHasContent(false);
   V1Poll p = null;
   if (testmsg.isContentPoll()) {
     p =
         new V1ContentPoll(
             spec,
             pollmanager,
             testmsg.getOriginatorId(),
             testmsg.getChallenge(),
             testmsg.getDuration(),
             testmsg.getHashAlgorithm());
   } else if (testmsg.isNamePoll()) {
     p =
         new V1NamePoll(
             spec,
             pollmanager,
             testmsg.getOriginatorId(),
             testmsg.getChallenge(),
             testmsg.getDuration(),
             testmsg.getHashAlgorithm());
   } else if (testmsg.isVerifyPoll()) {
     p =
         new V1VerifyPoll(
             spec,
             pollmanager,
             testmsg.getOriginatorId(),
             testmsg.getChallenge(),
             testmsg.getDuration(),
             testmsg.getHashAlgorithm(),
             testmsg.getVerifier());
   }
   assertNotNull(p);
   p.setMessage(testmsg);
   p.m_tally.quorum = numAgree + numDisagree;
   p.m_tally.numAgree = numAgree;
   p.m_tally.numDisagree = numDisagree;
   p.m_tally.wtAgree = 2000;
   p.m_tally.wtDisagree = 200;
   p.m_tally.localEntries = makeEntries(1, 3);
   p.m_tally.votedEntries = makeEntries(1, 5);
   p.m_tally.votedEntries.remove(1);
   p.m_pollstate = V1Poll.PS_COMPLETE;
   p.m_callerID = testmsg.getOriginatorId();
   log.debug3("poll " + p.toString());
   p.m_tally.tallyVotes();
   return p;
 }
Пример #8
0
  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    CachedUrl cachedUrl = null;
    int goodEntries = 0;
    int badEntries = 0;
    int ignoredEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.info(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      if (storeArchive) {
        UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
        BitSet bs = new BitSet();
        bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
        uc.setFetchFlags(bs);
        uc.storeContent();
        archiveData.resetInputStream();
        arcStream = archiveData.input;
      }
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      ArchivalUnit au = crawlFacade.getAu();
      Set stemSet = new HashSet();
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator i = arcReader.iterator();
      // Skip first record
      for (i.next(); i.hasNext(); ) {
        // XXX probably not necessary
        helper.pokeWDog();
        if ((++entriesBetweenSleep % sleepAfter) == 0) {
          long pauseTime =
              CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
          Deadline pause = Deadline.in(pauseTime);
          logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
          while (!pause.expired()) {
            try {
              pause.sleep();
            } catch (InterruptedException ie) {
              // no action
            }
          }
        }
        ArchiveRecord element = (ArchiveRecord) i.next();
        // Each element is a URL to be cached in a suitable AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        if (elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  0, // XXX need to convert getDate string to long
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            } else {
              ignoredEntries++;
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null)
        try {
          arcReader.close();
          arcReader = null;
        } catch (IOException ex) {
          throw new CacheException.ExploderException(ex);
        }
      if (cachedUrl != null) {
        cachedUrl.release();
      }
      IOUtil.safeClose(arcStream);
    }
    if (badEntries == 0 && goodEntries > 0) {
      // Make it look like a new crawl finished on each AU to which
      // URLs were added.
      for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
        ArchivalUnit au = (ArchivalUnit) it.next();
        logger.debug3(archiveUrl + " touching " + au.toString());
        AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
      }
    } else {
      ArchivalUnit au = crawlFacade.getAu();
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }