Example #1
  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    int goodEntries = 0;
    int badEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.debug(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      ArchivalUnit au = crawlFacade.getAu();
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator<ArchiveRecord> iter = arcReader.iterator();
      // Skip the first record, which describes the archive itself rather than content
      if (iter.hasNext()) iter.next();
      while (iter.hasNext()) {
        helper.pokeWDog();
        // Pause if enough entries have been processed since the last sleep
        handlePause(++entriesBetweenSleep);
        // handle each element in the archive
        ArchiveRecord element = iter.next();
        // Each element is a URL to be cached in our AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        long elementDate;
        try {
          elementDate = ArchiveUtils.parse14DigitDate(elementHeader.getDate()).getTime();
        } catch (ParseException e) {
          // Date wasn't a 14-digit timestamp; fall back to the epoch
          elementDate = 0;
        }
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        // Only cache URLs that belong in this AU; non-HTTP URLs are skipped
        if (au.shouldBeCached(elementUrl) && elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  elementDate,
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              // TODO: depth is hard-coded to 0; the correct crawl depth is still an open question
              CrawlUrlData cud = new CrawlUrlData(elementUrl, 0);
              crawlFacade.addToParseQueue(cud);
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null) {
        try {
          arcReader.close();
        } catch (IOException ex) {
          throw new CacheException.ExploderException(ex);
        }
      }
      IOUtil.safeClose(arcStream);
    }
    // report failed fetches
    if (badEntries != 0) {
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }
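
For context, here is a minimal standalone sketch of the same iteration pattern: open a WARC, skip the leading record, and read each header the way explode() does. It assumes the webarchive-commons classes the method above already uses; the file name sample.warc.gz is a placeholder, and ArchiveReaderFactory.get() stands in for the wrapStream() helper, whose implementation is not shown here.

import java.io.IOException;
import java.text.ParseException;
import java.util.Iterator;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.util.ArchiveUtils;

public class WarcIterationSketch {
  public static void main(String[] args) throws IOException {
    // Placeholder path; explode() instead wraps a stream fetched from fetchUrl
    ArchiveReader reader = ArchiveReaderFactory.get("sample.warc.gz");
    try {
      Iterator<ArchiveRecord> iter = reader.iterator();
      // Skip the first record, as explode() does
      if (iter.hasNext()) iter.next();
      while (iter.hasNext()) {
        ArchiveRecordHeader header = iter.next().getHeader();
        long date;
        try {
          // Same parse-and-fall-back handling as explode()
          date = ArchiveUtils.parse14DigitDate(header.getDate()).getTime();
        } catch (ParseException e) {
          date = 0;
        }
        System.out.println(header.getUrl() + " | " + header.getMimetype()
            + " | " + header.getLength() + " bytes | " + date);
      }
    } finally {
      reader.close();
    }
  }
}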
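The call handlePause(++entriesBetweenSleep) suggests the loop sleeps after every so many entries so that exploding a large archive does not starve the rest of the crawler. Its implementation is not shown above; a plausible sketch follows, where the threshold and sleep interval are invented values rather than ones taken from the source.

// Hypothetical pacing values; the real ones would be configured elsewhere
private static final int ENTRIES_PER_PAUSE = 200;
private static final long PAUSE_MS = 100L;

/** Sleep briefly once every ENTRIES_PER_PAUSE processed entries. */
private void handlePause(int entriesBetweenSleep) {
  if (entriesBetweenSleep % ENTRIES_PER_PAUSE == 0) {
    try {
      Thread.sleep(PAUSE_MS);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt(); // preserve the interrupt flag
    }
  }
}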