public void testWarcReading() throws Exception {

    ArchiveReader archiveReader = ArchiveReaderFactory.get(TestInfo.WARC_FILE1);

    Iterator<? extends ArchiveRecord> it = archiveReader.iterator();
    assertTrue("Warc should contains records", it.hasNext());
    while (it.hasNext()) {
      ArchiveRecord next = it.next();
      System.out.println("mimetype:" + next.getHeader().getMimetype());
      System.out.println("url:" + next.getHeader().getUrl());
    }
  }
Esempio n. 2
0
  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    int goodEntries = 0;
    int badEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.debug(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      ArchivalUnit au = crawlFacade.getAu();
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator<ArchiveRecord> iter = arcReader.iterator();
      // Skip first record
      if (iter.hasNext()) iter.next();
      while (iter.hasNext()) {
        helper.pokeWDog();
        // check need to pause
        handlePause(++entriesBetweenSleep);
        // handle each element in the archive
        ArchiveRecord element = iter.next();
        // Each element is a URL to be cached in our AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        long elementDate;
        try {
          elementDate = ArchiveUtils.parse14DigitDate(elementHeader.getDate()).getTime();
        } catch (ParseException e) {
          elementDate = 0;
        }
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        // add check to determine if this is a url which should be cached
        if (au.shouldBeCached(elementUrl) && elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  elementDate,
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              // this needs to use the correct depth ? how
              CrawlUrlData cud = new CrawlUrlData(elementUrl, 0);
              crawlFacade.addToParseQueue(cud);
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null) {
        try {
          arcReader.close();
        } catch (IOException ex) {
          throw new CacheException.ExploderException(ex);
        }
      }
      IOUtil.safeClose(arcStream);
    }
    // report failed fetches
    if (badEntries != 0) {
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }
  @Override
  public void map(
      Text key, WritableArchiveRecord value, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    ArchiveRecord record = value.getRecord();
    ArchiveRecordHeader header = record.getHeader();

    // Logging for debug info:
    log.debug(
        "Processing @"
            + header.getOffset()
            + "+"
            + record.available()
            + ","
            + header.getLength()
            + ": "
            + header.getUrl());
    for (String h : header.getHeaderFields().keySet()) {
      log.debug("ArchiveHeader: " + h + " -> " + header.getHeaderValue(h));
    }

    try {
      MDX mdx = new MDX();
      Date crawl_date = ArchiveUtils.parse14DigitISODate(header.getDate(), null);
      if (crawl_date != null) {
        mdx.setTs(ArchiveUtils.get14DigitDate(crawl_date));
      } else {
        mdx.setTs(header.getDate());
      }
      mdx.setUrl(header.getUrl());
      mdx.setHash(header.getDigest());

      // Data from WARC record:
      mdx.put("source-file", key.toString());
      mdx.put("content-type", header.getMimetype());
      mdx.put("content-length", "" + header.getContentLength());
      mdx.put("length", "" + header.getLength());
      mdx.put("source-offset", "" + header.getOffset());
      mdx.put("record-identifier", header.getRecordIdentifier());
      for (String k : header.getHeaderFieldKeys()) {
        mdx.put("HEADER-" + k, "" + header.getHeaderValue(k));
      }

      // check record type and look for HTTP data:
      Header[] httpHeaders = null;
      if (record instanceof WARCRecord) {
        mdx.setRecordType("warc." + header.getHeaderValue(HEADER_KEY_TYPE));
        mdx.setHash("" + header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST));
        // There are not always headers! The code should check first.
        String statusLine = HttpParser.readLine(record, "UTF-8");
        if (statusLine != null && statusLine.startsWith("HTTP")) {
          String firstLine[] = statusLine.split(" ");
          if (firstLine.length > 1) {
            String statusCode = firstLine[1].trim();
            mdx.put("status-code", statusCode);
            try {
              httpHeaders = HttpParser.parseHeaders(record, "UTF-8");
            } catch (ProtocolException p) {
              log.error(
                  "ProtocolException ["
                      + statusCode
                      + "]: "
                      + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME)
                      + "@"
                      + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY),
                  p);
            }
          } else {
            log.warn("Could not parse status line: " + statusLine);
          }
        } else {
          log.warn(
              "Invalid status line: "
                  + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME)
                  + "@"
                  + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY));
        }

      } else if (record instanceof ARCRecord) {
        mdx.setRecordType("arc");
        ARCRecord arcr = (ARCRecord) record;
        mdx.put("status-code", "" + arcr.getStatusCode());
        httpHeaders = arcr.getHttpHeaders();

      } else {
        mdx.setRecordType("unknown");
      }

      // Add in http headers
      if (httpHeaders != null) {
        for (Header h : httpHeaders) {
          mdx.put("HTTP-" + h.getName(), h.getValue());
        }
      }

      // URL:
      String uri = header.getUrl();
      if (uri != null) {
        UsableURI uuri = UsableURIFactory.getInstance(uri);
        // Hosts:
        if ("https".contains(uuri.getScheme())) {
          mdx.put("host", uuri.getAuthority());
        }
      } else {
        mdx.put("errors", "malformed-url");
      }

      // Year
      String date = header.getDate();
      if (date != null && date.length() > 4) {
        mdx.put("year", date.substring(0, 4));
      } else {
        mdx.put("errors", "malformed-date");
      }

      // And collect:
      String outKey = mdx.getHash();
      if (outKey == null || outKey == "" || "null".equals(outKey)) {
        outKey = mdx.getRecordType() + ":" + header.getMimetype();
      } else {
        outKey = mdx.getRecordType() + ":" + outKey;
      }

      output.collect(new Text(outKey), new Text(mdx.toString()));
    } catch (JSONException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
Esempio n. 4
0
  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    CachedUrl cachedUrl = null;
    int goodEntries = 0;
    int badEntries = 0;
    int ignoredEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.info(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      if (storeArchive) {
        UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
        BitSet bs = new BitSet();
        bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
        uc.setFetchFlags(bs);
        uc.storeContent();
        archiveData.resetInputStream();
        arcStream = archiveData.input;
      }
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      ArchivalUnit au = crawlFacade.getAu();
      Set stemSet = new HashSet();
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator i = arcReader.iterator();
      // Skip first record
      for (i.next(); i.hasNext(); ) {
        // XXX probably not necessary
        helper.pokeWDog();
        if ((++entriesBetweenSleep % sleepAfter) == 0) {
          long pauseTime =
              CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
          Deadline pause = Deadline.in(pauseTime);
          logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
          while (!pause.expired()) {
            try {
              pause.sleep();
            } catch (InterruptedException ie) {
              // no action
            }
          }
        }
        ArchiveRecord element = (ArchiveRecord) i.next();
        // Each element is a URL to be cached in a suitable AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        if (elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  0, // XXX need to convert getDate string to long
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            } else {
              ignoredEntries++;
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null)
        try {
          arcReader.close();
          arcReader = null;
        } catch (IOException ex) {
          throw new CacheException.ExploderException(ex);
        }
      if (cachedUrl != null) {
        cachedUrl.release();
      }
      IOUtil.safeClose(arcStream);
    }
    if (badEntries == 0 && goodEntries > 0) {
      // Make it look like a new crawl finished on each AU to which
      // URLs were added.
      for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
        ArchivalUnit au = (ArchivalUnit) it.next();
        logger.debug3(archiveUrl + " touching " + au.toString());
        AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
      }
    } else {
      ArchivalUnit au = crawlFacade.getAu();
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }