예제 #1
0
  /**
   * Reads the gzip-compressed WARC archive at {@code archiveFile} and indexes its responses.
   *
   * <p>Every record whose {@code WARC-Type} header equals {@code response} and whose
   * {@code WARC-Target-URI} header parses as a {@link URI} is wrapped in a {@code Record} and
   * handed to {@code insertRecordByUri}. All other records are skipped.
   *
   * @param archiveFile path of the gzip-compressed WARC file to read
   * @throws IOException if the file cannot be opened or read
   * @throws ParseException kept from the original signature; presumably raised while building or
   *     inserting a {@code Record} -- confirm against those methods
   */
  private void readWarc(String archiveFile) throws IOException, ParseException {
    // The file stream is a separate resource so it is released even when the GZIP header
    // check in the GZIPInputStream constructor fails.
    try (InputStream fileIn = new FileInputStream(archiveFile);
        InputStream in = new GZIPInputStream(fileIn)) {
      WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(in);
      try {
        WarcRecord record;
        while ((record = warcReader.getNextRecord()) != null) {
          // Only "WARC-Type: response" records are of interest.
          HeaderLine type = record.getHeader("WARC-Type");
          if (type == null || !"response".equals(type.value)) {
            continue;
          }

          // A response without a usable WARC-Target-URI cannot be indexed.
          HeaderLine uri = record.getHeader("WARC-Target-URI");
          if (uri == null || uri.value == null) {
            continue;
          }

          URI requestUri;
          try {
            requestUri = new URI(uri.value);
          } catch (URISyntaxException e) {
            // Best effort: report the malformed URI and move on to the next record.
            e.printStackTrace();
            continue;
          }

          // Wrap the WARC record in a comparable Record and store it under its target URI.
          insertRecordByUri(requestUri, new Record(record));
        }
      } finally {
        // Close the reader on every exit path, including exceptions thrown mid-scan.
        warcReader.close();
      }
    }
  }
예제 #2
0
  /**
   * Reads a gzip-compressed WARC file and stores the payload of every
   * {@code WARC-Type: response} record through {@code hbaseManager}.
   *
   * <p>For each record the row key is derived from the target URI via
   * {@code Util.reverseHostname}, the content type is taken from the HTTP header when one is
   * present (falling back to {@code text/plain}), and the payload bytes are read fully into
   * memory. Records with no payload are ignored. Records with no payload stream, no key, or
   * content larger than {@code MAX_CONTENT_SIZE} are counted in {@code skipped}. Successfully
   * stored records are counted in {@code cnt}.
   *
   * @param inputWarcFile the gzip-compressed WARC file to ingest
   * @throws IOException if the file cannot be opened or a record cannot be read
   */
  private void ingestWarcFile(File inputWarcFile) throws IOException {
    // try-with-resources releases every stream on all exit paths, including the early return
    // below and any exception thrown while iterating records.
    try (InputStream fileStream = new FileInputStream(inputWarcFile);
        GZIPInputStream gzInputStream = new GZIPInputStream(fileStream);
        InputStream buffered = new BufferedInputStream(gzInputStream, 8192);
        ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(buffered, 32)) {
      WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(pbin);
      if (warcReader == null) {
        LOG.info("Can't read warc file " + inputWarcFile.getName());
        return;
      }

      try {
        warcReader.setWarcTargetUriProfile(uriProfile);
        warcReader.setBlockDigestEnabled(bBlockDigestEnabled);
        warcReader.setPayloadDigestEnabled(bPayloadDigestEnabled);
        warcReader.setRecordHeaderMaxSize(recordHeaderMaxSize);
        warcReader.setPayloadHeaderMaxSize(payloadHeaderMaxSize);

        WarcRecord warcRecord;
        while ((warcRecord = warcReader.getNextRecord()) != null) {
          // Records without a target URI get a null key and are skipped further down.
          // NOTE(review): whether Util.reverseHostname tolerates null is not visible here, so
          // it is never called with one.
          String uri = warcRecord.header.warcTargetUriStr;
          String key = (uri == null) ? null : Util.reverseHostname(uri);

          Payload payload = warcRecord.getPayload();
          if (payload == null) {
            continue;
          }

          // The content type is scoped to this record so a record without an HTTP header
          // cannot inherit the type of the previous one.
          String type = null;
          InputStream payloadStream;
          HttpHeader httpHeader = warcRecord.getHttpHeader();
          if (httpHeader != null) {
            payloadStream = httpHeader.getPayloadInputStream();
            type = httpHeader.contentType;
          } else {
            payloadStream = payload.getInputStreamComplete();
          }

          if (payloadStream == null) {
            skipped++;
            continue;
          }

          // Cheap early rejection; available() is only an estimate, so the exact length is
          // checked again once the content has been read.
          if (payloadStream.available() > MAX_CONTENT_SIZE) {
            skipped++;
            continue;
          }

          if (key == null) {
            skipped++;
            continue;
          }

          // Only responses are stored. Checking before the read avoids buffering payloads
          // that would be discarded anyway. The header is looked up twice to stay null-safe.
          boolean isResponse =
              warcRecord.getHeader("WARC-Type") != null
                  && "response".equalsIgnoreCase(warcRecord.getHeader("WARC-Type").value);
          if (!isResponse) {
            continue;
          }

          byte[] content = IOUtils.toByteArray(payloadStream);
          if (content.length > MAX_CONTENT_SIZE) {
            skipped++;
            continue;
          }

          if (type == null) {
            type = "text/plain";
          }

          String date = warcRecord.header.warcDateStr;
          if (hbaseManager.addRecord(key, date, content, type)) {
            cnt++;
            // Report progress once per 10000 stored records.
            if (cnt % 10000 == 0) {
              LOG.info("Ingested " + cnt + " records to Hbase.");
            }
          } else {
            skipped++;
          }
        }
      } finally {
        warcReader.close();
      }
    }
  }