Beispiel #1
0
  /**
   * Reads every record of an ARC file and stores it through {@code hbaseManager}.
   *
   * <p>Records whose URL cannot be turned into a row key by
   * {@code Util.reverseHostname} are ignored without being counted. Records
   * without a payload, records larger than {@code MAX_CONTENT_SIZE} and records
   * rejected by {@code hbaseManager.addRecord} increment {@code skipped};
   * successfully stored records increment {@code cnt}.
   *
   * @param inputArcFile the ARC file to ingest
   * @throws IOException if the file cannot be opened or a record cannot be read
   */
  private void ingestArcFile(File inputArcFile) throws IOException {
    InputStream in = new FileInputStream(inputArcFile);
    try {
      ArcReader reader = ArcReaderFactory.getReader(in);
      try {
        ArcRecordBase record;
        while ((record = reader.getNextRecord()) != null) {
          // Resolve the row key first so records we cannot store are dropped
          // before their payload is buffered in memory.
          String key = Util.reverseHostname(record.getUrlStr());
          if (key == null) {
            continue;
          }

          // Defensive: a null payload stream would otherwise make
          // IOUtils.toByteArray fail and abort the whole file.
          // NOTE(review): assumes getPayloadContent() may return null for
          // payload-less records - confirm against the JWAT version in use.
          InputStream payloadStream = record.getPayloadContent();
          if (payloadStream == null) {
            skipped++;
            continue;
          }

          byte[] content = IOUtils.toByteArray(payloadStream);
          if (content.length > MAX_CONTENT_SIZE) {
            skipped++;
            continue;
          }

          String type = record.getContentTypeStr();
          if (type == null) {
            type = "text/plain";
          }

          if (hbaseManager.addRecord(key, record.getArchiveDateStr(), content, type)) {
            cnt++;
            // Log right after the increment so each multiple of 10000 is
            // reported exactly once, even when later records are rejected.
            if (cnt % 10000 == 0) {
              LOG.info("Ingested " + cnt + " records to Hbase.");
            }
          } else {
            skipped++;
          }
        }
      } finally {
        reader.close();
      }
    } finally {
      // Closing again is harmless if the reader already closed the stream;
      // this guarantees the file handle is released on every exit path.
      in.close();
    }
  }
Beispiel #2
0
  /**
   * Reads a gzip-compressed WARC file and stores its {@code response} records
   * through {@code hbaseManager}.
   *
   * <p>If no reader can be created for the file, a message is logged and the
   * method returns without ingesting anything. All streams opened here are
   * closed on every exit path, including the early return and exceptions.
   *
   * @param inputWarcFile the gzip-compressed WARC file to ingest
   * @throws IOException if the file cannot be opened or a record cannot be read
   */
  private void ingestWarcFile(File inputWarcFile) throws IOException {
    InputStream fileIn = new FileInputStream(inputWarcFile);
    try {
      // Opened separately from the file stream so the file handle is still
      // released if the gzip header is invalid and this constructor throws.
      GZIPInputStream gzInputStream = new GZIPInputStream(fileIn);
      try {
        ByteCountingPushBackInputStream pbin =
            new ByteCountingPushBackInputStream(new BufferedInputStream(gzInputStream, 8192), 32);
        WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(pbin);
        if (warcReader == null) {
          LOG.info("Can't read warc file " + inputWarcFile.getName());
          return;
        }

        try {
          warcReader.setWarcTargetUriProfile(uriProfile);
          warcReader.setBlockDigestEnabled(bBlockDigestEnabled);
          warcReader.setPayloadDigestEnabled(bPayloadDigestEnabled);
          warcReader.setRecordHeaderMaxSize(recordHeaderMaxSize);
          warcReader.setPayloadHeaderMaxSize(payloadHeaderMaxSize);

          ingestWarcRecords(warcReader);
        } finally {
          warcReader.close();
        }
      } finally {
        gzInputStream.close();
      }
    } finally {
      // Harmless if the gzip stream already closed the underlying file.
      fileIn.close();
    }
  }

  /**
   * Iterates over all records of {@code warcReader} and stores the
   * {@code response} records, updating the {@code cnt} and {@code skipped}
   * counters.
   *
   * <p>Records without a payload and non-response records are ignored without
   * being counted. Records with no payload stream, no row key, or content
   * larger than {@code MAX_CONTENT_SIZE}, and records rejected by
   * {@code hbaseManager.addRecord}, increment {@code skipped}.
   */
  private void ingestWarcRecords(WarcReader warcReader) throws IOException {
    WarcRecord warcRecord;
    while ((warcRecord = warcReader.getNextRecord()) != null) {
      String key = Util.reverseHostname(warcRecord.header.warcTargetUriStr);

      Payload payload = warcRecord.getPayload();
      if (payload == null) {
        continue;
      }

      // Declared per record: a record without an HTTP header must not inherit
      // the content type of the record that preceded it.
      String type = null;
      InputStream payloadStream;
      HttpHeader httpHeader = warcRecord.getHttpHeader();
      if (httpHeader != null) {
        payloadStream = httpHeader.getPayloadInputStream();
        type = httpHeader.contentType;
      } else {
        payloadStream = payload.getInputStreamComplete();
      }
      if (payloadStream == null) {
        skipped++;
        continue;
      }

      // Cheap pre-filter only: available() is an estimate, so the exact size
      // is checked again once the content has been read.
      if (payloadStream.available() > MAX_CONTENT_SIZE) {
        skipped++;
        continue;
      }

      if (key == null) {
        skipped++;
        continue;
      }

      // Only response records are stored; test this before buffering the
      // payload so other record types are never loaded into memory.
      // NOTE(review): assumes getHeader() returns null when the header line is
      // absent - the null guard avoids aborting the file on such a record.
      if (warcRecord.getHeader("WARC-Type") == null
          || !"response".equalsIgnoreCase(warcRecord.getHeader("WARC-Type").value)) {
        continue;
      }

      byte[] content = IOUtils.toByteArray(payloadStream);
      if (content.length > MAX_CONTENT_SIZE) {
        skipped++;
        continue;
      }

      if (type == null) {
        type = "text/plain";
      }

      if (hbaseManager.addRecord(key, warcRecord.header.warcDateStr, content, type)) {
        cnt++;
        // Log right after the increment so each multiple of 10000 is reported
        // exactly once, even when later records are rejected.
        if (cnt % 10000 == 0) {
          LOG.info("Ingested " + cnt + " records to Hbase.");
        }
      } else {
        skipped++;
      }
    }
  }