Beispiel #1
0
  /**
   * Ingests the gzip-compressed ARC and WARC files found in {@code inputFolder}, starting at
   * position {@code i} of the folder listing, then logs record totals, elapsed time and
   * throughput.
   *
   * <p>The {@code cnt} and {@code skipped} counters are reset to zero before processing. Files
   * whose path does not end in {@code .gz} (case-insensitive), and gzip files recognised as
   * neither ARC nor WARC, are ignored.
   *
   * @param inputFolder directory holding the archive files to ingest
   * @param i index into the folder listing at which processing starts
   * @throws IOException if the folder cannot be listed, or a file cannot be opened or read
   */
  private void ingestFolder(File inputFolder, int i) throws IOException {
    long startTime = System.currentTimeMillis();
    cnt = 0;
    skipped = 0;

    // List the folder once: listFiles() re-reads the directory on every call and returns
    // null when the path is not a directory or cannot be read.
    File[] inputFiles = inputFolder.listFiles();
    if (inputFiles == null) {
      throw new IOException("Cannot list input folder: " + inputFolder);
    }

    for (; i < inputFiles.length; i++) {
      File inputFile = inputFiles[i];
      LOG.info("processing file " + i + ": " + inputFile.getName());

      if (!inputFile.toString().toLowerCase().endsWith(".gz")) {
        continue;
      }

      // Sniff the archive format on a short-lived stream and close it before ingesting,
      // so the probe never leaks a file handle (even if the gzip header is invalid).
      boolean isArc;
      boolean isWarc;
      try (FileInputStream fileStream = new FileInputStream(inputFile);
          GZIPInputStream gzInputStream = new GZIPInputStream(fileStream)) {
        ByteCountingPushBackInputStream in = new ByteCountingPushBackInputStream(gzInputStream, 32);
        isArc = ArcReaderFactory.isArcFile(in);
        // Only probe for WARC when the file is not ARC, as the original if/else-if did.
        isWarc = !isArc && WarcReaderFactory.isWarcFile(in);
      }

      if (isArc) {
        ingestArcFile(inputFile);
      } else if (isWarc) {
        ingestWarcFile(inputFile);
      }
    }

    long totalTime = System.currentTimeMillis() - startTime;
    // Compute the rate from milliseconds and clamp the divisor: dividing by whole seconds
    // threw ArithmeticException for any run shorter than one second.
    long recordsPerSecond = cnt * 1000L / Math.max(1L, totalTime);
    LOG.info("Total " + cnt + " records inserted, " + skipped + " records skipped");
    LOG.info("Total time: " + totalTime + "ms");
    LOG.info("Ingest rate: " + recordsPerSecond + " records per second.");
  }
Beispiel #2
0
  /**
   * Reads every record of the given ARC file and stores it through {@code hbaseManager}.
   *
   * <p>For each record the row key is derived from the record URL via
   * {@code Util.reverseHostname}; records for which no key can be derived are ignored without
   * being counted. Records whose content exceeds {@code MAX_CONTENT_SIZE}, or that
   * {@code hbaseManager.addRecord} rejects, increment {@code skipped}; successfully stored
   * records increment {@code cnt}. A missing content type defaults to {@code text/plain}.
   *
   * @param inputArcFile the ARC file to ingest
   * @throws IOException if the file cannot be opened or a record cannot be read
   */
  private void ingestArcFile(File inputArcFile) throws IOException {
    // try-with-resources plus finally guarantee that both the reader and the file stream
    // are released even when reading or storing a record throws.
    try (InputStream in = new FileInputStream(inputArcFile)) {
      ArcReader reader = ArcReaderFactory.getReader(in);
      try {
        ArcRecordBase record;
        while ((record = reader.getNextRecord()) != null) {
          String url = record.getUrlStr();
          String date = record.getArchiveDateStr();
          // NOTE(review): assumes getPayloadContent() can return null for a record without
          // a payload -- confirm against the JWAT API. Such records are counted as skipped
          // below instead of aborting the whole file with a NullPointerException.
          InputStream payload = record.getPayloadContent();
          byte[] content = (payload != null) ? IOUtils.toByteArray(payload) : null;
          String key = Util.reverseHostname(url);
          String type = record.getContentTypeStr();

          if (key == null) {
            continue;
          }
          if (type == null) {
            type = "text/plain";
          }

          if (content == null || content.length > MAX_CONTENT_SIZE) {
            skipped++;
            continue;
          }

          if (hbaseManager.addRecord(key, date, content, type)) {
            cnt++;
            // Report progress only right after a successful insert, so the message is not
            // repeated while cnt sits on a multiple of 10000 and later inserts fail.
            if (cnt % 10000 == 0) {
              LOG.info("Ingested " + cnt + " records to Hbase.");
            }
          } else {
            skipped++;
          }
        }
      } finally {
        reader.close();
      }
    }
  }