/**
 * Ingests every gzipped ARC/WARC file in {@code inputFolder}, starting at index {@code i}
 * of the directory listing (allows resuming a partial run). Resets the {@code cnt}/
 * {@code skipped} counters, dispatches each .gz file to {@link #ingestArcFile} or
 * {@code ingestWarcFile} based on sniffed content, and logs throughput at the end.
 *
 * @param inputFolder directory containing .gz archive files
 * @param i           zero-based index in the directory listing to resume from
 * @throws IOException if the folder cannot be listed or an archive cannot be read
 */
private void ingestFolder(File inputFolder, int i) throws IOException {
    long startTime = System.currentTimeMillis();
    cnt = 0;
    skipped = 0;
    // List the directory ONCE. The original called listFiles() twice per loop
    // iteration (re-scanning the directory each time) and would NPE if the
    // path is not a directory or an I/O error occurs (listFiles() returns null).
    File[] files = inputFolder.listFiles();
    if (files == null) {
        throw new IOException("Cannot list directory: " + inputFolder);
    }
    for (; i < files.length; i++) {
        File inputFile = files[i];
        LOG.info("processing file " + i + ": " + inputFile.getName());
        if (!inputFile.toString().toLowerCase().endsWith(".gz")) {
            continue; // non-gzip entries are silently ignored, as before
        }
        // try-with-resources: the original leaked both streams on every iteration.
        // The sniffing stream is independent of the fresh stream each ingest
        // method opens on the same file, so closing it here is safe.
        try (GZIPInputStream gzInputStream =
                 new GZIPInputStream(new FileInputStream(inputFile));
             ByteCountingPushBackInputStream in =
                 new ByteCountingPushBackInputStream(gzInputStream, 32)) {
            if (ArcReaderFactory.isArcFile(in)) {
                ingestArcFile(inputFile);
            } else if (WarcReaderFactory.isWarcFile(in)) {
                ingestWarcFile(inputFile);
            }
        }
    }
    long totalTime = System.currentTimeMillis() - startTime;
    LOG.info("Total " + cnt + " records inserted, " + skipped + " records skipped");
    LOG.info("Total time: " + totalTime + "ms");
    // Guard against divide-by-zero: integer division made (totalTime / 1000)
    // zero for any run shorter than one second, crashing the summary log line.
    long seconds = Math.max(1, totalTime / 1000);
    LOG.info("Ingest rate: " + cnt / seconds + " records per second.");
}
private void ingestArcFile(File inputArcFile) throws IOException { ArcRecordBase record = null; String url = null; String date = null; byte[] content = null; String type = null; String key = null; InputStream in = new FileInputStream(inputArcFile); ArcReader reader = ArcReaderFactory.getReader(in); while ((record = reader.getNextRecord()) != null) { url = record.getUrlStr(); date = record.getArchiveDateStr(); content = IOUtils.toByteArray(record.getPayloadContent()); key = Util.reverseHostname(url); type = record.getContentTypeStr(); if (key != null && type == null) { type = "text/plain"; } if (key == null) { continue; } if (content.length > MAX_CONTENT_SIZE) { skipped++; } else { if (cnt % 10000 == 0 && cnt > 0) { LOG.info("Ingested " + cnt + "records to Hbase."); } if (hbaseManager.addRecord(key, date, content, type)) { cnt++; } else { skipped++; } } } // TODO: properly close streams. reader.close(); in.close(); }