private void ingestArcFile(File inputArcFile) throws IOException { ArcRecordBase record = null; String url = null; String date = null; byte[] content = null; String type = null; String key = null; InputStream in = new FileInputStream(inputArcFile); ArcReader reader = ArcReaderFactory.getReader(in); while ((record = reader.getNextRecord()) != null) { url = record.getUrlStr(); date = record.getArchiveDateStr(); content = IOUtils.toByteArray(record.getPayloadContent()); key = Util.reverseHostname(url); type = record.getContentTypeStr(); if (key != null && type == null) { type = "text/plain"; } if (key == null) { continue; } if (content.length > MAX_CONTENT_SIZE) { skipped++; } else { if (cnt % 10000 == 0 && cnt > 0) { LOG.info("Ingested " + cnt + "records to Hbase."); } if (hbaseManager.addRecord(key, date, content, type)) { cnt++; } else { skipped++; } } } // TODO: properly close streams. reader.close(); in.close(); }
private void ingestWarcFile(File inputWarcFile) throws IOException { WarcRecord warcRecord = null; String uri = null; String date = null; String type = null; byte[] content = null; String key = null; GZIPInputStream gzInputStream = new GZIPInputStream(new FileInputStream(inputWarcFile)); ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(new BufferedInputStream(gzInputStream, 8192), 32); WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(pbin); if (warcReader == null) { // TODO: LOG? LOG.info("Can't read warc file " + inputWarcFile.getName()); return; } warcReader.setWarcTargetUriProfile(uriProfile); warcReader.setBlockDigestEnabled(bBlockDigestEnabled); warcReader.setPayloadDigestEnabled(bPayloadDigestEnabled); warcReader.setRecordHeaderMaxSize(recordHeaderMaxSize); warcReader.setPayloadHeaderMaxSize(payloadHeaderMaxSize); while ((warcRecord = warcReader.getNextRecord()) != null) { uri = warcRecord.header.warcTargetUriStr; key = Util.reverseHostname(uri); Payload payload = warcRecord.getPayload(); HttpHeader httpHeader = null; InputStream payloadStream = null; // TODO: change int this: if (payload == null) { continue; } httpHeader = warcRecord.getHttpHeader(); if (httpHeader != null) { payloadStream = httpHeader.getPayloadInputStream(); type = httpHeader.contentType; } else { payloadStream = payload.getInputStreamComplete(); } if (payloadStream == null) { skipped++; continue; } date = warcRecord.header.warcDateStr; if (payloadStream.available() > MAX_CONTENT_SIZE) { skipped++; continue; } content = IOUtils.toByteArray(payloadStream); // TODO: fix this if (key == null) { skipped++; continue; } if (type == null) { type = "text/plain"; } if (warcRecord.getHeader("WARC-Type").value.toLowerCase().equals("response")) { if (content.length > MAX_CONTENT_SIZE) { skipped++; continue; } if (cnt % 10000 == 0 && cnt > 0) { LOG.info("Ingested " + cnt + "records to Hbase."); } if (hbaseManager.addRecord(key, date, content, type)) { cnt++; } else { skipped++; } } } // TODO: properly close streams. warcReader.close(); pbin.close(); gzInputStream.close(); }