/** * * Reads WARC archive from given archiveFile. * * @param archiveFile * @throws IOException * @throws ParseException */ private void readWarc(String archiveFile) throws IOException, ParseException { InputStream in = new GZIPInputStream(new FileInputStream(archiveFile)); WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(in); WarcRecord record = null; // scan each WARC record until we find while ((record = warcReader.getNextRecord()) != null) { HeaderLine type = record.getHeader("WARC-Type"); if (type == null || !type.value.equals("response")) { continue; } // a "WARC-Type: response" with valid WARC-Target-URI URI requestUri = null; HeaderLine uri = record.getHeader("WARC-Target-URI"); if (uri != null) { try { requestUri = new URI(uri.value); } catch (URISyntaxException e) { e.printStackTrace(); continue; } } else { continue; } // then create a comparable Record object from WarcRecord and add to m_recordDB Record responseRecord = new Record(record); insertRecordByUri(requestUri, responseRecord); } warcReader.close(); }
private void ingestWarcFile(File inputWarcFile) throws IOException { WarcRecord warcRecord = null; String uri = null; String date = null; String type = null; byte[] content = null; String key = null; GZIPInputStream gzInputStream = new GZIPInputStream(new FileInputStream(inputWarcFile)); ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(new BufferedInputStream(gzInputStream, 8192), 32); WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(pbin); if (warcReader == null) { // TODO: LOG? LOG.info("Can't read warc file " + inputWarcFile.getName()); return; } warcReader.setWarcTargetUriProfile(uriProfile); warcReader.setBlockDigestEnabled(bBlockDigestEnabled); warcReader.setPayloadDigestEnabled(bPayloadDigestEnabled); warcReader.setRecordHeaderMaxSize(recordHeaderMaxSize); warcReader.setPayloadHeaderMaxSize(payloadHeaderMaxSize); while ((warcRecord = warcReader.getNextRecord()) != null) { uri = warcRecord.header.warcTargetUriStr; key = Util.reverseHostname(uri); Payload payload = warcRecord.getPayload(); HttpHeader httpHeader = null; InputStream payloadStream = null; // TODO: change int this: if (payload == null) { continue; } httpHeader = warcRecord.getHttpHeader(); if (httpHeader != null) { payloadStream = httpHeader.getPayloadInputStream(); type = httpHeader.contentType; } else { payloadStream = payload.getInputStreamComplete(); } if (payloadStream == null) { skipped++; continue; } date = warcRecord.header.warcDateStr; if (payloadStream.available() > MAX_CONTENT_SIZE) { skipped++; continue; } content = IOUtils.toByteArray(payloadStream); // TODO: fix this if (key == null) { skipped++; continue; } if (type == null) { type = "text/plain"; } if (warcRecord.getHeader("WARC-Type").value.toLowerCase().equals("response")) { if (content.length > MAX_CONTENT_SIZE) { skipped++; continue; } if (cnt % 10000 == 0 && cnt > 0) { LOG.info("Ingested " + cnt + "records to Hbase."); } if (hbaseManager.addRecord(key, date, content, type)) { cnt++; } else { skipped++; } } } // TODO: properly close streams. warcReader.close(); pbin.close(); gzInputStream.close(); }