/**
 * Ingests the gzip-compressed ARC/WARC files of a folder, starting at a given listing index.
 *
 * <p>Resets the {@code cnt} and {@code skipped} counters, then walks the folder listing from
 * index {@code i}. Files whose name ends in ".gz" (case-insensitive) are probed: ARC files are
 * passed to {@code ingestArcFile}, WARC files to {@code ingestWarcFile}, anything else is
 * ignored. Totals and the ingest rate are logged at the end.
 *
 * @param inputFolder folder whose direct children are processed
 * @param i index into the folder listing of the first file to process
 * @throws IOException if the folder cannot be listed or a file cannot be read
 */
private void ingestFolder(File inputFolder, int i) throws IOException {
    long startTime = System.currentTimeMillis();
    cnt = 0;
    skipped = 0;

    // Snapshot the listing once: listFiles() hits the file system on every call, may return
    // null, and gives no guarantee that two calls return the entries in the same order.
    File[] inputFiles = inputFolder.listFiles();
    if (inputFiles == null) {
        throw new IOException("Cannot list files in " + inputFolder);
    }

    for (; i < inputFiles.length; i++) {
        File inputFile = inputFiles[i];
        LOG.info("processing file " + i + ": " + inputFile.getName());
        if (!inputFile.toString().toLowerCase().endsWith(".gz")) {
            continue;
        }

        // Probe the file type on a short-lived stream and release it before ingesting;
        // the ingest methods open the file again themselves.
        boolean isArc;
        boolean isWarc = false;
        try (FileInputStream fileStream = new FileInputStream(inputFile);
                GZIPInputStream gzInputStream = new GZIPInputStream(fileStream)) {
            ByteCountingPushBackInputStream in =
                    new ByteCountingPushBackInputStream(gzInputStream, 32);
            isArc = ArcReaderFactory.isArcFile(in);
            if (!isArc) {
                isWarc = WarcReaderFactory.isWarcFile(in);
            }
        }

        if (isArc) {
            ingestArcFile(inputFile);
        } else if (isWarc) {
            ingestWarcFile(inputFile);
        }
    }

    long totalTime = System.currentTimeMillis() - startTime;
    // Work in milliseconds so that runs shorter than one second do not divide by zero.
    // A run that took 0 ms reports the record count itself as the rate.
    long recordsPerSecond = totalTime > 0 ? (cnt * 1000L) / totalTime : cnt;
    LOG.info("Total " + cnt + " records inserted, " + skipped + " records skipped");
    LOG.info("Total time: " + totalTime + "ms");
    LOG.info("Ingest rate: " + recordsPerSecond + " records per second.");
}
/** * * Reads WARC archive from given archiveFile. * * @param archiveFile * @throws IOException * @throws ParseException */ private void readWarc(String archiveFile) throws IOException, ParseException { InputStream in = new GZIPInputStream(new FileInputStream(archiveFile)); WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(in); WarcRecord record = null; // scan each WARC record until we find while ((record = warcReader.getNextRecord()) != null) { HeaderLine type = record.getHeader("WARC-Type"); if (type == null || !type.value.equals("response")) { continue; } // a "WARC-Type: response" with valid WARC-Target-URI URI requestUri = null; HeaderLine uri = record.getHeader("WARC-Target-URI"); if (uri != null) { try { requestUri = new URI(uri.value); } catch (URISyntaxException e) { e.printStackTrace(); continue; } } else { continue; } // then create a comparable Record object from WarcRecord and add to m_recordDB Record responseRecord = new Record(record); insertRecordByUri(requestUri, responseRecord); } warcReader.close(); }
/**
 * Walks every record of the WARC test resource and compares the number of diagnostics
 * errors and warnings of each record with {@code expected_errors} and
 * {@code expected_warnings}, then checks the total record count against
 * {@code expected_records}.
 *
 * <p>Setting the system property {@code jwat.debug.output} enables per-record debug printing.
 */
@Test
public void test_digest_fields() {
    final boolean debugEnabled = System.getProperty("jwat.debug.output") != null;
    int records = 0;
    int errors = 0;
    int warnings = 0;
    try {
        InputStream in = TestHelpers.getTestResourceAsStream(warcFile);
        WarcReader reader = WarcReaderFactory.getReader(in);
        for (WarcRecord record = reader.getNextRecord();
                record != null;
                record = reader.getNextRecord()) {
            if (debugEnabled) {
                TestBaseUtils.printRecord(record);
                TestBaseUtils.printRecordErrors(record);
            }
            // Diagnostics are inspected only after the record has been closed.
            record.close();
            // Counts are per record, not cumulative.
            errors = record.diagnostics.hasErrors()
                    ? record.diagnostics.getErrors().size()
                    : 0;
            warnings = record.diagnostics.hasWarnings()
                    ? record.diagnostics.getWarnings().size()
                    : 0;
            Assert.assertEquals(expected_errors[records], errors);
            Assert.assertEquals(expected_warnings[records], warnings);
            ++records;
        }
        reader.close();
        in.close();
        if (debugEnabled) {
            // Reports the counts of the last record read.
            TestBaseUtils.printStatus(records, errors, warnings);
        }
    } catch (FileNotFoundException e) {
        Assert.fail("Input file missing");
    } catch (IOException e) {
        Assert.fail("Unexpected i/o exception");
    }
    Assert.assertEquals(expected_records, records);
}
private void validateWarcFile(File tmpWarcFile) throws FileNotFoundException, IOException { // Validate warc records using jwat InputStream is = new FileInputStream(tmpWarcFile); ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(new BufferedInputStream(is, 8192), 16); org.jwat.warc.WarcReader warcReader = WarcReaderFactory.getReader(pbin); Iterator<org.jwat.warc.WarcRecord> warcIterator = warcReader.iterator(); int recordCounter = 0; while (warcIterator.hasNext()) { recordCounter++; org.jwat.warc.WarcRecord warcRecord = warcIterator.next(); InputStream payloadIs = warcRecord.getPayloadContent(); switch (recordCounter) { case 1: // header assertEquals("warcinfo", warcRecord.getHeader("WARC-Type").value); assertEquals("application/warc-fields", warcRecord.getHeader("Content-Type").value); assertEquals("133", warcRecord.getHeader("Content-Length").value); // payload String arcHeader = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue( "header start not as expected", arcHeader.startsWith( "software: JWAT Version 1.0.0 https://sbforge.org/display/JWAT/JWAT-Tools\n")); assertTrue( "header end not as expected", arcHeader.endsWith("description: migrated from ARC format: WARC file version 1.0")); break; case 2: // header assertEquals("metadata", warcRecord.getHeader("WARC-Type").value); assertEquals("1190", warcRecord.getHeader("Content-Length").value); assertEquals("text/plain", warcRecord.getHeader("Content-Type").value); // payload String oldArcInfoRecord = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue( oldArcInfoRecord.startsWith( "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>")); assertTrue(oldArcInfoRecord.endsWith("</arcmetadata>\n")); break; case 3: // header assertEquals("resource", warcRecord.getHeader("WARC-Type").value); assertEquals("text/dns", warcRecord.getHeader("Content-Type").value); assertEquals("57", 
warcRecord.getHeader("Content-Length").value); // payload String dns = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue(dns.startsWith("20130522085319")); assertTrue(dns.endsWith("fue-l.onb1.ac.at.\t3600\tIN\tA\t172.16.14.151\n")); break; case 4: // header assertEquals("response", warcRecord.getHeader("WARC-Type").value); assertEquals("text/html", warcRecord.getHeader("Content-Type").value); assertEquals("287", warcRecord.getHeader("Content-Length").value); // payload String robots = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue(robots.startsWith("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">")); assertTrue(robots.endsWith("</body></html>\n")); break; case 5: // header assertEquals("response", warcRecord.getHeader("WARC-Type").value); assertEquals("text/html", warcRecord.getHeader("Content-Type").value); assertEquals("164", warcRecord.getHeader("Content-Length").value); // payload String html = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue(html.startsWith("<html>")); assertTrue(html.endsWith("</html>\n\n")); break; case 6: // header assertEquals("response", warcRecord.getHeader("WARC-Type").value); assertEquals("image/png", warcRecord.getHeader("Content-Type").value); assertEquals("607", warcRecord.getHeader("Content-Length").value); break; } } assertEquals(6, recordCounter); }
private void ingestWarcFile(File inputWarcFile) throws IOException { WarcRecord warcRecord = null; String uri = null; String date = null; String type = null; byte[] content = null; String key = null; GZIPInputStream gzInputStream = new GZIPInputStream(new FileInputStream(inputWarcFile)); ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(new BufferedInputStream(gzInputStream, 8192), 32); WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(pbin); if (warcReader == null) { // TODO: LOG? LOG.info("Can't read warc file " + inputWarcFile.getName()); return; } warcReader.setWarcTargetUriProfile(uriProfile); warcReader.setBlockDigestEnabled(bBlockDigestEnabled); warcReader.setPayloadDigestEnabled(bPayloadDigestEnabled); warcReader.setRecordHeaderMaxSize(recordHeaderMaxSize); warcReader.setPayloadHeaderMaxSize(payloadHeaderMaxSize); while ((warcRecord = warcReader.getNextRecord()) != null) { uri = warcRecord.header.warcTargetUriStr; key = Util.reverseHostname(uri); Payload payload = warcRecord.getPayload(); HttpHeader httpHeader = null; InputStream payloadStream = null; // TODO: change int this: if (payload == null) { continue; } httpHeader = warcRecord.getHttpHeader(); if (httpHeader != null) { payloadStream = httpHeader.getPayloadInputStream(); type = httpHeader.contentType; } else { payloadStream = payload.getInputStreamComplete(); } if (payloadStream == null) { skipped++; continue; } date = warcRecord.header.warcDateStr; if (payloadStream.available() > MAX_CONTENT_SIZE) { skipped++; continue; } content = IOUtils.toByteArray(payloadStream); // TODO: fix this if (key == null) { skipped++; continue; } if (type == null) { type = "text/plain"; } if (warcRecord.getHeader("WARC-Type").value.toLowerCase().equals("response")) { if (content.length > MAX_CONTENT_SIZE) { skipped++; continue; } if (cnt % 10000 == 0 && cnt > 0) { LOG.info("Ingested " + cnt + "records to Hbase."); } if (hbaseManager.addRecord(key, date, content, type)) { cnt++; } 
else { skipped++; } } } // TODO: properly close streams. warcReader.close(); pbin.close(); gzInputStream.close(); }