/**
 * Reads every record of the test resource named by {@code warcFile} and
 * compares the number of diagnostic errors and warnings reported for each
 * record with {@code expected_errors} and {@code expected_warnings} at the
 * record's index. Finally the number of records read is compared with
 * {@code expected_records}.
 *
 * <p>When the system property {@code jwat.debug.output} is set, each record,
 * its errors and a final status line are printed through
 * {@code TestBaseUtils}.
 */
@Test
public void test_digest_fields() {
    boolean bDebugOutput = System.getProperty("jwat.debug.output") != null;

    int records = 0;
    int errors = 0;
    int warnings = 0;

    try {
        InputStream in = TestHelpers.getTestResourceAsStream(warcFile);
        try {
            WarcReader reader = WarcReaderFactory.getReader(in);
            try {
                WarcRecord record;
                while ((record = reader.getNextRecord()) != null) {
                    if (bDebugOutput) {
                        TestBaseUtils.printRecord(record);
                        TestBaseUtils.printRecordErrors(record);
                    }

                    // Diagnostics are inspected only after the record has been closed.
                    record.close();

                    // The counters are per record: they are reset here because the
                    // expected values are indexed by record number.
                    errors = 0;
                    warnings = 0;
                    if (record.diagnostics.hasErrors()) {
                        errors += record.diagnostics.getErrors().size();
                    }
                    if (record.diagnostics.hasWarnings()) {
                        warnings += record.diagnostics.getWarnings().size();
                    }

                    Assert.assertEquals(expected_errors[records], errors);
                    Assert.assertEquals(expected_warnings[records], warnings);

                    ++records;
                }
            } finally {
                // Release the reader even when an assertion above fails.
                if (reader != null) {
                    reader.close();
                }
            }
        } finally {
            // Release the underlying stream on every exit path.
            if (in != null) {
                in.close();
            }
        }

        if (bDebugOutput) {
            // NOTE(review): errors/warnings hold the counts of the last record
            // only, because they are reset on every iteration.
            TestBaseUtils.printStatus(records, errors, warnings);
        }
    } catch (FileNotFoundException e) {
        Assert.fail("Input file missing");
    } catch (IOException e) {
        Assert.fail("Unexpected i/o exception");
    }

    Assert.assertEquals(expected_records, records);
}
private void validateWarcFile(File tmpWarcFile) throws FileNotFoundException, IOException { // Validate warc records using jwat InputStream is = new FileInputStream(tmpWarcFile); ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(new BufferedInputStream(is, 8192), 16); org.jwat.warc.WarcReader warcReader = WarcReaderFactory.getReader(pbin); Iterator<org.jwat.warc.WarcRecord> warcIterator = warcReader.iterator(); int recordCounter = 0; while (warcIterator.hasNext()) { recordCounter++; org.jwat.warc.WarcRecord warcRecord = warcIterator.next(); InputStream payloadIs = warcRecord.getPayloadContent(); switch (recordCounter) { case 1: // header assertEquals("warcinfo", warcRecord.getHeader("WARC-Type").value); assertEquals("application/warc-fields", warcRecord.getHeader("Content-Type").value); assertEquals("133", warcRecord.getHeader("Content-Length").value); // payload String arcHeader = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue( "header start not as expected", arcHeader.startsWith( "software: JWAT Version 1.0.0 https://sbforge.org/display/JWAT/JWAT-Tools\n")); assertTrue( "header end not as expected", arcHeader.endsWith("description: migrated from ARC format: WARC file version 1.0")); break; case 2: // header assertEquals("metadata", warcRecord.getHeader("WARC-Type").value); assertEquals("1190", warcRecord.getHeader("Content-Length").value); assertEquals("text/plain", warcRecord.getHeader("Content-Type").value); // payload String oldArcInfoRecord = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue( oldArcInfoRecord.startsWith( "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>")); assertTrue(oldArcInfoRecord.endsWith("</arcmetadata>\n")); break; case 3: // header assertEquals("resource", warcRecord.getHeader("WARC-Type").value); assertEquals("text/dns", warcRecord.getHeader("Content-Type").value); assertEquals("57", 
warcRecord.getHeader("Content-Length").value); // payload String dns = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue(dns.startsWith("20130522085319")); assertTrue(dns.endsWith("fue-l.onb1.ac.at.\t3600\tIN\tA\t172.16.14.151\n")); break; case 4: // header assertEquals("response", warcRecord.getHeader("WARC-Type").value); assertEquals("text/html", warcRecord.getHeader("Content-Type").value); assertEquals("287", warcRecord.getHeader("Content-Length").value); // payload String robots = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue(robots.startsWith("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">")); assertTrue(robots.endsWith("</body></html>\n")); break; case 5: // header assertEquals("response", warcRecord.getHeader("WARC-Type").value); assertEquals("text/html", warcRecord.getHeader("Content-Type").value); assertEquals("164", warcRecord.getHeader("Content-Length").value); // payload String html = new String(IOUtils.inputStreamToByteArray(payloadIs), Charset.forName("UTF-8")); assertTrue(html.startsWith("<html>")); assertTrue(html.endsWith("</html>\n\n")); break; case 6: // header assertEquals("response", warcRecord.getHeader("WARC-Type").value); assertEquals("image/png", warcRecord.getHeader("Content-Type").value); assertEquals("607", warcRecord.getHeader("Content-Length").value); break; } } assertEquals(6, recordCounter); }