Пример #1
0
  /**
   * Reads every record of the WARC test resource named by {@code warcFile} and checks, record by
   * record, that the number of diagnostics errors and warnings matches {@code expected_errors} and
   * {@code expected_warnings}; finally checks the record count against {@code expected_records}.
   *
   * <p>Setting the system property {@code jwat.debug.output} enables printing of each record, its
   * errors and a final status line.
   */
  @Test
  public void test_digest_fields() {
    boolean bDebugOutput = System.getProperty("jwat.debug.output") != null;

    InputStream in = null;
    WarcReader reader = null;

    int records = 0;
    int errors = 0;
    int warnings = 0;

    try {
      in = TestHelpers.getTestResourceAsStream(warcFile);
      reader = WarcReaderFactory.getReader(in);
      WarcRecord record;

      while ((record = reader.getNextRecord()) != null) {
        if (bDebugOutput) {
          TestBaseUtils.printRecord(record);
          TestBaseUtils.printRecordErrors(record);
        }

        // Diagnostics are inspected only after the record has been closed.
        // NOTE(review): presumably closing is what completes payload/digest validation - confirm.
        record.close();

        // Counters are per record: they are compared against the expected value for this index.
        errors = 0;
        warnings = 0;
        if (record.diagnostics.hasErrors()) {
          errors += record.diagnostics.getErrors().size();
        }
        if (record.diagnostics.hasWarnings()) {
          warnings += record.diagnostics.getWarnings().size();
        }

        Assert.assertEquals(expected_errors[records], errors);
        Assert.assertEquals(expected_warnings[records], warnings);

        ++records;
      }

      if (bDebugOutput) {
        // errors/warnings hold the counts of the last record read at this point.
        TestBaseUtils.printStatus(records, errors, warnings);
      }
    } catch (FileNotFoundException e) {
      Assert.fail("Input file missing: " + e);
    } catch (IOException e) {
      Assert.fail("Unexpected i/o exception: " + e);
    } finally {
      // Release the reader and the stream even when an assertion or exception aborts the loop.
      try {
        if (reader != null) {
          reader.close();
        }
        if (in != null) {
          in.close();
        }
      } catch (IOException ignored) {
        // Best-effort cleanup: a close failure must not mask the real test outcome.
      }
    }

    Assert.assertEquals(expected_records, records);
  }
Пример #2
0
 /**
  * Validates the given WARC file with the JWAT reader: iterates over all records, checks the
  * {@code WARC-Type}, {@code Content-Type} and {@code Content-Length} headers of records 1-6,
  * checks how the payload of records 1-5 starts and ends, and asserts that the file holds exactly
  * six records.
  *
  * @param tmpWarcFile the WARC file to validate
  * @throws FileNotFoundException if {@code tmpWarcFile} cannot be opened
  * @throws IOException if reading from or closing the file fails
  */
 private void validateWarcFile(File tmpWarcFile) throws FileNotFoundException, IOException {
   // Validate warc records using jwat
   Charset utf8 = Charset.forName("UTF-8");
   InputStream is = new FileInputStream(tmpWarcFile);
   try {
     ByteCountingPushBackInputStream pbin =
         new ByteCountingPushBackInputStream(new BufferedInputStream(is, 8192), 16);
     org.jwat.warc.WarcReader warcReader = WarcReaderFactory.getReader(pbin);
     try {
       Iterator<org.jwat.warc.WarcRecord> warcIterator = warcReader.iterator();
       int recordCounter = 0;
       while (warcIterator.hasNext()) {
         recordCounter++;
         org.jwat.warc.WarcRecord warcRecord = warcIterator.next();
         InputStream payloadIs = warcRecord.getPayloadContent();
         // Expectations depend on the 1-based position of the record in the file.
         switch (recordCounter) {
           case 1:
             // header
             assertEquals("warcinfo", warcRecord.getHeader("WARC-Type").value);
             assertEquals("application/warc-fields", warcRecord.getHeader("Content-Type").value);
             assertEquals("133", warcRecord.getHeader("Content-Length").value);
             // payload
             String arcHeader = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
             assertTrue(
                 "header start not as expected",
                 arcHeader.startsWith(
                     "software: JWAT Version 1.0.0 https://sbforge.org/display/JWAT/JWAT-Tools\n"));
             assertTrue(
                 "header end not as expected",
                 arcHeader.endsWith(
                     "description: migrated from ARC format: WARC file version 1.0"));
             break;
           case 2:
             // header
             assertEquals("metadata", warcRecord.getHeader("WARC-Type").value);
             assertEquals("1190", warcRecord.getHeader("Content-Length").value);
             assertEquals("text/plain", warcRecord.getHeader("Content-Type").value);
             // payload
             String oldArcInfoRecord =
                 new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
             assertTrue(
                 oldArcInfoRecord.startsWith(
                     "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>"));
             assertTrue(oldArcInfoRecord.endsWith("</arcmetadata>\n"));
             break;
           case 3:
             // header
             assertEquals("resource", warcRecord.getHeader("WARC-Type").value);
             assertEquals("text/dns", warcRecord.getHeader("Content-Type").value);
             assertEquals("57", warcRecord.getHeader("Content-Length").value);
             // payload
             String dns = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
             assertTrue(dns.startsWith("20130522085319"));
             assertTrue(dns.endsWith("fue-l.onb1.ac.at.\t3600\tIN\tA\t172.16.14.151\n"));
             break;
           case 4:
             // header
             assertEquals("response", warcRecord.getHeader("WARC-Type").value);
             assertEquals("text/html", warcRecord.getHeader("Content-Type").value);
             assertEquals("287", warcRecord.getHeader("Content-Length").value);
             // payload
             String robots = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
             assertTrue(
                 robots.startsWith("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">"));
             assertTrue(robots.endsWith("</body></html>\n"));
             break;
           case 5:
             // header
             assertEquals("response", warcRecord.getHeader("WARC-Type").value);
             assertEquals("text/html", warcRecord.getHeader("Content-Type").value);
             assertEquals("164", warcRecord.getHeader("Content-Length").value);
             // payload
             String html = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
             assertTrue(html.startsWith("<html>"));
             assertTrue(html.endsWith("</html>\n\n"));
             break;
           case 6:
             // header only: the image payload is not inspected
             assertEquals("response", warcRecord.getHeader("WARC-Type").value);
             assertEquals("image/png", warcRecord.getHeader("Content-Type").value);
             assertEquals("607", warcRecord.getHeader("Content-Length").value);
             break;
         }
       }
       assertEquals(6, recordCounter);
     } finally {
       // Release the reader even when an assertion fails part-way through the file.
       warcReader.close();
     }
   } finally {
     // Always release the file handle, including when the reader could not be created.
     is.close();
   }
 }