예제 #1
0
  /**
   * Ingests every {@code .gz} file in {@code inputFolder}, starting at listing index {@code i}.
   *
   * <p>Each gzip file is probed to decide whether it holds ARC or WARC data and is then handed to
   * {@code ingestArcFile} or {@code ingestWarcFile}. Files that are neither are ignored. The
   * {@code cnt} and {@code skipped} counters are reset at the start and summarised in the log at
   * the end.
   *
   * @param inputFolder directory whose files are ingested
   * @param i index into the directory listing at which processing starts
   * @throws IOException if the folder cannot be listed or a file cannot be opened or read
   */
  private void ingestFolder(File inputFolder, int i) throws IOException {
    long startTime = System.currentTimeMillis();
    cnt = 0;
    skipped = 0;

    // List once: File.listFiles() hits the file system on every call, gives no ordering
    // guarantee between calls, and returns null for a non-directory or on an I/O error.
    File[] inputFiles = inputFolder.listFiles();
    if (inputFiles == null) {
      throw new IOException("Cannot list files in " + inputFolder);
    }

    for (; i < inputFiles.length; i++) {
      File inputFile = inputFiles[i];
      LOG.info("processing file " + i + ": " + inputFile.getName());

      if (!inputFile.toString().toLowerCase().endsWith(".gz")) {
        continue;
      }

      // Probe the format on a short-lived stream and close it before ingesting;
      // the ingest methods take the File and open it themselves.
      boolean isArc = false;
      boolean isWarc = false;
      FileInputStream fileStream = new FileInputStream(inputFile);
      try {
        GZIPInputStream gzInputStream = new GZIPInputStream(fileStream);
        try {
          ByteCountingPushBackInputStream in =
              new ByteCountingPushBackInputStream(gzInputStream, 32);
          isArc = ArcReaderFactory.isArcFile(in);
          if (!isArc) {
            isWarc = WarcReaderFactory.isWarcFile(in);
          }
        } finally {
          gzInputStream.close();
        }
      } finally {
        // Also covers the case where the GZIPInputStream constructor itself failed.
        fileStream.close();
      }

      if (isArc) {
        ingestArcFile(inputFile);
      } else if (isWarc) {
        ingestWarcFile(inputFile);
      }
    }

    long totalTime = System.currentTimeMillis() - startTime;
    // Work in milliseconds and clamp to 1 ms: the former cnt / (totalTime / 1000) divided by
    // zero for any run shorter than one second.
    long rate = cnt * 1000L / Math.max(totalTime, 1L);
    LOG.info("Total " + cnt + " records inserted, " + skipped + " records skipped");
    LOG.info("Total time: " + totalTime + "ms");
    LOG.info("Ingest rate: " + rate + " records per second.");
  }
예제 #2
0
  /**
   * Reads the gzip-compressed WARC archive at {@code archiveFile}.
   *
   * <p>Every record whose {@code WARC-Type} is {@code response} and whose
   * {@code WARC-Target-URI} parses as a {@link URI} is wrapped in a {@code Record} and passed to
   * {@code insertRecordByUri}. All other records are skipped.
   *
   * @param archiveFile path of the gzip-compressed WARC file
   * @throws IOException if the file cannot be opened or read
   * @throws ParseException NOTE(review): nothing in this method throws it directly; presumably
   *     raised by {@code Record} or {@code insertRecordByUri} - confirm
   */
  private void readWarc(String archiveFile) throws IOException, ParseException {
    // 'in' always refers to the outermost stream opened so far, so the finally block
    // releases the file handle even if wrapping it in a GZIPInputStream fails.
    InputStream in = new FileInputStream(archiveFile);
    WarcReader warcReader = null;
    try {
      in = new GZIPInputStream(in);
      warcReader = WarcReaderFactory.getReaderUncompressed(in);

      WarcRecord record;
      while ((record = warcReader.getNextRecord()) != null) {
        // Only "WARC-Type: response" records are of interest.
        HeaderLine type = record.getHeader("WARC-Type");
        if (type == null || !"response".equals(type.value)) {
          continue;
        }

        // ... and only those carrying a syntactically valid WARC-Target-URI.
        HeaderLine uri = record.getHeader("WARC-Target-URI");
        if (uri == null || uri.value == null) {
          continue;
        }
        URI requestUri;
        try {
          requestUri = new URI(uri.value);
        } catch (URISyntaxException e) {
          // Best effort: report the malformed URI and move on to the next record.
          e.printStackTrace();
          continue;
        }

        // Create a comparable Record object from the WarcRecord and store it under its URI.
        Record responseRecord = new Record(record);
        insertRecordByUri(requestUri, responseRecord);
      }
    } finally {
      try {
        if (warcReader != null) {
          warcReader.close();
        }
      } finally {
        in.close();
      }
    }
  }
예제 #3
0
  /**
   * Reads every record of the {@code warcFile} test resource and compares each record's error
   * and warning counts with {@code expected_errors} / {@code expected_warnings}, then compares
   * the total number of records with {@code expected_records}.
   */
  @Test
  public void test_digest_fields() {
    final boolean debugOutput = System.getProperty("jwat.debug.output") != null;

    int records = 0;
    int errors = 0;
    int warnings = 0;

    try {
      InputStream in = TestHelpers.getTestResourceAsStream(warcFile);
      WarcReader reader = WarcReaderFactory.getReader(in);

      for (WarcRecord record = reader.getNextRecord();
          record != null;
          record = reader.getNextRecord()) {
        if (debugOutput) {
          TestBaseUtils.printRecord(record);
          TestBaseUtils.printRecordErrors(record);
        }

        // The record is closed before its diagnostics are inspected.
        record.close();

        // Counts are per record, not cumulative.
        errors = record.diagnostics.hasErrors() ? record.diagnostics.getErrors().size() : 0;
        warnings = record.diagnostics.hasWarnings() ? record.diagnostics.getWarnings().size() : 0;

        Assert.assertEquals(expected_errors[records], errors);
        Assert.assertEquals(expected_warnings[records], warnings);

        records++;
      }

      reader.close();
      in.close();

      if (debugOutput) {
        // Reports the total record count with the last record's error/warning counts.
        TestBaseUtils.printStatus(records, errors, warnings);
      }
    } catch (FileNotFoundException e) {
      Assert.fail("Input file missing");
    } catch (IOException e) {
      Assert.fail("Unexpected i/o exception");
    }

    Assert.assertEquals(expected_records, records);
  }
예제 #4
0
 /**
  * Validates a WARC file by reading it with JWAT and checking each record.
  *
  * <p>Exactly six records are expected, in this order: {@code warcinfo}, {@code metadata},
  * {@code resource} (text/dns) and three {@code response} records (two text/html, one
  * image/png). Headers are checked for all six; payload start and end are checked for the
  * first five.
  *
  * @param tmpWarcFile the WARC file to validate
  * @throws FileNotFoundException if {@code tmpWarcFile} cannot be opened
  * @throws IOException if reading the file or a payload fails
  */
 private void validateWarcFile(File tmpWarcFile) throws FileNotFoundException, IOException {
   // Validate warc records using jwat
   InputStream is = new FileInputStream(tmpWarcFile);
   org.jwat.warc.WarcReader warcReader = null;
   int recordCounter = 0;
   try {
     ByteCountingPushBackInputStream pbin =
         new ByteCountingPushBackInputStream(new BufferedInputStream(is, 8192), 16);
     warcReader = WarcReaderFactory.getReader(pbin);
     Iterator<org.jwat.warc.WarcRecord> warcIterator = warcReader.iterator();
     while (warcIterator.hasNext()) {
       recordCounter++;
       org.jwat.warc.WarcRecord warcRecord = warcIterator.next();
       InputStream payloadIs = warcRecord.getPayloadContent();
       switch (recordCounter) {
         case 1:
           // header
           assertEquals("warcinfo", warcRecord.getHeader("WARC-Type").value);
           assertEquals("application/warc-fields", warcRecord.getHeader("Content-Type").value);
           assertEquals("133", warcRecord.getHeader("Content-Length").value);
           // payload
           String arcHeader = readPayloadAsUtf8(payloadIs);
           assertTrue(
               "header start not as expected",
               arcHeader.startsWith(
                   "software: JWAT Version 1.0.0 https://sbforge.org/display/JWAT/JWAT-Tools\n"));
           assertTrue(
               "header end not as expected",
               arcHeader.endsWith("description: migrated from ARC format: WARC file version 1.0"));
           break;
         case 2:
           // header
           assertEquals("metadata", warcRecord.getHeader("WARC-Type").value);
           assertEquals("1190", warcRecord.getHeader("Content-Length").value);
           assertEquals("text/plain", warcRecord.getHeader("Content-Type").value);
           // payload
           String oldArcInfoRecord = readPayloadAsUtf8(payloadIs);
           assertTrue(
               oldArcInfoRecord.startsWith(
                   "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>"));
           assertTrue(oldArcInfoRecord.endsWith("</arcmetadata>\n"));
           break;
         case 3:
           // header
           assertEquals("resource", warcRecord.getHeader("WARC-Type").value);
           assertEquals("text/dns", warcRecord.getHeader("Content-Type").value);
           assertEquals("57", warcRecord.getHeader("Content-Length").value);
           // payload
           String dns = readPayloadAsUtf8(payloadIs);
           assertTrue(dns.startsWith("20130522085319"));
           assertTrue(dns.endsWith("fue-l.onb1.ac.at.\t3600\tIN\tA\t172.16.14.151\n"));
           break;
         case 4:
           // header
           assertEquals("response", warcRecord.getHeader("WARC-Type").value);
           assertEquals("text/html", warcRecord.getHeader("Content-Type").value);
           assertEquals("287", warcRecord.getHeader("Content-Length").value);
           // payload
           String robots = readPayloadAsUtf8(payloadIs);
           assertTrue(robots.startsWith("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">"));
           assertTrue(robots.endsWith("</body></html>\n"));
           break;
         case 5:
           // header
           assertEquals("response", warcRecord.getHeader("WARC-Type").value);
           assertEquals("text/html", warcRecord.getHeader("Content-Type").value);
           assertEquals("164", warcRecord.getHeader("Content-Length").value);
           // payload
           String html = readPayloadAsUtf8(payloadIs);
           assertTrue(html.startsWith("<html>"));
           assertTrue(html.endsWith("</html>\n\n"));
           break;
         case 6:
           // header only; the binary PNG payload is not inspected
           assertEquals("response", warcRecord.getHeader("WARC-Type").value);
           assertEquals("image/png", warcRecord.getHeader("Content-Type").value);
           assertEquals("607", warcRecord.getHeader("Content-Length").value);
           break;
       }
     }
   } finally {
     // Release the reader and the file handle even when an assertion fails, so the
     // file is not left open.
     try {
       if (warcReader != null) {
         warcReader.close();
       }
     } finally {
       is.close();
     }
   }
   assertEquals(6, recordCounter);
 }

 /** Reads the whole payload stream and decodes it as UTF-8 text. */
 private static String readPayloadAsUtf8(InputStream payload) throws IOException {
   return new String(IOUtils.inputStreamToByteArray(payload), Charset.forName("UTF-8"));
 }
예제 #5
0
  /**
   * Ingests the {@code response} records of one gzip-compressed WARC file via
   * {@code hbaseManager}.
   *
   * <p>For each record the row key is derived from the target URI with
   * {@code Util.reverseHostname}. The content is taken from the HTTP payload when an HTTP header
   * is present, otherwise from the complete record payload. Successful inserts increment
   * {@code cnt}. Records with no readable payload stream, no key, content over
   * {@code MAX_CONTENT_SIZE}, or a failed insert increment {@code skipped}. Records without any
   * payload and non-response records are ignored without being counted.
   *
   * @param inputWarcFile gzip-compressed WARC file to ingest
   * @throws IOException if the file cannot be opened or a record cannot be read
   */
  private void ingestWarcFile(File inputWarcFile) throws IOException {
    // 'in' always refers to the outermost stream opened so far, so the finally block
    // releases the file handle on every exit path (early return, exception, normal end).
    InputStream in = new FileInputStream(inputWarcFile);
    WarcReader warcReader = null;
    try {
      GZIPInputStream gzInputStream = new GZIPInputStream(in);
      in = gzInputStream;
      ByteCountingPushBackInputStream pbin =
          new ByteCountingPushBackInputStream(new BufferedInputStream(gzInputStream, 8192), 32);
      in = pbin;
      warcReader = WarcReaderFactory.getReaderUncompressed(pbin);

      if (warcReader == null) {
        // TODO: LOG?
        LOG.info("Can't read warc file " + inputWarcFile.getName());
        return;
      }

      warcReader.setWarcTargetUriProfile(uriProfile);
      warcReader.setBlockDigestEnabled(bBlockDigestEnabled);
      warcReader.setPayloadDigestEnabled(bPayloadDigestEnabled);
      warcReader.setRecordHeaderMaxSize(recordHeaderMaxSize);
      warcReader.setPayloadHeaderMaxSize(payloadHeaderMaxSize);

      WarcRecord warcRecord;
      while ((warcRecord = warcReader.getNextRecord()) != null) {
        String uri = warcRecord.header.warcTargetUriStr;
        String key = Util.reverseHostname(uri);

        Payload payload = warcRecord.getPayload();
        if (payload == null) {
          continue;
        }

        // The content type is scoped to this record so that a record without an HTTP header
        // cannot inherit the type of an earlier record.
        String type = null;
        InputStream payloadStream;
        HttpHeader httpHeader = warcRecord.getHttpHeader();
        if (httpHeader != null) {
          payloadStream = httpHeader.getPayloadInputStream();
          type = httpHeader.contentType;
        } else {
          payloadStream = payload.getInputStreamComplete();
        }

        if (payloadStream == null) {
          skipped++;
          continue;
        }

        // NOTE(review): InputStream.available() is only an estimate of the bytes readable
        // without blocking, not the payload size; the check on content.length below is the
        // authoritative one.
        if (payloadStream.available() > MAX_CONTENT_SIZE) {
          skipped++;
          continue;
        }

        // TODO: fix this
        if (key == null) {
          skipped++;
          continue;
        }

        // Only response records are stored. Checking this before buffering avoids reading
        // payloads that would be thrown away, and tolerates a missing WARC-Type header.
        if (warcRecord.getHeader("WARC-Type") == null
            || !"response".equalsIgnoreCase(warcRecord.getHeader("WARC-Type").value)) {
          continue;
        }

        byte[] content = IOUtils.toByteArray(payloadStream);
        if (content.length > MAX_CONTENT_SIZE) {
          skipped++;
          continue;
        }

        if (type == null) {
          type = "text/plain";
        }
        String date = warcRecord.header.warcDateStr;

        if (hbaseManager.addRecord(key, date, content, type)) {
          cnt++;
          // Log progress once per 10000 successful inserts.
          if (cnt % 10000 == 0) {
            LOG.info("Ingested " + cnt + " records to Hbase.");
          }
        } else {
          skipped++;
        }
      }
    } finally {
      try {
        if (warcReader != null) {
          warcReader.close();
        }
      } finally {
        in.close();
      }
    }
  }