Exemple #1
0
  /**
   * Reads the gzip-compressed WARC archive at {@code archiveFile} and registers every
   * {@code response} record that carries a syntactically valid {@code WARC-Target-URI}.
   *
   * <p>Records of any other type, records without a target URI, and records whose target URI
   * cannot be parsed are skipped. Each accepted record is wrapped in a {@code Record} and passed
   * to {@code insertRecordByUri} together with its parsed URI.
   *
   * <p>The reader and the underlying streams are closed even when reading fails part-way.
   *
   * @param archiveFile path of the gzip-compressed WARC file to read
   * @throws IOException if the file cannot be opened or decompressed, or a record cannot be read
   * @throws ParseException propagated from the calls made while converting and storing a record
   */
  private void readWarc(String archiveFile) throws IOException, ParseException {
    InputStream fileIn = new FileInputStream(archiveFile);
    InputStream in = null;
    WarcReader warcReader = null;
    try {
      in = new GZIPInputStream(fileIn);
      warcReader = WarcReaderFactory.getReaderUncompressed(in);

      WarcRecord record;
      while ((record = warcReader.getNextRecord()) != null) {
        // Only "WARC-Type: response" records are of interest. The constant-first comparison
        // also tolerates a header line whose value is null.
        HeaderLine type = record.getHeader("WARC-Type");
        if (type == null || !"response".equals(type.value)) {
          continue;
        }

        // The record must name the URI it was fetched from.
        HeaderLine uri = record.getHeader("WARC-Target-URI");
        if (uri == null || uri.value == null) {
          continue;
        }

        URI requestUri;
        try {
          requestUri = new URI(uri.value);
        } catch (URISyntaxException e) {
          // A malformed target URI invalidates only this record; report it and keep scanning.
          e.printStackTrace();
          continue;
        }

        // Create a comparable Record object from the WarcRecord and store it under its URI.
        Record responseRecord = new Record(record);
        insertRecordByUri(requestUri, responseRecord);
      }
    } finally {
      try {
        if (warcReader != null) {
          warcReader.close();
        }
      } finally {
        // Close the outermost stream that was actually created; closing the gzip stream also
        // closes the file stream, and closing either one a second time is harmless.
        InputStream outermost = (in != null) ? in : fileIn;
        outermost.close();
      }
    }
  }
Exemple #2
0
 /**
  * Validates a WARC file with JWAT by asserting that it contains exactly six records with the
  * expected WARC type, content type, content length and (for the first five) payload text.
  *
  * <p>The reader and the file stream are closed whether or not an assertion fails.
  *
  * @param tmpWarcFile the WARC file to validate
  * @throws FileNotFoundException if {@code tmpWarcFile} cannot be opened
  * @throws IOException if reading a record or its payload fails
  */
 private void validateWarcFile(File tmpWarcFile) throws FileNotFoundException, IOException {
   // Payloads are decoded as UTF-8; look the charset up once instead of once per record.
   final Charset utf8 = Charset.forName("UTF-8");

   InputStream is = new FileInputStream(tmpWarcFile);
   org.jwat.warc.WarcReader warcReader = null;
   try {
     // Validate warc records using jwat
     ByteCountingPushBackInputStream pbin =
         new ByteCountingPushBackInputStream(new BufferedInputStream(is, 8192), 16);
     warcReader = WarcReaderFactory.getReader(pbin);
     Iterator<org.jwat.warc.WarcRecord> warcIterator = warcReader.iterator();
     int recordCounter = 0;
     while (warcIterator.hasNext()) {
       recordCounter++;
       org.jwat.warc.WarcRecord warcRecord = warcIterator.next();
       InputStream payloadIs = warcRecord.getPayloadContent();
       // Records are checked by position; any record beyond the sixth is caught by the
       // final count assertion rather than by a case here.
       switch (recordCounter) {
         case 1:
           // header
           assertEquals("warcinfo", warcRecord.getHeader("WARC-Type").value);
           assertEquals("application/warc-fields", warcRecord.getHeader("Content-Type").value);
           assertEquals("133", warcRecord.getHeader("Content-Length").value);
           // payload
           String arcHeader = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
           assertTrue(
               "header start not as expected",
               arcHeader.startsWith(
                   "software: JWAT Version 1.0.0 https://sbforge.org/display/JWAT/JWAT-Tools\n"));
           assertTrue(
               "header end not as expected",
               arcHeader.endsWith("description: migrated from ARC format: WARC file version 1.0"));
           break;
         case 2:
           // header
           assertEquals("metadata", warcRecord.getHeader("WARC-Type").value);
           assertEquals("1190", warcRecord.getHeader("Content-Length").value);
           assertEquals("text/plain", warcRecord.getHeader("Content-Type").value);
           // payload
           String oldArcInfoRecord = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
           assertTrue(
               oldArcInfoRecord.startsWith(
                   "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>"));
           assertTrue(oldArcInfoRecord.endsWith("</arcmetadata>\n"));
           break;
         case 3:
           // header
           assertEquals("resource", warcRecord.getHeader("WARC-Type").value);
           assertEquals("text/dns", warcRecord.getHeader("Content-Type").value);
           assertEquals("57", warcRecord.getHeader("Content-Length").value);
           // payload
           String dns = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
           assertTrue(dns.startsWith("20130522085319"));
           assertTrue(dns.endsWith("fue-l.onb1.ac.at.\t3600\tIN\tA\t172.16.14.151\n"));
           break;
         case 4:
           // header
           assertEquals("response", warcRecord.getHeader("WARC-Type").value);
           assertEquals("text/html", warcRecord.getHeader("Content-Type").value);
           assertEquals("287", warcRecord.getHeader("Content-Length").value);
           // payload
           String robots = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
           assertTrue(robots.startsWith("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">"));
           assertTrue(robots.endsWith("</body></html>\n"));
           break;
         case 5:
           // header
           assertEquals("response", warcRecord.getHeader("WARC-Type").value);
           assertEquals("text/html", warcRecord.getHeader("Content-Type").value);
           assertEquals("164", warcRecord.getHeader("Content-Length").value);
           // payload
           String html = new String(IOUtils.inputStreamToByteArray(payloadIs), utf8);
           assertTrue(html.startsWith("<html>"));
           assertTrue(html.endsWith("</html>\n\n"));
           break;
         case 6:
           // header only; the binary PNG payload is not inspected
           assertEquals("response", warcRecord.getHeader("WARC-Type").value);
           assertEquals("image/png", warcRecord.getHeader("Content-Type").value);
           assertEquals("607", warcRecord.getHeader("Content-Length").value);
           break;
       }
     }
     assertEquals(6, recordCounter);
   } finally {
     // Release the reader and the file handle even when an assertion or read fails.
     try {
       if (warcReader != null) {
         warcReader.close();
       }
     } finally {
       is.close();
     }
   }
 }
Exemple #3
0
  /**
   * Reads a gzip-compressed WARC file and stores the payload of each {@code response} record
   * through {@code hbaseManager}.
   *
   * <p>For every record the row key is derived from the target URI via
   * {@code Util.reverseHostname}. Records without a payload are ignored. Records without a key,
   * without a readable payload stream, larger than {@code MAX_CONTENT_SIZE}, or rejected by
   * {@code hbaseManager.addRecord} increment {@code skipped}; stored records increment
   * {@code cnt}. When a record has no HTTP content type, {@code text/plain} is used.
   *
   * <p>The reader and the underlying streams are closed on every exit path.
   *
   * @param inputWarcFile the gzip-compressed WARC file to ingest
   * @throws IOException if the file cannot be opened or decompressed, or a record or payload
   *     cannot be read
   */
  private void ingestWarcFile(File inputWarcFile) throws IOException {
    InputStream fileIn = new FileInputStream(inputWarcFile);
    InputStream gzInputStream = null;
    WarcReader warcReader = null;
    try {
      gzInputStream = new GZIPInputStream(fileIn);
      ByteCountingPushBackInputStream pbin =
          new ByteCountingPushBackInputStream(new BufferedInputStream(gzInputStream, 8192), 32);
      warcReader = WarcReaderFactory.getReaderUncompressed(pbin);

      if (warcReader == null) {
        LOG.info("Can't read warc file " + inputWarcFile.getName());
        return;
      }

      warcReader.setWarcTargetUriProfile(uriProfile);
      warcReader.setBlockDigestEnabled(bBlockDigestEnabled);
      warcReader.setPayloadDigestEnabled(bPayloadDigestEnabled);
      warcReader.setRecordHeaderMaxSize(recordHeaderMaxSize);
      warcReader.setPayloadHeaderMaxSize(payloadHeaderMaxSize);

      WarcRecord warcRecord;
      while ((warcRecord = warcReader.getNextRecord()) != null) {
        String uri = warcRecord.header.warcTargetUriStr;
        String key = Util.reverseHostname(uri);

        Payload payload = warcRecord.getPayload();
        if (payload == null) {
          continue;
        }

        // Without a row key the record can never be stored, so skip it before buffering its
        // payload in memory.
        if (key == null) {
          skipped++;
          continue;
        }

        // The content type is scoped to this record so that a record without an HTTP header
        // (or without a Content-Type) cannot inherit the type of an earlier record.
        String type = null;
        InputStream payloadStream;
        HttpHeader httpHeader = warcRecord.getHttpHeader();
        if (httpHeader != null) {
          payloadStream = httpHeader.getPayloadInputStream();
          type = httpHeader.contentType;
        } else {
          payloadStream = payload.getInputStreamComplete();
        }

        if (payloadStream == null) {
          skipped++;
          continue;
        }

        String date = warcRecord.header.warcDateStr;

        // Cheap pre-check only: available() is an estimate, so the exact length is checked
        // again once the payload has been read.
        if (payloadStream.available() > MAX_CONTENT_SIZE) {
          skipped++;
          continue;
        }
        byte[] content = IOUtils.toByteArray(payloadStream);

        if (type == null) {
          type = "text/plain";
        }

        // Only "response" records are stored. The null check guards against a record that
        // lacks a WARC-Type header, and equalsIgnoreCase avoids locale-dependent lower-casing.
        boolean isResponse =
            warcRecord.getHeader("WARC-Type") != null
                && "response".equalsIgnoreCase(warcRecord.getHeader("WARC-Type").value);
        if (isResponse) {
          if (content.length > MAX_CONTENT_SIZE) {
            skipped++;
            continue;
          }
          if (hbaseManager.addRecord(key, date, content, type)) {
            cnt++;
            // Report progress once each time the counter reaches a multiple of 10000.
            if (cnt % 10000 == 0) {
              LOG.info("Ingested " + cnt + " records to Hbase.");
            }
          } else {
            skipped++;
          }
        }
      }
    } finally {
      try {
        if (warcReader != null) {
          warcReader.close();
        }
      } finally {
        // Close the outermost stream that was actually created; closing the gzip stream also
        // closes the file stream, and closing either one a second time is harmless.
        InputStream outermost = (gzInputStream != null) ? gzInputStream : fileIn;
        outermost.close();
      }
    }
  }