Example #1
0
  /**
   * Computes the MD5 key for {@code file}.
   *
   * <p>When the extension reported by {@code Util.getExtension} is {@code eml} (compared
   * case-insensitively), the key is the digest of the metadata values named by the
   * comma-separated {@code EmailProperties.EMAIL_HASH_NAMES} property, each present value followed
   * by a single space. For every other file the key is the digest of the file's contents.
   *
   * @param file the file to derive a key for
   * @param metadata metadata consulted for {@code eml} files; asserted non-null on that path
   * @return the MD5 key
   * @throws IOException if the file cannot be opened or read
   */
  public static MD5Hash createKeyHash(File file, Metadata metadata) throws IOException {
    boolean isEmail = "eml".equalsIgnoreCase(Util.getExtension(file.getName()));

    if (!isEmail) {
      // use MD5 of the input file as Hadoop key
      try (FileInputStream contents = new FileInputStream(file)) {
        return MD5Hash.digest(contents);
      }
    }

    assert metadata != null;
    String configuredNames =
        EmailProperties.getInstance().getProperty(EmailProperties.EMAIL_HASH_NAMES);

    StringBuilder hashInput = new StringBuilder();
    for (String fieldName : configuredNames.split(",")) {
      String fieldValue = metadata.get(fieldName);
      if (fieldValue == null) {
        // Fields absent from the metadata contribute nothing to the key.
        continue;
      }
      hashInput.append(fieldValue).append(" ");
    }
    return MD5Hash.digest(hashInput.toString());
  }
Example #2
0
  /**
   * Convert a Json map to a MD5MD5CRC32FileChecksum.
   *
   * <p>The map is expected to hold, under the simple class name of {@code
   * MD5MD5CRC32FileChecksum}, a nested map with the entries {@code algorithm}, {@code length} and
   * {@code bytes} (a hex string). The decoded bytes are read as an int (bytesPerCRC), a long
   * (crcPerBlock) and an MD5 hash, and the result is validated against the declared algorithm
   * name and length.
   *
   * @param json the Json map, may be null
   * @return the decoded checksum, or null when {@code json} is null
   * @throws IOException if the checksum entry is missing, the length is not numeric, the bytes
   *     cannot be decoded, or the algorithm name or length does not match the decoded checksum
   */
  public static MD5MD5CRC32FileChecksum toMD5MD5CRC32FileChecksum(final Map<?, ?> json)
      throws IOException {
    if (json == null) {
      return null;
    }

    final String entryName = MD5MD5CRC32FileChecksum.class.getSimpleName();
    final Map<?, ?> m = (Map<?, ?>) json.get(entryName);
    if (m == null) {
      throw new IOException("Missing \"" + entryName + "\" entry in json: " + json);
    }
    final String algorithm = (String) m.get("algorithm");
    // JSON parsers may hand back Integer or Long depending on magnitude, so do not cast to Long.
    final Object lengthValue = m.get("length");
    if (!(lengthValue instanceof Number)) {
      throw new IOException("Length is not a number: length=" + lengthValue);
    }
    final int length = ((Number) lengthValue).intValue();
    final byte[] bytes = StringUtils.hexStringToByte((String) m.get("bytes"));

    final DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes));
    final int bytesPerCRC = in.readInt();
    final long crcPerBlock = in.readLong();
    final MD5Hash md5 = MD5Hash.read(in);
    final MD5MD5CRC32FileChecksum checksum =
        new MD5MD5CRC32FileChecksum(bytesPerCRC, crcPerBlock, md5);

    // check algorithm name
    final String alg = "MD5-of-" + crcPerBlock + "MD5-of-" + bytesPerCRC + "CRC32";
    if (!alg.equals(algorithm)) {
      throw new IOException(
          "Algorithm not matched: algorithm="
              + algorithm
              + ", crcPerBlock="
              + crcPerBlock
              + ", bytesPerCRC="
              + bytesPerCRC);
    }
    // check length
    if (length != checksum.getLength()) {
      throw new IOException(
          "Length not matched: length="
              + length
              + ", checksum.getLength()="
              + checksum.getLength());
    }

    return checksum;
  }
  /**
   * Builds one synthetic key/value record for {@code url} and feeds it to {@code model}.
   *
   * <p>For {@code CRAWL_STATUS} and {@code CRAWL_STATUS_WITH_REDIRECT} the value is a crawl-status
   * JSON document: on success it carries the canned fetch details, headers, content, two
   * generated links and, for the redirect type, a {@code redirect_from} object; on failure it
   * carries the canned failure reason and detail. For every other record type the value is a link
   * JSON document whose source comes from {@code EXTRA_PROPERTY_LINK_SOURCE}.
   *
   * @param model test model updated with the generated tuple before it is returned
   * @param url the URL the record describes
   * @param timestamp value stored as the attempt time and the {@code date} header
   * @param success whether a crawl-status record reports success
   * @param recordType which kind of record to build
   * @param extraProperties supplies the redirect source or the link source, depending on type
   * @return the generated key/value pair
   * @throws Exception if building the key or updating the model fails
   */
  static Pair<TextBytes, TextBytes> generateTestRecord(
      TestModel model,
      String url,
      long timestamp,
      boolean success,
      TestRecordType recordType,
      Map<String, String> extraProperties)
      throws Exception {
    final boolean isCrawlStatus =
        recordType == TestRecordType.CRAWL_STATUS
            || recordType == TestRecordType.CRAWL_STATUS_WITH_REDIRECT;

    final TextBytes recordKey;
    final TextBytes recordValue;

    if (!isCrawlStatus) {
      // Link record: describes an inbound html link pointing at url.
      JsonObject linkJson = new JsonObject();
      linkJson.addProperty("href", url);
      String linkSource = extraProperties.get(EXTRA_PROPERTY_LINK_SOURCE);
      linkJson.addProperty("source_url", linkSource);
      linkJson.addProperty("http_result", 200);
      linkJson.addProperty("server_ip", CANNED_IP);
      linkJson.addProperty("source_type", "html");
      linkJson.addProperty("type", "a");
      linkJson.addProperty("rel", "text/html");

      recordKey =
          new TextBytes(
              CrawlDBKey.generateLinkKey(
                  new TextBytes(url),
                  CrawlDBKey.Type.KEY_TYPE_HTML_LINK,
                  MD5Hash.digest(linkSource).toString()));
      recordValue = new TextBytes(linkJson.toString());
    } else {
      JsonObject statusJson = new JsonObject();
      statusJson.addProperty("source_url", url);
      if (success) {
        statusJson.addProperty("disposition", "SUCCESS");
      } else {
        statusJson.addProperty("disposition", "FAILURE");
      }
      statusJson.addProperty("attempt_time", timestamp);

      if (!success) {
        statusJson.addProperty("failure_reason", CANNED_FAILURE_REASON);
        statusJson.addProperty("failure_detail", CANNED_FAILURE_DETAIL);
      } else {
        // Canned fetch result details.
        statusJson.addProperty("http_result", 200);
        statusJson.addProperty("server_ip", CANNED_IP);
        statusJson.addProperty("content_len", CANNED_CONTENT_LEN);
        statusJson.addProperty("mime_type", CANNED_MIME_TYPE);
        statusJson.addProperty("md5", CANNED_MD5_VALUE);
        statusJson.addProperty("text_simhash", CANNED_SIMHASH_VALUE);

        JsonObject headerJson = new JsonObject();
        headerJson.addProperty("date", timestamp);
        statusJson.add("http_headers", headerJson);
        statusJson.addProperty("parsed_as", "html");

        // Parsed content: a title plus a single meta tag.
        JsonObject contentJson = new JsonObject();
        contentJson.addProperty("title", CANNED_TITLE);
        JsonObject onlyMetaTag = new JsonObject();
        onlyMetaTag.addProperty(CANNED_META_PROPERTY_NAME, CANNED_META_PROPERTY_VALUE);
        JsonArray metaTagArray = new JsonArray();
        metaTagArray.add(onlyMetaTag);
        contentJson.add("meta_tags", metaTagArray);
        statusJson.add("content", contentJson);

        // Two outbound links, one per canned linking host.
        JsonObject firstLink = new JsonObject();
        firstLink.addProperty("href", sourceURLToLinkingHostURL(url, CANNED_LINKING_HOST_1));
        JsonObject secondLink = new JsonObject();
        secondLink.addProperty("href", sourceURLToLinkingHostURL(url, CANNED_LINKING_HOST_2));
        JsonArray linkArray = new JsonArray();
        linkArray.add(firstLink);
        linkArray.add(secondLink);
        statusJson.add("links", linkArray);

        if (recordType == TestRecordType.CRAWL_STATUS_WITH_REDIRECT) {
          JsonObject redirectJson = new JsonObject();
          redirectJson.addProperty(
              "source_url", extraProperties.get(EXTRA_PROPERTY_REDIRECT_SOURCE));
          redirectJson.addProperty("http_result", 301);
          redirectJson.addProperty("server_ip", CANNED_IP);
          statusJson.add("redirect_from", redirectJson);
        }
      }

      recordKey = new TextBytes(CrawlDBKey.generateCrawlStatusKey(new Text(url), timestamp));
      recordValue = new TextBytes(statusJson.toString());
    }

    Pair<TextBytes, TextBytes> result = new Pair<TextBytes, TextBytes>(recordKey, recordValue);
    model.updateModelFromInputTuple(result);
    return result;
  }
 /**
  * Orders two JSON objects by comparing the string form of the MD5 digest of each object's
  * {@code toString()} output.
  *
  * @param o1 the first object
  * @param o2 the second object
  * @return the result of comparing the first digest string to the second
  */
 @Override
 public int compare(JsonObject o1, JsonObject o2) {
   return MD5Hash.digest(o1.toString())
       .toString()
       .compareTo(MD5Hash.digest(o2.toString()).toString());
 }
 /**
  * Builds an {@code http://} URL on {@code linkingHostName} whose path is the MD5 digest of the
  * source URL concatenated with the current {@code System.nanoTime()} value.
  *
  * @param sourceURL the URL mixed into the digest
  * @param linkingHostName host name of the generated URL
  * @return the generated URL string
  */
 static String sourceURLToLinkingHostURL(String sourceURL, String linkingHostName) {
   String digestInput = sourceURL + Long.toString(System.nanoTime());
   String pathSegment = MD5Hash.digest(digestInput).toString();
   return "http://" + linkingHostName + "/" + pathSegment;
 }
  /**
   * Writes {@code source} to the local file {@code destination}.
   *
   * <p>A local copy at {@code predeployedPath + "/" + source} is tried first. If that file exists
   * and is copied (including a successful flush and close), its length is returned. If there is
   * no such file, or anything fails while preparing the destination or copying, the failure is
   * printed and the method falls back to writing the pieces listed by {@code parts(source)}, each
   * obtained with {@code read(piece)}, and then prints the MD5 of the written file.
   *
   * <p>I/O errors during the piece download are printed, not thrown; the bytes written so far are
   * still returned.
   *
   * @param source name of the file to fetch
   * @param destination local path to write to
   * @return the cache file's length when the local copy was used, otherwise the number of bytes
   *     written from pieces
   */
  @Override
  public long download(String source, String destination) {
    long readBytes = 0;
    try {
      File destFile = new File(destination);
      if (destFile.exists()) {
        destFile.delete();
      } else {
        // A bare file name has no parent; without this guard the resulting NPE skipped the cache.
        File parent = destFile.getParentFile();
        if (parent != null) {
          parent.mkdirs();
        }
        destFile.createNewFile();
      }
      File cacheFile = new File(predeployedPath + "/" + source);
      if (cacheFile.exists()) {
        // try-with-resources closes both streams even if one close() throws.
        try (BufferedInputStream reader = new BufferedInputStream(new FileInputStream(cacheFile));
            BufferedOutputStream writer =
                new BufferedOutputStream(new FileOutputStream(destFile, false))) {
          byte[] buff = new byte[4096];
          int numBytes;
          while ((numBytes = reader.read(buff, 0, buff.length)) != -1) {
            writer.write(buff, 0, numBytes);
          }
        } catch (IOException ex) {
          // Keep the cause so the printed stack trace shows what actually failed.
          throw new IOException(
              "IOException when transferring " + cacheFile.getPath() + " to " + destFile.getPath(),
              ex);
        }
        return cacheFile.length();
      }

    } catch (Exception e) {
      // Best effort: any cache problem falls through to the piece download below.
      System.err.println("Exception when trying to copy from local cache the file");
      e.printStackTrace();
    }

    try {
      long totalUploadTime = 0;
      try (FileOutputStream file = new FileOutputStream(destination)) {
        String[] pieces = parts(source);
        log.info("File splited into " + pieces.length + "pieces ");
        System.out.println("File is splited into " + pieces.length + " pieces ");
        long startTime = System.currentTimeMillis();
        float currentSpeed;
        int count = 1;
        for (String piece : pieces) {
          byte[] pieceBytes = read(piece);
          file.write(pieceBytes);
          readBytes += pieceBytes.length;

          long endTime = System.currentTimeMillis();
          long timeDiff = endTime - startTime;
          startTime = endTime;
          // Clamp to 1 ms so a piece finished within the same millisecond cannot divide by zero.
          currentSpeed = (pieceBytes.length / 1000f) / (Math.max(timeDiff, 1L) / 1000f);
          totalUploadTime += timeDiff;

          String progress =
              "Succesfully download  piece #"
                  + count
                  + "/"
                  + pieces.length
                  + " "
                  + readBytes
                  + " bytes speed: "
                  + currentSpeed
                  + " kb/s";
          log.info(progress);
          System.out.println(progress);
          count++;
        }
      }
      System.out.println(
          "Download completed Total downloaded "
              + readBytes
              + " bytes into file "
              + destination
              + " time "
              + totalUploadTime / 1000f
              + " s");
      MD5Hash key;
      try (FileInputStream fileInputStream = new FileInputStream(destination)) {
        key = MD5Hash.digest(fileInputStream);
      }
      System.out.println("MD5 key : " + key);
    } catch (IOException e) {
      // Covers FileNotFoundException as well; the partial byte count is still returned.
      e.printStackTrace();
    }
    return readBytes;
  }