public static MD5Hash createKeyHash(File file, Metadata metadata) throws IOException { String extension = Util.getExtension(file.getName()); if ("eml".equalsIgnoreCase(extension)) { assert (metadata != null); String hashNames = EmailProperties.getInstance().getProperty(EmailProperties.EMAIL_HASH_NAMES); String[] hashNamesArr = hashNames.split(","); StringBuilder data = new StringBuilder(); for (String hashName : hashNamesArr) { String value = metadata.get(hashName); if (value != null) { data.append(value); data.append(" "); } } return MD5Hash.digest(data.toString()); } else { MD5Hash key; try ( // use MD5 of the input file as Hadoop key FileInputStream fileInputStream = new FileInputStream(file)) { key = MD5Hash.digest(fileInputStream); } return key; } }
/** Convert a Json map to a MD5MD5CRC32FileChecksum. */ public static MD5MD5CRC32FileChecksum toMD5MD5CRC32FileChecksum(final Map<?, ?> json) throws IOException { if (json == null) { return null; } final Map<?, ?> m = (Map<?, ?>) json.get(MD5MD5CRC32FileChecksum.class.getSimpleName()); final String algorithm = (String) m.get("algorithm"); final int length = (int) (long) (Long) m.get("length"); final byte[] bytes = StringUtils.hexStringToByte((String) m.get("bytes")); final DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes)); final int bytesPerCRC = in.readInt(); final long crcPerBlock = in.readLong(); final MD5Hash md5 = MD5Hash.read(in); final MD5MD5CRC32FileChecksum checksum = new MD5MD5CRC32FileChecksum(bytesPerCRC, crcPerBlock, md5); // check algorithm name final String alg = "MD5-of-" + crcPerBlock + "MD5-of-" + bytesPerCRC + "CRC32"; if (!alg.equals(algorithm)) { throw new IOException( "Algorithm not matched: algorithm=" + algorithm + ", crcPerBlock=" + crcPerBlock + ", bytesPerCRC=" + bytesPerCRC); } // check length if (length != checksum.getLength()) { throw new IOException( "Length not matched: length=" + length + ", checksum.getLength()=" + checksum.getLength()); } return checksum; }
/**
 * Generates one synthetic key/value tuple and feeds it to the test model.
 *
 * <p>For {@code CRAWL_STATUS} and {@code CRAWL_STATUS_WITH_REDIRECT} the value is a crawl-status
 * JSON document and the key comes from {@code CrawlDBKey.generateCrawlStatusKey}. For any other
 * record type the value is a link JSON document and the key comes from
 * {@code CrawlDBKey.generateLinkKey}, using the MD5 of the link source taken from
 * {@code extraProperties}.
 *
 * @param model model that is updated with the generated tuple
 * @param url the URL the record is about
 * @param timestamp attempt time written into crawl-status records and their key
 * @param success whether a crawl-status record describes a successful fetch
 * @param recordType which kind of record to generate
 * @param extraProperties extra inputs: the redirect source for redirect records, the link source
 *     for link records
 * @return the generated key/value tuple
 * @throws Exception propagated from key generation or the model update
 */
static Pair<TextBytes, TextBytes> generateTestRecord(
    TestModel model,
    String url,
    long timestamp,
    boolean success,
    TestRecordType recordType,
    Map<String, String> extraProperties)
    throws Exception {
  final boolean redirected = recordType == TestRecordType.CRAWL_STATUS_WITH_REDIRECT;
  final TextBytes key;
  final TextBytes value;

  if (!redirected && recordType != TestRecordType.CRAWL_STATUS) {
    final JsonObject link = newLinkJson(url, extraProperties);
    key =
        new TextBytes(
            CrawlDBKey.generateLinkKey(
                new TextBytes(url),
                CrawlDBKey.Type.KEY_TYPE_HTML_LINK,
                MD5Hash.digest(extraProperties.get(EXTRA_PROPERTY_LINK_SOURCE)).toString()));
    value = new TextBytes(link.toString());
  } else {
    final JsonObject status =
        newCrawlStatusJson(url, timestamp, success, redirected, extraProperties);
    key = new TextBytes(CrawlDBKey.generateCrawlStatusKey(new Text(url), timestamp));
    value = new TextBytes(status.toString());
  }

  final Pair<TextBytes, TextBytes> tuple = new Pair<>(key, value);
  model.updateModelFromInputTuple(tuple);
  return tuple;
}

/** Builds the crawl-status JSON document for a successful or failed fetch of {@code url}. */
private static JsonObject newCrawlStatusJson(
    String url,
    long timestamp,
    boolean success,
    boolean redirected,
    Map<String, String> extraProperties) {
  final JsonObject status = new JsonObject();
  status.addProperty("source_url", url);
  status.addProperty("disposition", success ? "SUCCESS" : "FAILURE");
  status.addProperty("attempt_time", timestamp);

  if (!success) {
    status.addProperty("failure_reason", CANNED_FAILURE_REASON);
    status.addProperty("failure_detail", CANNED_FAILURE_DETAIL);
    return status;
  }

  status.addProperty("http_result", 200);
  status.addProperty("server_ip", CANNED_IP);
  status.addProperty("content_len", CANNED_CONTENT_LEN);
  status.addProperty("mime_type", CANNED_MIME_TYPE);
  status.addProperty("md5", CANNED_MD5_VALUE);
  status.addProperty("text_simhash", CANNED_SIMHASH_VALUE);

  final JsonObject httpHeaders = new JsonObject();
  httpHeaders.addProperty("date", timestamp);
  status.add("http_headers", httpHeaders);
  status.addProperty("parsed_as", "html");

  final JsonObject parsedContent = new JsonObject();
  parsedContent.addProperty("title", CANNED_TITLE);
  final JsonObject cannedMetaTag = new JsonObject();
  cannedMetaTag.addProperty(CANNED_META_PROPERTY_NAME, CANNED_META_PROPERTY_VALUE);
  final JsonArray metaTagList = new JsonArray();
  metaTagList.add(cannedMetaTag);
  parsedContent.add("meta_tags", metaTagList);
  status.add("content", parsedContent);

  // Two outgoing links, one per canned linking host, in that order.
  final JsonObject firstLink = new JsonObject();
  firstLink.addProperty("href", sourceURLToLinkingHostURL(url, CANNED_LINKING_HOST_1));
  final JsonObject secondLink = new JsonObject();
  secondLink.addProperty("href", sourceURLToLinkingHostURL(url, CANNED_LINKING_HOST_2));
  final JsonArray outLinks = new JsonArray();
  outLinks.add(firstLink);
  outLinks.add(secondLink);
  status.add("links", outLinks);

  if (redirected) {
    final JsonObject redirectFrom = new JsonObject();
    redirectFrom.addProperty("source_url", extraProperties.get(EXTRA_PROPERTY_REDIRECT_SOURCE));
    redirectFrom.addProperty("http_result", 301);
    redirectFrom.addProperty("server_ip", CANNED_IP);
    status.add("redirect_from", redirectFrom);
  }
  return status;
}

/** Builds the link JSON document describing an HTML anchor pointing at {@code url}. */
private static JsonObject newLinkJson(String url, Map<String, String> extraProperties) {
  final JsonObject link = new JsonObject();
  link.addProperty("href", url);
  link.addProperty("source_url", extraProperties.get(EXTRA_PROPERTY_LINK_SOURCE));
  link.addProperty("http_result", 200);
  link.addProperty("server_ip", CANNED_IP);
  link.addProperty("source_type", "html");
  link.addProperty("type", "a");
  link.addProperty("rel", "text/html");
  return link;
}
/**
 * Orders two JSON objects by the string form of the MD5 digest of their {@code toString()}
 * output.
 *
 * @param o1 left operand
 * @param o2 right operand
 * @return the result of comparing the two digest strings with {@link String#compareTo}
 */
@Override
public int compare(JsonObject o1, JsonObject o2) {
  final String leftDigest = MD5Hash.digest(o1.toString()).toString();
  final String rightDigest = MD5Hash.digest(o2.toString()).toString();
  return leftDigest.compareTo(rightDigest);
}
/**
 * Builds a URL on {@code linkingHostName} whose path is the MD5 digest string of the source URL
 * combined with the current {@link System#nanoTime()} value, so repeated calls yield different
 * URLs for the same source.
 *
 * @param sourceURL the URL being linked to
 * @param linkingHostName host name to place in the generated URL
 * @return {@code "http://" + linkingHostName + "/" + digest}
 */
static String sourceURLToLinkingHostURL(String sourceURL, String linkingHostName) {
  final String seed = sourceURL + System.nanoTime();
  final String path = MD5Hash.digest(seed).toString();
  return new StringBuilder("http://")
      .append(linkingHostName)
      .append("/")
      .append(path)
      .toString();
}
@Override public long download(String source, String destination) { // hdfs_source = "/"+hdfs_source; long readBytes = 0; try { // String[] pluginPath = hdfs_source.split("/"); // String plugin = pluginPath[2]; File destFile = new File(destination); if (destFile.exists()) { destFile.delete(); } else { File parent = destFile.getParentFile(); parent.mkdirs(); destFile.createNewFile(); } File cacheFile = new File(predeployedPath + "/" + source); if (cacheFile.exists()) { BufferedInputStream reader = new BufferedInputStream(new FileInputStream(cacheFile)); BufferedOutputStream writer = new BufferedOutputStream(new FileOutputStream(destFile, false)); try { byte[] buff = new byte[4096]; int numChars; while ((numChars = reader.read(buff, 0, buff.length)) != -1) { writer.write(buff, 0, numChars); } } catch (IOException ex) { throw new IOException( "IOException when transferring " + cacheFile.getPath() + " to " + destFile.getPath()); } finally { try { if (reader != null) { reader.close(); } if (writer != null) writer.close(); } catch (IOException ex) { System.out.println( "Error closing files when transferring " + cacheFile.getPath() + " to " + destFile.getPath()); } } return cacheFile.length(); } } catch (Exception e) { System.err.println("Exception when trying to copy from local cache the file"); e.printStackTrace(); } try { FileOutputStream file = new FileOutputStream(destination); String[] pieces = parts(source); log.info("File splited into " + pieces.length + "pieces "); System.out.println("File is splited into " + pieces.length + " pieces "); long StartTime = System.currentTimeMillis(); float currentSpeed; long totalUploadTime = 0; int count = 1; for (String piece : pieces) { byte[] pieceBytes = read(piece); file.write(pieceBytes); readBytes += pieceBytes.length; long endTime = System.currentTimeMillis(); long timeDiff = endTime - StartTime + 1; StartTime = endTime; currentSpeed = (pieceBytes.length / 1000f) / ((timeDiff + 1f) / 1000f); totalUploadTime += timeDiff; 
log.info( "Succesfully download piece #" + count + "/" + pieces.length + " " + readBytes + " bytes speed: " + currentSpeed + " kb/s"); System.out.println( "Succesfully download piece #" + count + "/" + pieces.length + " " + readBytes + " bytes speed: " + currentSpeed + " kb/s"); count++; } file.close(); System.out.println( "Download completed Total downloaded " + readBytes + " bytes into file " + destination + " time " + totalUploadTime / 1000f + " s"); FileInputStream fileInputStream = new FileInputStream(destination); MD5Hash key = MD5Hash.digest(fileInputStream); fileInputStream.close(); System.out.println("MD5 key : " + key); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return readBytes; }