public static MD5Hash createKeyHash(File file, Metadata metadata) throws IOException {
    String extension = Util.getExtension(file.getName());
    if ("eml".equalsIgnoreCase(extension)) {
        assert (metadata != null);
        String hashNames = EmailProperties.getInstance().getProperty(EmailProperties.EMAIL_HASH_NAMES);
        String[] hashNamesArr = hashNames.split(",");
        StringBuilder data = new StringBuilder();
        for (String hashName : hashNamesArr) {
            String value = metadata.get(hashName);
            if (value != null) {
                data.append(value);
                data.append(" ");
            }
        }
        return MD5Hash.digest(data.toString());
    } else {
        // use MD5 of the input file as the Hadoop key
        MD5Hash key;
        try (FileInputStream fileInputStream = new FileInputStream(file)) {
            key = MD5Hash.digest(fileInputStream);
        }
        return key;
    }
}
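/*
 * Hypothetical usage sketch (not part of FreeEed itself), showing how the key above
 * could be computed for a single email file. The path and the "Message-Id" field are
 * made-up examples; the fields actually hashed are whatever EMAIL_HASH_NAMES lists
 * in the email properties.
 *
 *     Metadata metadata = new Metadata();
 *     metadata.set("Message-Id", "<1234@example.com>");
 *     MD5Hash key = createKeyHash(new File("/tmp/sample.eml"), metadata);
 *     System.out.println("Hadoop key: " + key);
 */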
private void installFreeEed() throws Exception {
    String url = Settings.getSettings().getDownloadLink();
    logger.info("Installing FreeEed software from " + url);
    String cmd = "rm FreeEed.zip; "
            + "wget " + url + " -O FreeEed.zip --no-check-certificate; "
            + "rm -fr FreeEed; "
            + "unzip -P 4ushH7XZT1 FreeEed.zip";
    SSHAgent sshAgent = new SSHAgent();
    sshAgent.setUser(ParameterProcessing.CLUSTER_USER_NAME);
    sshAgent.setKey(ParameterProcessing.PEM_CERTIFICATE_NAME);
    sshAgent.setHost(cluster.getJobTracker().getDnsName());
    sshAgent.executeCommand(cmd);
    logger.info("Successfully installed FreeEed");
    // copy the settings to the jobtracker
    Server server = cluster.getJobTracker();
    sshAgent.setHost(server.getDnsName());
    Settings cloneForS3 = Settings.getSettings().cloneForS3();
    String settingsFileToUse = "settings.properties.s3";
    Util.writeTextFile(settingsFileToUse, cloneForS3.toString());
    logger.info("Copying settings file: {}", settingsFileToUse);
    // TODO change passing the settings to the cloud
    // sshAgent.scpTo(settingsFileToUse, "FreeEed/" + ParameterProcessing.DEFAULT_SETTINGS);
}
private void verifyOperation() throws Exception {
    hadoopReady = false;
    String cmd;
    String[] output;
    SSHAgent sshAgent = new SSHAgent();
    sshAgent.setUser(ParameterProcessing.CLUSTER_USER_NAME);
    sshAgent.setKey(ParameterProcessing.PEM_CERTIFICATE_NAME);
    sshAgent.setHost(cluster.getJobTracker().getDnsName());
    logger.info("Cluster testing and verification started");
    cmd = "hadoop fs -mkdir /test";
    sshAgent.executeCommand(cmd);
    cmd = "hadoop fs -copyFromLocal *.xml /test/";
    sshAgent.executeCommand(cmd);
    cmd = "hadoop jar /usr/lib/hadoop/hadoop-0.20.2-cdh*-examples.jar grep /test /test-output 'dfs[a-z.]+'";
    output = sshAgent.executeCommand(cmd);
    logger.info(Util.arrayToString(output));
    cmd = "hadoop fs -ls /test-output";
    output = sshAgent.executeCommand(cmd);
    logger.info(Util.arrayToString(output));
    logger.info("Cluster testing and verification is complete");
    boolean success = false;
    for (String line : output) {
        if (line.contains("_SUCCESS")) {
            success = true;
            cluster.setReadyToUse(true);
            break;
        }
    }
    hadoopReady = success;
}
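/*
 * For reference: the check above relies on the _SUCCESS marker file that Hadoop writes
 * into a job's output directory on successful completion. A typical
 * "hadoop fs -ls /test-output" listing would contain lines like the following
 * (owner, sizes, and timestamps are illustrative):
 *
 *     -rw-r--r--   3 hadoop supergroup          0 2012-01-01 00:00 /test-output/_SUCCESS
 *     -rw-r--r--   3 hadoop supergroup         62 2012-01-01 00:00 /test-output/part-00000
 */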
private void mergeLuceneIndex() throws IOException {
    String luceneDir = Settings.getSettings().getLuceneIndexDir();
    String hdfsLuceneDir = "/" + luceneDir + File.separator
            + Project.getProject().getProjectCode() + File.separator;
    String localLuceneTempDir = luceneDir + File.separator + "tmp" + File.separator;
    File localLuceneTempDirFile = new File(localLuceneTempDir);
    if (localLuceneTempDirFile.exists()) {
        Util.deleteDirectory(localLuceneTempDirFile);
    }
    localLuceneTempDirFile.mkdir();
    // copy all zipped Lucene indexes created by the map tasks to the local disk
    String cmd = "hadoop fs -copyToLocal " + hdfsLuceneDir + "* " + localLuceneTempDir;
    OsUtil.runCommand(cmd);
    // remove the map indexes, now that they have been copied locally
    String removeOldZips = "hadoop fs -rm " + hdfsLuceneDir + "*";
    OsUtil.runCommand(removeOldZips);
    logger.trace("Lucene index files collected to: {}", localLuceneTempDirFile.getAbsolutePath());
    // unzip each per-map index and merge it into the project index
    String[] zipFilesArr = localLuceneTempDirFile.list();
    for (String indexZipFileStr : zipFilesArr) {
        String indexZipFileName = localLuceneTempDir + indexZipFileStr;
        String unzipToDir = localLuceneTempDir + indexZipFileStr.replace(".zip", "");
        ZipUtil.unzipFile(indexZipFileName, unzipToDir);
        File indexDir = new File(unzipToDir);
        FSDirectory fsDir = FSDirectory.open(indexDir);
        luceneIndex.addToIndex(fsDir);
    }
    // TODO check if we need to push the index to S3 or somewhere else
    luceneIndex.destroy();
}
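/*
 * A minimal sketch of the merge step that luceneIndex.addToIndex() presumably performs,
 * written against the Lucene 3.x API that FSDirectory.open(File) suggests. The analyzer,
 * version constant, and target directory are assumptions made for illustration, not
 * FreeEed's actual implementation:
 *
 *     IndexWriter writer = new IndexWriter(
 *             FSDirectory.open(new File(mergedIndexDir)),
 *             new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)));
 *     writer.addIndexes(FSDirectory.open(indexDir));  // merge one map-task index into the target
 *     writer.close();
 */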
private void setupAndStartCluster() throws Exception {
    // form config files
    String masters = cluster.getMaster().getPrivateDnsName() + "\n";
    Files.write(masters.getBytes(), new File(mastersFile));
    List<String> slavesList = new ArrayList<String>();
    for (int i = 0; i < cluster.size(); ++i) {
        Server server = cluster.get(i);
        if (server.isTaskTracker()) {
            slavesList.add(server.getPrivateDnsName());
        }
    }
    String[] slaves = slavesList.toArray(new String[0]);
    Files.write(Util.arrayToString(slaves).getBytes(), new File(slavesFile));
    String coreSite = Util.readTextFile("config/" + coreSiteFile);
    coreSite = coreSite.replaceFirst("localhost", cluster.getMaster().getPrivateDnsName());
    Files.write(coreSite.getBytes(), new File(coreSiteFile));
    String mapredSite = Util.readTextFile("config/" + mapredSiteFile);
    mapredSite = mapredSite.replaceFirst("localhost", cluster.getJobTracker().getPrivateDnsName());
    Files.write(mapredSite.getBytes(), new File(mapredSiteFile));
    String cmd;
    String[] output;
    // push config files to the cluster
    logger.info("Configuring the Hadoop cluster");
    ClusterCommand clusterCommand = new ClusterCommand(cluster);
    clusterCommand.runScpWaitForAll(mastersFile, mastersFile);
    clusterCommand.runScpWaitForAll(slavesFile, slavesFile);
    clusterCommand.runScpWaitForAll("config/" + hdfsSiteFile, hdfsSiteFile);
    clusterCommand.runScpWaitForAll(coreSiteFile, coreSiteFile);
    clusterCommand.runScpWaitForAll(mapredSiteFile, mapredSiteFile);
    // copy from the home directory on the remote hosts to the config area
    clusterCommand.runCommandWaitForAll("sudo cp " + mastersFile + " /etc/hadoop/conf/");
    clusterCommand.runCommandWaitForAll("sudo cp " + slavesFile + " /etc/hadoop/conf/");
    clusterCommand.runCommandWaitForAll("sudo cp " + hdfsSiteFile + " /etc/hadoop/conf/");
    clusterCommand.runCommandWaitForAll("sudo cp " + coreSiteFile + " /etc/hadoop/conf/");
    clusterCommand.runCommandWaitForAll("sudo cp " + mapredSiteFile + " /etc/hadoop/conf/");
    // create /mnt/tmp for everyone to use
    clusterCommand.runCommandWaitForAll("sudo rm -fr /mnt/tmp");
    clusterCommand.runCommandWaitForAll("sudo mkdir /mnt/tmp");
    clusterCommand.runCommandWaitForAll("sudo chmod 777 /mnt/tmp");
    // create /mnt/tmp/hadoop for the Hadoop tmp dir
    clusterCommand.runCommandWaitForAll("sudo mkdir /mnt/tmp/hadoop");
    clusterCommand.runCommandWaitForAll("sudo chmod 777 /mnt/tmp/hadoop");
    logger.info("Hadoop cluster configured, starting the services");
    // shut down all services and clean up dfs on the slaves
    hadoopReady = false;
    cmd = "for service in /etc/init.d/hadoop-0.20-*; do sudo $service stop; done";
    clusterCommand.runCommandWaitForAll(cmd);
    cmd = "sudo rm -fr /var/lib/hadoop-0.20/cache/*";
    clusterCommand.runCommandWaitForAll(cmd);
    // format the namenode and start it
    SSHAgent sshAgent = new SSHAgent();
    sshAgent.setUser(ParameterProcessing.CLUSTER_USER_NAME);
    sshAgent.setKey(ParameterProcessing.PEM_CERTIFICATE_NAME);
    sshAgent.setHost(cluster.getMaster().getDnsName());
    cmd = "sudo -u hdfs hadoop namenode -format";
    sshAgent.executeCommand(cmd);
    cmd = "sudo service hadoop-0.20-namenode start";
    output = sshAgent.executeCommand(cmd);
    logger.info(Util.arrayToString(output));
    // start all hdfs slaves
    clusterCommand = new ClusterCommand(cluster.getDataNodes());
    cmd = "sudo service hadoop-0.20-datanode start";
    clusterCommand.runCommandWaitForAll(cmd);
    // start all tasktrackers
    clusterCommand = new ClusterCommand(cluster.getTaskTrackers());
    cmd = "sudo service hadoop-0.20-tasktracker start";
    clusterCommand.runCommandWaitForAll(cmd);
    // start the jobtracker
    sshAgent.setHost(cluster.getJobTracker().getDnsName());
    cmd = "sudo service hadoop-0.20-jobtracker start";
    output = sshAgent.executeCommand(cmd);
    logger.info(Util.arrayToString(output));
    logger.info("Cluster configuration and startup is complete");
    // remove the jets3t library that ships with Hadoop
    cmd = "sudo rm /usr/lib/hadoop/lib/jets3t*.jar";
    clusterCommand = new ClusterCommand(cluster);
    clusterCommand.runCommandWaitForAll(cmd);
    // install a fresh version of FreeEed
    installFreeEed();
    // run a distributed grep app to verify the cluster
    verifyOperation();
    if (callingUI != null) {
        callingUI.refreshStatus();
    }
}