Example #1
  public static MD5Hash createKeyHash(File file, Metadata metadata) throws IOException {
    String extension = Util.getExtension(file.getName());

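    // for email files, build the key from the metadata fields configured in EMAIL_HASH_NAMES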
    if ("eml".equalsIgnoreCase(extension)) {
      assert (metadata != null);
      String hashNames =
          EmailProperties.getInstance().getProperty(EmailProperties.EMAIL_HASH_NAMES);
      String[] hashNamesArr = hashNames.split(",");

      StringBuilder data = new StringBuilder();

      for (String hashName : hashNamesArr) {
        String value = metadata.get(hashName);
        if (value != null) {
          data.append(value);
          data.append(" ");
        }
      }
      return MD5Hash.digest(data.toString());
    } else {
      // use MD5 of the input file as Hadoop key
      MD5Hash key;
      try (FileInputStream fileInputStream = new FileInputStream(file)) {
        key = MD5Hash.digest(fileInputStream);
      }
      return key;
    }
  }
Example #2
  private void installFreeEed() throws Exception {
    String url = Settings.getSettings().getDownloadLink();
    logger.info("Installing FreeEed software from " + url);
    String cmd =
        "rm FreeEed.zip; "
            + "wget "
            + url
            + " -O FreeEed.zip --no-check-certificate; "
            + "rm -fr FreeEed; "
            + "unzip -P 4ushH7XZT1 FreeEed.zip";
    SSHAgent sshAgent = new SSHAgent();
    sshAgent.setUser(ParameterProcessing.CLUSTER_USER_NAME);
    sshAgent.setKey(ParameterProcessing.PEM_CERTIFICATE_NAME);
    sshAgent.setHost(cluster.getJobTracker().getDnsName());
    sshAgent.executeCommand(cmd);
    logger.info("Successfully installed FreeEed");
    // copy the settings to jobtracker
    Server server = cluster.getJobTracker();
    sshAgent.setHost(server.getDnsName());

    Settings cloneForS3 = Settings.getSettings().cloneForS3();
    String settingsFileToUse = "settings.properties.s3";
    Util.writeTextFile(settingsFileToUse, cloneForS3.toString());

    logger.info("Copying settings file: {}", settingsFileToUse);
    // TODO change passing the settings to the cloud
    // sshAgent.scpTo(settingsFileToUse, "FreeEed/" + ParameterProcessing.DEFAULT_SETTINGS);
  }
Example #3
  private void verifyOperation() throws Exception {
    hadoopReady = false;

    String cmd;
    String[] output;

    SSHAgent sshAgent = new SSHAgent();
    sshAgent.setUser(ParameterProcessing.CLUSTER_USER_NAME);
    sshAgent.setKey(ParameterProcessing.PEM_CERTIFICATE_NAME);
    sshAgent.setHost(cluster.getJobTracker().getDnsName());
    logger.info("Cluster testing and verification started");
    cmd = "hadoop fs -mkdir /test";
    sshAgent.executeCommand(cmd);

    cmd = "hadoop fs -copyFromLocal *.xml /test/";
    sshAgent.executeCommand(cmd);

    cmd =
        "hadoop jar /usr/lib/hadoop/hadoop-0.20.2-cdh*-examples.jar grep /test /test-output 'dfs[a-z.]+'";
    output = sshAgent.executeCommand(cmd);
    logger.info(Util.arrayToString(output));

    cmd = "hadoop fs -ls /test-output";
    output = sshAgent.executeCommand(cmd);
    logger.info(Util.arrayToString(output));
    logger.info("Cluster testing and verification is complete");

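    // Hadoop writes a _SUCCESS marker into the output directory when the job completes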
    boolean success = false;
    for (String line : output) {
      if (line.contains("_SUCCESS")) {
        success = true;
        cluster.setReadyToUse(true);
        break;
      }
    }
    hadoopReady = success;
  }
Example #4
  private void mergeLuceneIndex() throws IOException {
    String luceneDir = Settings.getSettings().getLuceneIndexDir();
    String hdfsLuceneDir =
        "/" + luceneDir + File.separator + Project.getProject().getProjectCode() + File.separator;

    String localLuceneTempDir = luceneDir + File.separator + "tmp" + File.separator;
    File localLuceneTempDirFile = new File(localLuceneTempDir);

    if (localLuceneTempDirFile.exists()) {
      Util.deleteDirectory(localLuceneTempDirFile);
    }

    localLuceneTempDirFile.mkdir();

    // copy all zip lucene indexes, created by maps to local hd
    String cmd = "hadoop fs -copyToLocal " + hdfsLuceneDir + "* " + localLuceneTempDir;
    OsUtil.runCommand(cmd);

    // remove the map indexes as they are now copied to local
    String removeOldZips = "hadoop fs -rm " + hdfsLuceneDir + "*";
    OsUtil.runCommand(removeOldZips);

    logger.trace("Lucene index files collected to: {}", localLuceneTempDirFile.getAbsolutePath());

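    // unzip each per-map index and merge it into the combined Lucene index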
    String[] zipFilesArr = localLuceneTempDirFile.list();
    for (String indexZipFileStr : zipFilesArr) {
      String indexZipFileName = localLuceneTempDir + indexZipFileStr;
      String unzipToDir = localLuceneTempDir + indexZipFileStr.replace(".zip", "");

      ZipUtil.unzipFile(indexZipFileName, unzipToDir);
      File indexDir = new File(unzipToDir);

      FSDirectory fsDir = FSDirectory.open(indexDir);
      luceneIndex.addToIndex(fsDir);
    }
    // TODO check if we need to push the index to S3 or somewhere else
    luceneIndex.destroy();
  }
Example #5
  private void setupAndStartCluster() throws Exception {
    // form config files
    String masters = cluster.getMaster().getPrivateDnsName() + "\n";
    Files.write(masters.getBytes(), new File(mastersFile));

    List<String> slavesList = new ArrayList<String>();
    for (int i = 0; i < cluster.size(); ++i) {
      Server server = cluster.get(i);
      if (server.isTaskTracker()) {
        slavesList.add(server.getPrivateDnsName());
      }
    }
    String[] slaves = slavesList.toArray(new String[0]);
    Files.write(Util.arrayToString(slaves).getBytes(), new File(slavesFile));

    String coreSite = Util.readTextFile("config/" + coreSiteFile);
    coreSite = coreSite.replaceFirst("localhost", cluster.getMaster().getPrivateDnsName());
    Files.write(coreSite.getBytes(), new File(coreSiteFile));

    String mapredSite = Util.readTextFile("config/" + mapredSiteFile);
    mapredSite = mapredSite.replaceFirst("localhost", cluster.getJobTracker().getPrivateDnsName());
    Files.write(mapredSite.getBytes(), new File(mapredSiteFile));

    String cmd;
    String[] output;
    // push config files to the cluster
    logger.info("Configuring the Hadoop cluster");
    ClusterCommand clusterCommand = new ClusterCommand(cluster);
    clusterCommand.runScpWaitForAll(mastersFile, mastersFile);
    clusterCommand.runScpWaitForAll(slavesFile, slavesFile);
    clusterCommand.runScpWaitForAll("config/" + hdfsSiteFile, hdfsSiteFile);
    clusterCommand.runScpWaitForAll(coreSiteFile, coreSiteFile);
    clusterCommand.runScpWaitForAll(mapredSiteFile, mapredSiteFile);
    // copy from home on remote to the config area
    clusterCommand.runCommandWaitForAll("sudo cp " + mastersFile + " /etc/hadoop/conf/");
    clusterCommand.runCommandWaitForAll("sudo cp " + slavesFile + " /etc/hadoop/conf/");
    clusterCommand.runCommandWaitForAll("sudo cp " + hdfsSiteFile + " /etc/hadoop/conf/");
    clusterCommand.runCommandWaitForAll("sudo cp " + coreSiteFile + " /etc/hadoop/conf/");
    clusterCommand.runCommandWaitForAll("sudo cp " + mapredSiteFile + " /etc/hadoop/conf/");
    // create /mnt/tmp for everyone to use
    clusterCommand.runCommandWaitForAll("sudo rm -fr /mnt/tmp");
    clusterCommand.runCommandWaitForAll("sudo mkdir /mnt/tmp");
    clusterCommand.runCommandWaitForAll("sudo chmod 777 /mnt/tmp");
    // create /mnt/tmp for hadoop tmp dir
    clusterCommand.runCommandWaitForAll("sudo mkdir /mnt/tmp/hadoop");
    clusterCommand.runCommandWaitForAll("sudo chmod 777 /mnt/tmp/hadoop");

    logger.info("Hadoop cluster configured, starting the services");
    // shut down all services
    // clean up dfs on slaves
    hadoopReady = false;
    cmd = "for service in /etc/init.d/hadoop-0.20-*; do sudo $service stop; done";
    clusterCommand.runCommandWaitForAll(cmd);
    cmd = "sudo rm -fr /var/lib/hadoop-0.20/cache/*";
    clusterCommand.runCommandWaitForAll(cmd);

    SSHAgent sshAgent = new SSHAgent();
    sshAgent.setUser(ParameterProcessing.CLUSTER_USER_NAME);
    sshAgent.setKey(ParameterProcessing.PEM_CERTIFICATE_NAME);
    sshAgent.setHost(cluster.getMaster().getDnsName());

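    // format HDFS and start the namenode on the master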
    cmd = "sudo -u hdfs hadoop namenode -format";
    sshAgent.executeCommand(cmd);

    cmd = "sudo service hadoop-0.20-namenode start";
    output = sshAgent.executeCommand(cmd);
    logger.info(Util.arrayToString(output));

    // start all hdfs slaves
    clusterCommand = new ClusterCommand(cluster.getDataNodes());
    cmd = "sudo service hadoop-0.20-datanode start";
    clusterCommand.runCommandWaitForAll(cmd);
    // start all tasktrackers
    clusterCommand = new ClusterCommand(cluster.getTaskTrackers());
    cmd = "sudo service hadoop-0.20-tasktracker start";
    clusterCommand.runCommandWaitForAll(cmd);

    sshAgent.setHost(cluster.getJobTracker().getDnsName());
    cmd = "sudo service hadoop-0.20-jobtracker start";
    output = sshAgent.executeCommand(cmd);
    logger.info(Util.arrayToString(output));
    logger.info("Cluster configuration and startup is complete");

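    // remove the stock jets3t jars from the Hadoop lib directory on all nodes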
    cmd = "sudo rm /usr/lib/hadoop/lib/jets3t*.jar";
    clusterCommand = new ClusterCommand(cluster);
    clusterCommand.runCommandWaitForAll(cmd);
    // install a fresh version of FreeEed
    installFreeEed();
    // run a distributed grep app
    verifyOperation();
    if (callingUI != null) {
      callingUI.refreshStatus();
    }
  }