public void run() throws Exception {
   long startTime = System.currentTimeMillis();
   JobConf conf = new JobConf(ItemCFJob.class);
   conf.setJobName("ItemCF" + System.currentTimeMillis());
   conf.setNumMapTasks(10);
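   // Register Java serialization alongside the default Writable serialization so that
   // plain Serializable intermediate values can be shuffled between tasks.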
   conf.set(
       "io.serializations",
       "org.apache.hadoop.io.serializer.JavaSerialization,"
           + "org.apache.hadoop.io.serializer.WritableSerialization");
   StringBuilder sb = new StringBuilder();
   sb.append("--input ").append(input);
   sb.append(" --output ").append(output);
   if (flag) {
     sb.append(" --booleanData true");
   } else {
     sb.append(" --booleanData false");
   }
   sb.append(" --similarityClassname " + Constants.mahout_similarityclassname);
   sb.append(" --tempDir ").append(tmp);
   String[] args = sb.toString().split(" ");
   RecommenderJob job = new RecommenderJob();
   job.setConf(conf);
   job.run(args);
   long endTime = System.currentTimeMillis();
   logger.info(
       "recommendation job ["
           + conf.getJobName()
           + "] finished. It took "
           + (endTime - startTime) / 1000
           + "s.");
 }
  @SuppressWarnings({"deprecation", "null"})
  public static void preprocessAndNumberizeFiles(Configuration c, String inputPaths, Path output)
      throws IOException {
    sLogger.setLevel(Level.INFO);

    JobConf conf = new JobConf(c);

    conf.setJobName("bitext.compile");

     // The vocabulary-server path is never enabled in this snippet, so the threads and
     // servers below stay null and the cleanup in the finally block is effectively a no-op.
     boolean useVocabServer = false;

    Thread vst1 = null;
    Thread vst2 = null;
    VocabServer vocabServer1 = null;
    VocabServer vocabServer2 = null;
    try {
      // inputPaths = bi-text given as input in main method of HadoopAlign
      conf.setOutputKeyClass(Text.class);
      conf.setOutputValueClass(PhrasePair.class);
      conf.setMapperClass(BitextCompilerMapper.class);
      conf.setReducerClass(IdentityReducer.class);
      conf.setNumMapTasks(1);
      conf.setNumReduceTasks(1);
      FileInputFormat.setInputPaths(conf, inputPaths);
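      // Records are delimited by <pchunk ...> ... </pchunk> tags; these properties
      // configure the streaming-style XML record reader used by XMLInput.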
      conf.set("stream.recordreader.begin", "<pchunk");
      conf.set("stream.recordreader.end", "</pchunk>");
      conf.set("stream.recordreader.slowmatch", "false");
      conf.set("stream.recordreader.maxrec", "100000");
      conf.setInputFormat(XMLInput.class);
      FileOutputFormat.setOutputPath(conf, output);
      conf.setOutputFormat(SequenceFileOutputFormat.class);
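      // The job jar location is hard-coded and each child task JVM gets a 2 GB heap.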
      conf.setJar("/chomes/fture/jars/ivory.jar");
      conf.set("mapred.child.java.opts", "-Xmx2048m");
      System.out.println("Running job " + conf.getJobName());
      System.out.println("Input: " + inputPaths);
      System.out.println("Output: " + output);
      JobClient.runJob(conf);
    } finally {
      try {
        if (vst1 != null) vocabServer1.stopServer();
        if (vst2 != null) vocabServer2.stopServer();
        if (vst1 != null) vst1.join();
        if (vst2 != null) vst2.join();
      } catch (InterruptedException e) {
        // Restore the interrupt status instead of silently swallowing it.
        Thread.currentThread().interrupt();
      }
    }
  }
  public static void main(String[] args) {

    String accessKey = args[0];
    String secretKey = args[1];

    String[] paths = {
      // "2008/06",
      // "2008/07",
      // "2008/08",
      // "2008/09",
      // "2008/10",
      // "2008/11",
      "2009"
    };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

      LOG.info("Processing Path:" + paths[pathIndex]);

      JobConf job = new JobConf(S3GetMetdataJob.class);

      Path tempDir =
          new Path(
              job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

      LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
      System.out.println("Output Path is:" + tempDir);

      job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

      // setup s3 properties
      JetS3tARCSource.setMaxRetries(job, 1);
      // set up S3 credentials ...
      JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
      JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
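      // pack 25 ARC files into each input split ...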
      ARCSplitCalculator.setFilesPerSplit(job, 25);
      // set up arc reader properties
      ArcFileReader.setIOTimeoutValue(30000);
      // set input prefixes ...
      JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
      // and S3 bucket name ...
      JetS3tARCSource.setBucketName(job, "commoncrawl");
      // and setup arc source for ArcInputFormat
      ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
      // and set up input format ...
      job.setInputFormat(ARCInputFormat.class);
      // set mapper ...
      job.setMapRunnerClass(S3GetMetdataJob.class);
      // setup reducer (identity in this case ... )
      job.setReducerClass(IdentityReducer.class);
      // standard output format ...
      job.setOutputFormat(SequenceFileOutputFormat.class);
      // set output path
      FileOutputFormat.setOutputPath(job, tempDir);
      // map output types
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(CrawlURLMetadata.class);
      // reduce output types
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(CrawlURLMetadata.class);
      // double the number of reducers ...
      // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

      // run the job ...
      try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());

        Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
        LOG.info("Copying Job Output to:" + finalPath);
        FileSystem fs = FileSystem.get(job);

        try {
          fs.mkdirs(finalPath.getParent());
          fs.rename(tempDir, finalPath);
          LOG.info("Copied Job Output to:" + finalPath);
        } finally {
          // fs.close();
        }

      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
      }
    }
  }
  /** Runs this tool. */
  @SuppressWarnings("deprecation")
  public int run(String[] args) throws Exception {
    JobConf job = new JobConf(getConf(), Docnos2Titles.class);

    // Read commandline arguments
    CommandLine cmdline = parseArgs(args);
    if (cmdline == null) {
      printUsage();
      return -1;
    }
    String eCollectionPath = cmdline.getOptionValue(ECOLLECTION_OPTION);
    String fCollectionPath = cmdline.getOptionValue(FCOLLECTION_OPTION);
    String pwsimOutputPath = cmdline.getOptionValue(PWSIM_OPTION);
    String titlePairsPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
    String eLang = cmdline.getOptionValue(ELANG_OPTION);
    String fLang = cmdline.getOptionValue(FLANG_OPTION);
    String samplesFile = cmdline.getOptionValue(SAMPLEDOCNOS_OPTION);
    job.setJobName("Docnos2Titles_" + fLang + "-" + eLang);

    FileInputFormat.addInputPaths(job, eCollectionPath);
    FileInputFormat.addInputPaths(job, fCollectionPath);
    FileOutputFormat.setOutputPath(job, new Path(titlePairsPath));
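    // Ship the pwsim output and the sample docnos file to every task via the distributed cache.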
    DistributedCache.addCacheFile(new URI(pwsimOutputPath), job);
    DistributedCache.addCacheFile(new URI(samplesFile), job);
    job.set("eLang", eLang);
    job.set("fLang", fLang);
    job.set("PwsimPairs", pwsimOutputPath);
    job.set("Ivory.SampleFile", samplesFile);

    job.setInt("mapred.task.timeout", 60000000);
    job.set("mapreduce.map.memory.mb", "3000");
    job.set("mapreduce.map.java.opts", "-Xmx3000m");
    job.setBoolean("mapred.map.tasks.speculative.execution", false);
    job.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setNumMapTasks(100);
    job.setNumReduceTasks(1);
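    // Use very large input splits (2 GB minimum) and start reducers only after
    // 90% of the maps have completed.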
    job.setInt("mapred.min.split.size", 2000000000);
    job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(PairOfIntString.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    sLogger.info("Running job " + job.getJobName() + "...");
    sLogger.info("E-collection path: " + eCollectionPath);
    sLogger.info("F-collection path: " + fCollectionPath);
    sLogger.info("Pwsim output path: " + pwsimOutputPath);
    sLogger.info("Output path: " + titlePairsPath);
    sLogger.info("Sample file?: " + ((samplesFile != null) ? samplesFile : "none"));

    long startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    System.out.println(
        "Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }