public void run() throws Exception {
  long startTime = System.currentTimeMillis();

  JobConf conf = new JobConf(ItemCFJob.class);
  conf.setJobName("ItemCF" + System.currentTimeMillis());
  conf.setNumMapTasks(10);
  conf.set("io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
          + "org.apache.hadoop.io.serializer.WritableSerialization");

  // Build the argument list for Mahout's item-based RecommenderJob.
  StringBuilder sb = new StringBuilder();
  sb.append("--input ").append(input);
  sb.append(" --output ").append(output);
  if (flag) {
    sb.append(" --booleanData true");
  } else {
    sb.append(" --booleanData false");
  }
  sb.append(" --similarityClassname ").append(Constants.mahout_similarityclassname);
  sb.append(" --tempDir ").append(tmp);
  String[] args = sb.toString().split(" ");

  RecommenderJob job = new RecommenderJob();
  job.setConf(conf);
  job.run(args);

  long endTime = System.currentTimeMillis();
  logger.info("recommendation job [" + conf.getJobName() + "] finished; it took "
      + (endTime - startTime) / 1000 + "s.");
}
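// For reference, a hedged sketch (not part of the original source) of driving
// Mahout's RecommenderJob directly through ToolRunner with the same flags the
// run() method above assembles; the paths and the SIMILARITY_LOGLIKELIHOOD
// value are placeholders standing in for input/output/tmp and
// Constants.mahout_similarityclassname.
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;

public class ItemCFDriverSketch {
  public static void main(String[] args) throws Exception {
    String[] recommenderArgs = {
        "--input", "/data/itemcf/ratings",
        "--output", "/data/itemcf/recommendations",
        "--booleanData", "true",
        "--similarityClassname", "SIMILARITY_LOGLIKELIHOOD",
        "--tempDir", "/tmp/itemcf"
    };
    ToolRunner.run(new RecommenderJob(), recommenderArgs);
  }
}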
@SuppressWarnings({"deprecation", "null"}) public static void preprocessAndNumberizeFiles(Configuration c, String inputPaths, Path output) throws IOException { sLogger.setLevel(Level.INFO); JobConf conf = new JobConf(c); conf.setJobName("bitext.compile"); boolean useVocabServer = false; Thread vst1 = null; Thread vst2 = null; VocabServer vocabServer1 = null; VocabServer vocabServer2 = null; try { // inputPaths = bi-text given as input in main method of HadoopAlign conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(PhrasePair.class); conf.setMapperClass(BitextCompilerMapper.class); conf.setReducerClass(IdentityReducer.class); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPaths); conf.set("stream.recordreader.begin", "<pchunk"); conf.set("stream.recordreader.end", "</pchunk>"); conf.set("stream.recordreader.slowmatch", "false"); conf.set("stream.recordreader.maxrec", "100000"); conf.setInputFormat(XMLInput.class); FileOutputFormat.setOutputPath(conf, output); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setJar("/chomes/fture/jars/ivory.jar"); conf.set("mapred.child.java.opts", "-Xmx2048m"); System.out.println("Running job " + conf.getJobName()); System.out.println("Input: " + inputPaths); System.out.println("Output: " + output); JobClient.runJob(conf); } finally { try { if (vst1 != null) vocabServer1.stopServer(); if (vst2 != null) vocabServer2.stopServer(); if (vst1 != null) vst1.join(); if (vst2 != null) vst2.join(); } catch (InterruptedException e) { } } }
public static void main(String[] args) {
  String accessKey = args[0];
  String secretKey = args[1];

  String[] paths = {
      // "2008/06",
      // "2008/07",
      // "2008/08",
      // "2008/09",
      // "2008/10",
      // "2008/11",
      "2009"
  };

  for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

    LOG.info("Processing Path:" + paths[pathIndex]);

    JobConf job = new JobConf(S3GetMetdataJob.class);

    Path tempDir = new Path(
        job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
    System.out.println("Output Path is:" + tempDir);

    job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

    // setup s3 properties
    JetS3tARCSource.setMaxRetries(job, 1);
    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
    ARCSplitCalculator.setFilesPerSplit(job, 25);
    // set up arc reader properties
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
    // and set up input format ...
    job.setInputFormat(ARCInputFormat.class);
    // set mapper ...
    job.setMapRunnerClass(S3GetMetdataJob.class);
    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    FileOutputFormat.setOutputPath(job, tempDir);
    // map output types
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlURLMetadata.class);
    // reduce output types
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlURLMetadata.class);

    // double the number of reducers ...
    // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

    // run the job ...
    try {
      LOG.info("Starting Job:" + job.getJobName());
      JobClient.runJob(job);
      LOG.info("Finished Job:" + job.getJobName());

      Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
      LOG.info("Copying Job Output to:" + finalPath);
      FileSystem fs = FileSystem.get(job);

      try {
        fs.mkdirs(finalPath.getParent());
        fs.rename(tempDir, finalPath);
        LOG.info("Copied Job Output to:" + finalPath);
      } finally {
        // fs.close();
      }
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      e.printStackTrace();
    }
  }
}
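// A hedged sketch (not from the original source) of scanning the job's
// SequenceFile output of (Text, CrawlURLMetadata) pairs after the rename to
// jobout/<path>/result; the part-file name is a placeholder, and the value
// type is instantiated reflectively so the sketch does not depend on
// CrawlURLMetadata's package.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class DumpJobOutputSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path part = new Path("jobout/2009/result/part-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, part, conf);
    try {
      Text key = new Text();
      // The value class recorded in the file header is CrawlURLMetadata.
      Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
      }
    } finally {
      reader.close();
    }
  }
}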
/** Runs this tool. */
@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {
  JobConf job = new JobConf(getConf(), Docnos2Titles.class);

  // Read commandline arguments
  CommandLine cmdline = parseArgs(args);
  if (cmdline == null) {
    printUsage();
    return -1;
  }
  String eCollectionPath = cmdline.getOptionValue(ECOLLECTION_OPTION);
  String fCollectionPath = cmdline.getOptionValue(FCOLLECTION_OPTION);
  String pwsimOutputPath = cmdline.getOptionValue(PWSIM_OPTION);
  String titlePairsPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
  String eLang = cmdline.getOptionValue(ELANG_OPTION);
  String fLang = cmdline.getOptionValue(FLANG_OPTION);
  String samplesFile = cmdline.getOptionValue(SAMPLEDOCNOS_OPTION);

  job.setJobName("Docnos2Titles_" + fLang + "-" + eLang);

  FileInputFormat.addInputPaths(job, eCollectionPath);
  FileInputFormat.addInputPaths(job, fCollectionPath);
  FileOutputFormat.setOutputPath(job, new Path(titlePairsPath));
  DistributedCache.addCacheFile(new URI(pwsimOutputPath), job);
  DistributedCache.addCacheFile(new URI(samplesFile), job);
  job.set("eLang", eLang);
  job.set("fLang", fLang);
  job.set("PwsimPairs", pwsimOutputPath);
  job.set("Ivory.SampleFile", samplesFile);

  job.setInt("mapred.task.timeout", 60000000);
  job.set("mapreduce.map.memory.mb", "3000");
  job.set("mapreduce.map.java.opts", "-Xmx3000m");
  job.setBoolean("mapred.map.tasks.speculative.execution", false);
  job.setBoolean("mapred.reduce.tasks.speculative.execution", false);
  job.setNumMapTasks(100);
  job.setNumReduceTasks(1);
  job.setInt("mapred.min.split.size", 2000000000);
  job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);
  job.setMapOutputKeyClass(PairOfInts.class);
  job.setMapOutputValueClass(PairOfIntString.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  sLogger.info("Running job " + job.getJobName() + "...");
  sLogger.info("E-collection path: " + eCollectionPath);
  sLogger.info("F-collection path: " + fCollectionPath);
  sLogger.info("Pwsim output path: " + pwsimOutputPath);
  sLogger.info("Output path: " + titlePairsPath);
  sLogger.info("Sample file?: " + ((samplesFile != null) ? samplesFile : "none"));

  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  System.out.println(
      "Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}
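// A minimal launcher sketch (not from the original source): run(String[])
// returning int plus the getConf() call suggest Docnos2Titles implements
// org.apache.hadoop.util.Tool, so it can presumably be started via ToolRunner.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class Docnos2TitlesLauncherSketch {
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new Docnos2Titles(), args);
    System.exit(exitCode);
  }
}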