public void inject(Path crawlDb, Path urlDir) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: starting at " + sdf.format(start));
    LOG.info("Injector: crawlDb: " + crawlDb);
    LOG.info("Injector: urlDir: " + urlDir);
  }

  Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
      + "/inject-temp-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // map text input file to a <url,CrawlDatum> file
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Converting injected urls to crawl db entries.");
  }
  JobConf sortJob = new NutchJob(getConf());
  sortJob.setJobName("inject " + urlDir);
  FileInputFormat.addInputPath(sortJob, urlDir);
  sortJob.setMapperClass(InjectMapper.class);
  FileOutputFormat.setOutputPath(sortJob, tempDir);
  sortJob.setOutputFormat(SequenceFileOutputFormat.class);
  sortJob.setOutputKeyClass(Text.class);
  sortJob.setOutputValueClass(CrawlDatum.class);
  sortJob.setLong("injector.current.time", System.currentTimeMillis());
  RunningJob mapJob = JobClient.runJob(sortJob);

  long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
  long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
  LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
  LOG.info("Injector: total number of urls injected after normalization and filtering: "
      + urlsInjected);

  // merge with existing crawl db
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Merging injected urls into crawl db.");
  }
  JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
  FileInputFormat.addInputPath(mergeJob, tempDir);
  mergeJob.setReducerClass(InjectReducer.class);
  JobClient.runJob(mergeJob);
  CrawlDb.install(mergeJob, crawlDb);

  // clean up
  FileSystem fs = FileSystem.get(getConf());
  fs.delete(tempDir, true);

  long end = System.currentTimeMillis();
  LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
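// A minimal, hypothetical driver sketch for the inject() method above. It assumes the method
// lives in a Configured/Tool-style Injector class with a no-arg constructor and setConf(), as in
// Nutch's Injector; the class wiring and argument handling shown here are illustrative
// assumptions, not the project's exact API.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class InjectDriverExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration(); // in Nutch this would typically be NutchConfiguration.create()
    Injector injector = new Injector();       // assumed host class of inject()
    injector.setConf(conf);
    // args[0]: crawl db directory, args[1]: directory of seed URL list files
    injector.inject(new Path(args[0]), new Path(args[1]));
  }
}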
public void configure(JobConf job) {
  this.jobConf = job;
  urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
  // default re-fetch interval in seconds: 2592000 s = 30 days
  interval = jobConf.getInt("db.fetch.interval.default", 2592000);
  filters = new URLFilters(jobConf);
  scfilters = new ScoringFilters(jobConf);
  scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
  curTime = job.getLong("injector.current.time", System.currentTimeMillis());
}
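// A small sketch of where the configuration keys read by configure() above could be set; the
// keys are taken directly from that method, and the values shown are simply its own fallbacks.
import org.apache.hadoop.mapred.JobConf;

public class InjectorConfExample {
  public static JobConf exampleConf() {
    JobConf job = new JobConf();
    job.setInt("db.fetch.interval.default", 2592000);   // re-fetch interval in seconds (30 days)
    job.setFloat("db.score.injected", 1.0f);            // initial score for injected URLs
    job.setLong("injector.current.time", System.currentTimeMillis()); // normally set by the inject job itself
    return job;
  }
}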
public void bumpProgress() {
  numWritten++;
  if (numWritten % 25000 == 0) {
    long now = System.currentTimeMillis();
    long delta = now - lastCheckpoint;
    lastCheckpoint = now;
    LOG.info("Wrote last 25000 records in " + delta + " ms");
    localManager.progress();
  }
}
private static void analyzeResult(FileSystem fs, int testType, long execTime, String resFileName)
    throws IOException {
  Path reduceFile;
  if (testType == TEST_TYPE_WRITE)
    reduceFile = new Path(WRITE_DIR, "part-00000");
  else
    reduceFile = new Path(READ_DIR, "part-00000");

  DataInputStream in = new DataInputStream(fs.open(reduceFile));
  BufferedReader lines = new BufferedReader(new InputStreamReader(in));

  long tasks = 0;
  long size = 0;
  long time = 0;
  float rate = 0;
  float sqrate = 0;
  String line;
  // Pick the accumulated metrics out of the reducer output, matching on the attribute suffix.
  while ((line = lines.readLine()) != null) {
    StringTokenizer tokens = new StringTokenizer(line, " \t\n\r\f%");
    String attr = tokens.nextToken();
    if (attr.endsWith(":tasks"))
      tasks = Long.parseLong(tokens.nextToken());
    else if (attr.endsWith(":size"))
      size = Long.parseLong(tokens.nextToken());
    else if (attr.endsWith(":time"))
      time = Long.parseLong(tokens.nextToken());
    else if (attr.endsWith(":rate"))
      rate = Float.parseFloat(tokens.nextToken());
    else if (attr.endsWith(":sqrate"))
      sqrate = Float.parseFloat(tokens.nextToken());
  }

  // rate and sqrate accumulate the per-task IO rates (scaled by 1000) and their squares,
  // so med and stdDev are the mean and standard deviation of the per-task rates.
  double med = rate / 1000 / tasks;
  double stdDev = Math.sqrt(Math.abs(sqrate / 1000 / tasks - med * med));
  String[] resultLines = {
    "----- DFSCIOTest ----- : "
        + ((testType == TEST_TYPE_WRITE)
            ? "write"
            : (testType == TEST_TYPE_READ) ? "read" : "unknown"),
    "           Date & time: " + new Date(System.currentTimeMillis()),
    "       Number of files: " + tasks,
    "Total MBytes processed: " + size / MEGA,
    "     Throughput mb/sec: " + size * 1000.0 / (time * MEGA),
    "Average IO rate mb/sec: " + med,
    " Std IO rate deviation: " + stdDev,
    "    Test exec time sec: " + (float) execTime / 1000,
    ""
  };

  PrintStream res = new PrintStream(new FileOutputStream(new File(resFileName), true));
  for (int i = 0; i < resultLines.length; i++) {
    LOG.info(resultLines[i]);
    res.println(resultLines[i]);
  }
}
public int run(String[] args) throws Exception {
  if (args.length < 4) {
    System.out.println("ERROR: please supply args: input output type(text|seq) splitChar (9 = \\t)");
    return -1; // bad usage, don't report success
  }
  String input = args[0];
  String output = args[1];
  String type = args[2];
  String splitChar = args[3];

  JobConf config = new JobConf(getConf(), getClass());
  config.set("user.split", splitChar);
  config.setJobName("File Filter -" + System.currentTimeMillis());
  config.setNumReduceTasks(10);
  config.setReducerClass(IdentityReducer.class);
  config.setMapperClass(FileTestMapper.class);
  if ("text".equals(type)) {
    config.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(config, new Path(input));
  } else {
    config.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(config, new Path(input));
  }
  config.setMapOutputKeyClass(Text.class);
  config.setMapOutputValueClass(Text.class);
  config.setOutputKeyClass(Text.class);
  config.setOutputValueClass(Text.class);

  // only run the job if the output path does not exist yet
  FileSystem fs = FileSystem.get(config);
  Path outputPath = new Path(output);
  FileOutputFormat.setOutputPath(config, outputPath);
  if (!fs.exists(outputPath)) {
    JobClient.runJob(config);
  } else {
    System.out.println("Output path already exists, skipping job: " + outputPath);
  }
  return JobClient.SUCCESS;
}
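// Hypothetical launcher for the run() method above, assuming its enclosing class (called
// FileFilterTool here purely for illustration) implements org.apache.hadoop.util.Tool.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class FileFilterLauncher {
  public static void main(String[] args) throws Exception {
    // input dir, output dir, input type ("text" or "seq"), split char (the usage hint suggests 9 = tab)
    String[] jobArgs = {"/data/in", "/data/out", "text", "9"};
    int exitCode = ToolRunner.run(new Configuration(), new FileFilterTool(), jobArgs);
    System.exit(exitCode);
  }
}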
public int run(String[] args) throws Exception {
  if (args.length < 1) {
    args = new String[] {DateStringUtils.now()};
    System.out.println("WARNING: no date given (e.g. 20101010); using default => " + DateStringUtils.now());
  }

  JobConf config = new JobConf(getConf(), getClass());
  config.set("user.args", Utils.asString(args));
  config.setJobName(getClass() + "-" + System.currentTimeMillis());
  config.setNumReduceTasks(100);
  config.setMapperClass(getClass());
  config.setReducerClass(getClass());
  config.setInputFormat(getInputFormat());
  config.setMapOutputKeyClass(Text.class);
  config.setMapOutputValueClass(Text.class);

  // add input paths
  for (String path : getInputPath(args)) {
    if (TextInputFormat.class.equals(getInputFormat())) {
      TextInputFormat.addInputPath(config, new Path(path));
    } else if (SequenceFileInputFormat.class.equals(getInputFormat())) {
      SequenceFileInputFormat.addInputPath(config, new Path(path));
    }
  }
  config.setOutputKeyClass(Text.class);
  config.setOutputValueClass(Text.class);

  // only run the job if the output path does not exist yet
  FileSystem fs = FileSystem.get(config);
  Path outputPath = new Path(getOutputPath(args));
  FileOutputFormat.setOutputPath(config, outputPath);
  if (!fs.exists(outputPath)) {
    JobClient.runJob(config);
  } else {
    System.out.println("Job output already exists (already run today?): " + outputPath);
  }
  return JobClient.SUCCESS;
}
public void testFsCache() throws Exception {
  {
    long now = System.currentTimeMillis();
    String[] users = new String[] {"foo", "bar"};
    final Configuration conf = new Configuration();
    FileSystem[] fs = new FileSystem[users.length];

    for (int i = 0; i < users.length; i++) {
      UserGroupInformation ugi = UserGroupInformation.createRemoteUser(users[i]);
      fs[i] = ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
        public FileSystem run() throws IOException {
          return FileSystem.get(conf);
        }
      });
      for (int j = 0; j < i; j++) {
        assertFalse(fs[j] == fs[i]);
      }
    }
    FileSystem.closeAll();
  }

  {
    try {
      runTestCache(HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT);
    } catch (java.net.BindException be) {
      LOG.warn("Cannot test HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT (="
          + HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT + ")", be);
    }
    runTestCache(0);
  }
}
public static void main(String[] args) {
  int testType = TEST_TYPE_READ;
  int bufferSize = DEFAULT_BUFFER_SIZE;
  int fileSize = 1;
  int nrFiles = 1;
  String resFileName = DEFAULT_RES_FILE_NAME;
  boolean isSequential = false;

  String version = "DFSCIOTest.0.0.1";
  String usage = "Usage: DFSCIOTest -read | -write | -clean "
      + "[-nrFiles N] [-fileSize MB] [-resFile resultFileName] [-bufferSize Bytes] ";

  System.out.println(version);
  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
  for (int i = 0; i < args.length; i++) { // parse command line
    if (args[i].startsWith("-r")) {
      testType = TEST_TYPE_READ;
    } else if (args[i].startsWith("-w")) {
      testType = TEST_TYPE_WRITE;
    } else if (args[i].startsWith("-clean")) {
      testType = TEST_TYPE_CLEANUP;
    } else if (args[i].startsWith("-seq")) {
      isSequential = true;
    } else if (args[i].equals("-nrFiles")) {
      nrFiles = Integer.parseInt(args[++i]);
    } else if (args[i].equals("-fileSize")) {
      fileSize = Integer.parseInt(args[++i]);
    } else if (args[i].equals("-bufferSize")) {
      bufferSize = Integer.parseInt(args[++i]);
    } else if (args[i].equals("-resFile")) {
      resFileName = args[++i];
    }
  }

  LOG.info("nrFiles = " + nrFiles);
  LOG.info("fileSize (MB) = " + fileSize);
  LOG.info("bufferSize = " + bufferSize);

  try {
    fsConfig.setInt("test.io.file.buffer.size", bufferSize);
    FileSystem fs = FileSystem.get(fsConfig);

    if (testType != TEST_TYPE_CLEANUP) {
      fs.delete(HDFS_TEST_DIR, true);
      if (!fs.mkdirs(HDFS_TEST_DIR)) {
        throw new IOException("Mkdirs failed to create " + HDFS_TEST_DIR.toString());
      }

      // Copy the executables over to the remote filesystem
      String hadoopHome = System.getenv("HADOOP_PREFIX");
      fs.copyFromLocalFile(new Path(hadoopHome + "/libhdfs/libhdfs.so." + HDFS_LIB_VERSION),
          HDFS_SHLIB);
      fs.copyFromLocalFile(new Path(hadoopHome + "/libhdfs/hdfs_read"), HDFS_READ);
      fs.copyFromLocalFile(new Path(hadoopHome + "/libhdfs/hdfs_write"), HDFS_WRITE);
    }

    if (isSequential) {
      long tStart = System.currentTimeMillis();
      sequentialTest(fs, testType, fileSize, nrFiles);
      long execTime = System.currentTimeMillis() - tStart;
      String resultLine = "Seq Test exec time sec: " + (float) execTime / 1000;
      LOG.info(resultLine);
      return;
    }
    if (testType == TEST_TYPE_CLEANUP) {
      cleanup(fs);
      return;
    }
    createControlFile(fs, fileSize, nrFiles);
    long tStart = System.currentTimeMillis();
    if (testType == TEST_TYPE_WRITE) writeTest(fs);
    if (testType == TEST_TYPE_READ) readTest(fs);
    long execTime = System.currentTimeMillis() - tStart;
    analyzeResult(fs, testType, execTime, resFileName);
  } catch (Exception e) {
    System.err.print(e.getLocalizedMessage());
    System.exit(-1);
  }
}
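// A hedged example of invoking the benchmark programmatically, assuming the main() above belongs
// to the DFSCIOTest class (as its version string suggests); the flags mirror exactly the ones it
// parses (-write / -read / -clean, -nrFiles, -fileSize, -bufferSize, -resFile).
public class DFSCIOTestRunExample {
  public static void main(String[] args) {
    // Write 4 files of 64 MB each, then read them back; both summaries are appended to the result file.
    DFSCIOTest.main(new String[] {"-write", "-nrFiles", "4", "-fileSize", "64"});
    DFSCIOTest.main(new String[] {"-read", "-nrFiles", "4", "-fileSize", "64"});
  }
}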
public class ElephantRecordWriter
    implements RecordWriter<IntWritable, ElephantRecordWritable>, Closeable {

  FileSystem fileSystem;
  Args args;
  Map<Integer, Persistence> lps = new HashMap<Integer, Persistence>();
  Progressable progressable;
  LocalElephantManager localManager;

  int numWritten = 0;
  long lastCheckpoint = System.currentTimeMillis();

  public ElephantRecordWriter(Configuration conf, Args args, Progressable progressable)
      throws IOException {
    fileSystem = Utils.getFS(args.outputDirHdfs, conf);
    this.args = args;
    this.progressable = progressable;
    localManager =
        new LocalElephantManager(fileSystem, args.spec, LocalElephantManager.getTmpDirs(conf));
  }

  // Return the cached persistence for a shard, downloading and opening it on first use.
  private Persistence retrieveShard(int shardIdx) throws IOException {
    Persistence lp = null;
    if (lps.containsKey(shardIdx)) {
      lp = lps.get(shardIdx);
    } else {
      String localShard = localManager.downloadRemoteShard("" + shardIdx, null);
      Coordinator fact = args.spec.getCoordinator();
      lp = fact.openPersistenceForAppend(localShard, args.spec.getPersistenceOptions());
      lps.put(shardIdx, lp);
      progress();
    }
    return lp;
  }

  public void write(IntWritable shard, ElephantRecordWritable carrier) throws IOException {
    Persistence lp = retrieveShard(shard.get());
    NewKeyValDocument doc = new NewKeyValDocument(carrier.key, carrier.value);
    lp.index(doc);
    bumpProgress();
  }

  public void bumpProgress() {
    numWritten++;
    if (numWritten % 25000 == 0) {
      long now = System.currentTimeMillis();
      long delta = now - lastCheckpoint;
      lastCheckpoint = now;
      LOG.info("Wrote last 25000 records in " + delta + " ms");
      localManager.progress();
    }
  }

  public void close() throws IOException {
    close(null);
  }

  public void close(Reporter reporter) throws IOException {
    for (Integer shard : lps.keySet()) {
      String lpDir = localManager.localTmpDir("" + shard);
      LOG.info("Closing LP for shard " + shard + " at " + lpDir);
      lps.get(shard).close();
      LOG.info("Closed LP for shard " + shard + " at " + lpDir);
      progress();

      String remoteDir = args.outputDirHdfs + "/" + shard;

      // Retry the delete a few times to ensure that S3 actually removes the old shard.
      final int maxDeleteAttempts = 4;
      int attemptsLeft = maxDeleteAttempts;
      while (fileSystem.exists(new Path(remoteDir)) && attemptsLeft > 0) {
        LOG.info("Deleting existing shard " + shard + " at " + remoteDir);
        fileSystem.delete(new Path(remoteDir), true);
        --attemptsLeft;
      }
      if (fileSystem.exists(new Path(remoteDir)) && attemptsLeft == 0) {
        throw new IOException("Failed to delete shard " + shard + " at " + remoteDir
            + " after " + maxDeleteAttempts + " attempts!");
      } else {
        LOG.info("Deleted existing shard " + shard + " at " + remoteDir);
      }

      LOG.info("Copying " + lpDir + " to " + remoteDir);
      fileSystem.copyFromLocalFile(new Path(lpDir), new Path(remoteDir));
      LOG.info("Copied " + lpDir + " to " + remoteDir);
      progress();
    }
    localManager.cleanup();
  }

  private void progress() {
    if (progressable != null) progressable.progress();
  }
}