public void testFormat() throws Exception { localFs = FileSystem.getLocal(defaultConf); localFs.delete(workDir, true); Job job = new Job(new Configuration(defaultConf)); Path file = new Path(workDir, "test.txt"); int seed = new Random().nextInt(); Random random = new Random(seed); // for a variety of lengths for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) { // create a file with length entries Writer writer = new OutputStreamWriter(localFs.create(file)); try { MyClass mc = new MyClass(); for (int i = 0; i < length; i++) { mc.s = Integer.toString(i); mc.v = i; byte[] raw = MessagePack.pack(mc); byte[] b64e = base64_.encodeBase64(raw); byte[] b64d = base64_.decode(b64e); MyClass mc2 = MessagePack.unpack(b64d, mc.getClass()); assertEquals(mc.s, mc2.s); assertEquals(mc.v, mc2.v); writer.write(base64_.encodeToString(raw)); } } finally { writer.close(); } checkFormat(job); } }
public void inject(Path crawlDb, Path urlDir) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Injector: starting at " + sdf.format(start)); LOG.info("Injector: crawlDb: " + crawlDb); LOG.info("Injector: urlDir: " + urlDir); } Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/inject-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // map text input file to a <url,CrawlDatum> file if (LOG.isInfoEnabled()) { LOG.info("Injector: Converting injected urls to crawl db entries."); } JobConf sortJob = new NutchJob(getConf()); sortJob.setJobName("inject " + urlDir); FileInputFormat.addInputPath(sortJob, urlDir); sortJob.setMapperClass(InjectMapper.class); FileOutputFormat.setOutputPath(sortJob, tempDir); sortJob.setOutputFormat(SequenceFileOutputFormat.class); sortJob.setOutputKeyClass(Text.class); sortJob.setOutputValueClass(CrawlDatum.class); sortJob.setLong("injector.current.time", System.currentTimeMillis()); RunningJob mapJob = JobClient.runJob(sortJob); long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "Injector: total number of urls injected after normalization and filtering: " + urlsInjected); // merge with existing crawl db if (LOG.isInfoEnabled()) { LOG.info("Injector: Merging injected urls into crawl db."); } JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb); FileInputFormat.addInputPath(mergeJob, tempDir); mergeJob.setReducerClass(InjectReducer.class); JobClient.runJob(mergeJob); CrawlDb.install(mergeJob, crawlDb); // clean up FileSystem fs = FileSystem.get(getConf()); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info( "Injector: finished at " + 
sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
/**
 * Recursively removes the directory named by {@code testPath}, if it exists.
 *
 * <p>An {@link IOException} raised while checking or deleting is logged as a warning and not
 * propagated.
 */
private void dirCleanup() {
  Configuration configuration = new Configuration();
  try {
    FileSystem fileSystem = FileSystem.get(configuration);
    Path target = new Path(testPath);
    if (!fileSystem.exists(target)) {
      // Nothing to remove.
      return;
    }
    fileSystem.delete(target, true);
  } catch (IOException ex) {
    LOG.warn("IO Error in test cleanup", ex);
  }
}
public void copyInitialState(Path origAppDir) throws IOException { // locate previous snapshot String newAppDir = this.dag.assertAppPath(); FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(origAppDir.toString(), conf); // read snapshot against new dependencies Object snapshot = recoveryHandler.restore(); if (snapshot == null) { throw new IllegalArgumentException("No previous application state found in " + origAppDir); } InputStream logIs = recoveryHandler.getLog(); // modify snapshot state to switch app id ((StreamingContainerManager.CheckpointState) snapshot).setApplicationId(this.dag, conf); Path checkpointPath = new Path(newAppDir, LogicalPlan.SUBDIR_CHECKPOINTS); FileSystem fs = FileSystem.newInstance(origAppDir.toUri(), conf); // remove the path that was created by the storage agent during deserialization and replacement fs.delete(checkpointPath, true); // write snapshot to new location recoveryHandler = new FSRecoveryHandler(newAppDir, conf); recoveryHandler.save(snapshot); OutputStream logOs = recoveryHandler.rotateLog(); IOUtils.copy(logIs, logOs); logOs.flush(); logOs.close(); logIs.close(); // copy sub directories that are not present in target FileStatus[] lFiles = fs.listStatus(origAppDir); for (FileStatus f : lFiles) { if (f.isDirectory()) { String targetPath = f.getPath().toString().replace(origAppDir.toString(), newAppDir); if (!fs.exists(new Path(targetPath))) { LOG.debug("Copying {} to {}", f.getPath(), targetPath); FileUtil.copy(fs, f.getPath(), fs, new Path(targetPath), false, conf); // FSUtil.copy(fs, f, fs, new Path(targetPath), false, false, conf); } else { LOG.debug("Ignoring {} as it already exists under {}", f.getPath(), targetPath); // FSUtil.setPermission(fs, new Path(targetPath), new FsPermission((short)0777)); } } } }
public void testFormat() throws Exception { JobConf job = new JobConf(conf); FileSystem fs = FileSystem.getLocal(conf); Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred"); Path file = new Path(dir, "test.seq"); Reporter reporter = Reporter.NULL; int seed = new Random().nextInt(); // LOG.info("seed = "+seed); Random random = new Random(seed); fs.delete(dir, true); FileInputFormat.setInputPaths(job, dir); // for a variety of lengths for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) { // LOG.info("creating; entries = " + length); // create a file with length entries SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class); try { for (int i = 0; i < length; i++) { IntWritable key = new IntWritable(i); byte[] data = new byte[random.nextInt(10)]; random.nextBytes(data); BytesWritable value = new BytesWritable(data); writer.append(key, value); } } finally { writer.close(); } // try splitting the file in a variety of sizes InputFormat<IntWritable, BytesWritable> format = new SequenceFileInputFormat<IntWritable, BytesWritable>(); IntWritable key = new IntWritable(); BytesWritable value = new BytesWritable(); for (int i = 0; i < 3; i++) { int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1; // LOG.info("splitting: requesting = " + numSplits); InputSplit[] splits = format.getSplits(job, numSplits); // LOG.info("splitting: got = " + splits.length); // check each split BitSet bits = new BitSet(length); for (int j = 0; j < splits.length; j++) { RecordReader<IntWritable, BytesWritable> reader = format.getRecordReader(splits[j], job, reporter); try { int count = 0; while (reader.next(key, value)) { // if (bits.get(key.get())) { // LOG.info("splits["+j+"]="+splits[j]+" : " + // key.get()); // LOG.info("@"+reader.getPos()); // } assertFalse("Key in multiple partitions.", bits.get(key.get())); bits.set(key.get()); count++; } // 
LOG.info("splits["+j+"]="+splits[j]+" count=" + // count); } finally { reader.close(); } } assertEquals("Some keys in no partition.", length, bits.cardinality()); } } }
public void doTestTextBatchAppend() throws Exception { LOG.debug("Starting..."); final long rollCount = 10; final long batchSize = 2; final String fileName = "PageView"; String newPath = testPath + "/singleTextBucket"; int totalEvents = 0; int i = 1, j = 1; // clear the test directory Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path dirPath = new Path(newPath); fs.delete(dirPath, true); fs.mkdirs(dirPath); Context context = new Context(); context.put("hdfs.path", newPath); context.put("hdfs.rollCount", String.valueOf(rollCount)); context.put("hdfs.batchSize", String.valueOf(batchSize)); context.put("hdfs.filePrefix", "pageview"); Channel channel = new MemoryChannel(); Configurables.configure(channel, context); sink.setChannel(channel); sink.start(); Calendar eventDate = Calendar.getInstance(); Date currentDate = new Date(); Map<String, String> header = new HashMap<String, String>(); header.put("topic", "PageView"); List<String> bodies = Lists.newArrayList(); // 将测试的事件推入到通道中 for (i = 1; i <= (rollCount * 10) / batchSize; i++) { Transaction txn = channel.getTransaction(); txn.begin(); for (j = 1; j <= batchSize; j++) { header.put("timestamp", String.valueOf(currentDate.getTime())); Event event = new SimpleEvent(); eventDate.clear(); eventDate.set(2014, i, i, i, 0); String body = "Test." + i + "." + j; event.setHeaders(header); event.setBody(body.getBytes()); bodies.add(body); channel.put(event); totalEvents++; } txn.commit(); txn.close(); // execute sink to process the events sink.process(); } sink.stop(); FileStatus[] dirStat = fs.listStatus(dirPath); Path fList[] = FileUtil.stat2Paths(dirStat); long expectedFiles = totalEvents / rollCount; if (totalEvents % rollCount > 0) { expectedFiles++; } Assert.assertEquals( "num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length); // 检查所有写入文件的内容 verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies); }
static void checkRecords( Configuration defaults, int noMaps, int noReduces, Path sortInput, Path sortOutput) throws IOException { JobConf jobConf = new JobConf(defaults, RecordChecker.class); jobConf.setJobName("sortvalidate-record-checker"); jobConf.setInputFormat(SequenceFileInputFormat.class); jobConf.setOutputFormat(SequenceFileOutputFormat.class); jobConf.setOutputKeyClass(BytesWritable.class); jobConf.setOutputValueClass(IntWritable.class); jobConf.setMapperClass(Map.class); jobConf.setReducerClass(Reduce.class); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); if (noMaps == -1) { noMaps = cluster.getTaskTrackers() * jobConf.getInt("test.sortvalidate.maps_per_host", 10); } if (noReduces == -1) { noReduces = (int) (cluster.getMaxReduceTasks() * 0.9); String sortReduces = jobConf.get("test.sortvalidate.reduces_per_host"); if (sortReduces != null) { noReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces); } } jobConf.setNumMapTasks(noMaps); jobConf.setNumReduceTasks(noReduces); FileInputFormat.setInputPaths(jobConf, sortInput); FileInputFormat.addInputPath(jobConf, sortOutput); Path outputPath = new Path("/tmp/sortvalidate/recordchecker"); FileSystem fs = FileSystem.get(defaults); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(jobConf, outputPath); // Uncomment to run locally in a single process // job_conf.set("mapred.job.tracker", "local"); Path[] inputPaths = FileInputFormat.getInputPaths(jobConf); System.out.println( "\nSortValidator.RecordChecker: Running on " + cluster.getTaskTrackers() + " nodes to validate sort from " + inputPaths[0] + ", " + inputPaths[1] + " into " + FileOutputFormat.getOutputPath(jobConf) + " with " + noReduces + " reduces."); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println( "The job 
took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); }
static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput) throws IOException { FileSystem inputfs = sortInput.getFileSystem(defaults); FileSystem outputfs = sortOutput.getFileSystem(defaults); FileSystem defaultfs = FileSystem.get(defaults); JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class); jobConf.setJobName("sortvalidate-recordstats-checker"); int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length; jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks); int noSortInputpaths = inputfs.listStatus(sortInput).length; jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class); jobConf.setOutputFormat(SequenceFileOutputFormat.class); jobConf.setOutputKeyClass(IntWritable.class); jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class); jobConf.setMapperClass(Map.class); jobConf.setCombinerClass(Reduce.class); jobConf.setReducerClass(Reduce.class); jobConf.setNumMapTasks(noSortReduceTasks); jobConf.setNumReduceTasks(1); FileInputFormat.setInputPaths(jobConf, sortInput); FileInputFormat.addInputPath(jobConf, sortOutput); Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker"); if (defaultfs.exists(outputPath)) { defaultfs.delete(outputPath, true); } FileOutputFormat.setOutputPath(jobConf, outputPath); // Uncomment to run locally in a single process // job_conf.set("mapred.job.tracker", "local"); Path[] inputPaths = FileInputFormat.getInputPaths(jobConf); System.out.println( "\nSortValidator.RecordStatsChecker: Validate sort " + "from " + inputPaths[0] + " (" + noSortInputpaths + " files), " + inputPaths[1] + " (" + noSortReduceTasks + " files) into " + FileOutputFormat.getOutputPath(jobConf) + " with 1 reducer."); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println( "The job took " + (end_time.getTime() - 
startTime.getTime()) / 1000 + " seconds."); // Check to ensure that the statistics of the // framework's sort-input and sort-output match SequenceFile.Reader stats = new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults); IntWritable k1 = new IntWritable(); IntWritable k2 = new IntWritable(); RecordStatsWritable v1 = new RecordStatsWritable(); RecordStatsWritable v2 = new RecordStatsWritable(); if (!stats.next(k1, v1)) { throw new IOException("Failed to read record #1 from reduce's output"); } if (!stats.next(k2, v2)) { throw new IOException("Failed to read record #2 from reduce's output"); } if ((v1.getBytes() != v2.getBytes()) || (v1.getRecords() != v2.getRecords()) || v1.getChecksum() != v2.getChecksum()) { throw new IOException( "(" + v1.getBytes() + ", " + v1.getRecords() + ", " + v1.getChecksum() + ") v/s (" + v2.getBytes() + ", " + v2.getRecords() + ", " + v2.getChecksum() + ")"); } }