public void testFormat() throws Exception {
    localFs = FileSystem.getLocal(defaultConf);
    localFs.delete(workDir, true);

    Job job = new Job(new Configuration(defaultConf));
    Path file = new Path(workDir, "test.txt");

    int seed = new Random().nextInt();
    Random random = new Random(seed);

    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
      // create a file with length entries
      Writer writer = new OutputStreamWriter(localFs.create(file));
      try {
        MyClass mc = new MyClass();
        for (int i = 0; i < length; i++) {
          mc.s = Integer.toString(i);
          mc.v = i;
          byte[] raw = MessagePack.pack(mc);
          byte[] b64e = base64_.encodeBase64(raw);
          byte[] b64d = base64_.decode(b64e);
          MyClass mc2 = MessagePack.unpack(b64d, mc.getClass());
          assertEquals(mc.s, mc2.s);
          assertEquals(mc.v, mc2.v);

          writer.write(base64_.encodeToString(raw));
        }
      } finally {
        writer.close();
      }
      checkFormat(job);
    }
  }
Example #2
0
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    RunningJob mapJob = JobClient.runJob(sortJob);

    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "Injector: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }
Example #3
0
 private void dirCleanup() {
   Configuration conf = new Configuration();
   try {
     FileSystem fs = FileSystem.get(conf);
     Path dirPath = new Path(testPath);
     if (fs.exists(dirPath)) {
       fs.delete(dirPath, true);
     }
   } catch (IOException ex) {
     LOG.warn("IO Error in test cleanup", ex);
   }
 }
Example #4
0
  public void copyInitialState(Path origAppDir) throws IOException {
    // locate previous snapshot
    String newAppDir = this.dag.assertAppPath();

    FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(origAppDir.toString(), conf);
    // read snapshot against new dependencies
    Object snapshot = recoveryHandler.restore();
    if (snapshot == null) {
      throw new IllegalArgumentException("No previous application state found in " + origAppDir);
    }
    InputStream logIs = recoveryHandler.getLog();

    // modify snapshot state to switch app id
    ((StreamingContainerManager.CheckpointState) snapshot).setApplicationId(this.dag, conf);
    Path checkpointPath = new Path(newAppDir, LogicalPlan.SUBDIR_CHECKPOINTS);

    FileSystem fs = FileSystem.newInstance(origAppDir.toUri(), conf);
    // remove the path that was created by the storage agent during deserialization and replacement
    fs.delete(checkpointPath, true);

    // write snapshot to new location
    recoveryHandler = new FSRecoveryHandler(newAppDir, conf);
    recoveryHandler.save(snapshot);
    OutputStream logOs = recoveryHandler.rotateLog();
    IOUtils.copy(logIs, logOs);
    logOs.flush();
    logOs.close();
    logIs.close();

    // copy sub directories that are not present in target
    FileStatus[] lFiles = fs.listStatus(origAppDir);
    for (FileStatus f : lFiles) {
      if (f.isDirectory()) {
        String targetPath = f.getPath().toString().replace(origAppDir.toString(), newAppDir);
        if (!fs.exists(new Path(targetPath))) {
          LOG.debug("Copying {} to {}", f.getPath(), targetPath);
          FileUtil.copy(fs, f.getPath(), fs, new Path(targetPath), false, conf);
          // FSUtil.copy(fs, f, fs, new Path(targetPath), false, false, conf);
        } else {
          LOG.debug("Ignoring {} as it already exists under {}", f.getPath(), targetPath);
          // FSUtil.setPermission(fs, new Path(targetPath), new FsPermission((short)0777));
        }
      }
    }
  }
  public void testFormat() throws Exception {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "test.seq");

    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    // LOG.info("seed = "+seed);
    Random random = new Random(seed);

    fs.delete(dir, true);

    FileInputFormat.setInputPaths(job, dir);

    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {

      // LOG.info("creating; entries = " + length);

      // create a file with length entries
      SequenceFile.Writer writer =
          SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class);
      try {
        for (int i = 0; i < length; i++) {
          IntWritable key = new IntWritable(i);
          byte[] data = new byte[random.nextInt(10)];
          random.nextBytes(data);
          BytesWritable value = new BytesWritable(data);
          writer.append(key, value);
        }
      } finally {
        writer.close();
      }

      // try splitting the file in a variety of sizes
      InputFormat<IntWritable, BytesWritable> format =
          new SequenceFileInputFormat<IntWritable, BytesWritable>();
      IntWritable key = new IntWritable();
      BytesWritable value = new BytesWritable();
      for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
        // LOG.info("splitting: requesting = " + numSplits);
        InputSplit[] splits = format.getSplits(job, numSplits);
        // LOG.info("splitting: got =        " + splits.length);

        // check each split
        BitSet bits = new BitSet(length);
        for (int j = 0; j < splits.length; j++) {
          RecordReader<IntWritable, BytesWritable> reader =
              format.getRecordReader(splits[j], job, reporter);
          try {
            int count = 0;
            while (reader.next(key, value)) {
              // if (bits.get(key.get())) {
              // LOG.info("splits["+j+"]="+splits[j]+" : " +
              // key.get());
              // LOG.info("@"+reader.getPos());
              // }
              assertFalse("Key in multiple partitions.", bits.get(key.get()));
              bits.set(key.get());
              count++;
            }
            // LOG.info("splits["+j+"]="+splits[j]+" count=" +
            // count);
          } finally {
            reader.close();
          }
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
      }
    }
  }
Example #6
0
  public void doTestTextBatchAppend() throws Exception {
    LOG.debug("Starting...");

    final long rollCount = 10;
    final long batchSize = 2;
    final String fileName = "PageView";

    String newPath = testPath + "/singleTextBucket";
    int totalEvents = 0;
    int i = 1, j = 1;

    // clear the test directory
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(newPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);

    Context context = new Context();
    context.put("hdfs.path", newPath);
    context.put("hdfs.rollCount", String.valueOf(rollCount));
    context.put("hdfs.batchSize", String.valueOf(batchSize));
    context.put("hdfs.filePrefix", "pageview");

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);
    sink.setChannel(channel);
    sink.start();

    Calendar eventDate = Calendar.getInstance();
    Date currentDate = new Date();
    Map<String, String> header = new HashMap<String, String>();
    header.put("topic", "PageView");

    List<String> bodies = Lists.newArrayList();

    // 将测试的事件推入到通道中
    for (i = 1; i <= (rollCount * 10) / batchSize; i++) {
      Transaction txn = channel.getTransaction();
      txn.begin();
      for (j = 1; j <= batchSize; j++) {
        header.put("timestamp", String.valueOf(currentDate.getTime()));
        Event event = new SimpleEvent();
        eventDate.clear();
        eventDate.set(2014, i, i, i, 0);
        String body = "Test." + i + "." + j;

        event.setHeaders(header);
        event.setBody(body.getBytes());
        bodies.add(body);
        channel.put(event);
        totalEvents++;
      }
      txn.commit();
      txn.close();

      // execute sink to process the events
      sink.process();
    }
    sink.stop();

    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path fList[] = FileUtil.stat2Paths(dirStat);

    long expectedFiles = totalEvents / rollCount;
    if (totalEvents % rollCount > 0) {
      expectedFiles++;
    }

    Assert.assertEquals(
        "num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
    // 检查所有写入文件的内容
    verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
  }
Example #7
0
    static void checkRecords(
        Configuration defaults, int noMaps, int noReduces, Path sortInput, Path sortOutput)
        throws IOException {
      JobConf jobConf = new JobConf(defaults, RecordChecker.class);
      jobConf.setJobName("sortvalidate-record-checker");

      jobConf.setInputFormat(SequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);

      jobConf.setOutputKeyClass(BytesWritable.class);
      jobConf.setOutputValueClass(IntWritable.class);

      jobConf.setMapperClass(Map.class);
      jobConf.setReducerClass(Reduce.class);

      JobClient client = new JobClient(jobConf);
      ClusterStatus cluster = client.getClusterStatus();
      if (noMaps == -1) {
        noMaps = cluster.getTaskTrackers() * jobConf.getInt("test.sortvalidate.maps_per_host", 10);
      }
      if (noReduces == -1) {
        noReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sortvalidate.reduces_per_host");
        if (sortReduces != null) {
          noReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }
      }
      jobConf.setNumMapTasks(noMaps);
      jobConf.setNumReduceTasks(noReduces);

      FileInputFormat.setInputPaths(jobConf, sortInput);
      FileInputFormat.addInputPath(jobConf, sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordchecker");
      FileSystem fs = FileSystem.get(defaults);
      if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
      }
      FileOutputFormat.setOutputPath(jobConf, outputPath);

      // Uncomment to run locally in a single process
      // job_conf.set("mapred.job.tracker", "local");
      Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
      System.out.println(
          "\nSortValidator.RecordChecker: Running on "
              + cluster.getTaskTrackers()
              + " nodes to validate sort from "
              + inputPaths[0]
              + ", "
              + inputPaths[1]
              + " into "
              + FileOutputFormat.getOutputPath(jobConf)
              + " with "
              + noReduces
              + " reduces.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date end_time = new Date();
      System.out.println("Job ended: " + end_time);
      System.out.println(
          "The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    }
Example #8
0
    static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput)
        throws IOException {
      FileSystem inputfs = sortInput.getFileSystem(defaults);
      FileSystem outputfs = sortOutput.getFileSystem(defaults);
      FileSystem defaultfs = FileSystem.get(defaults);
      JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class);
      jobConf.setJobName("sortvalidate-recordstats-checker");

      int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length;
      jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks);
      int noSortInputpaths = inputfs.listStatus(sortInput).length;

      jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);

      jobConf.setOutputKeyClass(IntWritable.class);
      jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class);

      jobConf.setMapperClass(Map.class);
      jobConf.setCombinerClass(Reduce.class);
      jobConf.setReducerClass(Reduce.class);

      jobConf.setNumMapTasks(noSortReduceTasks);
      jobConf.setNumReduceTasks(1);

      FileInputFormat.setInputPaths(jobConf, sortInput);
      FileInputFormat.addInputPath(jobConf, sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker");
      if (defaultfs.exists(outputPath)) {
        defaultfs.delete(outputPath, true);
      }
      FileOutputFormat.setOutputPath(jobConf, outputPath);

      // Uncomment to run locally in a single process
      // job_conf.set("mapred.job.tracker", "local");
      Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
      System.out.println(
          "\nSortValidator.RecordStatsChecker: Validate sort "
              + "from "
              + inputPaths[0]
              + " ("
              + noSortInputpaths
              + " files), "
              + inputPaths[1]
              + " ("
              + noSortReduceTasks
              + " files) into "
              + FileOutputFormat.getOutputPath(jobConf)
              + " with 1 reducer.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date end_time = new Date();
      System.out.println("Job ended: " + end_time);
      System.out.println(
          "The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");

      // Check to ensure that the statistics of the
      // framework's sort-input and sort-output match
      SequenceFile.Reader stats =
          new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults);
      IntWritable k1 = new IntWritable();
      IntWritable k2 = new IntWritable();
      RecordStatsWritable v1 = new RecordStatsWritable();
      RecordStatsWritable v2 = new RecordStatsWritable();
      if (!stats.next(k1, v1)) {
        throw new IOException("Failed to read record #1 from reduce's output");
      }
      if (!stats.next(k2, v2)) {
        throw new IOException("Failed to read record #2 from reduce's output");
      }

      if ((v1.getBytes() != v2.getBytes())
          || (v1.getRecords() != v2.getRecords())
          || v1.getChecksum() != v2.getChecksum()) {
        throw new IOException(
            "("
                + v1.getBytes()
                + ", "
                + v1.getRecords()
                + ", "
                + v1.getChecksum()
                + ") v/s ("
                + v2.getBytes()
                + ", "
                + v2.getRecords()
                + ", "
                + v2.getChecksum()
                + ")");
      }
    }