Exemplo n.º 1
0
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    RunningJob mapJob = JobClient.runJob(sortJob);

    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "Injector: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }
Exemplo n.º 2
0
 public void configure(JobConf job) {
   this.jobConf = job;
   urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
   interval = jobConf.getInt("db.fetch.interval.default", 2592000);
   filters = new URLFilters(jobConf);
   scfilters = new ScoringFilters(jobConf);
   scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
   curTime = job.getLong("injector.current.time", System.currentTimeMillis());
 }
 public void bumpProgress() {
   numWritten++;
   if (numWritten % 25000 == 0) {
     long now = System.currentTimeMillis();
     long delta = now - lastCheckpoint;
     lastCheckpoint = now;
     LOG.info("Wrote last 25000 records in " + delta + " ms");
     localManager.progress();
   }
 }
Exemplo n.º 4
0
  private static void analyzeResult(FileSystem fs, int testType, long execTime, String resFileName)
      throws IOException {
    Path reduceFile;
    if (testType == TEST_TYPE_WRITE) reduceFile = new Path(WRITE_DIR, "part-00000");
    else reduceFile = new Path(READ_DIR, "part-00000");
    DataInputStream in;
    in = new DataInputStream(fs.open(reduceFile));

    BufferedReader lines;
    lines = new BufferedReader(new InputStreamReader(in));
    long tasks = 0;
    long size = 0;
    long time = 0;
    float rate = 0;
    float sqrate = 0;
    String line;
    while ((line = lines.readLine()) != null) {
      StringTokenizer tokens = new StringTokenizer(line, " \t\n\r\f%");
      String attr = tokens.nextToken();
      if (attr.endsWith(":tasks")) tasks = Long.parseLong(tokens.nextToken());
      else if (attr.endsWith(":size")) size = Long.parseLong(tokens.nextToken());
      else if (attr.endsWith(":time")) time = Long.parseLong(tokens.nextToken());
      else if (attr.endsWith(":rate")) rate = Float.parseFloat(tokens.nextToken());
      else if (attr.endsWith(":sqrate")) sqrate = Float.parseFloat(tokens.nextToken());
    }

    double med = rate / 1000 / tasks;
    double stdDev = Math.sqrt(Math.abs(sqrate / 1000 / tasks - med * med));
    String resultLines[] = {
      "----- DFSCIOTest ----- : "
          + ((testType == TEST_TYPE_WRITE)
              ? "write"
              : (testType == TEST_TYPE_READ) ? "read" : "unknown"),
      "           Date & time: " + new Date(System.currentTimeMillis()),
      "       Number of files: " + tasks,
      "Total MBytes processed: " + size / MEGA,
      "     Throughput mb/sec: " + size * 1000.0 / (time * MEGA),
      "Average IO rate mb/sec: " + med,
      " Std IO rate deviation: " + stdDev,
      "    Test exec time sec: " + (float) execTime / 1000,
      ""
    };

    PrintStream res = new PrintStream(new FileOutputStream(new File(resFileName), true));
    for (int i = 0; i < resultLines.length; i++) {
      LOG.info(resultLines[i]);
      res.println(resultLines[i]);
    }
  }
Exemplo n.º 5
0
  public int run(String[] args) throws Exception {
    if (args.length < 4) {
      System.out.println("ERROR: Please Enter args : input output type(text|seq) splitChar(9=\t)");
      return JobClient.SUCCESS;
    }
    String input = args[0];
    String output = args[1];
    String type = args[2];
    String splitChar = args[3];

    JobConf config = new JobConf(getConf(), getClass());
    config.set("user.split", splitChar);

    config.setJobName("File Filter -" + System.currentTimeMillis());
    config.setNumReduceTasks(10);
    config.setReducerClass(IdentityReducer.class);
    config.setMapperClass(FileTestMapper.class);
    if ("text".equals(type)) {
      config.setInputFormat(TextInputFormat.class);
      TextInputFormat.addInputPath(config, new Path(input));
    } else {
      config.setInputFormat(SequenceFileInputFormat.class);
      SequenceFileInputFormat.addInputPath(config, new Path(input));
    }
    config.setMapOutputKeyClass(Text.class);
    config.setMapOutputValueClass(Text.class);

    config.setOutputKeyClass(Text.class);
    config.setOutputValueClass(Text.class);

    // if output path exists then return
    FileSystem fs = FileSystem.get(config);
    Path outputPath = new Path(output);
    FileOutputFormat.setOutputPath(config, outputPath);

    if (!fs.exists(outputPath)) {
      JobClient.runJob(config);
    } else {
      System.out.println("You has finished this job today ! " + outputPath);
    }

    return JobClient.SUCCESS;
  }
  public int run(String[] args) throws Exception {
    if (args.length < 1) {
      args = new String[] {DateStringUtils.now()};
      System.out.println(
          "ERROR: Please Enter Date , eg. 20101010 ! now use default => " + DateStringUtils.now());
    }

    JobConf config = new JobConf(getConf(), getClass());
    config.set("user.args", Utils.asString(args));

    config.setJobName(getClass() + "-" + System.currentTimeMillis());
    config.setNumReduceTasks(100);
    config.setMapperClass(getClass());
    config.setReducerClass(getClass());
    config.setInputFormat(getInputFormat());
    config.setMapOutputKeyClass(Text.class);
    config.setMapOutputValueClass(Text.class);

    // add input paths
    for (String path : getInputPath(args)) {
      if (TextInputFormat.class.equals(getInputFormat())) {
        TextInputFormat.addInputPath(config, new Path(path));
      } else if (SequenceFileInputFormat.class.equals(getInputFormat())) {
        SequenceFileInputFormat.addInputPath(config, new Path(path));
      }
    }

    config.setOutputKeyClass(Text.class);
    config.setOutputValueClass(Text.class);

    // if output path exists then return
    FileSystem fs = FileSystem.get(config);
    Path outputPath = new Path(getOutputPath(args));
    FileOutputFormat.setOutputPath(config, outputPath);

    if (!fs.exists(outputPath)) {
      JobClient.runJob(config);
    } else {
      System.out.println("You has finished this job today ! " + outputPath);
    }

    return JobClient.SUCCESS;
  }
  public void testFsCache() throws Exception {
    {
      long now = System.currentTimeMillis();
      String[] users = new String[] {"foo", "bar"};
      final Configuration conf = new Configuration();
      FileSystem[] fs = new FileSystem[users.length];

      for (int i = 0; i < users.length; i++) {
        UserGroupInformation ugi = UserGroupInformation.createRemoteUser(users[i]);
        fs[i] =
            ugi.doAs(
                new PrivilegedExceptionAction<FileSystem>() {
                  public FileSystem run() throws IOException {
                    return FileSystem.get(conf);
                  }
                });
        for (int j = 0; j < i; j++) {
          assertFalse(fs[j] == fs[i]);
        }
      }
      FileSystem.closeAll();
    }

    {
      try {
        runTestCache(HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT);
      } catch (java.net.BindException be) {
        LOG.warn(
            "Cannot test HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT (="
                + HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT
                + ")",
            be);
      }

      runTestCache(0);
    }
  }
Exemplo n.º 8
0
  public static void main(String[] args) {
    int testType = TEST_TYPE_READ;
    int bufferSize = DEFAULT_BUFFER_SIZE;
    int fileSize = 1;
    int nrFiles = 1;
    String resFileName = DEFAULT_RES_FILE_NAME;
    boolean isSequential = false;

    String version = "DFSCIOTest.0.0.1";
    String usage =
        "Usage: DFSCIOTest -read | -write | -clean [-nrFiles N] [-fileSize MB] [-resFile resultFileName] [-bufferSize Bytes] ";

    System.out.println(version);
    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) { // parse command line
      if (args[i].startsWith("-r")) {
        testType = TEST_TYPE_READ;
      } else if (args[i].startsWith("-w")) {
        testType = TEST_TYPE_WRITE;
      } else if (args[i].startsWith("-clean")) {
        testType = TEST_TYPE_CLEANUP;
      } else if (args[i].startsWith("-seq")) {
        isSequential = true;
      } else if (args[i].equals("-nrFiles")) {
        nrFiles = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-fileSize")) {
        fileSize = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-bufferSize")) {
        bufferSize = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-resFile")) {
        resFileName = args[++i];
      }
    }

    LOG.info("nrFiles = " + nrFiles);
    LOG.info("fileSize (MB) = " + fileSize);
    LOG.info("bufferSize = " + bufferSize);

    try {
      fsConfig.setInt("test.io.file.buffer.size", bufferSize);
      FileSystem fs = FileSystem.get(fsConfig);

      if (testType != TEST_TYPE_CLEANUP) {
        fs.delete(HDFS_TEST_DIR, true);
        if (!fs.mkdirs(HDFS_TEST_DIR)) {
          throw new IOException("Mkdirs failed to create " + HDFS_TEST_DIR.toString());
        }

        // Copy the executables over to the remote filesystem
        String hadoopHome = System.getenv("HADOOP_PREFIX");
        fs.copyFromLocalFile(
            new Path(hadoopHome + "/libhdfs/libhdfs.so." + HDFS_LIB_VERSION), HDFS_SHLIB);
        fs.copyFromLocalFile(new Path(hadoopHome + "/libhdfs/hdfs_read"), HDFS_READ);
        fs.copyFromLocalFile(new Path(hadoopHome + "/libhdfs/hdfs_write"), HDFS_WRITE);
      }

      if (isSequential) {
        long tStart = System.currentTimeMillis();
        sequentialTest(fs, testType, fileSize, nrFiles);
        long execTime = System.currentTimeMillis() - tStart;
        String resultLine = "Seq Test exec time sec: " + (float) execTime / 1000;
        LOG.info(resultLine);
        return;
      }
      if (testType == TEST_TYPE_CLEANUP) {
        cleanup(fs);
        return;
      }
      createControlFile(fs, fileSize, nrFiles);
      long tStart = System.currentTimeMillis();
      if (testType == TEST_TYPE_WRITE) writeTest(fs);
      if (testType == TEST_TYPE_READ) readTest(fs);
      long execTime = System.currentTimeMillis() - tStart;

      analyzeResult(fs, testType, execTime, resFileName);
    } catch (Exception e) {
      System.err.print(e.getLocalizedMessage());
      System.exit(-1);
    }
  }
  public class ElephantRecordWriter
      implements RecordWriter<IntWritable, ElephantRecordWritable>, Closeable {

    FileSystem fileSystem;
    Args args;
    Map<Integer, Persistence> lps = new HashMap<Integer, Persistence>();
    Progressable progressable;
    LocalElephantManager localManager;

    int numWritten = 0;
    long lastCheckpoint = System.currentTimeMillis();

    public ElephantRecordWriter(Configuration conf, Args args, Progressable progressable)
        throws IOException {
      fileSystem = Utils.getFS(args.outputDirHdfs, conf);
      this.args = args;

      this.progressable = progressable;
      localManager =
          new LocalElephantManager(fileSystem, args.spec, LocalElephantManager.getTmpDirs(conf));
    }

    private Persistence retrieveShard(int shardIdx) throws IOException {
      Persistence lp = null;

      if (lps.containsKey(shardIdx)) {
        lp = lps.get(shardIdx);
      } else {
        String localShard = localManager.downloadRemoteShard("" + shardIdx, null);

        Coordinator fact = args.spec.getCoordinator();
        lp = fact.openPersistenceForAppend(localShard, args.spec.getPersistenceOptions());

        lps.put(shardIdx, lp);
        progress();
      }
      return lp;
    }

    public void write(IntWritable shard, ElephantRecordWritable carrier) throws IOException {
      Persistence lp = retrieveShard(shard.get());

      NewKeyValDocument doc = new NewKeyValDocument(carrier.key, carrier.value);

      lp.index(doc);

      bumpProgress();
    }

    public void bumpProgress() {
      numWritten++;
      if (numWritten % 25000 == 0) {
        long now = System.currentTimeMillis();
        long delta = now - lastCheckpoint;
        lastCheckpoint = now;
        LOG.info("Wrote last 25000 records in " + delta + " ms");
        localManager.progress();
      }
    }

    public void close() throws IOException {
      close(null);
    }

    public void close(Reporter reporter) throws IOException {
      for (Integer shard : lps.keySet()) {
        String lpDir = localManager.localTmpDir("" + shard);
        LOG.info("Closing LP for shard " + shard + " at " + lpDir);
        lps.get(shard).close();
        LOG.info("Closed LP for shard " + shard + " at " + lpDir);
        progress();
        String remoteDir = args.outputDirHdfs + "/" + shard;

        // Do all this stuff to ensure that S3 actually does delete
        int deleteAttempt = 4;
        while (fileSystem.exists(new Path(remoteDir)) && deleteAttempt > 0) {
          LOG.info("Deleting existing shard " + shard + " at " + remoteDir);
          fileSystem.delete(new Path(remoteDir), true);
          --deleteAttempt;
        }
        if (fileSystem.exists(new Path(remoteDir)) && deleteAttempt == 0) {
          throw new IOException(
              "Failed to delete shard "
                  + shard
                  + " at "
                  + remoteDir
                  + " after "
                  + deleteAttempt
                  + " attempts!");
        } else {
          LOG.info("Deleted existing shard " + shard + " at " + remoteDir);
        }
        LOG.info("Copying " + lpDir + " to " + remoteDir);
        fileSystem.copyFromLocalFile(new Path(lpDir), new Path(remoteDir));
        LOG.info("Copied " + lpDir + " to " + remoteDir);
        progress();
      }
      localManager.cleanup();
    }

    private void progress() {
      if (progressable != null) progressable.progress();
    }
  }