コード例 #1
0
  /**
   * Streams a local ARC file into an {@link ArcFileReader} one buffer at a time while a background
   * thread pulls items out of the reader and passes each one to {@code map} (with a null collector
   * and a null reporter).
   *
   * <p>The input file is opened before the consumer thread is started, so a missing file fails the
   * test immediately instead of leaving a consumer thread behind. {@code reader.finished()} is
   * called on every exit path of the feeding loop so the consumer is not left waiting for data that
   * will never arrive (assumes {@code finished()} is what lets {@code hasMoreItems()} return false
   * - confirm against ArcFileReader).
   *
   * <p>NOTE(review): the input path is a hard-coded, machine-specific location.
   *
   * @throws Exception if the file cannot be opened or read, if the consumer thread hit an
   *     {@link IOException}, or if waiting for the consumer thread is interrupted
   */
  @org.junit.Test
  public void testMapper() throws Exception {

    final ArcFileReader reader = new ArcFileReader();

    // single-slot holder so the anonymous Runnable can hand a failure back to the test thread;
    // it is only read after thread.join(), which makes the write visible here
    final IOException[] consumerFailure = new IOException[1];

    Thread thread =
        new Thread(
            new Runnable() {

              public void run() {
                try {

                  while (reader.hasMoreItems()) {
                    ArcFileItem item = new ArcFileItem();

                    reader.getNextItem(item);

                    map(new Text(item.getUri()), item, null, null);
                  }
                  LOG.info("NO MORE ITEMS... BYE");
                } catch (IOException e) {
                  LOG.error(StringUtils.stringifyException(e));
                  // remember the failure so the test does not pass silently
                  consumerFailure[0] = e;
                }
              }
            });

    // open the input first: if this throws, no consumer thread has been started yet
    File file = new File("/Users/rana/Downloads/1213886083018_0.arc.gz");
    ReadableByteChannel channel = Channels.newChannel(new FileInputStream(file));

    try {
      // run the thread ...
      thread.start();

      for (; ; ) {

        // a fresh buffer per read: each filled buffer is handed over to the reader
        ByteBuffer buffer = ByteBuffer.allocate(ArcFileReader.DEFAULT_BLOCK_SIZE);

        int bytesRead = channel.read(buffer);
        LOG.info("Read " + bytesRead + " From File");

        if (bytesRead == -1) {
          // end of file; finished() is signalled in the finally block below
          break;
        }

        buffer.flip();
        reader.available(buffer);
      }
    } finally {
      try {
        // always tell the reader that no more data is coming, even when reading failed
        reader.finished();
      } finally {
        channel.close();
      }
    }

    // now wait for thread to die ...
    LOG.info("Done Reading File.... Waiting for ArcFileThread to DIE");
    thread.join();
    LOG.info("Done Reading File.... ArcFileThread to DIED");

    // surface any failure recorded by the consumer thread
    if (consumerFailure[0] != null) {
      throw consumerFailure[0];
    }
  }
コード例 #2
0
  /**
   * Command-line entry point: for each configured S3 input prefix, configures and runs one job
   * that reads ARC files from the {@code commoncrawl} bucket, then moves the job's temporary
   * output directory to {@code jobout/<prefix>/result}.
   *
   * <p>An {@link IOException} raised while running a job or moving its output is logged and the
   * loop continues with the next prefix.
   *
   * @param args {@code args[0]} is the AWS access key id, {@code args[1]} is the AWS secret access
   *     key
   * @throws IllegalArgumentException if fewer than two arguments are supplied
   */
  public static void main(String[] args) {

    if (args == null || args.length < 2) {
      throw new IllegalArgumentException(
          "Expected two arguments: <aws-access-key-id> <aws-secret-access-key>");
    }

    String accessKey = args[0];
    String secretKey = args[1];

    String[] paths = {
      // "2008/06",
      // "2008/07",
      // "2008/08",
      // "2008/09",
      // "2008/10",
      // "2008/11",
      "2009"
    };

    for (String path : paths) {

      LOG.info("Processing Path:" + path);

      JobConf job = new JobConf(S3GetMetdataJob.class);

      // timestamped temp directory so successive runs do not collide
      Path tempDir =
          new Path(
              job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

      LOG.info("Output for Path:" + path + " is:" + tempDir);
      System.out.println("Output Path is:" + tempDir);

      job.setJobName("S3 To CrawlURLMetadata Job for Path:" + path);

      // setup s3 properties
      JetS3tARCSource.setMaxRetries(job, 1);
      // set up S3 credentials ...
      JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
      JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
      ARCSplitCalculator.setFilesPerSplit(job, 25);
      // set up arc reader properties
      ArcFileReader.setIOTimeoutValue(30000);
      // set input prefixes ...
      JetS3tARCSource.setInputPrefixes(job, path);
      // and S3 bucket name ...
      JetS3tARCSource.setBucketName(job, "commoncrawl");
      // and setup arc source for ArcInputFormat
      ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
      // and set up input format ...
      job.setInputFormat(ARCInputFormat.class);
      // set mapper ...
      job.setMapRunnerClass(S3GetMetdataJob.class);
      // setup reducer (identity in this case ... )
      job.setReducerClass(IdentityReducer.class);
      // standard output format ...
      job.setOutputFormat(SequenceFileOutputFormat.class);
      // set output path
      FileOutputFormat.setOutputPath(job, tempDir);
      // map output types
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(CrawlURLMetadata.class);
      // reduce output types
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(CrawlURLMetadata.class);

      // run the job ...
      try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());

        Path finalPath = new Path("jobout/" + path + "/result");
        LOG.info("Copying Job Output to:" + finalPath);
        // the FileSystem handle is intentionally not closed here (the original left
        // fs.close() commented out)
        FileSystem fs = FileSystem.get(job);

        // mkdirs/rename report failure through their boolean result, so check it instead of
        // logging success unconditionally
        if (!fs.mkdirs(finalPath.getParent())) {
          throw new IOException("Failed to create directory:" + finalPath.getParent());
        }
        if (!fs.rename(tempDir, finalPath)) {
          throw new IOException("Failed to move Job Output from:" + tempDir + " to:" + finalPath);
        }
        LOG.info("Copied Job Output to:" + finalPath);

      } catch (IOException e) {
        // stringifyException already includes the stack trace
        LOG.error(StringUtils.stringifyException(e));
      }
    }
  }