@org.junit.Test
public void testMapper() throws Exception {
  final ArcFileReader reader = new ArcFileReader();

  // consumer thread: pulls decoded items from the reader and feeds them to the mapper
  Thread thread = new Thread(new Runnable() {
    public void run() {
      try {
        while (reader.hasMoreItems()) {
          ArcFileItem item = new ArcFileItem();
          reader.getNextItem(item);
          map(new Text(item.getUri()), item, null, null);
        }
        LOG.info("NO MORE ITEMS... BYE");
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
      }
    }
  });

  // run the thread ...
  thread.start();

  // producer side: stream the local ARC file into the reader in block-sized chunks
  File file = new File("/Users/rana/Downloads/1213886083018_0.arc.gz");
  ReadableByteChannel channel = Channels.newChannel(new FileInputStream(file));
  try {
    int totalBytesRead = 0;
    for (;;) {
      ByteBuffer buffer = ByteBuffer.allocate(ArcFileReader.DEFAULT_BLOCK_SIZE);
      int bytesRead = channel.read(buffer);
      LOG.info("Read " + bytesRead + " From File");
      if (bytesRead == -1) {
        reader.finished();
        break;
      } else {
        buffer.flip();
        totalBytesRead += buffer.remaining();
        reader.available(buffer);
      }
    }
  } finally {
    channel.close();
  }

  // now wait for thread to die ...
  LOG.info("Done Reading File.... Waiting for ArcFileThread to DIE");
  thread.join();
  LOG.info("Done Reading File.... ArcFileThread DIED");
}
public static void main(String[] args) {
  String accessKey = args[0];
  String secretKey = args[1];

  String[] paths = {
    // "2008/06",
    // "2008/07",
    // "2008/08",
    // "2008/09",
    // "2008/10",
    // "2008/11",
    "2009"
  };

  for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {
    LOG.info("Processing Path:" + paths[pathIndex]);

    JobConf job = new JobConf(S3GetMetdataJob.class);

    Path tempDir = new Path(
        job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
    System.out.println("Output Path is:" + tempDir);

    job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

    // setup s3 properties
    JetS3tARCSource.setMaxRetries(job, 1);
    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
    ARCSplitCalculator.setFilesPerSplit(job, 25);
    // set up arc reader properties
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
    // and set up input format ...
    job.setInputFormat(ARCInputFormat.class);
    // set mapper ...
    job.setMapRunnerClass(S3GetMetdataJob.class);
    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    FileOutputFormat.setOutputPath(job, tempDir);
    // map output types
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlURLMetadata.class);
    // reduce output types
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlURLMetadata.class);

    // double the number of reducers ...
    // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

    // run the job ...
    try {
      LOG.info("Starting Job:" + job.getJobName());
      JobClient.runJob(job);
      LOG.info("Finished Job:" + job.getJobName());

      Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
      LOG.info("Copying Job Output to:" + finalPath);
      FileSystem fs = FileSystem.get(job);

      try {
        fs.mkdirs(finalPath.getParent());
        fs.rename(tempDir, finalPath);
        LOG.info("Copied Job Output to:" + finalPath);
      } finally {
        // fs.close();
      }
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      e.printStackTrace();
    }
  }
}