/* * Fetch a file that is in a Hadoop file system. Return a local File. * Interruptible. */ private File hdfsFetch(Path fromPath, Reporter reporter) throws IOException, InterruptedException { UUID uniqueId = UUID.randomUUID(); File toFile = new File(tempDir, uniqueId.toString() + "/" + fromPath.getName()); File toDir = new File(toFile.getParent()); if (toDir.exists()) { FileUtils.deleteDirectory(toDir); } toDir.mkdirs(); Path toPath = new Path(toFile.getCanonicalPath()); FileSystem fS = fromPath.getFileSystem(hadoopConf); FileSystem tofS = FileSystem.getLocal(hadoopConf); Throttler throttler = new Throttler((double) bytesPerSecThrottle); try { for (FileStatus fStatus : fS.globStatus(fromPath)) { log.info("Copying " + fStatus.getPath() + " to " + toPath); long bytesSoFar = 0; FSDataInputStream iS = fS.open(fStatus.getPath()); FSDataOutputStream oS = tofS.create(toPath); byte[] buffer = new byte[downloadBufferSize]; int nRead; while ((nRead = iS.read(buffer, 0, buffer.length)) != -1) { // Needed to being able to be interrupted at any moment. if (Thread.interrupted()) { iS.close(); oS.close(); cleanDirNoExceptions(toDir); throw new InterruptedException(); } bytesSoFar += nRead; oS.write(buffer, 0, nRead); throttler.incrementAndThrottle(nRead); if (bytesSoFar >= bytesToReportProgress) { reporter.progress(bytesSoFar); bytesSoFar = 0l; } } if (reporter != null) { reporter.progress(bytesSoFar); } oS.close(); iS.close(); } return toDir; } catch (ClosedByInterruptException e) { // This can be thrown by the method read. cleanDirNoExceptions(toDir); throw new InterruptedIOException(); } }
public void map( LongWritable key, Point value, OutputCollector<LongWritable, Point> output, Reporter reporter) throws IOException { double min = value.sumOfSquares(centers.get(0)); int best = 0; for (int index = 1; index < numberOfCenters; ++index) { double current = value.sumOfSquares(centers.get(index)); if (current < min) { min = current; best = index; } } reporter.incrCounter("NUMBER", "NODES", 1); reporter.incrCounter("CENTER", "" + best, 1); output.collect(new LongWritable(best), value); }
/** * Map method. Copies one file from source file system to destination. * * @param key src len * @param value FilePair (FileStatus src, Path dst) * @param out Log of failed copies * @param reporter */ public void map( LongWritable key, FilePair value, OutputCollector<WritableComparable<?>, Text> out, Reporter reporter) throws IOException { final FileStatus srcstat = value.input; final Path relativedst = new Path(value.output); try { copy(srcstat, relativedst, out, reporter); } catch (IOException e) { ++failcount; reporter.incrCounter(Counter.FAIL, 1); updateStatus(reporter); final String sfailure = "FAIL " + relativedst + " : " + StringUtils.stringifyException(e); out.collect(null, new Text(sfailure)); LOG.info(sfailure); try { for (int i = 0; i < 3; ++i) { try { final Path tmp = new Path(job.get(TMP_DIR_LABEL), relativedst); if (destFileSys.delete(tmp, true)) break; } catch (Throwable ex) { // ignore, we are just cleaning up LOG.debug("Ignoring cleanup exception", ex); } // update status, so we don't get timed out updateStatus(reporter); Thread.sleep(3 * 1000); } } catch (InterruptedException inte) { throw (IOException) new IOException().initCause(inte); } } finally { updateStatus(reporter); } }
/** * Copy a file to a destination. * * @param srcstat src path and metadata * @param dstpath dst path * @param reporter */ private void copy( FileStatus srcstat, Path relativedst, OutputCollector<WritableComparable<?>, Text> outc, Reporter reporter) throws IOException { Path absdst = new Path(destPath, relativedst); int totfiles = job.getInt(SRC_COUNT_LABEL, -1); assert totfiles >= 0 : "Invalid file count " + totfiles; // if a directory, ensure created even if empty if (srcstat.isDir()) { if (destFileSys.exists(absdst)) { if (!destFileSys.getFileStatus(absdst).isDir()) { throw new IOException("Failed to mkdirs: " + absdst + " is a file."); } } else if (!destFileSys.mkdirs(absdst)) { throw new IOException("Failed to mkdirs " + absdst); } // TODO: when modification times can be set, directories should be // emitted to reducers so they might be preserved. Also, mkdirs does // not currently return an error when the directory already exists; // if this changes, all directory work might as well be done in reduce return; } if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) { outc.collect(null, new Text("SKIP: " + srcstat.getPath())); ++skipcount; reporter.incrCounter(Counter.SKIP, 1); updateStatus(reporter); return; } Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst); long cbcopied = 0L; FSDataInputStream in = null; FSDataOutputStream out = null; try { // open src file try { in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath()); } catch (IOException e) { LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return"); in = null; return; } reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen()); // open tmp file out = create(tmpfile, reporter, srcstat); // copy file for (int cbread; (cbread = in.read(buffer)) >= 0; ) { out.write(buffer, 0, cbread); cbcopied += cbread; reporter.setStatus( String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen()) + absdst + " [ " + StringUtils.humanReadableInt(cbcopied) + " / " + StringUtils.humanReadableInt(srcstat.getLen()) + " ]"); } } finally { checkAndClose(in); checkAndClose(out); } if (cbcopied != srcstat.getLen()) { if (srcstat.getLen() == 0 && cbcopied > 0) { LOG.info("most likely see a WAL file corruption: " + srcstat.getPath()); } else { throw new IOException( "File size not matched: copied " + bytesString(cbcopied) + " to tmpfile (=" + tmpfile + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath()); } } else { if (totfiles == 1) { // Copying a single file; use dst path provided by user as destination // rather than destination directory, if a file Path dstparent = absdst.getParent(); if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) { absdst = dstparent; } } if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) { throw new IOException(absdst + " is a directory"); } if (!destFileSys.mkdirs(absdst.getParent())) { throw new IOException("Failed to craete parent dir: " + absdst.getParent()); } rename(tmpfile, absdst); FileStatus dststat = destFileSys.getFileStatus(absdst); if (dststat.getLen() != srcstat.getLen()) { destFileSys.delete(absdst, false); throw new IOException( "File size not matched: copied " + bytesString(dststat.getLen()) + " to dst (=" + absdst + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath()); } updatePermissions(srcstat, dststat); } // report at least once for each file ++copycount; reporter.incrCounter(Counter.BYTESCOPIED, cbcopied); reporter.incrCounter(Counter.COPY, 1); updateStatus(reporter); }
private void updateStatus(Reporter reporter) { reporter.setStatus(getCountString()); }
/** In case of interrupted, written file is not deleted. */ private void copyFile(File sourceFile, File destFile, Reporter reporter) throws IOException, InterruptedException { if (!destFile.exists()) { destFile.createNewFile(); } FileChannel source = null; FileChannel destination = null; Throttler throttler = new Throttler((double) bytesPerSecThrottle); FileInputStream iS = null; FileOutputStream oS = null; try { iS = new FileInputStream(sourceFile); oS = new FileOutputStream(destFile); source = iS.getChannel(); destination = oS.getChannel(); long bytesSoFar = 0; long reportingBytesSoFar = 0; long size = source.size(); int transferred = 0; while (bytesSoFar < size) { // Needed to being able to be interrupted at any moment. if (Thread.interrupted()) { throw new InterruptedException(); } // Casting to int here is safe since we will transfer at most "downloadBufferSize" bytes. // This is done on purpose for being able to implement Throttling. transferred = (int) destination.transferFrom(source, bytesSoFar, downloadBufferSize); bytesSoFar += transferred; reportingBytesSoFar += transferred; throttler.incrementAndThrottle(transferred); if (reportingBytesSoFar >= bytesToReportProgress) { reporter.progress(reportingBytesSoFar); reportingBytesSoFar = 0l; } } if (reporter != null) { reporter.progress(reportingBytesSoFar); } } catch (InterruptedException e) { e.printStackTrace(); } finally { if (iS != null) { iS.close(); } if (oS != null) { oS.close(); } if (source != null) { source.close(); } if (destination != null) { destination.close(); } } }
/* * Fetch a file that is in a S3 file system. Return a local File. It accepts "s3://" and "s3n://" prefixes. * Interruptible. */ private File s3Fetch(URI uri, Reporter reporter) throws IOException, InterruptedException { String bucketName = uri.getHost(); String path = uri.getPath(); UUID uniqueId = UUID.randomUUID(); File destFolder = new File(tempDir, uniqueId.toString() + "/" + path); if (destFolder.exists()) { FileUtils.deleteDirectory(destFolder); } destFolder.mkdirs(); Throttler throttler = new Throttler((double) bytesPerSecThrottle); boolean done = false; try { s3Service = new RestS3Service(getCredentials()); if (s3Service.checkBucketStatus(bucketName) != RestS3Service.BUCKET_STATUS__MY_BUCKET) { throw new IOException("Bucket doesn't exist or is already claimed: " + bucketName); } if (path.startsWith("/")) { path = path.substring(1, path.length()); } for (S3Object object : s3Service.listObjects(new S3Bucket(bucketName), path, "")) { long bytesSoFar = 0; String fileName = path; if (path.contains("/")) { fileName = path.substring(path.lastIndexOf("/") + 1, path.length()); } File fileDest = new File(destFolder, fileName); log.info("Downloading " + object.getKey() + " to " + fileDest + " ..."); if (fileDest.exists()) { fileDest.delete(); } object = s3Service.getObject(new S3Bucket(bucketName), object.getKey()); InputStream iS = object.getDataInputStream(); FileOutputStream writer = new FileOutputStream(fileDest); byte[] buffer = new byte[downloadBufferSize]; int nRead; while ((nRead = iS.read(buffer, 0, buffer.length)) != -1) { // Needed to being able to be interrupted at any moment. if (Thread.interrupted()) { iS.close(); writer.close(); cleanDirNoExceptions(destFolder); throw new InterruptedException(); } bytesSoFar += nRead; writer.write(buffer, 0, nRead); throttler.incrementAndThrottle(nRead); if (bytesSoFar >= bytesToReportProgress) { reporter.progress(bytesSoFar); bytesSoFar = 0l; } } if (reporter != null) { reporter.progress(bytesSoFar); } writer.close(); iS.close(); done = true; } if (!done) { throw new IOException("Bucket is empty! " + bucketName + " path: " + path); } } catch (S3ServiceException e) { throw new IOException(e); } return destFolder; }
public void map( WritableComparable<?> key, Text value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { String url = value.toString(); // value is line of text if (url != null && url.trim().startsWith("#")) { /* Ignore line that start with # */ return; } // if tabs : metadata that could be stored // must be name=value and separated by \t float customScore = -1f; int customInterval = interval; int fixedInterval = -1; Map<String, String> metadata = new TreeMap<String, String>(); if (url.indexOf("\t") != -1) { String[] splits = url.split("\t"); url = splits[0]; for (int s = 1; s < splits.length; s++) { // find separation between name and value int indexEquals = splits[s].indexOf("="); if (indexEquals == -1) { // skip anything without a = continue; } String metaname = splits[s].substring(0, indexEquals); String metavalue = splits[s].substring(indexEquals + 1); if (metaname.equals(nutchScoreMDName)) { try { customScore = Float.parseFloat(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFetchIntervalMDName)) { try { customInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFixedFetchIntervalMDName)) { try { fixedInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else metadata.put(metaname, metavalue); } } try { url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT); url = filters.filter(url); // filter the url } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); } url = null; } if (url == null) { reporter.getCounter("injector", "urls_filtered").increment(1); } else { // if it passes value.set(url); // collect it CrawlDatum datum = new CrawlDatum(); datum.setStatus(CrawlDatum.STATUS_INJECTED); // Is interval custom? Then set as meta data if (fixedInterval > -1) { // Set writable using float. Flaot is used by AdaptiveFetchSchedule datum .getMetaData() .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval)); datum.setFetchInterval(fixedInterval); } else { datum.setFetchInterval(customInterval); } datum.setFetchTime(curTime); // now add the metadata Iterator<String> keysIter = metadata.keySet().iterator(); while (keysIter.hasNext()) { String keymd = keysIter.next(); String valuemd = metadata.get(keymd); datum.getMetaData().put(new Text(keymd), new Text(valuemd)); } if (customScore != -1) datum.setScore(customScore); else datum.setScore(scoreInjected); try { scfilters.injectedScore(value, datum); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { LOG.warn( "Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")"); } } reporter.getCounter("injector", "urls_injected").increment(1); output.collect(value, datum); } }