/**
 * Deletes a directory with everything underneath it. Note that in the case of *nix we use
 * 'rm -fr <dir>', because of the known problems with recursive deletes in Java, and because
 * 'rm -fr' is probably faster.
 *
 * @param dir directory to delete.
 * @throws IOException on any problem with the delete.
 */
public static void deleteDirectory(File dir) throws IOException {
    if (OsUtil.isNix()) {
        OsUtil.runCommand("rm -fr " + dir.getPath());
    } else {
        FileUtils.deleteDirectory(dir);
    }
}
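// Illustrative sketch (not part of the original class): if OsUtil.runCommand splits its command
// string on whitespace, directory paths containing spaces would break. A hedged alternative using
// only the standard library passes the arguments as an array via ProcessBuilder. The method name
// deleteDirectoryWithRm is hypothetical.
private static void deleteDirectoryWithRm(File dir) throws IOException {
    ProcessBuilder pb = new ProcessBuilder("rm", "-fr", dir.getPath());
    pb.inheritIO();
    try {
        int exitCode = pb.start().waitFor();
        if (exitCode != 0) {
            throw new IOException("rm -fr exited with code " + exitCode + " for " + dir);
        }
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IOException("Interrupted while deleting " + dir, e);
    }
}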
@Override
@SuppressWarnings("unchecked")
protected void cleanup(Reducer.Context context) throws IOException, InterruptedException {
    if (!Project.getProject().isMetadataCollectStandard()) {
        // write summary headers with all metadata, but for standard metadata don't write the last line
        context.write(new Text("Hash"), new Text(columnMetadata.delimiterSeparatedHeaders()));
    }
    zipFileWriter.closeZip();

    if (Project.getProject().isLuceneIndexEnabled()) {
        mergeLuceneIndex();
    }

    Project project = Project.getProject();
    if (project.isEnvHadoop()) {
        String outputPath = project.getProperty(ParameterProcessing.OUTPUT_DIR_HADOOP);
        String zipFileName = zipFileWriter.getZipFileName();
        if (project.isFsHdfs()) {
            String cmd = "hadoop fs -copyFromLocal " + zipFileName + " "
                    + outputPath + File.separator + context.getTaskAttemptID() + ".zip";
            OsUtil.runCommand(cmd);
        } else if (project.isFsS3()) {
            S3Agent s3agent = new S3Agent();
            String run = project.getRun();
            if (!run.isEmpty()) {
                run = run + "/";
            }
            String s3key = project.getProjectCode() + File.separator
                    + "output/" + run
                    + "results/" + context.getTaskAttemptID() + ".zip";
            // keep updating the Hadoop progress while the upload runs
            int refreshInterval = 60000;
            Timer timer = new Timer(refreshInterval, this);
            timer.start();
            s3agent.putFileInS3(zipFileName, s3key);
            timer.stop();
        }
    }
    Stats.getInstance().setJobFinished();
}
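// Hedged sketch: new Timer(refreshInterval, this) matches the javax.swing.Timer(int, ActionListener)
// constructor, which implies this reducer implements ActionListener. A minimal callback that keeps
// the Hadoop task attempt from being timed out during the long S3 upload could look like this
// (the 'context' field and this exact callback body are assumptions, not taken from the source).
@Override
public void actionPerformed(java.awt.event.ActionEvent event) {
    // report liveness to the framework so the attempt is not killed while putFileInS3 blocks
    context.progress();
}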
private void mergeLuceneIndex() throws IOException {
    String luceneDir = Settings.getSettings().getLuceneIndexDir();
    String hdfsLuceneDir = "/" + luceneDir + File.separator
            + Project.getProject().getProjectCode() + File.separator;
    String localLuceneTempDir = luceneDir + File.separator + "tmp" + File.separator;
    File localLuceneTempDirFile = new File(localLuceneTempDir);
    if (localLuceneTempDirFile.exists()) {
        Util.deleteDirectory(localLuceneTempDirFile);
    }
    localLuceneTempDirFile.mkdir();

    // copy all zipped Lucene indexes created by the map tasks to the local disk
    String cmd = "hadoop fs -copyToLocal " + hdfsLuceneDir + "* " + localLuceneTempDir;
    OsUtil.runCommand(cmd);
    // remove the map indexes from HDFS, now that they have been copied locally
    String removeOldZips = "hadoop fs -rm " + hdfsLuceneDir + "*";
    OsUtil.runCommand(removeOldZips);
    logger.trace("Lucene index files collected to: {}", localLuceneTempDirFile.getAbsolutePath());

    String[] zipFilesArr = localLuceneTempDirFile.list();
    for (String indexZipFileStr : zipFilesArr) {
        String indexZipFileName = localLuceneTempDir + indexZipFileStr;
        String unzipToDir = localLuceneTempDir + indexZipFileStr.replace(".zip", "");
        ZipUtil.unzipFile(indexZipFileName, unzipToDir);
        File indexDir = new File(unzipToDir);
        FSDirectory fsDir = FSDirectory.open(indexDir);
        luceneIndex.addToIndex(fsDir);
    }
    // TODO check if we need to push the index to S3 or somewhere else
    luceneIndex.destroy();
}
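// Hedged sketch of the merge step: LuceneIndex is a project class, but addToIndex(FSDirectory)
// presumably delegates to Lucene's IndexWriter.addIndexes(Directory...). The 'indexWriter' field
// below is an assumption for illustration and is not taken from the original source.
public void addToIndex(FSDirectory mapIndexDir) throws IOException {
    // merge the per-map index segments into the reducer's single consolidated index
    indexWriter.addIndexes(mapIndexDir);
}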
protected void processMap(MapWritable value) throws IOException, InterruptedException {
    columnMetadata.reinit();
    ++outputFileCount;
    DocumentMetadata allMetadata = getAllMetadata(value);
    Metadata standardMetadata = getStandardMetadata(allMetadata, outputFileCount);
    columnMetadata.addMetadata(standardMetadata);
    columnMetadata.addMetadata(allMetadata);

    // documents other than the first one in this loop are either duplicates or attachments
    if (first) {
        masterOutputFileCount = outputFileCount;
    } else {
        if (allMetadata.hasParent()) {
            columnMetadata.addMetadataValue(
                    DocumentMetadataKeys.ATTACHMENT_PARENT, UPIFormat.format(masterOutputFileCount));
        } else {
            columnMetadata.addMetadataValue(
                    DocumentMetadataKeys.MASTER_DUPLICATE, UPIFormat.format(masterOutputFileCount));
        }
    }

    String originalFileName =
            new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();

    // add the text to the text folder
    String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
    String textEntryName = ParameterProcessing.TEXT + "/"
            + UPIFormat.format(outputFileCount) + "_" + originalFileName + ".txt";
    if (documentText != null) {
        zipFileWriter.addTextFile(textEntryName, documentText);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_TEXT, textEntryName);

    // add the native file to the native folder
    String nativeEntryName = ParameterProcessing.NATIVE + "/"
            + UPIFormat.format(outputFileCount) + "_" + originalFileName;
    BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE));
    if (bytesWritable != null) { // some large exception files are not passed
        zipFileWriter.addBinaryFile(
                nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
        logger.trace("Processing file: {}", nativeEntryName);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName);

    // add the PDF made from the native file to the PDF folder
    String pdfNativeEntryName = ParameterProcessing.PDF_FOLDER + "/"
            + UPIFormat.format(outputFileCount) + "_" + originalFileName + ".pdf";
    BytesWritable pdfBytesWritable =
            (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF));
    if (pdfBytesWritable != null) {
        zipFileWriter.addBinaryFile(
                pdfNativeEntryName, pdfBytesWritable.getBytes(), pdfBytesWritable.getLength());
        logger.trace("Processing file: {}", pdfNativeEntryName);
    }

    processHtmlContent(value, allMetadata, UPIFormat.format(outputFileCount));

    // add the exception, if any, to the exception folder
    String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION);
    if (exception != null) {
        String exceptionEntryName = "exception/"
                + UPIFormat.format(outputFileCount) + "_" + originalFileName;
        if (bytesWritable != null) {
            zipFileWriter.addBinaryFile(
                    exceptionEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
        }
        columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName);
    }

    // write this all to the reduce map
    // context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
    // drop the key altogether, because it messes up the format - but put it in the value
    // TODO use NullWritable
    if (OsUtil.isNix()) {
        context.write(null, new Text(columnMetadata.delimiterSeparatedValues()));
    }
    // prepare for the next file with the same key, if there is any
    first = false;
}
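// Hedged sketch for the "TODO use NullWritable" above: if the job's output key class (and the
// reducer's output key type parameter) were switched to NullWritable, the raw null key could be
// replaced with the NullWritable singleton. This is not how the original job is declared.
if (OsUtil.isNix()) {
    context.write(NullWritable.get(), new Text(columnMetadata.delimiterSeparatedValues()));
}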