Example #1
0
 /**
  * Delete directory with everything underneath. Note that in the case of *nix we use 'rm -fr
  * <dir>, because of the known problems with recursive deletes, and because 'rm -fr' is probably
  * faster.
  *
  * @param dir directory to delete.
  * @throws IOException on any problem with delete.
  */
 public static void deleteDirectory(File dir) throws IOException {
   if (OsUtil.isNix()) {
     OsUtil.runCommand("rm -fr " + dir.getPath());
   } else {
     FileUtils.deleteDirectory(dir);
   }
 }
  @Override
  @SuppressWarnings("unchecked")
  protected void cleanup(Reducer.Context context) throws IOException, InterruptedException {
    if (!Project.getProject().isMetadataCollectStandard()) {
      // write summary headers with all metadata, but for standard metadata don't write the last
      // line
      context.write(new Text("Hash"), new Text(columnMetadata.delimiterSeparatedHeaders()));
    }
    zipFileWriter.closeZip();

    if (Project.getProject().isLuceneIndexEnabled()) {
      mergeLuceneIndex();
    }

    Project project = Project.getProject();
    if (project.isEnvHadoop()) {
      String outputPath = Project.getProject().getProperty(ParameterProcessing.OUTPUT_DIR_HADOOP);
      String zipFileName = zipFileWriter.getZipFileName();
      if (project.isFsHdfs()) {
        String cmd =
            "hadoop fs -copyFromLocal "
                + zipFileName
                + " "
                + outputPath
                + File.separator
                + context.getTaskAttemptID()
                + ".zip";
        OsUtil.runCommand(cmd);
      } else if (project.isFsS3()) {
        S3Agent s3agent = new S3Agent();
        String run = project.getRun();
        if (!run.isEmpty()) {
          run = run + "/";
        }
        String s3key =
            project.getProjectCode()
                + File.separator
                + "output/"
                + run
                + "results/"
                + context.getTaskAttemptID()
                + ".zip";
        // Keep updating the hadoop progress
        int refreshInterval = 60000;
        Timer timer = new Timer(refreshInterval, this);
        timer.start();
        s3agent.putFileInS3(zipFileName, s3key);
        timer.stop();
      }
    }
    Stats.getInstance().setJobFinished();
  }
  private void mergeLuceneIndex() throws IOException {
    String luceneDir = Settings.getSettings().getLuceneIndexDir();
    String hdfsLuceneDir =
        "/" + luceneDir + File.separator + Project.getProject().getProjectCode() + File.separator;

    String localLuceneTempDir = luceneDir + File.separator + "tmp" + File.separator;
    File localLuceneTempDirFile = new File(localLuceneTempDir);

    if (localLuceneTempDirFile.exists()) {
      Util.deleteDirectory(localLuceneTempDirFile);
    }

    localLuceneTempDirFile.mkdir();

    // copy all zip lucene indexes, created by maps to local hd
    String cmd = "hadoop fs -copyToLocal " + hdfsLuceneDir + "* " + localLuceneTempDir;
    OsUtil.runCommand(cmd);

    // remove the map indexes as they are now copied to local
    String removeOldZips = "hadoop fs -rm " + hdfsLuceneDir + "*";
    OsUtil.runCommand(removeOldZips);

    logger.trace("Lucene index files collected to: {}", localLuceneTempDirFile.getAbsolutePath());

    String[] zipFilesArr = localLuceneTempDirFile.list();
    for (String indexZipFileStr : zipFilesArr) {
      String indexZipFileName = localLuceneTempDir + indexZipFileStr;
      String unzipToDir = localLuceneTempDir + indexZipFileStr.replace(".zip", "");

      ZipUtil.unzipFile(indexZipFileName, unzipToDir);
      File indexDir = new File(unzipToDir);

      FSDirectory fsDir = FSDirectory.open(indexDir);
      luceneIndex.addToIndex(fsDir);
    }
    // TODO check if we need to push the index to S3 or somewhere else
    luceneIndex.destroy();
  }
  protected void processMap(MapWritable value) throws IOException, InterruptedException {
    columnMetadata.reinit();
    ++outputFileCount;
    DocumentMetadata allMetadata = getAllMetadata(value);
    Metadata standardMetadata = getStandardMetadata(allMetadata, outputFileCount);
    columnMetadata.addMetadata(standardMetadata);
    columnMetadata.addMetadata(allMetadata);
    // documents other than the first one in this loop are either duplicates or attachments
    if (first) {
      masterOutputFileCount = outputFileCount;
    } else {
      if (allMetadata.hasParent()) {
        columnMetadata.addMetadataValue(
            DocumentMetadataKeys.ATTACHMENT_PARENT, UPIFormat.format(masterOutputFileCount));
      } else {
        columnMetadata.addMetadataValue(
            DocumentMetadataKeys.MASTER_DUPLICATE, UPIFormat.format(masterOutputFileCount));
      }
    }

    // String uniqueId = allMetadata.getUniqueId();

    String originalFileName =
        new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
    // add the text to the text folder
    String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
    String textEntryName =
        ParameterProcessing.TEXT
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + originalFileName
            + ".txt";
    if (textEntryName != null) {
      zipFileWriter.addTextFile(textEntryName, documentText);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_TEXT, textEntryName);
    // add the native file to the native folder
    String nativeEntryName =
        ParameterProcessing.NATIVE
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + originalFileName;
    BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE));
    if (bytesWritable != null) { // some large exception files are not passed
      zipFileWriter.addBinaryFile(
          nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
      logger.trace("Processing file: {}", nativeEntryName);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName);
    // add the pdf made from native to the PDF folder
    String pdfNativeEntryName =
        ParameterProcessing.PDF_FOLDER
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName()
            + ".pdf";
    BytesWritable pdfBytesWritable =
        (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF));
    if (pdfBytesWritable != null) {
      zipFileWriter.addBinaryFile(
          pdfNativeEntryName, pdfBytesWritable.getBytes(), pdfBytesWritable.getLength());
      logger.trace("Processing file: {}", pdfNativeEntryName);
    }

    processHtmlContent(value, allMetadata, UPIFormat.format(outputFileCount));

    // add exception to the exception folder
    String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION);
    if (exception != null) {
      String exceptionEntryName =
          "exception/"
              + UPIFormat.format(outputFileCount)
              + "_"
              + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
      if (bytesWritable != null) {
        zipFileWriter.addBinaryFile(
            exceptionEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
      }
      columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName);
    }
    // write this all to the reduce map
    // context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
    // drop the key altogether, because it messes up the format - but put it in the value
    // TODO use NullWritable
    if (OsUtil.isNix()) {
      context.write(null, new Text(columnMetadata.delimiterSeparatedValues()));
    }
    // prepare for the next file with the same key, if there is any
    first = false;
  }