Esempio n. 1
0
  /*
   * Fetch a file that is in a Hadoop file system. Return a local File.
   * Interruptible.
   */
  private File hdfsFetch(Path fromPath, Reporter reporter)
      throws IOException, InterruptedException {
    UUID uniqueId = UUID.randomUUID();
    File toFile = new File(tempDir, uniqueId.toString() + "/" + fromPath.getName());
    File toDir = new File(toFile.getParent());
    if (toDir.exists()) {
      FileUtils.deleteDirectory(toDir);
    }
    toDir.mkdirs();
    Path toPath = new Path(toFile.getCanonicalPath());

    FileSystem fS = fromPath.getFileSystem(hadoopConf);
    FileSystem tofS = FileSystem.getLocal(hadoopConf);

    Throttler throttler = new Throttler((double) bytesPerSecThrottle);
    try {
      for (FileStatus fStatus : fS.globStatus(fromPath)) {
        log.info("Copying " + fStatus.getPath() + " to " + toPath);
        long bytesSoFar = 0;

        FSDataInputStream iS = fS.open(fStatus.getPath());
        FSDataOutputStream oS = tofS.create(toPath);

        byte[] buffer = new byte[downloadBufferSize];

        int nRead;
        while ((nRead = iS.read(buffer, 0, buffer.length)) != -1) {
          // Needed to being able to be interrupted at any moment.
          if (Thread.interrupted()) {
            iS.close();
            oS.close();
            cleanDirNoExceptions(toDir);
            throw new InterruptedException();
          }
          bytesSoFar += nRead;
          oS.write(buffer, 0, nRead);
          throttler.incrementAndThrottle(nRead);
          if (bytesSoFar >= bytesToReportProgress) {
            reporter.progress(bytesSoFar);
            bytesSoFar = 0l;
          }
        }

        if (reporter != null) {
          reporter.progress(bytesSoFar);
        }

        oS.close();
        iS.close();
      }

      return toDir;
    } catch (ClosedByInterruptException e) {
      // This can be thrown by the method read.
      cleanDirNoExceptions(toDir);
      throw new InterruptedIOException();
    }
  }
    @Override
    protected int readChunk(long pos, byte[] buf, int offset, int len, byte[] checksum)
        throws IOException {

      boolean eof = false;
      if (needChecksum()) {
        assert checksum != null; // we have a checksum buffer
        assert checksum.length % CHECKSUM_SIZE == 0; // it is sane length
        assert len >= bytesPerSum; // we must read at least one chunk

        final int checksumsToRead =
            Math.min(
                len / bytesPerSum, // number of checksums based on len to read
                checksum.length / CHECKSUM_SIZE); // size of checksum buffer
        long checksumPos = getChecksumFilePos(pos);
        if (checksumPos != sums.getPos()) {
          sums.seek(checksumPos);
        }

        int sumLenRead = sums.read(checksum, 0, CHECKSUM_SIZE * checksumsToRead);
        if (sumLenRead >= 0 && sumLenRead % CHECKSUM_SIZE != 0) {
          throw new ChecksumException(
              "Checksum file not a length multiple of checksum size "
                  + "in "
                  + file
                  + " at "
                  + pos
                  + " checksumpos: "
                  + checksumPos
                  + " sumLenread: "
                  + sumLenRead,
              pos);
        }
        if (sumLenRead <= 0) { // we're at the end of the file
          eof = true;
        } else {
          // Adjust amount of data to read based on how many checksum chunks we read
          len = Math.min(len, bytesPerSum * (sumLenRead / CHECKSUM_SIZE));
        }
      }
      if (pos != datas.getPos()) {
        datas.seek(pos);
      }
      int nread = readFully(datas, buf, offset, len);
      if (eof && nread > 0) {
        throw new ChecksumException("Checksum error: " + file + " at " + pos, pos);
      }
      return nread;
    }
    public void map(Text key, Text value, Context context)
        throws InterruptedException, IOException {

      String filename = key.toString();
      String json = value.toString();

      // Make sure the input is valid
      if (!(filename.isEmpty() || json.isEmpty())) {

        // Change the json-type feature to Mat-type feature
        Mat descriptor = json2mat(json);
        if (descriptor != null) {
          // Read the query feature from the cache in Hadoop
          Mat query_features;
          String pathStr = context.getConfiguration().get("featureFilePath");
          FileSystem fs = FileSystem.get(context.getConfiguration());
          FSDataInputStream fsDataInputStream = fs.open(new Path(pathStr));
          StringBuilder sb = new StringBuilder();

          // Use a buffer to read the query_feature
          int remain = fsDataInputStream.available();
          while (remain > 0) {
            int read;
            byte[] buf = new byte[BUF_SIZE];
            read = fsDataInputStream.read(buf, fsDataInputStream.available() - remain, BUF_SIZE);
            sb.append(new String(buf, 0, read, StandardCharsets.UTF_8));
            remain = remain - read;
            System.out.println("remain:" + remain + "\tread:" + read + "\tsb.size:" + sb.length());
          }

          // Read the query_feature line by line
          //                    Scanner sc = new Scanner(fsDataInputStream, "UTF-8");
          //                    StringBuilder sb = new StringBuilder();
          //                    while (sc.hasNextLine()) {
          //                        sb.append(sc.nextLine());
          //                    }
          //                    String query_json = sb.toString();
          //                    String query_json = new String(buf, StandardCharsets.UTF_8);

          String query_json = sb.toString();
          fsDataInputStream.close();
          query_features = json2mat(query_json);

          // Get the similarity of the current database image against the query image
          DescriptorMatcher matcher = DescriptorMatcher.create(DescriptorMatcher.FLANNBASED);
          MatOfDMatch matches = new MatOfDMatch();

          // Ensure the two features have same length of cols (the feature extracted are all 128
          // cols(at least in this case))
          if (query_features.cols() == descriptor.cols()) {

            matcher.match(query_features, descriptor, matches);
            DMatch[] dMatches = matches.toArray();

            // Calculate the max/min distances
            //                    double max_dist = Double.MAX_VALUE;
            //                    double min_dist = Double.MIN_VALUE;
            double max_dist = 0;
            double min_dist = 100;
            for (int i = 0; i < dMatches.length; i++) {
              double dist = dMatches[i].distance;
              if (min_dist > dist) min_dist = dist;
              if (max_dist < dist) max_dist = dist;
            }
            // Only distances ≤ threshold are good matches
            double threshold = max_dist * THRESHOLD_FACTOR;
            //                    double threshold = min_dist * 2;
            LinkedList<DMatch> goodMatches = new LinkedList<DMatch>();

            for (int i = 0; i < dMatches.length; i++) {
              if (dMatches[i].distance <= threshold) {
                goodMatches.addLast(dMatches[i]);
              }
            }

            // Get the ratio of good_matches to all_matches
            double ratio = (double) goodMatches.size() / (double) dMatches.length;

            System.out.println("*** current_record_filename:" + filename + " ***");
            System.out.println("feature:" + descriptor + "\nquery_feature:" + query_features);
            System.out.println(
                "min_dist of keypoints:" + min_dist + "  max_dist of keypoints:" + max_dist);
            System.out.println(
                "total_matches:" + dMatches.length + "\tgood_matches:" + goodMatches.size());
            //                    System.out.println("type:" + descriptor.type() + " channels:" +
            // descriptor.channels() + " rows:" + descriptor.rows() + " cols:" + descriptor.cols());
            //                    System.out.println("qtype:" + query_features.type() + "
            // qchannels:" + query_features.channels() + " qrows:" + query_features.rows() + "
            // qcols:" + query_features.cols());
            System.out.println();

            if (ratio > PERCENTAGE_THRESHOLD) {
              // Key:1        Value:filename|ratio
              context.write(ONE, new Text(filename + "|" + ratio));
              //                        context.write(ONE, new Text(filename + "|" +
              // String.valueOf(goodMatches.size())));
            }
          } else {
            System.out.println("The size of the features are not equal");
          }
        } else {
          // a null pointer, do nothing
          System.out.println("A broken/null feature:" + filename);
          System.out.println();
        }
      }
    }
Esempio n. 4
0
    /**
     * Copy a file to a destination.
     *
     * @param srcstat src path and metadata
     * @param dstpath dst path
     * @param reporter
     */
    private void copy(
        FileStatus srcstat,
        Path relativedst,
        OutputCollector<WritableComparable<?>, Text> outc,
        Reporter reporter)
        throws IOException {
      Path absdst = new Path(destPath, relativedst);
      int totfiles = job.getInt(SRC_COUNT_LABEL, -1);
      assert totfiles >= 0 : "Invalid file count " + totfiles;

      // if a directory, ensure created even if empty
      if (srcstat.isDir()) {
        if (destFileSys.exists(absdst)) {
          if (!destFileSys.getFileStatus(absdst).isDir()) {
            throw new IOException("Failed to mkdirs: " + absdst + " is a file.");
          }
        } else if (!destFileSys.mkdirs(absdst)) {
          throw new IOException("Failed to mkdirs " + absdst);
        }
        // TODO: when modification times can be set, directories should be
        // emitted to reducers so they might be preserved. Also, mkdirs does
        // not currently return an error when the directory already exists;
        // if this changes, all directory work might as well be done in reduce
        return;
      }

      if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) {
        outc.collect(null, new Text("SKIP: " + srcstat.getPath()));
        ++skipcount;
        reporter.incrCounter(Counter.SKIP, 1);
        updateStatus(reporter);
        return;
      }

      Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst);
      long cbcopied = 0L;
      FSDataInputStream in = null;
      FSDataOutputStream out = null;
      try {
        // open src file
        try {
          in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath());
        } catch (IOException e) {
          LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return");
          in = null;
          return;
        }
        reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen());
        // open tmp file
        out = create(tmpfile, reporter, srcstat);
        // copy file
        for (int cbread; (cbread = in.read(buffer)) >= 0; ) {
          out.write(buffer, 0, cbread);
          cbcopied += cbread;
          reporter.setStatus(
              String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen())
                  + absdst
                  + " [ "
                  + StringUtils.humanReadableInt(cbcopied)
                  + " / "
                  + StringUtils.humanReadableInt(srcstat.getLen())
                  + " ]");
        }
      } finally {
        checkAndClose(in);
        checkAndClose(out);
      }

      if (cbcopied != srcstat.getLen()) {
        if (srcstat.getLen() == 0 && cbcopied > 0) {
          LOG.info("most likely see a WAL file corruption: " + srcstat.getPath());
        } else {
          throw new IOException(
              "File size not matched: copied "
                  + bytesString(cbcopied)
                  + " to tmpfile (="
                  + tmpfile
                  + ") but expected "
                  + bytesString(srcstat.getLen())
                  + " from "
                  + srcstat.getPath());
        }
      } else {
        if (totfiles == 1) {
          // Copying a single file; use dst path provided by user as destination
          // rather than destination directory, if a file
          Path dstparent = absdst.getParent();
          if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) {
            absdst = dstparent;
          }
        }
        if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) {
          throw new IOException(absdst + " is a directory");
        }
        if (!destFileSys.mkdirs(absdst.getParent())) {
          throw new IOException("Failed to craete parent dir: " + absdst.getParent());
        }
        rename(tmpfile, absdst);

        FileStatus dststat = destFileSys.getFileStatus(absdst);
        if (dststat.getLen() != srcstat.getLen()) {
          destFileSys.delete(absdst, false);
          throw new IOException(
              "File size not matched: copied "
                  + bytesString(dststat.getLen())
                  + " to dst (="
                  + absdst
                  + ") but expected "
                  + bytesString(srcstat.getLen())
                  + " from "
                  + srcstat.getPath());
        }
        updatePermissions(srcstat, dststat);
      }

      // report at least once for each file
      ++copycount;
      reporter.incrCounter(Counter.BYTESCOPIED, cbcopied);
      reporter.incrCounter(Counter.COPY, 1);
      updateStatus(reporter);
    }