private void executePostProcessing(DistCpOptions options) throws IOException {
    Path targetPath = options.getTargetPath();
    FileSystem fs = targetPath.getFileSystem(getConf());
    List<Path> inPaths = options.getSourcePaths();
    assert inPaths.size() == 1 : "Cannot handle more than one source path";

    Path sourcePath = inPaths.get(0);
    Path includePath = new Path(getConf().get("falcon.include.path"));
    assert includePath.toString().startsWith(sourcePath.toString())
        : "Include path " + includePath + " does not start with source path " + sourcePath;

    String relativePath = includePath.toString().substring(sourcePath.toString().length());
    String fixedPath = getFixedPath(relativePath);

    FileStatus[] files = fs.globStatus(new Path(targetPath.toString() + "/" + fixedPath));
    if (files != null) {
      for (FileStatus file : files) {
        fs.create(new Path(file.getPath(), EntityUtil.SUCCEEDED_FILE_NAME)).close();
        LOG.info("Created " + new Path(file.getPath(), EntityUtil.SUCCEEDED_FILE_NAME));
      }
    } else {
      LOG.info(
          "No files present in path: "
              + new Path(targetPath.toString() + "/" + fixedPath).toString());
    }
  }
  @Override
  protected int setJobInputData(Configuration config, Job job) throws InferenciaException {
    try {
      // Retrieve the files to be processed and add them as input data
      final FileSystem fs = FileSystem.get(new URI(InferenciaCte.hdfsUri), config);

      // Retrieve the data from the source path (data/*.bz2)
      FileStatus[] glob = fs.globStatus(new Path(getRutaFicheros()));

      // If we have data...
      if (null != glob) {
        if (glob.length > 0) {
          for (FileStatus fileStatus : glob) {
            Path pFich = fileStatus.getPath();
            MultipleInputs.addInputPath(job, pFich, SequenceFileInputFormat.class, LoadMap.class);
          }
        } else {
          return noDataFound();
        }
      }
    } catch (IOException e) {
      throw new InferenciaException(e, e.getMessage());
    } catch (URISyntaxException e) {
      throw new InferenciaException(e, e.getMessage());
    }
    return InferenciaCte.SUCCESS;
  }
Example #3
 /**
  * Moves the given source path to the specified destination path.
  *
  * @param source path to move (may contain a glob)
  * @param target destination path
  * @param fs Hadoop FileSystem
  */
 public static void move(String source, String target, FileSystem fs) throws Exception {
   Path srcPath = new Path(source);
   Path[] srcs = FileUtil.stat2Paths(fs.globStatus(srcPath), srcPath);
   Path dst = new Path(target);
   if (srcs.length > 1 && !fs.getFileStatus(dst).isDir()) {
     throw new FileSystemException(
         "When moving multiple files, destination should be a directory.");
   }
   for (int i = 0; i < srcs.length; i++) {
     if (!fs.rename(srcs[i], dst)) {
       FileStatus srcFstatus = null;
       FileStatus dstFstatus = null;
       try {
         srcFstatus = fs.getFileStatus(srcs[i]);
       } catch (FileNotFoundException e) {
         throw new FileNotFoundException(srcs[i] + ": No such file or directory");
       }
       try {
         dstFstatus = fs.getFileStatus(dst);
       } catch (IOException e) {
         // Nothing
       }
       if ((srcFstatus != null) && (dstFstatus != null)) {
         if (srcFstatus.isDir() && !dstFstatus.isDir()) {
           throw new FileSystemException(
               "cannot overwrite non directory " + dst + " with directory " + srcs[i]);
         }
       }
       throw new FileSystemException("Failed to rename " + srcs[i] + " to " + dst);
     }
   }
 }
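A minimal, hedged usage sketch for the move() helper above; the wrapping method, the default-FileSystem configuration, and the example paths are assumptions, not part of the original code:

  public static void moveExample() throws Exception {
    // Assumes fs.defaultFS already points at the target cluster.
    FileSystem fs = FileSystem.get(new Configuration());
    // When the glob matches more than one path, the destination must be an existing directory.
    move("/data/incoming/*.log", "/data/archive", fs);
  }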
  @Override
  public List<String> getContent(String path, int lineCount) throws IOException {
    FileStatus[] files = fileSystem.globStatus(new Path(path));
    ArrayList<String> lines = new ArrayList<String>();

    if (files != null) {
      for (FileStatus file : files) {

        if (lines.size() >= lineCount) {
          break;
        }

        if (!file.isDirectory()) {

          DataInputStream in = fileSystem.open(file.getPath());

          BufferedReader dataReader = new BufferedReader(new InputStreamReader(in));

          String line = dataReader.readLine();
          while (line != null && lines.size() < lineCount) {
            lines.add(line);
            line = dataReader.readLine();
          }

          dataReader.close();
          in.close();
        }
      }
    }
    return lines;
  }
Example #5
  private static boolean hasFile(
      dbutil db_util, FileSystem fs, String db_name, String file_id, String file_path)
      throws Exception {
    Get file_id_get = new Get(file_id.getBytes());
    Result file_result = db_util.doGet(db_name, file_id_get);

    KeyValue file_names = file_result.getColumnLatest("d".getBytes(), "filenames".getBytes());
    if (file_names == null) {
      return false;
    }
    String all_files = new String(file_names.getValue());
    String[] files = all_files.split("\n");
    for (String line : files) {
      if (line.equals(file_path)) {
        FileStatus[] matches = fs.globStatus(new Path(line + "*"));
        if (matches == null || matches.length == 0) {
          Put new_put = new Put(file_id.getBytes());
          new_put.add(
              "d".getBytes(),
              "filenames".getBytes(),
              all_files.replace(file_path + "\n", "").getBytes());
          db_util.doPut(db_name, new_put);
          return false;
        }
        return true;
      }
    }
    return false;
  }
Example #6
  public static boolean isHdfsDirEmpty(String hdfsFilePath) {

    try {
      Configuration conf = new Configuration();
      Path path = new Path(hdfsFilePath);

      FileSystem fs = FileSystem.get(URI.create(hdfsFilePath), conf);
      FileStatus[] status = fs.globStatus(path);

      if (status == null || status.length == 0) {
        throw new FileNotFoundException(
            "Cannot access " + hdfsFilePath + ": No such file or directory.");
      }
      // Only the first glob match is inspected; an empty directory reports a content length of 0.
      long totalSize = fs.getContentSummary(status[0].getPath()).getLength();
      return totalSize == 0;

    } catch (IOException e) {
      LOG.error("[isHdfsDirEmpty]", e);
      return false;
    }
  }
  private void writeIndexDescriptors(ETwinIndexDescriptor descriptor) throws IOException {
    Configuration conf = getConf();

    FileSystem fs = (new Path(IndexConfig.index.get()).getFileSystem(conf));

    FileStatus[] fileStats = fs.globStatus(new Path(IndexConfig.index.get(), "*"));

    // We write one indexDescriptor per generated index segment.
    // Something to consider: right now it's a straight-up serialized Thrift object.
    // Would it be better to do the LzoBase64Line thing, so that we can apply our tools?
    // or extend the tools?
    for (int i = 0; i < fileStats.length; i++) {
      descriptor.setIndexPart(i);
      FileStatus stat = fileStats[i];
      Path idxPath =
          new Path(stat.getPath().getParent(), "_" + stat.getPath().getName() + ".indexmeta");
      FSDataOutputStream os = fs.create(idxPath, true);
      @SuppressWarnings("unchecked")
      ThriftWritable<ETwinIndexDescriptor> writable =
          (ThriftWritable<ETwinIndexDescriptor>)
              ThriftWritable.newInstance(descriptor.getClass());
      writable.set(descriptor);
      writable.write(os);
      os.close();
    }
  }
Example #8
  /*
   * Fetch a file that is in a Hadoop file system. Return a local File.
   * Interruptible.
   */
  private File hdfsFetch(Path fromPath, Reporter reporter)
      throws IOException, InterruptedException {
    UUID uniqueId = UUID.randomUUID();
    File toFile = new File(tempDir, uniqueId.toString() + "/" + fromPath.getName());
    File toDir = new File(toFile.getParent());
    if (toDir.exists()) {
      FileUtils.deleteDirectory(toDir);
    }
    toDir.mkdirs();
    Path toPath = new Path(toFile.getCanonicalPath());

    FileSystem fS = fromPath.getFileSystem(hadoopConf);
    FileSystem tofS = FileSystem.getLocal(hadoopConf);

    Throttler throttler = new Throttler((double) bytesPerSecThrottle);
    try {
      for (FileStatus fStatus : fS.globStatus(fromPath)) {
        log.info("Copying " + fStatus.getPath() + " to " + toPath);
        long bytesSoFar = 0;

        FSDataInputStream iS = fS.open(fStatus.getPath());
        FSDataOutputStream oS = tofS.create(toPath);

        byte[] buffer = new byte[downloadBufferSize];

        int nRead;
        while ((nRead = iS.read(buffer, 0, buffer.length)) != -1) {
          // Needed to be able to be interrupted at any moment.
          if (Thread.interrupted()) {
            iS.close();
            oS.close();
            cleanDirNoExceptions(toDir);
            throw new InterruptedException();
          }
          bytesSoFar += nRead;
          oS.write(buffer, 0, nRead);
          throttler.incrementAndThrottle(nRead);
          if (bytesSoFar >= bytesToReportProgress) {
            if (reporter != null) {
              reporter.progress(bytesSoFar);
            }
            bytesSoFar = 0L;
          }
        }

        if (reporter != null) {
          reporter.progress(bytesSoFar);
        }

        oS.close();
        iS.close();
      }

      return toDir;
    } catch (ClosedByInterruptException e) {
      // This can be thrown by the method read.
      cleanDirNoExceptions(toDir);
      throw new InterruptedIOException();
    }
  }
Example #9
  private void handleSampling(DriverContext context, MapWork mWork, JobConf job, HiveConf conf)
      throws Exception {
    assert mWork.getAliasToWork().keySet().size() == 1;

    String alias = mWork.getAliases().get(0);
    Operator<?> topOp = mWork.getAliasToWork().get(alias);
    PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);

    ArrayList<String> paths = mWork.getPaths();
    ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();

    List<Path> inputPaths = new ArrayList<Path>(paths.size());
    for (String path : paths) {
      inputPaths.add(new Path(path));
    }

    Path tmpPath = context.getCtx().getExternalTmpPath(inputPaths.get(0));
    Path partitionFile = new Path(tmpPath, ".partitions");
    ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
    PartitionKeySampler sampler = new PartitionKeySampler();

    if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
      console.printInfo("Use sampling data created in previous MR");
      // Merge sampling data from the previous MR and build partition keys for the total-order sort
      for (Path path : inputPaths) {
        FileSystem fs = path.getFileSystem(job);
        for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
          sampler.addSampleFile(status.getPath(), job);
        }
      }
    } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
      console.printInfo("Creating sampling data..");
      assert topOp instanceof TableScanOperator;
      TableScanOperator ts = (TableScanOperator) topOp;

      FetchWork fetchWork;
      if (!partDesc.isPartitioned()) {
        assert paths.size() == 1;
        fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
      } else {
        fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
      }
      fetchWork.setSource(ts);

      // random sampling
      FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, conf, job, ts);
      try {
        ts.initialize(conf, new ObjectInspector[] {fetcher.getOutputObjectInspector()});
        OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
        while (fetcher.pushRow()) {}
      } finally {
        fetcher.clearFetchContext();
      }
    } else {
      throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
    }
    sampler.writePartitionKeys(partitionFile, conf, job);
  }
 public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path) throws IOException {
   FileStatus[] srcs = fs.globStatus(path);
   if ((srcs != null) && srcs.length == 1) {
     if (srcs[0].isDir()) {
       srcs = fs.listStatus(srcs[0].getPath());
     }
   }
   return (srcs);
 }
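A short, hedged sketch of calling matchFilesOrDir(); the path below is illustrative. When the pattern resolves to exactly one directory, the helper returns that directory's listing rather than the directory itself:

  public static void matchExample(FileSystem fs) throws IOException {
    // A pattern resolving to a single directory is expanded to its children;
    // any other result is returned exactly as globStatus() produced it.
    FileStatus[] srcs = matchFilesOrDir(fs, new Path("/warehouse/sales/2023-*"));
    if (srcs != null) {
      for (FileStatus src : srcs) {
        System.out.println(src.getPath());
      }
    }
  }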
Example #11
 public static boolean globDelete(final FileSystem fs, final String path, final boolean recursive)
     throws IOException {
   boolean deleted = false;
   for (final Path p : FileUtil.stat2Paths(fs.globStatus(new Path(path)))) {
     fs.delete(p, recursive);
     deleted = true;
   }
   return deleted;
 }
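A hedged usage sketch for globDelete(); the pattern and the recursive flag are illustrative only:

  public static void globDeleteExample(FileSystem fs) throws IOException {
    // Deletes every path matching the pattern; returns false only when the glob matched nothing.
    boolean removedAny = globDelete(fs, "/tmp/job-output/_temporary*", true);
    System.out.println("deleted something: " + removedAny);
  }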
Example #12
 private List<TestDataModel> collect(CacheStorage storage, Path contents) throws IOException {
   List<TestDataModel> results = new ArrayList<TestDataModel>();
   FileSystem fs = storage.getFileSystem();
   for (FileStatus status : fs.globStatus(contents)) {
     results.addAll(collectContent(fs, status));
   }
   Collections.sort(results);
   return results;
 }
Example #13
  public void testMapCount() throws Exception {
    String namenode = null;
    MiniDFSCluster dfs = null;
    MiniDFSCluster mr = null;
    try {
      Configuration conf = new Configuration();

      dfs = new MiniDFSCluster.Builder(conf).numDataNodes(3).format(true).build();

      FileSystem fs = dfs.getFileSystem();
      final FsShell shell = new FsShell(conf);
      namenode = fs.getUri().toString();
      MyFile[] files = createFiles(fs.getUri(), "/srcdat");
      long totsize = 0;
      for (MyFile f : files) {
        totsize += f.getSize();
      }

      Configuration job = new JobConf(conf);
      job.setLong("distcp.bytes.per.map", totsize / 3);
      ToolRunner.run(
          new DistCpV1(job),
          new String[] {
            "-m", "100", "-log", namenode + "/logs", namenode + "/srcdat", namenode + "/destdat"
          });
      assertTrue(
          "Source and destination directories do not match.", checkFiles(fs, "/destdat", files));

      String logdir = namenode + "/logs";
      System.out.println(execCmd(shell, "-lsr", logdir));
      FileStatus[] logs = fs.listStatus(new Path(logdir));
      // rare case where splits are exact, logs.length can be 4
      assertTrue(logs.length == 2);

      deldir(fs, "/destdat");
      deldir(fs, "/logs");
      ToolRunner.run(
          new DistCpV1(job),
          new String[] {
            "-m", "1", "-log", namenode + "/logs", namenode + "/srcdat", namenode + "/destdat"
          });

      System.out.println(execCmd(shell, "-lsr", logdir));
      logs = fs.globStatus(new Path(namenode + "/logs/part*"));
      assertTrue("Unexpected map count, logs.length=" + logs.length, logs.length == 1);
    } finally {
      if (dfs != null) {
        dfs.shutdown();
      }
      if (mr != null) {
        mr.shutdown();
      }
    }
  }
 public List<String> getInputPaths(PathInfo pi) throws IOException {
   List<String> list = new ArrayList<String>();
   FileStatus[] fileStatuses;
   for (String branch : branches) {
     fileStatuses = fs.globStatus(new Path(pi.getFullPath() + branch), pathFilter);
     if (fileStatuses == null) {
       continue;
     }
     for (FileStatus f : fileStatuses) {
       list.add(f.getPath().toUri().getPath());
     }
   }
   return list;
 }
Example #15
 private void verifyResult() throws IOException {
   FileStatus[] globStatus = fs.globStatus(new Path(OUTPUT + "/part-*"));
   int itemsRead = 0;
   for (FileStatus fts : globStatus) {
     BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(fts.getPath())));
     String line = null;
     while ((line = reader.readLine()) != null) {
       String[] split = line.split("\t");
       System.out.println(split[0] + " | " + split[1]);
       assertEquals(resultList[Integer.parseInt(split[0])], split[1]);
       itemsRead++;
      }
      reader.close();
    }
   assertEquals(resultList.length, itemsRead);
 }
Example #16
  public static List<Path> getAllFilePaths(final FileSystem fs, Path path, final PathFilter filter)
      throws IOException {
    if (null == path) path = fs.getHomeDirectory();
    if (path.toString().equals(FOWARD_SLASH)) path = new Path("");

    final List<Path> paths = new ArrayList<Path>();
    if (fs.isFile(path)) paths.add(path);
    else {
      for (final FileStatus status : fs.globStatus(new Path(path + FOWARD_ASTERISK), filter)) {
        final Path next = status.getPath();
        paths.addAll(getAllFilePaths(fs, next, filter));
      }
    }
    return paths;
  }
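A small, hedged sketch of calling getAllFilePaths() with a PathFilter that skips hidden and underscore-prefixed files; the filter and the starting path are assumptions:

  public static void listExample(FileSystem fs) throws IOException {
    // PathFilter has a single accept(Path) method, so a lambda works on Java 8+.
    PathFilter visibleOnly = p -> !p.getName().startsWith(".") && !p.getName().startsWith("_");
    for (Path p : getAllFilePaths(fs, new Path("/datasets/events"), visibleOnly)) {
      System.out.println(p);
    }
  }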
 /**
  * Creates a new instance.
  *
  * @param conf current configuration
  * @param definition data type
  * @param pathExpression the source path (can include wildcard)
  * @throws IOException if failed to create instance
  * @throws IllegalArgumentException if some parameters were {@code null}
  */
 @SuppressWarnings("unchecked")
 public TemporaryDataModelSource(
     Configuration conf, DataModelDefinition<?> definition, String pathExpression)
     throws IOException {
   this.conf = conf;
   this.definition = (DataModelDefinition<Object>) definition;
   this.object = definition.toObject(definition.newReflection().build());
   Path path = new Path(pathExpression);
   this.fs = path.getFileSystem(conf);
   FileStatus[] list = fs.globStatus(path);
   List<Path> paths = new ArrayList<>();
   for (int i = 0; i < list.length; i++) {
     paths.add(list[i].getPath());
   }
   this.rest = paths.iterator();
 }
Example #18
    /**
     * Processes the input file/folder argument. If the input is a file, then it is directly
     * considered for further processing by TraceBuilder. If the input is a folder, then all the
     * history logs in the input folder are considered for further processing.
     *
     * <p>If isRecursive is true, then the input path is recursively scanned for job history logs
     * for further processing by TraceBuilder.
     *
     * <p>NOTE: If the input represents a globbed path, then it is first flattened and then the
     * individual paths represented by the globbed input path are considered for further processing.
     *
     * @param input input path, possibly globbed
     * @param conf configuration
     * @param isRecursive whether to recursively traverse the input paths to find history logs
     * @return the input history log files' paths
     * @throws FileNotFoundException
     * @throws IOException
     */
    static List<Path> processInputArgument(String input, Configuration conf, boolean isRecursive)
        throws FileNotFoundException, IOException {
      Path inPath = new Path(input);
      FileSystem fs = inPath.getFileSystem(conf);
      FileStatus[] inStatuses = fs.globStatus(inPath);

      List<Path> inputPaths = new LinkedList<Path>();
      if (inStatuses == null || inStatuses.length == 0) {
        return inputPaths;
      }

      for (FileStatus inStatus : inStatuses) {
        Path thisPath = inStatus.getPath();
        if (inStatus.isDirectory()) {

          // Find list of files in this path(recursively if -recursive option
          // is specified).
          List<FileStatus> historyLogs = new ArrayList<FileStatus>();

          RemoteIterator<LocatedFileStatus> iter = fs.listFiles(thisPath, isRecursive);
          while (iter.hasNext()) {
            LocatedFileStatus child = iter.next();
            String fileName = child.getPath().getName();

            if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
              historyLogs.add(child);
            }
          }

          if (historyLogs.size() > 0) {
            // Add the sorted history log file names in this path to the
            // inputPaths list
            FileStatus[] sortableNames = historyLogs.toArray(new FileStatus[historyLogs.size()]);
            Arrays.sort(sortableNames, new HistoryLogsComparator());

            for (FileStatus historyLog : sortableNames) {
              inputPaths.add(historyLog.getPath());
            }
          }
        } else {
          inputPaths.add(thisPath);
        }
      }

      return inputPaths;
    }
  @Override
  public TupleEntryIterator openForRead(
      FlowProcess<? extends Configuration> flowProcess, RecordReader input) throws IOException {
    // Always read via the Hadoop FileSystem when in standalone/local mode, or when a RecordReader
    // is provided.
    if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) {
      LOG.info("delegating to parent");
      return super.openForRead(flowProcess, input);
    }

    Path[] cachedFiles = getLocalCacheFiles(flowProcess);

    if (cachedFiles == null || cachedFiles.length == 0) return super.openForRead(flowProcess, null);

    List<Path> paths = new ArrayList<>();
    List<Tap> taps = new ArrayList<>();

    if (isSimpleGlob()) {
      FileSystem fs = FileSystem.get(flowProcess.getConfig());
      FileStatus[] statuses = fs.globStatus(getHfs().getPath());

      for (FileStatus status : statuses) paths.add(status.getPath());
    } else {
      paths.add(getHfs().getPath());
    }

    for (Path pathToFind : paths) {
      for (Path path : cachedFiles) {
        if (path.toString().endsWith(pathToFind.getName())) {
          LOG.info("found {} in distributed cache", path);
          taps.add(new Lfs(getScheme(), path.toString()));
        }
      }
    }

    if (taps.isEmpty()) // nothing found in the distributed cache, read from HDFS
    {
      LOG.info(
          "could not find files in local resource path. delegating to parent: {}",
          super.getIdentifier());
      return super.openForRead(flowProcess, input);
    }

    return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input);
  }
Example #20
  /*
   * Fetch a file that is in a Hadoop file system. Return a local File.
   */
  private File hdfsFetch(Path fromPath, Reporter reporter) throws IOException {
    File toFile = new File(tempDir, fromPath.toUri().getPath());
    File toDir = new File(toFile.getParent());
    if (toDir.exists()) {
      FileUtils.deleteDirectory(toDir);
    }
    toDir.mkdirs();
    Path toPath = new Path(toFile.getCanonicalPath());

    FileSystem fS = fromPath.getFileSystem(hadoopConf);
    FileSystem tofS = FileSystem.getLocal(hadoopConf);

    Throttler throttler = new Throttler((double) bytesPerSecThrottle);

    for (FileStatus fStatus : fS.globStatus(fromPath)) {
      log.info("Copying " + fStatus.getPath() + " to " + toPath);
      long bytesSoFar = 0;

      FSDataInputStream iS = fS.open(fStatus.getPath());
      FSDataOutputStream oS = tofS.create(toPath);

      byte[] buffer = new byte[downloadBufferSize];

      int nRead;
      while ((nRead = iS.read(buffer, 0, buffer.length)) != -1) {
        bytesSoFar += nRead;
        oS.write(buffer, 0, nRead);
        throttler.incrementAndThrottle(nRead);
        if (bytesSoFar >= bytesToReportProgress) {
          if (reporter != null) {
            reporter.progress(bytesSoFar);
          }
          bytesSoFar = 0L;
        }
      }

      if (reporter != null) {
        reporter.progress(bytesSoFar);
      }

      oS.close();
      iS.close();
    }

    return toDir;
  }
  private void registerHfs(
      FlowProcess<? extends Configuration> process, Configuration conf, Hfs hfs)
      throws IOException {
    if (isSimpleGlob()) {
      FileSystem fs = FileSystem.get(conf);
      FileStatus[] statuses = fs.globStatus(getHfs().getPath());

      if (statuses == null || statuses.length == 0)
        throw new TapException(
            String.format(
                "glob expression %s does not match any files on the filesystem",
                getHfs().getPath()));

      for (FileStatus fileStatus : statuses) registerURI(conf, fileStatus.getPath());
    } else {
      registerURI(conf, hfs.getPath());
    }

    hfs.sourceConfInitComplete(process, conf);
  }
 public List<PathInfo> getPathInfo(
     String dcNumber, String service, String component, long startTime, long endTime)
     throws Exception {
   List<String> hours = getHoursForTimeRange(startTime, endTime);
   List<PathInfo> paths = new ArrayList<PathInfo>();
   String glob;
   FileStatus[] fileStatuses;
   for (String hour : hours) {
     glob = "/service/" + dcNumber + "/" + service + "/" + logdir + hour + "/" + component;
     fileStatuses = fs.globStatus(new Path(glob), pathFilter);
     if (fileStatuses == null) {
       continue;
     }
     for (FileStatus f : fileStatuses) {
       if (f.isDir()) {
         paths.add(new PathInfo(logdir, f.getPath().toUri().getPath()));
       }
     }
   }
   return paths;
 }
Example #23
  public DataInput getStream() {
    try {
      if (!initialized) {
        initialized = true;
        if ((resFile == null) && (resDir == null)) {
          return null;
        }

        if (resFile != null) {
          return resFile.getFileSystem(conf).open(resFile);
        }

        resFs = resDir.getFileSystem(conf);
        FileStatus status = resFs.getFileStatus(resDir);
        assert status.isDir();
        FileStatus[] resDirFS = resFs.globStatus(new Path(resDir + "/*"));
        resDirPaths = new Path[resDirFS.length];
        int pos = 0;
        for (FileStatus resFS : resDirFS) {
          if (!resFS.isDir()) {
            resDirPaths[pos++] = resFS.getPath();
          }
        }
        if (pos == 0) {
          return null;
        }

        return resFs.open(resDirPaths[resDirFilesNum++]);
      } else {
        return getNextStream();
      }
    } catch (FileNotFoundException e) {
      LOG.info("getStream error: " + StringUtils.stringifyException(e));
      return null;
    } catch (IOException e) {
      LOG.info("getStream error: " + StringUtils.stringifyException(e));
      return null;
    }
  }
  private List<FileStatus> singleThreadedListStatus(
      JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
      Path p = dirs[i];
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      FileStatus[] matches = fs.globStatus(p, inputFilter);
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        for (FileStatus globStat : matches) {
          if (globStat.isDirectory()) {
            RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
            while (iter.hasNext()) {
              LocatedFileStatus stat = iter.next();
              if (inputFilter.accept(stat.getPath())) {
                if (recursive && stat.isDirectory()) {
                  addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                } else {
                  result.add(stat);
                }
              }
            }
          } else {
            result.add(globStat);
          }
        }
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    return result;
  }
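The listing above relies on the distinction globStatus() draws between a null result (a non-glob path that does not exist) and an empty array (a pattern that matched nothing); a minimal check mirroring that logic, with an assumed input pattern:

  public static void globCheckExample(FileSystem fs) throws IOException {
    Path pattern = new Path("/input/part-*"); // illustrative pattern
    FileStatus[] matches = fs.globStatus(pattern);
    if (matches == null) {
      System.err.println("Input path does not exist: " + pattern);
    } else if (matches.length == 0) {
      System.err.println("Input Pattern " + pattern + " matches 0 files");
    } else {
      System.out.println("matched " + matches.length + " paths");
    }
  }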
Example #25
  public static void decompressPath(
      final FileSystem fs,
      final String in,
      final String out,
      final String compressedFileSuffix,
      final boolean deletePrevious)
      throws IOException {
    final Path inPath = new Path(in);

    if (fs.isFile(inPath)) HDFSTools.decompressFile(fs, in, out, deletePrevious);
    else {
      final Path outPath = new Path(out);
      if (!fs.exists(outPath)) fs.mkdirs(outPath);
      for (final Path path : FileUtil.stat2Paths(fs.globStatus(new Path(in + FOWARD_ASTERISK)))) {
        if (path.getName().endsWith(compressedFileSuffix))
          HDFSTools.decompressFile(
              fs,
              path.toString(),
              outPath.toString() + FOWARD_SLASH + path.getName().split("\\.")[0],
              deletePrevious);
      }
    }
  }
 public static ArrayList<String[]> parseRemainderFiles(String outputPath, FileSystem fs)
     throws IOException {
   ArrayList<String[]> remainderKeys = new ArrayList<String[]>();
   String[] keys;
   System.out.println("Handle remainder starts");
   FileStatus[] fss = fs.globStatus(new Path(outputPath + "/remainder-r-*"));
   String last = null;
   for (FileStatus fst : fss) {
     FSDataInputStream in = fs.open(fst.getPath());
     String line = in.readLine();
     while (line != null) {
       if (last == null) last = line;
       else {
         remainderKeys.add(new String[] {line, last});
         last = null;
       }
       line = in.readLine();
     }
     in.close();
   }
   if (last != null) remainderKeys.add(new String[] {last});
   System.out.println("Handle remainder ends");
   return remainderKeys;
 }
  public static ConfusionMatrix readResult(
      FileSystem fs, Path pathPattern, Configuration conf, Parameters params) throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();
    String defaultLabel = params.get("defaultCat");
    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>();

    for (FileStatus fileStatus : outputFiles) {
      Path path = fileStatus.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
      while (reader.next(key, value)) {
        String correctLabel = key.stringAt(1);
        String classifiedLabel = key.stringAt(2);
        Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel);
        if (rowMatrix == null) {
          rowMatrix = new HashMap<String, Integer>();
        }
        Integer count = Double.valueOf(value.get()).intValue();
        rowMatrix.put(classifiedLabel, count);
        confusionMatrix.put(correctLabel, rowMatrix);
      }
    }

    ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel);
    for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) {
      Map<String, Integer> rowMatrix = correctLabelSet.getValue();
      for (Map.Entry<String, Integer> classifiedLabelSet : rowMatrix.entrySet()) {
        matrix.addInstance(correctLabelSet.getKey(), classifiedLabelSet.getKey());
        matrix.putCount(
            correctLabelSet.getKey(), classifiedLabelSet.getKey(), classifiedLabelSet.getValue());
      }
    }
    return matrix;
  }
Example #28
  private void standAlone() throws IOException {
    String absSrcDir = fs.makeQualified(srcDir).toUri().getPath();
    String absOutDir = fs.makeQualified(outDir).toUri().getPath();

    Text bucket = new Text(absSrcDir + "-0");

    List<Text> files = new ArrayList<Text>();

    FileStatus[] contents = fs.listStatus(new Path(absSrcDir));

    for (FileStatus content : contents) {
      if (!content.isDir()) {
        if (ignoredFiles != null) {
          // Check for files to skip
          ignoredFiles.reset(content.getPath().toUri().getPath());
          if (ignoredFiles.matches()) {
            LOG.trace("Ignoring " + content.getPath().toString());
            continue;
          }
        }
        files.add(new Text(content.getPath().toUri().getPath()));
      }
    }

    /*
     * Is the directory empty?
     */
    if (files.isEmpty()) {
      return;
    }

    /*
     * We trick the reducer into doing some work for us by setting these configuration properties.
     */
    job.set("mapred.tip.id", "task_000000000000_00000_r_000000");
    job.set("mapred.task.id", "attempt_000000000000_0000_r_000000_0");

    job.set("mapred.output.dir", absOutDir);

    /*
     * File output committer needs this.
     */
    fs.mkdirs(new Path(absOutDir, "_temporary"));

    CrushReducer reducer = new CrushReducer();

    reducer.configure(job);
    reducer.reduce(bucket, files.iterator(), new NullOutputCollector<Text, Text>(), Reporter.NULL);
    reducer.close();

    /*
     * Use a glob here because the temporary and task attempt work dirs have funny names.
     * Include a * at the end to cover wildcards for compressed files.
     */
    Path crushOutput = new Path(absOutDir + "/*/*/crush" + absSrcDir + "/" + dest.getName() + "*");

    FileStatus[] statuses = fs.globStatus(crushOutput);

    if (statuses == null || 1 != statuses.length) {
      throw new AssertionError("Did not find the expected output in " + crushOutput.toString());
    }

    rename(statuses[0].getPath(), dest.getParent(), dest.getName());
  }
Example #29
  public int run(String[] args) throws Exception {
    // printUsage();
    /*
     * SETUP
     */
    Configuration argConf = getConf();
    Hashtable<String, String> confArg = new Hashtable<String, String>();
    setup(confArg, argConf);
    Date currentTime = new Date();
    Date endDate = new Date(new Long(confArg.get("timestamp_stop")));
    Boolean full_run = confArg.get("intermediate").matches("(?i).*true.*");
    Boolean quick_add = confArg.get("quick_add").matches("(?i).*true.*");
    logger.info("Running GeStore");

    // ZooKeeper setup
    Configuration config = HBaseConfiguration.create();
    zkWatcher = new ZooKeeperWatcher(config, "Testing", new HBaseAdmin(config));
    zkInstance =
        new ZooKeeper(
            ZKConfig.getZKQuorumServersString(config),
            config.getInt("zookeeper.session.timeout", -1),
            zkWatcher);

    if (!confArg.get("task_id").isEmpty()) {
      confArg.put("temp_path", confArg.get("temp_path") + confArg.get("task_id"));
    }

    String lockRequest = confArg.get("file_id");
    if (!confArg.get("run_id").isEmpty())
      lockRequest = lockRequest + "_" + confArg.get("run_id") + "_";
    if (!confArg.get("task_id").isEmpty())
      lockRequest = lockRequest + "_" + confArg.get("task_id") + "_";

    // Get type of movement
    toFrom type_move = checkArgs(confArg);
    if (type_move == toFrom.LOCAL2REMOTE && !confArg.get("format").equals("unknown")) {
      List<String> arguments = new ArrayList<String>();
      arguments.add("-Dinput=" + confArg.get("local_path"));
      arguments.add("-Dtable=" + confArg.get("file_id"));
      arguments.add("-Dtimestamp=" + confArg.get("timestamp_stop"));
      arguments.add("-Dtype=" + confArg.get("format"));
      arguments.add("-Dtarget_dir=" + confArg.get("base_path") + "_" + confArg.get("file_id"));
      arguments.add("-Dtemp_hdfs_path=" + confArg.get("temp_path"));
      arguments.add("-Drun_id=" + confArg.get("run_id"));
      if (!confArg.get("run_id").isEmpty()) arguments.add("-Drun_id=" + confArg.get("run_id"));
      if (!confArg.get("task_id").isEmpty()) arguments.add("-Dtask_id=" + confArg.get("task_id"));
      if (quick_add) arguments.add("-Dquick_add=" + confArg.get("quick_add"));
      String lockName = lock(lockRequest);
      String[] argumentString = arguments.toArray(new String[arguments.size()]);
      adddb.main(argumentString);
      unlock(lockName);
      System.exit(0);
    }

    // Database registration

    dbutil db_util = new dbutil(config);
    db_util.register_database(confArg.get("db_name_files"), true);
    db_util.register_database(confArg.get("db_name_runs"), true);
    db_util.register_database(confArg.get("db_name_updates"), true);
    FileSystem hdfs = FileSystem.get(config);
    FileSystem localFS = FileSystem.getLocal(config);

    // Get source type
    confArg.put("source", getSource(db_util, confArg.get("db_name_files"), confArg.get("file_id")));
    confArg.put(
        "database", isDatabase(db_util, confArg.get("db_name_files"), confArg.get("file_id")));
    if (!confArg.get("source").equals("local")
        && type_move == toFrom.REMOTE2LOCAL
        && !confArg.get("timestamp_stop").equals(Integer.toString(Integer.MAX_VALUE))) {
      confArg.put("timestamp_stop", Long.toString(latestVersion(confArg, db_util)));
    }

    /*
     * Get previous timestamp
     */
    Get run_id_get = new Get(confArg.get("run_id").getBytes());
    Result run_get = db_util.doGet(confArg.get("db_name_runs"), run_id_get);
    KeyValue run_file_prev =
        run_get.getColumnLatest(
            "d".getBytes(), (confArg.get("file_id") + "_db_timestamp").getBytes());
    String last_timestamp = new String("0");
    if (null != run_file_prev && !confArg.get("source").equals("local")) {
      long last_timestamp_real = run_file_prev.getTimestamp();
      Long current_timestamp = new Long(confArg.get("timestamp_real"));
      if ((current_timestamp - last_timestamp_real) > 36000) {
        last_timestamp = new String(run_file_prev.getValue());
        Integer lastTimestamp = new Integer(last_timestamp);
        lastTimestamp += 1;
        last_timestamp = lastTimestamp.toString();
        logger.info("Last timestamp: " + last_timestamp + " End data: " + endDate);
        Date last_run = new Date(run_file_prev.getTimestamp());
        if (last_run.before(endDate) && !full_run) {
          confArg.put("timestamp_start", last_timestamp);
        }
      }
    }

    Integer tse = new Integer(confArg.get("timestamp_stop"));
    Integer tss = new Integer(confArg.get("timestamp_start"));
    if (tss > tse) {
      logger.info("No new version of requested file.");
      return 0;
    }

    /*
     * Generate file
     */

    String lockName = lock(lockRequest);

    Get file_id_get = new Get(confArg.get("file_id").getBytes());
    Result file_get = db_util.doGet(confArg.get("db_name_files"), file_id_get);
    if (!file_get.isEmpty()) {
      boolean found =
          hasFile(
              db_util,
              hdfs,
              confArg.get("db_name_files"),
              confArg.get("file_id"),
              getFullPath(confArg));
      if (confArg.get("source").equals("fullfile")) {
        found = false;
      }
      String filenames_put =
          getFileNames(
              db_util, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg));
      // Filename not found in file database
      if (!found && type_move == toFrom.REMOTE2LOCAL) {
        if (!confArg.get("source").equals("local")) {
          // Generate intermediate file
          if (getFile(hdfs, confArg, db_util) == null) {
            unlock(lockName);
            return 1;
          }
          // Put generated file into file database
          if (!confArg.get("format").equals("fullfile")) {
            putFileEntry(
                db_util,
                hdfs,
                confArg.get("db_name_files"),
                confArg.get("file_id"),
                confArg.get("full_file_name"),
                confArg.get("source"));
          }
        } else {
          logger.warn("Remote file not found, and cannot be generated! File: " + confArg);
          unlock(lockName);
          return 1;
        }
      }
    } else {
      if (type_move == toFrom.REMOTE2LOCAL) {
        logger.warn("Remote file not found, and cannot be generated.");
        unlock(lockName);
        return 1;
      }
    }

    /*
     * Copy file
     * Update tables
     */

    if (type_move == toFrom.LOCAL2REMOTE) {
      if (!confArg.get("format").equals("fullfile")) {
        putFileEntry(
            db_util,
            hdfs,
            confArg.get("db_name_files"),
            confArg.get("file_id"),
            getFullPath(confArg),
            confArg.get("source"));
      }
      putRunEntry(
          db_util,
          confArg.get("db_name_runs"),
          confArg.get("run_id"),
          confArg.get("file_id"),
          confArg.get("type"),
          confArg.get("timestamp_real"),
          confArg.get("timestamp_stop"),
          getFullPath(confArg),
          confArg.get("delimiter"));
      hdfs.copyFromLocalFile(new Path(confArg.get("local_path")), new Path(getFullPath(confArg)));
    } else if (type_move == toFrom.REMOTE2LOCAL) {
      FileStatus[] files = hdfs.globStatus(new Path(getFullPath(confArg) + "*"));
      putRunEntry(
          db_util,
          confArg.get("db_name_runs"),
          confArg.get("run_id"),
          confArg.get("file_id"),
          confArg.get("type"),
          confArg.get("timestamp_real"),
          confArg.get("timestamp_stop"),
          getFullPath(confArg),
          confArg.get("delimiter"));
      unlock(lockName);
      for (FileStatus file : files) {
        Path cur_file = file.getPath();
        Path cur_local_path =
            new Path(new String(confArg.get("local_path") + confArg.get("file_id")));
        String suffix = getSuffix(getFileName(confArg), cur_file.getName());
        if (suffix.length() > 0) {
          cur_local_path = cur_local_path.suffix(new String("." + suffix));
        }
        if (confArg.get("copy").equals("true")) {
          String crc = hdfs.getFileChecksum(cur_file).toString();
          if (checksumLocalTest(cur_local_path, crc)) {
            continue;
          } else {
            hdfs.copyToLocalFile(cur_file, cur_local_path);
            writeChecksum(cur_local_path, crc);
          }
        } else {
          System.out.println(cur_local_path + "\t" + cur_file);
        }
      }
    }
    unlock(lockName);
    return 0;
  }
Example #30
  public void printClusters(String[] dictionary)
      throws IOException, InstantiationException, IllegalAccessException {
    Configuration conf = new Configuration();

    if (this.termDictionary != null) {
      if ("text".equals(dictionaryFormat)) {
        dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
      } else if ("sequencefile".equals(dictionaryFormat)) {
        FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf);
        dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary);
      } else {
        throw new IllegalArgumentException("Invalid dictionary format");
      }
    }

    Writer writer =
        this.outputFile == null
            ? new OutputStreamWriter(System.out)
            : new FileWriter(this.outputFile);
    try {
      FileSystem fs = seqFileDir.getFileSystem(conf);
      for (FileStatus seqFile : fs.globStatus(new Path(seqFileDir, "part-*"))) {
        Path path = seqFile.getPath();
        // System.out.println("Input Path: " + path); doesn't this interfere with output?
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
          Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
          Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
          while (reader.next(key, value)) {
            Cluster cluster = (Cluster) value;
            String fmtStr = useJSON ? cluster.asJsonString() : cluster.asFormatString(dictionary);
            if (subString > 0 && fmtStr.length() > subString) {
              writer.write(':');
              writer.write(fmtStr, 0, Math.min(subString, fmtStr.length()));
            } else {
              writer.write(fmtStr);
            }

            writer.write('\n');

            if (dictionary != null) {
              String topTerms = getTopFeatures(cluster.getCenter(), dictionary, numTopFeatures);
              writer.write("\tTop Terms: ");
              writer.write(topTerms);
              writer.write('\n');
            }

            List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId());
            if (points != null) {
              writer.write("\tWeight:  Point:\n\t");
              for (Iterator<WeightedVectorWritable> iterator = points.iterator();
                  iterator.hasNext(); ) {
                WeightedVectorWritable point = iterator.next();
                writer.write(String.valueOf(point.getWeight()));
                writer.write(": ");
                writer.write(AbstractCluster.formatVector(point.getVector(), dictionary));
                if (iterator.hasNext()) {
                  writer.write("\n\t");
                }
              }
              writer.write('\n');
            }
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      writer.close();
    }
  }