Example #1
0
 public static Path[] getInputPaths(String rootPath) {
   try {
     Configuration conf = HBaseConfiguration.create();
     Path root = new Path(rootPath);
     ArrayList<Path> paths = new ArrayList<Path>();
     FileSystem fs = root.getFileSystem(conf);
     LinkedList<Path> list = new LinkedList<Path>();
     list.push(root);
     if (!fs.exists(root)) {
       System.out.println("path not exists: " + root.toString());
       return new Path[0];
     }
     while (!list.isEmpty()) {
       Path path = list.pop();
       if (fs.isFile(path)) {
         if (path.getName().matches("^.*part-r-\\d{5}.*$")) {
           paths.add(path);
           System.out.println("something is wrong with path" + path.toString());
         }
       } else {
         FileStatus[] statuses = fs.listStatus(path);
         for (FileStatus status : statuses) {
           if (status.isDir()) {
             list.add(status.getPath());
           } else if (status.getPath().getName().matches("^.*part-r-\\d{5}.*$")) {
             paths.add(status.getPath());
           }
         }
       }
     }
     return paths.toArray(new Path[paths.size()]);
   } catch (IOException ignored) {
     return new Path[0];
   }
 }
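A minimal usage sketch (hypothetical caller, assuming the Hadoop 2.x org.apache.hadoop.mapreduce API) showing how the returned array of part-r-* paths might be wired into a job's input:

  // Hypothetical caller; the job name and root path are made up for illustration.
  Configuration conf = HBaseConfiguration.create();
  Job job = Job.getInstance(conf, "process-part-files");
  Path[] inputs = getInputPaths("/data/output-of-previous-job");
  if (inputs.length == 0) {
    System.err.println("no matching part-r-* files found");
  } else {
    FileInputFormat.setInputPaths(job, inputs);
  }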
Example #2
0
  /**
   * Return a list of all urls matching this input. If autocomplete is false, the list contains only
   * 1 element (same as getUrl()). Otherwise, it will try to return all the files beginning with
   * what is returned by getUrl().
   *
   * @param jobConf A Configuration object
   * @return the list of input URLs
   */
  public HashSet<URI> getAllUrls(Configuration jobConf) {

    HashSet<URI> urls = new HashSet<URI>();

    if (!isAutoComplete()) {
      urls.add(url);
    } else {
      Path basePath = new Path(url);
      String filePrefix = basePath.getName();

      try {
        FileSystem fs = basePath.getFileSystem(jobConf);

        if (!fs.exists(basePath.getParent())) {
          throw new IOException("Input directory not found: " + url);
        }

        FileStatus[] stats = fs.listStatus(basePath.getParent());

        for (int i = 0; i < stats.length; i++) {
          Path path = stats[i].getPath();
          if (fs.isFile(path) && path.getName().startsWith(filePrefix)) urls.add(path.toUri());
        }
      } catch (IOException e) {
        System.err.println("Unable to autocomplete input file");
        e.printStackTrace();
        System.exit(1);
      }
    }

    return urls;
  }
Example #3
0
 /**
  * Checks whether the given path is a file.
  *
  * @param fs FileSystem
  * @param path the Path to check
  * @return <tt>true</tt> if the path is a file
  */
 public static boolean isFile(FileSystem fs, String path) {
   try {
     return fs.isFile(new Path(path));
   } catch (Exception ex) {
     throw new FileSystemException(ExceptionUtils.getMessage("Cannot access '{}'", path), ex);
   }
 }
Example #4
0
  private Path computeSourceRootPath(FileStatus sourceStatus, DistCpOptions options)
      throws IOException {

    Path target = options.getTargetPath();
    FileSystem targetFS = target.getFileSystem(getConf());

    boolean solitaryFile = options.getSourcePaths().size() == 1 && !sourceStatus.isDir();

    if (solitaryFile) {
      if (targetFS.isFile(target) || !targetFS.exists(target)) {
        return sourceStatus.getPath();
      } else {
        return sourceStatus.getPath().getParent();
      }
    } else {
      boolean specialHandling =
          (options.getSourcePaths().size() == 1 && !targetFS.exists(target))
              || options.shouldSyncFolder()
              || options.shouldOverwrite();

      return specialHandling && sourceStatus.isDir()
          ? sourceStatus.getPath()
          : sourceStatus.getPath().getParent();
    }
  }
Example #5
0
  /**
   * Check if the user+group is authorized to use the specified application.
   *
   * <p>The check is done by checking the file system permissions on the workflow application.
   *
   * @param user user name.
   * @param group group name.
   * @param appPath application path.
   * @param fileName workflow or coordinator.xml
   * @param conf job configuration.
   * @throws AuthorizationException thrown if the user is not authorized for the app.
   */
  public void authorizeForApp(
      String user, String group, String appPath, String fileName, Configuration conf)
      throws AuthorizationException {
    try {
      HadoopAccessorService has = Services.get().get(HadoopAccessorService.class);
      URI uri = new Path(appPath).toUri();
      Configuration fsConf = has.createJobConf(uri.getAuthority());
      FileSystem fs = has.createFileSystem(user, uri, fsConf);

      Path path = new Path(appPath);
      try {
        if (!fs.exists(path)) {
          incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
          throw new AuthorizationException(ErrorCode.E0504, appPath);
        }
        // Only check further for job definition files for non-proxy submission jobs
        if (conf.get(XOozieClient.IS_PROXY_SUBMISSION) == null) {
          if (!fs.isFile(path)) {
            Path appXml = new Path(path, fileName);
            if (!fs.exists(appXml)) {
              incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
              throw new AuthorizationException(ErrorCode.E0505, appPath);
            }
            if (!fs.isFile(appXml)) {
              incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
              throw new AuthorizationException(ErrorCode.E0506, appPath);
            }
            fs.open(appXml).close();
          }
        }
      }
      // TODO change this when stopping support of 0.18 to the new
      // Exception
      catch (org.apache.hadoop.fs.permission.AccessControlException ex) {
        incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
        throw new AuthorizationException(ErrorCode.E0507, appPath, ex.getMessage(), ex);
      }
    } catch (IOException ex) {
      incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
      throw new AuthorizationException(ErrorCode.E0501, ex.getMessage(), ex);
    } catch (HadoopAccessorException e) {
      throw new AuthorizationException(e);
    }
  }
Example #6
0
 public static void fileTreeRecursion(URI uri, Configuration conf, FileSystem fs)
     throws IOException {
   Path current = new Path(uri);
   if (fs.isFile(current)) {
     visit(current, fs);
   } else {
     FileStatus[] status = fs.listStatus(current);
     Path[] paths = FileUtil.stat2Paths(status);
     for (Path p : paths) {
       fileTreeRecursion(p.toUri(), conf, fs);
     }
   }
 }
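The recursion above delegates per-file work to a visit(Path, FileSystem) callback that is not shown in this example. A minimal sketch of such a callback (hypothetical, simply logging each file and its length) could look like:

  private static void visit(Path file, FileSystem fs) throws IOException {
    // Hypothetical visitor: the real implementation is not part of this example.
    FileStatus status = fs.getFileStatus(file);
    System.out.println(file + " (" + status.getLen() + " bytes)");
  }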
Example #7
0
  @Override
  protected void validatePaths(DistCpOptions options) throws IOException, InvalidInputException {

    Path targetPath = options.getTargetPath();
    FileSystem targetFS = targetPath.getFileSystem(getConf());
    boolean targetIsFile = targetFS.isFile(targetPath);

    // If target is a file, then source has to be single file
    if (targetIsFile) {
      if (options.getSourcePaths().size() > 1) {
        throw new InvalidInputException("Multiple source being copied to a file: " + targetPath);
      }

      Path srcPath = options.getSourcePaths().get(0);
      FileSystem sourceFS = srcPath.getFileSystem(getConf());
      if (!sourceFS.isFile(srcPath)) {
        throw new InvalidInputException(
            "Cannot copy " + srcPath + ", which is not a file to " + targetPath);
      }
    }

    for (Path path : options.getSourcePaths()) {
      FileSystem fs = path.getFileSystem(getConf());
      if (!fs.exists(path)) {
        throw new InvalidInputException(path + " doesn't exist");
      }
    }

    /* This is required to allow map tasks to access each of the source
       clusters. It retrieves the delegation token for each unique
       file system and adds them to the job's private credential store.
    */
    Credentials credentials = getCredentials();
    if (credentials != null) {
      Path[] inputPaths = options.getSourcePaths().toArray(new Path[1]);
      TokenCache.obtainTokensForNamenodes(credentials, inputPaths, getConf());
    }
  }
Example #8
0
  public static List<Path> getAllFilePaths(final FileSystem fs, Path path, final PathFilter filter)
      throws IOException {
    if (null == path) path = fs.getHomeDirectory();
    if (path.toString().equals(FOWARD_SLASH)) path = new Path("");

    final List<Path> paths = new ArrayList<Path>();
    if (fs.isFile(path)) paths.add(path);
    else {
      for (final FileStatus status : fs.globStatus(new Path(path + FOWARD_ASTERISK), filter)) {
        final Path next = status.getPath();
        paths.addAll(getAllFilePaths(fs, next, filter));
      }
    }
    return paths;
  }
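A hypothetical usage sketch: PathFilter has a single accept(Path) method, so an anonymous class (or a lambda on Java 8+) can be passed to skip bookkeeping files such as _SUCCESS while collecting every regular file under a directory (the directory name is made up for illustration):

  PathFilter noHidden = new PathFilter() {
    @Override
    public boolean accept(Path p) {
      return !p.getName().startsWith("_") && !p.getName().startsWith(".");
    }
  };
  FileSystem fs = FileSystem.get(new Configuration());
  List<Path> files = getAllFilePaths(fs, new Path("/data"), noHidden);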
Example #9
0
  /**
   * Merge default configuration with user-defined configuration.
   *
   * @throws CommandException thrown if failed to read or merge configurations
   */
  protected void mergeDefaultConfig() throws CommandException {
    Path configDefault = null;
    try {
      String coordAppPathStr = conf.get(OozieClient.COORDINATOR_APP_PATH);
      Path coordAppPath = new Path(coordAppPathStr);
      String user = ParamChecker.notEmpty(conf.get(OozieClient.USER_NAME), OozieClient.USER_NAME);
      String group =
          ParamChecker.notEmpty(conf.get(OozieClient.GROUP_NAME), OozieClient.GROUP_NAME);
      FileSystem fs =
          Services.get()
              .get(HadoopAccessorService.class)
              .createFileSystem(user, group, coordAppPath.toUri(), new Configuration());

      // app path could be a directory
      if (!fs.isFile(coordAppPath)) {
        configDefault = new Path(coordAppPath, CONFIG_DEFAULT);
      } else {
        configDefault = new Path(coordAppPath.getParent(), CONFIG_DEFAULT);
      }

      if (fs.exists(configDefault)) {
        Configuration defaultConf = new XConfiguration(fs.open(configDefault));
        PropertiesUtils.checkDisallowedProperties(defaultConf, DISALLOWED_DEFAULT_PROPERTIES);
        XConfiguration.injectDefaults(defaultConf, conf);
      } else {
        LOG.info("configDefault Doesn't exist " + configDefault);
      }
      PropertiesUtils.checkDisallowedProperties(conf, DISALLOWED_USER_PROPERTIES);

      // Resolve all variables in the job properties.
      // This ensures the Hadoop Configuration semantics are preserved.
      XConfiguration resolvedVarsConf = new XConfiguration();
      for (Map.Entry<String, String> entry : conf) {
        resolvedVarsConf.set(entry.getKey(), conf.get(entry.getKey()));
      }
      conf = resolvedVarsConf;
    } catch (IOException e) {
      throw new CommandException(
          ErrorCode.E0702,
          e.getMessage() + " : Problem reading default config " + configDefault,
          e);
    } catch (HadoopAccessorException e) {
      throw new CommandException(e);
    }
    LOG.debug("Merged CONF :" + XmlUtils.prettyPrint(conf).toString());
  }
Example #10
0
  public static boolean syncLocalFileToOneHDFS(
      final String inPathName,
      final PathFilter pathFilter,
      final String outPathName,
      final String outFileName,
      Compression.Algorithm alg)
      throws IOException {
    Configuration conf = new Configuration();
    Path pathout = new Path(outPathName);
    Path pathin = new Path(inPathName);
    FileSystem filesystemIn = pathin.getFileSystem(conf);
    FileSystem filesystemOut = pathout.getFileSystem(conf);
    FileStatus fsIn = filesystemIn.getFileStatus(pathin);
    Path fileOut = new Path(pathout, outFileName);
    if (filesystemOut.exists(fileOut) && !filesystemOut.isFile(fileOut)) {
      System.err.println("Not Support Operation : copy <dir> to <File>!");
      return false;
    }

    // create necessary dir
    filesystemOut.mkdirs(pathout);

    List<Path> todonames;
    if (fsIn.isDir()) {
      todonames = new LinkedList<Path>();
      // check src filename list
      FileStatus[] fsts = filesystemIn.listStatus(pathin, pathFilter);
      for (FileStatus sts : fsts) {
        if (sts.isDir() || sts.getLen() == 0) {
          continue;
        }
        Path fp = sts.getPath();
        todonames.add(fp);
      }
    } else {
      // copy <file> to <dir> AND <file> to <file>
      todonames = new ArrayList<Path>(1);
      todonames.add(pathin);
    }
    copyLocalFilesToOneHDFS(
        filesystemIn, todonames, filesystemOut, pathout, outFileName, alg, conf);

    return true;
  }
Example #11
0
  /**
   * Read coordinator definition.
   *
   * @param appPath application path.
   * @return coordinator definition.
   * @throws CoordinatorJobException thrown if the definition could not be read.
   */
  protected String readDefinition(String appPath) throws CoordinatorJobException {
    String user = ParamChecker.notEmpty(conf.get(OozieClient.USER_NAME), OozieClient.USER_NAME);
    String group = ParamChecker.notEmpty(conf.get(OozieClient.GROUP_NAME), OozieClient.GROUP_NAME);
    // Configuration confHadoop = CoordUtils.getHadoopConf(conf);
    try {
      URI uri = new URI(appPath);
      LOG.debug("user ="******" group =" + group);
      FileSystem fs =
          Services.get()
              .get(HadoopAccessorService.class)
              .createFileSystem(user, group, uri, new Configuration());
      Path appDefPath = null;

      // app path could be a directory
      Path path = new Path(uri.getPath());
      // check file exists for dataset include file, app xml already checked
      if (!fs.exists(path)) {
        throw new URISyntaxException(path.toString(), "path does not exist: " + path.toString());
      }
      if (!fs.isFile(path)) {
        appDefPath = new Path(path, COORDINATOR_XML_FILE);
      } else {
        appDefPath = path;
      }

      Reader reader = new InputStreamReader(fs.open(appDefPath));
      StringWriter writer = new StringWriter();
      IOUtils.copyCharStream(reader, writer);
      return writer.toString();
    } catch (IOException ex) {
      LOG.warn("IOException :" + XmlUtils.prettyPrint(conf), ex);
      throw new CoordinatorJobException(ErrorCode.E1001, ex.getMessage(), ex);
    } catch (URISyntaxException ex) {
      LOG.warn("URISyException :" + ex.getMessage());
      throw new CoordinatorJobException(ErrorCode.E1002, appPath, ex.getMessage(), ex);
    } catch (HadoopAccessorException ex) {
      throw new CoordinatorJobException(ex);
    } catch (Exception ex) {
      LOG.warn("Exception :", ex);
      throw new CoordinatorJobException(ErrorCode.E1001, ex.getMessage(), ex);
    }
  }
Example #12
0
 private static Matrix loadVectors(String vectorPathString, Configuration conf)
     throws IOException {
   Path vectorPath = new Path(vectorPathString);
   FileSystem fs = vectorPath.getFileSystem(conf);
   List<Path> subPaths = Lists.newArrayList();
   if (fs.isFile(vectorPath)) {
     subPaths.add(vectorPath);
   } else {
     for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
       subPaths.add(fileStatus.getPath());
     }
   }
   List<Pair<Integer, Vector>> rowList = Lists.newArrayList();
   int numRows = Integer.MIN_VALUE;
   int numCols = -1;
   boolean sequentialAccess = false;
   for (Path subPath : subPaths) {
     for (Pair<IntWritable, VectorWritable> record :
         new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
       int id = record.getFirst().get();
       Vector vector = record.getSecond().get();
       if (vector instanceof NamedVector) {
         vector = ((NamedVector) vector).getDelegate();
       }
       if (numCols < 0) {
         numCols = vector.size();
         sequentialAccess = vector.isSequentialAccess();
       }
       rowList.add(Pair.of(id, vector));
       numRows = Math.max(numRows, id);
     }
   }
   numRows++;
   Vector[] rowVectors = new Vector[numRows];
   for (Pair<Integer, Vector> pair : rowList) {
     rowVectors[pair.getFirst()] = pair.getSecond();
   }
   return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
 }
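loadVectors() above expects SequenceFiles of <IntWritable, VectorWritable> pairs (row id mapped to row vector). A minimal sketch of writing such a file with the same era of SequenceFile and Mahout APIs (the path and values are made up for illustration):

  Configuration conf = new Configuration();
  Path out = new Path("/tmp/vectors/part-00000");
  FileSystem fs = out.getFileSystem(conf);
  SequenceFile.Writer writer =
      SequenceFile.createWriter(fs, conf, out, IntWritable.class, VectorWritable.class);
  try {
    // one row: id 0 -> dense vector [1.0, 0.0, 2.5]
    writer.append(new IntWritable(0), new VectorWritable(new DenseVector(new double[] {1.0, 0.0, 2.5})));
  } finally {
    writer.close();
  }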
Example #13
0
  /**
   * Check if the user+group is authorized to use the specified application.
   *
   * <p>The check is done by checking the file system permissions on the workflow application.
   *
   * @param user user name.
   * @param group group name.
   * @param appPath application path.
   * @param jobConf job configuration.
   * @throws AuthorizationException thrown if the user is not authorized for the app.
   */
  public void authorizeForApp(String user, String group, String appPath, Configuration jobConf)
      throws AuthorizationException {
    try {
      HadoopAccessorService has = Services.get().get(HadoopAccessorService.class);
      URI uri = new Path(appPath).toUri();
      Configuration fsConf = has.createJobConf(uri.getAuthority());
      FileSystem fs = has.createFileSystem(user, uri, fsConf);

      Path path = new Path(appPath);
      try {
        if (!fs.exists(path)) {
          incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
          throw new AuthorizationException(ErrorCode.E0504, appPath);
        }
        Path wfXml = new Path(path, "workflow.xml");
        if (!fs.exists(wfXml)) {
          incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
          throw new AuthorizationException(ErrorCode.E0505, appPath);
        }
        if (!fs.isFile(wfXml)) {
          incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
          throw new AuthorizationException(ErrorCode.E0506, appPath);
        }
        fs.open(wfXml).close();
      }
      // TODO change this when stopping support of 0.18 to the new
      // Exception
      catch (org.apache.hadoop.fs.permission.AccessControlException ex) {
        incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
        throw new AuthorizationException(ErrorCode.E0507, appPath, ex.getMessage(), ex);
      }
    } catch (IOException ex) {
      incrCounter(INSTR_FAILED_AUTH_COUNTER, 1);
      throw new AuthorizationException(ErrorCode.E0501, ex.getMessage(), ex);
    } catch (HadoopAccessorException e) {
      throw new AuthorizationException(e);
    }
  }
Example #14
0
 private void distributeFiles() {
   try {
     URI[] uris = DistributedCache.getCacheFiles(conf);
     if (uris != null) {
       URI[] outURIs = new URI[uris.length];
       for (int i = 0; i < uris.length; i++) {
         Path path = new Path(uris[i]);
         FileSystem fs = path.getFileSystem(conf);
         if (fs.isFile(path)) {
           outURIs[i] = uris[i];
         } else {
           Path mergePath = new Path(path.getParent(), "sparkreadable-" + path.getName());
           FileUtil.copyMerge(fs, path, fs, mergePath, false, conf, "");
           outURIs[i] = mergePath.toUri();
         }
         sparkContext.addFile(outURIs[i].toString());
       }
       DistributedCache.setCacheFiles(outURIs, conf);
     }
   } catch (IOException e) {
     throw new RuntimeException("Error retrieving cache files", e);
   }
 }
Example #15
0
 public static void recursePath(Configuration conf, Path path, Job job) {
   try {
     FileSystem fs = path.getFileSystem(conf);
     FileStatus[] fstats = fs.listStatus(path);
     if (fstats != null) {
       for (FileStatus f : fstats) {
          Path p = f.getPath();
         if (fs.isFile(p)) {
           // connection times out otherwise
           System.err.println("file:" + p.toString());
           FileInputFormat.addInputPath(job, p);
         } else {
           System.err.println("dir:" + p.toString());
           recursePath(conf, p, job);
         }
       }
     }
   } catch (IOException e) {
     // shouldn't be here
     throw new RuntimeException(e);
   }
 }
Example #16
0
  public static void decompressPath(
      final FileSystem fs,
      final String in,
      final String out,
      final String compressedFileSuffix,
      final boolean deletePrevious)
      throws IOException {
    final Path inPath = new Path(in);

    if (fs.isFile(inPath)) HDFSTools.decompressFile(fs, in, out, deletePrevious);
    else {
      final Path outPath = new Path(out);
      if (!fs.exists(outPath)) fs.mkdirs(outPath);
      for (final Path path : FileUtil.stat2Paths(fs.globStatus(new Path(in + FOWARD_ASTERISK)))) {
        if (path.getName().endsWith(compressedFileSuffix))
          HDFSTools.decompressFile(
              fs,
              path.toString(),
              outPath.toString() + FOWARD_SLASH + path.getName().split("\\.")[0],
              deletePrevious);
      }
    }
  }
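A hypothetical call, assuming this method lives in the same HDFSTools class whose decompressFile() it delegates to; the paths and suffix are made up for illustration:

  // Decompress every *.gz under /logs/raw into /logs/plain, keeping the originals.
  FileSystem fs = FileSystem.get(new Configuration());
  HDFSTools.decompressPath(fs, "/logs/raw", "/logs/plain", ".gz", false);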
Example #17
0
 public void crush() throws CrushException {
   if (jobConf == null) {
     jobConf = new JobConf(CrushUtil.class);
   }
   if (codec == null) {
     codec = new DefaultCodec();
     l4j.warn("codec not specified using DefaultCodec");
   }
   if (compressionType == null) {
     this.compressionType = SequenceFile.CompressionType.BLOCK;
     l4j.warn("compresstionType not specified using BLOCK");
   }
   try {
     if (fs == null) {
       fs = FileSystem.get(jobConf);
     }
     if (!fs.exists(sourcePath)) {
       throw new CrushException(sourcePath + " does not exist");
     }
     if (fs.isFile(sourcePath)) {
       throw new CrushException(sourcePath + " must be a directory");
     }
     FileStatus[] status = fs.listStatus(sourcePath);
     if (status.length == 0 || status.length == 1) {
       return;
     }
     if (this.type == CrushUtil.FileType.SEQUENCEFILE) {
       sequenceCrush(fs, status);
     }
     if (this.type == CrushUtil.FileType.TEXT) {
       textCrush(fs, status);
     }
   } catch (IOException ex) {
     throw new CrushException("Crushed failed" + ex);
   }
 }
Example #18
0
  @SuppressWarnings("unchecked")
  public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(Settings.HELP_OPTION, false, "print the help message");
    options.addOption(
        OptionBuilder.withArgName(Settings.PATH_INDICATOR)
            .hasArg()
            .withDescription("input beta file")
            .create(Settings.INPUT_OPTION));
    options.addOption(
        OptionBuilder.withArgName(Settings.PATH_INDICATOR)
            .hasArg()
            .withDescription("term index file")
            .create(ParseCorpus.INDEX));
    options.addOption(
        OptionBuilder.withArgName(Settings.INTEGER_INDICATOR)
            .hasArg()
            .withDescription("display top terms only (default - 10)")
            .create(TOP_DISPLAY_OPTION));

    String betaString = null;
    String indexString = null;
    int topDisplay = TOP_DISPLAY;

    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
      CommandLine line = parser.parse(options, args);

      if (line.hasOption(Settings.HELP_OPTION)) {
        formatter.printHelp(ParseCorpus.class.getName(), options);
        System.exit(0);
      }

      if (line.hasOption(Settings.INPUT_OPTION)) {
        betaString = line.getOptionValue(Settings.INPUT_OPTION);
      } else {
        throw new ParseException(
            "Parsing failed due to " + Settings.INPUT_OPTION + " not initialized...");
      }

      if (line.hasOption(ParseCorpus.INDEX)) {
        indexString = line.getOptionValue(ParseCorpus.INDEX);
      } else {
        throw new ParseException(
            "Parsing failed due to " + ParseCorpus.INDEX + " not initialized...");
      }

      if (line.hasOption(TOP_DISPLAY_OPTION)) {
        topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION));
      }
    } catch (ParseException pe) {
      System.err.println(pe.getMessage());
      formatter.printHelp(ParseCorpus.class.getName(), options);
      System.exit(0);
    } catch (NumberFormatException nfe) {
      System.err.println(nfe.getMessage());
      System.exit(0);
    }

    JobConf conf = new JobConf(DisplayTopic.class);
    FileSystem fs = FileSystem.get(conf);

    Path indexPath = new Path(indexString);
    Preconditions.checkArgument(
        fs.exists(indexPath) && fs.isFile(indexPath), "Invalid index path...");

    Path betaPath = new Path(betaString);
    Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path...");

    SequenceFile.Reader sequenceFileReader = null;
    try {
      IntWritable intWritable = new IntWritable();
      Text text = new Text();
      Map<Integer, String> termIndex = new HashMap<Integer, String>();
      sequenceFileReader = new SequenceFile.Reader(fs, indexPath, conf);
      while (sequenceFileReader.next(intWritable, text)) {
        termIndex.put(intWritable.get(), text.toString());
      }

      PairOfIntFloat pairOfIntFloat = new PairOfIntFloat();
      HMapIFW hmap = new HMapIFW();
      TreeMap<Float, Integer> treeMap = new TreeMap<Float, Integer>();
      // close the index reader before reusing the variable for the beta file
      IOUtils.closeStream(sequenceFileReader);
      sequenceFileReader = new SequenceFile.Reader(fs, betaPath, conf);
      while (sequenceFileReader.next(pairOfIntFloat, hmap)) {
        treeMap.clear();

        System.out.println("==============================");
        System.out.println(
            "Top ranked " + topDisplay + " terms for Topic " + pairOfIntFloat.getLeftElement());
        System.out.println("==============================");

        Iterator<Integer> itr1 = hmap.keySet().iterator();
        int temp1 = 0;
        while (itr1.hasNext()) {
          temp1 = itr1.next();
          treeMap.put(-hmap.get(temp1), temp1);
          if (treeMap.size() > topDisplay) {
            treeMap.remove(treeMap.lastKey());
          }
        }

        Iterator<Float> itr2 = treeMap.keySet().iterator();
        float temp2 = 0;
        while (itr2.hasNext()) {
          temp2 = itr2.next();
          if (termIndex.containsKey(treeMap.get(temp2))) {
            System.out.println(termIndex.get(treeMap.get(temp2)) + "\t\t" + -temp2);
          } else {
            System.out.println("How embarrassing! Term index not found...");
          }
        }
      }
    } finally {
      IOUtils.closeStream(sequenceFileReader);
    }

    return 0;
  }
Example #19
0
  @Override
  public final void createTable(final CatalogProtos.TableDescProto tableDescProto)
      throws CatalogException {
    HiveCatalogStoreClientPool.HiveCatalogStoreClient client = null;

    TableDesc tableDesc = new TableDesc(tableDescProto);
    String[] splitted = CatalogUtil.splitFQTableName(tableDesc.getName());
    String databaseName = splitted[0];
    String tableName = splitted[1];

    try {
      client = clientPool.getClient();

      org.apache.hadoop.hive.metastore.api.Table table =
          new org.apache.hadoop.hive.metastore.api.Table();
      table.setDbName(databaseName);
      table.setTableName(tableName);
      table.setParameters(
          new HashMap<String, String>(tableDesc.getMeta().getOptions().getAllKeyValus()));
      // TODO: set owner
      // table.setOwner();

      StorageDescriptor sd = new StorageDescriptor();
      sd.setSerdeInfo(new SerDeInfo());
      sd.getSerdeInfo().setParameters(new HashMap<String, String>());
      sd.getSerdeInfo().setName(table.getTableName());

      // If Tajo sets the location, the Thrift client throws an exception such as:
      // Caused by: MetaException(message:java.lang.NullPointerException)
      // If you want to modify the table path, you have to do it from the Hive CLI.
      if (tableDesc.isExternal()) {
        table.setTableType(TableType.EXTERNAL_TABLE.name());
        table.putToParameters("EXTERNAL", "TRUE");

        Path tablePath = new Path(tableDesc.getUri());
        FileSystem fs = tablePath.getFileSystem(conf);
        if (fs.isFile(tablePath)) {
          LOG.warn("A table path is a file, but HiveCatalogStore does not allow a file path.");
          sd.setLocation(tablePath.getParent().toString());
        } else {
          sd.setLocation(tablePath.toString());
        }
      }

      // set column information
      List<Column> columns = tableDesc.getSchema().getRootColumns();
      ArrayList<FieldSchema> cols = new ArrayList<FieldSchema>(columns.size());

      for (Column eachField : columns) {
        cols.add(
            new FieldSchema(
                eachField.getSimpleName(),
                HiveCatalogUtil.getHiveFieldType(eachField.getDataType()),
                ""));
      }
      sd.setCols(cols);

      // set partition keys
      if (tableDesc.hasPartition()
          && tableDesc.getPartitionMethod().getPartitionType().equals(PartitionType.COLUMN)) {
        List<FieldSchema> partitionKeys = new ArrayList<FieldSchema>();
        for (Column eachPartitionKey :
            tableDesc.getPartitionMethod().getExpressionSchema().getRootColumns()) {
          partitionKeys.add(
              new FieldSchema(
                  eachPartitionKey.getSimpleName(),
                  HiveCatalogUtil.getHiveFieldType(eachPartitionKey.getDataType()),
                  ""));
        }
        table.setPartitionKeys(partitionKeys);
      }

      if (tableDesc.getMeta().getStoreType().equalsIgnoreCase(BuiltinStorages.RCFILE)) {
        String serde = tableDesc.getMeta().getOption(StorageConstants.RCFILE_SERDE);
        sd.setInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat.class.getName());
        sd.setOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat.class.getName());
        if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
          sd.getSerdeInfo()
              .setSerializationLib(
                  org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class.getName());
        } else {
          sd.getSerdeInfo()
              .setSerializationLib(
                  org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe.class.getName());
        }

        if (tableDesc.getMeta().getOptions().containsKey(StorageConstants.RCFILE_NULL)) {
          table.putToParameters(
              serdeConstants.SERIALIZATION_NULL_FORMAT,
              StringEscapeUtils.unescapeJava(
                  tableDesc.getMeta().getOption(StorageConstants.RCFILE_NULL)));
        }
      } else if (tableDesc.getMeta().getStoreType().equals(BuiltinStorages.TEXT)) {
        sd.getSerdeInfo()
            .setSerializationLib(
                org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName());
        sd.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class.getName());
        sd.setOutputFormat(
            org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat.class.getName());

        String fieldDelimiter =
            tableDesc
                .getMeta()
                .getOption(
                    StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER);

        // Users can specify a Unicode field delimiter such as \u0001 or \001.
        // In this case, the Java console converts the value into "\\u001",
        // and Hive un-escapes it again, so the intended field delimiter is used.
        // That is why we un-escape the value here.
        sd.getSerdeInfo()
            .putToParameters(
                serdeConstants.SERIALIZATION_FORMAT,
                StringEscapeUtils.unescapeJava(fieldDelimiter));
        sd.getSerdeInfo()
            .putToParameters(
                serdeConstants.FIELD_DELIM, StringEscapeUtils.unescapeJava(fieldDelimiter));
        table.getParameters().remove(StorageConstants.TEXT_DELIMITER);

        if (tableDesc.getMeta().containsOption(StorageConstants.TEXT_NULL)) {
          table.putToParameters(
              serdeConstants.SERIALIZATION_NULL_FORMAT,
              StringEscapeUtils.unescapeJava(
                  tableDesc.getMeta().getOption(StorageConstants.TEXT_NULL)));
          table.getParameters().remove(StorageConstants.TEXT_NULL);
        }
      } else if (tableDesc
          .getMeta()
          .getStoreType()
          .equalsIgnoreCase(BuiltinStorages.SEQUENCE_FILE)) {
        String serde = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE);
        sd.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getName());
        sd.setOutputFormat(
            org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat.class.getName());

        if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
          sd.getSerdeInfo()
              .setSerializationLib(
                  org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName());

          String fieldDelimiter =
              tableDesc
                  .getMeta()
                  .getOption(
                      StorageConstants.SEQUENCEFILE_DELIMITER,
                      StorageConstants.DEFAULT_FIELD_DELIMITER);

          // Users can specify a Unicode field delimiter such as \u0001 or \001.
          // In this case, the Java console converts the value into "\\u001",
          // and Hive un-escapes it again, so the intended field delimiter is used.
          // That is why we un-escape the value here.
          sd.getSerdeInfo()
              .putToParameters(
                  serdeConstants.SERIALIZATION_FORMAT,
                  StringEscapeUtils.unescapeJava(fieldDelimiter));
          sd.getSerdeInfo()
              .putToParameters(
                  serdeConstants.FIELD_DELIM, StringEscapeUtils.unescapeJava(fieldDelimiter));
          table.getParameters().remove(StorageConstants.SEQUENCEFILE_DELIMITER);
        } else {
          sd.getSerdeInfo()
              .setSerializationLib(
                  org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class.getName());
        }

        if (tableDesc.getMeta().containsOption(StorageConstants.SEQUENCEFILE_NULL)) {
          table.putToParameters(
              serdeConstants.SERIALIZATION_NULL_FORMAT,
              StringEscapeUtils.unescapeJava(
                  tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_NULL)));
          table.getParameters().remove(StorageConstants.SEQUENCEFILE_NULL);
        }
      } else {
        if (tableDesc.getMeta().getStoreType().equalsIgnoreCase(BuiltinStorages.PARQUET)) {
          sd.setInputFormat(parquet.hive.DeprecatedParquetInputFormat.class.getName());
          sd.setOutputFormat(parquet.hive.DeprecatedParquetOutputFormat.class.getName());
          sd.getSerdeInfo()
              .setSerializationLib(parquet.hive.serde.ParquetHiveSerDe.class.getName());
        } else {
          throw new UnsupportedException(
              tableDesc.getMeta().getStoreType() + " in HivecatalogStore");
        }
      }

      sd.setSortCols(new ArrayList<Order>());

      table.setSd(sd);
      client.getHiveClient().createTable(table);
    } catch (Throwable t) {
      throw new TajoInternalError(t);
    } finally {
      if (client != null) client.release();
    }
  }
Example #20
0
 public static String getResultText(Class clazz, String fileName) throws IOException {
   FileSystem localFS = FileSystem.getLocal(new Configuration());
   Path path = getResultPath(clazz, fileName);
   Preconditions.checkState(localFS.exists(path) && localFS.isFile(path));
   return FileUtil.readTextFile(new File(path.toUri()));
 }
Example #21
0
  /**
   * Simple test: multiplying
   *
   * <pre>
   *   [1 0 6 0]   [2]   [38]
   *   [0 4 0 0] * [3] = [12]
   *   [0 2 3 0]   [6]   [24]
   *   [3 0 0 5]   [1]   [11]
   * </pre>
   */
  @Test
  public void simpleSpMVTest() {
    HamaConfiguration conf = new HamaConfiguration();
    String testDir = "/simple/";
    int size = 4;
    String matrixPath = baseDir + testDir + "inputMatrix";
    String vectorPath = baseDir + testDir + "inputVector";
    String outputPath = baseDir + testDir;

    try {
      if (fs.exists(new Path(baseDir))) {
        fs.delete(new Path(baseDir), true);
      }

      // creating test matrix
      HashMap<Integer, Writable> inputMatrix = new HashMap<Integer, Writable>();
      SparseVectorWritable vector0 = new SparseVectorWritable();
      vector0.setSize(size);
      vector0.addCell(0, 1);
      vector0.addCell(2, 6);
      SparseVectorWritable vector1 = new SparseVectorWritable();
      vector1.setSize(size);
      vector1.addCell(1, 4);
      SparseVectorWritable vector2 = new SparseVectorWritable();
      vector2.setSize(size);
      vector2.addCell(1, 2);
      vector2.addCell(2, 3);
      SparseVectorWritable vector3 = new SparseVectorWritable();
      vector3.setSize(size);
      vector3.addCell(0, 3);
      vector3.addCell(3, 5);
      inputMatrix.put(0, vector0);
      inputMatrix.put(1, vector1);
      inputMatrix.put(2, vector2);
      inputMatrix.put(3, vector3);
      writeMatrix(matrixPath, conf, inputMatrix);

      HashMap<Integer, Writable> inputVector = new HashMap<Integer, Writable>();
      DenseVectorWritable vector = new DenseVectorWritable();
      vector.setSize(size);
      vector.addCell(0, 2);
      vector.addCell(1, 3);
      vector.addCell(2, 6);
      vector.addCell(3, 1);
      inputVector.put(0, vector);
      writeMatrix(vectorPath, conf, inputVector);

      SpMV.main(new String[] {matrixPath, vectorPath, outputPath, "4"});

      String resultPath = SpMV.getResultPath();
      DenseVectorWritable result = new DenseVectorWritable();
      SpMV.readFromFile(resultPath, result, conf);
      LOG.info("result is a file: " + fs.isFile(new Path(resultPath)));

      double expected[] = {38, 12, 24, 11};
      if (result.getSize() != size) throw new Exception("Incorrect size of output vector");
      for (int i = 0; i < result.getSize(); i++)
        if (Math.abs(result.get(i) - expected[i]) < 0.01) expected[i] = 0;

      for (int i = 0; i < expected.length; i++)
        if (expected[i] != 0) throw new Exception("Result doesn't meets expectations");

      fs.delete(new Path(baseDir), true);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getLocalizedMessage());
    }
  }
Example #22
0
  /**
   * Method to merge multiple CSV part files on HDFS into a single CSV file on HDFS. The part files
   * are created by the CSV_WRITE MR job.
   *
   * <p>This method is invoked from the CP-write instruction.
   *
   * @param srcFileName source file or directory containing the CSV part files
   * @param destFileName destination path for the merged CSV file
   * @param csvprop CSV format properties (header, delimiter)
   * @param rlen number of rows
   * @param clen number of columns
   * @throws IOException
   */
  public final void mergeCSVPartFiles(
      String srcFileName,
      String destFileName,
      CSVFileFormatProperties csvprop,
      long rlen,
      long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
      hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1) sb.append(csvprop.getDelim());
      }
      sb.append('\n');
      out.write(sb.toString().getBytes());
      sb.setLength(0);
    }

    // if the source is a directory
    if (hdfs.isDirectory(srcFilePath)) {
      try {
        FileStatus[] contents = hdfs.listStatus(srcFilePath);
        Path[] partPaths = new Path[contents.length];
        int numPartFiles = 0;
        for (int i = 0; i < contents.length; i++) {
          if (!contents[i].isDirectory()) {
            partPaths[i] = contents[i].getPath();
            numPartFiles++;
          }
        }
        Arrays.sort(partPaths);

        for (int i = 0; i < numPartFiles; i++) {
          InputStream in = hdfs.open(partPaths[i]);
          try {
            IOUtils.copyBytes(in, out, conf, false);
            if (i < numPartFiles - 1) out.write('\n');
          } finally {
            IOUtilFunctions.closeSilently(in);
          }
        }
      } finally {
        IOUtilFunctions.closeSilently(out);
      }
    } else if (hdfs.isFile(srcFilePath)) {
      InputStream in = null;
      try {
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }
Example #23
0
 @Override
 public boolean isFile(String path) throws IOException {
   return mFs.isFile(new Path(path));
 }
Example #24
0
  /**
   * @param srcFileName source file or directory containing the CSV data
   * @param destFileName destination path for the CSV file with header
   * @param rlen number of rows
   * @param clen number of columns
   * @throws IOException
   */
  @SuppressWarnings("unchecked")
  public final void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
      // simply move srcFile to destFile

      /*
       * TODO: Remove this roundabout way!
       * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
       *              & the only path that exists already on HDFS is /user/biadmin/csv/.
       * In this case: the directory structure /user/biadmin/csv/temp/out must be created.
       * Simple hdfs.rename() does not seem to create this directory structure.
       */

      // delete the destination file, if exists already
      // boolean ret1 =
      hdfs.delete(destFilePath, true);

      // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
      // boolean ret2 =
      hdfs.createNewFile(destFilePath);

      // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
      // boolean ret3 =
      hdfs.delete(destFilePath, true);

      // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
      // boolean ret4 =
      hdfs.rename(srcFilePath, destFilePath);

      // System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3
      // + ", rename:" + ret4);
      return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
      sb.append("C" + (i + 1));
      if (i < clen - 1) sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

      // compute sorted order among part files
      ArrayList<Path> files = new ArrayList<Path>();
      for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
        files.add(stat.getPath());
      Collections.sort(files);

      // first part file path
      Path firstpart = files.get(0);

      // create a temp file, and add header and contents of first part
      Path tmp = new Path(firstpart.toString() + ".tmp");
      OutputStream out = hdfs.create(tmp, true);
      out.write(sb.toString().getBytes());
      sb.setLength(0);

      // copy rest of the data from firstpart
      InputStream in = null;
      try {
        in = hdfs.open(firstpart);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }

      // rename tmp to firstpart
      hdfs.delete(firstpart, true);
      hdfs.rename(tmp, firstpart);

      // rename srcfile to destFile
      hdfs.delete(destFilePath, true);
      hdfs.createNewFile(destFilePath); // force the creation of directory structure
      hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
      hdfs.rename(srcFilePath, destFilePath); // move the data

    } else if (hdfs.isFile(srcFilePath)) {
      // create destination file
      OutputStream out = hdfs.create(destFilePath, true);

      // write header
      out.write(sb.toString().getBytes());
      sb.setLength(0);

      // copy the data from srcFile
      InputStream in = null;
      try {
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }
Example #25
0
  public static boolean syncLocalFileToHDFS(
      final String inPathName, final PathFilter pf, final String outPathName) throws IOException {
    Configuration conf = new Configuration();
    Path pathOut = new Path(outPathName);
    Path pathIn = new Path(inPathName);
    FileSystem filesystemIn = pathIn.getFileSystem(conf);
    FileSystem filesystemOut = pathOut.getFileSystem(conf);
    FileStatus fsIn = filesystemIn.getFileStatus(pathIn);
    FileStatus fsOut = filesystemOut.getFileStatus(pathOut);
    if (fsIn.isDir()) {
      List<Path> todoNames;
      // check src filename list
      FileStatus[] fileStatuses;

      if (fsOut.isDir()) {
        // copy <dir> to <dir>
        // get dest filename list
        fileStatuses = filesystemOut.listStatus(pathOut);
        List<String> names = new ArrayList<String>(fileStatuses.length);
        for (FileStatus sts : fileStatuses) {
          if (sts.isDir() || sts.getLen() == 0) {
            continue;
          }
          String fn = sts.getPath().getName();
          names.add(fn);
        }
        // sort dest filename list
        Collections.sort(names);

        todoNames = new LinkedList<Path>();
        // check src filename list
        fileStatuses = filesystemIn.listStatus(pathIn, pf);
        for (FileStatus sts : fileStatuses) {
          if (sts.isDir()) {
            continue;
          }
          Path fp = sts.getPath();
          String fn = fp.getName();
          int n = Collections.binarySearch(names, fn);
          if (n < 0) {
            n = -n - 1;
          }
          if (n < names.size() && names.get(n).startsWith(fn)) {
            continue;
          }

          todoNames.add(fp);
        }
        int size = todoNames.size();
        if (size == 0) {
          return false;
        }
        Path[] srcs = todoNames.toArray(new Path[size]);

        // old stuff
        filesystemOut.copyFromLocalFile(false, true, srcs, pathOut);
      } else {
        // copy <dir> to <file>
        System.err.println("Not Support Operation : copy <dir> to <File>!");
        return false;
      }
    } else {
      // copy <file> to <dir> AND <file> to <file>
      Path fileOut = pathOut;
      boolean todo = true;
      if (fsOut.isDir()) {
        String filename = pathIn.getName();
        fileOut = new Path(pathOut, filename);
      }
      if (filesystemOut.exists(fileOut)) {
        // dest file exists and is a directory: unsupported operation
        if (!filesystemOut.isFile(fileOut)) {
          System.err.println("Unsupported operation: " + fileOut.toString() + " is a directory!");
          todo = false;
        } else {
          FileStatus[] fsArray = filesystemOut.listStatus(fileOut);
          for (FileStatus fs : fsArray) {
            if (fs.getLen() > 0) {
              todo = false;
            }
          }
        }
      }
      if (todo) {
        filesystemOut.copyFromLocalFile(false, true, pathIn, pathOut);
      }
    }
    return true;
  }