Example #1
 private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
   try {
     if (fs == null) return;
     for (FileStatus file : fs.listStatus(p)) {
       Path pfs = file.getPath();
       if (file.isDir()) {
         addFolder(fs, pfs, succeeded, failed);
       } else {
         Key k = Key.make(pfs.toString());
         long size = file.getLen();
         Value val = null;
         if (pfs.getName().endsWith(Extensions.JSON)) {
           JsonParser parser = new JsonParser();
           JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
           JsonElement v = json.get(Constants.VERSION);
           if (v == null) throw new RuntimeException("Missing version");
           JsonElement type = json.get(Constants.TYPE);
           if (type == null) throw new RuntimeException("Missing type");
            Class<?> c = Class.forName(type.getAsString());
           OldModel model = (OldModel) c.newInstance();
           model.fromJson(json);
         } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
           FSDataInputStream s = fs.open(pfs);
           int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
           byte[] mem = MemoryManager.malloc1(sz);
           s.readFully(mem);
           // Convert to a ValueArray (hope it fits in 1Meg!)
           ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
           val = new Value(k, ary, Value.HDFS);
         } else if (size >= 2 * ValueArray.CHUNK_SZ) {
            // ValueArray byte wrapper over a large file
            val = new Value(k, new ValueArray(k, size), Value.HDFS);
         } else {
           val = new Value(k, (int) size, Value.HDFS); // Plain Value
           val.setdsk();
         }
          if (val != null) { // the JSON branch leaves val null; skip the put in that case
            DKV.put(k, val);
            Log.info("PersistHdfs: DKV.put(" + k + ")");
          }
         JsonObject o = new JsonObject();
         o.addProperty(Constants.KEY, k.toString());
         o.addProperty(Constants.FILE, pfs.toString());
         o.addProperty(Constants.VALUE_SIZE, file.getLen());
         succeeded.add(o);
       }
     }
   } catch (Exception e) {
     Log.err(e);
     JsonObject o = new JsonObject();
     o.addProperty(Constants.FILE, p.toString());
     o.addProperty(Constants.ERROR, e.getMessage());
     failed.add(o);
   }
 }
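
The storage types above (Key, Value, DKV, ValueArray) are H2O-specific, but the traversal itself is the standard recursive FileStatus walk over fs.listStatus. A minimal, self-contained sketch of just that pattern against the stock Hadoop FileSystem API (class and method names here are illustrative, not part of the example above):

  import java.io.IOException;
  import java.util.ArrayList;
  import java.util.List;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileStatus;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;

  public class HdfsWalk {
    // Recursively collect every regular file under 'root'.
    static void listFiles(FileSystem fs, Path root, List<Path> out) throws IOException {
      for (FileStatus status : fs.listStatus(root)) {
        if (status.isDir()) { // isDirectory() on Hadoop 2+
          listFiles(fs, status.getPath(), out);
        } else {
          out.add(status.getPath());
        }
      }
    }

    public static void main(String[] args) throws IOException {
      FileSystem fs = FileSystem.get(new Configuration());
      List<Path> files = new ArrayList<Path>();
      listFiles(fs, new Path(args[0]), files);
      for (Path p : files) {
        System.out.println(p);
      }
    }
  }

Example #2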
  /**
   * Create a map and reduce Hadoop job.  Does not set the name on the job.
   * @param inputPath The input {@link org.apache.hadoop.fs.Path}
   * @param outputPath The output {@link org.apache.hadoop.fs.Path}
   * @param inputFormat The {@link org.apache.hadoop.mapreduce.InputFormat}.
   * @param mapper The {@link org.apache.hadoop.mapreduce.Mapper} class to use.
   * @param mapperKey The {@link org.apache.hadoop.io.Writable} key class.  If the Mapper is a no-op, this value may be null.
   * @param mapperValue The {@link org.apache.hadoop.io.Writable} value class.  If the Mapper is a no-op, this value may be null.
   * @param reducer The {@link org.apache.hadoop.mapreduce.Reducer} to use.
   * @param reducerKey The reducer key class.
   * @param reducerValue The reducer value class.
   * @param outputFormat The {@link org.apache.hadoop.mapreduce.OutputFormat}.
   * @param conf The {@link org.apache.hadoop.conf.Configuration} to use.
   * @return The {@link org.apache.hadoop.mapreduce.Job}.
   * @throws IOException if there is a problem with the IO.
   *
   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
   * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class, org.apache.hadoop.conf.Configuration)
   */
  @SuppressWarnings("rawtypes")
  public static Job prepareJob(Path inputPath,
                                 Path outputPath,
                                 Class<? extends InputFormat> inputFormat,
                                 Class<? extends Mapper> mapper,
                                 Class<? extends Writable> mapperKey,
                                 Class<? extends Writable> mapperValue,
                                 Class<? extends Reducer> reducer,
                                 Class<? extends Writable> reducerKey,
                                 Class<? extends Writable> reducerValue,
                                 Class<? extends OutputFormat> outputFormat,
                                 Configuration conf) throws IOException {

    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
      if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
      }
      job.setJarByClass(mapper);
    } else {
      job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    if (mapperKey != null) {
      job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
      job.setMapOutputValueClass(mapperValue);
    }

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
  }
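
A sketch of how prepareJob might be wired into a word-count style driver, using mapper and reducer classes that ship with Hadoop (TokenCounterMapper, IntSumReducer, TextInputFormat, TextOutputFormat; the usual imports from org.apache.hadoop.mapreduce.lib.* are assumed, and the path arguments are illustrative):

    public static void main(String[] args) throws Exception {
      Configuration conf = new Configuration();
      Job job = prepareJob(new Path(args[0]), new Path(args[1]),
          TextInputFormat.class,
          TokenCounterMapper.class, Text.class, IntWritable.class, // mapper and its output types
          IntSumReducer.class, Text.class, IntWritable.class,      // reducer and its output types
          TextOutputFormat.class, conf);
      job.setJobName("word-count");
      System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

Example #3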
  @SuppressWarnings("rawtypes")
  public static String getCustomJobName(String className, JobContext job,
                                  Class<? extends Mapper> mapper,
                                  Class<? extends Reducer> reducer) {
    StringBuilder name = new StringBuilder(100);
    String customJobName = job.getJobName();
    if (customJobName == null || customJobName.trim().isEmpty()) {
      name.append(className);
    } else {
      name.append(customJobName);
    }
    name.append('-').append(mapper.getSimpleName());
    if (reducer != null) {
      name.append('-').append(reducer.getSimpleName());
    }
    return name.toString();
  }
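
Combined with prepareJob above, the name builder might be applied right before submission ("WordCount" is an illustrative class name):

    job.setJobName(getCustomJobName("WordCount", job, TokenCounterMapper.class, IntSumReducer.class));
    // yields e.g. "WordCount-TokenCounterMapper-IntSumReducer" when no explicit job name was set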
Example #4
  public static LinkedHashSet<String> findJars(LogicalPlan dag, Class<?>[] defaultClasses) {
    List<Class<?>> jarClasses = new ArrayList<Class<?>>();

    for (String className : dag.getClassNames()) {
      try {
        Class<?> clazz = Thread.currentThread().getContextClassLoader().loadClass(className);
        jarClasses.add(clazz);
      } catch (ClassNotFoundException e) {
        throw new IllegalArgumentException("Failed to load class " + className, e);
      }
    }

    for (Class<?> clazz : Lists.newArrayList(jarClasses)) {
      // process class and super classes (super does not require deploy annotation)
      for (Class<?> c = clazz; c != Object.class && c != null; c = c.getSuperclass()) {
        // LOG.debug("checking " + c);
        jarClasses.add(c);
        jarClasses.addAll(Arrays.asList(c.getInterfaces()));
      }
    }

    jarClasses.addAll(Arrays.asList(defaultClasses));

    if (dag.isDebug()) {
      LOG.debug("Deploy dependencies: {}", jarClasses);
    }

    LinkedHashSet<String> localJarFiles = new LinkedHashSet<String>(); // avoid duplicates
    HashMap<String, String> sourceToJar = new HashMap<String, String>();

    for (Class<?> jarClass : jarClasses) {
      if (jarClass.getProtectionDomain().getCodeSource() == null) {
        // system class
        continue;
      }
      String sourceLocation =
          jarClass.getProtectionDomain().getCodeSource().getLocation().toString();
      String jar = sourceToJar.get(sourceLocation);
      if (jar == null) {
        // don't create jar file from folders multiple times
        jar = JarFinder.getJar(jarClass);
        sourceToJar.put(sourceLocation, jar);
        LOG.debug("added sourceLocation {} as {}", sourceLocation, jar);
      }
      if (jar == null) {
        throw new AssertionError("Cannot resolve jar file for " + jarClass);
      }
      localJarFiles.add(jar);
    }

    String libJarsPath = dag.getValue(LogicalPlan.LIBRARY_JARS);
    if (!StringUtils.isEmpty(libJarsPath)) {
      String[] libJars = StringUtils.splitByWholeSeparator(libJarsPath, LIB_JARS_SEP);
      localJarFiles.addAll(Arrays.asList(libJars));
    }

    LOG.info("Local jar file dependencies: " + localJarFiles);

    return localJarFiles;
  }
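
The jar resolution above hinges on ProtectionDomain/CodeSource; JarFinder (an Apex helper) is only needed to package directory classpath entries into jars. A minimal sketch of the bare lookup on its own (class and method names illustrative):

  import java.net.URL;
  import java.security.CodeSource;

  public class JarLocator {
    // Returns where a class was loaded from, or null for JVM/bootstrap
    // classes, which have no CodeSource (the same check as above).
    static String locationOf(Class<?> clazz) {
      CodeSource src = clazz.getProtectionDomain().getCodeSource();
      if (src == null) {
        return null;
      }
      URL location = src.getLocation(); // e.g. file:/home/user/lib/foo.jar or a classes/ directory
      return location.toString();
    }

    public static void main(String[] args) {
      System.out.println(locationOf(JarLocator.class)); // enclosing jar or classes directory
      System.out.println(locationOf(String.class));     // null: loaded by the bootstrap loader
    }
  }

Example #5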
  // Information needed to get a single file:
  // BASE_PATH, FILE_ID, TIMESTAMP_START, TIMESTAMP_STOP, SOURCE, FILESYSTEM
  private static Vector<Path> getFile(
      FileSystem fs, Hashtable<String, String> config, dbutil db_util) throws Exception {
    Long latestVersion = latestVersion(config, db_util);

    try {
      // Hashtable rejects null values, so these puts throw NullPointerException
      // when a required timestamp is missing or no version could be resolved;
      // the catch below turns that into an error return.
      config.put("timestamp_start", config.get("timestamp_start"));
      config.put("timestamp_real", latestVersion.toString());
      config.put("timestamp_stop", latestVersion.toString());
    } catch (Exception E) {
      logger.error("Trying to get file that is impossible to generate: " + getFullPath(config));
      return null;
    }
    if (Integer.parseInt(config.get("timestamp_start"))
        > Integer.parseInt(config.get("timestamp_stop"))) {
      return null;
    }
    logger.debug(
        "Getting DB for timestamp "
            + config.get("timestamp_start")
            + " to "
            + config.get("timestamp_stop"));

    String final_result = getFullPath(config);

    String temp_path_base =
        config.get("local_temp_path")
            + "_"
            + config.get("task_id")
            + "_"
            + config.get("run_id")
            + "/";
    Path newPath = new Path(final_result + "*");
    Vector<Path> ret_path = new Vector<Path>();
    String lockName = lock(final_result.replaceAll("/", "_"));
    FileStatus[] existing = fs.globStatus(newPath);
    if (existing != null && existing.length != 0) { // globStatus returns null when the path does not exist
      ret_path.add(newPath);
      unlock(lockName);
      config.put("full_file_name", final_result);
      return ret_path;
    } else {
      if (!config.get("source").equals("local")) {
        config.put("temp_path_base", temp_path_base);

        config.put("timestamp_start", config.get("timestamp_start"));
        config.put("timestamp_real", latestVersion.toString());
        config.put("timestamp_stop", latestVersion.toString());

        Class<?> sourceClass =
            Class.forName("org.gestore.plugin.source." + config.get("source") + "Source");
        Method process_data = sourceClass.getMethod("process", Hashtable.class, FileSystem.class);
        Object processor = sourceClass.newInstance();
        Object retVal;
        try {
          retVal = process_data.invoke(processor, config, fs);
        } catch (InvocationTargetException E) {
          Throwable exception = E.getTargetException();
          logger.error("Unable to call method in child class: " + exception.toString());
          exception.printStackTrace(System.out);
          unlock(lockName);
          return null;
        }
        FileStatus[] files = (FileStatus[]) retVal;
        if (files == null) {
          logger.error("Error getting files, no files returned");
          unlock(lockName); // release the lock before the early return, as on the other error paths
          return null;
        }

        for (FileStatus file : files) {
          Path cur_file = file.getPath();
          Path cur_local_path = new Path(temp_path_base + config.get("file_id"));
          String suffix = getSuffix(config.get("file_id"), cur_file.getName());
          cur_local_path = cur_local_path.suffix(suffix);
          Path res_path = new Path(final_result + suffix);
          logger.debug("Moving file " + cur_file.toString() + " to " + res_path.toString());
          if (config.get("copy").equals("true")) {
            fs.moveFromLocalFile(cur_file, res_path);
          } else {
            fs.rename(cur_file, res_path);
          }
        }

        config.put("full_file_name", final_result);
      }
    }
    unlock(lockName);
    return ret_path; // always empty here: the glob-hit branch returns earlier with the path added
  }
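
The fetch branch's core mechanism is reflective plugin dispatch: the configured source name selects a class by naming convention, and its process method is invoked via reflection. A stripped-down sketch of that mechanism alone, with the package and method contract taken from the code above (the helper class name is illustrative):

  import java.lang.reflect.InvocationTargetException;
  import java.lang.reflect.Method;
  import java.util.Hashtable;
  import org.apache.hadoop.fs.FileStatus;
  import org.apache.hadoop.fs.FileSystem;

  public class PluginDispatch {
    // Loads "org.gestore.plugin.source.<source>Source" and calls its
    // process(Hashtable, FileSystem) method, mirroring getFile above.
    static FileStatus[] fetch(String source, Hashtable<String, String> config, FileSystem fs)
        throws Exception {
      Class<?> sourceClass = Class.forName("org.gestore.plugin.source." + source + "Source");
      Method process = sourceClass.getMethod("process", Hashtable.class, FileSystem.class);
      Object plugin = sourceClass.newInstance();
      try {
        return (FileStatus[]) process.invoke(plugin, config, fs);
      } catch (InvocationTargetException e) {
        // surface the plugin's own exception, not the reflection wrapper
        throw new Exception("plugin " + source + " failed", e.getTargetException());
      }
    }
  }

Example #6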
 private static String toJsonString(final Class<?> clazz, final Object value) {
   return toJsonString(clazz.getSimpleName(), value);
 }
Example #7
      @SuppressWarnings("unchecked")
      public void map(
          WritableComparable key,
          Writable value,
          OutputCollector<IntWritable, RecordStatsWritable> output,
          Reporter reporter)
          throws IOException {
        // Set up rawKey and rawValue on the first call to 'map'
        if (recordId == -1) {
          rawKey = createRaw(key.getClass());
          rawValue = createRaw(value.getClass());
        }
        ++recordId;

        if (this.key == sortOutput) {
          // Check if keys are 'sorted' if this
          // record is from sort's output
          if (prevKey == null) {
            prevKey = key;
            keyClass = prevKey.getClass();
          } else {
            // Sanity check
            if (keyClass != key.getClass()) {
              throw new IOException(
                  "Type mismatch in key: expected " + keyClass.getName()
                      + ", received " + key.getClass().getName());
            }

            // Check if they were sorted correctly
            if (prevKey.compareTo(key) > 0) {
              throw new IOException(
                  "The 'map-reduce' framework wrongly classified (" + prevKey
                      + ") > (" + key + ") for record# " + recordId);
            }
            prevKey = key;
          }

          // Check if the sorted output is 'partitioned' right
          int keyPartition = partitioner.getPartition(key, value, noSortReducers);
          if (partition != keyPartition) {
            throw new IOException(
                "Partitions do not match for record# " + recordId
                    + " ! - '" + partition + "' v/s '" + keyPartition + "'");
          }
        }

        // Construct the record-stats and output (this.key, record-stats)
        byte[] keyBytes = rawKey.getRawBytes(key);
        int keyBytesLen = rawKey.getRawBytesLength(key);
        byte[] valueBytes = rawValue.getRawBytes(value);
        int valueBytesLen = rawValue.getRawBytesLength(value);

        int keyValueChecksum =
            (WritableComparator.hashBytes(keyBytes, keyBytesLen)
                ^ WritableComparator.hashBytes(valueBytes, valueBytesLen));

        output.collect(
            this.key, new RecordStatsWritable((keyBytesLen + valueBytesLen), 1, keyValueChecksum));
      }
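
The emitted statistic is built around an order-independent checksum: each record contributes hash(key) XOR hash(value), and because XOR is commutative and associative, the combined value over all records does not depend on record order, which is what lets the validator compare sort input against sort output. A small sketch of that construction on its own, using Hadoop's WritableComparator.hashBytes as above (the class name is illustrative, and XOR-combining downstream is assumed):

  import org.apache.hadoop.io.WritableComparator;

  public class XorChecksum {
    private int checksum = 0;

    // Fold one record into the running checksum; record order does not matter.
    void add(byte[] keyBytes, int keyLen, byte[] valueBytes, int valueLen) {
      checksum ^= WritableComparator.hashBytes(keyBytes, keyLen)
          ^ WritableComparator.hashBytes(valueBytes, valueLen);
    }

    int get() {
      return checksum;
    }
  }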