private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
  try {
    if (fs == null) return;
    for (FileStatus file : fs.listStatus(p)) {
      Path pfs = file.getPath();
      if (file.isDir()) {
        addFolder(fs, pfs, succeeded, failed);
      } else {
        Key k = Key.make(pfs.toString());
        long size = file.getLen();
        Value val = null;
        if (pfs.getName().endsWith(Extensions.JSON)) {
          JsonParser parser = new JsonParser();
          JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
          JsonElement v = json.get(Constants.VERSION);
          if (v == null) throw new RuntimeException("Missing version");
          JsonElement type = json.get(Constants.TYPE);
          if (type == null) throw new RuntimeException("Missing type");
          Class c = Class.forName(type.getAsString());
          OldModel model = (OldModel) c.newInstance();
          model.fromJson(json); // (val is left null for JSON model files)
        } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
          FSDataInputStream s = fs.open(pfs);
          int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
          byte[] mem = MemoryManager.malloc1(sz);
          s.readFully(mem);
          // Convert to a ValueArray (hope it fits in 1Meg!)
          ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
          val = new Value(k, ary, Value.HDFS);
        } else if (size >= 2 * ValueArray.CHUNK_SZ) {
          // ValueArray byte wrapper over a large file
          val = new Value(k, new ValueArray(k, size), Value.HDFS);
        } else {
          val = new Value(k, (int) size, Value.HDFS); // Plain Value
          val.setdsk();
        }
        DKV.put(k, val);
        Log.info("PersistHdfs: DKV.put(" + k + ")");
        JsonObject o = new JsonObject();
        o.addProperty(Constants.KEY, k.toString());
        o.addProperty(Constants.FILE, pfs.toString());
        o.addProperty(Constants.VALUE_SIZE, file.getLen());
        succeeded.add(o);
      }
    }
  } catch (Exception e) {
    Log.err(e);
    JsonObject o = new JsonObject();
    o.addProperty(Constants.FILE, p.toString());
    o.addProperty(Constants.ERROR, e.getMessage());
    failed.add(o);
  }
}
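// A minimal usage sketch (hypothetical driver, not part of the original class):
// import every file under an HDFS folder and collect the per-file results. The URI
// and the "succeeded"/"failed" property names are assumptions for illustration.
public static JsonObject importFolder(String uri) throws Exception {
  FileSystem fs = FileSystem.get(URI.create(uri), new Configuration());
  JsonArray succeeded = new JsonArray();
  JsonArray failed = new JsonArray();
  addFolder(fs, new Path(uri), succeeded, failed);
  JsonObject result = new JsonObject();
  result.add("succeeded", succeeded);
  result.add("failed", failed);
  return result;
}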
/**
 * Create a map and reduce Hadoop job. Does not set the name on the job.
 *
 * @param inputPath The input {@link org.apache.hadoop.fs.Path}
 * @param outputPath The output {@link org.apache.hadoop.fs.Path}
 * @param inputFormat The {@link org.apache.hadoop.mapreduce.InputFormat}
 * @param mapper The {@link org.apache.hadoop.mapreduce.Mapper} class to use
 * @param mapperKey The {@link org.apache.hadoop.io.Writable} key class. If the Mapper is a no-op, this value may be null
 * @param mapperValue The {@link org.apache.hadoop.io.Writable} value class. If the Mapper is a no-op, this value may be null
 * @param reducer The {@link org.apache.hadoop.mapreduce.Reducer} to use
 * @param reducerKey The reducer key class.
 * @param reducerValue The reducer value class.
 * @param outputFormat The {@link org.apache.hadoop.mapreduce.OutputFormat}.
 * @param conf The {@link org.apache.hadoop.conf.Configuration} to use.
 * @return The {@link org.apache.hadoop.mapreduce.Job}.
 * @throws IOException if there is a problem with the IO.
 *
 * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
 * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class, org.apache.hadoop.conf.Configuration)
 */
@SuppressWarnings("rawtypes")
public static Job prepareJob(Path inputPath,
                             Path outputPath,
                             Class<? extends InputFormat> inputFormat,
                             Class<? extends Mapper> mapper,
                             Class<? extends Writable> mapperKey,
                             Class<? extends Writable> mapperValue,
                             Class<? extends Reducer> reducer,
                             Class<? extends Writable> reducerKey,
                             Class<? extends Writable> reducerValue,
                             Class<? extends OutputFormat> outputFormat,
                             Configuration conf) throws IOException {

  Job job = Job.getInstance(conf);
  Configuration jobConf = job.getConfiguration();

  if (reducer.equals(Reducer.class)) {
    if (mapper.equals(Mapper.class)) {
      throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    }
    job.setJarByClass(mapper);
  } else {
    job.setJarByClass(reducer);
  }

  job.setInputFormatClass(inputFormat);
  jobConf.set("mapred.input.dir", inputPath.toString());

  job.setMapperClass(mapper);
  if (mapperKey != null) {
    job.setMapOutputKeyClass(mapperKey);
  }
  if (mapperValue != null) {
    job.setMapOutputValueClass(mapperValue);
  }
  jobConf.setBoolean("mapred.compress.map.output", true);

  job.setReducerClass(reducer);
  job.setOutputKeyClass(reducerKey);
  job.setOutputValueClass(reducerValue);

  job.setOutputFormatClass(outputFormat);
  jobConf.set("mapred.output.dir", outputPath.toString());

  return job;
}
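// A hedged usage sketch: building and running a word-count style job with prepareJob.
// WordCountMapper, WordCountReducer, and both paths are hypothetical placeholders;
// TextInputFormat/TextOutputFormat are the org.apache.hadoop.mapreduce.lib variants.
public static boolean runWordCount(Configuration conf) throws Exception {
  Job job = prepareJob(new Path("/in/words"), new Path("/out/counts"),
      TextInputFormat.class,
      WordCountMapper.class, Text.class, IntWritable.class,
      WordCountReducer.class, Text.class, IntWritable.class,
      TextOutputFormat.class, conf);
  return job.waitForCompletion(true);
}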
@SuppressWarnings("rawtypes") public static String getCustomJobName(String className, JobContext job, Class<? extends Mapper> mapper, Class<? extends Reducer> reducer) { StringBuilder name = new StringBuilder(100); String customJobName = job.getJobName(); if (customJobName == null || customJobName.trim().isEmpty()) { name.append(className); } else { name.append(customJobName); } name.append('-').append(mapper.getSimpleName()); if (reducer != null) { name.append('-').append(reducer.getSimpleName()); } return name.toString(); }
public static LinkedHashSet<String> findJars(LogicalPlan dag, Class<?>[] defaultClasses) {
  List<Class<?>> jarClasses = new ArrayList<Class<?>>();
  for (String className : dag.getClassNames()) {
    try {
      Class<?> clazz = Thread.currentThread().getContextClassLoader().loadClass(className);
      jarClasses.add(clazz);
    } catch (ClassNotFoundException e) {
      throw new IllegalArgumentException("Failed to load class " + className, e);
    }
  }

  for (Class<?> clazz : Lists.newArrayList(jarClasses)) {
    // process class and super classes (super does not require deploy annotation)
    for (Class<?> c = clazz; c != Object.class && c != null; c = c.getSuperclass()) {
      // LOG.debug("checking " + c);
      jarClasses.add(c);
      jarClasses.addAll(Arrays.asList(c.getInterfaces()));
    }
  }
  jarClasses.addAll(Arrays.asList(defaultClasses));

  if (dag.isDebug()) {
    LOG.debug("Deploy dependencies: {}", jarClasses);
  }

  LinkedHashSet<String> localJarFiles = new LinkedHashSet<String>(); // avoid duplicates
  HashMap<String, String> sourceToJar = new HashMap<String, String>();

  for (Class<?> jarClass : jarClasses) {
    if (jarClass.getProtectionDomain().getCodeSource() == null) {
      // system class
      continue;
    }
    String sourceLocation = jarClass.getProtectionDomain().getCodeSource().getLocation().toString();
    String jar = sourceToJar.get(sourceLocation);
    if (jar == null) {
      // don't create jar file from folders multiple times
      jar = JarFinder.getJar(jarClass);
      sourceToJar.put(sourceLocation, jar);
      LOG.debug("added sourceLocation {} as {}", sourceLocation, jar);
    }
    if (jar == null) {
      throw new AssertionError("Cannot resolve jar file for " + jarClass);
    }
    localJarFiles.add(jar);
  }

  String libJarsPath = dag.getValue(LogicalPlan.LIBRARY_JARS);
  if (!StringUtils.isEmpty(libJarsPath)) {
    String[] libJars = StringUtils.splitByWholeSeparator(libJarsPath, LIB_JARS_SEP);
    localJarFiles.addAll(Arrays.asList(libJars));
  }

  LOG.info("Local jar file dependencies: " + localJarFiles);

  return localJarFiles;
}
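// A hedged usage sketch: resolve the DAG's jar dependencies and join them into a
// comma-separated list, e.g. for handing to a YARN launch context. MyOperator is a
// hypothetical application class; StringUtils is commons-lang, as used above.
LinkedHashSet<String> jars = findJars(dag, new Class<?>[] { MyOperator.class });
String jarCsv = StringUtils.join(jars, ',');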
// Information needed to get a single file:
// BASE_PATH, FILE_ID, TIMESTAMP_START, TIMESTAMP_STOP, SOURCE, FILESYSTEM
private static Vector<Path> getFile(FileSystem fs, Hashtable<String, String> config, dbutil db_util)
    throws Exception {
  Long latestVersion = latestVersion(config, db_util);

  try {
    config.put("timestamp_start", config.get("timestamp_start"));
    config.put("timestamp_real", latestVersion.toString());
    config.put("timestamp_stop", latestVersion.toString());
  } catch (Exception E) {
    logger.error("Trying to get file that is impossible to generate: " + getFullPath(config));
    return null;
  }

  if (Integer.parseInt(config.get("timestamp_start")) > Integer.parseInt(config.get("timestamp_stop"))) {
    return null;
  }

  logger.debug("Getting DB for timestamp " + config.get("timestamp_start")
      + " to " + config.get("timestamp_stop"));

  String final_result = getFullPath(config);
  String temp_path_base = config.get("local_temp_path") + "_" + config.get("task_id")
      + "_" + config.get("run_id") + "/";

  Path newPath = new Path(final_result + "*");
  Vector<Path> ret_path = new Vector<Path>();
  String lockName = lock(final_result.replaceAll("/", "_"));

  if (fs.globStatus(newPath).length != 0) {
    // The file already exists; return it directly
    ret_path.add(newPath);
    unlock(lockName);
    config.put("full_file_name", final_result);
    return ret_path;
  } else {
    if (!config.get("source").equals("local")) {
      config.put("temp_path_base", temp_path_base);
      config.put("timestamp_start", config.get("timestamp_start"));
      config.put("timestamp_real", latestVersion.toString());
      config.put("timestamp_stop", latestVersion.toString());

      // Load the source plugin reflectively and invoke its process() method
      Class<?> sourceClass = Class.forName("org.gestore.plugin.source." + config.get("source") + "Source");
      Method process_data = sourceClass.getMethod("process", Hashtable.class, FileSystem.class);
      Object processor = sourceClass.newInstance();
      Object retVal;
      try {
        retVal = process_data.invoke(processor, config, fs);
      } catch (InvocationTargetException E) {
        Throwable exception = E.getTargetException();
        logger.error("Unable to call method in child class: " + exception.toString());
        exception.printStackTrace(System.out);
        unlock(lockName);
        return null;
      }

      FileStatus[] files = (FileStatus[]) retVal;
      if (files == null) {
        logger.error("Error getting files, no files returned");
        unlock(lockName); // release the lock before bailing out
        return null;
      }

      for (FileStatus file : files) {
        Path cur_file = file.getPath();
        Path cur_local_path = new Path(temp_path_base + config.get("file_id"));
        String suffix = getSuffix(config.get("file_id"), cur_file.getName());
        cur_local_path = cur_local_path.suffix(suffix);
        Path res_path = new Path(final_result + suffix);
        logger.debug("Moving file " + cur_file.toString() + " to " + res_path.toString());
        if (config.get("copy").equals("true")) {
          fs.moveFromLocalFile(cur_file, res_path);
        } else {
          fs.rename(cur_file, res_path);
        }
      }
      config.put("full_file_name", final_result);
    }
  }
  unlock(lockName);
  return ret_path;
}
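// The reflective lookup above expects each non-local source plugin to live in
// org.gestore.plugin.source under a name of the form <source>Source, exposing a
// public process(Hashtable, FileSystem) method. A minimal hypothetical plugin
// satisfying that contract (the class name and fetch logic are illustrative only):
public class ftpSource {
  public FileStatus[] process(Hashtable<String, String> config, FileSystem fs) throws Exception {
    Path tmp = new Path(config.get("temp_path_base") + config.get("file_id"));
    // ... fetch remote data into tmp (omitted) ...
    return fs.globStatus(tmp.suffix("*"));
  }
}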
private static String toJsonString(final Class<?> clazz, final Object value) {
  return toJsonString(clazz.getSimpleName(), value);
}
@SuppressWarnings("unchecked") public void map( WritableComparable key, Writable value, OutputCollector<IntWritable, RecordStatsWritable> output, Reporter reporter) throws IOException { // Set up rawKey and rawValue on the first call to 'map' if (recordId == -1) { rawKey = createRaw(key.getClass()); rawValue = createRaw(value.getClass()); } ++recordId; if (this.key == sortOutput) { // Check if keys are 'sorted' if this // record is from sort's output if (prevKey == null) { prevKey = key; keyClass = prevKey.getClass(); } else { // Sanity check if (keyClass != key.getClass()) { throw new IOException( "Type mismatch in key: expected " + keyClass.getName() + ", recieved " + key.getClass().getName()); } // Check if they were sorted correctly if (prevKey.compareTo(key) > 0) { throw new IOException( "The 'map-reduce' framework wrongly" + " classifed (" + prevKey + ") > (" + key + ") " + "for record# " + recordId); } prevKey = key; } // Check if the sorted output is 'partitioned' right int keyPartition = partitioner.getPartition(key, value, noSortReducers); if (partition != keyPartition) { throw new IOException( "Partitions do not match for record# " + recordId + " ! - '" + partition + "' v/s '" + keyPartition + "'"); } } // Construct the record-stats and output (this.key, record-stats) byte[] keyBytes = rawKey.getRawBytes(key); int keyBytesLen = rawKey.getRawBytesLength(key); byte[] valueBytes = rawValue.getRawBytes(value); int valueBytesLen = rawValue.getRawBytesLength(value); int keyValueChecksum = (WritableComparator.hashBytes(keyBytes, keyBytesLen) ^ WritableComparator.hashBytes(valueBytes, valueBytesLen)); output.collect( this.key, new RecordStatsWritable((keyBytesLen + valueBytesLen), 1, keyValueChecksum)); }