/**
 * Verifies that every source path exists on its own file system.
 *
 * <p>All paths are examined before anything is thrown, so a single failure reports every missing
 * source at once.
 *
 * @param conf configuration used to resolve the file system of each path
 * @param srcPaths the source paths to check
 * @throws IOException if a file system cannot be resolved or queried; when one or more paths are
 *     missing, an {@code InvalidInputException} built from one {@code IOException} per missing
 *     path is thrown
 */
private static void checkSrcPath(Configuration conf, List<Path> srcPaths) throws IOException {
  final List<IOException> missing = new ArrayList<IOException>();
  for (Path candidate : srcPaths) {
    if (candidate.getFileSystem(conf).exists(candidate)) {
      continue;
    }
    missing.add(new IOException("Input source " + candidate + " does not exist."));
  }
  if (missing.isEmpty()) {
    return;
  }
  throw new InvalidInputException(missing);
}
/**
 * Reads a list of paths from a text file, one path per line.
 *
 * <p>The file is decoded as UTF-8 regardless of the JVM's default charset, so the same list file
 * yields the same paths on every host. Lines that are empty or contain only whitespace are
 * skipped instead of being turned into a {@code Path}.
 *
 * @param conf configuration used to resolve the file system of {@code srcList}
 * @param srcList the file containing the path list
 * @return the paths in file order; empty if the file has no non-blank lines
 * @throws IOException if the list file cannot be opened or read
 */
private static List<Path> fetchFileList(Configuration conf, Path srcList) throws IOException {
  List<Path> result = new ArrayList<Path>();
  FileSystem fs = srcList.getFileSystem(conf);
  BufferedReader input = null;
  try {
    // Fully qualified so the fix does not depend on the file's import block.
    input =
        new BufferedReader(
            new InputStreamReader(fs.open(srcList), java.nio.charset.Charset.forName("UTF-8")));
    for (String line = input.readLine(); line != null; line = input.readLine()) {
      if (line.trim().length() == 0) {
        // A blank line (e.g. a trailing newline artifact) names no path.
        continue;
      }
      result.add(new Path(line));
    }
  } finally {
    // input is still null here if opening the file failed.
    checkAndClose(input);
  }
  return result;
}
public void configure(JobConf conf) { numberOfCenters = Integer.valueOf(conf.get("numberOfCenters")); centersDirectory = conf.get("centersReadDirectory"); try { Configuration c = new Configuration(); FileSystem fs = FileSystem.get(c); for (int index = 0; index < numberOfCenters; ++index) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c); LongWritable key = new LongWritable(); Point value = new Point(); reader.next(key, value); Point center = (Point) value; centers.add(center); reader.close(); } } catch (IOException e) { // do nothing // I hope this doesn't happen System.out.println("well, damn."); e.printStackTrace(); } }
/**
 * Assigns a point to the center with the smallest {@code sumOfSquares} value.
 *
 * <p>Centers are scanned in index order and ties keep the lowest index. Two counters are bumped
 * per record: {@code NUMBER/NODES} and {@code CENTER/<chosen index>}. The point is then emitted
 * keyed by the chosen center index.
 *
 * @param key the input key (not used)
 * @param value the point to classify
 * @param output receives the pair (chosen center index, point)
 * @param reporter receives the counter increments
 * @throws IOException if emitting the pair fails
 */
public void map(
    LongWritable key,
    Point value,
    OutputCollector<LongWritable, Point> output,
    Reporter reporter)
    throws IOException {
  int nearest = 0;
  double smallest = value.sumOfSquares(centers.get(0));

  int candidate = 1;
  while (candidate < numberOfCenters) {
    final double distance = value.sumOfSquares(centers.get(candidate));
    if (distance < smallest) {
      nearest = candidate;
      smallest = distance;
    }
    ++candidate;
  }

  reporter.incrCounter("NUMBER", "NODES", 1);
  reporter.incrCounter("CENTER", String.valueOf(nearest), 1);
  output.collect(new LongWritable(nearest), value);
}
public static int main(String[] args) throws Exception { int i; String outPath; int numMaps = 0, numReds = 0; List<String> other_args = new ArrayList<String>(); for (i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { numMaps = Integer.parseInt(args[++i]); } else if ("-r".equals(args[i])) { numReds = Integer.parseInt(args[++i]); } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); printUsage(); // exits } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { System.out.println( "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2."); printUsage(); } Date startTime = new Date(); System.out.println("Job started: " + startTime); Date startIteration; Date endIteration; JobConf conf = new JobConf(Kmeans.class); conf.setJobName("kmeans"); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(ClusterWritable.class); conf.setMapperClass(MapClass.class); conf.setReducerClass(Reduce.class); conf.setNumMapTasks(numMaps); conf.setNumReduceTasks(numReds); FileInputFormat.setInputPaths(conf, new Path(other_args.get(0))); outPath = new String(other_args.get(1)); FileOutputFormat.setOutputPath(conf, new Path(outPath)); startIteration = new Date(); JobClient.runJob(conf); endIteration = new Date(); System.out.println( "The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000 + " seconds."); return 0; }
/**
 * Legacy string-based entry point that builds an {@code Arguments} object and delegates to the
 * {@code Arguments}-based {@code copy} overload.
 *
 * <p>No attribute preservation or SSL configuration is passed, and both limits are set to
 * {@code Long.MAX_VALUE}.
 *
 * @param conf the configuration to copy with
 * @param srcPath the source path, or the path of a file listing sources when {@code srcAsList}
 *     is true
 * @param destPath the destination path
 * @param logPath the log path handed through to {@code Arguments}
 * @param srcAsList whether {@code srcPath} names a list file rather than a source
 * @param ignoreReadFailures whether to set {@code Options.IGNORE_READ_FAILURES}
 * @throws IOException if reading the source list or the copy itself fails
 * @deprecated retained for existing callers; it only forwards to the {@code Arguments}-based
 *     overload
 */
@Deprecated
public static void copy(
    Configuration conf,
    String srcPath,
    String destPath,
    Path logPath,
    boolean srcAsList,
    boolean ignoreReadFailures)
    throws IOException {
  final Path source = new Path(srcPath);

  final List<Path> sources = new ArrayList<Path>();
  if (!srcAsList) {
    sources.add(source);
  } else {
    sources.addAll(fetchFileList(conf, source));
  }

  final EnumSet<Options> flags = EnumSet.noneOf(Options.class);
  if (ignoreReadFailures) {
    flags.add(Options.IGNORE_READ_FAILURES);
  }

  final Path destination = new Path(destPath);
  final Arguments arguments =
      new Arguments(
          sources, destination, logPath, flags, null, Long.MAX_VALUE, Long.MAX_VALUE, null);
  copy(conf, arguments);
}
static Arguments valueOf(String[] args, Configuration conf) throws IOException { List<Path> srcs = new ArrayList<Path>(); Path dst = null; Path log = null; EnumSet<Options> flags = EnumSet.noneOf(Options.class); String presevedAttributes = null; String mapredSslConf = null; long filelimit = Long.MAX_VALUE; long sizelimit = Long.MAX_VALUE; for (int idx = 0; idx < args.length; idx++) { Options[] opt = Options.values(); int i = 0; for (; i < opt.length && !args[idx].startsWith(opt[i].cmd); i++) ; if (i < opt.length) { flags.add(opt[i]); if (opt[i] == Options.PRESERVE_STATUS) { presevedAttributes = args[idx].substring(2); FileAttribute.parse(presevedAttributes); // validation } else if (opt[i] == Options.FILE_LIMIT) { filelimit = Options.FILE_LIMIT.parseLong(args, ++idx); } else if (opt[i] == Options.SIZE_LIMIT) { sizelimit = Options.SIZE_LIMIT.parseLong(args, ++idx); } } else if ("-f".equals(args[idx])) { if (++idx == args.length) { throw new IllegalArgumentException("urilist_uri not specified in -f"); } srcs.addAll(fetchFileList(conf, new Path(args[idx]))); } else if ("-log".equals(args[idx])) { if (++idx == args.length) { throw new IllegalArgumentException("logdir not specified in -log"); } log = new Path(args[idx]); } else if ("-mapredSslConf".equals(args[idx])) { if (++idx == args.length) { throw new IllegalArgumentException("ssl conf file not specified in -mapredSslConf"); } mapredSslConf = args[idx]; } else if ("-m".equals(args[idx])) { if (++idx == args.length) { throw new IllegalArgumentException("num_maps not specified in -m"); } try { conf.setInt(MAX_MAPS_LABEL, Integer.valueOf(args[idx])); } catch (NumberFormatException e) { throw new IllegalArgumentException("Invalid argument to -m: " + args[idx]); } } else if ('-' == args[idx].codePointAt(0)) { throw new IllegalArgumentException("Invalid switch " + args[idx]); } else if (idx == args.length - 1) { dst = new Path(args[idx]); } else { srcs.add(new Path(args[idx])); } } // mandatory command-line parameters if 
(srcs.isEmpty() || dst == null) { throw new IllegalArgumentException("Missing " + (dst == null ? "dst path" : "src")); } // incompatible command-line flags final boolean isOverwrite = flags.contains(Options.OVERWRITE); final boolean isUpdate = flags.contains(Options.UPDATE); final boolean isDelete = flags.contains(Options.DELETE); if (isOverwrite && isUpdate) { throw new IllegalArgumentException("Conflicting overwrite policies"); } if (isDelete && !isOverwrite && !isUpdate) { throw new IllegalArgumentException( Options.DELETE.cmd + " must be specified with " + Options.OVERWRITE + " or " + Options.UPDATE + "."); } return new Arguments( srcs, dst, log, flags, presevedAttributes, filelimit, sizelimit, mapredSslConf); }
public void reduce( Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { List<String[]> _temp = new ArrayList<String[]>(); int count = 0; while (values.hasNext()) { Text _out = values.next(); String[] tokens = StringUtils.splitPreserveAllTokens(_out.toString(), TAB); _temp.add(tokens); if (count++ > 100000) break; } if (count > 10000) { Set<String> ipSet = new HashSet<String>(); for (int posI = 0; posI < _temp.size(); posI++) { String[] array = _temp.get(posI); if (array == null) continue; String mid = array[2]; String ip = array[3]; ipSet.add(ip); } output.collect( key, Utils.mergeKey(String.valueOf(ipSet.size()), StringUtils.join(ipSet, '|'))); return; } /** * ·Ö×éËã·¨ FOREACH ALL_DATA IF IN INDEX THEN UPDATE INDEX AND INSERT DATA ELSE FOREACH SUB_DATA * MAKE INDEX AND SET FIND'S DATA AS NULL */ // List<List<String[]>> dataList = new ArrayList<List<String[]>>(); List<StringBuffer> indexList = new ArrayList<StringBuffer>(); Set<String> ipSet = new HashSet<String>(); boolean muliHost = false; for (int posI = 0; posI < _temp.size(); posI++) { String[] array = _temp.get(posI); if (array == null) continue; String mid = array[2]; String ip = array[3]; ipSet.add(ip); boolean hasIndex = false; for (int i = 0; i < indexList.size(); i++) { StringBuffer index = indexList.get(i); if (index.indexOf("|" + mid + "|") >= 0 || index.indexOf("|" + ip + "|") >= 0) { if (index.indexOf("|" + mid + "|") < 0) { index.append('|').append(mid).append('|'); } if (index.indexOf("|" + ip + "|") < 0) { index.append('|').append(ip).append('|'); } // dataList.get(i).add(array); hasIndex = true; break; } } if (!hasIndex) { StringBuffer index = new StringBuffer("|" + mid + "|" + ip + "|"); // List<String[]> _tmp = new ArrayList<String[]>(); // _tmp.add(array); for (int k = posI + 1; k < _temp.size(); k++) { String[] _newArray = _temp.get(k); if (_newArray == null) { continue; } String _mid = _newArray[2]; String _ip = _newArray[3]; if 
(index.indexOf("|" + _mid + "|") >= 0 || index.indexOf("|" + _ip + "|") >= 0) { if (index.indexOf("|" + _mid + "|") < 0) { index.append('|').append(_mid).append('|'); } if (index.indexOf("|" + _ip + "|") < 0) { index.append('|').append(_ip).append('|'); } // _tmp.add(_newArray); _temp.set(k, null); } } indexList.add(index); // dataList.add(_tmp); } } // for(String[] _array : _temp){ // output.collect(key,Utils.mergeKey(_array[1],_array[2],_array[3],_array[4])); // } StringBuffer allIndex = new StringBuffer(); for (StringBuffer index : indexList) { allIndex.append(index).append(';'); } if (allIndex.length() > 0) { allIndex.deleteCharAt(allIndex.length() - 1); } output.collect( key, Utils.mergeKey(String.valueOf(indexList.size()), StringUtils.join(ipSet, '|'))); }