Example #1
0
 /**
  * Verifies that every source path exists on its own file system.
  *
  * <p>All paths are examined before anything is reported: one {@link IOException} is recorded
  * per missing path and the whole batch is raised together.
  *
  * @param conf configuration used to resolve the file system of each path
  * @param srcPaths source paths to check
  * @throws IOException an {@code InvalidInputException} carrying one entry per missing path, or
  *     whatever the file-system lookup or existence check itself raises
  */
 private static void checkSrcPath(Configuration conf, List<Path> srcPaths) throws IOException {
   final List<IOException> missing = new ArrayList<IOException>();
   for (Path candidate : srcPaths) {
     if (candidate.getFileSystem(conf).exists(candidate)) {
       continue;
     }
     missing.add(new IOException("Input source " + candidate + " does not exist."));
   }
   if (missing.isEmpty()) {
     return;
   }
   throw new InvalidInputException(missing);
 }
Example #2
0
 /**
  * Reads a listing file and returns one {@link Path} per line, in file order.
  *
  * <p>The file is decoded as UTF-8 regardless of the JVM's default charset, so the same listing
  * yields the same paths on every host.
  *
  * @param conf configuration used to resolve the file system of {@code srcList}
  * @param srcList file whose lines are each turned into a {@link Path}
  * @return the paths read from the file; empty if the file has no lines
  * @throws IOException if the file cannot be opened or read
  */
 private static List<Path> fetchFileList(Configuration conf, Path srcList) throws IOException {
   List<Path> result = new ArrayList<Path>();
   FileSystem fs = srcList.getFileSystem(conf);
   BufferedReader input = null;
   try {
     // Explicit charset: the single-argument InputStreamReader constructor would fall back to
     // the platform default encoding. Fully qualified so no extra import is required.
     input =
         new BufferedReader(
             new InputStreamReader(fs.open(srcList), java.nio.charset.Charset.forName("UTF-8")));
     for (String line = input.readLine(); line != null; line = input.readLine()) {
       result.add(new Path(line));
     }
   } finally {
     // input is still null here if opening the file failed.
     checkAndClose(input);
   }
   return result;
 }
  /**
   * Loads the centers that {@code map} compares each point against.
   *
   * <p>Reads the {@code numberOfCenters} and {@code centersReadDirectory} job properties, then for
   * every {@code index} in {@code [0, numberOfCenters)} opens
   * {@code <centersReadDirectory>/centers/<index>}, reads its first record and appends the value
   * to {@code centers}.
   *
   * @param conf job configuration supplying the two properties above
   * @throws NumberFormatException if {@code numberOfCenters} is missing or not an integer
   * @throws IllegalStateException if a center file cannot be opened or read; the underlying
   *     {@link IOException} is attached as the cause
   */
  public void configure(JobConf conf) {
    numberOfCenters = Integer.parseInt(conf.get("numberOfCenters"));
    centersDirectory = conf.get("centersReadDirectory");

    try {
      Configuration c = new Configuration();
      FileSystem fs = FileSystem.get(c);

      for (int index = 0; index < numberOfCenters; ++index) {
        SequenceFile.Reader reader =
            new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c);
        try {
          LongWritable key = new LongWritable();
          Point value = new Point();

          reader.next(key, value);
          centers.add(value);
        } finally {
          // Close even when next() throws so the underlying stream is not leaked.
          reader.close();
        }
      }
    } catch (IOException e) {
      // Fail fast: continuing with a partially filled center list would make map() either
      // index past the end of the list or assign points to the wrong center.
      throw new IllegalStateException(
          "Unable to load cluster centers from " + centersDirectory, e);
    }
  }
  /**
   * Assigns {@code value} to the center with the smallest {@code sumOfSquares} result and emits
   * it keyed by that center's index.
   *
   * <p>Centers {@code 0 .. numberOfCenters - 1} are examined in order; on a tie the lowest index
   * wins because only a strictly smaller result replaces the current best. Two counters are
   * bumped per call: {@code NUMBER/NODES} and {@code CENTER/<chosen index>}.
   *
   * @param key input key (not used)
   * @param value point to assign
   * @param output receives the pair (chosen center index, {@code value})
   * @param reporter counter sink
   * @throws IOException if {@code output.collect} fails
   */
  public void map(
      LongWritable key, Point value, OutputCollector<LongWritable, Point> output, Reporter reporter)
      throws IOException {
    int nearest = 0;
    double nearestDistance = value.sumOfSquares(centers.get(nearest));

    int candidate = 1;
    while (candidate < numberOfCenters) {
      final double distance = value.sumOfSquares(centers.get(candidate));
      if (distance < nearestDistance) {
        nearestDistance = distance;
        nearest = candidate;
      }
      ++candidate;
    }

    reporter.incrCounter("NUMBER", "NODES", 1);
    reporter.incrCounter("CENTER", String.valueOf(nearest), 1);

    output.collect(new LongWritable(nearest), value);
  }
  /**
   * Command-line entry point: parses the arguments, configures the "kmeans" job and runs it once.
   *
   * <p>Recognised switches are {@code -m <numMaps>} and {@code -r <numReduces>}. Exactly two other
   * arguments must remain: the input path followed by the output path. Argument errors are
   * reported on standard output and followed by {@code printUsage()}.
   *
   * @param args command-line arguments
   * @return {@code 0} whenever the method returns normally
   * @throws Exception anything raised while configuring or running the job
   */
  public static int main(String[] args) throws Exception {
    int numMaps = 0;
    int numReds = 0;
    final List<String> positional = new ArrayList<String>();

    int pos = 0;
    while (pos < args.length) {
      try {
        final String current = args[pos];
        if ("-m".equals(current)) {
          pos++;
          numMaps = Integer.parseInt(args[pos]);
        } else if ("-r".equals(current)) {
          pos++;
          numReds = Integer.parseInt(args[pos]);
        } else {
          positional.add(current);
        }
      } catch (NumberFormatException except) {
        // pos already points at the offending value.
        System.out.println("ERROR: Integer expected instead of " + args[pos]);
        printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        // pos ran one past the end, so the switch lacking its value is at pos - 1.
        System.out.println("ERROR: Required parameter missing from " + args[pos - 1]);
        printUsage(); // exits
      }
      pos++;
    }

    // Exactly two positional parameters are required: input and output.
    final int positionalCount = positional.size();
    if (positionalCount != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + positionalCount + " instead of 2.");
      printUsage();
    }

    System.out.println("Job started: " + new Date());

    final JobConf conf = new JobConf(Kmeans.class);
    conf.setJobName("kmeans");
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(ClusterWritable.class);
    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(Reduce.class);
    conf.setNumMapTasks(numMaps);
    conf.setNumReduceTasks(numReds);
    FileInputFormat.setInputPaths(conf, new Path(positional.get(0)));
    FileOutputFormat.setOutputPath(conf, new Path(positional.get(1)));

    final long iterationStart = System.currentTimeMillis();
    JobClient.runJob(conf);
    final long elapsedMillis = System.currentTimeMillis() - iterationStart;
    System.out.println("The iteration took " + elapsedMillis / 1000 + " seconds.");
    return 0;
  }
Example #6
0
  /**
   * String-based convenience overload that assembles an {@code Arguments} object and delegates to
   * {@code copy(Configuration, Arguments)}.
   *
   * <p>No attribute preservation or SSL configuration is passed on, and both limits are set to
   * {@link Long#MAX_VALUE}.
   *
   * @param conf configuration handed to the delegate and used to read the source list
   * @param srcPath a single source path, or a listing file when {@code srcAsList} is true
   * @param destPath destination path
   * @param logPath log path forwarded unchanged to {@code Arguments}
   * @param srcAsList when true, {@code srcPath} is read via {@code fetchFileList} and every line
   *     becomes a source
   * @param ignoreReadFailures when true, {@code Options.IGNORE_READ_FAILURES} is the only flag
   *     set; otherwise no flags are set
   * @throws IOException if reading the source list or the delegated copy fails
   */
  @Deprecated
  public static void copy(
      Configuration conf,
      String srcPath,
      String destPath,
      Path logPath,
      boolean srcAsList,
      boolean ignoreReadFailures)
      throws IOException {
    final Path src = new Path(srcPath);

    final List<Path> sources;
    if (srcAsList) {
      sources = new ArrayList<Path>(fetchFileList(conf, src));
    } else {
      sources = new ArrayList<Path>();
      sources.add(src);
    }

    final EnumSet<Options> flags;
    if (ignoreReadFailures) {
      flags = EnumSet.of(Options.IGNORE_READ_FAILURES);
    } else {
      flags = EnumSet.noneOf(Options.class);
    }

    final Path dst = new Path(destPath);
    final Arguments arguments =
        new Arguments(sources, dst, logPath, flags, null, Long.MAX_VALUE, Long.MAX_VALUE, null);
    copy(conf, arguments);
  }
Example #7
0
    /**
     * Parses command-line arguments into an {@code Arguments} instance.
     *
     * <p>Each argument is handled by the first rule that matches:
     *
     * <ol>
     *   <li>it starts with the {@code cmd} of some {@code Options} constant: the flag is recorded;
     *       {@code PRESERVE_STATUS} additionally takes the text after its first two characters as
     *       the attribute string, and {@code FILE_LIMIT} / {@code SIZE_LIMIT} consume the next
     *       argument as a long;
     *   <li>{@code -f <urilist>}: every line of the listing file becomes a source;
     *   <li>{@code -log <dir>}, {@code -mapredSslConf <file>}, {@code -m <num_maps>} (the latter
     *       is stored in {@code conf} under {@code MAX_MAPS_LABEL});
     *   <li>any other argument starting with {@code '-'} is rejected;
     *   <li>the final argument is the destination; everything else is a source.
     * </ol>
     *
     * @param args command-line arguments
     * @param conf configuration used to read {@code -f} listings; updated by {@code -m}
     * @return the parsed arguments
     * @throws IllegalArgumentException if a switch lacks its value, a switch is unknown, an
     *     argument is empty, no source or destination is given, or the overwrite/update/delete
     *     flags are combined inconsistently
     * @throws IOException if a {@code -f} listing cannot be read
     */
    static Arguments valueOf(String[] args, Configuration conf) throws IOException {
      List<Path> srcs = new ArrayList<Path>();
      Path dst = null;
      Path log = null;
      EnumSet<Options> flags = EnumSet.noneOf(Options.class);
      String preservedAttributes = null;
      String mapredSslConf = null;
      long filelimit = Long.MAX_VALUE;
      long sizelimit = Long.MAX_VALUE;

      // values() returns a fresh array on every call; fetch it once instead of per argument.
      final Options[] opt = Options.values();

      for (int idx = 0; idx < args.length; idx++) {
        // Prefix match, so an option may carry a suffix in the same token.
        int i = 0;
        while (i < opt.length && !args[idx].startsWith(opt[i].cmd)) {
          i++;
        }

        if (i < opt.length) {
          flags.add(opt[i]);
          if (opt[i] == Options.PRESERVE_STATUS) {
            preservedAttributes = args[idx].substring(2);
            FileAttribute.parse(preservedAttributes); // validation
          } else if (opt[i] == Options.FILE_LIMIT) {
            filelimit = Options.FILE_LIMIT.parseLong(args, ++idx);
          } else if (opt[i] == Options.SIZE_LIMIT) {
            sizelimit = Options.SIZE_LIMIT.parseLong(args, ++idx);
          }
        } else if ("-f".equals(args[idx])) {
          if (++idx == args.length) {
            throw new IllegalArgumentException("urilist_uri not specified in -f");
          }
          srcs.addAll(fetchFileList(conf, new Path(args[idx])));
        } else if ("-log".equals(args[idx])) {
          if (++idx == args.length) {
            throw new IllegalArgumentException("logdir not specified in -log");
          }
          log = new Path(args[idx]);
        } else if ("-mapredSslConf".equals(args[idx])) {
          if (++idx == args.length) {
            throw new IllegalArgumentException("ssl conf file not specified in -mapredSslConf");
          }
          mapredSslConf = args[idx];
        } else if ("-m".equals(args[idx])) {
          if (++idx == args.length) {
            throw new IllegalArgumentException("num_maps not specified in -m");
          }
          try {
            conf.setInt(MAX_MAPS_LABEL, Integer.parseInt(args[idx]));
          } catch (NumberFormatException e) {
            // Keep the parse failure as the cause for diagnostics.
            throw new IllegalArgumentException("Invalid argument to -m: " + args[idx], e);
          }
        } else if (args[idx].length() == 0) {
          // Without this guard the first-character test below would fail with
          // StringIndexOutOfBoundsException instead of a usage-style error.
          throw new IllegalArgumentException("Empty argument at position " + idx);
        } else if (args[idx].charAt(0) == '-') {
          throw new IllegalArgumentException("Invalid switch " + args[idx]);
        } else if (idx == args.length - 1) {
          dst = new Path(args[idx]);
        } else {
          srcs.add(new Path(args[idx]));
        }
      }
      // mandatory command-line parameters
      if (srcs.isEmpty() || dst == null) {
        throw new IllegalArgumentException("Missing " + (dst == null ? "dst path" : "src"));
      }
      // incompatible command-line flags
      final boolean isOverwrite = flags.contains(Options.OVERWRITE);
      final boolean isUpdate = flags.contains(Options.UPDATE);
      final boolean isDelete = flags.contains(Options.DELETE);
      if (isOverwrite && isUpdate) {
        throw new IllegalArgumentException("Conflicting overwrite policies");
      }
      if (isDelete && !isOverwrite && !isUpdate) {
        // Name all three options by their command-line spelling.
        throw new IllegalArgumentException(
            Options.DELETE.cmd
                + " must be specified with "
                + Options.OVERWRITE.cmd
                + " or "
                + Options.UPDATE.cmd
                + ".");
      }
      return new Arguments(
          srcs, dst, log, flags, preservedAttributes, filelimit, sizelimit, mapredSslConf);
    }
  /**
   * Groups the records of one key by shared {@code mid} / {@code ip} and emits a single summary
   * value for the key.
   *
   * <p>Every value is split on {@code TAB}; field 2 is used as {@code mid} and field 3 as
   * {@code ip}. Values are buffered first, and buffering stops after at most 100002 of them.
   *
   * <ul>
   *   <li>If more than 10000 values were buffered, no grouping is attempted: the output is the
   *       number of distinct IPs followed by the IPs joined with {@code '|'}.
   *   <li>Otherwise records are partitioned into groups. Each group is represented by an index
   *       string of the form {@code |mid|ip|...}; a record belongs to a group when the index
   *       already contains its {@code mid} or its {@code ip}. The output is the number of groups
   *       followed by the collected IPs joined with {@code '|'}.
   * </ul>
   *
   * @param key key shared by all {@code values}; reused as the output key
   * @param values tab-separated records for {@code key}
   * @param output receives exactly one pair per call
   * @param reporter not used
   * @throws IOException if {@code output.collect} fails
   */
  public void reduce(
      Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    List<String[]> rows = new ArrayList<String[]>();
    int count = 0;
    while (values.hasNext()) {
      Text _out = values.next();
      rows.add(StringUtils.splitPreserveAllTokens(_out.toString(), TAB));
      // Cap the amount buffered for a single key.
      if (count++ > 100000) {
        break;
      }
    }

    if (count > 10000) {
      // Too many records to group pairwise: report distinct IPs only.
      Set<String> distinctIps = new HashSet<String>();
      for (String[] row : rows) {
        if (row == null) {
          continue;
        }
        distinctIps.add(row[3]);
      }
      output.collect(
          key,
          Utils.mergeKey(String.valueOf(distinctIps.size()), StringUtils.join(distinctIps, '|')));
      return;
    }

    // Grouping algorithm: for each remaining record, if an existing index already knows its mid
    // or ip, extend that index; otherwise start a new index from the record and sweep the later
    // records, absorbing every one that matches and blanking it out so it is not visited again.
    List<StringBuilder> indexList = new ArrayList<StringBuilder>();
    Set<String> ipSet = new HashSet<String>();
    for (int posI = 0; posI < rows.size(); posI++) {
      String[] array = rows.get(posI);
      if (array == null) {
        continue; // already absorbed by an earlier group's forward sweep
      }

      String mid = array[2];
      String ip = array[3];
      ipSet.add(ip);

      boolean hasIndex = false;
      for (StringBuilder index : indexList) {
        if (absorbIntoGroup(index, mid, ip)) {
          hasIndex = true;
          break;
        }
      }
      if (hasIndex) {
        continue;
      }

      StringBuilder index = new StringBuilder("|" + mid + "|" + ip + "|");
      for (int k = posI + 1; k < rows.size(); k++) {
        String[] later = rows.get(k);
        if (later == null) {
          continue;
        }
        // NOTE(review): the ip of a record absorbed here is never added to ipSet, so the emitted
        // IP list can be missing addresses that the large-group path above would include.
        // Kept as in the original; confirm whether that is intended.
        if (absorbIntoGroup(index, later[2], later[3])) {
          rows.set(k, null);
        }
      }
      indexList.add(index);
    }

    output.collect(
        key, Utils.mergeKey(String.valueOf(indexList.size()), StringUtils.join(ipSet, '|')));
  }

  /**
   * Adds {@code mid} and {@code ip} to {@code index} if the index already contains either one.
   *
   * @return true if the index contained {@code |mid|} or {@code |ip|} (whichever was missing has
   *     then been appended); false if it contained neither, in which case it is left untouched
   */
  private boolean absorbIntoGroup(StringBuilder index, String mid, String ip) {
    boolean hasMid = index.indexOf("|" + mid + "|") >= 0;
    boolean hasIp = index.indexOf("|" + ip + "|") >= 0;
    if (!hasMid && !hasIp) {
      return false;
    }
    if (!hasMid) {
      index.append('|').append(mid).append('|');
    }
    if (!hasIp) {
      index.append('|').append(ip).append('|');
    }
    return true;
  }