示例#1
0
      public void reduce(
          BytesWritable key,
          Iterator<IntWritable> values,
          OutputCollector<BytesWritable, IntWritable> output,
          Reporter reporter)
          throws IOException {
        int ones = 0;
        int twos = 0;
        while (values.hasNext()) {
          IntWritable count = values.next();
          if (count.equals(sortInput)) {
            ++ones;
          } else if (count.equals(sortOutput)) {
            ++twos;
          } else {
            throw new IOException(
                "Invalid 'value' of " + count.get() + " for (key,value): " + key.toString());
          }
        }

        // Check to ensure there are equal no. of ones and twos
        if (ones != twos) {
          throw new IOException(
              "Illegal ('one', 'two'): ("
                  + ones
                  + ", "
                  + twos
                  + ") for (key, value): "
                  + key.toString());
        }
      }
示例#2
0
    public void reduce(
        Text key,
        Iterator<CrawlDatum> values,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      boolean oldSet = false;
      boolean injectedSet = false;
      while (values.hasNext()) {
        CrawlDatum val = values.next();
        if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
          injected.set(val);
          injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
          injectedSet = true;
        } else {
          old.set(val);
          oldSet = true;
        }
      }
      CrawlDatum res = null;

      /**
       * Whether to overwrite, ignore or update existing records
       *
       * @see https://issues.apache.org/jira/browse/NUTCH-1405
       */

      // Injected record already exists and overwrite but not update
      if (injectedSet && oldSet && overwrite) {
        res = injected;

        if (update) {
          LOG.info(key.toString() + " overwritten with injected record but update was specified.");
        }
      }

      // Injected record already exists and update but not overwrite
      if (injectedSet && oldSet && update && !overwrite) {
        res = old;
        old.putAllMetaData(injected);
        old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
        old.setFetchInterval(
            injected.getFetchInterval() != interval
                ? injected.getFetchInterval()
                : old.getFetchInterval());
      }

      // Old default behaviour
      if (injectedSet && !oldSet) {
        res = injected;
      } else {
        res = old;
      }

      output.collect(key, res);
    }
 public void reduce(
     Text key,
     Iterator<IntWritable> values,
     OutputCollector<Text, IntWritable> output,
     Reporter reporter)
     throws IOException {
   int sum = 0;
   while (values.hasNext()) {
     sum += values.next().get();
   }
   output.collect(key, new IntWritable(sum));
 }
    public void reduce(
        Text key,
        Iterator<IntWritable> value,
        OutputCollector<Text, IntWritable> output,
        Reporter reporter)
        throws IOException {
      // TODO Auto-generated method stub
      int maxAvg = 30;
      int val = Integer.MIN_VALUE;
      while (value.hasNext()) {

        if ((val = value.next().get()) > maxAvg) {
          output.collect(key, new IntWritable(val));
        }
      }
    }
示例#5
0
      public void reduce(
          IntWritable key,
          Iterator<RecordStatsWritable> values,
          OutputCollector<IntWritable, RecordStatsWritable> output,
          Reporter reporter)
          throws IOException {
        long bytes = 0;
        long records = 0;
        int xor = 0;
        while (values.hasNext()) {
          RecordStatsWritable stats = values.next();
          bytes += stats.getBytes();
          records += stats.getRecords();
          xor ^= stats.getChecksum();
        }

        output.collect(key, new RecordStatsWritable(bytes, records, xor));
      }
示例#6
0
    public void reduce(
        IntWritable key,
        Iterator<Text> values,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {
      HashMap<String, Integer> countries_map = new HashMap<String, Integer>();
      ArrayList<Integer> counties = new ArrayList<>();
      String cp = new String();

      while (values.hasNext()) {
        cp = values.next().toString();
        if (countries_map.containsKey(cp)) {
          countries_map.put(cp, countries_map.get(cp) + 1);
        } else {
          countries_map.put(cp, 1);
        }
      }

      for (java.util.Map.Entry<String, Integer> entry : countries_map.entrySet()) {
        counties.add(entry.getValue());
      }
      output.collect(
          key,
          new Text(
              ""
                  + countries_map.entrySet().size()
                  + " "
                  + Collections.min(counties)
                  + " "
                  + median(counties)
                  + " "
                  + Collections.max(counties)
                  + " "
                  + mean(counties)
                  + " "
                  + standard_deviation(counties)));
    }
示例#7
0
  /**
   * Initialize DFSCopyFileMapper specific job-configuration.
   *
   * @param conf : The dfs/mapred configuration.
   * @param jobConf : The handle to the jobConf object to be initialized.
   * @param args Arguments
   */
  private static void setup(Configuration conf, JobConf jobConf, final Arguments args)
      throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(
        Options.IGNORE_READ_FAILURES.propertyname,
        args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(
        Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
      dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
      String filename = "_distcp_logs_" + randomId;
      if (!dstExists || !dstIsDir) {
        Path parent = args.dst.getParent();
        if (!dstfs.exists(parent)) {
          dstfs.mkdirs(parent);
        }
        logPath = new Path(parent, filename);
      } else {
        logPath = new Path(args.dst, filename);
      }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            srcfilelist,
            LongWritable.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer =
        SequenceFile.createWriter(
            jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            dstdirlist,
            Text.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
      for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext(); ) {
        final Path src = srcItr.next();
        FileSystem srcfs = src.getFileSystem(conf);
        FileStatus srcfilestat = srcfs.getFileStatus(src);
        Path root = special && srcfilestat.isDir() ? src : src.getParent();
        if (srcfilestat.isDir()) {
          ++srcCount;
        }

        Stack<FileStatus> pathstack = new Stack<FileStatus>();
        for (pathstack.push(srcfilestat); !pathstack.empty(); ) {
          FileStatus cur = pathstack.pop();
          FileStatus[] children = srcfs.listStatus(cur.getPath());
          for (int i = 0; i < children.length; i++) {
            boolean skipfile = false;
            final FileStatus child = children[i];
            final String dst = makeRelative(root, child.getPath());
            ++srcCount;

            if (child.isDir()) {
              pathstack.push(child);
            } else {
              // skip file if the src and the dst files are the same.
              skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
              // skip file if it exceed file limit or size limit
              skipfile |=
                  fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

              if (!skipfile) {
                ++fileCount;
                byteCount += child.getLen();

                if (LOG.isTraceEnabled()) {
                  LOG.trace("adding file " + child.getPath());
                }

                ++cnsyncf;
                cbsyncs += child.getLen();
                if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                  src_writer.sync();
                  dst_writer.sync();
                  cnsyncf = 0;
                  cbsyncs = 0L;
                }
              }
            }

            if (!skipfile) {
              src_writer.append(
                  new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst));
            }

            dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
          }

          if (cur.isDir()) {
            String dst = makeRelative(root, cur.getPath());
            dir_writer.append(new Text(dst), new FilePair(cur, dst));
            if (++dirsyn > SYNC_FILE_MAX) {
              dirsyn = 0;
              dir_writer.sync();
            }
          }
        }
      }
    } finally {
      checkAndClose(src_writer);
      checkAndClose(dst_writer);
      checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
      dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
      LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
      if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
        throw new IOException("Failed to create" + args.dst);
      }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
      deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir =
        new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1)
                ? args.dst.getParent()
                : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
  }
    @Override
    public void reduce(
        IntWritable key,
        Iterator<ClusterWritable> values,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {

      float sumSimilarity = 0.0f;
      int numMovies = 0;
      float avgSimilarity = 0.0f;
      float similarity = 0.0f;
      int s = 0;
      int count;
      float diff = 0.0f;
      float minDiff = 1.0f;
      int candidate = 0;
      String data = new String("");
      String shortline = new String("");
      ArrayList<String> arrl = new ArrayList<String>();
      ArrayList<Float> simArrl = new ArrayList<Float>();
      String oneElm = new String();
      int indexShort, index2;
      Text val = new Text();

      while (values.hasNext()) {
        ClusterWritable cr = (ClusterWritable) values.next();
        similarity = cr.similarity;
        simArrl.addAll(cr.similarities);
        for (int i = 0; i < cr.movies.size(); i++) {
          oneElm = cr.movies.get(i);
          indexShort =
              oneElm.indexOf(
                  ",",
                  1000); // to avoid memory error caused by long arrays; it will results less
                         // accurate
          if (indexShort == -1) {
            shortline = new String(oneElm);
          } else {
            shortline = new String(oneElm.substring(0, indexShort));
          }
          arrl.add(shortline);
          output.collect(key, new Text(oneElm));
        }
        numMovies += cr.movies.size();
        sumSimilarity += similarity;
      }
      if (numMovies > 0) {
        avgSimilarity = sumSimilarity / (float) numMovies;
      }
      diff = 0.0f;
      minDiff = 1.0f;
      for (s = 0; s < numMovies; s++) {
        diff = (float) Math.abs(avgSimilarity - simArrl.get(s));
        if (diff < minDiff) {
          minDiff = diff;
          candidate = s;
        }
      }
      data = arrl.get(candidate);
      index2 = data.indexOf(":");
      String movieStr = data.substring(0, index2);
      String reviews = data.substring(index2 + 1);
      StringTokenizer token = new StringTokenizer(reviews, ",");
      count = 0;
      while (token.hasMoreTokens()) {
        token.nextToken();
        count++;
      }
      System.out.println(
          "The key = "
              + key.toString()
              + " has members = "
              + numMovies
              + " simil = "
              + simArrl.get(candidate));
      val = new Text(simArrl.get(candidate) + " " + movieStr + " " + count + " " + reviews);
      output.collect(key, val);
      reporter.incrCounter(Counter.VALUES, 1);
    }
  public void reduce(
      Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    List<String[]> _temp = new ArrayList<String[]>();
    int count = 0;
    while (values.hasNext()) {
      Text _out = values.next();
      String[] tokens = StringUtils.splitPreserveAllTokens(_out.toString(), TAB);
      _temp.add(tokens);
      if (count++ > 100000) break;
    }

    if (count > 10000) {
      Set<String> ipSet = new HashSet<String>();
      for (int posI = 0; posI < _temp.size(); posI++) {
        String[] array = _temp.get(posI);
        if (array == null) continue;

        String mid = array[2];
        String ip = array[3];
        ipSet.add(ip);
      }
      output.collect(
          key, Utils.mergeKey(String.valueOf(ipSet.size()), StringUtils.join(ipSet, '|')));
      return;
    }

    /**
     * ·Ö×éËã·¨ FOREACH ALL_DATA IF IN INDEX THEN UPDATE INDEX AND INSERT DATA ELSE FOREACH SUB_DATA
     * MAKE INDEX AND SET FIND'S DATA AS NULL
     */
    // List<List<String[]>> dataList = new ArrayList<List<String[]>>();
    List<StringBuffer> indexList = new ArrayList<StringBuffer>();
    Set<String> ipSet = new HashSet<String>();
    boolean muliHost = false;
    for (int posI = 0; posI < _temp.size(); posI++) {
      String[] array = _temp.get(posI);
      if (array == null) continue;

      String mid = array[2];
      String ip = array[3];
      ipSet.add(ip);
      boolean hasIndex = false;
      for (int i = 0; i < indexList.size(); i++) {
        StringBuffer index = indexList.get(i);
        if (index.indexOf("|" + mid + "|") >= 0 || index.indexOf("|" + ip + "|") >= 0) {
          if (index.indexOf("|" + mid + "|") < 0) {
            index.append('|').append(mid).append('|');
          }

          if (index.indexOf("|" + ip + "|") < 0) {
            index.append('|').append(ip).append('|');
          }
          // dataList.get(i).add(array);
          hasIndex = true;
          break;
        }
      }
      if (!hasIndex) {
        StringBuffer index = new StringBuffer("|" + mid + "|" + ip + "|");
        // List<String[]> _tmp = new ArrayList<String[]>();
        // _tmp.add(array);
        for (int k = posI + 1; k < _temp.size(); k++) {
          String[] _newArray = _temp.get(k);
          if (_newArray == null) {
            continue;
          }
          String _mid = _newArray[2];
          String _ip = _newArray[3];
          if (index.indexOf("|" + _mid + "|") >= 0 || index.indexOf("|" + _ip + "|") >= 0) {
            if (index.indexOf("|" + _mid + "|") < 0) {
              index.append('|').append(_mid).append('|');
            }

            if (index.indexOf("|" + _ip + "|") < 0) {
              index.append('|').append(_ip).append('|');
            }
            // _tmp.add(_newArray);
            _temp.set(k, null);
          }
        }
        indexList.add(index);
        // dataList.add(_tmp);
      }
    }
    //        for(String[] _array : _temp){
    //            output.collect(key,Utils.mergeKey(_array[1],_array[2],_array[3],_array[4]));
    //        }

    StringBuffer allIndex = new StringBuffer();
    for (StringBuffer index : indexList) {
      allIndex.append(index).append(';');
    }
    if (allIndex.length() > 0) {
      allIndex.deleteCharAt(allIndex.length() - 1);
    }
    output.collect(
        key, Utils.mergeKey(String.valueOf(indexList.size()), StringUtils.join(ipSet, '|')));
  }
示例#10
0
    public void reduce(
        BlockIDWritable key,
        Iterator<PairWritable> values,
        OutputCollector<BlockIDWritable, PairWritable> output,
        Reporter reporter)
        throws IOException {
      // Vector<Star> starV = new Vector<Star>();
      int buketsizeX = 0;
      int buketsizeY = 0;
      double bwidth = maxAlphas[key.zoneNum]; // ra ,x
      double bheight = theta; // dec ,y
      /* add 10 more in each dimension to make sure there is no overflow. */
      Vector<Star>[][] arrstarV =
          new Vector[((int) (zoneHeight / bheight)) + 10]
              [((int) (blockWidth / bwidth)) + 10]; // create bucket vector[Y][X]

      int num = 0;
      while (values.hasNext()) {
        num++;
        Star s = values.next().get(0);

        // participant
        double posx = (s.ra - blockRanges[key.raNum][0]) / bwidth;
        int x = (int) posx + 1; // shit by 1 in case star comes from other block
        double posy = (s.dec - zoneRanges[key.zoneNum][0]) / bheight;
        int y = (int) posy + 1;

        // set bucket size as max
        if (buketsizeX < x) buketsizeX = x;
        if (buketsizeY < y) buketsizeY = y;
        // create according bucket
        if (arrstarV[y][x] == null)
          // TODO avaoid creating vectors here.
          arrstarV[y][x] = new Vector<Star>();
        // put star into bucket
        arrstarV[y][x].add(s);
      }
      // start reducer
      int i, j, row, col;
      // for each bucket
      for (row = 0; row <= buketsizeY; row++) {
        for (col = 0; col <= buketsizeX; col++) {
          //		starV.clear();
          // construct a new vector to do compare
          // TODO we need to avoid searching objects in the border.
          if (arrstarV[row][col] != null) {
            // old method to generate output
            for (i = 0; i < arrstarV[row][col].size(); i++) {
              for (j = i + 1; j < arrstarV[row][col].size(); j++) {
                Star star1 = arrstarV[row][col].get(i);
                Star star2 = arrstarV[row][col].get(j);
                // what is this margin about
                if (star1.margin && star2.margin) continue;

                double dist = star1.x * star2.x + star1.y * star2.y + star1.z * star2.z;
                if (dist > costheta) {
                  p.set(star1, star2, dist);
                  output.collect(key, p);
                  p.set(star2, star1, dist);
                  output.collect(key, p);
                  //		num += 2;

                }
              }
            } // end for i,j

          } // end if
          else {
            continue;
          }
          // 4 more neighbors
          // right upper arrstarV[row-1][col+1] vs arrstarV[row][col]
          if (row != 0 && arrstarV[row - 1][col + 1] != null) {
            search(arrstarV[row][col], arrstarV[row - 1][col + 1], key, output);
          }
          // right arrstarV[row][col+1] vs arrstarV[row][col]
          if (arrstarV[row][col + 1] != null) {
            search(arrstarV[row][col], arrstarV[row][col + 1], key, output);
          }
          // right lower
          if (arrstarV[row + 1][col + 1] != null) {
            search(arrstarV[row][col], arrstarV[row + 1][col + 1], key, output);
          }
          // lower
          if (arrstarV[row + 1][col] != null) {
            search(arrstarV[row][col], arrstarV[row + 1][col], key, output);
          } // end if
        } // end colum
      } // end row
    }
示例#11
0
    public void map(
        WritableComparable<?> key,
        Text value,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      String url = value.toString(); // value is line of text

      if (url != null && url.trim().startsWith("#")) {
        /* Ignore line that start with # */
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      int fixedInterval = -1;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find separation between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
            try {
              fixedInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else metadata.put(metaname, metavalue);
        }
      }
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url); // filter the url
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Skipping " + url + ":" + e);
        }
        url = null;
      }
      if (url == null) {
        reporter.getCounter("injector", "urls_filtered").increment(1);
      } else { // if it passes
        value.set(url); // collect it
        CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_INJECTED);

        // Is interval custom? Then set as meta data
        if (fixedInterval > -1) {
          // Set writable using float. Flaot is used by AdaptiveFetchSchedule
          datum
              .getMetaData()
              .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
          datum.setFetchInterval(fixedInterval);
        } else {
          datum.setFetchInterval(customInterval);
        }

        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(
                "Cannot filter injected score for url "
                    + url
                    + ", using default ("
                    + e.getMessage()
                    + ")");
          }
        }
        reporter.getCounter("injector", "urls_injected").increment(1);
        output.collect(value, datum);
      }
    }