Exemplo n.º 1
0
 public String getDocString() {
   StringBuffer buf = new StringBuffer();
   buf.append("Example data: ");
   for (Iterator<String> it = sampleStrs.iterator(); it.hasNext(); ) {
     String tokStr = it.next();
     buf.append("'" + tokStr + "'");
     if (it.hasNext()) {
       buf.append(", ");
     }
   }
   return buf.toString();
 }
Exemplo n.º 2
0
 long computeContentsLength() {
   long total = computeFileLength();
   for (Iterator it = children.values().iterator(); it.hasNext(); ) {
     INode child = (INode) it.next();
     total += child.computeContentsLength();
   }
   return total;
 }
Exemplo n.º 3
0
 int numItemsInTree() {
   int total = 0;
   for (Iterator it = children.values().iterator(); it.hasNext(); ) {
     INode child = (INode) it.next();
     total += child.numItemsInTree();
   }
   return total + 1;
 }
Exemplo n.º 4
0
    public void reduce(
        Text key,
        Iterator<CrawlDatum> values,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      boolean oldSet = false;
      boolean injectedSet = false;
      while (values.hasNext()) {
        CrawlDatum val = values.next();
        if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
          injected.set(val);
          injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
          injectedSet = true;
        } else {
          old.set(val);
          oldSet = true;
        }
      }
      CrawlDatum res = null;

      /**
       * Whether to overwrite, ignore or update existing records
       *
       * @see https://issues.apache.org/jira/browse/NUTCH-1405
       */

      // Injected record already exists and overwrite but not update
      if (injectedSet && oldSet && overwrite) {
        res = injected;

        if (update) {
          LOG.info(key.toString() + " overwritten with injected record but update was specified.");
        }
      }

      // Injected record already exists and update but not overwrite
      if (injectedSet && oldSet && update && !overwrite) {
        res = old;
        old.putAllMetaData(injected);
        old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
        old.setFetchInterval(
            injected.getFetchInterval() != interval
                ? injected.getFetchInterval()
                : old.getFetchInterval());
      }

      // Old default behaviour
      if (injectedSet && !oldSet) {
        res = injected;
      } else {
        res = old;
      }

      output.collect(key, res);
    }
Exemplo n.º 5
0
    void listContents(Vector v) {
      if (parent != null && blocks != null) {
        v.add(this);
      }

      for (Iterator it = children.values().iterator(); it.hasNext(); ) {
        INode child = (INode) it.next();
        v.add(child);
      }
    }
Exemplo n.º 6
0
 /**
  * Collect all the blocks at this INode and all its children. This operation is performed after
  * a node is removed from the tree, and we want to GC all the blocks at this node and below.
  */
 void collectSubtreeBlocks(Vector v) {
   if (blocks != null) {
     for (int i = 0; i < blocks.length; i++) {
       v.add(blocks[i]);
     }
   }
   for (Iterator it = children.values().iterator(); it.hasNext(); ) {
     INode child = (INode) it.next();
     child.collectSubtreeBlocks(v);
   }
 }
Exemplo n.º 7
0
 public void reduce(
     Text key,
     Iterator<IntWritable> values,
     OutputCollector<Text, IntWritable> output,
     Reporter reporter)
     throws IOException {
   int sum = 0;
   while (values.hasNext()) {
     sum += values.next().get();
   }
   output.collect(key, new IntWritable(sum));
 }
Exemplo n.º 8
0
    public void reduce(
        Text key,
        Iterator<IntWritable> value,
        OutputCollector<Text, IntWritable> output,
        Reporter reporter)
        throws IOException {
      // TODO Auto-generated method stub
      int maxAvg = 30;
      int val = Integer.MIN_VALUE;
      while (value.hasNext()) {

        if ((val = value.next().get()) > maxAvg) {
          output.collect(key, new IntWritable(val));
        }
      }
    }
Exemplo n.º 9
0
 void saveImage(String parentPrefix, DataOutputStream out) throws IOException {
   String fullName = "";
   if (parent != null) {
     fullName = parentPrefix + "/" + name;
     new UTF8(fullName).write(out);
     if (blocks == null) {
       out.writeInt(0);
     } else {
       out.writeInt(blocks.length);
       for (int i = 0; i < blocks.length; i++) {
         blocks[i].write(out);
       }
     }
   }
   for (Iterator it = children.values().iterator(); it.hasNext(); ) {
     INode child = (INode) it.next();
     child.saveImage(fullName, out);
   }
 }
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {

      Iterator it = carrierAvgDelay.entrySet().iterator();
      while (it.hasNext()) {
        Map.Entry pair = (Map.Entry) it.next();

        TreeSet<Pair<Float, String>> ts = (TreeSet<Pair<Float, String>>) pair.getValue();
        for (Pair<Float, String> item : ts) {
          String[] strings = {pair.getKey() + "-" + item.second, item.first.toString()};
          TextArrayWritable val = new TextArrayWritable(strings);
          context.write(NullWritable.get(), val);
        }
        /*
                        String[] strings = {pair.getKey()+"-XYZ", "7.1"};
                        TextArrayWritable val = new TextArrayWritable(strings);
                        context.write(NullWritable.get(), val);
        */
      }
    }
Exemplo n.º 11
0
  /**
   * Get a listing of files given path 'src'
   *
   * <p>This function is admittedly very inefficient right now. We'll make it better later.
   */
  public DFSFileInfo[] getListing(UTF8 src) {
    String srcs = normalizePath(src);

    synchronized (rootDir) {
      INode targetNode = rootDir.getNode(srcs);
      if (targetNode == null) {
        return null;
      } else {
        Vector contents = new Vector();
        targetNode.listContents(contents);

        DFSFileInfo listing[] = new DFSFileInfo[contents.size()];
        int i = 0;
        for (Iterator it = contents.iterator(); it.hasNext(); i++) {
          listing[i] = new DFSFileInfo((INode) it.next());
        }
        return listing;
      }
    }
  }
    public void reduce(
        IntWritable sameNum,
        Iterator<Text> data,
        OutputCollector<Text, jBLASArrayWritable> output,
        Reporter reporter)
        throws IOException {
      int totalBatchCount = exampleCount / batchSize;

      DoubleMatrix weights = DoubleMatrix.randn(hiddenNodes, visibleNodes);
      DoubleMatrix hbias = DoubleMatrix.zeros(hiddenNodes);
      DoubleMatrix vbias = DoubleMatrix.zeros(visibleNodes);
      DoubleMatrix label = DoubleMatrix.zeros(1);
      DoubleMatrix hidden_chain = null;
      DoubleMatrix vdata = DoubleMatrix.zeros(batchSize, visibleNodes);

      ArrayList<DoubleMatrix> outputmatricies = new ArrayList<DoubleMatrix>();
      outputmatricies.add(weights);
      outputmatricies.add(hbias);
      outputmatricies.add(vbias);
      outputmatricies.add(label);
      outputmatricies.add(vdata);
      outputmatricies.add(hidden_chain);

      int j;
      for (int i = 0; i < totalBatchCount; i++) {
        j = 0;
        while (data.hasNext() && j < batchSize) {
          j++;
          StringTokenizer tk = new StringTokenizer(data.next().toString());
          label.put(0, Double.parseDouble(tk.nextToken()));
          String image = tk.nextToken();
          for (int k = 0; k < image.length(); k++) {
            Integer val = new Integer(image.charAt(k));
            vdata.put(j, k, val.doubleValue());
          }
          dataArray = new jBLASArrayWritable(outputmatricies);
          batchID.set("1\t" + i);
          output.collect(batchID, dataArray);
        }
      }
    }
    @Override
    public void reduce(NullWritable key, Iterable<TextArrayWritable> values, Context context)
        throws IOException, InterruptedException {
      for (TextArrayWritable val : values) {
        Text[] pair = (Text[]) val.toArray();

        String word = pair[0].toString();
        Float avg = Float.parseFloat(pair[1].toString());
        /*
                        Text tWord = new Text(word);
                        FloatWritable value = new FloatWritable(avg);
                        context.write(tWord, value);
        */
        String[] parts = word.split("-");
        String newKey = parts[0] + "-" + parts[1];
        String carrier = parts[2];

        if (!carrierAvgDelay.containsKey(newKey)) {
          TreeSet<Pair<Float, String>> avgDelayMap = new TreeSet<Pair<Float, String>>();
          carrierAvgDelay.put(newKey, avgDelayMap);
        }
        TreeSet<Pair<Float, String>> ts = carrierAvgDelay.get(newKey);
        ts.add(new Pair<Float, String>(avg, carrier));
        if (ts.size() > 10) {
          ts.remove(ts.last());
        }
        carrierAvgDelay.put(newKey, ts);
      }

      Iterator it = carrierAvgDelay.entrySet().iterator();
      while (it.hasNext()) {
        Map.Entry pair = (Map.Entry) it.next();
        TreeSet<Pair<Float, String>> ts = (TreeSet<Pair<Float, String>>) pair.getValue();
        for (Pair<Float, String> item : ts) {
          Text word = new Text(pair.getKey() + "-" + item.second);
          FloatWritable value = new FloatWritable(item.first);
          context.write(word, value);
        }
      }
    }
Exemplo n.º 14
0
    public void reduce(
        IntWritable key,
        Iterator<Text> values,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {
      HashMap<String, Integer> countries_map = new HashMap<String, Integer>();
      ArrayList<Integer> counties = new ArrayList<>();
      String cp = new String();

      while (values.hasNext()) {
        cp = values.next().toString();
        if (countries_map.containsKey(cp)) {
          countries_map.put(cp, countries_map.get(cp) + 1);
        } else {
          countries_map.put(cp, 1);
        }
      }

      for (java.util.Map.Entry<String, Integer> entry : countries_map.entrySet()) {
        counties.add(entry.getValue());
      }
      output.collect(
          key,
          new Text(
              ""
                  + countries_map.entrySet().size()
                  + " "
                  + Collections.min(counties)
                  + " "
                  + median(counties)
                  + " "
                  + Collections.max(counties)
                  + " "
                  + mean(counties)
                  + " "
                  + standard_deviation(counties)));
    }
Exemplo n.º 15
0
 boolean unprotectedRenameTo(UTF8 src, UTF8 dst) {
   synchronized (rootDir) {
     INode removedNode = rootDir.getNode(src.toString());
     if (removedNode == null) {
       return false;
     }
     removedNode.removeNode();
     if (isDir(dst)) {
       dst = new UTF8(dst.toString() + "/" + new File(src.toString()).getName());
     }
     INode newNode = rootDir.addNode(dst.toString(), removedNode.blocks);
     if (newNode != null) {
       newNode.children = removedNode.children;
       for (Iterator it = newNode.children.values().iterator(); it.hasNext(); ) {
         INode child = (INode) it.next();
         child.parent = newNode;
       }
       return true;
     } else {
       rootDir.addNode(src.toString(), removedNode.blocks);
       return false;
     }
   }
 }
Exemplo n.º 16
0
 Block[] unprotectedDelete(UTF8 src) {
   synchronized (rootDir) {
     INode targetNode = rootDir.getNode(src.toString());
     if (targetNode == null) {
       return null;
     } else {
       //
       // Remove the node from the namespace and GC all
       // the blocks underneath the node.
       //
       if (!targetNode.removeNode()) {
         return null;
       } else {
         Vector v = new Vector();
         targetNode.collectSubtreeBlocks(v);
         for (Iterator it = v.iterator(); it.hasNext(); ) {
           Block b = (Block) it.next();
           activeBlocks.remove(b);
         }
         return (Block[]) v.toArray(new Block[v.size()]);
       }
     }
   }
 }
Exemplo n.º 17
0
    public void map(
        WritableComparable<?> key,
        Text value,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      String url = value.toString(); // value is line of text

      if (url != null && url.trim().startsWith("#")) {
        /* Ignore line that start with # */
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      int fixedInterval = -1;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find separation between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
            try {
              fixedInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else metadata.put(metaname, metavalue);
        }
      }
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url); // filter the url
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Skipping " + url + ":" + e);
        }
        url = null;
      }
      if (url == null) {
        reporter.getCounter("injector", "urls_filtered").increment(1);
      } else { // if it passes
        value.set(url); // collect it
        CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_INJECTED);

        // Is interval custom? Then set as meta data
        if (fixedInterval > -1) {
          // Set writable using float. Flaot is used by AdaptiveFetchSchedule
          datum
              .getMetaData()
              .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
          datum.setFetchInterval(fixedInterval);
        } else {
          datum.setFetchInterval(customInterval);
        }

        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(
                "Cannot filter injected score for url "
                    + url
                    + ", using default ("
                    + e.getMessage()
                    + ")");
          }
        }
        reporter.getCounter("injector", "urls_injected").increment(1);
        output.collect(value, datum);
      }
    }
Exemplo n.º 18
0
    public void reduce(
        BlockIDWritable key,
        Iterator<PairWritable> values,
        OutputCollector<BlockIDWritable, PairWritable> output,
        Reporter reporter)
        throws IOException {
      // Vector<Star> starV = new Vector<Star>();
      int buketsizeX = 0;
      int buketsizeY = 0;
      double bwidth = maxAlphas[key.zoneNum]; // ra ,x
      double bheight = theta; // dec ,y
      /* add 10 more in each dimension to make sure there is no overflow. */
      Vector<Star>[][] arrstarV =
          new Vector[((int) (zoneHeight / bheight)) + 10]
              [((int) (blockWidth / bwidth)) + 10]; // create bucket vector[Y][X]

      int num = 0;
      while (values.hasNext()) {
        num++;
        Star s = values.next().get(0);

        // participant
        double posx = (s.ra - blockRanges[key.raNum][0]) / bwidth;
        int x = (int) posx + 1; // shit by 1 in case star comes from other block
        double posy = (s.dec - zoneRanges[key.zoneNum][0]) / bheight;
        int y = (int) posy + 1;

        // set bucket size as max
        if (buketsizeX < x) buketsizeX = x;
        if (buketsizeY < y) buketsizeY = y;
        // create according bucket
        if (arrstarV[y][x] == null)
          // TODO avaoid creating vectors here.
          arrstarV[y][x] = new Vector<Star>();
        // put star into bucket
        arrstarV[y][x].add(s);
      }
      // start reducer
      int i, j, row, col;
      // for each bucket
      for (row = 0; row <= buketsizeY; row++) {
        for (col = 0; col <= buketsizeX; col++) {
          //		starV.clear();
          // construct a new vector to do compare
          // TODO we need to avoid searching objects in the border.
          if (arrstarV[row][col] != null) {
            // old method to generate output
            for (i = 0; i < arrstarV[row][col].size(); i++) {
              for (j = i + 1; j < arrstarV[row][col].size(); j++) {
                Star star1 = arrstarV[row][col].get(i);
                Star star2 = arrstarV[row][col].get(j);
                // what is this margin about
                if (star1.margin && star2.margin) continue;

                double dist = star1.x * star2.x + star1.y * star2.y + star1.z * star2.z;
                if (dist > costheta) {
                  p.set(star1, star2, dist);
                  output.collect(key, p);
                  p.set(star2, star1, dist);
                  output.collect(key, p);
                  //		num += 2;

                }
              }
            } // end for i,j

          } // end if
          else {
            continue;
          }
          // 4 more neighbors
          // right upper arrstarV[row-1][col+1] vs arrstarV[row][col]
          if (row != 0 && arrstarV[row - 1][col + 1] != null) {
            search(arrstarV[row][col], arrstarV[row - 1][col + 1], key, output);
          }
          // right arrstarV[row][col+1] vs arrstarV[row][col]
          if (arrstarV[row][col + 1] != null) {
            search(arrstarV[row][col], arrstarV[row][col + 1], key, output);
          }
          // right lower
          if (arrstarV[row + 1][col + 1] != null) {
            search(arrstarV[row][col], arrstarV[row + 1][col + 1], key, output);
          }
          // lower
          if (arrstarV[row + 1][col] != null) {
            search(arrstarV[row][col], arrstarV[row + 1][col], key, output);
          } // end if
        } // end colum
      } // end row
    }
Exemplo n.º 19
0
  /**
   * Initialize DFSCopyFileMapper specific job-configuration.
   *
   * @param conf : The dfs/mapred configuration.
   * @param jobConf : The handle to the jobConf object to be initialized.
   * @param args Arguments
   */
  private static void setup(Configuration conf, JobConf jobConf, final Arguments args)
      throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(
        Options.IGNORE_READ_FAILURES.propertyname,
        args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(
        Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
      dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
      String filename = "_distcp_logs_" + randomId;
      if (!dstExists || !dstIsDir) {
        Path parent = args.dst.getParent();
        if (!dstfs.exists(parent)) {
          dstfs.mkdirs(parent);
        }
        logPath = new Path(parent, filename);
      } else {
        logPath = new Path(args.dst, filename);
      }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            srcfilelist,
            LongWritable.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer =
        SequenceFile.createWriter(
            jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            dstdirlist,
            Text.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
      for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext(); ) {
        final Path src = srcItr.next();
        FileSystem srcfs = src.getFileSystem(conf);
        FileStatus srcfilestat = srcfs.getFileStatus(src);
        Path root = special && srcfilestat.isDir() ? src : src.getParent();
        if (srcfilestat.isDir()) {
          ++srcCount;
        }

        Stack<FileStatus> pathstack = new Stack<FileStatus>();
        for (pathstack.push(srcfilestat); !pathstack.empty(); ) {
          FileStatus cur = pathstack.pop();
          FileStatus[] children = srcfs.listStatus(cur.getPath());
          for (int i = 0; i < children.length; i++) {
            boolean skipfile = false;
            final FileStatus child = children[i];
            final String dst = makeRelative(root, child.getPath());
            ++srcCount;

            if (child.isDir()) {
              pathstack.push(child);
            } else {
              // skip file if the src and the dst files are the same.
              skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
              // skip file if it exceed file limit or size limit
              skipfile |=
                  fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

              if (!skipfile) {
                ++fileCount;
                byteCount += child.getLen();

                if (LOG.isTraceEnabled()) {
                  LOG.trace("adding file " + child.getPath());
                }

                ++cnsyncf;
                cbsyncs += child.getLen();
                if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                  src_writer.sync();
                  dst_writer.sync();
                  cnsyncf = 0;
                  cbsyncs = 0L;
                }
              }
            }

            if (!skipfile) {
              src_writer.append(
                  new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst));
            }

            dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
          }

          if (cur.isDir()) {
            String dst = makeRelative(root, cur.getPath());
            dir_writer.append(new Text(dst), new FilePair(cur, dst));
            if (++dirsyn > SYNC_FILE_MAX) {
              dirsyn = 0;
              dir_writer.sync();
            }
          }
        }
      }
    } finally {
      checkAndClose(src_writer);
      checkAndClose(dst_writer);
      checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
      dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
      LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
      if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
        throw new IOException("Failed to create" + args.dst);
      }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
      deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir =
        new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1)
                ? args.dst.getParent()
                : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
  }
Exemplo n.º 20
0
  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {

    Iterator<Text> itr = values.iterator();
    Text input = new Text();
    String[] inputTokens = null;

    // initialize/reset all variables
    Double pageRankOld = 0.0;
    Double residualError = 0.0;

    String output = "";
    Integer maxNode = 0;

    ArrayList<String> temp = new ArrayList<String>();
    Double tempBC = 0.0;
    vList.clear();
    newPR.clear();
    BE.clear();
    BC.clear();
    nodeDataMap.clear();

    while (itr.hasNext()) {
      input = itr.next();
      inputTokens = input.toString().split(" ");
      // if first element is PR, it is the node ID, previous pagerank and outgoing edgelist for this
      // node
      if (inputTokens[0].equals("PR")) {
        String nodeID = inputTokens[1];
        pageRankOld = Double.parseDouble(inputTokens[2]);
        newPR.put(nodeID, pageRankOld);
        NodeData node = new NodeData();
        node.setNodeID(nodeID);
        node.setPageRank(pageRankOld);
        if (inputTokens.length == 4) {
          node.setEdgeList(inputTokens[3]);
          node.setDegrees(inputTokens[3].split(",").length);
        }
        vList.add(nodeID);
        nodeDataMap.put(nodeID, node);
        // keep track of the max nodeID for this block
        if (Integer.parseInt(nodeID) > maxNode) {
          maxNode = Integer.parseInt(nodeID);
        }

        // if BE, it is an in-block edge
      } else if (inputTokens[0].equals("BE")) {

        if (BE.containsKey(inputTokens[2])) {
          // Initialize BC for this v
          temp = BE.get(inputTokens[2]);
        } else {
          temp = new ArrayList<String>();
        }
        temp.add(inputTokens[1]);
        BE.put(inputTokens[2], temp);

        // if BC, it is an incoming node from outside of the block
      } else if (inputTokens[0].equals("BC")) {
        if (BC.containsKey(inputTokens[2])) {
          // Initialize BC for this v
          tempBC = BC.get(inputTokens[2]);
        } else {
          tempBC = 0.0;
        }
        tempBC += Double.parseDouble(inputTokens[3]);
        BC.put(inputTokens[2], tempBC);
      }
    }

    int i = 0;
    do {
      i++;
      residualError = IterateBlockOnce();
      // System.out.println("Block " + key + " pass " + i + " resError:" + residualError);
    } while (residualError > threshold);

    // i < maxIterations &&

    // compute the ultimate residual error for each node in this block
    residualError = 0.0;
    for (String v : vList) {
      NodeData node = nodeDataMap.get(v);
      residualError += Math.abs(node.getPageRank() - newPR.get(v)) / newPR.get(v);
    }
    residualError = residualError / vList.size();
    // System.out.println("Block " + key + " overall resError for iteration: " + residualError);

    // add the residual error to the counter that is tracking the overall sum (must be expressed as
    // a long value)
    long residualAsLong = (long) Math.floor(residualError * PageRankBlock.precision);
    long numberOfIterations = (long) (i);
    context.getCounter(PageRankBlock.ProjectCounters.RESIDUAL_ERROR).increment(residualAsLong);

    context
        .getCounter(PageRankBlock.ProjectCounters.AVERAGE_ITERATIONS)
        .increment(numberOfIterations);

    // output should be
    //	key:nodeID (for this node)
    //	value:<pageRankNew> <degrees> <comma-separated outgoing edgeList>
    for (String v : vList) {
      NodeData node = nodeDataMap.get(v);
      output = newPR.get(v) + " " + node.getDegrees() + " " + node.getEdgeList();
      Text outputText = new Text(output);
      Text outputKey = new Text(v);
      context.write(outputKey, outputText);
      if (v.equals(maxNode.toString())) {
        System.out.println("Block:" + key + " | node:" + v + " | pageRank:" + newPR.get(v));
      }
    }

    cleanup(context);
  }