示例#1
0
    /**
     * Emits one record for every input row whose year column lies in 2000..2010 (inclusive).
     *
     * <p>The row is split on plain commas. Column 165 is parsed as the year; columns 2, 3 and 1
     * (named artist name, duration and song title by the original code) are joined with commas
     * and emitted as the output key, paired with the shared {@code dummy} value.
     *
     * <p>Rows that are too short to contain the year column, or whose year is not an integer,
     * are skipped silently.
     *
     * @param key input key supplied by the framework; not used
     * @param value one line of comma-separated input
     * @param output collector that receives the (formatted row, dummy) pairs
     * @param reporter progress reporter; not used
     * @throws IOException if the collector fails to accept a record
     */
    public void map(
        LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      final int yearColumn = 165;
      final int firstYear = 2000;
      final int lastYear = 2010;

      // NOTE(review): a plain comma split does not honour quoted fields that contain commas.
      String[] inputs = value.toString().split(",");

      // String.split drops trailing empty fields, so a row can be shorter than expected.
      // Treat such rows like any other malformed row instead of failing the whole task.
      if (inputs.length <= yearColumn) {
        return;
      }

      final int year;
      try {
        year = Integer.parseInt(inputs[yearColumn]);
      } catch (NumberFormatException ignored) {
        // Non-numeric year: the row is intentionally skipped.
        return;
      }

      if (year < firstYear || year > lastYear) {
        return;
      }

      String dur = inputs[3];
      String artist_name = inputs[2];
      String song_title = inputs[1];
      Final_Value.set(artist_name + ',' + dur + ',' + song_title);
      output.collect(Final_Value, dummy);
    }
示例#2
0
    /**
     * Cuts the source list file into consecutive byte ranges, closing a range whenever adding the
     * next record would push the accumulated size past {@code totalSize / numSplits}.
     *
     * <p>A range always contains at least one record, so a single record larger than the target
     * still forms its own split. Whatever bytes remain after the last cut become the final split.
     *
     * @param job The handle to the JobConf object; must carry the file count, total size and
     *     list-file location under the labels read below
     * @param numSplits Number of splits requested
     * @return the computed splits, in file order
     * @throws IOException if the list file cannot be opened or read
     * @throws RuntimeException if any of the required metadata entries is missing
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      final int fileCount = job.getInt(SRC_COUNT_LABEL, -1);
      final long totalBytes = job.getLong(TOTAL_SIZE_LABEL, -1);
      final String listUri = job.get(SRC_LIST_LABEL, "");

      final boolean metadataPresent = fileCount >= 0 && totalBytes >= 0 && !"".equals(listUri);
      if (!metadataPresent) {
        throw new RuntimeException(
            "Invalid metadata: #files("
                + fileCount
                + ") total_size("
                + totalBytes
                + ") listuri("
                + listUri
                + ")");
      }

      final Path listPath = new Path(listUri);
      final FileSystem fs = listPath.getFileSystem(job);
      final FileStatus listStatus = fs.getFileStatus(listPath);

      final ArrayList<FileSplit> result = new ArrayList<FileSplit>(numSplits);
      final LongWritable recordSize = new LongWritable();
      final FilePair recordValue = new FilePair();
      final long targetSize = totalBytes / numSplits;

      long splitStart = 0L; // byte offset where the split under construction begins
      long recordEnd = 0L; // byte offset just past the most recently consumed record
      long accumulated = 0L; // sum of record keys gathered into the current split
      long bytesLeft = listStatus.getLen(); // bytes of the list file not yet assigned

      SequenceFile.Reader reader = null;
      try {
        reader = new SequenceFile.Reader(fs, listPath, job);
        while (reader.next(recordSize, recordValue)) {
          // presumably the key of each record is the size of the file it describes - confirm
          final long size = recordSize.get();
          final boolean wouldOverflow = accumulated + size > targetSize;
          if (wouldOverflow && accumulated != 0) {
            // Close the current split just before this record; the record opens the next one.
            final long splitLength = recordEnd - splitStart;
            result.add(new FileSplit(listPath, splitStart, splitLength, (String[]) null));
            bytesLeft -= splitLength;
            splitStart = recordEnd;
            accumulated = 0L;
          }
          accumulated += size;
          recordEnd = reader.getPosition();
        }
      } finally {
        checkAndClose(reader);
      }

      if (bytesLeft != 0) {
        result.add(new FileSplit(listPath, splitStart, bytesLeft, (String[]) null));
      }

      return result.toArray(new FileSplit[result.size()]);
    }
示例#3
0
 /**
  * Computes the arithmetic mean of the given integers.
  *
  * <p>The total is accumulated in a primitive {@code long}, so it cannot wrap around the
  * {@code int} range and no boxed object is allocated per element.
  *
  * @param l the values to average; must not contain {@code null}
  * @return the mean as a {@code float}; {@code NaN} when the list is empty
  */
 private float mean(ArrayList<Integer> l) {
   long total = 0L;
   for (int value : l) {
     total += value;
   }
   // Float division: an empty list yields 0f / 0, i.e. NaN, rather than an exception.
   return ((float) total) / l.size();
 }
示例#4
0
    /**
     * Computes the sample standard deviation (divisor {@code n - 1}) of the given integers,
     * using single-precision arithmetic throughout.
     *
     * @param l the values to measure; must not contain {@code null}
     * @return the sample standard deviation; {@code NaN} for a single-element list, because the
     *     divisor is then zero
     */
    private float standard_deviation(ArrayList<Integer> l) {
      final float average = this.mean(l);
      final int count = l.size();

      float squaredDeviations = 0;
      for (int value : l) {
        final float delta = value - average;
        squaredDeviations += delta * delta;
      }

      final float variance = squaredDeviations / (count - 1);
      return (float) Math.sqrt(variance);
    }
    /**
     * Packs the incoming examples into batches and emits the model matrices together with the
     * batch data.
     *
     * <p>Each input value is tokenized on whitespace: the first token is parsed as the label and
     * the second token is the image string, whose characters are written one per column into the
     * current row of the batch matrix. Rows are filled starting at index 0, so a batch uses
     * exactly rows {@code 0 .. batchSize - 1}.
     *
     * @param sameNum grouping key; not used
     * @param data the examples, one text record each
     * @param output collector receiving ("1\t" + batch index, matrices) pairs
     * @param reporter progress reporter; not used
     * @throws IOException if the collector fails to accept a record
     */
    public void reduce(
        IntWritable sameNum,
        Iterator<Text> data,
        OutputCollector<Text, jBLASArrayWritable> output,
        Reporter reporter)
        throws IOException {
      int totalBatchCount = exampleCount / batchSize;

      DoubleMatrix weights = DoubleMatrix.randn(hiddenNodes, visibleNodes);
      DoubleMatrix hbias = DoubleMatrix.zeros(hiddenNodes);
      DoubleMatrix vbias = DoubleMatrix.zeros(visibleNodes);
      DoubleMatrix label = DoubleMatrix.zeros(1);
      DoubleMatrix hidden_chain = null;
      DoubleMatrix vdata = DoubleMatrix.zeros(batchSize, visibleNodes);

      // NOTE(review): the last entry is deliberately kept as a null placeholder, as in the
      // original; confirm that jBLASArrayWritable accepts null elements.
      ArrayList<DoubleMatrix> outputmatricies = new ArrayList<DoubleMatrix>();
      outputmatricies.add(weights);
      outputmatricies.add(hbias);
      outputmatricies.add(vbias);
      outputmatricies.add(label);
      outputmatricies.add(vdata);
      outputmatricies.add(hidden_chain);

      for (int i = 0; i < totalBatchCount; i++) {
        int row = 0;
        while (data.hasNext() && row < batchSize) {
          StringTokenizer tk = new StringTokenizer(data.next().toString());
          label.put(0, Double.parseDouble(tk.nextToken()));
          String image = tk.nextToken();
          for (int k = 0; k < image.length(); k++) {
            // NOTE(review): this stores the character code (e.g. '0' -> 48.0), exactly as the
            // original did; confirm whether the numeric digit value was intended instead.
            vdata.put(row, k, (double) image.charAt(k));
          }
          // Advance only after the row has been written so that row 0 is used and the index
          // never reaches batchSize.
          row++;

          // NOTE(review): a record is emitted after every example, not once per full batch.
          dataArray = new jBLASArrayWritable(outputmatricies);
          batchID.set("1\t" + i);
          output.collect(batchID, dataArray);
        }
      }
    }
示例#6
0
 /**
  * Combines all points assigned to one centroid and re-emits every point under the combined
  * centroid.
  *
  * <p>Each incoming point is deep-copied and buffered, because the points have to be written
  * out again after the whole group has been consumed. The new centre starts as a copy of the
  * first point and every later point is folded in through {@code Point.add}.
  *
  * @param key the centroid whose {@code id} is carried over to the new centroid
  * @param values the points grouped under {@code key}
  * @param context sink for the (new centroid, point) pairs
  * @throws IOException if writing to the context fails
  * @throws InterruptedException if the write is interrupted
  */
 @Override
 protected void reduce(Centroid key, Iterable<Point> values, Context context)
     throws IOException, InterruptedException {
   // Capture the id up front, before the value iterator is advanced.
   final int clusterId = key.id;
   final ArrayList<Point> buffered = new ArrayList<Point>();

   Point combined = null;
   for (Point incoming : values) {
     final Point snapshot = incoming.deepCopy();
     buffered.add(snapshot);
     combined = (combined == null) ? new Point(snapshot.deepCopy()) : combined.add(snapshot);
   }

   final Centroid updated = new Centroid(clusterId, combined);
   for (Point member : buffered) {
     context.write(updated, member);
   }
 }
示例#7
0
    /**
     * Summarises how often each distinct value occurs under one key.
     *
     * <p>The emitted text holds six space-separated fields: number of distinct values, then the
     * minimum, median, maximum, mean and standard deviation of the per-value occurrence counts.
     *
     * @param key the group key, passed through unchanged
     * @param values the values observed for this key
     * @param output collector receiving the single summary record
     * @param reporter progress reporter; not used
     * @throws IOException if the collector fails to accept the record
     */
    public void reduce(
        IntWritable key,
        Iterator<Text> values,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {
      final HashMap<String, Integer> occurrences = new HashMap<String, Integer>();
      while (values.hasNext()) {
        final String name = values.next().toString();
        final Integer seenSoFar = occurrences.get(name);
        occurrences.put(name, seenSoFar == null ? 1 : seenSoFar + 1);
      }

      final ArrayList<Integer> counts = new ArrayList<Integer>(occurrences.values());

      // Evaluated in this order on purpose: median() sorts the list in place.
      final int distinct = occurrences.size();
      final Integer smallest = Collections.min(counts);
      final int middle = median(counts);
      final Integer largest = Collections.max(counts);
      final float average = mean(counts);
      final float spread = standard_deviation(counts);

      final String summary =
          "" + distinct + " " + smallest + " " + middle + " " + largest + " " + average + " "
              + spread;
      output.collect(key, new Text(summary));
    }
示例#8
0
  /**
   * Persists the bucket cache as a MapFile under the cache path and, on request, registers the
   * resulting files with the distributed cache.
   *
   * <p>Keys are sorted before being appended, since a MapFile writer expects its keys in
   * ascending order.
   *
   * @param conf configuration used to resolve the cache path and the file system
   * @param writeToDistributedCache when {@code true}, every non-directory entry in the cache
   *     folder is added to the distributed cache of {@code conf}
   * @throws IOException if the MapFile cannot be written or the folder cannot be listed
   */
  @SuppressWarnings("unchecked")
  public void writeToDisk(Configuration conf, boolean writeToDistributedCache) throws IOException {
    final String cacheFolder = PathUtils.getCachePath(conf) + BUCKET_CACHE_FOLDER;
    final FileSystem fileSystem = FileSystem.get(conf);

    MapFile.Writer mapWriter = null;
    try {
      mapWriter =
          new MapFile.Writer(
              conf,
              new Path(cacheFolder),
              MapFile.Writer.keyClass(IntWritable.class),
              MapFile.Writer.valueClass(Bucket.class));

      final ArrayList<IntWritable> orderedKeys =
          new ArrayList<IntWritable>(bucketCache.keySet());
      Collections.sort(orderedKeys);

      for (IntWritable bucketId : orderedKeys) {
        mapWriter.append(bucketId, bucketCache.get(bucketId));
      }
    } finally {
      if (mapWriter != null) {
        IOUtils.closeStream(mapWriter);
      }
    }

    if (!writeToDistributedCache) {
      return;
    }

    for (FileStatus entry : fileSystem.listStatus(new Path(cacheFolder))) {
      if (entry.isDirectory()) {
        continue;
      }
      DistributedCache.addCacheFile(entry.getPath().toUri(), conf);
    }
  }
示例#9
0
  // Performs one update pass over every node v listed in vList.
  //
  // Notation:
  //   BE[v] = nodes u that have an edge to v, as recorded in the BE map
  //   BC[v] = precomputed contribution recorded for v in the BC map
  //   PR[u] = value currently stored for u in newPR
  //
  // For each v the new value is
  //   dampingFactor * (sum over u in BE[v] of PR[u] / degrees(u) + BC[v]) + randomJumpFactor
  //
  // All new values are computed from the values of the previous pass and only then swapped in
  // as newPR. The return value is the mean of |old - new| / new over all nodes in vList.
  protected double IterateBlockOnce() {
    final HashMap<String, Double> nextRanks = new HashMap<String, Double>();
    double residualSum = 0.0;

    for (String v : vList) {
      final double previousRank = newPR.get(v);
      double incoming = 0.0;

      // contributions from the nodes recorded in BE for v
      if (BE.containsKey(v)) {
        for (String u : BE.get(v)) {
          final NodeData source = nodeDataMap.get(u);
          incoming += (newPR.get(u) / source.getDegrees());
        }
      }

      // contribution recorded in BC for v
      if (BC.containsKey(v)) {
        incoming += BC.get(v);
      }

      final double updatedRank = (dampingFactor * incoming) + randomJumpFactor;
      nextRanks.put(v, updatedRank);

      // relative change of this node, accumulated for the average below
      residualSum += Math.abs(previousRank - updatedRank) / updatedRank;
    }

    newPR = nextRanks;
    return residualSum / vList.size();
  }
示例#10
0
 /**
  * Returns the element at index {@code size / 2} of the list after sorting it ascending.
  *
  * <p>For an even number of elements this is the upper of the two middle values. The list
  * passed in is sorted in place.
  *
  * @param l the values; sorted as a side effect, must not be empty
  * @return the middle element of the sorted list
  */
 private int median(ArrayList<Integer> l) {
   Collections.sort(l);
   final int middleIndex = l.size() / 2;
   final Integer middleValue = l.get(middleIndex);
   return middleValue;
 }
示例#11
0
    /**
     * Emits, for every word of the document other than the target word, the word together with
     * a pair (f(distance to the nearest target occurrence), 1.0).
     *
     * <p>Words are extracted with {@code WORD_PATTERN} and lower-cased. Distance is measured in
     * word positions. If the target word does not occur in the document at all, every word is
     * emitted with a distance of {@code Double.POSITIVE_INFINITY}.
     *
     * @param docID key of the document; not used
     * @param docContents full text of the document
     * @param context sink for the (word, pair) records
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    public void map(WritableComparable docID, Text docContents, Context context)
        throws IOException, InterruptedException {

      Matcher matcher = WORD_PATTERN.matcher(docContents.toString());
      Func func = funcFromNum(funcNum);

      // All words of the document in order, and the positions at which the target word occurs
      // (ascending, because they are recorded while scanning left to right).
      ArrayList<String> docWords = new ArrayList<String>();
      ArrayList<Integer> targetPositions = new ArrayList<Integer>();
      while (matcher.find()) {
        String word = matcher.group().toLowerCase();
        if (word.equals(targetGram)) {
          targetPositions.add(docWords.size());
        }
        docWords.add(word);
      }

      DoublePair values = new DoublePair();
      values.setDouble2(1.0); // every emitted record counts as one occurrence
      Text output = new Text();

      // Index into targetPositions of the occurrence nearest to the current word. Because both
      // the words and the positions are visited in ascending order it never has to move back.
      int nearest = 0;
      for (int i = 0; i < docWords.size(); i++) {
        String word = docWords.get(i);
        double distance;

        if (targetPositions.isEmpty()) {
          distance = Double.POSITIVE_INFINITY;
        } else {
          if (word.equals(targetGram)) {
            continue; // the target word itself is not emitted
          }
          // Advance as far as needed, not just one step: several target occurrences may lie
          // between the previously selected occurrence and the current word.
          while (nearest < targetPositions.size() - 1
              && Math.abs(i - targetPositions.get(nearest))
                  > Math.abs(i - targetPositions.get(nearest + 1))) {
            nearest++;
          }
          distance = Math.abs(i - targetPositions.get(nearest));
        }

        double weighted = func.f(distance);
        values.setDouble1(weighted);
        output.set(word);
        context.write(output, values);
      }
    }
示例#12
0
  /**
   * Processes all records of one block: rebuilds the per-block state, repeats
   * {@code IterateBlockOnce()} until its result no longer exceeds {@code threshold}, updates
   * the job counters and writes one output record per node.
   *
   * <p>Each input value is a space-separated record whose first token selects its meaning:
   *
   * <ul>
   *   <li>{@code PR nodeID pageRank [edgeList]} - a node of this block; the optional fourth
   *       token is a comma-separated edge list whose length becomes the node's degree
   *   <li>{@code BE u v} - {@code u} is appended to the list stored in {@code BE} under
   *       {@code v}
   *   <li>{@code BC u v amount} - {@code amount} is added to the value stored in {@code BC}
   *       under {@code v}
   * </ul>
   *
   * @param key the block key; only used in the diagnostic print
   * @param values the records belonging to this block
   * @param context sink for output records and counters
   * @throws IOException if writing to the context fails
   * @throws InterruptedException if the write is interrupted
   */
  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {

    Iterator<Text> itr = values.iterator();
    Text input = new Text();
    String[] inputTokens = null;

    // initialize/reset all variables
    Double pageRankOld = 0.0;
    Double residualError = 0.0;

    String output = "";
    Integer maxNode = 0;

    ArrayList<String> temp = new ArrayList<String>();
    Double tempBC = 0.0;
    // the collections below are shared fields, so state left over from a previous call is dropped
    vList.clear();
    newPR.clear();
    BE.clear();
    BC.clear();
    nodeDataMap.clear();

    while (itr.hasNext()) {
      input = itr.next();
      inputTokens = input.toString().split(" ");
      // "PR" record: node ID, previous PageRank and (optionally) the outgoing edge list of a
      // node in this block
      if (inputTokens[0].equals("PR")) {
        String nodeID = inputTokens[1];
        pageRankOld = Double.parseDouble(inputTokens[2]);
        newPR.put(nodeID, pageRankOld);
        NodeData node = new NodeData();
        node.setNodeID(nodeID);
        node.setPageRank(pageRankOld);
        // edge list and degree are only set when the record carries a fourth token
        if (inputTokens.length == 4) {
          node.setEdgeList(inputTokens[3]);
          node.setDegrees(inputTokens[3].split(",").length);
        }
        vList.add(nodeID);
        nodeDataMap.put(nodeID, node);
        // keep track of the max nodeID for this block
        if (Integer.parseInt(nodeID) > maxNode) {
          maxNode = Integer.parseInt(nodeID);
        }

        // "BE" record: an in-block edge from inputTokens[1] to inputTokens[2]
      } else if (inputTokens[0].equals("BE")) {

        if (BE.containsKey(inputTokens[2])) {
          // reuse the source list already stored for this v
          temp = BE.get(inputTokens[2]);
        } else {
          temp = new ArrayList<String>();
        }
        temp.add(inputTokens[1]);
        BE.put(inputTokens[2], temp);

        // "BC" record: a contribution arriving at inputTokens[2] from outside of the block
      } else if (inputTokens[0].equals("BC")) {
        if (BC.containsKey(inputTokens[2])) {
          // continue from the amount already accumulated for this v
          tempBC = BC.get(inputTokens[2]);
        } else {
          tempBC = 0.0;
        }
        tempBC += Double.parseDouble(inputTokens[3]);
        BC.put(inputTokens[2], tempBC);
      }
    }

    // iterate at least once, and keep going while the returned residual exceeds the threshold
    int i = 0;
    do {
      i++;
      residualError = IterateBlockOnce();
      // System.out.println("Block " + key + " pass " + i + " resError:" + residualError);
    } while (residualError > threshold);

    // NOTE(review): an iteration cap ("i < maxIterations &&") was disabled here, so the loop
    // above has no upper bound if the residual never drops to the threshold.

    // compute the overall residual error of this block: mean relative difference between the
    // PageRank each node started with and its final value
    residualError = 0.0;
    for (String v : vList) {
      NodeData node = nodeDataMap.get(v);
      residualError += Math.abs(node.getPageRank() - newPR.get(v)) / newPR.get(v);
    }
    residualError = residualError / vList.size();
    // System.out.println("Block " + key + " overall resError for iteration: " + residualError);

    // add the residual error to the counter that is tracking the overall sum (counters hold
    // long values, so the error is scaled by the precision factor and floored)
    long residualAsLong = (long) Math.floor(residualError * PageRankBlock.precision);
    long numberOfIterations = (long) (i);
    context.getCounter(PageRankBlock.ProjectCounters.RESIDUAL_ERROR).increment(residualAsLong);

    context
        .getCounter(PageRankBlock.ProjectCounters.AVERAGE_ITERATIONS)
        .increment(numberOfIterations);

    // output should be
    //   key: nodeID (for this node)
    //   value: <pageRankNew> <degrees> <comma-separated outgoing edgeList>
    for (String v : vList) {
      NodeData node = nodeDataMap.get(v);
      output = newPR.get(v) + " " + node.getDegrees() + " " + node.getEdgeList();
      Text outputText = new Text(output);
      Text outputKey = new Text(v);
      context.write(outputKey, outputText);
      // diagnostic print for the node with the highest ID in this block
      if (v.equals(maxNode.toString())) {
        System.out.println("Block:" + key + " | node:" + v + " | pageRank:" + newPR.get(v));
      }
    }

    // NOTE(review): cleanup(context) is invoked at the end of every reduce call - confirm this
    // is intended.
    cleanup(context);
  }