// Specify the input and output keys.
    /**
     * Emits one record per input line whose year column lies in 2000..2010 (inclusive).
     *
     * <p>The line is split on every comma (quoted fields containing commas are not handled).
     * Column 165 is parsed as the year; columns 2, 3 and 1 are joined with commas, in that
     * order, and emitted as the output key together with the shared {@code dummy} value. The
     * local variable names suggest these columns are artist name, duration and song title;
     * confirm against the dataset schema.
     *
     * <p>Lines that have too few columns, or whose year column is not an integer, are skipped.
     *
     * @param key input key supplied by the framework; not used here
     * @param value one line of comma-separated input
     * @param output collector that receives the (key, dummy) pair
     * @param reporter progress reporter; not used here
     * @throws IOException if the collector fails to write the record
     */
    public void map(
        LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      final int yearColumn = 165;
      final int firstYear = 2000;
      final int lastYear = 2010;

      String[] inputs = value.toString().split(",");

      // Short or truncated records have no year column; skip them instead of letting an
      // ArrayIndexOutOfBoundsException fail the whole task.
      if (inputs.length <= yearColumn) {
        return;
      }

      int year;
      try {
        year = Integer.parseInt(inputs[yearColumn]);
      } catch (NumberFormatException ignored) {
        // A non-numeric year (for example a header row) means the record cannot match.
        return;
      }

      if (year < firstYear || year > lastYear) {
        return;
      }

      String dur = inputs[3];
      String artist_name = inputs[2];
      String song_title = inputs[1];
      String final_input = artist_name + ',' + dur + ',' + song_title;
      Final_Value.set(final_input);
      output.collect(Final_Value, dummy);
    }
Example #2
0
    /**
     * Produce splits such that each is no greater than the quotient of the total size and the
     * number of splits requested.
     *
     * <p>The source list is read as a sequence file of (size, file pair) records, and each split
     * is a byte range of that list file. A split is cut as soon as adding the next record's size
     * would exceed the target. A single record larger than the target still gets a split of its
     * own, so that split may exceed the target. Whatever remains of the list file after the last
     * cut becomes the final split.
     *
     * @param job The handle to the JobConf object
     * @param numSplits Number of splits requested; values below 1 are treated as 1
     * @return the computed splits over the source list file
     * @throws IOException if the source list cannot be accessed or read
     * @throws RuntimeException if the job is missing the file count, total size or list location
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
      long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
      String srcfilelist = job.get(SRC_LIST_LABEL, "");
      if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
        throw new RuntimeException(
            "Invalid metadata: #files("
                + cnfiles
                + ") total_size("
                + cbsize
                + ") listuri("
                + srcfilelist
                + ")");
      }
      Path src = new Path(srcfilelist);
      FileSystem fs = src.getFileSystem(job);
      FileStatus srcst = fs.getFileStatus(src);

      // A request of zero would divide by zero below, and a negative one would be rejected as
      // an ArrayList capacity; fall back to a single split in both cases.
      final int requestedSplits = Math.max(1, numSplits);

      ArrayList<FileSplit> splits = new ArrayList<FileSplit>(requestedSplits);
      LongWritable key = new LongWritable();
      FilePair value = new FilePair();
      final long targetsize = cbsize / requestedSplits;
      long pos = 0L; // start offset (in the list file) of the split being built
      long last = 0L; // offset just past the previously read record
      long acc = 0L; // sum of record sizes accumulated into the current split
      long cbrem = srcst.getLen(); // bytes of the list file not yet assigned to a split
      SequenceFile.Reader sl = null;
      try {
        sl = new SequenceFile.Reader(fs, src, job);
        for (; sl.next(key, value); last = sl.getPosition()) {
          // if adding this split would put this split past the target size,
          // cut the last split and put this next file in the next split.
          if (acc + key.get() > targetsize && acc != 0) {
            long splitsize = last - pos;
            splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
            cbrem -= splitsize;
            pos = last;
            acc = 0L;
          }
          acc += key.get();
        }
      } finally {
        checkAndClose(sl);
      }
      if (cbrem != 0) {
        splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
      }

      return splits.toArray(new FileSplit[splits.size()]);
    }
 /**
  * Computes the arithmetic mean of the given integers.
  *
  * <p>The total is accumulated in a primitive {@code long}, so it is not re-boxed on every
  * iteration and does not wrap around when the sum exceeds the {@code int} range.
  *
  * @param l values to average
  * @return the mean as a {@code float}; {@code NaN} when {@code l} is empty
  */
 private float mean(ArrayList<Integer> l) {
   long sum = 0L;
   for (int value : l) {
     sum += value;
   }
   return ((float) sum) / l.size();
 }
    /**
     * Computes the sample standard deviation (divisor {@code n - 1}) of the given integers.
     *
     * <p>The squared deviations from {@link #mean} are summed in {@code float} arithmetic in
     * list order. The result is not meaningful for fewer than two elements: a single-element
     * list yields {@code NaN} because the divisor is zero.
     *
     * @param l values whose spread is measured
     * @return the sample standard deviation as a {@code float}
     */
    private float standard_deviation(ArrayList<Integer> l) {
      final int n = l.size();
      final float average = this.mean(l);

      float squaredDeviations = 0;
      for (int idx = 0; idx < n; idx++) {
        float delta = l.get(idx) - average;
        squaredDeviations += delta * delta;
      }

      float variance = squaredDeviations / (n - 1);
      return (float) Math.sqrt(variance);
    }
    /**
     * Summarises how often each distinct value occurs for one key.
     *
     * <p>Every incoming value is tallied by its string form. One line is emitted for the key,
     * holding these space-separated fields: number of distinct values, smallest tally, median
     * tally, largest tally, mean tally and standard deviation of the tallies.
     *
     * @param key key shared by all incoming values; reused as the output key
     * @param values values to tally
     * @param output collector that receives the summary line
     * @param reporter progress reporter; not used here
     * @throws IOException if the collector fails to write the record
     */
    public void reduce(
        IntWritable key,
        Iterator<Text> values,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {
      HashMap<String, Integer> occurrences = new HashMap<String, Integer>();
      while (values.hasNext()) {
        String name = values.next().toString();
        Integer seen = occurrences.get(name);
        occurrences.put(name, seen == null ? 1 : seen + 1);
      }

      ArrayList<Integer> tallies = new ArrayList<>(occurrences.values());

      // Keep this evaluation order: median() sorts the list in place, and the float
      // accumulation inside standard_deviation() depends on element order.
      StringBuilder summary = new StringBuilder();
      summary.append(occurrences.size());
      summary.append(" ").append(Collections.min(tallies));
      summary.append(" ").append(median(tallies));
      summary.append(" ").append(Collections.max(tallies));
      summary.append(" ").append(mean(tallies));
      summary.append(" ").append(standard_deviation(tallies));

      output.collect(key, new Text(summary.toString()));
    }
 /**
  * Returns the middle element of the list once it is sorted in ascending order.
  *
  * <p>The list passed in is sorted in place, so the caller sees the reordered list. For an
  * even number of elements the upper of the two middle elements is returned (index
  * {@code size / 2}); no averaging is done.
  *
  * @param l values to inspect; must not be empty
  * @return the element at index {@code size / 2} of the sorted list
  */
 private int median(ArrayList<Integer> l) {
   Collections.sort(l);
   final int middle = l.size() / 2;
   return l.get(middle);
 }
    /**
     * Emits every movie record of a cluster, then one summary record for a representative
     * movie.
     *
     * <p>All movie strings from the incoming values are written unchanged under {@code key}.
     * The representative is the entry whose similarity is closest to the average similarity
     * (sum of the per-value {@code similarity} divided by the number of movies), considering
     * only differences below 1.0; if none qualifies, the first entry is used. The summary line
     * is {@code "<similarity> <movie> <reviewCount> <reviews>"}, where the movie record is
     * split at its first {@code ':'} and reviews are counted as comma-separated tokens.
     *
     * <p>To limit memory, the copy of each movie record kept for the summary is truncated at
     * the first comma at or after character 1000, so the review count of the representative
     * may be lower than in the full record.
     *
     * <p>If the cluster has no movies or no similarities, only the (possibly empty) movie
     * records are emitted; no summary is written and the counter is not incremented.
     *
     * @param key cluster identifier; reused as the output key
     * @param values cluster fragments to merge
     * @param output collector that receives the movie records and the summary
     * @param reporter used to increment {@code Counter.VALUES} once per emitted summary
     * @throws IOException if the collector fails, or if the representative record has no
     *     {@code ':'} separator
     */
    @Override
    public void reduce(
        IntWritable key,
        Iterator<ClusterWritable> values,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {
      // Offset from which a comma is searched when truncating long movie records.
      final int truncateFrom = 1000;

      float sumSimilarity = 0.0f;
      int numMovies = 0;
      ArrayList<String> arrl = new ArrayList<String>();
      ArrayList<Float> simArrl = new ArrayList<Float>();

      while (values.hasNext()) {
        ClusterWritable cr = values.next();
        simArrl.addAll(cr.similarities);
        for (int i = 0; i < cr.movies.size(); i++) {
          String oneElm = cr.movies.get(i);
          // to avoid memory error caused by long arrays; it will results less accurate
          int indexShort = oneElm.indexOf(",", truncateFrom);
          arrl.add(indexShort == -1 ? oneElm : oneElm.substring(0, indexShort));
          output.collect(key, new Text(oneElm));
        }
        numMovies += cr.movies.size();
        sumSimilarity += cr.similarity;
      }

      // Only indices present in both lists can be candidates; the two sizes are not
      // guaranteed to agree, and an empty cluster has no representative at all.
      int comparable = Math.min(numMovies, simArrl.size());
      if (comparable == 0) {
        return;
      }

      float avgSimilarity = sumSimilarity / (float) numMovies;

      float minDiff = 1.0f;
      int candidate = 0;
      for (int s = 0; s < comparable; s++) {
        float diff = Math.abs(avgSimilarity - simArrl.get(s));
        if (diff < minDiff) {
          minDiff = diff;
          candidate = s;
        }
      }

      String data = arrl.get(candidate);
      int index2 = data.indexOf(":");
      if (index2 < 0) {
        throw new IOException(
            "Malformed movie record for key " + key + ": missing ':' separator");
      }
      String movieStr = data.substring(0, index2);
      String reviews = data.substring(index2 + 1);
      int count = new StringTokenizer(reviews, ",").countTokens();

      System.out.println(
          "The key = "
              + key.toString()
              + " has members = "
              + numMovies
              + " simil = "
              + simArrl.get(candidate));
      Text val = new Text(simArrl.get(candidate) + " " + movieStr + " " + count + " " + reviews);
      output.collect(key, val);
      reporter.incrCounter(Counter.VALUES, 1);
    }