public void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   String cur_file =
       ((FileSplit) context.getInputSplit()).getPath().getParent().getParent().getName();
   String train_file = context.getConfiguration().get("train_file");
   if (cur_file.equals(train_file)) {
     StringTokenizer st = new StringTokenizer(value.toString());
     String word = st.nextToken();
     String f_id = st.nextToken();
     myKey.set(word);
     myVal.set(f_id);
     context.write(myKey, myVal);
   } else {
     StringTokenizer st = new StringTokenizer(value.toString());
     String word = st.nextToken();
     String f_id = st.nextToken();
     StringBuilder builder = new StringBuilder(dlt);
     while (st.hasMoreTokens()) {
       String filename = st.nextToken();
       String tf_idf = st.nextToken();
       builder.append(filename);
       builder.append(dlt);
       builder.append(tf_idf);
       builder.append("\t");
     }
     myKey.set(word);
     myVal.set(builder.toString());
     context.write(myKey, myVal);
   }
 }
示例#2
0
    // specify input and out keys
    public void map(
        LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      String line = value.toString(); // define new variable to be string

      ArrayList<Integer> range = new ArrayList<Integer>();
      for (int i = 2000; i <= 2010; i++) {
        range.add(i);
      }

      // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
      String[] inputs = line.split(",");

      try {

        int year = Integer.parseInt(inputs[165]);

        if (range.contains(year)) {
          String dur = inputs[3];
          String artist_name = inputs[2];
          String song_title = inputs[1];
          String final_input = artist_name + ',' + dur + ',' + song_title;
          Final_Value.set(final_input);
          output.collect(Final_Value, dummy);
        }
      } catch (NumberFormatException e) {
        // do nothing
      }
    }
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      Configuration c = context.getConfiguration();
      String s = value.toString();
      String input[] = s.split(",");
      Text outputkey = new Text();
      Text outputvalue = new Text();
      double result = 0.0;

      /* multiplies matrix and vector entry with matching column value */

      result = (Double.parseDouble(input[2])) * (vector.get(Long.parseLong(input[1])));
      outputkey.set(input[0]);
      outputvalue.set(Double.toString(result));
      context.write(outputkey, outputvalue);
    }
示例#4
0
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      String[] arr = line.split(",");

      word.set(arr[0]);
      context.write(word, one);
    }
 public void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   String line = value.toString();
   StringTokenizer tokenizer = new StringTokenizer(line);
   while (tokenizer.hasMoreTokens()) {
     word.set(tokenizer.nextToken());
     context.write(word, one);
   }
 }
 public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter)
     throws IOException {
   String line = value.toString();
   StringTokenizer tokenizer = new StringTokenizer(line);
   while (tokenizer.hasMoreTokens()) {
     word.set(tokenizer.nextToken());
     output.collect(word, one);
   }
 }
 public void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   Text word = new Text();
   StringTokenizer s = new StringTokenizer(value.toString());
   while (s.hasMoreTokens()) {
     word.set(s.nextToken());
     context.write(word, one);
   }
 }
 public void map(
     IntWritable key, Text value, OutputCollector<IntWritable, Text> output, Reporter reporter)
     throws IOException {
   String dataRow = value.toString();
   StringTokenizer tk = new StringTokenizer(dataRow);
   String label = tk.nextToken();
   String image = tk.nextToken();
   dataString.set(label + "\t" + image);
   output.collect(sameKey, dataString);
 }
示例#9
0
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();

      splitposition = line.indexOf("\t");
      labels = line.substring(0, splitposition);
      content = line.substring(splitposition + 1, line.length());
      if (content.length() <= 5) {
        word.set(labels);
        int counter = Integer.parseInt(content);
        context.write(word, new IntWritable(counter));
        return;
      }
      labeltokens = labels.split(",");
      contenttokens = tokenizeDoc(content);

      for (String label : labelspace) {
        for (String token : contenttokens) {
          // filter token, denotes the needed events;
          word.set(label + " " + token);
          context.write(word, new IntWritable(-1));
        }
      }
    }
示例#10
0
    /** Implements the map-method of the Mapper-interface */
    @Override
    public void map(
        LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {

      String line = value.toString();
      StringTokenizer tokenizer = new StringTokenizer(line);

      while (tokenizer.hasMoreTokens()) {

        String token = tokenizer.nextToken();

        if (token.length() > 3) {
          word.set(token);
          output.collect(word, one);
        }
      }
    }
    public void reduce(
        IntWritable sameNum,
        Iterator<Text> data,
        OutputCollector<Text, jBLASArrayWritable> output,
        Reporter reporter)
        throws IOException {
      int totalBatchCount = exampleCount / batchSize;

      DoubleMatrix weights = DoubleMatrix.randn(hiddenNodes, visibleNodes);
      DoubleMatrix hbias = DoubleMatrix.zeros(hiddenNodes);
      DoubleMatrix vbias = DoubleMatrix.zeros(visibleNodes);
      DoubleMatrix label = DoubleMatrix.zeros(1);
      DoubleMatrix hidden_chain = null;
      DoubleMatrix vdata = DoubleMatrix.zeros(batchSize, visibleNodes);

      ArrayList<DoubleMatrix> outputmatricies = new ArrayList<DoubleMatrix>();
      outputmatricies.add(weights);
      outputmatricies.add(hbias);
      outputmatricies.add(vbias);
      outputmatricies.add(label);
      outputmatricies.add(vdata);
      outputmatricies.add(hidden_chain);

      int j;
      for (int i = 0; i < totalBatchCount; i++) {
        j = 0;
        while (data.hasNext() && j < batchSize) {
          j++;
          StringTokenizer tk = new StringTokenizer(data.next().toString());
          label.put(0, Double.parseDouble(tk.nextToken()));
          String image = tk.nextToken();
          for (int k = 0; k < image.length(); k++) {
            Integer val = new Integer(image.charAt(k));
            vdata.put(j, k, val.doubleValue());
          }
          dataArray = new jBLASArrayWritable(outputmatricies);
          batchID.set("1\t" + i);
          output.collect(batchID, dataArray);
        }
      }
    }
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {

      String[] pair = new String[2];
      int count = 0;
      for (Text txt : values) {
        pair[count] = txt.toString();
        count++;
      }

      // word exists in training
      if (count == 2) {
        StringTokenizer st_one, st_two;
        if (pair[0].contains(dlt)) {
          st_one = new StringTokenizer(pair[1]);
          st_two = new StringTokenizer(pair[0]);
        } else {
          st_one = new StringTokenizer(pair[0]);
          st_two = new StringTokenizer(pair[1]);
        }

        // outputting the data
        String f_id = st_one.nextToken();

        StringBuilder builder = new StringBuilder(dlt);
        builder.append(f_id);
        builder.append(dlt);
        while (st_two.hasMoreTokens()) {
          String filename = st_two.nextToken();
          String tf_idf = st_two.nextToken();
          builder.append(filename);
          builder.append(dlt);
          builder.append(tf_idf);
          builder.append("\t");
        }
        myVal.set(builder.toString());
        context.write(key, myVal);
      }
    }
    public void map(
        Text key,
        Text val,
        org.apache.hadoop.mapreduce.Mapper<Text, Text, Text, Text>.Context context)
        throws IOException, InterruptedException {
      int i = 0, n = 0, j = 0, lj = 0, hj = 0;
      String tem = "";

      initStopWordsMap(); // initialize  the stop list
      String line = val.toString();
      StringTokenizer itr =
          new StringTokenizer(line.toLowerCase(), tokenDelimiter); // set delimiter
      n = itr.countTokens();
      cache = new String[n];
      for (i = 0; i < n; i++) {
        cache[i] = new String(""); // initialize the cache
      }
      i = 0;
      while (itr.hasMoreTokens()) {
        cache[i] = itr.nextToken(); // padding the cache with the words of the content
        i++;
      }
      for (i = 0; i < n; i++) {
        keyWord = cache[i];
        keyWord = keyWord.trim();
        if (!hmStopWord.containsKey(keyWord)) {
          lj = i - 10;
          hj = i + 10;
          if (lj < 0) lj = 0;
          if (hj > n) hj = n;
          tem = " ";
          for (j = lj; j < hj; j++) tem += cache[j] + " ";
          location = new Text();
          location.set(key.toString() + tem);
          context.write(new Text(keyWord), location);
        }
      }
    }
示例#14
0
    @Override
    public void map(WritableComparable docID, Text docContents, Context context)
        throws IOException, InterruptedException {

      Matcher matcher = WORD_PATTERN.matcher(docContents.toString());
      Func func = funcFromNum(funcNum);

      // YOUR CODE HERE
      ArrayList<String> doc_words = new ArrayList<String>(); // Store all words within the document.
      ArrayList<Double> targetGram_pos =
          new ArrayList<
              Double>(); // Store the index of each occurrence of target word in the document

      DoublePair values = new DoublePair(); // DoublePair that store distance, ocurrences
      values.setDouble2(new Double(1.0)); // ocurrences = 0
      Text output = new Text();

      // Store each word within the document in doc_words
      while (matcher.find()) {
        doc_words.add(new String(matcher.group().toLowerCase()));
      }

      // Traverse the document and store each word within it in ArrayList doc_words, and at the same
      // time store the index of each occurence of target word within the document in targetGram_pos
      for (int i = 0; i < doc_words.size(); i++) {
        String word = doc_words.get(i);
        if (word.equals(targetGram)) targetGram_pos.add(new Double(i));
      }

      // Traverse the doc_words ArrayList and find the distance between each word within the
      // document and the target word
      // If there were not any ocurrence of target word distance is 0 to all words
      int index_tw = 0; // index target word
      Double distance = new Double(0); // store the distance between current word and target word
      for (int i = 0; i < doc_words.size(); i++) {
        if (targetGram_pos.size()
            == 0) { // If target word is not within the document, distance for all words is
          // Double.POSITIVE_INFINITY
          distance = Double.POSITIVE_INFINITY;
        } else {
          if (doc_words
              .get(i)
              .equals(
                  targetGram)) { // If word within the document is the same target word skip it and
            // go to the next word
            continue;
          }
          if (targetGram_pos.size() == 1) { // If there were just one entre of the target word
            distance = Math.abs(i - targetGram_pos.get(index_tw));
          } else {
            if (index_tw
                < targetGram_pos.size()
                    - 1) { // If this is not the LAST position of the ArrayList of indexes of the
              // target word
              if (Math.abs(i - targetGram_pos.get(index_tw))
                  > Math.abs(
                      i
                          - targetGram_pos.get(
                              index_tw
                                  + 1))) { // Compare the lowest distance between the nearest two
                // indexes
                index_tw++;
              }
            }
            distance = Math.abs(i - targetGram_pos.get(index_tw));
          }
        }
        values.setDouble1(
            new Double(func.f(distance))); // Evaluate dist on f(d) and store it on distance.d1
        output.set(doc_words.get(i)); // Output key is each word
        context.write(
            output, values); // key, value: key: each word, value:Pair of Double(distance, num of
        // co-currences)
      } // end for
    } // end map1
示例#15
0
    public void map(
        WritableComparable<?> key,
        Text value,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      String url = value.toString(); // value is line of text

      if (url != null && url.trim().startsWith("#")) {
        /* Ignore line that start with # */
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      int fixedInterval = -1;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find separation between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
            try {
              fixedInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else metadata.put(metaname, metavalue);
        }
      }
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url); // filter the url
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Skipping " + url + ":" + e);
        }
        url = null;
      }
      if (url == null) {
        reporter.getCounter("injector", "urls_filtered").increment(1);
      } else { // if it passes
        value.set(url); // collect it
        CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_INJECTED);

        // Is interval custom? Then set as meta data
        if (fixedInterval > -1) {
          // Set writable using float. Flaot is used by AdaptiveFetchSchedule
          datum
              .getMetaData()
              .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
          datum.setFetchInterval(fixedInterval);
        } else {
          datum.setFetchInterval(customInterval);
        }

        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(
                "Cannot filter injected score for url "
                    + url
                    + ", using default ("
                    + e.getMessage()
                    + ")");
          }
        }
        reporter.getCounter("injector", "urls_injected").increment(1);
        output.collect(value, datum);
      }
    }