public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  // The grandparent directory of this split's path identifies which input file
  // the record came from.
  String cur_file =
      ((FileSplit) context.getInputSplit()).getPath().getParent().getParent().getName();
  String train_file = context.getConfiguration().get("train_file");
  StringTokenizer st = new StringTokenizer(value.toString());
  if (cur_file.equals(train_file)) {
    // Training file: each line is "word feature-id".
    String word = st.nextToken();
    String f_id = st.nextToken();
    myKey.set(word);
    myVal.set(f_id);
    context.write(myKey, myVal);
  } else {
    // TF-IDF file: the word, then (filename, tf-idf) pairs.
    String word = st.nextToken();
    String f_id = st.nextToken();
    StringBuilder builder = new StringBuilder(dlt);
    while (st.hasMoreTokens()) {
      String filename = st.nextToken();
      String tf_idf = st.nextToken();
      builder.append(filename);
      builder.append(dlt);
      builder.append(tf_idf);
      builder.append("\t");
    }
    myKey.set(word);
    myVal.set(builder.toString());
    context.write(myKey, myVal);
  }
}
// specify input and output keys
public void map(
    LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  String line = value.toString();
  // Alternative split that respects quoted fields:
  // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
  String[] inputs = line.split(",");
  try {
    // The year field sits at index 165; keep songs from 2000 through 2010.
    int year = Integer.parseInt(inputs[165]);
    if (year >= 2000 && year <= 2010) {
      String dur = inputs[3];
      String artist_name = inputs[2];
      String song_title = inputs[1];
      String final_input = artist_name + ',' + dur + ',' + song_title;
      Final_Value.set(final_input);
      output.collect(Final_Value, dummy);
    }
  } catch (NumberFormatException e) {
    // skip rows whose year field is not a number
  }
}
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  // Each line is a matrix entry "row,col,value".
  String[] input = value.toString().split(",");
  Text outputkey = new Text();
  Text outputvalue = new Text();
  // Multiply the matrix entry by the vector entry with the matching column index;
  // "vector" is assumed to be a Map<Long, Double> field loaded beforehand.
  double result = Double.parseDouble(input[2]) * vector.get(Long.parseLong(input[1]));
  outputkey.set(input[0]); // key: row index
  outputvalue.set(Double.toString(result)); // value: one partial product
  context.write(outputkey, outputvalue);
}
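// A possible companion reducer for the matrix-vector mapper above (a sketch, not
// from the original source): it assumes the mapper's key is the row index and each
// value is one partial product A[i][j] * v[j], so summing the values under a key
// yields entry i of the result vector. The class name is illustrative.
public static class MatVecSumReducer extends Reducer<Text, Text, Text, Text> {
  @Override
  public void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    double sum = 0.0;
    for (Text val : values) {
      sum += Double.parseDouble(val.toString()); // accumulate partial products
    }
    context.write(key, new Text(Double.toString(sum)));
  }
}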
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  // Emit the first comma-separated field of each line with a count of one.
  String[] arr = value.toString().split(",");
  word.set(arr[0]);
  context.write(word, one);
}
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  // Tokenize the line on whitespace and emit (token, 1) for each token.
  StringTokenizer tokenizer = new StringTokenizer(value.toString());
  while (tokenizer.hasMoreTokens()) {
    word.set(tokenizer.nextToken());
    context.write(word, one);
  }
}
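// A sketch of the reducer that usually pairs with this word-count mapper: it sums
// the counts emitted above. Hadoop ships an equivalent class as
// org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; the name SumReducer here
// is illustrative.
public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  private final IntWritable result = new IntWritable();

  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get(); // add up the ones emitted for this word
    }
    result.set(sum);
    context.write(key, result);
  }
}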
public void map(
    LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
    throws IOException {
  // Old-API (org.apache.hadoop.mapred) version of the word-count mapper; the raw
  // OutputCollector is given its proper type parameters here.
  StringTokenizer tokenizer = new StringTokenizer(value.toString());
  while (tokenizer.hasMoreTokens()) {
    word.set(tokenizer.nextToken());
    output.collect(word, one);
  }
}
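// Because this variant uses the old org.apache.hadoop.mapred API, the job is wired
// up through JobConf rather than Job. A minimal driver sketch under that assumption;
// WordCount, WordCountMapper, and WordCountReducer are illustrative names, not from
// the original source.
public static void main(String[] args) throws IOException {
  JobConf conf = new JobConf(WordCount.class);
  conf.setJobName("wordcount");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(WordCountMapper.class);
  conf.setReducerClass(WordCountReducer.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf); // submit and wait for completion
}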
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  Text word = new Text();
  StringTokenizer s = new StringTokenizer(value.toString());
  while (s.hasMoreTokens()) {
    word.set(s.nextToken());
    context.write(word, one);
  }
}
public void map(
    IntWritable key, Text value, OutputCollector<IntWritable, Text> output, Reporter reporter)
    throws IOException {
  // Each row is "label<whitespace>image"; re-emit it under a single shared key
  // so one reducer sees the whole data set.
  StringTokenizer tk = new StringTokenizer(value.toString());
  String label = tk.nextToken();
  String image = tk.nextToken();
  dataString.set(label + "\t" + image);
  output.collect(sameKey, dataString);
}
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  String line = value.toString();
  splitposition = line.indexOf("\t");
  labels = line.substring(0, splitposition);
  content = line.substring(splitposition + 1);
  // Very short "content" is a pre-computed count: pass it through directly.
  if (content.length() <= 5) {
    word.set(labels);
    int counter = Integer.parseInt(content);
    context.write(word, new IntWritable(counter));
    return;
  }
  labeltokens = labels.split(",");
  contenttokens = tokenizeDoc(content);
  for (String label : labelspace) {
    for (String token : contenttokens) {
      // filter token; the -1 marks the (label, token) counts needed later
      word.set(label + " " + token);
      context.write(word, new IntWritable(-1));
    }
  }
}
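// tokenizeDoc(...) is called above but not shown in this excerpt. A plausible
// sketch, assuming it lower-cases the text and splits on non-letter characters
// (the original implementation may differ):
static String[] tokenizeDoc(String doc) {
  return doc.toLowerCase().replaceAll("[^a-z]+", " ").trim().split("\\s+");
}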
/** Implements the map method of the Mapper interface. */
@Override
public void map(
    LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
    throws IOException {
  StringTokenizer tokenizer = new StringTokenizer(value.toString());
  while (tokenizer.hasMoreTokens()) {
    String token = tokenizer.nextToken();
    // Only count words longer than three characters.
    if (token.length() > 3) {
      word.set(token);
      output.collect(word, one);
    }
  }
}
public void reduce(
    IntWritable sameNum,
    Iterator<Text> data,
    OutputCollector<Text, jBLASArrayWritable> output,
    Reporter reporter)
    throws IOException {
  int totalBatchCount = exampleCount / batchSize;

  DoubleMatrix weights = DoubleMatrix.randn(hiddenNodes, visibleNodes);
  DoubleMatrix hbias = DoubleMatrix.zeros(hiddenNodes);
  DoubleMatrix vbias = DoubleMatrix.zeros(visibleNodes);
  DoubleMatrix label = DoubleMatrix.zeros(1);
  DoubleMatrix hidden_chain = null;
  DoubleMatrix vdata = DoubleMatrix.zeros(batchSize, visibleNodes);

  ArrayList<DoubleMatrix> outputmatricies = new ArrayList<DoubleMatrix>();
  outputmatricies.add(weights);
  outputmatricies.add(hbias);
  outputmatricies.add(vbias);
  outputmatricies.add(label);
  outputmatricies.add(vdata);
  outputmatricies.add(hidden_chain);

  for (int i = 0; i < totalBatchCount; i++) {
    int j = 0;
    while (data.hasNext() && j < batchSize) {
      StringTokenizer tk = new StringTokenizer(data.next().toString());
      label.put(0, Double.parseDouble(tk.nextToken()));
      String image = tk.nextToken();
      // Copy this example's pixel characters into row j of the batch; rows are
      // 0-based, so j is advanced only after the row is filled.
      for (int k = 0; k < image.length(); k++) {
        vdata.put(j, k, (double) image.charAt(k));
      }
      j++;
    }
    // Emit one record per completed batch rather than one per example.
    dataArray = new jBLASArrayWritable(outputmatricies);
    batchID.set("1\t" + i);
    output.collect(batchID, dataArray);
  }
}
public void reduce(Text key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
  String[] pair = new String[2];
  int count = 0;
  for (Text txt : values) {
    if (count < 2) { // guard against more than two values for a key
      pair[count] = txt.toString();
    }
    count++;
  }
  // The word exists in training only if both sides of the join arrived:
  // the feature id and the (filename, tf-idf) list.
  if (count == 2) {
    StringTokenizer st_one, st_two;
    // The tf-idf side starts with the delimiter; order the tokenizers so that
    // st_one reads the feature id and st_two reads the (filename, tf-idf) pairs.
    if (pair[0].contains(dlt)) {
      st_one = new StringTokenizer(pair[1]);
      st_two = new StringTokenizer(pair[0]);
    } else {
      st_one = new StringTokenizer(pair[0]);
      st_two = new StringTokenizer(pair[1]);
    }
    // output the joined record
    String f_id = st_one.nextToken();
    StringBuilder builder = new StringBuilder(dlt);
    builder.append(f_id);
    builder.append(dlt);
    while (st_two.hasMoreTokens()) {
      String filename = st_two.nextToken();
      String tf_idf = st_two.nextToken();
      builder.append(filename);
      builder.append(dlt);
      builder.append(tf_idf);
      builder.append("\t");
    }
    myVal.set(builder.toString());
    context.write(key, myVal);
  }
}
public void map(
    Text key, Text val, org.apache.hadoop.mapreduce.Mapper<Text, Text, Text, Text>.Context context)
    throws IOException, InterruptedException {
  initStopWordsMap(); // initialize the stop-word list
  String line = val.toString();
  StringTokenizer itr = new StringTokenizer(line.toLowerCase(), tokenDelimiter); // set delimiter
  int n = itr.countTokens();
  cache = new String[n];
  // fill the cache with the words of the content
  int i = 0;
  while (itr.hasMoreTokens()) {
    cache[i] = itr.nextToken();
    i++;
  }
  for (i = 0; i < n; i++) {
    keyWord = cache[i].trim();
    if (!hmStopWord.containsKey(keyWord)) {
      // take a window of up to ten words on each side of the keyword
      int lj = Math.max(i - 10, 0);
      int hj = Math.min(i + 10, n);
      StringBuilder tem = new StringBuilder(" ");
      for (int j = lj; j < hj; j++) {
        tem.append(cache[j]).append(" ");
      }
      location = new Text(key.toString() + tem);
      context.write(new Text(keyWord), location);
    }
  }
}
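// initStopWordsMap() and hmStopWord are used above but not defined in this excerpt.
// A plausible sketch, assuming the stop list is a fixed set of common words held in
// a HashMap; the contents here are illustrative only.
private void initStopWordsMap() {
  if (hmStopWord == null) {
    hmStopWord = new HashMap<String, String>();
    for (String w : new String[] {"a", "an", "the", "of", "and", "or", "to", "in", "is"}) {
      hmStopWord.put(w, w);
    }
  }
}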
@Override
public void map(WritableComparable docID, Text docContents, Context context)
    throws IOException, InterruptedException {
  Matcher matcher = WORD_PATTERN.matcher(docContents.toString());
  Func func = funcFromNum(funcNum);

  ArrayList<String> doc_words = new ArrayList<String>(); // all words in the document
  ArrayList<Double> targetGram_pos =
      new ArrayList<Double>(); // index of each occurrence of the target word
  DoublePair values = new DoublePair(); // (distance score, occurrence count)
  values.setDouble2(new Double(1.0)); // each emission counts one co-occurrence
  Text output = new Text();

  // Store each word of the document in doc_words.
  while (matcher.find()) {
    doc_words.add(matcher.group().toLowerCase());
  }

  // Record the index of every occurrence of the target word.
  for (int i = 0; i < doc_words.size(); i++) {
    if (doc_words.get(i).equals(targetGram)) {
      targetGram_pos.add(new Double(i));
    }
  }

  // For each word, find the distance to the nearest occurrence of the target word.
  int index_tw = 0; // index into targetGram_pos
  Double distance = new Double(0);
  for (int i = 0; i < doc_words.size(); i++) {
    if (targetGram_pos.size() == 0) {
      // The target word never appears: the distance for every word is infinite.
      distance = Double.POSITIVE_INFINITY;
    } else {
      if (doc_words.get(i).equals(targetGram)) {
        // Skip occurrences of the target word itself.
        continue;
      }
      if (targetGram_pos.size() == 1) {
        // Only one occurrence of the target word.
        distance = Math.abs(i - targetGram_pos.get(index_tw));
      } else {
        if (index_tw < targetGram_pos.size() - 1
            && Math.abs(i - targetGram_pos.get(index_tw))
                > Math.abs(i - targetGram_pos.get(index_tw + 1))) {
          // Advance to the next occurrence when it is the nearer of the two.
          index_tw++;
        }
        distance = Math.abs(i - targetGram_pos.get(index_tw));
      }
    }
    values.setDouble1(new Double(func.f(distance))); // score the distance with f(d)
    output.set(doc_words.get(i)); // the key is the co-occurring word
    context.write(output, values); // value: (f(distance), co-occurrence count)
  } // end for
} // end map1
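// Func and funcFromNum(...) are used above but not defined in this excerpt. A
// minimal sketch of the shape they would need; the mapping from funcNum to a
// concrete scoring function is hypothetical.
interface Func {
  double f(double d); // score a co-occurrence at distance d
}

static Func funcFromNum(int funcNum) {
  switch (funcNum) {
    case 1:
      return new Func() {
        public double f(double d) {
          // reward nearby co-occurrences; score 0 when the target never appears
          return d == Double.POSITIVE_INFINITY ? 0.0 : 1.0 + 1.0 / d;
        }
      };
    default:
      return new Func() {
        public double f(double d) {
          return d == Double.POSITIVE_INFINITY ? 0.0 : 1.0;
        }
      };
  }
}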
public void map(
    WritableComparable<?> key,
    Text value,
    OutputCollector<Text, CrawlDatum> output,
    Reporter reporter)
    throws IOException {
  String url = value.toString(); // value is a line of text

  if (url != null && url.trim().startsWith("#")) {
    // Ignore lines that start with #.
    return;
  }

  // If the line contains tabs, the remainder is metadata to be stored:
  // name=value pairs separated by \t.
  float customScore = -1f;
  int customInterval = interval;
  int fixedInterval = -1;
  Map<String, String> metadata = new TreeMap<String, String>();
  if (url.indexOf("\t") != -1) {
    String[] splits = url.split("\t");
    url = splits[0];
    for (int s = 1; s < splits.length; s++) {
      // find the separator between name and value
      int indexEquals = splits[s].indexOf("=");
      if (indexEquals == -1) {
        // skip anything without a =
        continue;
      }
      String metaname = splits[s].substring(0, indexEquals);
      String metavalue = splits[s].substring(indexEquals + 1);
      if (metaname.equals(nutchScoreMDName)) {
        try {
          customScore = Float.parseFloat(metavalue);
        } catch (NumberFormatException nfe) {
          // ignore malformed score
        }
      } else if (metaname.equals(nutchFetchIntervalMDName)) {
        try {
          customInterval = Integer.parseInt(metavalue);
        } catch (NumberFormatException nfe) {
          // ignore malformed interval
        }
      } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
        try {
          fixedInterval = Integer.parseInt(metavalue);
        } catch (NumberFormatException nfe) {
          // ignore malformed interval
        }
      } else {
        metadata.put(metaname, metavalue);
      }
    }
  }

  try {
    url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
    url = filters.filter(url); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    url = null;
  }

  if (url == null) {
    reporter.getCounter("injector", "urls_filtered").increment(1);
  } else {
    // The url passed normalization and filtering: collect it.
    value.set(url);
    CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_INJECTED);

    // If a fixed fetch interval was given, store it as metadata.
    if (fixedInterval > -1) {
      // Set the writable using float; float is what AdaptiveFetchSchedule reads.
      datum
          .getMetaData()
          .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
      datum.setFetchInterval(fixedInterval);
    } else {
      datum.setFetchInterval(customInterval);
    }
    datum.setFetchTime(curTime);

    // now add the remaining metadata
    for (Map.Entry<String, String> entry : metadata.entrySet()) {
      datum.getMetaData().put(new Text(entry.getKey()), new Text(entry.getValue()));
    }

    if (customScore != -1) {
      datum.setScore(customScore);
    } else {
      datum.setScore(scoreInjected);
    }

    try {
      scfilters.injectedScore(value, datum);
    } catch (ScoringFilterException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn(
            "Cannot filter injected score for url "
                + url
                + ", using default ("
                + e.getMessage()
                + ")");
      }
    }
    reporter.getCounter("injector", "urls_injected").increment(1);
    output.collect(value, datum);
  }
}