// specify input and out keys public void map( LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { String line = value.toString(); // define new variable to be string ArrayList<Integer> range = new ArrayList<Integer>(); for (int i = 2000; i <= 2010; i++) { range.add(i); } // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)"); String[] inputs = line.split(","); try { int year = Integer.parseInt(inputs[165]); if (range.contains(year)) { String dur = inputs[3]; String artist_name = inputs[2]; String song_title = inputs[1]; String final_input = artist_name + ',' + dur + ',' + song_title; Final_Value.set(final_input); output.collect(Final_Value, dummy); } } catch (NumberFormatException e) { // do nothing } }
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); output.collect(word, one); } }
/* Finds a full sentence and sets it as the value. * If the sentence is shorter than the full line, the rest is stored to use later. */ public synchronized boolean next(LongWritable key, Text value) throws IOException { Text line = new Text(); boolean getMore = true; boolean retrieved = false; String result = leftovers; leftovers = ""; value.clear(); while (getMore) { retrieved = recordReader.next(key, line); if (retrieved) { String lineValue = line.toString(); // here, we assume sentences run until the period. int endOfSentence = lineValue.indexOf('.'); if (endOfSentence == -1) { result += " " + lineValue; } else { result += " " + lineValue.substring(0, endOfSentence + 1); leftovers = lineValue.substring(endOfSentence + 1); getMore = false; } } else { getMore = false; value.set(result); return false; } } value.set(result); return true; }
/** Implements the map-method of the Mapper-interface */ @Override public void map( LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (token.length() > 3) { word.set(token); output.collect(word, one); } } }
/* * Finds a full file and sets it as the value. */ public synchronized boolean next(LongWritable key, Text value) throws IOException { Text line = new Text(); boolean retrieved = true; String result = ""; value.clear(); while (retrieved) { retrieved = recordReader.next(key, line); if (line.toString().length() > 0) { String lineValue = line.toString(); result += lineValue + "\n"; } } value.set(result); return true; }
public void map( WritableComparable<?> key, Text value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { String url = value.toString(); // value is line of text if (url != null && url.trim().startsWith("#")) { /* Ignore line that start with # */ return; } // if tabs : metadata that could be stored // must be name=value and separated by \t float customScore = -1f; int customInterval = interval; int fixedInterval = -1; Map<String, String> metadata = new TreeMap<String, String>(); if (url.indexOf("\t") != -1) { String[] splits = url.split("\t"); url = splits[0]; for (int s = 1; s < splits.length; s++) { // find separation between name and value int indexEquals = splits[s].indexOf("="); if (indexEquals == -1) { // skip anything without a = continue; } String metaname = splits[s].substring(0, indexEquals); String metavalue = splits[s].substring(indexEquals + 1); if (metaname.equals(nutchScoreMDName)) { try { customScore = Float.parseFloat(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFetchIntervalMDName)) { try { customInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFixedFetchIntervalMDName)) { try { fixedInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else metadata.put(metaname, metavalue); } } try { url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT); url = filters.filter(url); // filter the url } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); } url = null; } if (url == null) { reporter.getCounter("injector", "urls_filtered").increment(1); } else { // if it passes value.set(url); // collect it CrawlDatum datum = new CrawlDatum(); datum.setStatus(CrawlDatum.STATUS_INJECTED); // Is interval custom? Then set as meta data if (fixedInterval > -1) { // Set writable using float. Flaot is used by AdaptiveFetchSchedule datum .getMetaData() .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval)); datum.setFetchInterval(fixedInterval); } else { datum.setFetchInterval(customInterval); } datum.setFetchTime(curTime); // now add the metadata Iterator<String> keysIter = metadata.keySet().iterator(); while (keysIter.hasNext()) { String keymd = keysIter.next(); String valuemd = metadata.get(keymd); datum.getMetaData().put(new Text(keymd), new Text(valuemd)); } if (customScore != -1) datum.setScore(customScore); else datum.setScore(scoreInjected); try { scfilters.injectedScore(value, datum); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { LOG.warn( "Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")"); } } reporter.getCounter("injector", "urls_injected").increment(1); output.collect(value, datum); } }