// specify input and out keys public void map( LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { String line = value.toString(); // define new variable to be string ArrayList<Integer> range = new ArrayList<Integer>(); for (int i = 2000; i <= 2010; i++) { range.add(i); } // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)"); String[] inputs = line.split(","); try { int year = Integer.parseInt(inputs[165]); if (range.contains(year)) { String dur = inputs[3]; String artist_name = inputs[2]; String song_title = inputs[1]; String final_input = artist_name + ',' + dur + ',' + song_title; Final_Value.set(final_input); output.collect(Final_Value, dummy); } } catch (NumberFormatException e) { // do nothing } }
/** Check whether the file list have duplication. */ private static void checkDuplication(FileSystem fs, Path file, Path sorted, Configuration conf) throws IOException { SequenceFile.Reader in = null; try { SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, new Text.Comparator(), Text.class, Text.class, conf); sorter.sort(file, sorted); in = new SequenceFile.Reader(fs, sorted, conf); Text prevdst = null, curdst = new Text(); Text prevsrc = null, cursrc = new Text(); for (; in.next(curdst, cursrc); ) { if (prevdst != null && curdst.equals(prevdst)) { throw new DuplicationException( "Invalid input, there are duplicated files in the sources: " + prevsrc + ", " + cursrc); } prevdst = curdst; curdst = new Text(); prevsrc = cursrc; cursrc = new Text(); } } finally { checkAndClose(in); } }
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); output.collect(word, one); } }
public void testInputFormat() { try { JobConf conf = new JobConf(); String TMP_DIR = System.getProperty("test.build.data", "/tmp"); Path filename = new Path("file:///" + TMP_DIR + "/tmpSeqFile"); SequenceFile.Writer sfw = SequenceFile.createWriter( FileSystem.getLocal(conf), conf, filename, ChukwaArchiveKey.class, ChunkImpl.class, SequenceFile.CompressionType.NONE, Reporter.NULL); StringBuilder buf = new StringBuilder(); int offsets[] = new int[lines.length]; for (int i = 0; i < lines.length; ++i) { buf.append(lines[i]); buf.append("\n"); offsets[i] = buf.length() - 1; } ChukwaArchiveKey key = new ChukwaArchiveKey(0, "datatype", "sname", 0); ChunkImpl val = new ChunkImpl("datatype", "sname", 0, buf.toString().getBytes(), null); val.setRecordOffsets(offsets); sfw.append(key, val); sfw.append(key, val); // write it twice sfw.close(); long len = FileSystem.getLocal(conf).getFileStatus(filename).getLen(); InputSplit split = new FileSplit(filename, 0, len, (String[]) null); ChukwaInputFormat in = new ChukwaInputFormat(); RecordReader<LongWritable, Text> r = in.getRecordReader(split, conf, Reporter.NULL); LongWritable l = r.createKey(); Text line = r.createValue(); for (int i = 0; i < lines.length * 2; ++i) { boolean succeeded = r.next(l, line); assertTrue(succeeded); assertEquals(i, l.get()); assertEquals(lines[i % lines.length], line.toString()); System.out.println("read line: " + l.get() + " " + line); } boolean succeeded = r.next(l, line); assertFalse(succeeded); } catch (IOException e) { e.printStackTrace(); fail("IO exception " + e); } }
public void map( LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException { String line = value.toString(); if (validate(line, reporter)) output.collect(key, value); }
public void map( LongWritable key, Text value, OutputCollector<IntWritable, DoubleWritable> output, Reporter reporter) throws IOException { /* * It implements the mapper. It outputs the numbers of weight and updated weights. * * Note that the format of intermediate output is <IntWritable, DoubleWritable>, * because the key is the number of weight (an integer), and the value is the weight's value (double) */ inputData = value.toString(); // go through the process initialize(); getposphase(); getnegphase(); update(); // output the intermediate data // The <key, value> pairs are <weightID, weightUpdate> double[][] vishidinc_array = vishidinc.getArray(); for (int i = 0; i < numdims; i++) { for (int j = 0; j < numhid; j++) { weightPos.set(i * numhid + j); weightValue.set(vishidinc_array[i][j]); output.collect(weightPos, weightValue); } } }
@Override public void reduce( Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { String line = ""; String header = ""; TreeMap<String, String> ciudades = new TreeMap<String, String>(); // Obtenemos los datos y los metemos a un treemap para que los ordene por ciudad while (values.hasNext()) { String[] tmp = values.next().toString().split(","); String ciudad = tmp[0]; String mes = tmp[1]; String temperatura = tmp[2]; String fecha = tmp[3]; ciudades.put(ciudad, tmp[1] + "," + tmp[2] + "," + tmp[3]); } // Recorremos las ciudades y vamos imprimiendo for (String ciudad : ciudades.keySet()) { header += ciudad + ",,"; String[] temporal = ciudades.get(ciudad).split(","); line += temporal[2] + "," + temporal[1] + ","; } if (c == 0) { // Imprimimos cabezera output.collect(new Text("Año,"), new Text(header)); c++; } output.collect(new Text(key.toString() + ","), new Text(line)); }
/** Implements the map-method of the Mapper-interface */ @Override public void map( LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (token.length() > 3) { word.set(token); output.collect(word, one); } } }
public void reduce( Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { boolean oldSet = false; boolean injectedSet = false; while (values.hasNext()) { CrawlDatum val = values.next(); if (val.getStatus() == CrawlDatum.STATUS_INJECTED) { injected.set(val); injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); injectedSet = true; } else { old.set(val); oldSet = true; } } CrawlDatum res = null; /** * Whether to overwrite, ignore or update existing records * * @see https://issues.apache.org/jira/browse/NUTCH-1405 */ // Injected record already exists and overwrite but not update if (injectedSet && oldSet && overwrite) { res = injected; if (update) { LOG.info(key.toString() + " overwritten with injected record but update was specified."); } } // Injected record already exists and update but not overwrite if (injectedSet && oldSet && update && !overwrite) { res = old; old.putAllMetaData(injected); old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore()); old.setFetchInterval( injected.getFetchInterval() != interval ? injected.getFetchInterval() : old.getFetchInterval()); } // Old default behaviour if (injectedSet && !oldSet) { res = injected; } else { res = old; } output.collect(key, res); }
/* * Finds a full file and sets it as the value. */ public synchronized boolean next(LongWritable key, Text value) throws IOException { Text line = new Text(); boolean retrieved = true; String result = ""; value.clear(); while (retrieved) { retrieved = recordReader.next(key, line); if (line.toString().length() > 0) { String lineValue = line.toString(); result += lineValue + "\n"; } } value.set(result); return true; }
public void map( LongWritable key, Text value, OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException { String arr[] = value.toString().split("\\r?\\n"); for (String row : arr) { if (row.startsWith("\"")) { continue; } String parts[] = row.split(","); output.collect(new IntWritable(new Integer(parts[1])), new Text(parts[4])); } }
public void map( LongWritable key, Text value, OutputCollector<IntWritable, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); IntWritable clave = new IntWritable(); IntWritable valor = new IntWritable(); clave.set(Integer.parseInt(line)); valor.set(Integer.parseInt(line) + 1); output.collect(clave, valor); }
public void map( LongWritable key, Text value, OutputCollector<DoubleWritable, DoubleWritable> output, Reporter reporter) throws IOException { String line = value.toString(); DoubleWritable clave = new DoubleWritable(); DoubleWritable valor = new DoubleWritable(); clave.set(Double.parseDouble(line)); valor.set(Math.sqrt(Double.parseDouble(line))); output.collect(clave, valor); }
@SuppressWarnings("unchecked") public void reduce( Text key, Iterator<Text> values, OutputCollector<NullWritable, Text> output, Reporter reporter) throws IOException { OutputCollector collector = multipleOutputs.getCollector("station", key.toString().replace("-", ""), reporter); while (values.hasNext()) { collector.collect(NullWritable.get(), values.next()); } }
/* Finds a full sentence and sets it as the value. * If the sentence is shorter than the full line, the rest is stored to use later. */ public synchronized boolean next(LongWritable key, Text value) throws IOException { Text line = new Text(); boolean getMore = true; boolean retrieved = false; String result = leftovers; leftovers = ""; value.clear(); while (getMore) { retrieved = recordReader.next(key, line); if (retrieved) { String lineValue = line.toString(); // here, we assume sentences run until the period. int endOfSentence = lineValue.indexOf('.'); if (endOfSentence == -1) { result += " " + lineValue; } else { result += " " + lineValue.substring(0, endOfSentence + 1); leftovers = lineValue.substring(endOfSentence + 1); getMore = false; } } else { getMore = false; value.set(result); return false; } } value.set(result); return true; }
public void map( LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); String lastToken = null; StringTokenizer s = new StringTokenizer(line, "\t"); String year = s.nextToken(); while (s.hasMoreTokens()) { lastToken = s.nextToken(); } int averagePrice = Integer.parseInt(lastToken); output.collect(new Text(year), new IntWritable(averagePrice)); }
@Override public void map( LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokens = new StringTokenizer(line); String[] keys = tokens.nextToken().toString().split("-"); String date_temp = tokens.nextToken(); String country = keys[0]; String year = keys[1]; // Mandamos año y un iterable de [ciudad,mes,temperatura,fecha] output.collect(new Text(year), new Text(country + "," + date_temp)); }
public void map( LongWritable key, Text value, OutputCollector<IntWritable, DoubleWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); int rowIdx = 0; double xValue = 0; if (tokenizer.hasMoreTokens()) { rowIdx = Integer.parseInt(tokenizer.nextToken()); xValue = Double.parseDouble(tokenizer.nextToken()); } double xResult = (resVec[rowIdx] - (sumVec[rowIdx] * xValue)) / diaVec[rowIdx]; output.collect(new IntWritable(rowIdx), new DoubleWritable(xResult)); }
@Override public void map( LongWritable key, Text value, /*[*/ OutputCollector<Text, IntWritable> output, Reporter reporter /*]*/) throws IOException { String line = value.toString(); String year = line.substring(15, 19); int airTemperature; if (line.charAt(87) == '+') { // parseInt doesn't like leading plus signs airTemperature = Integer.parseInt(line.substring(88, 92)); } else { airTemperature = Integer.parseInt(line.substring(87, 92)); } String quality = line.substring(92, 93); if (airTemperature != MISSING && quality.matches("[01459]")) { /*[*/ output.collect /*]*/(new Text(year), new IntWritable(airTemperature)); } }
public void map( WritableComparable<?> key, Text value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { String url = value.toString(); // value is line of text if (url != null && url.trim().startsWith("#")) { /* Ignore line that start with # */ return; } // if tabs : metadata that could be stored // must be name=value and separated by \t float customScore = -1f; int customInterval = interval; int fixedInterval = -1; Map<String, String> metadata = new TreeMap<String, String>(); if (url.indexOf("\t") != -1) { String[] splits = url.split("\t"); url = splits[0]; for (int s = 1; s < splits.length; s++) { // find separation between name and value int indexEquals = splits[s].indexOf("="); if (indexEquals == -1) { // skip anything without a = continue; } String metaname = splits[s].substring(0, indexEquals); String metavalue = splits[s].substring(indexEquals + 1); if (metaname.equals(nutchScoreMDName)) { try { customScore = Float.parseFloat(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFetchIntervalMDName)) { try { customInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFixedFetchIntervalMDName)) { try { fixedInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else metadata.put(metaname, metavalue); } } try { url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT); url = filters.filter(url); // filter the url } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); } url = null; } if (url == null) { reporter.getCounter("injector", "urls_filtered").increment(1); } else { // if it passes value.set(url); // collect it CrawlDatum datum = new CrawlDatum(); datum.setStatus(CrawlDatum.STATUS_INJECTED); // Is interval custom? Then set as meta data if (fixedInterval > -1) { // Set writable using float. Flaot is used by AdaptiveFetchSchedule datum .getMetaData() .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval)); datum.setFetchInterval(fixedInterval); } else { datum.setFetchInterval(customInterval); } datum.setFetchTime(curTime); // now add the metadata Iterator<String> keysIter = metadata.keySet().iterator(); while (keysIter.hasNext()) { String keymd = keysIter.next(); String valuemd = metadata.get(keymd); datum.getMetaData().put(new Text(keymd), new Text(valuemd)); } if (customScore != -1) datum.setScore(customScore); else datum.setScore(scoreInjected); try { scfilters.injectedScore(value, datum); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { LOG.warn( "Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")"); } } reporter.getCounter("injector", "urls_injected").increment(1); output.collect(value, datum); } }
public void write(DataOutput out) throws IOException { input.write(out); Text.writeString(out, output); }
public void readFields(DataInput in) throws IOException { input.readFields(in); output = Text.readString(in); }
/** Delete the dst files/dirs which do not exist in src */ private static void deleteNonexisting( FileSystem dstfs, FileStatus dstroot, Path dstsorted, FileSystem jobfs, Path jobdir, JobConf jobconf, Configuration conf) throws IOException { if (!dstroot.isDir()) { throw new IOException( "dst must be a directory when option " + Options.DELETE.cmd + " is set, but dst (= " + dstroot.getPath() + ") is not a directory."); } // write dst lsr results final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr"); final SequenceFile.Writer writer = SequenceFile.createWriter( jobfs, jobconf, dstlsr, Text.class, FileStatus.class, SequenceFile.CompressionType.NONE); try { // do lsr to get all file statuses in dstroot final Stack<FileStatus> lsrstack = new Stack<FileStatus>(); for (lsrstack.push(dstroot); !lsrstack.isEmpty(); ) { final FileStatus status = lsrstack.pop(); if (status.isDir()) { for (FileStatus child : dstfs.listStatus(status.getPath())) { String relative = makeRelative(dstroot.getPath(), child.getPath()); writer.append(new Text(relative), child); lsrstack.push(child); } } } } finally { checkAndClose(writer); } // sort lsr results final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted"); SequenceFile.Sorter sorter = new SequenceFile.Sorter( jobfs, new Text.Comparator(), Text.class, FileStatus.class, jobconf); sorter.sort(dstlsr, sortedlsr); // compare lsr list and dst list SequenceFile.Reader lsrin = null; SequenceFile.Reader dstin = null; try { lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf); dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf); // compare sorted lsr list and sorted dst list final Text lsrpath = new Text(); final FileStatus lsrstatus = new FileStatus(); final Text dstpath = new Text(); final Text dstfrom = new Text(); final FsShell shell = new FsShell(conf); final String[] shellargs = {"-rmr", null}; boolean hasnext = dstin.next(dstpath, dstfrom); for (; lsrin.next(lsrpath, lsrstatus); ) { int dst_cmp_lsr = dstpath.compareTo(lsrpath); for (; hasnext && dst_cmp_lsr < 0; ) { hasnext = dstin.next(dstpath, dstfrom); dst_cmp_lsr = dstpath.compareTo(lsrpath); } if (dst_cmp_lsr == 0) { // lsrpath exists in dst, skip it hasnext = dstin.next(dstpath, dstfrom); } else { // lsrpath does not exist, delete it String s = new Path(dstroot.getPath(), lsrpath.toString()).toString(); if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) { shellargs[1] = s; int r = 0; try { r = shell.run(shellargs); } catch (Exception e) { throw new IOException("Exception from shell.", e); } if (r != 0) { throw new IOException( "\"" + shellargs[0] + " " + shellargs[1] + "\" returns non-zero value " + r); } } } } } finally { checkAndClose(lsrin); checkAndClose(dstin); } }