@Override
public void reduce(
    Text key,
    Iterator<Text> values,
    OutputCollector<Text, Text> output,
    Reporter reporter)
    throws IOException {
  String line = "";
  String header = "";
  TreeMap<String, String> ciudades = new TreeMap<String, String>();
  // Read the values and load them into a TreeMap so they are sorted by city
  while (values.hasNext()) {
    String[] tmp = values.next().toString().split(",");
    String ciudad = tmp[0];
    String mes = tmp[1];
    String temperatura = tmp[2];
    String fecha = tmp[3];
    ciudades.put(ciudad, mes + "," + temperatura + "," + fecha);
  }
  // Walk the cities and build the output line
  for (String ciudad : ciudades.keySet()) {
    header += ciudad + ",,";
    String[] temporal = ciudades.get(ciudad).split(",");
    line += temporal[2] + "," + temporal[1] + ",";
  }
  if (c == 0) {
    // Print the header row once
    output.collect(new Text("Año,"), new Text(header));
    c++;
  }
  output.collect(new Text(key.toString() + ","), new Text(line));
}
void search(
    Vector<Star> v1,
    Vector<Star> v2,
    BlockIDWritable key,
    OutputCollector<BlockIDWritable, PairWritable> output)
    throws IOException {
  for (int i = 0; i < v1.size(); i++) {
    for (int j = 0; j < v2.size(); j++) {
      Star star1 = v1.get(i);
      Star star2 = v2.get(j);
      // skip pairs where both stars are replicated margin copies of another block
      if (star1.margin && star2.margin) continue;

      double dist = star1.x * star2.x + star1.y * star2.y + star1.z * star2.z;
      if (dist > costheta) {
        p.set(star1, star2, dist);
        output.collect(key, p);
        p.set(star2, star1, dist);
        output.collect(key, p);
        // num += 2;
      }
    }
  } // end for i, j
}
public void map(
    LongWritable key,
    Text value,
    OutputCollector<IntWritable, DoubleWritable> output,
    Reporter reporter)
    throws IOException {
  /*
   * Implements the mapper. It outputs the weight IDs and the updated weights.
   *
   * Note that the intermediate output format is <IntWritable, DoubleWritable>:
   * the key is the weight ID (an integer) and the value is the weight's value (a double).
   */
  inputData = value.toString();

  // go through the process
  initialize();
  getposphase();
  getnegphase();
  update();

  // output the intermediate data
  // The <key, value> pairs are <weightID, weightUpdate>
  double[][] vishidinc_array = vishidinc.getArray();
  for (int i = 0; i < numdims; i++) {
    for (int j = 0; j < numhid; j++) {
      weightPos.set(i * numhid + j);
      weightValue.set(vishidinc_array[i][j]);
      output.collect(weightPos, weightValue);
    }
  }
}
// specify input and output keys
public void map(
    LongWritable key,
    Text value,
    OutputCollector<Text, Text> output,
    Reporter reporter)
    throws IOException {
  String line = value.toString(); // the input line as a string
  ArrayList<Integer> range = new ArrayList<Integer>();
  for (int i = 2000; i <= 2010; i++) {
    range.add(i);
  }
  // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
  String[] inputs = line.split(",");
  try {
    int year = Integer.parseInt(inputs[165]);
    if (range.contains(year)) {
      String dur = inputs[3];
      String artist_name = inputs[2];
      String song_title = inputs[1];
      String final_input = artist_name + ',' + dur + ',' + song_title;
      Final_Value.set(final_input);
      output.collect(Final_Value, dummy);
    }
  } catch (NumberFormatException e) {
    // ignore rows where the year field is not a number
  }
}
public void reduce(
    DoubleWritable key,
    Iterator<DoubleWritable> values,
    OutputCollector<DoubleWritable, Text> output,
    Reporter reporter)
    throws IOException {
  output.collect(key, new Text(values.next().toString() + " - "));
}
/**
 * Takes in (id, node) pairs and emits them right back out.
 *
 * @param key the node ID
 * @param value the WikiPage (node) object
 * @param output an OutputCollector that collects (id, node) pairs
 * @param reporter default reporter object
 */
public void map(
    IntWritable key,
    WikiPage value,
    OutputCollector<IntWritable, WikiPage> output,
    Reporter reporter)
    throws IOException {
  output.collect(key, value);
}
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter)
    throws IOException {
  String line = value.toString();
  StringTokenizer tokenizer = new StringTokenizer(line);
  while (tokenizer.hasMoreTokens()) {
    word.set(tokenizer.nextToken());
    output.collect(word, one);
  }
}
@Override
public void reduce(
    IntWritable key,
    Iterator<DoubleWritable> values,
    OutputCollector<IntWritable, DoubleWritable> output,
    Reporter reporter)
    throws IOException {
  output.collect(key, values.next());
}
public void reduce(
    Text key,
    Iterator<CrawlDatum> values,
    OutputCollector<Text, CrawlDatum> output,
    Reporter reporter)
    throws IOException {
  boolean oldSet = false;
  boolean injectedSet = false;
  while (values.hasNext()) {
    CrawlDatum val = values.next();
    if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
      injected.set(val);
      injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
      injectedSet = true;
    } else {
      old.set(val);
      oldSet = true;
    }
  }

  CrawlDatum res = null;

  /*
   * Whether to overwrite, ignore or update existing records.
   *
   * @see https://issues.apache.org/jira/browse/NUTCH-1405
   */
  // Injected record already exists and overwrite but not update
  if (injectedSet && oldSet && overwrite) {
    res = injected;
    if (update) {
      LOG.info(key.toString() + " overwritten with injected record but update was specified.");
    }
  }

  // Injected record already exists and update but not overwrite
  if (injectedSet && oldSet && update && !overwrite) {
    res = old;
    old.putAllMetaData(injected);
    old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
    old.setFetchInterval(
        injected.getFetchInterval() != interval
            ? injected.getFetchInterval()
            : old.getFetchInterval());
  }

  // Old default behaviour
  if (injectedSet && !oldSet) {
    res = injected;
  } else {
    res = old;
  }
  output.collect(key, res);
}
public void map(
    BytesWritable key,
    BytesWritable value,
    OutputCollector<BytesWritable, IntWritable> output,
    Reporter reporter)
    throws IOException {
  // newKey = (key, value)
  BytesWritable keyValue = new BytesWritable(pair(key, value));
  // output (newKey, value)
  output.collect(keyValue, this.value);
}
public void reduce(
    Text key,
    Iterator<IntWritable> values,
    OutputCollector<Text, IntWritable> output,
    Reporter reporter)
    throws IOException {
  int sum = 0;
  while (values.hasNext()) {
    sum += values.next().get();
  }
  output.collect(key, new IntWritable(sum));
}
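/*
 * Illustrative driver sketch (an addition for clarity, not part of the original snippets):
 * a tokenizing mapper like the one above and this summing reducer form the classic
 * word-count pair for the old org.apache.hadoop.mapred API. Wiring them into a job might
 * look roughly like the following; WordCount, WordCountMapper and WordCountReducer are
 * hypothetical class names assumed for the example.
 */
public static void main(String[] args) throws IOException {
  JobConf conf = new JobConf(WordCount.class); // hypothetical enclosing job class
  conf.setJobName("wordcount");

  conf.setOutputKeyClass(Text.class); // matches the reducer's output key type
  conf.setOutputValueClass(IntWritable.class); // matches the reducer's output value type

  conf.setMapperClass(WordCountMapper.class); // hypothetical mapper class
  conf.setCombinerClass(WordCountReducer.class); // the summing reducer also works as a combiner
  conf.setReducerClass(WordCountReducer.class); // hypothetical reducer class

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}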
public void map(
    LongWritable key,
    Text value,
    OutputCollector<IntWritable, Text> output,
    Reporter reporter)
    throws IOException {
  String arr[] = value.toString().split("\\r?\\n");
  for (String row : arr) {
    // skip rows that begin with a double quote
    if (row.startsWith("\"")) {
      continue;
    }
    String parts[] = row.split(",");
    output.collect(new IntWritable(Integer.parseInt(parts[1])), new Text(parts[4]));
  }
}
public void map(
    LongWritable key,
    Text value,
    OutputCollector<DoubleWritable, DoubleWritable> output,
    Reporter reporter)
    throws IOException {
  String line = value.toString();
  DoubleWritable clave = new DoubleWritable();
  DoubleWritable valor = new DoubleWritable();
  // emit (number, sqrt(number)) for each input line
  clave.set(Double.parseDouble(line));
  valor.set(Math.sqrt(Double.parseDouble(line)));
  output.collect(clave, valor);
}
public void map(
    LongWritable key,
    Text value,
    OutputCollector<IntWritable, IntWritable> output,
    Reporter reporter)
    throws IOException {
  String line = value.toString();
  IntWritable clave = new IntWritable();
  IntWritable valor = new IntWritable();
  // emit (number, number + 1) for each input line
  clave.set(Integer.parseInt(line));
  valor.set(Integer.parseInt(line) + 1);
  output.collect(clave, valor);
}
public void map(
    LongWritable key,
    Text value,
    OutputCollector<Text, IntWritable> output,
    Reporter reporter)
    throws IOException {
  String line = value.toString();
  String lastToken = null;
  StringTokenizer s = new StringTokenizer(line, "\t");
  String year = s.nextToken();
  // the average price is the last tab-separated field on the line
  while (s.hasMoreTokens()) {
    lastToken = s.nextToken();
  }
  int averagePrice = Integer.parseInt(lastToken);
  output.collect(new Text(year), new IntWritable(averagePrice));
}
public void reduce(
    Text key,
    Iterator<IntWritable> value,
    OutputCollector<Text, IntWritable> output,
    Reporter reporter)
    throws IOException {
  // emit only the values that exceed the maximum-average threshold
  int maxAvg = 30;
  int val = Integer.MIN_VALUE;
  while (value.hasNext()) {
    if ((val = value.next().get()) > maxAvg) {
      output.collect(key, new IntWritable(val));
    }
  }
}
@Override
public void map(
    LongWritable key,
    Text value,
    OutputCollector<Text, Text> output,
    Reporter reporter)
    throws IOException {
  String line = value.toString();
  StringTokenizer tokens = new StringTokenizer(line);
  String[] keys = tokens.nextToken().toString().split("-");
  String date_temp = tokens.nextToken();
  String country = keys[0];
  String year = keys[1];
  // We emit the year and an iterable of [city,month,temperature,date]
  output.collect(new Text(year), new Text(country + "," + date_temp));
}
/** Implements the map-method of the Mapper-interface. */
@Override
public void map(
    LongWritable key,
    Text value,
    OutputCollector<Text, IntWritable> output,
    Reporter reporter)
    throws IOException {
  String line = value.toString();
  StringTokenizer tokenizer = new StringTokenizer(line);
  while (tokenizer.hasMoreTokens()) {
    String token = tokenizer.nextToken();
    // only count words longer than three characters
    if (token.length() > 3) {
      word.set(token);
      output.collect(word, one);
    }
  }
}
public void reduce(
    IntWritable key,
    Iterator<RecordStatsWritable> values,
    OutputCollector<IntWritable, RecordStatsWritable> output,
    Reporter reporter)
    throws IOException {
  long bytes = 0;
  long records = 0;
  int xor = 0;
  while (values.hasNext()) {
    RecordStatsWritable stats = values.next();
    bytes += stats.getBytes();
    records += stats.getRecords();
    xor ^= stats.getChecksum();
  }
  output.collect(key, new RecordStatsWritable(bytes, records, xor));
}
public void map(
    LongWritable key,
    Point value,
    OutputCollector<LongWritable, Point> output,
    Reporter reporter)
    throws IOException {
  // assign the point to the nearest center (smallest sum of squared distances)
  double min = value.sumOfSquares(centers.get(0));
  int best = 0;
  for (int index = 1; index < numberOfCenters; ++index) {
    double current = value.sumOfSquares(centers.get(index));
    if (current < min) {
      min = current;
      best = index;
    }
  }
  reporter.incrCounter("NUMBER", "NODES", 1);
  reporter.incrCounter("CENTER", "" + best, 1);
  output.collect(new LongWritable(best), value);
}
public void map(
    LongWritable key,
    Text value,
    OutputCollector<IntWritable, DoubleWritable> output,
    Reporter reporter)
    throws IOException {
  String line = value.toString();
  StringTokenizer tokenizer = new StringTokenizer(line);
  int rowIdx = 0;
  double xValue = 0;
  if (tokenizer.hasMoreTokens()) {
    rowIdx = Integer.parseInt(tokenizer.nextToken());
    xValue = Double.parseDouble(tokenizer.nextToken());
  }
  // updated x value for this row: (res - sum * x) / diagonal
  double xResult = (resVec[rowIdx] - (sumVec[rowIdx] * xValue)) / diaVec[rowIdx];
  output.collect(new IntWritable(rowIdx), new DoubleWritable(xResult));
}
public void reduce(
    IntWritable key,
    Iterator<Text> values,
    OutputCollector<IntWritable, Text> output,
    Reporter reporter)
    throws IOException {
  HashMap<String, Integer> countries_map = new HashMap<String, Integer>();
  ArrayList<Integer> counties = new ArrayList<>();
  String cp = new String();
  // count occurrences of each value
  while (values.hasNext()) {
    cp = values.next().toString();
    if (countries_map.containsKey(cp)) {
      countries_map.put(cp, countries_map.get(cp) + 1);
    } else {
      countries_map.put(cp, 1);
    }
  }
  for (java.util.Map.Entry<String, Integer> entry : countries_map.entrySet()) {
    counties.add(entry.getValue());
  }
  // emit: distinct count, min, median, max, mean and standard deviation of the counts
  output.collect(
      key,
      new Text(
          ""
              + countries_map.entrySet().size()
              + " "
              + Collections.min(counties)
              + " "
              + median(counties)
              + " "
              + Collections.max(counties)
              + " "
              + mean(counties)
              + " "
              + standard_deviation(counties)));
}
@SuppressWarnings("unchecked") public void map( WritableComparable key, Writable value, OutputCollector<IntWritable, RecordStatsWritable> output, Reporter reporter) throws IOException { // Set up rawKey and rawValue on the first call to 'map' if (recordId == -1) { rawKey = createRaw(key.getClass()); rawValue = createRaw(value.getClass()); } ++recordId; if (this.key == sortOutput) { // Check if keys are 'sorted' if this // record is from sort's output if (prevKey == null) { prevKey = key; keyClass = prevKey.getClass(); } else { // Sanity check if (keyClass != key.getClass()) { throw new IOException( "Type mismatch in key: expected " + keyClass.getName() + ", recieved " + key.getClass().getName()); } // Check if they were sorted correctly if (prevKey.compareTo(key) > 0) { throw new IOException( "The 'map-reduce' framework wrongly" + " classifed (" + prevKey + ") > (" + key + ") " + "for record# " + recordId); } prevKey = key; } // Check if the sorted output is 'partitioned' right int keyPartition = partitioner.getPartition(key, value, noSortReducers); if (partition != keyPartition) { throw new IOException( "Partitions do not match for record# " + recordId + " ! - '" + partition + "' v/s '" + keyPartition + "'"); } } // Construct the record-stats and output (this.key, record-stats) byte[] keyBytes = rawKey.getRawBytes(key); int keyBytesLen = rawKey.getRawBytesLength(key); byte[] valueBytes = rawValue.getRawBytes(value); int valueBytesLen = rawValue.getRawBytesLength(value); int keyValueChecksum = (WritableComparator.hashBytes(keyBytes, keyBytesLen) ^ WritableComparator.hashBytes(valueBytes, valueBytesLen)); output.collect( this.key, new RecordStatsWritable((keyBytesLen + valueBytesLen), 1, keyValueChecksum)); }
public void map(
    WritableComparable<?> key,
    Text value,
    OutputCollector<Text, CrawlDatum> output,
    Reporter reporter)
    throws IOException {
  String url = value.toString(); // value is a line of text

  if (url != null && url.trim().startsWith("#")) {
    /* Ignore lines that start with # */
    return;
  }

  // if tabs: metadata that could be stored
  // must be name=value and separated by \t
  float customScore = -1f;
  int customInterval = interval;
  int fixedInterval = -1;
  Map<String, String> metadata = new TreeMap<String, String>();
  if (url.indexOf("\t") != -1) {
    String[] splits = url.split("\t");
    url = splits[0];
    for (int s = 1; s < splits.length; s++) {
      // find separation between name and value
      int indexEquals = splits[s].indexOf("=");
      if (indexEquals == -1) {
        // skip anything without a =
        continue;
      }
      String metaname = splits[s].substring(0, indexEquals);
      String metavalue = splits[s].substring(indexEquals + 1);
      if (metaname.equals(nutchScoreMDName)) {
        try {
          customScore = Float.parseFloat(metavalue);
        } catch (NumberFormatException nfe) {
          // ignore an unparsable score and keep the default
        }
      } else if (metaname.equals(nutchFetchIntervalMDName)) {
        try {
          customInterval = Integer.parseInt(metavalue);
        } catch (NumberFormatException nfe) {
          // ignore an unparsable interval and keep the default
        }
      } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
        try {
          fixedInterval = Integer.parseInt(metavalue);
        } catch (NumberFormatException nfe) {
          // ignore an unparsable fixed interval
        }
      } else {
        metadata.put(metaname, metavalue);
      }
    }
  }
  try {
    url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
    url = filters.filter(url); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    url = null;
  }
  if (url == null) {
    reporter.getCounter("injector", "urls_filtered").increment(1);
  } else {
    // if it passes
    value.set(url); // collect it
    CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_INJECTED);

    // Is the interval custom? Then set it as metadata
    if (fixedInterval > -1) {
      // Set writable using float. Float is used by AdaptiveFetchSchedule
      datum
          .getMetaData()
          .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
      datum.setFetchInterval(fixedInterval);
    } else {
      datum.setFetchInterval(customInterval);
    }
    datum.setFetchTime(curTime);

    // now add the metadata
    Iterator<String> keysIter = metadata.keySet().iterator();
    while (keysIter.hasNext()) {
      String keymd = keysIter.next();
      String valuemd = metadata.get(keymd);
      datum.getMetaData().put(new Text(keymd), new Text(valuemd));
    }
    if (customScore != -1) {
      datum.setScore(customScore);
    } else {
      datum.setScore(scoreInjected);
    }
    try {
      scfilters.injectedScore(value, datum);
    } catch (ScoringFilterException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn(
            "Cannot filter injected score for url "
                + url
                + ", using default ("
                + e.getMessage()
                + ")");
      }
    }
    reporter.getCounter("injector", "urls_injected").increment(1);
    output.collect(value, datum);
  }
}
public void reduce(
    BlockIDWritable key,
    Iterator<PairWritable> values,
    OutputCollector<BlockIDWritable, PairWritable> output,
    Reporter reporter)
    throws IOException {
  // Vector<Star> starV = new Vector<Star>();
  int buketsizeX = 0;
  int buketsizeY = 0;
  double bwidth = maxAlphas[key.zoneNum]; // ra, x
  double bheight = theta; // dec, y
  /* add 10 more in each dimension to make sure there is no overflow. */
  Vector<Star>[][] arrstarV =
      new Vector[((int) (zoneHeight / bheight)) + 10]
          [((int) (blockWidth / bwidth)) + 10]; // create bucket vector[Y][X]
  int num = 0;
  while (values.hasNext()) {
    num++;
    Star s = values.next().get(0); // only the first star of the pair is populated by the mapper

    double posx = (s.ra - blockRanges[key.raNum][0]) / bwidth;
    int x = (int) posx + 1; // shift by 1 in case the star comes from another block
    double posy = (s.dec - zoneRanges[key.zoneNum][0]) / bheight;
    int y = (int) posy + 1;

    // track the largest bucket indexes used
    if (buketsizeX < x) buketsizeX = x;
    if (buketsizeY < y) buketsizeY = y;
    // create the bucket on demand
    if (arrstarV[y][x] == null)
      // TODO avoid creating vectors here.
      arrstarV[y][x] = new Vector<Star>();
    // put the star into its bucket
    arrstarV[y][x].add(s);
  }

  // start reducer
  int i, j, row, col;
  // for each bucket
  for (row = 0; row <= buketsizeY; row++) {
    for (col = 0; col <= buketsizeX; col++) {
      // starV.clear();
      // construct a new vector to do compare
      // TODO we need to avoid searching objects in the border.
      if (arrstarV[row][col] != null) {
        // old method to generate output
        for (i = 0; i < arrstarV[row][col].size(); i++) {
          for (j = i + 1; j < arrstarV[row][col].size(); j++) {
            Star star1 = arrstarV[row][col].get(i);
            Star star2 = arrstarV[row][col].get(j);
            // skip pairs where both stars are replicated margin copies
            if (star1.margin && star2.margin) continue;

            double dist = star1.x * star2.x + star1.y * star2.y + star1.z * star2.z;
            if (dist > costheta) {
              p.set(star1, star2, dist);
              output.collect(key, p);
              p.set(star2, star1, dist);
              output.collect(key, p);
              // num += 2;
            }
          }
        } // end for i,j
      } // end if
      else {
        continue;
      }
      // 4 more neighbors
      // right upper: arrstarV[row-1][col+1] vs arrstarV[row][col]
      if (row != 0 && arrstarV[row - 1][col + 1] != null) {
        search(arrstarV[row][col], arrstarV[row - 1][col + 1], key, output);
      }
      // right: arrstarV[row][col+1] vs arrstarV[row][col]
      if (arrstarV[row][col + 1] != null) {
        search(arrstarV[row][col], arrstarV[row][col + 1], key, output);
      }
      // right lower
      if (arrstarV[row + 1][col + 1] != null) {
        search(arrstarV[row][col], arrstarV[row + 1][col + 1], key, output);
      }
      // lower
      if (arrstarV[row + 1][col] != null) {
        search(arrstarV[row][col], arrstarV[row + 1][col], key, output);
      } // end if
    } // end column
  } // end row
}
public void map(
    LongWritable key,
    Star value,
    OutputCollector<BlockIDWritable, PairWritable> output,
    Reporter reporter)
    throws IOException {
  loc.set(value.ra, value.dec);
  int zoneNum = loc.zoneNum;
  int raNum = loc.raNum;
  p.set(value, null);

  /*
   * When the block size increases (> theta), only part of a block
   * needs to be copied to its neighbor.
   */
  output.collect(loc, p);

  /*
   * only replicate objects in the border of a block. I expect most of
   * the objects don't need to be copied.
   */
  if (value.dec > zoneRanges[zoneNum][0] + theta
      && value.dec < zoneRanges[zoneNum][1] - theta
      && value.ra > blockRanges[raNum][0] + maxAlphas[zoneNum]
      && value.ra < blockRanges[raNum][1] - maxAlphas[zoneNum]) return;

  /*
   * the code below copies the star to some neighbors. We only need to
   * copy an object to the bottom, left, left-bottom and left-top neighbors.
   */
  value.margin = true;

  /*
   * we should treat the entire zone 0 as a block, so we only need to
   * copy some objects at the corner to their neighbors
   */
  if (loc.zoneNum == 0) {
    /* copy the object to the right top neighbor */
    if (value.ra >= blockRanges[raNum][1] - maxAlphas[zoneNum]
        && value.ra <= blockRanges[raNum][1]
        && value.dec >= zoneRanges[zoneNum][1] - theta
        && value.dec <= zoneRanges[zoneNum][1]) {
      // BlockIDWritable loc1 = new BlockIDWritable();
      /* raNum of objects in zone 0 is always 0,
       * we need to recalculate it. */
      // loc1.raNum = BlockIDWritable.ra2Num(value.ra) + 1;
      // if (loc1.raNum == numBlocks) {
      //   loc1.raNum = 0;
      //   value.ra -= 360;
      // }
      // loc1.zoneNum = loc.zoneNum + 1;
      // output.collect(loc1, p);
    }
    return;
  } else if (loc.zoneNum == numZones - 1) {
    /* copy the object to the bottom neighbor */
    if (value.dec >= zoneRanges[zoneNum][0] && value.dec <= zoneRanges[zoneNum][0] + theta) {
      /* raNum of objects in zone zoneNum - 1 is always 0,
       * we need to recalculate it. */
      loc1.raNum = BlockIDWritable.ra2Num(value.ra);
      loc1.zoneNum = loc.zoneNum - 1;
      output.collect(loc1, p);

      /* copy the object to the right bottom neighbor */
      while (value.ra >= blockRanges[loc1.raNum][1] - maxAlphas[zoneNum]
          && value.ra <= blockRanges[loc1.raNum][1]) {
        loc1.raNum++;
        if (loc1.raNum == numBlocks) {
          loc1.raNum = 0;
          value.ra -= 360;
        }
        loc1.zoneNum = loc.zoneNum - 1;
        output.collect(loc1, p);
      }
    }
    return;
  }

  boolean wrap = false;
  loc1.raNum = loc.raNum;
  /* copy the object to the right neighbor */
  while (value.ra >= blockRanges[loc1.raNum][1] - maxAlphas[zoneNum]
      && value.ra <= blockRanges[loc1.raNum][1]) {
    loc1.raNum++;
    loc1.zoneNum = loc.zoneNum;
    /*
     * when the object is copied to the right neighbor, we need to be
     * careful: we need to convert ra and raNum if ra is close to 360.
     */
    if (loc1.raNum == numBlocks) {
      loc1.raNum = 0;
      value.ra -= 360;
      wrap = true;
    }
    output.collect(loc1, p);

    /* copy the object to the right bottom neighbor */
    if (value.dec >= zoneRanges[zoneNum][0] && value.dec <= zoneRanges[zoneNum][0] + theta) {
      loc1.zoneNum = loc.zoneNum - 1;
      output.collect(loc1, p);
    }
    /* copy the object to the right top neighbor */
    if (value.dec >= zoneRanges[zoneNum][1] - theta && value.dec <= zoneRanges[zoneNum][1]) {
      loc1.zoneNum = loc.zoneNum + 1;
      output.collect(loc1, p);
    }
  }
  if (wrap) {
    value.ra += 360;
  }

  /* copy the object to the bottom neighbor */
  if (value.dec >= zoneRanges[zoneNum][0] && value.dec <= zoneRanges[zoneNum][0] + theta) {
    loc1.raNum = loc.raNum;
    loc1.zoneNum = loc.zoneNum - 1;
    if (loc1.zoneNum == 0) loc1.raNum = 0;
    output.collect(loc1, p);
  }
}
public void reduce(
    Text key,
    Iterator<Text> values,
    OutputCollector<Text, Text> output,
    Reporter reporter)
    throws IOException {
  List<String[]> _temp = new ArrayList<String[]>();
  int count = 0;
  while (values.hasNext()) {
    Text _out = values.next();
    String[] tokens = StringUtils.splitPreserveAllTokens(_out.toString(), TAB);
    _temp.add(tokens);
    if (count++ > 100000) break;
  }

  // for very large groups, only report the number of distinct IPs and the IP list
  if (count > 10000) {
    Set<String> ipSet = new HashSet<String>();
    for (int posI = 0; posI < _temp.size(); posI++) {
      String[] array = _temp.get(posI);
      if (array == null) continue;
      String mid = array[2];
      String ip = array[3];
      ipSet.add(ip);
    }
    output.collect(
        key, Utils.mergeKey(String.valueOf(ipSet.size()), StringUtils.join(ipSet, '|')));
    return;
  }

  /*
   * Grouping algorithm:
   * FOREACH ALL_DATA
   *   IF IN INDEX THEN UPDATE INDEX AND INSERT DATA
   *   ELSE FOREACH SUB_DATA MAKE INDEX AND SET FOUND DATA TO NULL
   */
  // List<List<String[]>> dataList = new ArrayList<List<String[]>>();
  List<StringBuffer> indexList = new ArrayList<StringBuffer>();
  Set<String> ipSet = new HashSet<String>();
  boolean muliHost = false;
  for (int posI = 0; posI < _temp.size(); posI++) {
    String[] array = _temp.get(posI);
    if (array == null) continue;
    String mid = array[2];
    String ip = array[3];
    ipSet.add(ip);
    boolean hasIndex = false;
    for (int i = 0; i < indexList.size(); i++) {
      StringBuffer index = indexList.get(i);
      if (index.indexOf("|" + mid + "|") >= 0 || index.indexOf("|" + ip + "|") >= 0) {
        if (index.indexOf("|" + mid + "|") < 0) {
          index.append('|').append(mid).append('|');
        }
        if (index.indexOf("|" + ip + "|") < 0) {
          index.append('|').append(ip).append('|');
        }
        // dataList.get(i).add(array);
        hasIndex = true;
        break;
      }
    }
    if (!hasIndex) {
      StringBuffer index = new StringBuffer("|" + mid + "|" + ip + "|");
      // List<String[]> _tmp = new ArrayList<String[]>();
      // _tmp.add(array);
      for (int k = posI + 1; k < _temp.size(); k++) {
        String[] _newArray = _temp.get(k);
        if (_newArray == null) {
          continue;
        }
        String _mid = _newArray[2];
        String _ip = _newArray[3];
        if (index.indexOf("|" + _mid + "|") >= 0 || index.indexOf("|" + _ip + "|") >= 0) {
          if (index.indexOf("|" + _mid + "|") < 0) {
            index.append('|').append(_mid).append('|');
          }
          if (index.indexOf("|" + _ip + "|") < 0) {
            index.append('|').append(_ip).append('|');
          }
          // _tmp.add(_newArray);
          _temp.set(k, null);
        }
      }
      indexList.add(index);
      // dataList.add(_tmp);
    }
  }
  // for (String[] _array : _temp) {
  //   output.collect(key, Utils.mergeKey(_array[1], _array[2], _array[3], _array[4]));
  // }
  StringBuffer allIndex = new StringBuffer();
  for (StringBuffer index : indexList) {
    allIndex.append(index).append(';');
  }
  if (allIndex.length() > 0) {
    allIndex.deleteCharAt(allIndex.length() - 1);
  }
  output.collect(
      key, Utils.mergeKey(String.valueOf(indexList.size()), StringUtils.join(ipSet, '|')));
}
public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  // key the record by its first tab-separated column and pass the full line through
  String[] _allCols = StringUtils.splitPreserveAllTokens(value.toString(), TAB);
  output.collect(new Text(_allCols[0]), value);
}
@Override
public void reduce(
    IntWritable key,
    Iterator<ClusterWritable> values,
    OutputCollector<IntWritable, Text> output,
    Reporter reporter)
    throws IOException {
  float sumSimilarity = 0.0f;
  int numMovies = 0;
  float avgSimilarity = 0.0f;
  float similarity = 0.0f;
  int s = 0;
  int count;
  float diff = 0.0f;
  float minDiff = 1.0f;
  int candidate = 0;
  String data = new String("");
  String shortline = new String("");
  ArrayList<String> arrl = new ArrayList<String>();
  ArrayList<Float> simArrl = new ArrayList<Float>();
  String oneElm = new String();
  int indexShort, index2;
  Text val = new Text();
  while (values.hasNext()) {
    ClusterWritable cr = (ClusterWritable) values.next();
    similarity = cr.similarity;
    simArrl.addAll(cr.similarities);
    for (int i = 0; i < cr.movies.size(); i++) {
      oneElm = cr.movies.get(i);
      // truncate long review lists to avoid memory errors caused by long arrays;
      // this makes the result less accurate
      indexShort = oneElm.indexOf(",", 1000);
      if (indexShort == -1) {
        shortline = new String(oneElm);
      } else {
        shortline = new String(oneElm.substring(0, indexShort));
      }
      arrl.add(shortline);
      output.collect(key, new Text(oneElm));
    }
    numMovies += cr.movies.size();
    sumSimilarity += similarity;
  }
  if (numMovies > 0) {
    avgSimilarity = sumSimilarity / (float) numMovies;
  }
  // pick the movie whose similarity is closest to the cluster average
  diff = 0.0f;
  minDiff = 1.0f;
  for (s = 0; s < numMovies; s++) {
    diff = (float) Math.abs(avgSimilarity - simArrl.get(s));
    if (diff < minDiff) {
      minDiff = diff;
      candidate = s;
    }
  }
  data = arrl.get(candidate);
  index2 = data.indexOf(":");
  String movieStr = data.substring(0, index2);
  String reviews = data.substring(index2 + 1);
  StringTokenizer token = new StringTokenizer(reviews, ",");
  count = 0;
  while (token.hasMoreTokens()) {
    token.nextToken();
    count++;
  }
  System.out.println(
      "The key = "
          + key.toString()
          + " has members = "
          + numMovies
          + " simil = "
          + simArrl.get(candidate));
  val = new Text(simArrl.get(candidate) + " " + movieStr + " " + count + " " + reviews);
  output.collect(key, val);
  reporter.incrCounter(Counter.VALUES, 1);
}
public void map(
    LongWritable key,
    Text value,
    OutputCollector<IntWritable, ClusterWritable> output,
    Reporter reporter)
    throws IOException {
  String movieIdStr = new String();
  String reviewStr = new String();
  String userIdStr = new String();
  String reviews = new String();
  String line = new String();
  String tok = new String("");
  long movieId;
  int review, userId, p, q, r, rater, rating, movieIndex;
  int clusterId = 0;
  int[] n = new int[maxClusters];
  float[] sq_a = new float[maxClusters];
  float[] sq_b = new float[maxClusters];
  float[] numer = new float[maxClusters];
  float[] denom = new float[maxClusters];
  float max_similarity = 0.0f;
  float similarity = 0.0f;
  Cluster movie = new Cluster();
  ClusterWritable movies_arrl = new ClusterWritable();
  StringBuffer sb = new StringBuffer();

  line = ((Text) value).toString();
  movieIndex = line.indexOf(":");
  for (r = 0; r < maxClusters; r++) {
    numer[r] = 0.0f;
    denom[r] = 0.0f;
    sq_a[r] = 0.0f;
    sq_b[r] = 0.0f;
    n[r] = 0;
  }
  if (movieIndex > 0) {
    movieIdStr = line.substring(0, movieIndex);
    sb.append(movieIdStr).append(":");
    movieId = Long.parseLong(movieIdStr);
    movie.movie_id = movieId;
    reviews = line.substring(movieIndex + 1);
    StringTokenizer token = new StringTokenizer(reviews, ",");
    int attrCnt = 0;
    // while (token.hasMoreTokens()) { Leo
    while (token.hasMoreTokens() && attrCnt < attrNum) {
      tok = token.nextToken();
      int reviewIndex = tok.indexOf("_");
      // userIdStr = tok.substring(0, reviewIndex); // Leo
      userIdStr = String.valueOf(attrCnt);
      reviewStr = tok.substring(reviewIndex + 1);
      if (attrCnt > 0) {
        sb.append(",");
      }
      sb.append(String.valueOf(attrCnt)).append("_").append(reviewStr);
      userId = Integer.parseInt(userIdStr);
      review = Integer.parseInt(reviewStr);
      for (r = 0; r < totalClusters; r++) {
        /*
        for (q = 0; q < centroids_ref[r].total; q++) {
          rater = centroids_ref[r].reviews.get(q).rater_id;
          rating = (int) centroids_ref[r].reviews.get(q).rating;
          if (userId == rater) {
            numer[r] += (float) (review * rating);
            sq_a[r] += (float) (review * review);
            sq_b[r] += (float) (rating * rating);
            n[r]++; // counter
            break; // to avoid multiple ratings by the same reviewer
          }
        }
        */
        // Leo
        rating = (int) centroids_ref[r].reviews.get(attrCnt).rating;
        numer[r] += (float) ((review - rating) * (review - rating));
        n[r]++; // counter
      }
      attrCnt++;
    }
    for (p = 0; p < totalClusters; p++) {
      /*
      denom[p] = (float) ((Math.sqrt((double) sq_a[p])) * (Math.sqrt((double) sq_b[p])));
      if (denom[p] > 0) {
        similarity = numer[p] / denom[p];
        if (similarity > max_similarity) {
          max_similarity = similarity;
          clusterId = p;
        }
      }
      */
      // Leo
      similarity = 250 - numer[p];
      if (similarity > max_similarity) {
        max_similarity = similarity;
        clusterId = p;
      }
    }
    // movies_arrl.movies.add(line); // Leo
    movies_arrl.movies.add(sb.toString());
    movies_arrl.similarities.add(max_similarity);
    movies_arrl.similarity = max_similarity;
    output.collect(new IntWritable(clusterId), movies_arrl);
    reporter.incrCounter(Counter.WORDS, 1);
  }
}