/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    String[] items = value.toString().split(fieldDelimRegex);
    srcEntityId = items[0];
    trgEntityId = items[1];
    rank = Integer.parseInt(items[items.length - 1]);

    outKey.initialize();
    if (recordInOutput) {
        // include source and target record
        if (recLength == -1) {
            recLength = (items.length - 3) / 2;
            srcRecBeg = 2;
            srcRecEnd = trgRecBeg = 2 + recLength;
            trgRecEnd = trgRecBeg + recLength;
        }
        srcRec = org.chombo.util.Utility.join(items, srcRecBeg, srcRecEnd, fieldDelim);
        trgRec = org.chombo.util.Utility.join(items, trgRecBeg, trgRecEnd, fieldDelim);
        outKey.add(srcEntityId, srcRec, rank);
        outVal.set(trgEntityId + fieldDelim + trgRec + fieldDelim + items[items.length - 1]);
    } else {
        // only target entity id and distance
        outKey.add(srcEntityId, rank);
        outVal.set(trgEntityId + fieldDelim + items[items.length - 1]);
    }
    context.write(outKey, outVal);
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    String[] items = value.toString().split(fieldDelim);
    eventType = Integer.parseInt(items[2]);
    timeStamp = Long.parseLong(items[3]);

    // user ID, item ID, event
    keyOut.initialize();
    keyOut.add(items[0], items[1], eventType);
    valOut.initialize();
    valOut.add(eventType, timeStamp);
    context.write(keyOut, valOut);
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
protected void reduce(Tuple key, Iterable<Tuple> values, Context context)
        throws IOException, InterruptedException {
    sum = 0;
    count = 0;
    for (Tuple val : values) {
        count += val.getInt(0);
        sum += val.getInt(1);
    }
    avg = count > 0 ? sum / count : 0;

    stBld.delete(0, stBld.length());
    stBld.append(key.toString()).append(fieldDelim);
    stBld.append(count).append(fieldDelim).append(sum).append(fieldDelim).append(avg);
    outVal.set(stBld.toString());
    context.write(NullWritable.get(), outVal);
}
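/*
 * A minimal, self-contained sketch of the (count, sum) merge the reducer above performs, using
 * plain int arrays instead of chombo Tuples. The helper name mergeCountSum is hypothetical and
 * for illustration only; each pair is assumed to carry a running count in the first slot and a
 * running sum in the second, so column-wise addition yields the updated count, sum and average.
 */
static int[] mergeCountSum(int[][] countSumPairs) {
    int count = 0;
    int sum = 0;
    for (int[] pair : countSumPairs) {
        count += pair[0];   // accumulate counts
        sum += pair[1];     // accumulate sums
    }
    int avg = count > 0 ? sum / count : 0;
    return new int[] {count, sum, avg};
}

// Example: merging a previous aggregate (count 5, sum 40) with two new records of quantity 10 and 6:
// mergeCountSum(new int[][] {{5, 40}, {1, 10}, {1, 6}}) -> {7, 56, 8}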
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
protected void reduce(Tuple key, Iterable<Tuple> values, Context context)
        throws IOException, InterruptedException {
    if (stBld.length() > 0) {
        stBld.delete(0, stBld.length());
    }
    boolean first = true;
    count = 0;
    latestTimeStamp = 0;
    for (Tuple value : values) {
        eventType = value.getInt(0);
        timeStamp = value.getLong(1);
        if (first) {
            mostEngagingEventType = eventType;
            ++count;
            first = false;
        } else {
            // all occurrences of the first event type
            if (eventType == mostEngagingEventType) {
                ++count;
            }
        }

        // latest time stamp
        if (timeStamp > latestTimeStamp) {
            latestTimeStamp = timeStamp;
        }
    }

    rating = ratingMapper.scoreForEvent(mostEngagingEventType, count);
    stBld
        .append(key.getString(0))
        .append(fieldDelim)
        .append(key.getString(1))
        .append(fieldDelim)
        .append(rating)
        .append(fieldDelim)
        .append(latestTimeStamp);
    if (outputDetail) {
        stBld.append(fieldDelim).append(mostEngagingEventType).append(fieldDelim).append(count);
    }
    valOut.set(stBld.toString());
    context.write(NullWritable.get(), valOut);
}
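/*
 * The implicit rating above comes from ratingMapper.scoreForEvent(mostEngagingEventType, count),
 * whose implementation is configured elsewhere and not shown here. The sketch below is a purely
 * hypothetical mapping, only to illustrate the expected contract: more engaging event types and
 * repeated occurrences yield higher implicit ratings. The event codes and the 0..100 scale are
 * assumptions, not part of the source.
 */
static int scoreForEventSketch(int eventType, int count) {
    // assume smaller event codes mean more engaging events (e.g. 0 = purchase, 2 = page view)
    int[] baseScore = {80, 50, 20};
    int base = eventType < baseScore.length ? baseScore[eventType] : 10;
    // repeated occurrences of the same event nudge the score up, capped at 100
    return Math.min(100, base + 5 * (count - 1));
}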
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    items = value.toString().split(fieldDelimRegex);
    outKey.initialize();
    outVal.initialize();
    int initValue = 0;
    for (int i = 0; i < quantityAttr; ++i) {
        outKey.append(items[i]);
    }

    if (isAggrFileSplit) {
        if (items.length >= quantityAttr) {
            // existing aggregation
            outVal.add(Integer.parseInt(items[quantityAttr]), Integer.parseInt(items[quantityAttr + 1]));
        } else {
            // first aggregation
            outVal.add(initValue, initValue);
        }
    } else {
        // incoming data split: count of 1 and the quantity value
        outVal.add(1, Integer.parseInt(items[quantityAttr]));
    }
    context.write(outKey, outVal);
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
protected void reduce(TextInt key, Iterable<Tuple> values, Context context)
        throws IOException, InterruptedException {
    ratingCorrelations.clear();
    ++logCounter;
    ratingStat = null;

    for (Tuple value : values) {
        if (((Integer) value.get(value.getSize() - 1)) == 0) {
            // rating correlation
            ratingCorrelations.add(value.createClone());
            context.getCounter("Predictor", "Rating correlation").increment(1);
        } else if (((Integer) value.get(value.getSize() - 1)) == 1) {
            // rating stat
            ratingStat = value.createClone();
        } else {
            // user rating
            if (!ratingCorrelations.isEmpty()) {
                String userID = value.getString(0);
                rating = value.getInt(1);
                if (userRatingWithContext) {
                    ratingContext = value.getString(2);
                }

                // all rating correlations
                for (Tuple ratingCorrTup : ratingCorrelations) {
                    context.getCounter("Predictor", "User rating").increment(1);
                    itemID = ratingCorrTup.getString(0);
                    ratingCorr = ratingCorrTup.getInt(1);
                    weight = ratingCorrTup.getInt(2);
                    modifyCorrelation();
                    int predRating = linearCorrelation
                            ? (rating * ratingCorr) / maxRating
                            : (rating * correlationScale + ratingCorr) / maxRating;
                    if (predRating > 0) {
                        // userID, itemID, predicted rating, correlation length, correlation coeff,
                        // input rating std dev
                        ratingStdDev = ratingStat != null ? ratingStat.getInt(0) : -1;
                        if (userRatingWithContext) {
                            valueOut.set(userID + fieldDelim + itemID + fieldDelim + ratingContext
                                    + fieldDelim + predRating + fieldDelim + weight + fieldDelim
                                    + ratingCorr + fieldDelim + ratingStdDev);
                        } else {
                            valueOut.set(userID + fieldDelim + itemID + fieldDelim + predRating
                                    + fieldDelim + weight + fieldDelim + ratingCorr + fieldDelim
                                    + ratingStdDev);
                        }
                        context.write(NullWritable.get(), valueOut);
                        context.getCounter("Predictor", "Rating correlation").increment(1);
                    }
                }
            }
        }
    }
}
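/*
 * A standalone sketch of the rating prediction arithmetic used in the loop above, with the same
 * meaning for rating, ratingCorr, maxRating and correlationScale; the method name and the sample
 * values are illustrative only, and modifyCorrelation() (not shown) is assumed to have already
 * adjusted ratingCorr.
 */
static int predictRating(int rating, int ratingCorr, int maxRating, int correlationScale,
        boolean linearCorrelation) {
    return linearCorrelation
            // linear correlation scales the user's rating directly
            ? (rating * ratingCorr) / maxRating
            // non-linear correlation is first shifted onto the correlation scale
            : (rating * correlationScale + ratingCorr) / maxRating;
}

// Example: a user rating of 80 on a 0..100 scale and a linear correlation of 60 gives
// predictRating(80, 60, 100, 100, true) = 48 as the predicted rating for the correlated item.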
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    String[] items = value.toString().split(fieldDelim);
    String itemID = items[0];
    if (isRatingFileSplit) {
        // user rating
        context.getCounter("Record type count", "Rating").increment(1);
        boolean toInclude = true;
        for (int i = 1; i < items.length; ++i) {
            // all user ratings for this item
            ratings = items[i].split(subFieldDelim);

            // time sensitive recommendation
            toInclude = true;
            if (ratingTimeCutoff > 0) {
                timeStamp = Long.parseLong(ratings[2]);
                toInclude = timeStamp > ratingTimeCutoff;
            }

            // contextual recommendation
            if (userRatingWithContext) {
                ratingContext = ratings[3];
            }

            // check for min input rating threshold
            inputRating = Integer.parseInt(ratings[1]);
            toInclude = toInclude && inputRating > minInputRating;
            if (toInclude) {
                // itemID
                keyOut.set(itemID, two);

                // userID, rating
                valOut.initialize();
                if (userRatingWithContext) {
                    valOut.add(ratings[0], inputRating, ratingContext, two);
                } else {
                    valOut.add(ratings[0], inputRating, two);
                }
                context.write(keyOut, valOut);
            }
        }
    } else if (isRatingStatFileSplit) {
        // rating stat
        context.getCounter("Record type count", "Rating stat").increment(1);
        int ratingStdDev = Integer.parseInt(items[STD_DEV_ORD]);
        keyOut.set(itemID, one);
        valOut.initialize();
        valOut.add(ratingStdDev, one);
        context.write(keyOut, valOut);
    } else {
        // item correlation
        context.getCounter("Record type count", "Correlation").increment(1);
        correlation = Integer.parseInt(items[2]);
        correlationLength = Integer.parseInt(items[3]);

        // only if correlation is above min threshold
        if (correlation > minCorrelation) {
            // correlation of first item
            keyOut.set(items[0], zero);
            valOut.initialize();
            if (linearCorrelation) {
                // other itemID, correlation, intersection length (weight)
                valOut.add(items[1], correlation, correlationLength, zero);
            } else {
                // other itemID, correlation, intersection length (weight)
                valOut.add(items[1], -correlation, correlationLength, zero);
            }
            context.write(keyOut, valOut);

            // correlation of second item
            keyOut.set(items[1], zero);
            valOut.initialize();
            if (linearCorrelation) {
                // other itemID, correlation, intersection length (weight)
                valOut.add(items[0], correlation, correlationLength, zero);
            } else {
                // other itemID, correlation, intersection length (weight)
                valOut.add(items[0], -correlation, correlationLength, zero);
            }
            context.write(keyOut, valOut);
        }
    }
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
protected void reduce(Tuple key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    srcEntityId = key.getString(0);
    count = 0;
    boolean doEmitNeighbor = false;
    valueList.clear();
    for (Text value : values) {
        doEmitNeighbor = false;

        // count based neighbor
        if (nearestByCount) {
            doEmitNeighbor = true;
            if (++count >= topMatchCount) {
                doEmitNeighbor = false;
            }
        }

        // distance based neighbor
        if (nearestByDistance) {
            String[] items = value.toString().split(fieldDelim);
            distance = Integer.parseInt(items[items.length - 1]);
            if (distance <= topMatchDistance) {
                if (!nearestByCount) {
                    doEmitNeighbor = true;
                }
            } else {
                doEmitNeighbor = false;
            }
        }

        if (doEmitNeighbor) {
            // along with neighbors
            if (compactOutput) {
                if (recordInOutput) {
                    // contains id, record, rank - strip out entity ID and rank
                    String[] valueItems = value.toString().split(fieldDelim);
                    valueList.add(org.chombo.util.Utility.join(valueItems, 1, valueItems.length - 1));
                } else {
                    // contains id, rank
                    valueList.add(value.toString());
                }
            } else {
                outVal.set(srcEntityId + fieldDelim + value.toString());
                context.write(NullWritable.get(), outVal);
            }
        } else {
            // only source entity if neighborhood condition not met
            if (outputWithNoNeighbor && !compactOutput) {
                outVal.set(srcEntityId);
                context.write(NullWritable.get(), outVal);
            }
        }
    }

    // emit in compact format
    if (compactOutput) {
        boolean doEmit = true;
        String srcRec = recordInOutput ? key.getString(1) : "";
        int numNeighbor = valueList.size();
        if (0 == numNeighbor) {
            // only source entity if neighborhood condition not met
            if (outputWithNoNeighbor) {
                outVal.set(recordInOutput
                        ? srcEntityId + fieldDelim + srcRec + fieldDelim + numNeighbor
                        : srcEntityId);
            } else {
                doEmit = false;
            }
        } else {
            String targetValues = org.chombo.util.Utility.join(valueList, fieldDelim);
            outVal.set(recordInOutput
                    ? srcEntityId + fieldDelim + srcRec + fieldDelim + numNeighbor + fieldDelim + targetValues
                    : srcEntityId + fieldDelim + targetValues);
        }
        if (doEmit) {
            context.write(NullWritable.get(), outVal);
        }
    }
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
 */
protected void reduce(Tuple key, Iterable<Tuple> values, Context context)
        throws IOException, InterruptedException {
    if (stBld.length() > 0) {
        stBld.delete(0, stBld.length());
    }
    testEntityId = key.getString(0);
    stBld.append(testEntityId);

    // collect nearest neighbors
    count = 0;
    neighborhood.initialize();
    for (Tuple value : values) {
        int index = 0;
        trainEntityId = value.getString(index++);
        distance = value.getInt(index++);
        trainClassValue = value.getString(index++);
        if (classCondtionWeighted && neighborhood.IsInClassificationMode()) {
            trainingFeaturePostProb = value.getDouble(index++);
            if (inverseDistanceWeighted) {
                neighborhood.addNeighbor(trainEntityId, distance, trainClassValue,
                        trainingFeaturePostProb, true);
            } else {
                neighborhood.addNeighbor(trainEntityId, distance, trainClassValue,
                        trainingFeaturePostProb);
            }
        } else {
            Neighborhood.Neighbor neighbor =
                    neighborhood.addNeighbor(trainEntityId, distance, trainClassValue);
            if (neighborhood.isInLinearRegressionMode()) {
                neighbor.setRegrInputVar(Double.parseDouble(value.getString(index++)));
            }
        }
        if (++count == topMatchCount) {
            break;
        }
    }
    if (neighborhood.isInLinearRegressionMode()) {
        String testRegrNumFld = isValidationMode ? key.getString(2) : key.getString(1);
        neighborhood.withRegrInputVar(Double.parseDouble(testRegrNumFld));
    }

    // class distribution
    neighborhood.processClassDitribution();
    if (outputClassDistr && neighborhood.IsInClassificationMode()) {
        if (classCondtionWeighted) {
            Map<String, Double> classDistr = neighborhood.getWeightedClassDitribution();
            double thisScore;
            for (String classVal : classDistr.keySet()) {
                thisScore = classDistr.get(classVal);
                // LOG.debug("classVal:" + classVal + " thisScore:" + thisScore);
                stBld.append(fieldDelim).append(classVal).append(fieldDelim).append(thisScore);
            }
        } else {
            Map<String, Integer> classDistr = neighborhood.getClassDitribution();
            int thisScore;
            for (String classVal : classDistr.keySet()) {
                thisScore = classDistr.get(classVal);
                stBld.append(fieldDelim).append(classVal).append(fieldDelim).append(thisScore);
            }
        }
    }

    if (isValidationMode) {
        // actual class attr value
        testClassValActual = key.getString(1);
        stBld.append(fieldDelim).append(testClassValActual);
    }

    // predicted class value
    if (useCostBasedClassifier) {
        // use cost based arbitrator
        if (neighborhood.IsInClassificationMode()) {
            posClassProbab = neighborhood.getClassProb(posClassAttrValue);
            testClassValPredicted = costBasedArbitrator.classify(posClassProbab);
        }
    } else {
        // get directly
        if (neighborhood.IsInClassificationMode()) {
            testClassValPredicted = neighborhood.classify();
        } else {
            testClassValPredicted = "" + neighborhood.getPredictedValue();
        }
    }
    stBld.append(fieldDelim).append(testClassValPredicted);

    if (isValidationMode && neighborhood.IsInClassificationMode()) {
        confMatrix.report(testClassValPredicted, testClassValActual);
    }
    outVal.set(stBld.toString());
    context.write(NullWritable.get(), outVal);
}
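/*
 * The class vote itself happens inside the Neighborhood helper, which is not shown here. The
 * sketch below is an assumption about what inverse distance weighted voting typically looks like,
 * not the actual Neighborhood implementation: each neighbor votes for its class with a weight
 * that decays with distance, and the class with the largest total wins. Assumes java.util.Map
 * and java.util.HashMap are imported.
 */
static String classifyByInverseDistance(String[] classValues, int[] distances) {
    Map<String, Double> votes = new HashMap<>();
    for (int i = 0; i < classValues.length; ++i) {
        // damp each neighbor's vote by its distance; the +1 avoids division by zero
        votes.merge(classValues[i], 1.0 / (1 + distances[i]), Double::sum);
    }
    String best = null;
    double bestVote = -1;
    for (Map.Entry<String, Double> entry : votes.entrySet()) {
        if (entry.getValue() > bestVote) {
            bestVote = entry.getValue();
            best = entry.getKey();
        }
    }
    return best;
}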
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    items = value.toString().split(fieldDelimRegex);
    outKey.initialize();
    outVal.initialize();
    if (classCondtionWeighted) {
        trainEntityId = items[2];
        testEntityId = items[0];
        rank = Integer.parseInt(items[3]);
        trainClassAttr = items[4];
        trainingFeaturePostProb = Double.parseDouble(items[5]);
        if (isValidationMode) {
            // validation mode
            testClassAttr = items[1];
            outKey.add(testEntityId, testClassAttr, rank);
        } else {
            // prediction mode
            outKey.add(testEntityId, rank);
        }
        outVal.add(trainEntityId, rank, trainClassAttr, trainingFeaturePostProb);
    } else {
        int index = 0;
        trainEntityId = items[index++];
        testEntityId = items[index++];
        rank = Integer.parseInt(items[index++]);
        trainClassAttr = items[index++];
        if (isValidationMode) {
            // validation mode
            testClassAttr = items[index++];
        }
        outVal.add(trainEntityId, rank, trainClassAttr);

        // for linear regression add numeric input field
        if (isLinearRegression) {
            trainRegrNumFld = items[index++];
            outVal.add(trainRegrNumFld);
            testRegrNumFld = items[index++];
            if (isValidationMode) {
                outKey.add(testEntityId, testClassAttr, testRegrNumFld, rank);
            } else {
                outKey.add(testEntityId, testRegrNumFld, rank);
            }
        } else {
            if (isValidationMode) {
                outKey.add(testEntityId, testClassAttr, rank);
            } else {
                outKey.add(testEntityId, rank);
            }
        }
    }
    context.write(outKey, outVal);
}