public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
  String cur_file = ((FileSplit) context.getInputSplit()).getPath().getParent().getParent().getName();
  String train_file = context.getConfiguration().get("train_file");
  if (cur_file.equals(train_file)) {
    StringTokenizer st = new StringTokenizer(value.toString());
    String word = st.nextToken();
    String f_id = st.nextToken();
    myKey.set(word);
    myVal.set(f_id);
    context.write(myKey, myVal);
  } else {
    StringTokenizer st = new StringTokenizer(value.toString());
    String word = st.nextToken();
    String f_id = st.nextToken();
    StringBuilder builder = new StringBuilder(dlt);
    while (st.hasMoreTokens()) {
      String filename = st.nextToken();
      String tf_idf = st.nextToken();
      builder.append(filename);
      builder.append(dlt);
      builder.append(tf_idf);
      builder.append("\t");
    }
    myKey.set(word);
    myVal.set(builder.toString());
    context.write(myKey, myVal);
  }
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
 */
protected void setup(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  if (conf.getBoolean("debug.on", false)) {
    LOG.setLevel(Level.DEBUG);
    System.out.println("in debug mode");
  }
  fieldDelim = conf.get("field.delim", ",");
  subFieldDelim = conf.get("sub.field.delim", ":");
  String ratingFilePrefix = conf.get("utp.rating.file.prefix", "rating");
  isRatingFileSplit = ((FileSplit) context.getInputSplit()).getPath().getName().startsWith(ratingFilePrefix);
  String ratingStatFilePrefix = conf.get("utp.rating.stat.file.prefix", "stat");
  isRatingStatFileSplit = ((FileSplit) context.getInputSplit()).getPath().getName().startsWith(ratingStatFilePrefix);
  linearCorrelation = conf.getBoolean("utp.correlation.linear", true);
  int ratingTimeWindow = conf.getInt("utp.rating.time.window.hour", -1);
  ratingTimeCutoff = ratingTimeWindow > 0
      ? System.currentTimeMillis() / 1000 - ratingTimeWindow * 60L * 60L
      : -1;
  minInputRating = conf.getInt("utp.min.input.rating", -1);
  minCorrelation = conf.getInt("utp.min.correlation", -1);
  userRatingWithContext = conf.getBoolean("utp.user.rating.with.context", false);
  LOG.info("isRatingFileSplit:" + isRatingFileSplit);
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  long start = ((FileSplit) context.getInputSplit()).getStart();
  logger.info("Input Split : {}", context.getInputSplit().toString());
  logger.info("Input Split Start : {}", start);
  counter = context.getCounter(getClass().getName(), String.valueOf(start));
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
  // Get the full path and name of the input file
  String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
  if (pathName.contains("data.txt")) {
    String values[] = value.toString().split("\t");
    if (values.length < 3) {
      // Malformed data record (fewer than 3 fields): discard it
      return;
    } else {
      // Well-formed record: tag it with "1"
      TextPair tp = new TextPair(new Text(values[1]), new Text("1"));
      context.write(tp, new Text(values[0] + "\t" + values[2]));
    }
  }
  if (pathName.contains("info.txt")) {
    String values[] = value.toString().split("\t");
    if (values.length < 2) {
      // Malformed info record (fewer than 2 fields): discard it
      return;
    } else {
      // Well-formed record: tag it with "0"
      TextPair tp = new TextPair(new Text(values[0]), new Text("0"));
      context.write(tp, new Text(values[1]));
    }
  }
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  // Get the source index (employee = 1, salary = 2);
  // added as configuration in the driver
  FileSplit fsFileSplit = (FileSplit) context.getInputSplit();
  intSrcIndex = Integer.parseInt(context.getConfiguration().get(fsFileSplit.getPath().getName()));

  // Initialize the list of fields to emit as output based on
  // intSrcIndex (1=employee, 2=current salary, 3=historical salary)
  if (intSrcIndex == 1) { // employee
    lstRequiredAttribList.add(1); // FName
    lstRequiredAttribList.add(2); // LName
    lstRequiredAttribList.add(3); // Gender
  } else { // salary
    lstRequiredAttribList.add(1); // Salary
    lstRequiredAttribList.add(3); // Effective-to-date (value of 9999-01-01 indicates current salary)
  }
}
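// A minimal driver-side sketch of the "added as configuration in the driver" step that the
// setup() above depends on: each input file name is stored as a configuration key whose value
// is the source index, so the mapper can resolve it via conf.get(fileSplit.getPath().getName()).
// The file names "employee.txt" and "salary.txt", the class name, and the paths below are
// illustrative assumptions, not taken from the original code.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JoinDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Map each input file name to its source index: 1 = employee, 2 = salary
    conf.set("employee.txt", "1");
    conf.set("salary.txt", "2");

    Job job = Job.getInstance(conf, "employee-salary join (sketch)");
    job.setJarByClass(JoinDriverSketch.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}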
@Override
public void setup(Context context) throws IOException, InterruptedException {
  /*
   * FileSplit for the input file provides access to the file's path.
   */
  Path path = ((FileSplit) context.getInputSplit()).getPath();
  fileName = path.getName();
}
protected void setup(Context context) throws IOException, InterruptedException {
  Configuration config = context.getConfiguration();
  fieldDelimRegex = config.get("field.delim.regex", ",");
  quantityAttr = config.getInt("quantity.attr", -1);
  String aggrFilePrefix = context.getConfiguration().get("aggregate.file.prefix", "");
  if (!aggrFilePrefix.isEmpty()) {
    isAggrFileSplit = ((FileSplit) context.getInputSplit()).getPath().getName().startsWith(aggrFilePrefix);
  } else {
    String incrFilePrefix = context.getConfiguration().get("incremental.file.prefix", "");
    if (!incrFilePrefix.isEmpty()) {
      isAggrFileSplit = !((FileSplit) context.getInputSplit()).getPath().getName().startsWith(incrFilePrefix);
    } else {
      throw new IOException("Aggregate or incremental file prefix needs to be specified");
    }
  }
}
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
  StringTokenizer itr = new StringTokenizer(value.toString());
  while (itr.hasMoreTokens()) {
    word.set(itr.nextToken());
    split.set(context.getInputSplit().toString());
    context.write(word, split);
  }
}
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
  String line = value.toString();
  String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
  String parts[] = line.split(":");
  if (parts.length > 1) {
    String op = parts[0].trim() + "#" + parts[1].trim();
    context.write(new Text(fileName), new Text(op));
  }
}
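// A hedged companion sketch (an assumption, not part of the original job): a reducer that
// gathers the "name#value" pairs emitted by the map() above and writes them out as one
// tab-separated line per input file. The class name and output layout are illustrative only;
// it assumes org.apache.hadoop.mapreduce.Reducer and org.apache.hadoop.io.Text are imported.
public static class FileSummaryReducer extends Reducer<Text, Text, Text, Text> {
  @Override
  protected void reduce(Text fileName, Iterable<Text> ops, Context context)
      throws IOException, InterruptedException {
    StringBuilder sb = new StringBuilder();
    for (Text op : ops) {
      if (sb.length() > 0) {
        sb.append('\t');
      }
      sb.append(op.toString());
    }
    context.write(fileName, new Text(sb.toString()));
  }
}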
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  String suffix = "";
  String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
  if (fileName.contains(Constants.SUFFIX_USER_LOST_MONTH_1)) {
    suffix = Constants.SUFFIX_USER_LOST_MONTH_1;
  } else if (fileName.contains(Constants.SUFFIX_USER_BACK_MONTH_1)) {
    suffix = Constants.SUFFIX_USER_BACK_MONTH_1;
  }
  mapKeyObj.setSuffix(suffix);
}
protected void setup(Context context) throws IOException, InterruptedException {
  // Read the length of each feature slice from the configuration
  slicelen = context.getConfiguration().getInt("LLR_SliceLeangth", 10);
  // Get the file name; combined with the line offset it forms a unique primary key
  FileSplit fileSplit = (FileSplit) context.getInputSplit();
  filename = fileSplit.getPath().getName();
}
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
  String valString = value.toString().replaceAll("[^a-zA-Z0-9]+", " ");
  StringTokenizer itr = new StringTokenizer(valString);
  FileSplit fileSplit = (FileSplit) context.getInputSplit();
  String fileName = fileSplit.getPath().getName();
  while (itr.hasMoreTokens()) {
    term.set(itr.nextToken());
    docFrequency.set(fileName, 1);
    context.write(term, docFrequency);
  }
}
@Override
public void run(Context context) throws IOException, InterruptedException {
  String file = ((FileSplit) context.getInputSplit()).getPath().getName();
  LOG.info("Input file: " + file);

  PositionalSequenceFileRecordReader<IntWritable, IntDocVector> reader =
      new PositionalSequenceFileRecordReader<IntWritable, IntDocVector>();
  reader.initialize(context.getInputSplit(), context);

  int fileNo = Integer.parseInt(file.substring(file.lastIndexOf("-") + 1));
  long filePos = reader.getPosition();
  while (reader.nextKeyValue()) {
    IntWritable key = reader.getCurrentKey();
    output.set(fileNo + "\t" + filePos);
    context.write(key, output);
    context.getCounter(Dictionary.Size).increment(1);
    filePos = reader.getPosition();
  }
  reader.close();
}
@Override
public void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  String fieldStrs = context.getConfiguration().get("higo.index.fields");
  split = context.getConfiguration().get("higo.column.split", split);
  String custfields = context.getConfiguration().get("higo.column.custfields", "");
  usedthedate = context.getConfiguration().getBoolean("higo.column.userthedate", usedthedate);
  this.thedate = null;
  if (usedthedate) {
    InputSplit inputSplit = context.getInputSplit();
    Path filepath = ((FileSplit) inputSplit).getPath();
    String inputbase = context.getConfiguration().get("higo.input.base");
    this.thedate = JobIndexPublic.parseThedate(new Path(inputbase), filepath);
    System.out.println("thedatepath: " + thedate + "@" + filepath.toString() + "@" + inputbase);
  }
  if (custfields == null || custfields.isEmpty()) {
    String[] fieldslist = fieldStrs.split(",");
    this.fields = new String[fieldslist.length];
    this.isDate = new Boolean[fieldslist.length];
    this.isString = new Boolean[fieldslist.length];
    this.isStore = new Boolean[fieldslist.length];
    for (int i = 0; i < fieldslist.length; i++) {
      String[] fieldSchema = fieldslist[i].split(":");
      String fieldName = fieldSchema[0].trim().toLowerCase();
      String type = fieldSchema[1];
      this.isStore[i] = Boolean.valueOf(fieldSchema[3]);
      this.fields[i] = fieldName;
      this.isDate[i] = type.equalsIgnoreCase("tdate");
      this.isString[i] = type.equalsIgnoreCase("string");
    }
  } else {
    String[] fieldslist = custfields.split(",");
    this.fields = new String[fieldslist.length];
    this.isDate = new Boolean[fieldslist.length];
    this.isString = new Boolean[fieldslist.length];
    this.isStore = new Boolean[fieldslist.length];
    for (int i = 0; i < fieldslist.length; i++) {
      this.isStore[i] = Boolean.valueOf(false);
      this.fields[i] = fieldslist[i];
      this.isDate[i] = false;
      this.isString[i] = true;
    }
  }
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  type = context.getConfiguration().get("type");
  FileSplit fileSplit = (FileSplit) context.getInputSplit();
  filePath = fileSplit.getPath();
  dmPlatyRuleDAO = new DMPlatyRuleDAOImpl<String, Integer>();
  if (isLocalRunMode(context)) {
    String dmMobilePlayFilePath = context.getConfiguration().get(ConstantEnum.DM_MOBILE_PLATY_FILEPATH.name());
    dmPlatyRuleDAO.parseDMObj(new File(dmMobilePlayFilePath));
  } else {
    File dmMobilePlayFile = new File(ConstantEnum.DM_MOBILE_PLATY.name().toLowerCase());
    dmPlatyRuleDAO.parseDMObj(dmMobilePlayFile);
  }
}
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
  String[] items = value.toString().split(fieldDelimRegex);
  if (null == entity) {
    if (identifyWithFilePrefix) {
      FileSplit fileInpSplit = (FileSplit) context.getInputSplit();
      String filePrefix = fileInpSplit.getPath().getName().substring(0, filePrefixLength);
      entity = schema.getEntityByFilePrefix(filePrefix);
    } else {
      entity = schema.getEntityBySize(items.length);
    }
    idOrdinal = entity.getIdField().getOrdinal();
  }
  if (null != entity) {
    hash = items[idOrdinal].hashCode() % bucketCount;
    hash = hash < 0 ? -hash : hash;
    if (entity.getType() == 0) {
      if (identifyWithFilePrefix) {
        valueHolder.set("0," + value.toString());
      } else {
        valueHolder.set(value);
      }
      for (int i = 0; i < bucketCount; ++i) {
        keyHolder.set((hash * bucketCount + i) * 10);
        context.write(keyHolder, valueHolder);
      }
    } else {
      if (identifyWithFilePrefix) {
        valueHolder.set("1," + value.toString());
      } else {
        valueHolder.set(value);
      }
      for (int i = 0; i < bucketCount; ++i) {
        keyHolder.set(((i * bucketCount + hash) * 10) + 1);
        context.write(keyHolder, valueHolder);
      }
    }
  }
}
@Override
protected void setup(Context ctxt) throws IOException, InterruptedException {
  final Configuration conf = ctxt.getConfiguration();
  final GridmixSplit split = (GridmixSplit) ctxt.getInputSplit();
  final int maps = split.getMapCount();
  final long[] reduceBytes = split.getOutputBytes();
  final long[] reduceRecords = split.getOutputRecords();

  long totalRecords = 0L;
  final int nReduces = ctxt.getNumReduceTasks();
  if (nReduces > 0) {
    int idx = 0;
    int id = split.getId();
    for (int i = 0; i < nReduces; ++i) {
      final GridmixKey.Spec spec = new GridmixKey.Spec();
      if (i == id) {
        spec.bytes_out = split.getReduceBytes(idx);
        spec.rec_out = split.getReduceRecords(idx);
        ++idx;
        id += maps;
      }
      reduces.add(new IntermediateRecordFactory(
          new AvgRecordFactory(reduceBytes[i], reduceRecords[i], conf),
          i, reduceRecords[i], spec, conf));
      totalRecords += reduceRecords[i];
    }
  } else {
    reduces.add(new AvgRecordFactory(reduceBytes[0], reduceRecords[0], conf));
    totalRecords = reduceRecords[0];
  }

  final long splitRecords = split.getInputRecords();
  final long inputRecords = splitRecords <= 0 && split.getLength() >= 0
      ? Math.max(1, split.getLength() / conf.getInt("gridmix.missing.rec.size", 64 * 1024))
      : splitRecords;
  ratio = totalRecords / (1.0 * inputRecords);
  acc = 0.0;
}
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  this.dmIPRuleDAO = new DMIPRuleDAOImpl<Long, Map<ConstantEnum, String>>();
  this.dmOuterURLRuleDAO = new DMOuterURLRuleImpl<String, Map<ConstantEnum, String>>();
  this.dmKeywordRuleDAO = new DMKeywordRuleDAOImpl<String, Map<ConstantEnum, String>>();
  this.dmInterURLRuleDAO = new DMInterURLImpl();
  this.dmIPRuleDAO.parseDMObj(new File(ConstantEnum.IP_TABLE.name().toLowerCase()));
  this.dmOuterURLRuleDAO.parseDMObj(new File(ConstantEnum.DM_OUTER_URL.name().toLowerCase()));
  this.dmInterURLRuleDAO.parseDMObj(new File(ConstantEnum.DM_INTER_URL.name().toLowerCase()));
  this.dmKeywordRuleDAO.parseDMObj(new File(ConstantEnum.DM_URL_KEYWORD_2.name().toLowerCase()));
  multipleOutputs = new MultipleOutputs<Text, Text>(context);
  dateId = context.getConfiguration().get("dateid");
  keyText = new Text();
  valueText = new Text();
  FileSplit fileSplit = (FileSplit) context.getInputSplit();
  filePath = fileSplit.getPath().getParent().toString();
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  InputSplit split = context.getInputSplit();
  Path path = ((FileSplit) split).getPath();
  filenameKey = new Text(path.toString());
}
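// A minimal sketch (an assumption, not taken from the original class) of the map() that
// typically accompanies the setup() above when a whole-file input format is used: the entire
// file arrives as a single BytesWritable value and is written out under the file-path key
// captured in setup(). The NullWritable/BytesWritable types are assumptions tied to that
// whole-file input format.
@Override
protected void map(NullWritable key, BytesWritable value, Context context)
    throws IOException, InterruptedException {
  context.write(filenameKey, value);
}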
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  filterSet = JdbcUtil.getPluginConfig(pluginType);
  fileSuffix = ((FileSplit) context.getInputSplit()).getPath().getName();
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  fileSuffix = ((FileSplit) context.getInputSplit()).getPath().getName();
  statDate = getStatDate(context);
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) context.getInputSplit();
  path = fileSplit.getPath();
}
@Override
public void setup(Context context) {
  FileSplit split = (FileSplit) context.getInputSplit();
  isLeft = split.getPath().toString().contains("pigmix_page_views");
}
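// A hedged sketch (an assumption, not from the original mapper) of how the isLeft flag set
// above can drive a reduce-side join: rows coming from pigmix_page_views are tagged "L" and
// rows from the other input "R", keyed on the first tab-separated field. The key/value types
// and the tagging scheme are illustrative only.
@Override
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  String line = value.toString();
  int tab = line.indexOf('\t');
  String joinKey = tab >= 0 ? line.substring(0, tab) : line;
  context.write(new Text(joinKey), new Text((isLeft ? "L" : "R") + "\t" + line));
}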
@Override
public void setup(Context context) {
  pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
}
@Override
protected void map(String key, String value, final Context context) throws IOException, InterruptedException {
  final InputSplit split = context.getInputSplit();
  if (!(split instanceof DatasourceInputSplit)) {
    throw new IAE(
        "Unexpected split type. Expected [%s] was [%s]",
        DatasourceInputSplit.class.getCanonicalName(),
        split.getClass().getCanonicalName());
  }

  final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
  final File tmpDir = Paths.get(tmpDirLoc).toFile();
  final DataSegment segment =
      Iterables.getOnlyElement(((DatasourceInputSplit) split).getSegments()).getSegment();
  final HadoopDruidConverterConfig config =
      converterConfigFromConfiguration(context.getConfiguration());

  context.setStatus("DOWNLOADING");
  context.progress();
  final Path inPath = new Path(JobHelper.getURIFromSegment(segment));
  final File inDir = new File(tmpDir, "in");
  if (inDir.exists() && !inDir.delete()) {
    log.warn("Could not delete [%s]", inDir);
  }
  if (!inDir.mkdir() && (!inDir.exists() || inDir.isDirectory())) {
    log.warn("Unable to make directory");
  }
  final long inSize = JobHelper.unzipNoGuava(inPath, context.getConfiguration(), inDir, context);
  log.debug("Loaded %d bytes into [%s] for converting", inSize, inDir.getAbsolutePath());
  context.getCounter(COUNTER_GROUP, COUNTER_LOADED).increment(inSize);

  context.setStatus("CONVERTING");
  context.progress();
  final File outDir = new File(tmpDir, "out");
  if (!outDir.mkdir() && (!outDir.exists() || !outDir.isDirectory())) {
    throw new IOException(String.format("Could not create output directory [%s]", outDir));
  }
  HadoopDruidConverterConfig.INDEX_MERGER.convert(
      inDir, outDir, config.getIndexSpec(), JobHelper.progressIndicatorForContext(context));
  if (config.isValidate()) {
    context.setStatus("Validating");
    HadoopDruidConverterConfig.INDEX_IO.validateTwoSegments(inDir, outDir);
  }
  context.progress();

  context.setStatus("Starting PUSH");
  final Path baseOutputPath = new Path(config.getSegmentOutputPath());
  final FileSystem outputFS = baseOutputPath.getFileSystem(context.getConfiguration());
  final DataSegment finalSegmentTemplate = segment.withVersion(segment.getVersion() + "_converted");
  final DataSegment finalSegment = JobHelper.serializeOutIndex(
      finalSegmentTemplate,
      context.getConfiguration(),
      context,
      context.getTaskAttemptID(),
      outDir,
      JobHelper.makeSegmentOutputPath(baseOutputPath, outputFS, finalSegmentTemplate));
  context.progress();
  context.setStatus("Finished PUSH");

  final String finalSegmentString =
      HadoopDruidConverterConfig.jsonMapper.writeValueAsString(finalSegment);
  context.getConfiguration().set(ConvertingOutputFormat.PUBLISHED_SEGMENT_KEY, finalSegmentString);
  context.write(new Text("dataSegment"), new Text(finalSegmentString));
  context.getCounter(COUNTER_GROUP, COUNTER_WRITTEN).increment(finalSegment.getSize());
  context.progress();
  context.setStatus("Ready To Commit");
}