public void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   String cur_file =
       ((FileSplit) context.getInputSplit()).getPath().getParent().getParent().getName();
   String train_file = context.getConfiguration().get("train_file");
   if (cur_file.equals(train_file)) {
     StringTokenizer st = new StringTokenizer(value.toString());
     String word = st.nextToken();
     String f_id = st.nextToken();
     myKey.set(word);
     myVal.set(f_id);
     context.write(myKey, myVal);
   } else {
     StringTokenizer st = new StringTokenizer(value.toString());
     String word = st.nextToken();
     String f_id = st.nextToken();
     StringBuilder builder = new StringBuilder(dlt);
     while (st.hasMoreTokens()) {
       String filename = st.nextToken();
       String tf_idf = st.nextToken();
       builder.append(filename);
       builder.append(dlt);
       builder.append(tf_idf);
       builder.append("\t");
     }
     myKey.set(word);
     myVal.set(builder.toString());
     context.write(myKey, myVal);
   }
 }
Example #2
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
     */
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      if (conf.getBoolean("debug.on", false)) {
        LOG.setLevel(Level.DEBUG);
        System.out.println("in debug mode");
      }

      fieldDelim = conf.get("field.delim", ",");
      subFieldDelim = conf.get("sub.field.delim", ":");
      String ratingFilePrefix = conf.get("utp.rating.file.prefix", "rating");
      isRatingFileSplit =
          ((FileSplit) context.getInputSplit()).getPath().getName().startsWith(ratingFilePrefix);
      String ratingStatFilePrefix = conf.get("utp.rating.stat.file.prefix", "stat");
      isRatingStatFileSplit =
          ((FileSplit) context.getInputSplit())
              .getPath()
              .getName()
              .startsWith(ratingStatFilePrefix);

      linearCorrelation = conf.getBoolean("utp.correlation.linear", true);
      int ratingTimeWindow = conf.getInt("utp.rating.time.window.hour", -1);
      ratingTimeCutoff =
          ratingTimeWindow > 0
              ? System.currentTimeMillis() / 1000 - ratingTimeWindow * 60L * 60L
              : -1;

      minInputRating = conf.getInt("utp.min.input.rating", -1);
      minCorrelation = conf.getInt("utp.min.correlation", -1);

      userRatingWithContext = conf.getBoolean("utp.user.rating.with.context", false);
      LOG.info("isRatingFileSplit:" + isRatingFileSplit);
    }
Example #3
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   long start = ((FileSplit) context.getInputSplit()).getStart();
   logger.info("Input Split : {}", context.getInputSplit().toString());
   logger.info("Input Split Start : {}", start);
   counter = context.getCounter(getClass().getName(), String.valueOf(start));
 }
 @Override
 protected void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   // Get the full path and name of the input file
   String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
   if (pathName.contains("data.txt")) {
     String[] values = value.toString().split("\t");
     if (values.length < 3) {
       // Malformed data.txt record: fewer than 3 fields, discard it
       return;
     } else {
       // Well-formed record: tag it with join flag "1"
       TextPair tp = new TextPair(new Text(values[1]), new Text("1"));
       context.write(tp, new Text(values[0] + "\t" + values[2]));
     }
   }
   if (pathName.contains("info.txt")) {
     String[] values = value.toString().split("\t");
     if (values.length < 2) {
       // Malformed info.txt record: fewer than 2 fields, discard it
       return;
     } else {
       // Well-formed record: tag it with join flag "0"
       TextPair tp = new TextPair(new Text(values[0]), new Text("0"));
       context.write(tp, new Text(values[1]));
     }
   }
 }
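The map method above tags each record with a TextPair composite key ("0" for info.txt rows, "1" for data.txt rows) so that a reduce-side join can group both inputs on the shared key. TextPair is not a Hadoop built-in; the following is only a minimal sketch of what such a key class might look like (field names and comparison order are assumptions, not the original author's class):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

// Hypothetical composite key: the natural join key plus a source tag ("0" or "1").
public class TextPair implements WritableComparable<TextPair> {
  private Text first = new Text();
  private Text second = new Text();

  public TextPair() {}

  public TextPair(Text first, Text second) {
    this.first = first;
    this.second = second;
  }

  public Text getFirst() { return first; }
  public Text getSecond() { return second; }

  @Override
  public void write(DataOutput out) throws IOException {
    first.write(out);
    second.write(out);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    first.readFields(in);
    second.readFields(in);
  }

  // Sort by the join key first, then by the source tag, so "0" (info) records
  // arrive at the reducer before "1" (data) records for the same key.
  @Override
  public int compareTo(TextPair other) {
    int cmp = first.compareTo(other.first);
    return cmp != 0 ? cmp : second.compareTo(other.second);
  }

  @Override
  public int hashCode() {
    return first.hashCode() * 163 + second.hashCode();
  }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof TextPair)) {
      return false;
    }
    TextPair tp = (TextPair) o;
    return first.equals(tp.first) && second.equals(tp.second);
  }
}

For the join to line up, the job would also typically register a partitioner and grouping comparator that look only at the first field, so both tagged streams for a key reach the same reduce() call.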
Example #5
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {

    // Get the source index; (employee = 1, salary = 2)
    // Added as configuration in driver
    FileSplit fsFileSplit = (FileSplit) context.getInputSplit();
    intSrcIndex = Integer.parseInt(context.getConfiguration().get(fsFileSplit.getPath().getName()));

    // Initialize the list of fields to emit as output based on
    // intSrcIndex (1=employee, 2=current salary, 3=historical salary)
    if (intSrcIndex == 1) // employee
    {
      lstRequiredAttribList.add(1); // FName
      lstRequiredAttribList.add(2); // LName
      lstRequiredAttribList.add(3); // Gender

    } else // salary
    {
      lstRequiredAttribList.add(1); // Salary
      lstRequiredAttribList.add(3); // Effective-to-date (Value of
      // 9999-01-01 indicates current
      // salary)

    }
  }
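This setup resolves the source index by looking up the split's file name as a configuration key, so the driver has to register one entry per input file. A driver-side fragment of that wiring might look like the following (the file names and paths are assumptions, not taken from the original job):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Hypothetical driver fragment: map each input file name to its source index,
// matching the conf.get(fileName) lookup performed in setup() above.
Configuration conf = new Configuration();
conf.set("employee.txt", "1"); // employee file -> source index 1
conf.set("salary.txt", "2");   // salary file   -> source index 2

Job job = Job.getInstance(conf, "employee-salary-join");
FileInputFormat.addInputPath(job, new Path("/input/employee.txt"));
FileInputFormat.addInputPath(job, new Path("/input/salary.txt"));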
 @Override
 public void setup(Context context) throws IOException, InterruptedException {
   /*
    * FileSplit for the input file provides access to the file's path.
    */
   Path path = ((FileSplit) context.getInputSplit()).getPath();
   fileName = path.getName();
 }
Example #7
 protected void setup(Context context) throws IOException, InterruptedException {
   Configuration config = context.getConfiguration();
   fieldDelimRegex = config.get("field.delim.regex", ",");
   quantityAttr = config.getInt("quantity.attr", -1);
   String aggrFilePrefix = context.getConfiguration().get("aggregate.file.prefix", "");
   if (!aggrFilePrefix.isEmpty()) {
     isAggrFileSplit =
         ((FileSplit) context.getInputSplit()).getPath().getName().startsWith(aggrFilePrefix);
   } else {
     String incrFilePrefix = context.getConfiguration().get("incremental.file.prefix", "");
     if (!incrFilePrefix.isEmpty()) {
       isAggrFileSplit =
           !((FileSplit) context.getInputSplit()).getPath().getName().startsWith(incrFilePrefix);
     } else {
       throw new IOException("Aggregate or incremental file prefix needs to be specified");
     }
   }
 }
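The setup above distinguishes aggregate splits from incremental ones purely by file-name prefix and fails fast if neither prefix is configured. A driver-side fragment supplying one of the two prefixes might look like this (the prefix values and file layout are assumptions):

import org.apache.hadoop.conf.Configuration;

// Hypothetical driver fragment: identify aggregate files by name prefix.
// The input directory is assumed to contain files such as aggr-00000 and incr-00000.
Configuration conf = new Configuration();
conf.set("aggregate.file.prefix", "aggr");
// Alternatively, only the incremental prefix could be supplied:
// conf.set("incremental.file.prefix", "incr");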
 public void map(Object key, Text value, Context context)
     throws IOException, InterruptedException {
   StringTokenizer itr = new StringTokenizer(value.toString());
   while (itr.hasMoreTokens()) {
     word.set(itr.nextToken());
     split.set(context.getInputSplit().toString());
     context.write(word, split);
   }
 }
 protected void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   String line = value.toString();
   String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
   String[] parts = line.split(":");
   if (parts.length > 1) {
     String op = parts[0].trim() + "#" + parts[1].trim();
     context.write(new Text(fileName), new Text(op));
   }
 }
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   String suffix = "";
   String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
   if (fileName.contains(Constants.SUFFIX_USER_LOST_MONTH_1)) {
     suffix = Constants.SUFFIX_USER_LOST_MONTH_1;
   } else if (fileName.contains(Constants.SUFFIX_USER_BACK_MONTH_1)) {
     suffix = Constants.SUFFIX_USER_BACK_MONTH_1;
   }
   mapKeyObj.setSuffix(suffix);
 }
    protected void setup(Context context) throws IOException, InterruptedException {

      // Get the length of each feature slice from the configuration

      slicelen = context.getConfiguration().getInt("LLR_SliceLeangth", 10);

      // The file name combined with the line offset forms a unique primary key

      FileSplit fileSplit = (FileSplit) context.getInputSplit();
      filename = fileSplit.getPath().getName();
    }
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      String valString = value.toString().replaceAll("[^a-zA-Z0-9]+", " ");
      StringTokenizer itr = new StringTokenizer(valString);

      FileSplit fileSplit = (FileSplit) context.getInputSplit();
      String fileName = fileSplit.getPath().getName();
      while (itr.hasMoreTokens()) {
        term.set(itr.nextToken());
        docFrequency.set(fileName, 1);
        context.write(term, docFrequency);
      }
    }
    @Override
    public void run(Context context) throws IOException, InterruptedException {
      String file = ((FileSplit) context.getInputSplit()).getPath().getName();
      LOG.info("Input file: " + file);

      PositionalSequenceFileRecordReader<IntWritable, IntDocVector> reader =
          new PositionalSequenceFileRecordReader<IntWritable, IntDocVector>();
      reader.initialize(context.getInputSplit(), context);

      int fileNo = Integer.parseInt(file.substring(file.lastIndexOf("-") + 1));
      long filePos = reader.getPosition();
      while (reader.nextKeyValue()) {
        IntWritable key = reader.getCurrentKey();
        output.set(fileNo + "\t" + filePos);

        context.write(key, output);
        context.getCounter(Dictionary.Size).increment(1);

        filePos = reader.getPosition();
      }
      reader.close();
    }
Example #14
  @Override
  public void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    String fieldStrs = context.getConfiguration().get("higo.index.fields");
    split = context.getConfiguration().get("higo.column.split", split);
    String custfields = context.getConfiguration().get("higo.column.custfields", "");
    usedthedate = context.getConfiguration().getBoolean("higo.column.userthedate", usedthedate);
    this.thedate = null;
    if (usedthedate) {
      InputSplit inputSplit = context.getInputSplit();
      Path filepath = ((FileSplit) inputSplit).getPath();
      String inputbase = context.getConfiguration().get("higo.input.base");
      this.thedate = JobIndexPublic.parseThedate(new Path(inputbase), filepath);
      System.out.println(
          "thedatepath: " + thedate + "@" + filepath.toString() + "@" + inputbase + "");
    }

    if (custfields == null || custfields.isEmpty()) {

      String[] fieldslist = fieldStrs.split(",");
      this.fields = new String[fieldslist.length];
      this.isDate = new Boolean[fieldslist.length];
      this.isString = new Boolean[fieldslist.length];
      this.isStore = new Boolean[fieldslist.length];

      for (int i = 0; i < fieldslist.length; i++) {
        String[] fieldSchema = fieldslist[i].split(":");
        String fieldName = fieldSchema[0].trim().toLowerCase();
        String type = fieldSchema[1];
        this.isStore[i] = Boolean.valueOf(fieldSchema[3]);
        this.fields[i] = fieldName;
        this.isDate[i] = type.equalsIgnoreCase("tdate");
        this.isString[i] = type.equalsIgnoreCase("string");
      }
    } else {
      String[] fieldslist = custfields.split(",");
      this.fields = new String[fieldslist.length];
      this.isDate = new Boolean[fieldslist.length];
      this.isString = new Boolean[fieldslist.length];
      this.isStore = new Boolean[fieldslist.length];

      for (int i = 0; i < fieldslist.length; i++) {
        this.isStore[i] = Boolean.valueOf(false);
        this.fields[i] = fieldslist[i];
        this.isDate[i] = false;
        this.isString[i] = true;
      }
    }
  }
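The branch above that parses higo.index.fields expects a comma-separated list of colon-delimited field schemas, using positions 0 (name), 1 (type) and 3 (stored flag). An illustrative configuration value consistent with that parsing, with made-up field names, might be:

import org.apache.hadoop.conf.Configuration;

// Illustrative only: an assumed value for "higo.index.fields" matching the
// fieldName:type:<unused>:isStore layout parsed above ("tdate" marks date
// fields, "string" marks string fields).
Configuration conf = new Configuration();
conf.set("higo.index.fields", "thedate:tdate:0:true,username:string:0:false,amount:double:0:false");
conf.set("higo.column.split", "\t");
conf.set("higo.column.custfields", ""); // empty -> fall back to higo.index.fields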
Example #15
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   type = context.getConfiguration().get("type");
   FileSplit fileSplit = (FileSplit) context.getInputSplit();
   filePath = fileSplit.getPath();
   dmPlatyRuleDAO = new DMPlatyRuleDAOImpl<String, Integer>();
   if (isLocalRunMode(context)) {
     String dmMobilePlayFilePath =
         context.getConfiguration().get(ConstantEnum.DM_MOBILE_PLATY_FILEPATH.name());
     dmPlatyRuleDAO.parseDMObj(new File(dmMobilePlayFilePath));
   } else {
     File dmMobilePlayFile = new File(ConstantEnum.DM_MOBILE_PLATY.name().toLowerCase());
     dmPlatyRuleDAO.parseDMObj(dmMobilePlayFile);
   }
 }
Example #16
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] items = value.toString().split(fieldDelimRegex);

      if (null == entity) {
        if (identifyWithFilePrefix) {
          FileSplit fileInpSplit = (FileSplit) context.getInputSplit();
          String filePrefix = fileInpSplit.getPath().getName().substring(0, filePrefixLength);
          entity = schema.getEntityByFilePrefix(filePrefix);
        } else {
          entity = schema.getEntityBySize(items.length);
        }
        idOrdinal = entity.getIdField().getOrdinal();
      }

      if (null != entity) {
        hash = items[idOrdinal].hashCode() % bucketCount;
        hash = hash < 0 ? -hash : hash;
        if (entity.getType() == 0) {
          if (identifyWithFilePrefix) {
            valueHolder.set("0," + value.toString());
          } else {
            valueHolder.set(value);
          }
          for (int i = 0; i < bucketCount; ++i) {
            keyHolder.set((hash * bucketCount + i) * 10);
            context.write(keyHolder, valueHolder);
          }
        } else {
          if (identifyWithFilePrefix) {
            valueHolder.set("1," + value.toString());
          } else {
            valueHolder.set(value);
          }
          for (int i = 0; i < bucketCount; ++i) {
            keyHolder.set(((i * bucketCount + hash) * 10) + 1);
            context.write(keyHolder, valueHolder);
          }
        }
      }
    }
Example #17
    @Override
    protected void setup(Context ctxt) throws IOException, InterruptedException {
      final Configuration conf = ctxt.getConfiguration();
      final GridmixSplit split = (GridmixSplit) ctxt.getInputSplit();
      final int maps = split.getMapCount();
      final long[] reduceBytes = split.getOutputBytes();
      final long[] reduceRecords = split.getOutputRecords();

      long totalRecords = 0L;
      final int nReduces = ctxt.getNumReduceTasks();
      if (nReduces > 0) {
        int idx = 0;
        int id = split.getId();
        for (int i = 0; i < nReduces; ++i) {
          final GridmixKey.Spec spec = new GridmixKey.Spec();
          if (i == id) {
            spec.bytes_out = split.getReduceBytes(idx);
            spec.rec_out = split.getReduceRecords(idx);
            ++idx;
            id += maps;
          }
          reduces.add(
              new IntermediateRecordFactory(
                  new AvgRecordFactory(reduceBytes[i], reduceRecords[i], conf),
                  i,
                  reduceRecords[i],
                  spec,
                  conf));
          totalRecords += reduceRecords[i];
        }
      } else {
        reduces.add(new AvgRecordFactory(reduceBytes[0], reduceRecords[0], conf));
        totalRecords = reduceRecords[0];
      }
      final long splitRecords = split.getInputRecords();
      final long inputRecords =
          splitRecords <= 0 && split.getLength() >= 0
              ? Math.max(1, split.getLength() / conf.getInt("gridmix.missing.rec.size", 64 * 1024))
              : splitRecords;
      ratio = totalRecords / (1.0 * inputRecords);
      acc = 0.0;
    }
Example #18
    protected void setup(Context context) throws IOException, InterruptedException {
      super.setup(context);
      this.dmIPRuleDAO = new DMIPRuleDAOImpl<Long, Map<ConstantEnum, String>>();
      this.dmOuterURLRuleDAO = new DMOuterURLRuleImpl<String, Map<ConstantEnum, String>>();
      this.dmKeywordRuleDAO = new DMKeywordRuleDAOImpl<String, Map<ConstantEnum, String>>();
      this.dmInterURLRuleDAO = new DMInterURLImpl();

      this.dmIPRuleDAO.parseDMObj(new File(ConstantEnum.IP_TABLE.name().toLowerCase()));
      this.dmOuterURLRuleDAO.parseDMObj(new File(ConstantEnum.DM_OUTER_URL.name().toLowerCase()));
      this.dmInterURLRuleDAO.parseDMObj(new File(ConstantEnum.DM_INTER_URL.name().toLowerCase()));
      this.dmKeywordRuleDAO.parseDMObj(
          new File(ConstantEnum.DM_URL_KEYWORD_2.name().toLowerCase()));

      multipleOutputs = new MultipleOutputs<Text, Text>(context);

      dateId = context.getConfiguration().get("dateid");

      keyText = new Text();
      valueText = new Text();

      FileSplit fileSplit = (FileSplit) context.getInputSplit();
      filePath = fileSplit.getPath().getParent().toString();
    }
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   InputSplit split = context.getInputSplit();
   Path path = ((FileSplit) split).getPath();
   filenameKey = new Text(path.toString());
 }
Example #20
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   filterSet = JdbcUtil.getPluginConfig(pluginType);
   fileSuffix = ((FileSplit) context.getInputSplit()).getPath().getName();
 }
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   fileSuffix = ((FileSplit) context.getInputSplit()).getPath().getName();
   statDate = getStatDate(context);
 }
Example #22
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   FileSplit fileSplit = (FileSplit) context.getInputSplit();
   path = fileSplit.getPath();
 }
Example #23
 @Override
 public void setup(Context context) {
   FileSplit split = (FileSplit) context.getInputSplit();
   isLeft = split.getPath().toString().contains("pigmix_page_views");
 }
 @Override
 public void setup(Context context) {
   pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
 }
    @Override
    protected void map(String key, String value, final Context context)
        throws IOException, InterruptedException {
      final InputSplit split = context.getInputSplit();
      if (!(split instanceof DatasourceInputSplit)) {
        throw new IAE(
            "Unexpected split type. Expected [%s] was [%s]",
            DatasourceInputSplit.class.getCanonicalName(), split.getClass().getCanonicalName());
      }

      final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
      final File tmpDir = Paths.get(tmpDirLoc).toFile();

      final DataSegment segment =
          Iterables.getOnlyElement(((DatasourceInputSplit) split).getSegments()).getSegment();

      final HadoopDruidConverterConfig config =
          converterConfigFromConfiguration(context.getConfiguration());

      context.setStatus("DOWNLOADING");
      context.progress();
      final Path inPath = new Path(JobHelper.getURIFromSegment(segment));
      final File inDir = new File(tmpDir, "in");

      if (inDir.exists() && !inDir.delete()) {
        log.warn("Could not delete [%s]", inDir);
      }

      if (!inDir.mkdir() && (!inDir.exists() || !inDir.isDirectory())) {
        log.warn("Unable to make directory");
      }

      final long inSize =
          JobHelper.unzipNoGuava(inPath, context.getConfiguration(), inDir, context);
      log.debug("Loaded %d bytes into [%s] for converting", inSize, inDir.getAbsolutePath());
      context.getCounter(COUNTER_GROUP, COUNTER_LOADED).increment(inSize);

      context.setStatus("CONVERTING");
      context.progress();
      final File outDir = new File(tmpDir, "out");
      if (!outDir.mkdir() && (!outDir.exists() || !outDir.isDirectory())) {
        throw new IOException(String.format("Could not create output directory [%s]", outDir));
      }
      HadoopDruidConverterConfig.INDEX_MERGER.convert(
          inDir, outDir, config.getIndexSpec(), JobHelper.progressIndicatorForContext(context));
      if (config.isValidate()) {
        context.setStatus("Validating");
        HadoopDruidConverterConfig.INDEX_IO.validateTwoSegments(inDir, outDir);
      }
      context.progress();
      context.setStatus("Starting PUSH");
      final Path baseOutputPath = new Path(config.getSegmentOutputPath());
      final FileSystem outputFS = baseOutputPath.getFileSystem(context.getConfiguration());
      final DataSegment finalSegmentTemplate =
          segment.withVersion(segment.getVersion() + "_converted");
      final DataSegment finalSegment =
          JobHelper.serializeOutIndex(
              finalSegmentTemplate,
              context.getConfiguration(),
              context,
              context.getTaskAttemptID(),
              outDir,
              JobHelper.makeSegmentOutputPath(baseOutputPath, outputFS, finalSegmentTemplate));
      context.progress();
      context.setStatus("Finished PUSH");
      final String finalSegmentString =
          HadoopDruidConverterConfig.jsonMapper.writeValueAsString(finalSegment);
      context
          .getConfiguration()
          .set(ConvertingOutputFormat.PUBLISHED_SEGMENT_KEY, finalSegmentString);
      context.write(new Text("dataSegment"), new Text(finalSegmentString));

      context.getCounter(COUNTER_GROUP, COUNTER_WRITTEN).increment(finalSegment.getSize());
      context.progress();
      context.setStatus("Ready To Commit");
    }