Java Text 예제들, org.apache.hadoop.util.Text Java 예제들

예제 #1

1

파일 보기

파일: MapTestID.java 프로젝트: cliu0507/scalable-machine-learning

 public void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   String cur_file =
       ((FileSplit) context.getInputSplit()).getPath().getParent().getParent().getName();
   String train_file = context.getConfiguration().get("train_file");
   if (cur_file.equals(train_file)) {
     StringTokenizer st = new StringTokenizer(value.toString());
     String word = st.nextToken();
     String f_id = st.nextToken();
     myKey.set(word);
     myVal.set(f_id);
     context.write(myKey, myVal);
   } else {
     StringTokenizer st = new StringTokenizer(value.toString());
     String word = st.nextToken();
     String f_id = st.nextToken();
     StringBuilder builder = new StringBuilder(dlt);
     while (st.hasMoreTokens()) {
       String filename = st.nextToken();
       String tf_idf = st.nextToken();
       builder.append(filename);
       builder.append(dlt);
       builder.append(tf_idf);
       builder.append("\t");
     }
     myKey.set(word);
     myVal.set(builder.toString());
     context.write(myKey, myVal);
   }
 }

예제 #2

0

파일 보기

파일: XiangLi1_exercise3.java 프로젝트: xxiang13/hadoop

    // specify input and out keys
    public void map(
        LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      String line = value.toString(); // define new variable to be string

      ArrayList<Integer> range = new ArrayList<Integer>();
      for (int i = 2000; i <= 2010; i++) {
        range.add(i);
      }

      // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
      String[] inputs = line.split(",");

      try {

        int year = Integer.parseInt(inputs[165]);

        if (range.contains(year)) {
          String dur = inputs[3];
          String artist_name = inputs[2];
          String song_title = inputs[1];
          String final_input = artist_name + ',' + dur + ',' + song_title;
          Final_Value.set(final_input);
          output.collect(Final_Value, dummy);
        }
      } catch (NumberFormatException e) {
        // do nothing
      }
    }

예제 #3

0

파일 보기

파일: Sort.java 프로젝트: robbyzhang/hadoop

    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      String[] arr = line.split(",");

      word.set(arr[0]);
      context.write(word, one);
    }

예제 #4

0

파일 보기

파일: WordCount.java 프로젝트: y-tag/java-Hadoop-MapReduceSample

 public void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   String line = value.toString();
   StringTokenizer tokenizer = new StringTokenizer(line);
   while (tokenizer.hasMoreTokens()) {
     word.set(tokenizer.nextToken());
     context.write(word, one);
   }
 }

예제 #5

0

파일 보기

파일: WordCount.java 프로젝트: brent8149/datascience-fall14

 public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter)
     throws IOException {
   String line = value.toString();
   StringTokenizer tokenizer = new StringTokenizer(line);
   while (tokenizer.hasMoreTokens()) {
     word.set(tokenizer.nextToken());
     output.collect(word, one);
   }
 }

예제 #6

0

파일 보기

파일: BatchGenerationEngine2.java 프로젝트: cauchyturing/distributed-deep-belief

 public void map(
     IntWritable key, Text value, OutputCollector<IntWritable, Text> output, Reporter reporter)
     throws IOException {
   String dataRow = value.toString();
   StringTokenizer tk = new StringTokenizer(dataRow);
   String label = tk.nextToken();
   String image = tk.nextToken();
   dataString.set(label + "\t" + image);
   output.collect(sameKey, dataString);
 }

예제 #7

0

파일 보기

파일: GraphReducer.java 프로젝트: ChenxiSu/CS5300_Project2_PageRank

  public void reduce(LongWritable key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    LongWritable curNodeId = key;
    double previousPRValue = 1;
    double nextPRValue = 0;
    double localResidual = 0;
    String edgeListOfCurNode = "";
    long localResidualTransformed = 0;

    for (Text value : values) {
      String[] inputInfo = value.toString().split("\\s+");

      // incoming pagerank value
      if (inputInfo.length == 1) {
        nextPRValue += Double.parseDouble(inputInfo[0]);
      }
      // current node info
      else if (inputInfo.length == 3) {
        edgeListOfCurNode = inputInfo[2];
        previousPRValue = Double.parseDouble(inputInfo[1]);
      } else if (inputInfo.length == 2) {
        previousPRValue = Double.parseDouble(inputInfo[1]);
      } else {
        System.out.println("ERROR: received unexpected TEXT in length");
      }
    }
    if (previousPRValue == 1) System.out.println("No node info has been received by a reducer");
    // calculate the pagerank value according to the given formula
    nextPRValue = pagerankFormula(nextPRValue);

    // should also iterate sink nodes list, add the evenly splitted value

    // reducer should store the updated node info(NPR) to output directory
    context.write(null, new Text(curNodeId + " " + nextPRValue + " " + edgeListOfCurNode));

    // then compare PPR with NPR
    try {
      localResidual = Math.abs(previousPRValue - nextPRValue) / nextPRValue;
      localResidualTransformed = (long) (localResidual * 10000);
      // System.out.println("Make sure you got the right transformed residual :
      // "+localResidualTransformed);

    } catch (ArithmeticException e) {
      System.out.println("PPR is zero. Check where you get the value!");
    }

    // assume there is a global counter called residualCounter;

    context.getCounter(myCounter.ResidualCounter.RESIDUAL_SUM).increment(localResidualTransformed);
  }

예제 #8

0

파일 보기

파일: HadoopNBFilter.java 프로젝트: scriptkid85/10605-2

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      boolean filtered = true;
      String keypair = key.toString();
      String[] keys = keypair.split(" ");
      if (keys.length == 1) {
        filtered = false;
      } else {
        if (keys[0].equals("*") || keys[1].equals("*")) {
          filtered = false;
        }
      }

      if (!filtered) {
        for (IntWritable val : values) {
          sum += val.get();
        }
        context.write(key, new IntWritable(sum));
        return;
      }

      for (IntWritable val : values) {
        if (val.get() == -1) {
          filtered = false;
          continue;
        }
        sum += val.get();
      }
      // filter non-needed events
      if (filtered) return;
      context.write(key, new IntWritable(sum));
    }

예제 #9

0

파일 보기

파일: RBMMapper.java 프로젝트: dery-hit/MapReduce-Based-Deep-Learning

  public void map(
      LongWritable key,
      Text value,
      OutputCollector<IntWritable, DoubleWritable> output,
      Reporter reporter)
      throws IOException {
    /*
     * It implements the mapper. It outputs the numbers of weight and updated weights.
     *
     * Note that the format of intermediate output is <IntWritable, DoubleWritable>,
     * because the key is the number of weight (an integer), and the value is the weight's value (double)
     */
    inputData = value.toString();

    // go through the process
    initialize();
    getposphase();
    getnegphase();
    update();

    // output the intermediate data
    // The <key, value> pairs are <weightID, weightUpdate>
    double[][] vishidinc_array = vishidinc.getArray();
    for (int i = 0; i < numdims; i++) {
      for (int j = 0; j < numhid; j++) {
        weightPos.set(i * numhid + j);
        weightValue.set(vishidinc_array[i][j]);
        output.collect(weightPos, weightValue);
      }
    }
  }

예제 #10

0

파일 보기

파일: AuthorCounter.java 프로젝트: akhfa/hadoop-test

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      String file = value.toString();
      String[] lines = file.split("\n");

      for (String line : lines) {
        if (line.contains("<author>") && line.contains("</author>")) {
          String author = line.substring(8, line.indexOf("</a"));
          word.set(author);
          context.write(word, one);
        } else if (line.contains("<author>")) {
          String author = line.substring(8);
          word.set(author);
          context.write(word, one);
        }
      }
    }

예제 #11

0

파일 보기

파일: WordCount.java 프로젝트: raoiitkgp/hadoop-examples

    /** Implements the map-method of the Mapper-interface */
    @Override
    public void map(
        LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {

      String line = value.toString();
      StringTokenizer tokenizer = new StringTokenizer(line);

      while (tokenizer.hasMoreTokens()) {

        String token = tokenizer.nextToken();

        if (token.length() > 3) {
          word.set(token);
          output.collect(word, one);
        }
      }
    }

예제 #12

0

파일 보기

파일: Injector.java 프로젝트: soolr/nutch-1.7

    public void reduce(
        Text key,
        Iterator<CrawlDatum> values,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      boolean oldSet = false;
      boolean injectedSet = false;
      while (values.hasNext()) {
        CrawlDatum val = values.next();
        if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
          injected.set(val);
          injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
          injectedSet = true;
        } else {
          old.set(val);
          oldSet = true;
        }
      }
      CrawlDatum res = null;

      /**
       * Whether to overwrite, ignore or update existing records
       *
       * @see https://issues.apache.org/jira/browse/NUTCH-1405
       */

      // Injected record already exists and overwrite but not update
      if (injectedSet && oldSet && overwrite) {
        res = injected;

        if (update) {
          LOG.info(key.toString() + " overwritten with injected record but update was specified.");
        }
      }

      // Injected record already exists and update but not overwrite
      if (injectedSet && oldSet && update && !overwrite) {
        res = old;
        old.putAllMetaData(injected);
        old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
        old.setFetchInterval(
            injected.getFetchInterval() != interval
                ? injected.getFetchInterval()
                : old.getFetchInterval());
      }

      // Old default behaviour
      if (injectedSet && !oldSet) {
        res = injected;
      } else {
        res = old;
      }

      output.collect(key, res);
    }

예제 #13

0

파일 보기

파일: ValueHistogram.java 프로젝트: feldsherov/TS_hadoop

 public void map(
     LongWritable key, Text value, OutputCollector<IntWritable, Text> output, Reporter reporter)
     throws IOException {
   String arr[] = value.toString().split("\\r?\\n");
   for (String row : arr) {
     if (row.startsWith("\"")) {
       continue;
     }
     String parts[] = row.split(",");
     output.collect(new IntWritable(new Integer(parts[1])), new Text(parts[4]));
   }
 }

예제 #14

0

파일 보기

파일: Sqrt2.java 프로젝트: Totemika/repoHadoop

 public void map(
     LongWritable key,
     Text value,
     OutputCollector<DoubleWritable, DoubleWritable> output,
     Reporter reporter)
     throws IOException {
   String line = value.toString();
   DoubleWritable clave = new DoubleWritable();
   DoubleWritable valor = new DoubleWritable();
   clave.set(Double.parseDouble(line));
   valor.set(Math.sqrt(Double.parseDouble(line)));
   output.collect(clave, valor);
 }

예제 #15

0

파일 보기

파일: MapTestID.java 프로젝트: cliu0507/scalable-machine-learning

    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {

      String[] pair = new String[2];
      int count = 0;
      for (Text txt : values) {
        pair[count] = txt.toString();
        count++;
      }

      // word exists in training
      if (count == 2) {
        StringTokenizer st_one, st_two;
        if (pair[0].contains(dlt)) {
          st_one = new StringTokenizer(pair[1]);
          st_two = new StringTokenizer(pair[0]);
        } else {
          st_one = new StringTokenizer(pair[0]);
          st_two = new StringTokenizer(pair[1]);
        }

        // outputting the data
        String f_id = st_one.nextToken();

        StringBuilder builder = new StringBuilder(dlt);
        builder.append(f_id);
        builder.append(dlt);
        while (st_two.hasMoreTokens()) {
          String filename = st_two.nextToken();
          String tf_idf = st_two.nextToken();
          builder.append(filename);
          builder.append(dlt);
          builder.append(tf_idf);
          builder.append("\t");
        }
        myVal.set(builder.toString());
        context.write(key, myVal);
      }
    }

예제 #16

0

파일 보기

파일: Add1.java 프로젝트: Totemika/repoHadoop

 public void map(
     LongWritable key,
     Text value,
     OutputCollector<IntWritable, IntWritable> output,
     Reporter reporter)
     throws IOException {
   String line = value.toString();
   IntWritable clave = new IntWritable();
   IntWritable valor = new IntWritable();
   clave.set(Integer.parseInt(line));
   valor.set(Integer.parseInt(line) + 1);
   output.collect(clave, valor);
 }

예제 #17

0

파일 보기

파일: PartitionByStationUsingMultipleOutputs.java 프로젝트: tranbahien/hadoop-book

    @SuppressWarnings("unchecked")
    public void reduce(
        Text key,
        Iterator<Text> values,
        OutputCollector<NullWritable, Text> output,
        Reporter reporter)
        throws IOException {

      OutputCollector collector =
          multipleOutputs.getCollector("station", key.toString().replace("-", ""), reporter);
      while (values.hasNext()) {
        collector.collect(NullWritable.get(), values.next());
      }
    }

예제 #18

0

파일 보기

파일: HadoopNBFilter.java 프로젝트: scriptkid85/10605-2

    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();

      splitposition = line.indexOf("\t");
      labels = line.substring(0, splitposition);
      content = line.substring(splitposition + 1, line.length());
      if (content.length() <= 5) {
        word.set(labels);
        int counter = Integer.parseInt(content);
        context.write(word, new IntWritable(counter));
        return;
      }
      labeltokens = labels.split(",");
      contenttokens = tokenizeDoc(content);

      for (String label : labelspace) {
        for (String token : contenttokens) {
          // filter token, denotes the needed events;
          word.set(label + " " + token);
          context.write(word, new IntWritable(-1));
        }
      }
    }

예제 #19

0

파일 보기

파일: Jacobi.java 프로젝트: Quetzaloid/Parallel-programming

    public void map(
        LongWritable key,
        Text value,
        OutputCollector<IntWritable, DoubleWritable> output,
        Reporter reporter)
        throws IOException {
      String line = value.toString();
      StringTokenizer tokenizer = new StringTokenizer(line);

      int rowIdx = 0;
      double xValue = 0;

      if (tokenizer.hasMoreTokens()) {
        rowIdx = Integer.parseInt(tokenizer.nextToken());
        xValue = Double.parseDouble(tokenizer.nextToken());
      }

      double xResult = (resVec[rowIdx] - (sumVec[rowIdx] * xValue)) / diaVec[rowIdx];
      output.collect(new IntWritable(rowIdx), new DoubleWritable(xResult));
    }

예제 #20

0

파일 보기

파일: WikiProjectMapper.java 프로젝트: anantunc/HadoopWikiProject

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    String line = value.toString();
    String[] tokens = line.split("\\s");
    String hour = new String();
    long numRequests = 0;

    if (tokens.length > 4) {
      // get the project name field
      hour = tokens[4];

      // get the number of requests field
      numRequests = Long.parseLong(tokens[2]);

      // output the key, value pairs where the key is a combination of
      // project name and datetime and the value is number of requests
      context.write(new Text(hour), new LongWritable(numRequests));
    }
  }

예제 #21

0

파일 보기

파일: BatchGenerationEngine2.java 프로젝트: cauchyturing/distributed-deep-belief

    public void reduce(
        IntWritable sameNum,
        Iterator<Text> data,
        OutputCollector<Text, jBLASArrayWritable> output,
        Reporter reporter)
        throws IOException {
      int totalBatchCount = exampleCount / batchSize;

      DoubleMatrix weights = DoubleMatrix.randn(hiddenNodes, visibleNodes);
      DoubleMatrix hbias = DoubleMatrix.zeros(hiddenNodes);
      DoubleMatrix vbias = DoubleMatrix.zeros(visibleNodes);
      DoubleMatrix label = DoubleMatrix.zeros(1);
      DoubleMatrix hidden_chain = null;
      DoubleMatrix vdata = DoubleMatrix.zeros(batchSize, visibleNodes);

      ArrayList<DoubleMatrix> outputmatricies = new ArrayList<DoubleMatrix>();
      outputmatricies.add(weights);
      outputmatricies.add(hbias);
      outputmatricies.add(vbias);
      outputmatricies.add(label);
      outputmatricies.add(vdata);
      outputmatricies.add(hidden_chain);

      int j;
      for (int i = 0; i < totalBatchCount; i++) {
        j = 0;
        while (data.hasNext() && j < batchSize) {
          j++;
          StringTokenizer tk = new StringTokenizer(data.next().toString());
          label.put(0, Double.parseDouble(tk.nextToken()));
          String image = tk.nextToken();
          for (int k = 0; k < image.length(); k++) {
            Integer val = new Integer(image.charAt(k));
            vdata.put(j, k, val.doubleValue());
          }
          dataArray = new jBLASArrayWritable(outputmatricies);
          batchID.set("1\t" + i);
          output.collect(batchID, dataArray);
        }
      }
    }

예제 #22

0

파일 보기

파일: Injector.java 프로젝트: soolr/nutch-1.7

    public void map(
        WritableComparable<?> key,
        Text value,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      String url = value.toString(); // value is line of text

      if (url != null && url.trim().startsWith("#")) {
        /* Ignore line that start with # */
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      int fixedInterval = -1;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find separation between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
            try {
              fixedInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else metadata.put(metaname, metavalue);
        }
      }
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url); // filter the url
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Skipping " + url + ":" + e);
        }
        url = null;
      }
      if (url == null) {
        reporter.getCounter("injector", "urls_filtered").increment(1);
      } else { // if it passes
        value.set(url); // collect it
        CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_INJECTED);

        // Is interval custom? Then set as meta data
        if (fixedInterval > -1) {
          // Set writable using float. Flaot is used by AdaptiveFetchSchedule
          datum
              .getMetaData()
              .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
          datum.setFetchInterval(fixedInterval);
        } else {
          datum.setFetchInterval(customInterval);
        }

        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(
                "Cannot filter injected score for url "
                    + url
                    + ", using default ("
                    + e.getMessage()
                    + ")");
          }
        }
        reporter.getCounter("injector", "urls_injected").increment(1);
        output.collect(value, datum);
      }
    }