Esempio n. 1
0
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {

      String strId = "";
      String strBody = "";

      // Parse the xml and read data (page id and article body)
      // Using XOM library
      Builder builder = new Builder();

      try {
        Document doc = builder.build(value.toString(), null);

        Nodes nodeId = doc.query("//eecs485_article_id");
        strId = nodeId.get(0).getChild(0).getValue();

        Nodes nodeBody = doc.query("//eecs485_article_body");
        strBody = nodeBody.get(0).getChild(0).getValue();
      } catch (ParsingException ex) {
        System.out.println("Not well-formed.");
        System.out.println(ex.getMessage());
      } catch (IOException ex) {
        System.out.println("io exception");
      }

      // Tokenize document body
      Pattern pattern = Pattern.compile("\\w+");
      Matcher matcher = pattern.matcher(strBody);

      while (matcher.find()) {
        // Write the parsed token
        // key = term, docid   value = 1
        context.write(new Text(matcher.group() + "," + strId), one);
      }
    }
Esempio n. 2
0
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {

      // function starts here

      int sum = 0;

      for (LongWritable value : values) {
        sum += value.get();
      }

      context.write(key, new LongWritable(sum));
    }