public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String strId = ""; String strBody = ""; // Parse the xml and read data (page id and article body) // Using XOM library Builder builder = new Builder(); try { Document doc = builder.build(value.toString(), null); Nodes nodeId = doc.query("//eecs485_article_id"); strId = nodeId.get(0).getChild(0).getValue(); Nodes nodeBody = doc.query("//eecs485_article_body"); strBody = nodeBody.get(0).getChild(0).getValue(); } catch (ParsingException ex) { System.out.println("Not well-formed."); System.out.println(ex.getMessage()); } catch (IOException ex) { System.out.println("io exception"); } // Tokenize document body Pattern pattern = Pattern.compile("\\w+"); Matcher matcher = pattern.matcher(strBody); while (matcher.find()) { // Write the parsed token // key = term, docid value = 1 context.write(new Text(matcher.group() + "," + strId), one); } }
public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { // function starts here int sum = 0; for (LongWritable value : values) { sum += value.get(); } context.write(key, new LongWritable(sum)); }