/**
 * Emits one co-occurrence "stripe" per distinct word found on the input line.
 *
 * <p>The line is split on whitespace, each token is lower-cased and has its
 * leading and trailing non-{@code a-z} characters stripped. Tokens that become
 * empty are ignored. Scanning stops once 100 non-empty tokens have been seen
 * (duplicates count toward that limit). For each distinct word, the shared
 * {@code MAP} is cleared, filled with every other distinct word on the line,
 * and written out under that word via the shared {@code KEY}.
 *
 * @param key     input key supplied by the framework; not used here
 * @param value   one line of input text
 * @param context sink for the (word, stripe) pairs
 * @throws IOException          if writing to {@code context} fails
 * @throws InterruptedException if writing to {@code context} is interrupted
 */
@Override
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  String line = value.toString();
  List<String> tokens = new ArrayList<String>();
  StringTokenizer itr = new StringTokenizer(line);

  int numWords = 0;
  while (itr.hasMoreTokens() && numWords < 100) {
    // Lower-case with a fixed locale: the default-locale overload maps 'I' to a
    // dotless 'ı' under e.g. Turkish settings, which the ASCII-only pattern
    // below would then treat as a non-letter.
    String w = itr.nextToken()
        .toLowerCase(java.util.Locale.ROOT)
        .replaceAll("(^[^a-z]+|[^a-z]+$)", "");
    if (w.length() == 0) {
      continue; // nothing left after stripping; does not count toward the limit
    }
    // Keep first-seen order but only one copy of each word.
    if (!tokens.contains(w)) {
      tokens.add(w);
    }
    numWords++;
  }

  // One stripe per distinct word, holding every other distinct word on the line.
  for (int i = 0; i < tokens.size(); i++) {
    MAP.clear();
    for (int j = 0; j < tokens.size(); j++) {
      if (i == j) {
        continue; // a word is not paired with itself
      }
      MAP.increment(tokens.get(j));
    }
    KEY.set(tokens.get(i));
    context.write(KEY, MAP);
  }
}
/**
 * Sums all stripes received for {@code key} and emits a stripe of PMI scores.
 *
 * <p>For every co-occurring word whose summed count is at least 10, and for
 * which both {@code key} and that word are present in {@code wordCounts}, the
 * score written is
 * {@code log10(pairCount / (count(key) * count(word)) * wordCounts.get("numLines*"))}.
 * The {@code "numLines*"} entry is presumably the total number of input lines;
 * confirm against the code that populates {@code wordCounts}. Nothing is
 * written when no co-occurring word qualifies.
 *
 * @param key     the word whose stripes are being combined
 * @param values  partial co-occurrence stripes for {@code key}
 * @param context sink for the (word, PMI stripe) pair
 * @throws IOException          if writing to {@code context} fails
 * @throws InterruptedException if writing to {@code context} is interrupted
 */
@Override
public void reduce(Text key, Iterable<HMapStIW> values, Context context)
    throws IOException, InterruptedException {
  // Element-wise sum of all partial stripes.
  HMapStIW map = new HMapStIW();
  for (HMapStIW stripe : values) {
    map.plus(stripe);
  }

  String x = key.toString();
  HMapStFW writeMap = new HMapStFW();
  for (MapKI.Entry<String> entry : map.entrySet()) {
    String y = entry.getKey();
    int pairCount = map.get(y);
    if (pairCount < 10) {
      continue; // too rare to score
    }
    if (!wordCounts.containsKey(x) || !wordCounts.containsKey(y)) {
      continue; // cannot score without both marginal counts
    }

    int px = wordCounts.get(x);
    int py = wordCounts.get(y);
    // Multiply in double: px * py as int arithmetic silently wraps once the
    // product exceeds Integer.MAX_VALUE, corrupting the score.
    double denominator = (double) px * py;
    double pmi = Math.log10(((double) pairCount / denominator) * wordCounts.get("numLines*"));
    writeMap.put(y, (float) pmi);
  }

  if (writeMap.size() > 0) {
    context.write(key, writeMap);
  }
}
/**
 * Merges every partial stripe received for {@code key} into a single stripe
 * and writes the merged result back out under the same key.
 *
 * @param key     the word the stripes belong to
 * @param values  partial co-occurrence stripes for {@code key}
 * @param context sink for the merged (word, stripe) pair
 * @throws IOException          if writing to {@code context} fails
 * @throws InterruptedException if writing to {@code context} is interrupted
 */
@Override
public void reduce(Text key, Iterable<HMapStIW> values, Context context)
    throws IOException, InterruptedException {
  HMapStIW merged = new HMapStIW();
  for (HMapStIW partial : values) {
    // Accumulate this partial stripe into the running total.
    merged.plus(partial);
  }
  context.write(key, merged);
}