/**
 * Classifies a single labelled document and emits the (correct label, classified label) pair.
 *
 * <p>The value is split into n-grams of size {@code gramSize}, the n-grams are handed to the
 * configured classifier, and one tuple is written to the collector with the shared {@code ONE}
 * value. The tuple holds {@code BayesConstants.CLASSIFIER_TUPLE}, the label taken from the key,
 * and the label returned by the classifier, in that order.
 *
 * @param key the label of the document
 * @param value the features associated with this label
 * @param output the collector that receives the result tuple
 * @param reporter Hadoop status reporter; not used by this method
 * @throws IOException if the collector fails, or wrapping an {@code InvalidDatastoreException}
 *     raised while classifying
 */
@Override
public void map(
    Text key,
    Text value,
    OutputCollector<StringTuple, DoubleWritable> output,
    Reporter reporter) throws IOException {
  NGrams generator = new NGrams(value.toString(), gramSize);
  List<String> ngrams = generator.generateNGramsWithoutLabel();
  String[] document = ngrams.toArray(new String[ngrams.size()]);

  try {
    ClassifierResult classification = classifier.classifyDocument(document, defaultCategory);

    // Tuple layout: marker, expected label, predicted label.
    StringTuple labelPair = new StringTuple(BayesConstants.CLASSIFIER_TUPLE);
    labelPair.add(key.toString());
    labelPair.add(classification.getLabel());

    output.collect(labelPair, ONE);
  } catch (InvalidDatastoreException datastoreFailure) {
    // Surface datastore problems through the exception type this method declares.
    throw new IOException(datastoreFailure);
  }
}
/**
 * Configures this mapper from the job configuration.
 *
 * <p>Reads the Bayes parameters stored under the {@code "bayes.parameters"} key, picks the
 * algorithm named by the {@code classifierType} parameter ({@code "bayes"} or {@code "cbayes"},
 * case-insensitive), backs it with an in-memory datastore, initializes the classifier, and
 * records the default category and n-gram size used later by {@code map}.
 *
 * <p>Failures are not swallowed: continuing with an uninitialized classifier would only make
 * every subsequent {@code map} call fail in a less obvious way, so the task is failed here with
 * the original cause attached.
 *
 * @param job the job configuration holding the {@code "bayes.parameters"} entry
 * @throws IllegalArgumentException if the {@code dataSource} parameter is not {@code "hdfs"} or
 *     the {@code classifierType} parameter is not recognized
 * @throws IllegalStateException if the parameters cannot be read or the classifier cannot be
 *     initialized
 */
@Override
public void configure(JobConf job) {
  try {
    BayesParameters params = new BayesParameters(job.get("bayes.parameters", ""));
    log.info("Bayes Parameter {}", params.print());

    String dataSource = params.get("dataSource");
    if (!"hdfs".equals(dataSource)) {
      throw new IllegalArgumentException("Unrecognized dataSource type: " + dataSource);
    }

    String classifierType = params.get("classifierType");
    Algorithm algorithm;
    if ("bayes".equalsIgnoreCase(classifierType)) {
      log.info("Testing Bayes Classifier");
      algorithm = new BayesAlgorithm();
    } else if ("cbayes".equalsIgnoreCase(classifierType)) {
      log.info("Testing Complementary Bayes Classifier");
      algorithm = new CBayesAlgorithm();
    } else {
      throw new IllegalArgumentException("Unrecognized classifier type: " + classifierType);
    }
    Datastore datastore = new InMemoryBayesDatastore(params);

    // Initialize a local first so the field never refers to a half-initialized classifier.
    ClassifierContext context = new ClassifierContext(algorithm, datastore);
    context.initialize();

    classifier = context;
    defaultCategory = params.get("defaultCat");
    gramSize = params.getGramSize();
  } catch (IOException ex) {
    throw new IllegalStateException("Unable to read the Bayes classifier configuration", ex);
  } catch (InvalidDatastoreException e) {
    throw new IllegalStateException("Unable to initialize the Bayes classifier datastore", e);
  }
}