/**
 * Trains a naive Bayes model and then classifies every line of a text file.
 *
 * <p>The model is trained from {@code /tmp/input} into {@code /tmp/output} via
 * {@code TrainClassifier.trainNaiveBayes}, loaded through an
 * {@code InMemoryBayesDatastore}, and used to classify each line of the file
 * named by {@code args[0]}. The label and score of every line are logged at
 * debug level.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file
 *             whose lines are classified, one document per line
 */
public static void main(String[] args) {
  // Fail fast: without an input file there is nothing to classify, so do not
  // start the training job only to crash afterwards on args[0].
  if (args == null || args.length < 1) {
    log.error("Missing argument: path of the file whose lines should be classified");
    return;
  }

  final BayesParameters params = new BayesParameters();
  params.setGramSize(1);
  params.set("verbose", "true");
  params.set("classifierType", "bayes");
  params.set("defaultCat", "OTHER");
  params.set("encoding", "UTF-8");
  params.set("alpha_i", "1.0");
  params.set("dataSource", "hdfs");
  params.set("basePath", "/tmp/output");

  try {
    Path input = new Path("/tmp/input");
    Path output = new Path("/tmp/output");
    TrainClassifier.trainNaiveBayes(input, output, params);

    Algorithm algorithm = new BayesAlgorithm();
    Datastore datastore = new InMemoryBayesDatastore(params);
    ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
    classifier.initialize();

    // Loop-invariant settings: read them once instead of once per line.
    final int gramSize = Integer.parseInt(params.get("gramSize"));
    final String defaultCategory = params.get("defaultCat");

    final BufferedReader reader = new BufferedReader(new FileReader(args[0]));
    try {
      String entry = reader.readLine();
      log.debug("First line: " + entry);
      while (entry != null) {
        log.debug("Processing line: " + entry);
        List<String> document = new NGrams(entry, gramSize).generateNGramsWithoutLabel();
        ClassifierResult result = classifier.classifyDocument(
            document.toArray(new String[document.size()]), defaultCategory);
        log.debug("Label: " + result.getLabel() + ", Score: " + result.getScore() + ", " + entry);
        entry = reader.readLine();
      }
    } finally {
      // Always release the file handle, including when classification fails.
      reader.close();
    }
  } catch (final IOException ex) {
    log.error("I/O failure while training the model or reading the input file", ex);
  } catch (final InvalidDatastoreException ex) {
    log.error("Bayes datastore could not be used for classification", ex);
  }
}
/**
 * Classifies a single document and emits a (correct label, classified label) pair.
 *
 * <p>The text of {@code value} is turned into n-grams of size {@code gramSize}
 * and handed to the classifier; the resulting label is paired with the label
 * carried in {@code key} and collected with the value {@code ONE}.
 *
 * @param key      the expected (correct) label of the document
 * @param value    the document text from which the n-grams are generated
 * @param output   collector receiving the label-pair tuple
 * @param reporter Hadoop status reporter; not used by this method
 * @throws IOException if collecting the output fails, or wrapping an
 *                     {@code InvalidDatastoreException} raised during classification
 */
@Override
public void map(
    Text key,
    Text value,
    OutputCollector<StringTuple, DoubleWritable> output,
    Reporter reporter) throws IOException {
  List<String> tokens = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();
  try {
    String[] document = tokens.toArray(new String[tokens.size()]);
    ClassifierResult classification = classifier.classifyDocument(document, defaultCategory);

    String expected = key.toString();
    String predicted = classification.getLabel();

    // Tuple layout: marker constant, then expected label, then predicted label.
    StringTuple labelPair = new StringTuple(BayesConstants.CLASSIFIER_TUPLE);
    labelPair.add(expected);
    labelPair.add(predicted);
    output.collect(labelPair, ONE);
  } catch (InvalidDatastoreException datastoreFailure) {
    // Surface datastore problems through the only checked type map() may throw.
    throw new IOException(datastoreFailure);
  }
}