Example #1
0
  /**
   * Runs {@code TrainClassifier.trainNaiveBayes} from {@code /tmp/input} into {@code /tmp/output},
   * then classifies every line of the file named by {@code args[0]} and logs the resulting label
   * and score at debug level.
   *
   * <p>{@link IOException} and {@code InvalidDatastoreException} are caught and their stack traces
   * printed; they are not propagated to the caller.
   *
   * @param args command-line arguments; {@code args[0]} is the path of the text file to classify,
   *     read one document per line
   */
  public static void main(String[] args) {
    final BayesParameters params = new BayesParameters();
    params.setGramSize(1);
    params.set("verbose", "true");
    params.set("classifierType", "bayes");
    params.set("defaultCat", "OTHER");
    params.set("encoding", "UTF-8");
    params.set("alpha_i", "1.0");
    params.set("dataSource", "hdfs");
    params.set("basePath", "/tmp/output");

    try {
      Path input = new Path("/tmp/input");
      Path output = new Path("/tmp/output");
      TrainClassifier.trainNaiveBayes(input, output, params);

      Algorithm algorithm = new BayesAlgorithm();
      Datastore datastore = new InMemoryBayesDatastore(params);
      ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
      classifier.initialize();

      final BufferedReader reader = new BufferedReader(new FileReader(args[0]));
      try {
        String entry = reader.readLine();
        log.debug("First line: " + entry);

        // Both values are constant for the whole run, so look them up once instead of per line.
        final int gramSize = Integer.parseInt(params.get("gramSize"));
        final String defaultCategory = params.get("defaultCat");

        while (entry != null) {
          log.debug("Processing line: " + entry);

          List<String> document = new NGrams(entry, gramSize).generateNGramsWithoutLabel();

          ClassifierResult result =
              classifier.classifyDocument(
                  document.toArray(new String[document.size()]), defaultCategory);

          log.debug(
              "Label: " + result.getLabel() + ", Score: " + result.getScore() + ", " + entry);

          entry = reader.readLine();
        }
      } finally {
        // Release the file handle on every path, including when reading or classifying fails.
        reader.close();
      }
    } catch (final IOException ex) {
      ex.printStackTrace();
    } catch (final InvalidDatastoreException ex) {
      ex.printStackTrace();
    }
  }
  /**
   * Classifies a single labelled document and emits the pairing of its given label with the label
   * chosen by the classifier.
   *
   * <p>The emitted key is a {@code StringTuple} that starts with {@code
   * BayesConstants.CLASSIFIER_TUPLE}, followed by the given label and then the classified label;
   * the emitted value is the constant {@code ONE}.
   *
   * @param key the label the document is known to carry
   * @param value the document text from which n-grams of size {@code gramSize} are generated
   * @param output collector that receives the (tuple, {@code ONE}) record
   * @param reporter Hadoop status reporter; not used by this method
   * @throws IOException if the collector fails, or wrapping an {@code InvalidDatastoreException}
   *     raised during classification
   */
  @Override
  public void map(
      Text key, Text value, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter)
      throws IOException {
    List<String> tokens = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();
    String[] features = tokens.toArray(new String[tokens.size()]);

    try {
      ClassifierResult outcome = classifier.classifyDocument(features, defaultCategory);

      String expected = key.toString();
      String predicted = outcome.getLabel();

      StringTuple labelPair = new StringTuple(BayesConstants.CLASSIFIER_TUPLE);
      labelPair.add(expected);
      labelPair.add(predicted);

      output.collect(labelPair, ONE);
    } catch (InvalidDatastoreException cause) {
      // The mapper contract only permits IOException, so wrap the datastore failure.
      throw new IOException(cause);
    }
  }