Beispiel #1
0
 /**
  * Builds an {@link Analyzer} instance of the requested class.
  *
  * <p>The first attempt passes {@code version} to a constructor whose single parameter is a
  * {@link Version}. If that attempt ends in an {@link IllegalStateException}, a second attempt is
  * made without any constructor arguments.
  *
  * @param analyzerClass the concrete analyzer type to instantiate
  * @param version the value handed to the {@code Version}-taking constructor on the first attempt
  * @return the analyzer produced by whichever attempt succeeded
  */
 public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass, Version version) {
   Analyzer analyzer;
   try {
     Class<?>[] ctorParamTypes = {Version.class};
     Object[] ctorArgs = {version};
     analyzer = ClassUtils.instantiateAs(analyzerClass, Analyzer.class, ctorParamTypes, ctorArgs);
   } catch (IllegalStateException versionCtorFailed) {
     // NOTE(review): presumably ClassUtils reports a missing/unusable Version constructor this
     // way - confirm against ClassUtils. The failure is deliberately not propagated.
     analyzer = ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
   }
   return analyzer;
 }
Beispiel #2
0
  /**
   * Parses the command line and launches the job with the resulting settings.
   *
   * <p>Registers the input, output, distance-measure, T1, T2 and overwrite options, parses {@code
   * args}, optionally deletes the output path, and then delegates to the five-argument {@code
   * run} overload.
   *
   * @param args the raw command-line arguments
   * @return {@code -1} when argument parsing yields {@code null}, otherwise {@code 0}
   * @throws Exception if any of the invoked steps fails
   */
  @Override
  public int run(String[] args) throws Exception {

    // Declare every option this job accepts before parsing.
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    // A null parse result means there is nothing to run.
    if (parseArguments(args) == null) {
      return -1;
    }

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    boolean overwriteRequested = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
    if (overwriteRequested) {
      HadoopUtil.delete(new Configuration(), outputPath);
    }

    String measureClassName = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    DistanceMeasure distanceMeasure =
        ClassUtils.instantiateAs(measureClassName, DistanceMeasure.class);

    run(inputPath, outputPath, distanceMeasure, t1, t2);
    return 0;
  }
 /**
  * Initializes this task's state from the job configuration.
  *
  * <p>After delegating to the superclass, instantiates the distance measure whose class name is
  * stored under {@code RepresentativePointsDriver.DISTANCE_MEASURE_KEY} and loads the
  * representative points via {@code getRepresentativePoints}.
  *
  * @param context the task context supplying the configuration
  */
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   super.setup(context);
   Configuration jobConf = context.getConfiguration();
   String measureClassName = jobConf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY);
   measure = ClassUtils.instantiateAs(measureClassName, DistanceMeasure.class);
   representativePoints = getRepresentativePoints(jobConf);
 }
 /**
  * Initializes this task's state from the job configuration.
  *
  * <p>Instantiates the similarity measure named under {@code SIMILARITY_CLASSNAME}, creates the
  * {@code norms}, {@code nonZeroEntries} and {@code maxValues} vectors (each constructed with
  * {@link Integer#MAX_VALUE}), and parses the value stored under {@code THRESHOLD} as a double.
  *
  * @param ctx the task context supplying the configuration
  */
 @Override
 protected void setup(Context ctx) throws IOException, InterruptedException {
   String similarityClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
   similarity = ClassUtils.instantiateAs(similarityClassName, VectorSimilarityMeasure.class);

   int vectorSize = Integer.MAX_VALUE;
   norms = new RandomAccessSparseVector(vectorSize);
   nonZeroEntries = new RandomAccessSparseVector(vectorSize);
   maxValues = new RandomAccessSparseVector(vectorSize);

   String thresholdText = ctx.getConfiguration().get(THRESHOLD);
   threshold = Double.parseDouble(thresholdText);
 }
 /**
  * Initializes this task's state from the job configuration.
  *
  * <p>Instantiates the similarity measure named under {@code SIMILARITY_CLASSNAME}, reads the
  * column count (rejecting any value that is not strictly positive, including the {@code -1}
  * used when the key is absent), reads the self-similarity exclusion flag (default {@code
  * false}), loads the norms vector from the path stored under {@code NORMS_PATH}, and parses the
  * value stored under {@code THRESHOLD} as a double.
  *
  * @param ctx the task context supplying the configuration
  */
 @Override
 protected void setup(Context ctx) throws IOException, InterruptedException {
   String similarityClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
   similarity = ClassUtils.instantiateAs(similarityClassName, VectorSimilarityMeasure.class);

   numberOfColumns = ctx.getConfiguration().getInt(NUMBER_OF_COLUMNS, -1);
   boolean columnCountIsValid = numberOfColumns > 0;
   Preconditions.checkArgument(columnCountIsValid, "Incorrect number of columns!");

   excludeSelfSimilarity = ctx.getConfiguration().getBoolean(EXCLUDE_SELF_SIMILARITY, false);

   String normsLocation = ctx.getConfiguration().get(NORMS_PATH);
   norms = Vectors.read(new Path(normsLocation), ctx.getConfiguration());

   String thresholdText = ctx.getConfiguration().get(THRESHOLD);
   treshold = Double.parseDouble(thresholdText);
 }
 /**
  * Initializes this task's state from the job configuration.
  *
  * <p>Instantiates the similarity measure named under {@code SIMILARITY_CLASSNAME}, loads {@code
  * numNonZeroEntries} from the path stored under {@code NUM_NON_ZERO_ENTRIES_PATH}, loads {@code
  * maxValues} from the path stored under {@code MAXVALUES_PATH}, and parses the value stored
  * under {@code THRESHOLD} as a double.
  *
  * @param ctx the task context supplying the configuration
  */
 @Override
 protected void setup(Context ctx) throws IOException, InterruptedException {
   String similarityClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
   similarity = ClassUtils.instantiateAs(similarityClassName, VectorSimilarityMeasure.class);

   String nonZeroEntriesLocation = ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH);
   numNonZeroEntries =
       Vectors.readAsIntMap(new Path(nonZeroEntriesLocation), ctx.getConfiguration());

   String maxValuesLocation = ctx.getConfiguration().get(MAXVALUES_PATH);
   maxValues = Vectors.read(new Path(maxValuesLocation), ctx.getConfiguration());

   String thresholdText = ctx.getConfiguration().get(THRESHOLD);
   threshold = Double.parseDouble(thresholdText);
 }
  /**
   * Command-line entry point: parses the options below and hands them to {@code runJob}.
   *
   * <ul>
   *   <li>the input option supplied by {@code DefaultOptionCreator.inputOption()}
   *   <li>the output option supplied by {@code DefaultOptionCreator.outputOption()}
   *   <li>{@code --categories} / {@code -c} (required): location of the categories file
   *   <li>{@code --exactMatch} / {@code -e}: flag requesting exact category-name matching
   *   <li>{@code --analyzer} / {@code -a} (optional): analyzer class name; when absent, {@code
   *       WikipediaAnalyzer} is used
   *   <li>the help option supplied by {@code DefaultOptionCreator.helpOption()}
   * </ul>
   *
   * <p>If the help option is present, usage is printed and nothing else happens. An {@code
   * OptionException} or {@code ClassNotFoundException} is logged and followed by the usage text.
   *
   * @param args the raw command-line arguments
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    GroupBuilder groupBuilder = new GroupBuilder();

    Option inputOpt = DefaultOptionCreator.inputOption().create();
    Option outputOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt =
        optionBuilder
            .withLongName("categories")
            .withRequired(true)
            .withArgument(
                argumentBuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription(
                "Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c")
            .create();

    Option exactMatchOpt =
        optionBuilder
            .withLongName("exactMatch")
            .withDescription(
                "If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e")
            .create();

    Option analyzerOpt =
        optionBuilder
            .withLongName("analyzer")
            .withRequired(false)
            .withArgument(
                argumentBuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
            .withDescription("The analyzer to use, must have a no argument constructor")
            .withShortName("a")
            .create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group optionGroup =
        groupBuilder
            .withName("Options")
            .withOption(categoriesOpt)
            .withOption(inputOpt)
            .withOption(outputOpt)
            .withOption(exactMatchOpt)
            .withOption(analyzerOpt)
            .withOption(helpOpt)
            .create();

    Parser cliParser = new Parser();
    cliParser.setGroup(optionGroup);
    try {
      CommandLine parsed = cliParser.parse(args);
      if (parsed.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(optionGroup);
        return;
      }

      String inputDir = (String) parsed.getValue(inputOpt);
      String outputDir = (String) parsed.getValue(outputOpt);
      String categoriesFile = (String) parsed.getValue(categoriesOpt);

      Class<? extends Analyzer> analyzerType;
      if (parsed.hasOption(analyzerOpt)) {
        String analyzerClassName = parsed.getValue(analyzerOpt).toString();
        analyzerType = Class.forName(analyzerClassName).asSubclass(Analyzer.class);
        // Instantiate once up front and discard the result: a class that cannot be
        // constructed should fail here rather than after the job has been submitted.
        ClassUtils.instantiateAs(analyzerType, Analyzer.class);
      } else {
        analyzerType = WikipediaAnalyzer.class;
      }

      boolean exactMatchOnly = parsed.hasOption(exactMatchOpt);
      runJob(inputDir, outputDir, categoriesFile, exactMatchOnly, analyzerType);
    } catch (OptionException badOptions) {
      log.error("Exception", badOptions);
      CommandLineUtil.printHelp(optionGroup);
    } catch (ClassNotFoundException unknownAnalyzer) {
      log.error("Exception", unknownAnalyzer);
      CommandLineUtil.printHelp(optionGroup);
    }
  }