Esempio n. 1
0
 public static Map<String, String> parseParams(String serializedString) throws IOException {
   Configuration conf = new Configuration();
   conf.set(
       "io.serializations",
       "org.apache.hadoop.io.serializer.JavaSerialization,"
           + "org.apache.hadoop.io.serializer.WritableSerialization");
   Map<String, String> params = Maps.newHashMap();
   DefaultStringifier<Map<String, String>> mapStringifier =
       new DefaultStringifier<Map<String, String>>(conf, GenericsUtil.getClass(params));
   return mapStringifier.fromString(serializedString);
 }
  /**
   * Run the job
   *
   * @param input the input pathname String
   * @param output the output pathname String
   * @param catFile the file containing the Wikipedia categories
   * @param exactMatchOnly if true, then the Wikipedia category must match exactly instead of simply
   *     containing the category string
   */
  public static void runJob(
      String input,
      String output,
      String catFile,
      boolean exactMatchOnly,
      Class<? extends Analyzer> analyzerClass)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("key.value.separator.in.input.line", " ");
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.set("analyzer.class", analyzerClass.getName());
    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Dont ever forget this. People should keep track of how hadoop conf
    // parameters can make or break a piece of code

    Set<String> categories = Sets.newHashSet();
    for (String line : new FileLineIterable(new File(catFile))) {
      categories.add(line.trim().toLowerCase(Locale.ENGLISH));
    }

    Stringifier<Set<String>> setStringifier =
        new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);

    conf.set("wikipedia.categories", categoriesStr);

    Job job = new Job(conf);
    log.info("Input: {} Out: {} Categories: {}", input, output, catFile);
    job.setJarByClass(WikipediaDatasetCreatorDriver.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WikipediaDatasetCreatorMapper.class);
    // TODO: job.setNumMapTasks(100);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(WikipediaDatasetCreatorReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    HadoopUtil.delete(conf, outPath);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }
  /**
   * Run the job
   *
   * @param input the input pathname String
   * @param output the output pathname String
   * @param catFile the file containing the Wikipedia categories
   * @param exactMatchOnly if true, then the Wikipedia category must match exactly instead of simply
   *     containing the category string
   * @param all if true select all categories
   * @throws ClassNotFoundException
   * @throws InterruptedException
   */
  public static void runJob(
      String input, String output, String catFile, boolean exactMatchOnly, boolean all)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    if (WikipediaToSequenceFile.log.isInfoEnabled()) {
      log.info(
          "Input: " + input + " Out: " + output + " Categories: " + catFile + " All Files: " + all);
    }
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(WikipediaMapper.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(WikipediaToSequenceFile.class);

    /*
     * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
     */
    HadoopUtil.overwriteOutput(outPath);

    Set<String> categories = new HashSet<String>();
    if (catFile.length() > 0) {
      for (String line : new FileLineIterable(new File(catFile))) {
        categories.add(line.trim().toLowerCase(Locale.ENGLISH));
      }
    }

    DefaultStringifier<Set<String>> setStringifier =
        new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);

    conf.set("wikipedia.categories", categoriesStr);

    job.waitForCompletion(true);
  }
Esempio n. 4
0
 @Override
 public String toString() {
   Configuration conf = new Configuration();
   conf.set(
       "io.serializations",
       "org.apache.hadoop.io.serializer.JavaSerialization,"
           + "org.apache.hadoop.io.serializer.WritableSerialization");
   DefaultStringifier<Map<String, String>> mapStringifier =
       new DefaultStringifier<Map<String, String>>(conf, GenericsUtil.getClass(params));
   try {
     return mapStringifier.toString(params);
   } catch (IOException e) {
     log.info("Encountered IOException while deserializing returning empty string", e);
     return "";
   }
 }