Example no. 1
  public int run(String[] args) throws Exception {
    if (args.length < 3) {
      System.err.println(
          "Usage: SolrIndexer <solr url> <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
      return -1;
    }

    final Path crawlDb = new Path(args[1]);
    Path linkDb = null;

    final List<Path> segments = new ArrayList<Path>();
    String params = null;

    boolean noCommit = false;
    boolean deleteGone = false;
    boolean filter = false;
    boolean normalize = false;

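    // parse the remaining options; any argument not recognized as a flag is treated as a segment path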
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-linkdb")) {
        linkDb = new Path(args[++i]);
      } else if (args[i].equals("-dir")) {
        Path dir = new Path(args[++i]);
        FileSystem fs = dir.getFileSystem(getConf());
        FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
        Path[] files = HadoopFSUtil.getPaths(fstats);
        for (Path p : files) {
          segments.add(p);
        }
      } else if (args[i].equals("-noCommit")) {
        noCommit = true;
      } else if (args[i].equals("-deleteGone")) {
        deleteGone = true;
      } else if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else if (args[i].equals("-params")) {
        params = args[++i];
      } else {
        segments.add(new Path(args[i]));
      }
    }

    try {
      indexSolr(
          args[0], crawlDb, linkDb, segments, noCommit, deleteGone, params, filter, normalize);
      return 0;
    } catch (final Exception e) {
      LOG.error("SolrIndexer: " + StringUtils.stringifyException(e));
      return -1;
    }
  }
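
A short sketch of how this run method might be driven, assuming the enclosing class is Nutch's SolrIndexer (org.apache.nutch.indexer.solr.SolrIndexer in Nutch 1.x), which implements Hadoop's Tool interface so that ToolRunner can supply the configuration. The method name, Solr URL, and paths below are illustrative placeholders; imports of org.apache.hadoop.util.ToolRunner and org.apache.nutch.util.NutchConfiguration are assumed.

  // Hypothetical driver method; class/package names, URL, and paths are illustrative assumptions.
  public static void exampleSolrIndexerRun() throws Exception {
    int exitCode =
        ToolRunner.run(
            NutchConfiguration.create(),
            new SolrIndexer(),
            new String[] {
              "http://localhost:8983/solr", // <solr url>
              "crawl/crawldb", // <crawldb>
              "-linkdb", "crawl/linkdb",
              "crawl/segments/20240101000000", // one segment
              "-noCommit"
            });
    System.exit(exitCode);
  }

The next snippet is the command-line entry point of SegmentReader, which dispatches between the -dump, -list, and -get modes:
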
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      usage();
      return;
    }
    int mode = -1;
    if (args[0].equals("-dump")) mode = MODE_DUMP;
    else if (args[0].equals("-list")) mode = MODE_LIST;
    else if (args[0].equals("-get")) mode = MODE_GET;

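    // flags selecting which segment parts to read: content, fetch, generate, parse, parse-data, parse-text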
    boolean co = true;
    boolean fe = true;
    boolean ge = true;
    boolean pa = true;
    boolean pd = true;
    boolean pt = true;
    // collect general options
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-nocontent")) {
        co = false;
        args[i] = null;
      } else if (args[i].equals("-nofetch")) {
        fe = false;
        args[i] = null;
      } else if (args[i].equals("-nogenerate")) {
        ge = false;
        args[i] = null;
      } else if (args[i].equals("-noparse")) {
        pa = false;
        args[i] = null;
      } else if (args[i].equals("-noparsedata")) {
        pd = false;
        args[i] = null;
      } else if (args[i].equals("-noparsetext")) {
        pt = false;
        args[i] = null;
      }
    }
    Configuration conf = NutchConfiguration.create();
    final FileSystem fs = FileSystem.get(conf);
    SegmentReader segmentReader = new SegmentReader(conf, co, fe, ge, pa, pd, pt);
    // collect required args
    switch (mode) {
      case MODE_DUMP:
        String input = args[1];
        if (input == null) {
          System.err.println("Missing required argument: <segment_dir>");
          usage();
          return;
        }
        String output = args.length > 2 ? args[2] : null;
        if (output == null) {
          System.err.println("Missing required argument: <output>");
          usage();
          return;
        }
        segmentReader.dump(new Path(input), new Path(output));
        return;
      case MODE_LIST:
        ArrayList<Path> dirs = new ArrayList<Path>();
        for (int i = 1; i < args.length; i++) {
          if (args[i] == null) continue;
          if (args[i].equals("-dir")) {
            Path dir = new Path(args[++i]);
            FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
            Path[] files = HadoopFSUtil.getPaths(fstats);
            if (files != null && files.length > 0) {
              dirs.addAll(Arrays.asList(files));
            }
          } else dirs.add(new Path(args[i]));
        }
        segmentReader.list(dirs, new OutputStreamWriter(System.out, "UTF-8"));
        return;
      case MODE_GET:
        input = args[1];
        if (input == null) {
          System.err.println("Missing required argument: <segment_dir>");
          usage();
          return;
        }
        String key = args.length > 2 ? args[2] : null;
        if (key == null) {
          System.err.println("Missing required argument: <keyValue>");
          usage();
          return;
        }
        segmentReader.get(
            new Path(input),
            new Text(key),
            new OutputStreamWriter(System.out, "UTF-8"),
            new HashMap<String, List<Writable>>());
        return;
      default:
        System.err.println("Invalid operation: " + args[0]);
        usage();
        return;
    }
  }
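
Based only on the argument parsing above, the three modes expect roughly the invocation shapes below; the wrapper method name, segment paths, and key are placeholders, and the calls simply reuse the static main shown above.

  // Hypothetical calls into the main method above; all paths and the key are placeholders.
  public static void exampleInvocations() throws Exception {
    SegmentReader.main(new String[] {"-dump", "crawl/segments/20240101000000", "dump-out", "-nocontent"});
    SegmentReader.main(new String[] {"-list", "-dir", "crawl/segments"});
    SegmentReader.main(new String[] {"-get", "crawl/segments/20240101000000", "http://www.example.com/"});
  }

The dump method invoked in MODE_DUMP reads the selected segment subdirectories with a map-reduce job and concatenates the result into a single text file:
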
  public void dump(Path segment, Path output) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentReader: dump segment: " + segment);
    }

    JobConf job = createJobConf();
    job.setJobName("read " + segment);

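    // add an input path for each segment subdirectory selected by the co/fe/ge/pa/pd/pt flags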
    if (ge) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    if (fe) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    if (pa) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    if (co) FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    if (pd) FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    if (pt) FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
    job.setReducerClass(SegmentReader.class);

    Path tempDir =
        new Path(
            job.get("hadoop.tmp.dir", "/tmp") + "/segread-" + new java.util.Random().nextInt());
    fs.delete(tempDir, true);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    // concatenate the output
    Path dumpFile = new Path(output, job.get("segment.dump.dir", "dump"));

    // remove the old file
    fs.delete(dumpFile, true);
    FileStatus[] fstats = fs.listStatus(tempDir, HadoopFSUtil.getPassAllFilter());
    Path[] files = HadoopFSUtil.getPaths(fstats);

    PrintWriter writer = null;
    int currentRecordNumber = 0;
    if (files.length > 0) {
      writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(fs.create(dumpFile))));
      try {
        for (int i = 0; i < files.length; i++) {
          Path partFile = files[i];
          try {
            currentRecordNumber = append(fs, job, partFile, writer, currentRecordNumber);
          } catch (IOException exception) {
            if (LOG.isWarnEnabled()) {
              LOG.warn(
                  "Couldn't copy the content of "
                      + partFile.toString()
                      + " into "
                      + dumpFile.toString());
              LOG.warn(exception.getMessage());
            }
          }
        }
      } finally {
        writer.close();
      }
    }
    // remove the temporary job output directory (recursive delete, matching the calls above)
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentReader: done");
    }
  }
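
A minimal sketch of calling dump directly, assuming the SegmentReader constructor used in main above (the six booleans enable content, fetch, generate, parse, parse-data, and parse-text); the wrapper method name and paths are placeholders.

  // Hypothetical direct use of dump(); constructor signature taken from main above, paths are placeholders.
  public static void exampleDump() throws IOException {
    Configuration conf = NutchConfiguration.create();
    SegmentReader reader = new SegmentReader(conf, true, true, true, true, true, true);
    // writes a concatenated text file named "dump" (property segment.dump.dir) under dump-out
    reader.dump(new Path("crawl/segments/20240101000000"), new Path("dump-out"));
  }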