public int run(String[] args) throws Exception {
  if (args.length < 3) {
    System.err.println(
        "Usage: SolrIndexer <solr url> <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
    return -1;
  }

  final Path crawlDb = new Path(args[1]);
  Path linkDb = null;
  final List<Path> segments = new ArrayList<Path>();
  String params = null;

  boolean noCommit = false;
  boolean deleteGone = false;
  boolean filter = false;
  boolean normalize = false;

  for (int i = 2; i < args.length; i++) {
    if (args[i].equals("-linkdb")) {
      linkDb = new Path(args[++i]);
    } else if (args[i].equals("-dir")) {
      Path dir = new Path(args[++i]);
      FileSystem fs = dir.getFileSystem(getConf());
      FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
      Path[] files = HadoopFSUtil.getPaths(fstats);
      for (Path p : files) {
        segments.add(p);
      }
    } else if (args[i].equals("-noCommit")) {
      noCommit = true;
    } else if (args[i].equals("-deleteGone")) {
      deleteGone = true;
    } else if (args[i].equals("-filter")) {
      filter = true;
    } else if (args[i].equals("-normalize")) {
      normalize = true;
    } else if (args[i].equals("-params")) {
      params = args[++i];
    } else {
      segments.add(new Path(args[i]));
    }
  }

  try {
    indexSolr(args[0], crawlDb, linkDb, segments, noCommit, deleteGone, params,
        filter, normalize);
    return 0;
  } catch (final Exception e) {
    LOG.error("SolrIndexer: " + StringUtils.stringifyException(e));
    return -1;
  }
}
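/**
 * Command-line entry point for SegmentReader. Parses the operation mode
 * (-dump, -list or -get) and the -no* switches that disable individual
 * segment parts, then dispatches to the corresponding reader method.
 */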
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return;
  }
  int mode = -1;
  if (args[0].equals("-dump")) mode = MODE_DUMP;
  else if (args[0].equals("-list")) mode = MODE_LIST;
  else if (args[0].equals("-get")) mode = MODE_GET;

  boolean co = true;
  boolean fe = true;
  boolean ge = true;
  boolean pa = true;
  boolean pd = true;
  boolean pt = true;

  // collect general options; consumed arguments are nulled out so that the
  // mode-specific parsing below skips them
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-nocontent")) {
      co = false;
      args[i] = null;
    } else if (args[i].equals("-nofetch")) {
      fe = false;
      args[i] = null;
    } else if (args[i].equals("-nogenerate")) {
      ge = false;
      args[i] = null;
    } else if (args[i].equals("-noparse")) {
      pa = false;
      args[i] = null;
    } else if (args[i].equals("-noparsedata")) {
      pd = false;
      args[i] = null;
    } else if (args[i].equals("-noparsetext")) {
      pt = false;
      args[i] = null;
    }
  }

  Configuration conf = NutchConfiguration.create();
  final FileSystem fs = FileSystem.get(conf);
  SegmentReader segmentReader = new SegmentReader(conf, co, fe, ge, pa, pd, pt);

  // collect required args
  switch (mode) {
    case MODE_DUMP:
      String input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String output = args.length > 2 ? args[2] : null;
      if (output == null) {
        System.err.println("Missing required argument: <output>");
        usage();
        return;
      }
      segmentReader.dump(new Path(input), new Path(output));
      return;
    case MODE_LIST:
      ArrayList<Path> dirs = new ArrayList<Path>();
      for (int i = 1; i < args.length; i++) {
        if (args[i] == null) continue;
        if (args[i].equals("-dir")) {
          Path dir = new Path(args[++i]);
          FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
          Path[] files = HadoopFSUtil.getPaths(fstats);
          if (files != null && files.length > 0) {
            dirs.addAll(Arrays.asList(files));
          }
        } else {
          dirs.add(new Path(args[i]));
        }
      }
      segmentReader.list(dirs, new OutputStreamWriter(System.out, "UTF-8"));
      return;
    case MODE_GET:
      input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String key = args.length > 2 ? args[2] : null;
      if (key == null) {
        System.err.println("Missing required argument: <keyValue>");
        usage();
        return;
      }
      segmentReader.get(new Path(input), new Text(key),
          new OutputStreamWriter(System.out, "UTF-8"),
          new HashMap<String, List<Writable>>());
      return;
    default:
      System.err.println("Invalid operation: " + args[0]);
      usage();
      return;
  }
}
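/**
 * Dumps the enabled parts of a single segment to a text file. A map-reduce
 * job reads the selected segment sub-directories into a temporary directory,
 * and the resulting part files are then concatenated into a single "dump"
 * file under the given output path.
 */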
public void dump(Path segment, Path output) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("SegmentReader: dump segment: " + segment);
  }

  JobConf job = createJobConf();
  job.setJobName("read " + segment);

  // add only the segment sub-directories that were enabled in the constructor
  if (ge) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
  if (fe) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
  if (pa) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
  if (co) FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
  if (pd) FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
  if (pt) FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(InputCompatMapper.class);
  job.setReducerClass(SegmentReader.class);

  Path tempDir = new Path(
      job.get("hadoop.tmp.dir", "/tmp") + "/segread-" + new java.util.Random().nextInt());
  fs.delete(tempDir, true);

  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NutchWritable.class);

  JobClient.runJob(job);

  // concatenate the output
  Path dumpFile = new Path(output, job.get("segment.dump.dir", "dump"));

  // remove the old file
  fs.delete(dumpFile, true);
  FileStatus[] fstats = fs.listStatus(tempDir, HadoopFSUtil.getPassAllFilter());
  Path[] files = HadoopFSUtil.getPaths(fstats);

  PrintWriter writer = null;
  int currentRecordNumber = 0;
  if (files.length > 0) {
    writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(fs.create(dumpFile))));
    try {
      for (int i = 0; i < files.length; i++) {
        Path partFile = files[i];
        try {
          currentRecordNumber = append(fs, job, partFile, writer, currentRecordNumber);
        } catch (IOException exception) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't copy the content of " + partFile.toString() + " into "
                + dumpFile.toString());
            LOG.warn(exception.getMessage());
          }
        }
      }
    } finally {
      writer.close();
    }
  }

  // clean up the temporary map-reduce output
  fs.delete(tempDir, true);
  if (LOG.isInfoEnabled()) {
    LOG.info("SegmentReader: done");
  }
}
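// Illustrative invocations of the tools above. The segment paths, output
// directory, and Solr URL are hypothetical; "readseg" and "solrindex" are the
// bin/nutch wrapper commands that typically map to SegmentReader and
// SolrIndexer in Nutch 1.x. Note that the -no* switches must follow the
// positional arguments, since -dump and -get read args[1] and args[2] directly:
//
//   bin/nutch readseg -dump crawl/segments/20240101000000 segdump -nocontent
//   bin/nutch readseg -list -dir crawl/segments
//   bin/nutch readseg -get crawl/segments/20240101000000 http://example.com/
//   bin/nutch solrindex http://localhost:8983/solr crawl/crawldb -linkdb crawl/linkdb -dir crawl/segments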