/**
 * For debugging: parses a single query from the command line, searches the
 * index, and prints the top hits with their summaries to stdout.
 *
 * @param args args[0] is the query string
 */
public static void main(String[] args) throws Exception {
  final String usage = "NutchBean query";
  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
  final Configuration conf = NutchConfiguration.create();
  final NutchBean bean = new NutchBean(conf);
  try {
    final Query query = Query.parse(args[0], conf);
    final Hits hits = bean.search(query, 10);
    System.out.println("Total hits: " + hits.getTotal());
    // Only the first `length` hits are fetched, so details/summaries have
    // exactly `length` entries.
    final int length = (int) Math.min(hits.getTotal(), 10);
    final Hit[] show = hits.getHits(0, length);
    final HitDetails[] details = bean.getDetails(show);
    final Summary[] summaries = bean.getSummary(details, query);
    // Iterate over the fetched subset, not hits.getLength(): when more hits
    // exist than were fetched, indexing details/summaries by hits.getLength()
    // would throw ArrayIndexOutOfBoundsException.
    for (int i = 0; i < length; i++) {
      System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
    }
  } catch (Throwable t) {
    LOG.error("Exception occured while executing search: " + t, t);
    System.exit(1);
  }
  System.exit(0);
}
public static void main(String[] args) throws IOException { if (args.length != 1) { System.err.println("Usage: EncodingDetector <file>"); System.exit(1); } Configuration conf = NutchConfiguration.create(); EncodingDetector detector = new EncodingDetector(NutchConfiguration.create()); // do everything as bytes; don't want any conversion BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0])); ByteArrayOutputStream ostr = new ByteArrayOutputStream(); byte[] bytes = new byte[1000]; boolean more = true; while (more) { int len = istr.read(bytes); if (len < bytes.length) { more = false; if (len > 0) { ostr.write(bytes, 0, len); } } else { ostr.write(bytes); } } byte[] data = ostr.toByteArray(); // make a fake Content Content content = new Content("", "", data, "text/html", new Metadata(), conf); detector.autoDetectClues(content, true); String encoding = detector.guessEncoding(content, conf.get("parser.character.encoding.default")); System.out.println("Guessed encoding: " + encoding); }
/**
 * Command-line driver for SegmentMerger: collects segment directories from the
 * arguments (either listed individually or discovered under a parent dir),
 * then merges them into slices under the given output directory.
 *
 * @param args output_dir (-dir segments | seg1 seg2 ...) [-filter] [-slice NNNN]
 */
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println(
        "SegmentMerger output_dir (-dir segments | seg1 seg2 ...) [-filter] [-slice NNNN]");
    System.err.println("\toutput_dir\tname of the parent dir for output segment slice(s)");
    System.err.println("\t-dir segments\tparent dir containing several segments");
    System.err.println("\tseg1 seg2 ...\tlist of segment dirs");
    System.err.println("\t-filter\t\tfilter out URL-s prohibited by current URLFilters");
    System.err.println("\t-slice NNNN\tcreate many output segments, each containing NNNN URLs");
    return;
  }
  Configuration conf = NutchConfiguration.create();
  final FileSystem fs = FileSystem.get(conf);
  Path out = new Path(args[0]);
  // Typed list avoids the raw-type unchecked cast the original needed when
  // converting to Path[].
  ArrayList<Path> segs = new ArrayList<Path>();
  long sliceSize = 0;
  boolean filter = false;
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-dir")) {
      // Expand a parent directory into its segment subdirectories.
      Path[] files = fs.listPaths(
          new Path(args[++i]),
          new PathFilter() {
            public boolean accept(Path f) {
              try {
                if (fs.isDirectory(f)) return true;
              } catch (IOException e) {
                // best-effort: unreadable entries are simply skipped
              }
              return false;
            }
          });
      for (int j = 0; j < files.length; j++) segs.add(files[j]);
    } else if (args[i].equals("-filter")) {
      filter = true;
    } else if (args[i].equals("-slice")) {
      sliceSize = Long.parseLong(args[++i]);
    } else {
      segs.add(new Path(args[i]));
    }
  }
  if (segs.size() == 0) {
    System.err.println("ERROR: No input segments.");
    return;
  }
  SegmentMerger merger = new SegmentMerger(conf);
  merger.merge(out, segs.toArray(new Path[segs.size()]), filter, sliceSize);
}
/**
 * Verifies that DomainURLFilter accepts or rejects URLs according to the
 * sample hosts.txt domain list.
 */
public void testFilter() throws Exception {
  String hostsFile = SAMPLES + SEPARATOR + "hosts.txt";
  Configuration configuration = NutchConfiguration.create();
  DomainURLFilter filter = new DomainURLFilter(hostsFile);
  filter.setConf(configuration);
  // Table of URLs paired with whether the filter should pass them through
  // (non-null result) or drop them (null result), in the original check order.
  String[] urls = {
    "http://lucene.apache.org",
    "http://hadoop.apache.org",
    "http://www.apache.org",
    "http://www.google.com",
    "http://mail.yahoo.com",
    "http://www.foobar.net",
    "http://www.foobas.net",
    "http://www.yahoo.com",
    "http://www.foobar.be",
    "http://www.adobe.com"
  };
  boolean[] expectPass = {true, true, true, false, false, true, true, true, true, false};
  for (int i = 0; i < urls.length; i++) {
    if (expectPass[i]) {
      assertNotNull(filter.filter(urls[i]));
    } else {
      assertNull(filter.filter(urls[i]));
    }
  }
}
@Test public void testNoFilter() throws Exception { // https://issues.apache.org/jira/browse/NUTCH-2189 String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt"; Configuration conf = NutchConfiguration.create(); DomainURLFilter domainFilter = new DomainURLFilter(domainFile); domainFilter.setConf(conf); Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org")); Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org")); Assert.assertNotNull(domainFilter.filter("http://www.apache.org")); Assert.assertNotNull(domainFilter.filter("http://www.google.com")); Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com")); Assert.assertNotNull(domainFilter.filter("http://www.foobar.net")); Assert.assertNotNull(domainFilter.filter("http://www.foobas.net")); Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com")); Assert.assertNotNull(domainFilter.filter("http://www.foobar.be")); Assert.assertNotNull(domainFilter.filter("http://www.adobe.com")); }
/**
 * Command-line driver: normalizes each URL read from stdin (one per line) and
 * prints the result; prints "failed: &lt;line&gt;" for malformed URLs.
 *
 * @param args optional args[0] is the normalization scope (defaults to
 *     SCOPE_DEFAULT)
 */
public static void main(String args[]) throws IOException {
  BasicURLNormalizer normalizer = new BasicURLNormalizer();
  normalizer.setConf(NutchConfiguration.create());
  String scope = URLNormalizers.SCOPE_DEFAULT;
  if (args.length >= 1) {
    scope = args[0];
    System.out.println("Scope: " + scope);
  }
  String line, normUrl;
  // Decode stdin as UTF-8 explicitly; the no-charset constructor uses the
  // platform default, which makes results machine-dependent for non-ASCII URLs.
  BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
  while ((line = in.readLine()) != null) {
    try {
      normUrl = normalizer.normalize(line, scope);
      System.out.println(normUrl);
    } catch (MalformedURLException e) {
      System.out.println("failed: " + line);
    }
  }
  System.exit(0);
}
/**
 * Parses each sample OpenOffice document through the protocol layer and
 * asserts the whitespace-normalized extracted text matches the expectation.
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration configuration = NutchConfiguration.create();
  ProtocolFactory protocolFactory = new ProtocolFactory(configuration);
  OOParser ooParser = new OOParser();
  ooParser.setConf(configuration);
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    String url = "file:" + sampleDir + fileSeparator + sampleFiles[idx];
    Protocol proto = protocolFactory.getProtocol(url);
    Content fetched =
        proto.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
    Parse parsed = ooParser.getParse(fetched).get(fetched.getUrl());
    // Collapse runs of whitespace so formatting differences don't matter.
    String normalized = parsed.getText().replaceAll("[ \t\r\n]+", " ");
    assertTrue(expectedText.equals(normalized));
  }
}
/**
 * Command-line driver: reads record number {@code recno} from a segment's
 * content ArrayFile and prints it.
 *
 * @param argv (-local | -dfs &lt;namenode:port&gt;) recno segment
 */
public static void main(String argv[]) throws Exception {
  String usage = "Content (-local | -dfs <namenode:port>) recno segment";
  if (argv.length < 3) {
    System.out.println("usage:" + usage);
    return;
  }
  Options opts = new Options();
  Configuration conf = NutchConfiguration.create();
  // GenericOptionsParser consumes the filesystem selection flags.
  GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
  String[] remainingArgs = parser.getRemainingArgs();
  FileSystem fs = FileSystem.get(conf);
  try {
    int recno = Integer.parseInt(remainingArgs[0]);
    String segment = remainingArgs[1];
    Path file = new Path(segment, DIR_NAME);
    System.out.println("Reading from file: " + file);
    ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), conf);
    try {
      Content content = new Content();
      contents.get(recno, content);
      System.out.println("Retrieved " + recno + " from file " + file);
      System.out.println(content);
    } finally {
      // Close the reader even when get()/printing throws; the original only
      // closed it on the success path.
      contents.close();
    }
  } finally {
    fs.close();
  }
}
public static void main(String[] args) throws Exception { // args = new String[] {"/home/rollin/git/nutch/2.x/urls"}; args = new String[] {"/home/rollin/git/nutch/2.x/urls", "-crawlId", "jd"}; int res = ToolRunner.run(NutchConfiguration.create(), new InjectorJob(), args); System.exit(res); }
/**
 * Command-line entry point for GeneratorJob via Hadoop's ToolRunner.
 *
 * @param args passed through to {@link GeneratorJob}
 */
public static void main(String args[]) throws Exception {
  final Configuration conf = NutchConfiguration.create();
  final int exitCode = ToolRunner.run(conf, new GeneratorJob(), args);
  System.exit(exitCode);
}
/**
 * Skeleton test for the pooling data driver; contains no assertions yet.
 */
public class TestPoolingDataDriver extends TestCase {
  // Shared Nutch configuration available to future test methods.
  private static Configuration conf = NutchConfiguration.create();
  // JDBC connection; never opened in this skeleton.
  Connection connection = null;

  /** Intentionally empty: generation test not yet implemented. */
  public void testGeneration() {}
}
/**
 * Builds the shared test fixtures: parses each HTML page in {@code testPages}
 * into a DOM fragment via Tika's HTML parser (stored in {@code testDOMs}),
 * resolves the corresponding base-href URLs, and populates
 * {@code answerOutlinks} with the expected outlink set for each page.
 */
private static void setup() throws Exception {
  conf = NutchConfiguration.create();
  conf.setBoolean("parser.html.form.use_action", true);
  utils = new DOMContentUtils(conf);
  TikaParser tikaParser = new TikaParser();
  tikaParser.setConf(conf);
  // Obtain the concrete parser Tika registers for HTML content.
  Parser parser = tikaParser.getTikaConfig().getParser("text/html");
  for (int i = 0; i < testPages.length; i++) {
    Metadata tikamd = new Metadata();
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    // Parse into a detached fragment; DOMBuilder bridges Tika's SAX events
    // to the DOM tree rooted at 'root'.
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    // to add once available in Tika
    // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
    try {
      parser.parse(
          new ByteArrayInputStream(testPages[i].getBytes()), domhandler, tikamd, context);
      testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
    } catch (Exception e) {
      e.printStackTrace();
      fail("caught exception: " + e);
    }
    testDOMs[i] = root;
    // Echo input/serialized output to aid debugging failing fixtures.
    LSSerializerImpl lsi = new LSSerializerImpl();
    System.out.println("input " + i + ": '" + testPages[i] + "'");
    System.out.println("output " + i + ": '" + lsi.writeToString(root) + "'");
  }
  // Expected outlinks for each test page, indexed to match testPages.
  answerOutlinks =
      new Outlink[][] {
        // 0
        {
          new Outlink("http://www.nutch.org", "anchor"),
        },
        // 1
        {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
        },
        // 2
        {
          new Outlink("http://www.nutch.org/", "separate this"),
          new Outlink("http://www.nutch.org/docs/ok", "from this"),
        },
        // 3
        {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/1", "1"),
          new Outlink("http://www.nutch.org/docs/2", "2"),
        },
        // 4
        {
          new Outlink("http://www.nutch.org/frames/top.html", ""),
          new Outlink("http://www.nutch.org/frames/left.html", ""),
          new Outlink("http://www.nutch.org/frames/invalid.html", ""),
          new Outlink("http://www.nutch.org/frames/right.html", ""),
        },
        // 5
        {
          new Outlink("http://www.nutch.org/maps/logo.gif", ""),
          new Outlink("http://www.nutch.org/index.html", ""),
          new Outlink("http://www.nutch.org/maps/#bottom", ""),
          new Outlink("http://www.nutch.org/bot.html", ""),
          new Outlink("http://www.nutch.org/docs/index.html", "")
        },
        // 6
        {
          new Outlink("http://www.nutch.org/index.html", "whitespace test"),
        },
        // 7
        {},
        // 8
        {
          new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
        },
        // 9
        {},
        // 10
        {
          new Outlink("http://www.nutch.org/;x", "anchor1"),
          new Outlink("http://www.nutch.org/g;x", "anchor2"),
          new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
        },
        // 11
        {
          // this is tricky - see RFC3986 section 5.4.1 example 7
          new Outlink("http://www.nutch.org/g", "anchor1"),
          new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
          new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
          new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
          new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
        }
      };
}
/**
 * Command-line entry point for ParserChecker via Hadoop's ToolRunner.
 *
 * @param args passed through to {@link ParserChecker}
 */
public static void main(String[] args) throws Exception {
  final Configuration conf = NutchConfiguration.create();
  final int status = ToolRunner.run(conf, new ParserChecker(), args);
  System.exit(status);
}
/**
 * Command-line entry point: runs the index sorter and exits with its status.
 *
 * @param args passed through to {@code doMain}
 */
public static void main(String[] args) throws Exception {
  final IndexSorterArquivoWeb sorter = new IndexSorterArquivoWeb();
  final int status = sorter.doMain(NutchConfiguration.create(), args);
  System.exit(status);
}
// private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
/**
 * Builds a DocScore entry for every document in the index, scoring each one,
 * and returns the array sorted by score (new-order to old-order mapping).
 *
 * Scoring rules visible below: deleted docs get -1; docs whose "subType"
 * field is not in the configured include-extension list get -0.5; otherwise
 * the score is the integer value of the doc's "inlinks" field.
 * NOTE(review): assumes "inlinks" is always a parseable integer for included
 * docs — parseInt will throw otherwise; confirm against the indexing side.
 */
private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
  int readerMax = reader.maxDoc();
  DocScore[] newToOld = new DocScore[readerMax];
  // use site, an indexed, un-tokenized field to get boost
  // byte[] boosts = reader.norms("site"); TODO MC
  /* TODO MC */
  Document docMeta;
  // The include-extensions config value is a '|'-separated list; split it and
  // build a lookup table of extensions whose documents keep a real score.
  Pattern includes = Pattern.compile("\\|");
  String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
  String includeExtensions[] = includes.split(value);
  Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
  for (int i = 0; i < includeExtensions.length; i++) {
    validExtensions.put(includeExtensions[i], true);
    System.out.println("extension boosted " + includeExtensions[i]);
  }
  /* TODO MC */
  // Score every document in the old ordering.
  for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
    float score;
    if (reader.isDeleted(oldDoc)) {
      // score = 0.0f;
      score = -1f; // TODO MC
    } else {
      // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
      /* TODO MC */
      docMeta = searcher.doc(oldDoc);
      if (validExtensions.get(docMeta.get("subType")) == null) {
        // searched extensions will have higher scores
        score = -0.5f;
      } else {
        score = Integer.parseInt(docMeta.get("inlinks"));
        /* if (score==0) { score=0.001f; // TODO MC - to not erase } */
      }
      /* TODO MC */
      // System.out.println("Score for old document "+oldDoc+" is "+score+" and type
      // "+docMeta.get("subType")); // TODO MC debug remove
    }
    DocScore docScore = new DocScore();
    docScore.doc = oldDoc;
    docScore.score = score;
    newToOld[oldDoc] = docScore;
  }
  System.out.println("Sorting " + newToOld.length + " documents.");
  // Sort by DocScore's natural ordering (its compareTo is defined elsewhere).
  Arrays.sort(newToOld);
  // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space
  /* TODO MC int[] oldToNew = new int[readerMax]; for (int newDoc = 0; newDoc < readerMax; newDoc++) { DocScore docScore = newToOld[newDoc]; //oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC oldToNew[docScore.oldDoc] = newDoc; // TODO MC } */
  /* TODO MC
   * for (int newDoc = 0; newDoc < readerMax; newDoc++) { DocScore docScore = newToOld[newDoc]; System.out.println("Score for new document "+newDoc+" is "+docScore.score); // TODO MC debug remove }
   * TODO MC */
  // return oldToNew; TODO MC
  return newToOld; // TODO MC
}
/**
 * Command-line driver for SegmentReader. args[0] selects the mode (-dump,
 * -list or -get); "-no*" flags disable individual segment parts. Consumed
 * option slots are nulled out in-place so the later per-mode argument scans
 * can skip them.
 */
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return;
  }
  int mode = -1;
  if (args[0].equals("-dump")) mode = MODE_DUMP;
  else if (args[0].equals("-list")) mode = MODE_LIST;
  else if (args[0].equals("-get")) mode = MODE_GET;
  // Which segment parts to read: content, fetch, generate, parse,
  // parse-data, parse-text. All enabled unless a -no* flag turns one off.
  boolean co = true;
  boolean fe = true;
  boolean ge = true;
  boolean pa = true;
  boolean pd = true;
  boolean pt = true;
  // collect general options
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-nocontent")) {
      co = false;
      args[i] = null; // mark slot as consumed
    } else if (args[i].equals("-nofetch")) {
      fe = false;
      args[i] = null;
    } else if (args[i].equals("-nogenerate")) {
      ge = false;
      args[i] = null;
    } else if (args[i].equals("-noparse")) {
      pa = false;
      args[i] = null;
    } else if (args[i].equals("-noparsedata")) {
      pd = false;
      args[i] = null;
    } else if (args[i].equals("-noparsetext")) {
      pt = false;
      args[i] = null;
    }
  }
  Configuration conf = NutchConfiguration.create();
  final FileSystem fs = FileSystem.get(conf);
  SegmentReader segmentReader = new SegmentReader(conf, co, fe, ge, pa, pd, pt);
  // collect required args
  switch (mode) {
    case MODE_DUMP:
      // -dump <segment_dir> <output>
      String input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String output = args.length > 2 ? args[2] : null;
      if (output == null) {
        System.err.println("Missing required argument: <output>");
        usage();
        return;
      }
      segmentReader.dump(new Path(input), new Path(output));
      return;
    case MODE_LIST:
      // -list (-dir <segments_parent> | seg1 seg2 ...)
      ArrayList<Path> dirs = new ArrayList<Path>();
      for (int i = 1; i < args.length; i++) {
        if (args[i] == null) continue; // slot consumed by a -no* flag above
        if (args[i].equals("-dir")) {
          Path dir = new Path(args[++i]);
          FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
          Path[] files = HadoopFSUtil.getPaths(fstats);
          if (files != null && files.length > 0) {
            dirs.addAll(Arrays.asList(files));
          }
        } else dirs.add(new Path(args[i]));
      }
      segmentReader.list(dirs, new OutputStreamWriter(System.out, "UTF-8"));
      return;
    case MODE_GET:
      // -get <segment_dir> <keyValue>
      input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String key = args.length > 2 ? args[2] : null;
      if (key == null) {
        System.err.println("Missing required argument: <keyValue>");
        usage();
        return;
      }
      segmentReader.get(
          new Path(input),
          new Text(key),
          new OutputStreamWriter(System.out, "UTF-8"),
          new HashMap<String, List<Writable>>());
      return;
    default:
      System.err.println("Invalid operation: " + args[0]);
      usage();
      return;
  }
}
/**
 * Command-line entry point for SolrIndexer via Hadoop's ToolRunner.
 *
 * @param args passed through to {@link SolrIndexer}
 */
public static void main(String[] args) throws Exception {
  final Configuration configuration = NutchConfiguration.create();
  final int result = ToolRunner.run(configuration, new SolrIndexer(), args);
  System.exit(result);
}