Beispiel #1
0
  /**
   * For debugging: runs the query given as the first command-line argument and prints the total
   * hit count followed by the details and summary of at most ten hits.
   *
   * <p>Exits with status -1 when no query is given, 1 when the search fails and 0 otherwise.
   *
   * @param args command-line arguments; {@code args[0]} is the query string
   * @throws Exception if the bean cannot be created (search failures are caught and logged)
   */
  public static void main(String[] args) throws Exception {
    final String usage = "NutchBean query";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    final Configuration conf = NutchConfiguration.create();
    final NutchBean bean = new NutchBean(conf);
    try {
      final Query query = Query.parse(args[0], conf);
      final Hits hits = bean.search(query, 10);
      System.out.println("Total hits: " + hits.getTotal());
      final int length = (int) Math.min(hits.getTotal(), 10);
      final Hit[] show = hits.getHits(0, length);
      final HitDetails[] details = bean.getDetails(show);
      final Summary[] summaries = bean.getSummary(details, query);

      // Bound the loop by the arrays that were actually fetched above. The previous bound,
      // hits.getLength(), is independent of their size and could index past their end.
      final int shown = Math.min(details.length, summaries.length);
      for (int i = 0; i < shown; i++) {
        System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
      }
    } catch (Throwable t) {
      LOG.error("Exception occured while executing search: " + t, t);
      System.exit(1);
    }
    System.exit(0);
  }
Beispiel #2
0
    /**
     * Creates a {@code NutchBean} from the configuration associated with the servlet context and
     * stores it as a context attribute under {@code KEY}.
     *
     * <p>An {@link IOException} raised while creating the bean is logged; in that case the
     * attribute is not set.
     *
     * @param sce the event carrying the servlet context being initialized
     */
    public void contextInitialized(ServletContextEvent sce) {
      final ServletContext servletContext = sce.getServletContext();
      final Configuration conf = NutchConfiguration.get(servletContext);

      LOG.info("creating new bean");
      try {
        final NutchBean created = new NutchBean(conf);
        servletContext.setAttribute(KEY, created);
      } catch (final IOException ex) {
        LOG.error(StringUtils.stringifyException(ex));
      }
    }
Beispiel #3
0
  /**
   * Command-line entry point: reads the file named by the single argument as raw bytes, wraps the
   * bytes in a fake {@code text/html} {@code Content} and prints the encoding guessed for it.
   *
   * <p>Exits with status 1 when the argument count is not exactly one.
   *
   * @param args command-line arguments; {@code args[0]} is the path of the file to inspect
   * @throws IOException if the file cannot be opened or read
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.err.println("Usage: EncodingDetector <file>");
      System.exit(1);
    }

    Configuration conf = NutchConfiguration.create();
    EncodingDetector detector = new EncodingDetector(NutchConfiguration.create());

    // do everything as bytes; don't want any conversion
    ByteArrayOutputStream ostr = new ByteArrayOutputStream();
    BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0]));
    try {
      byte[] bytes = new byte[1000];
      int len;
      // read() may return fewer bytes than requested before the end of the file;
      // only a return value of -1 signals end of stream.
      while ((len = istr.read(bytes)) != -1) {
        ostr.write(bytes, 0, len);
      }
    } finally {
      // always release the file handle, even when reading fails
      istr.close();
    }

    byte[] data = ostr.toByteArray();

    // make a fake Content
    Content content = new Content("", "", data, "text/html", new Metadata(), conf);

    detector.autoDetectClues(content, true);
    String encoding =
        detector.guessEncoding(content, conf.get("parser.character.encoding.default"));
    System.out.println("Guessed encoding: " + encoding);
  }
 /**
  * Command-line entry point: collects the input segments named on the command line and merges
  * them into the output directory.
  *
  * <p>Prints usage and returns when fewer than two arguments are given; prints an error and
  * returns when an option is missing its value or when no input segments were collected.
  *
  * @param args {@code output_dir (-dir segments | seg1 seg2 ...) [-filter] [-slice NNNN]}
  * @throws Exception if listing the segments or the merge itself fails
  */
 public static void main(String[] args) throws Exception {
   if (args.length < 2) {
     System.err.println(
         "SegmentMerger output_dir (-dir segments | seg1 seg2 ...) [-filter] [-slice NNNN]");
     System.err.println("\toutput_dir\tname of the parent dir for output segment slice(s)");
     System.err.println("\t-dir segments\tparent dir containing several segments");
     System.err.println("\tseg1 seg2 ...\tlist of segment dirs");
     System.err.println("\t-filter\t\tfilter out URL-s prohibited by current URLFilters");
     System.err.println("\t-slice NNNN\tcreate many output segments, each containing NNNN URLs");
     return;
   }
   Configuration conf = NutchConfiguration.create();
   final FileSystem fs = FileSystem.get(conf);
   Path out = new Path(args[0]);
   ArrayList<Path> segs = new ArrayList<Path>();
   long sliceSize = 0;
   boolean filter = false;
   for (int i = 1; i < args.length; i++) {
     if (args[i].equals("-dir")) {
       // guard the look-ahead so a trailing "-dir" reports an error instead of overrunning args
       if (i + 1 >= args.length) {
         System.err.println("ERROR: -dir requires a directory argument.");
         return;
       }
       Path[] files =
           fs.listPaths(
               new Path(args[++i]),
               new PathFilter() {
                 public boolean accept(Path f) {
                   try {
                     if (fs.isDirectory(f)) return true;
                   } catch (IOException e) {
                     // best effort: a path that cannot be examined is treated as
                     // "not a directory" and therefore skipped
                   }
                   return false;
                 }
               });
       // NOTE(review): a null result is skipped defensively; confirm whether listPaths
       // can return null for a missing directory in the Hadoop version in use.
       if (files != null) {
         for (int j = 0; j < files.length; j++) segs.add(files[j]);
       }
     } else if (args[i].equals("-filter")) {
       filter = true;
     } else if (args[i].equals("-slice")) {
       // same look-ahead guard as for -dir
       if (i + 1 >= args.length) {
         System.err.println("ERROR: -slice requires a numeric argument.");
         return;
       }
       sliceSize = Long.parseLong(args[++i]);
     } else {
       segs.add(new Path(args[i]));
     }
   }
   if (segs.size() == 0) {
     System.err.println("ERROR: No input segments.");
     return;
   }
   SegmentMerger merger = new SegmentMerger(conf);
   merger.merge(out, segs.toArray(new Path[segs.size()]), filter, sliceSize);
 }
  /**
   * Runs a fixed list of URLs through a {@code DomainURLFilter} configured from the sample
   * {@code hosts.txt} file and checks, in order, that each one is either kept (non-null result)
   * or dropped (null result).
   */
  public void testFilter() throws Exception {
    final String hostsPath = SAMPLES + SEPARATOR + "hosts.txt";
    final Configuration conf = NutchConfiguration.create();
    final DomainURLFilter filterUnderTest = new DomainURLFilter(hostsPath);
    filterUnderTest.setConf(conf);

    // urls[k] is expected to pass the filter exactly when kept[k] is true
    final String[] urls = {
      "http://lucene.apache.org",
      "http://hadoop.apache.org",
      "http://www.apache.org",
      "http://www.google.com",
      "http://mail.yahoo.com",
      "http://www.foobar.net",
      "http://www.foobas.net",
      "http://www.yahoo.com",
      "http://www.foobar.be",
      "http://www.adobe.com"
    };
    final boolean[] kept = {true, true, true, false, false, true, true, true, true, false};

    for (int k = 0; k < urls.length; k++) {
      if (kept[k]) {
        assertNotNull(filterUnderTest.filter(urls[k]));
      } else {
        assertNull(filterUnderTest.filter(urls[k]));
      }
    }
  }
Beispiel #6
0
 /**
  * Points the filter at a domain file that does not exist and checks that every URL in a fixed
  * list still yields a non-null result.
  */
 @Test
 public void testNoFilter() throws Exception {
   // https://issues.apache.org/jira/browse/NUTCH-2189
   final String missingFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
   final Configuration conf = NutchConfiguration.create();
   final DomainURLFilter filterUnderTest = new DomainURLFilter(missingFile);
   filterUnderTest.setConf(conf);

   final String[] urls = {
     "http://lucene.apache.org",
     "http://hadoop.apache.org",
     "http://www.apache.org",
     "http://www.google.com",
     "http://mail.yahoo.com",
     "http://www.foobar.net",
     "http://www.foobas.net",
     "http://www.yahoo.com",
     "http://www.foobar.be",
     "http://www.adobe.com"
   };
   for (int k = 0; k < urls.length; k++) {
     Assert.assertNotNull(filterUnderTest.filter(urls[k]));
   }
 }
Beispiel #7
0
 /**
  * Reads URLs line by line from standard input, normalizes each one with a
  * {@code BasicURLNormalizer} and prints the result; lines that are rejected as malformed are
  * echoed with a {@code failed:} prefix.
  *
  * @param args optional; {@code args[0]} overrides the default normalization scope
  * @throws IOException if reading standard input fails
  */
 public static void main(String args[]) throws IOException {
   final BasicURLNormalizer urlNormalizer = new BasicURLNormalizer();
   urlNormalizer.setConf(NutchConfiguration.create());

   final boolean scopeGiven = args.length >= 1;
   final String scope = scopeGiven ? args[0] : URLNormalizers.SCOPE_DEFAULT;
   if (scopeGiven) {
     System.out.println("Scope: " + scope);
   }

   final BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
   for (String url = stdin.readLine(); url != null; url = stdin.readLine()) {
     try {
       System.out.println(urlNormalizer.normalize(url, scope));
     } catch (MalformedURLException e) {
       System.out.println("failed: " + url);
     }
   }
   System.exit(0);
 }
Beispiel #8
0
  /**
   * For every sample file: fetches it through the protocol resolved for its {@code file:} URL,
   * parses the content with an {@code OOParser}, collapses whitespace runs in the extracted text
   * to single spaces and checks that the result equals {@code expectedText}.
   */
  public void testIt() throws ProtocolException, ParseException {
    final Configuration conf = NutchConfiguration.create();
    final ProtocolFactory protocols = new ProtocolFactory(conf);
    final OOParser ooParser = new OOParser();
    ooParser.setConf(conf);

    for (int idx = 0; idx < sampleFiles.length; idx++) {
      final String url = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

      final Protocol fetcher = protocols.getProtocol(url);
      final Content fetched =
          fetcher.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();

      final Parse parsed = ooParser.getParse(fetched).get(fetched.getUrl());

      final String normalized = parsed.getText().replaceAll("[ \t\r\n]+", " ");
      assertTrue(expectedText.equals(normalized));
    }
  }
Beispiel #9
0
  /**
   * Command-line entry point: opens the content file of a segment and prints the record with the
   * given number.
   *
   * <p>Usage is printed when fewer than three raw arguments are given, or when fewer than two
   * arguments remain after generic option parsing.
   *
   * @param argv {@code (-local | -dfs <namenode:port>) recno segment}
   * @throws Exception if option parsing, file-system access or reading the record fails
   */
  public static void main(String argv[]) throws Exception {

    String usage = "Content (-local | -dfs <namenode:port>) recno segment";

    if (argv.length < 3) {
      System.out.println("usage:" + usage);
      return;
    }
    Options opts = new Options();
    Configuration conf = NutchConfiguration.create();

    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);

    String[] remainingArgs = parser.getRemainingArgs();
    // recno and segment are read from the arguments left over after option parsing,
    // so their presence has to be checked here rather than on the raw argv
    if (remainingArgs.length < 2) {
      System.out.println("usage:" + usage);
      return;
    }
    FileSystem fs = FileSystem.get(conf);

    try {
      int recno = Integer.parseInt(remainingArgs[0]);
      String segment = remainingArgs[1];

      Path file = new Path(segment, DIR_NAME);
      System.out.println("Reading from file: " + file);

      ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), conf);
      try {
        Content content = new Content();
        contents.get(recno, content);
        System.out.println("Retrieved " + recno + " from file " + file);

        System.out.println(content);
      } finally {
        // close the reader even when the lookup or printing fails
        contents.close();
      }
    } finally {
      fs.close();
    }
  }
Beispiel #10
0
 /**
  * Command-line entry point: runs {@code InjectorJob} through {@code ToolRunner} with the
  * arguments supplied by the caller and exits with the returned status.
  *
  * @param args command-line arguments, passed through unchanged
  * @throws Exception if the job run fails
  */
 public static void main(String[] args) throws Exception {
   // The caller's arguments are used as given; a hard-coded developer path and crawl id
   // used to overwrite them here, which made the real command line ineffective.
   int res = ToolRunner.run(NutchConfiguration.create(), new InjectorJob(), args);
   System.exit(res);
 }
Beispiel #11
0
 /**
  * Command-line entry point: runs {@code GeneratorJob} through {@code ToolRunner} with a freshly
  * created Nutch configuration and exits with the returned status.
  *
  * @param args command-line arguments, passed through unchanged
  * @throws Exception if the job run fails
  */
 public static void main(String args[]) throws Exception {
   System.exit(ToolRunner.run(NutchConfiguration.create(), new GeneratorJob(), args));
 }
/**
 * Skeleton test case: it declares a shared configuration and a connection field, but its only
 * test method has an empty body.
 */
public class TestPoolingDataDriver extends TestCase {
  /** Nutch configuration created once when the class is initialized. */
  private static Configuration conf = NutchConfiguration.create();

  /** Connection handle; starts out as {@code null} and is not assigned anywhere in this class. */
  Connection connection;

  /** Placeholder test with no statements. */
  public void testGeneration() {
    // intentionally empty
  }
}
  /**
   * Prepares the shared fixtures: creates the configuration (with
   * {@code parser.html.form.use_action} enabled) and the {@code DOMContentUtils}, parses every
   * entry of {@code testPages} into a DOM fragment with the Tika {@code text/html} parser, records
   * the matching base URL, and fills {@code answerOutlinks} with the expected outlinks per page.
   *
   * <p>Each page and its serialized DOM are echoed to standard output; a parse or URL failure
   * prints the stack trace and fails the test.
   */
  private static void setup() throws Exception {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);
    TikaParser tika = new TikaParser();
    tika.setConf(conf);
    Parser htmlParser = tika.getTikaConfig().getParser("text/html");

    for (int page = 0; page < testPages.length; page++) {
      Metadata tikaMetadata = new Metadata();

      HTMLDocumentImpl htmlDoc = new HTMLDocumentImpl();
      htmlDoc.setErrorChecking(false);
      DocumentFragment fragment = htmlDoc.createDocumentFragment();
      DOMBuilder domBuilder = new DOMBuilder(htmlDoc, fragment);
      ParseContext parseContext = new ParseContext();
      // to add once available in Tika
      // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
      try {
        htmlParser.parse(
            new ByteArrayInputStream(testPages[page].getBytes()),
            domBuilder,
            tikaMetadata,
            parseContext);
        testBaseHrefURLs[page] = new URL(testBaseHrefs[page]);
      } catch (Exception e) {
        e.printStackTrace();
        fail("caught exception: " + e);
      }
      testDOMs[page] = fragment;

      LSSerializerImpl serializer = new LSSerializerImpl();
      System.out.println("input " + page + ": '" + testPages[page] + "'");
      System.out.println("output " + page + ": '" + serializer.writeToString(fragment) + "'");
    }

    // expected outlinks, indexed by test page
    answerOutlinks =
        new Outlink[][] {
          // 0
          {
            new Outlink("http://www.nutch.org", "anchor"),
          },
          // 1
          {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
          },
          // 2
          {
            new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this"),
          },

          // 3
          {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/1", "1"),
            new Outlink("http://www.nutch.org/docs/2", "2"),
          },
          // 4
          {
            new Outlink("http://www.nutch.org/frames/top.html", ""),
            new Outlink("http://www.nutch.org/frames/left.html", ""),
            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
            new Outlink("http://www.nutch.org/frames/right.html", ""),
          },
          // 5
          {
            new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
            new Outlink("http://www.nutch.org/docs/index.html", "")
          },
          // 6
          {
            new Outlink("http://www.nutch.org/index.html", "whitespace test"),
          },
          // 7
          {},
          // 8
          {
            new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
          },
          // 9
          {},
          // 10
          {
            new Outlink("http://www.nutch.org/;x", "anchor1"),
            new Outlink("http://www.nutch.org/g;x", "anchor2"),
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
          },
          // 11
          {
            // this is tricky - see RFC3986 section 5.4.1 example 7
            new Outlink("http://www.nutch.org/g", "anchor1"),
            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
            new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
          }
        };
  }
 /**
  * Command-line entry point: runs {@code ParserChecker} through {@code ToolRunner} with a freshly
  * created Nutch configuration and exits with the returned status.
  *
  * @param args command-line arguments, passed through unchanged
  * @throws Exception if the tool run fails
  */
 public static void main(String[] args) throws Exception {
   System.exit(ToolRunner.run(NutchConfiguration.create(), new ParserChecker(), args));
 }
 /**
  * Command-line entry point: delegates to {@code doMain} on a new {@code IndexSorterArquivoWeb}
  * with a freshly created Nutch configuration and exits with the returned status.
  *
  * @param args command-line arguments, passed through unchanged
  * @throws Exception if {@code doMain} fails
  */
 public static void main(String[] args) throws Exception {
   System.exit(new IndexSorterArquivoWeb().doMain(NutchConfiguration.create(), args));
 }
  /**
   * Builds one {@code DocScore} per document id of {@code reader} and returns them sorted with
   * {@link Arrays#sort(Object[])} (the resulting order is defined by {@code DocScore}'s own
   * comparison).
   *
   * <p>Scoring rules:
   *
   * <ul>
   *   <li>deleted documents get {@code -1};
   *   <li>documents whose {@code subType} field is missing, or is not one of the values configured
   *       under {@code INCLUDE_EXTENSIONS_KEY} ({@code |}-separated), get {@code -0.5};
   *   <li>all other documents get the integer value of their {@code inlinks} field.
   * </ul>
   *
   * @param reader index reader supplying the document count and deletion status
   * @param searcher searcher used to load the stored fields of each live document
   * @return the sorted array of document/score pairs, one per document id
   * @throws IOException if a document cannot be loaded
   */
  // private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
  private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
    int readerMax = reader.maxDoc();
    DocScore[] newToOld = new DocScore[readerMax];

    // use site, an indexed, un-tokenized field to get boost
    // byte[] boosts = reader.norms("site"); TODO MC
    /* TODO MC */
    Document docMeta;
    Pattern includes = Pattern.compile("\\|");
    String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
    String includeExtensions[] = includes.split(value);
    Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
    for (int i = 0; i < includeExtensions.length; i++) {
      validExtensions.put(includeExtensions[i], true);
      System.out.println("extension boosted " + includeExtensions[i]);
    }
    /* TODO MC */

    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
      float score;
      if (reader.isDeleted(oldDoc)) {
        // score = 0.0f;
        score = -1f; // TODO MC
      } else {
        // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
        /* TODO MC */
        docMeta = searcher.doc(oldDoc);
        String subType = docMeta.get("subType");
        // Hashtable rejects null keys (get(null) throws NullPointerException), so a document
        // without a "subType" field is classified as "not boosted" before the lookup.
        if (subType == null
            || validExtensions.get(subType)
                == null) { // searched extensions will have higher scores
          score = -0.5f;
        } else {
          score = Integer.parseInt(docMeta.get("inlinks"));
          /*
          if (score==0) {
          	score=0.001f; // TODO MC - to not erase
          }
          */
        }
        /* TODO MC */
        // System.out.println("Score for old document "+oldDoc+" is "+score+" and type
        // "+docMeta.get("subType")); // TODO MC debug remove
      }
      DocScore docScore = new DocScore();
      docScore.doc = oldDoc;
      docScore.score = score;
      newToOld[oldDoc] = docScore;
    }

    System.out.println("Sorting " + newToOld.length + " documents.");
    Arrays.sort(newToOld);
    // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

    /* TODO MC
    int[] oldToNew = new int[readerMax];
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
      DocScore docScore = newToOld[newDoc];
      //oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
      oldToNew[docScore.oldDoc] = newDoc; // TODO MC
    }
    */

    /* TODO MC *
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    	DocScore docScore = newToOld[newDoc];
    	System.out.println("Score for new document "+newDoc+" is "+docScore.score); // TODO MC debug remove
    }
    * TODO MC */

    // return oldToNew; TODO MC
    return newToOld; // TODO MC
  }
  /**
   * Command-line entry point of the segment reader.
   *
   * <p>{@code args[0]} selects the operation ({@code -dump}, {@code -list} or {@code -get}). The
   * general switches {@code -nocontent}, {@code -nofetch}, {@code -nogenerate}, {@code -noparse},
   * {@code -noparsedata} and {@code -noparsetext} each turn off one of the flags passed to the
   * {@code SegmentReader} constructor; they are removed from the argument list (set to
   * {@code null}) before the operation-specific arguments are read.
   *
   * <p>Usage is printed when fewer than two arguments are given, when the operation is unknown,
   * or when a required operation argument is missing.
   *
   * @param args command-line arguments as described above
   * @throws Exception if file-system access or the selected operation fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      usage();
      return;
    }
    int mode = -1;
    if (args[0].equals("-dump")) mode = MODE_DUMP;
    else if (args[0].equals("-list")) mode = MODE_LIST;
    else if (args[0].equals("-get")) mode = MODE_GET;

    boolean co = true;
    boolean fe = true;
    boolean ge = true;
    boolean pa = true;
    boolean pd = true;
    boolean pt = true;
    // collect general options; each recognized switch is nulled out so that the
    // operation-specific parsing below does not mistake it for a path or key
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-nocontent")) {
        co = false;
        args[i] = null;
      } else if (args[i].equals("-nofetch")) {
        fe = false;
        args[i] = null;
      } else if (args[i].equals("-nogenerate")) {
        ge = false;
        args[i] = null;
      } else if (args[i].equals("-noparse")) {
        pa = false;
        args[i] = null;
      } else if (args[i].equals("-noparsedata")) {
        pd = false;
        args[i] = null;
      } else if (args[i].equals("-noparsetext")) {
        pt = false;
        args[i] = null;
      }
    }
    Configuration conf = NutchConfiguration.create();
    final FileSystem fs = FileSystem.get(conf);
    SegmentReader segmentReader = new SegmentReader(conf, co, fe, ge, pa, pd, pt);
    // collect required args
    switch (mode) {
      case MODE_DUMP:
        String input = args[1];
        if (input == null) {
          System.err.println("Missing required argument: <segment_dir>");
          usage();
          return;
        }
        String output = args.length > 2 ? args[2] : null;
        if (output == null) {
          System.err.println("Missing required argument: <output>");
          usage();
          return;
        }
        segmentReader.dump(new Path(input), new Path(output));
        return;
      case MODE_LIST:
        ArrayList<Path> dirs = new ArrayList<Path>();
        for (int i = 1; i < args.length; i++) {
          if (args[i] == null) continue;
          if (args[i].equals("-dir")) {
            // "-dir" needs a value: without this guard a trailing "-dir" overruns args, and a
            // value that was consumed as a general option would reach Path as null
            if (i + 1 >= args.length || args[i + 1] == null) {
              System.err.println("Missing value for option: -dir");
              usage();
              return;
            }
            Path dir = new Path(args[++i]);
            FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
            Path[] files = HadoopFSUtil.getPaths(fstats);
            if (files != null && files.length > 0) {
              dirs.addAll(Arrays.asList(files));
            }
          } else dirs.add(new Path(args[i]));
        }
        segmentReader.list(dirs, new OutputStreamWriter(System.out, "UTF-8"));
        return;
      case MODE_GET:
        input = args[1];
        if (input == null) {
          System.err.println("Missing required argument: <segment_dir>");
          usage();
          return;
        }
        String key = args.length > 2 ? args[2] : null;
        if (key == null) {
          System.err.println("Missing required argument: <keyValue>");
          usage();
          return;
        }
        segmentReader.get(
            new Path(input),
            new Text(key),
            new OutputStreamWriter(System.out, "UTF-8"),
            new HashMap<String, List<Writable>>());
        return;
      default:
        System.err.println("Invalid operation: " + args[0]);
        usage();
        return;
    }
  }
Beispiel #18
0
 /**
  * Command-line entry point: runs {@code SolrIndexer} through {@code ToolRunner} with a freshly
  * created Nutch configuration and exits with the returned status.
  *
  * @param args command-line arguments, passed through unchanged
  * @throws Exception if the tool run fails
  */
 public static void main(String[] args) throws Exception {
   System.exit(ToolRunner.run(NutchConfiguration.create(), new SolrIndexer(), args));
 }