Esempio n. 1
0
  /**
   * Sets up a single-argument {@code Crawler(1)}, seeds it with one FOAF document and starts a
   * breadth-first evaluation whose output callback writes to {@code System.out}.
   *
   * <p>The JVM-wide {@code http.proxyHost}/{@code http.proxyPort} properties are set to
   * {@code localhost:3128} first and are not restored afterwards.
   *
   * @throws Exception if the seed URI cannot be parsed or the crawl setup fails
   */
  public void testCrawl2() throws Exception {
    // Route HTTP traffic through the local proxy.
    System.setProperty("http.proxyHost", "localhost");
    System.setProperty("http.proxyPort", "3128");

    Crawler crawler = new Crawler(1);

    // Seed frontier: a single RDF document.
    Frontier seeds = new BasicFrontier();
    seeds.add(new URI("http://harth.org/andreas/foaf.rdf"));

    // The link filter is bound to the same frontier that is crawled below.
    LinkFilter linkFilter = new LinkFilterDefault(seeds);

    crawler.setFetchFilter(new FetchFilterRdfXml());
    crawler.setLinkFilter(linkFilter);

    // Error handler built with two null arguments, exactly as the logger constructor is given.
    ErrorHandler errorHandler = new ErrorHandlerLogger(null, null);
    crawler.setErrorHandler(errorHandler);

    // Output goes through an Nx callback wrapped around standard out.
    Callback stdoutWriter = new CallbackNxOutputStream(System.out);
    SinkCallback sink = new SinkCallback(stdoutWriter, true);
    crawler.setOutputCallback(sink);

    // NOTE(review): the arguments 1, -1, -1 presumably mean "one round, no limits" -- confirm
    // against the Crawler API.
    crawler.evaluateBreadthFirst(seeds, 1, -1, -1);
  }
Esempio n. 2
0
  /**
   * Loads every line of {@code test/uris.txt.gz} as a URI into a frontier, schedules the frontier
   * on a {@code BreadthFirstQueue}, then drains the queue while randomly marking about 1% of the
   * polled URIs as 303 redirects and adding them back to the queue.
   *
   * <p>The method makes no assertions; it prints the final queue, its size, the initial size,
   * the number of simulated redirects and the elapsed time to standard out.
   *
   * @throws Exception if the input file cannot be read or a line is not a valid URI
   */
  public void testNormalise() throws Exception {
    long time = System.currentTimeMillis();

    TldManager tldm = new TldManager();

    BreadthFirstQueue fq = new BreadthFirstQueue(tldm, Integer.MAX_VALUE, Integer.MAX_VALUE);

    Frontier f = new BasicFrontier();

    // Number of lines (URIs) read from the input file.
    int i = 0;

    // Each stream is declared as its own resource so that all of them are closed even when a
    // later constructor, readLine() or new URI(line) throws.
    try (InputStream raw = new FileInputStream("test/uris.txt.gz");
        InputStream is = new GZIPInputStream(raw);
        BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
      String line;
      while ((line = br.readLine()) != null) {
        i++;
        f.add(new URI(line));
      }
    }

    fq.schedule(f);

    int size = fq.size();

    // The first polled URI only primes the loop; it is never considered for a redirect.
    URI u = fq.poll();
    Random r = new Random();

    // Number of loop iterations, i.e. polls performed after the priming poll (the final poll
    // that returns null is included).
    int j = 0;
    int redirects = 0;

    while (u != null) {
      u = fq.poll();

      // Roughly one in a hundred polled URIs is registered as redirected and re-queued.
      if (u != null && r.nextFloat() < 0.01) {
        fq.setRedirect(u, new URI("http://dbpedia.org/resource/Redirect"), 303);
        fq.addDirectly(u);
        System.out.println("adding " + u);
        redirects++;
      }
      j++;
    }

    long time1 = System.currentTimeMillis();

    System.out.println(fq);
    System.out.println(fq.size());
    System.out.println("initial queue size " + size);
    System.out.println("redirects " + redirects);

    System.out.println(i + " uris, " + j + " polled, in " + (time1 - time) + " ms");
  }
Esempio n. 3
0
  /**
   * Runs a breadth-first crawl whose settings come from the XML file at {@code configFilePath}.
   *
   * <p>The parsed configuration must contain the integer settings {@code numberOfThread},
   * {@code depth}, {@code maxURIs}, {@code maxplds} and {@code minActplds}, plus {@code seedFile}
   * (a text file with one seed URI per line). {@code linkFilterDomain} is an optional
   * {@code ';'}-separated host list; when it is missing, empty, or starts with an empty entry the
   * default link filter is used instead of the domain filter.
   *
   * <p>Problems while reading seeds, building the content handler or opening the error log are
   * printed via {@code printStackTrace()} and the crawl continues with whatever was set up.
   *
   * @param configFilePath path of the XML configuration file handed to {@code XMLParser.parse}
   * @throws NumberFormatException if one of the integer settings is missing or not a number
   */
  public static void breadthFirstCrawling(String configFilePath) {

    // Read the XML config file.
    Hashtable<String, String> configMap = XMLParser.parse(configFilePath);

    int numberOfThread = Integer.parseInt(configMap.get("numberOfThread"));
    String seedFile = configMap.get("seedFile");
    int depth = Integer.parseInt(configMap.get("depth"));
    int maxURIs = Integer.parseInt(configMap.get("maxURIs"));
    int maxplds = Integer.parseInt(configMap.get("maxplds"));
    int minActplds = Integer.parseInt(configMap.get("minActplds"));
    String linkFilterDomain = configMap.get("linkFilterDomain");

    Crawler crawler = new Crawler(numberOfThread);

    // Add the seed URIs to the frontier.
    Frontier frontier = new RankedFrontier();

    // try-with-resources closes the seed file even when reading fails part-way.
    try (BufferedReader seedReader = new BufferedReader(new FileReader(new File(seedFile)))) {
      String seedLine;
      while ((seedLine = seedReader.readLine()) != null) {
        try {
          frontier.add(new URI(seedLine));
        } catch (URISyntaxException e) {
          // A single malformed line must not discard the remaining seeds: report and go on.
          e.printStackTrace();
        }
      }
    } catch (IOException e) {
      // Covers a missing seed file (FileNotFoundException) as well as read errors.
      e.printStackTrace();
    }

    // Content handlers: RDF/XML, plus Any23 through a service on localhost.
    try {
      ContentHandler contentHandler =
          new ContentHandlers(
              new ContentHandlerRdfXml(),
              new ContentHandlerAny23(new URI("http://localhost:8080/apache-any23-service/")));
      crawler.setContentHandler(contentHandler);
    } catch (URISyntaxException e) {
      e.printStackTrace();
    }

    // NOTE: no output sink is attached to the crawler here. An earlier, disabled experiment
    // noted that using a CallbackNQOutputStream as output stopped the crawling thread once an
    // RDF file was finished (cause unknown).

    // Link filter: restrict to the configured hosts when any are given. A null setting or a
    // value such as ";" (which splits into an empty array) falls back to the default filter.
    String[] domains = linkFilterDomain == null ? new String[0] : linkFilterDomain.split(";");
    LinkFilter linkFilter;
    if (domains.length > 0 && !domains[0].isEmpty()) {
      LinkFilterDomain domainFilter = new LinkFilterDomain(frontier);
      for (String domain : domains) {
        domainFilter.addHost(domain);
      }
      linkFilter = domainFilter;
    } else {
      // The link filter is given the frontier so it can add newly found URIs to it.
      linkFilter = new LinkFilterDefault(frontier);
    }
    crawler.setLinkFilter(linkFilter);

    // Blacklist by suffix.
    FetchFilter blackList = new FetchFilterSuffix(CrawlerConstants.BLACKLIST);
    crawler.setBlacklistFilter(blackList);

    // Error handler: reports to stdout and to the file "errorLogFile".
    try {
      PrintStream ps = System.out;
      // NOTE(review): this stream is never closed in this method; closing it safely requires
      // knowing whether the error handler still writes after the crawl returns -- confirm.
      FileOutputStream fos = new FileOutputStream("errorLogFile");

      Callback rcb = new CallbackNQOutputStream(fos);
      ErrorHandler eh = new ErrorHandlerLogger(ps, rcb, true);
      rcb.startDocument();

      // Connect the hooks with the error handler.
      crawler.setErrorHandler(eh);
      frontier.setErrorHandler(eh);
      linkFilter.setErrorHandler(eh);
    } catch (FileNotFoundException e) {
      // Without the log file the crawl runs with no error handler installed by this method.
      e.printStackTrace();
    }

    crawler.evaluateBreadthFirst(
        frontier,
        new HashSetSeen(),
        new HashTableRedirects(),
        depth,
        maxURIs,
        maxplds,
        minActplds,
        false,
        Mode.ABOX_AND_TBOX);

    // The filter may have been created as a LinkFilterDomain; only print statistics when it
    // really is a LinkFilterDefault so a domain-restricted crawl cannot end in a
    // ClassCastException.
    if (linkFilter instanceof LinkFilterDefault) {
      ((LinkFilterDefault) linkFilter).printStatistics();
    }
  }