/**
 * Looks for {@code targe_name} in the google.fr result pages for {@code keyword}, fetching each
 * page through the HTTP proxy on localhost:3128.
 *
 * <p>Up to three result pages are fetched, with a random pause before each request. Result
 * links are counted across pages; the search stops at the first link whose URL contains
 * {@code targe_name}. A page that fails to download is retried, but the method gives up after
 * three consecutive failures so that a dead proxy or network cannot make it loop forever. If
 * the calling thread is interrupted, the interrupt flag is restored and the search stops.
 *
 * @param keyword search keyword; spaces are replaced by {@code %20} before it is stored in the
 *     result and put in the query URL
 * @param targe_name substring to look for in each result URL
 * @return a {@code RankInfo} carrying the (space-escaped) keyword, the position of the first
 *     matching link (30 when nothing matched) and the matching URL (empty when nothing matched)
 */
public static RankInfo proxy_ranking_keyword(String keyword, String targe_name) {
    RankInfo info = new RankInfo();
    keyword = keyword.replace(" ", "%20");
    info.setKeyword(keyword);
    // we here fetch up to three paginations
    int nb_depth = 3;
    // a page that cannot be fetched is retried, but only this many times in a row
    int max_consecutive_failures = 3;
    int consecutive_failures = 0;
    int depth = 0;
    int nb_results = 0;
    int my_rank = 30;
    String my_url = "";
    boolean found = false;
    while (depth < nb_depth && !found) {
      HttpURLConnection connection = null;
      try {
        // random pause before each request; randInt presumably returns a number of seconds
        // between the two configured bounds -- confirm against its definition
        Thread.sleep(randInt(min_number_of_wait_times, max_number_of_wait_times) * 1000);
        System.out.println("Fetching a new page");
        String constructed_url =
            "https://www.google.fr/search?q=" + keyword + "&start=" + Integer.toString(depth * 10);
        // we here use our properly configured squid proxy on port 3128 on localhost
        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("localhost", 3128));
        URL url = new URL(constructed_url);
        connection = (HttpURLConnection) url.openConnection(proxy);
        connection.setConnectTimeout(1000000000);
        connection.setRequestProperty("User-Agent", randomUserAgent());
        connection.connect();

        // NOTE(review): the reader uses the platform default charset, as before -- confirm
        // whether the response charset should be honoured instead.
        StringBuilder builder = new StringBuilder();
        BufferedReader buff =
            new BufferedReader(new InputStreamReader(connection.getInputStream()));
        try {
          String line;
          // stop BEFORE appending the terminating null (the old do/while appended "null")
          while ((line = buff.readLine()) != null) {
            // keep the line break so tokens split across lines are not glued together
            builder.append(line).append('\n');
          }
        } finally {
          buff.close();
        }

        org.jsoup.nodes.Document doc = Jsoup.parse(builder.toString());
        Elements serps = doc.select("h3[class=r]");
        for (Element serp : serps) {
          Element link = serp.getElementsByTag("a").first();
          if (link != null) {
            String linkref = link.attr("href");
            boolean redirect = linkref.startsWith("/url?q=");
            if (redirect || linkref.startsWith("http://")) {
              nb_results++;
              // strip the "/url?q=" prefix (7 chars) and everything from the first '&' on;
              // an href without '&' is kept up to its end instead of throwing
              int begin = redirect ? 7 : 0;
              int amp = linkref.indexOf("&");
              linkref = (amp != -1) ? linkref.substring(begin, amp) : linkref.substring(begin);
            }
            // only the first match counts
            if (linkref.contains(targe_name) && !found) {
              my_rank = nb_results;
              my_url = linkref;
              found = true;
            }
          }
        }
        if (nb_results == 0) {
          System.out.println("Warning captcha");
        }
        depth++;
        consecutive_failures = 0;
      } catch (IOException e) {
        e.printStackTrace();
        consecutive_failures++;
        if (consecutive_failures >= max_consecutive_failures) {
          System.out.println("Giving up after " + consecutive_failures + " failed fetches");
          break;
        }
      } catch (InterruptedException e) {
        e.printStackTrace();
        // restore the flag for the caller and stop: sleeping again would throw immediately
        Thread.currentThread().interrupt();
        break;
      } finally {
        if (connection != null) {
          connection.disconnect();
        }
      }
    }
    info.setPosition(my_rank);
    info.setUrl(my_url);
    if (nb_results == 0) {
      System.out.println("Warning captcha");
    } else {
      System.out.println("Number of links read in the pages : " + nb_results);
    }
    System.out.println("My rank : " + my_rank + " for keyword : " + keyword);
    System.out.println("My URL : " + my_url + " for keyword : " + keyword);
    return info;
  }
示例#2
0
 /**
  * Looks for {@code targe_name} in the google.fr result pages for {@code keyword}, fetching the
  * pages directly with Jsoup.
  *
  * <p>Up to five result pages are fetched, with a random pause before each request. Links whose
  * href starts with {@code /url?q=} are counted across pages; the search stops at the first
  * link whose URL contains {@code targe_name}. A page that fails to download is retried, but
  * the method gives up after three consecutive failures so that a dead network cannot make it
  * loop forever. If the calling thread is interrupted, the interrupt flag is restored and the
  * search stops.
  *
  * @param keyword search keyword, inserted as-is into the query URL
  * @param targe_name substring to look for in each result URL
  * @return a {@code RankInfo} carrying the keyword, the position of the first matching link (50
  *     when nothing matched) and the matching URL (empty when nothing matched)
  */
 public static RankInfo ranking_keyword(String keyword, String targe_name) {
   RankInfo info = new RankInfo();
   info.setKeyword(keyword);
   // we here fetch up to five paginations
   int nb_depth = 5;
   // a page that cannot be fetched is retried, but only this many times in a row
   int max_consecutive_failures = 3;
   int consecutive_failures = 0;
   int depth = 0;
   int nb_results = 0;
   int my_rank = 50;
   String my_url = "";
   boolean found = false;
   while (depth < nb_depth && !found) {
     try {
       // random pause before each request; randInt(30, 50) presumably yields a number of
       // seconds between 30 and 50 -- confirm against its definition
       Thread.sleep(randInt(30, 50) * 1000);
       System.out.println("Fetching a new page");
       org.jsoup.nodes.Document doc =
           Jsoup.connect(
                   "https://www.google.fr/search?q="
                       + keyword
                       + "&start="
                       + Integer.toString(depth * 10))
               .userAgent(
                   "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB;     rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13 (.NET CLR 3.5.30729)")
               .referrer("accounterlive.com")
               .ignoreHttpErrors(true)
               .timeout(0)
               .get();
       Elements serps = doc.select("h3[class=r]");
       for (Element serp : serps) {
         Element link = serp.getElementsByTag("a").first();
         // a heading without an anchor carries no result link
         if (link == null) {
           continue;
         }
         String linkref = link.attr("href");
         if (linkref.startsWith("/url?q=")) {
           nb_results++;
           // strip the "/url?q=" prefix (7 chars) and everything from the first '&' on;
           // an href without '&' is kept up to its end instead of throwing
           int amp = linkref.indexOf("&");
           linkref = (amp != -1) ? linkref.substring(7, amp) : linkref.substring(7);
         }
         // only the first match counts; later matches must not overwrite the rank
         if (!found && linkref.contains(targe_name)) {
           my_rank = nb_results;
           my_url = linkref;
           found = true;
         }
       }
       if (nb_results == 0) {
         System.out.println("Warning captcha");
       }
       depth++;
       consecutive_failures = 0;
     } catch (IOException e) {
       e.printStackTrace();
       consecutive_failures++;
       if (consecutive_failures >= max_consecutive_failures) {
         System.out.println("Giving up after " + consecutive_failures + " failed fetches");
         break;
       }
     } catch (InterruptedException e) {
       e.printStackTrace();
       // restore the flag for the caller and stop: sleeping again would throw immediately
       Thread.currentThread().interrupt();
       break;
     }
   }
   info.setPosition(my_rank);
   info.setUrl(my_url);
   if (nb_results == 0) {
     System.out.println("Warning captcha");
   } else {
     System.out.println("Number of links : " + nb_results);
   }
   System.out.println("My rank : " + my_rank + " for keyword : " + keyword);
   System.out.println("My URL : " + my_url + " for keyword : " + keyword);
   return info;
 }