public static RankInfo proxy_ranking_keyword(String keyword, String targe_name) { RankInfo info = new RankInfo(); keyword = keyword.replace(" ", "%20"); info.setKeyword(keyword); // we here fetch up to three paginations int nb_depth = 3; long startTimeMs = System.currentTimeMillis(); org.jsoup.nodes.Document doc; int depth = 0; int nb_results = 0; int my_rank = 30; String my_url = ""; boolean found = false; while (depth < nb_depth && !found) { try { // we wait between x and xx seconds Thread.sleep(randInt(min_number_of_wait_times, max_number_of_wait_times) * 1000); System.out.println("Fetching a new page"); String constructed_url = "https://www.google.fr/search?q=" + keyword + "&start=" + Integer.toString(depth * 10); // we here use our properly configured squid proxy on port 3128 on localhost Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("localhost", 3128)); URL url = new URL(constructed_url); HttpURLConnection connection = (HttpURLConnection) url.openConnection(proxy); connection.setConnectTimeout(1000000000); String randomAgent = randomUserAgent(); connection.setRequestProperty("User-Agent", randomAgent); connection.connect(); InputStreamReader in = new InputStreamReader((InputStream) connection.getContent()); BufferedReader buff = new BufferedReader(in); StringBuilder builder = new StringBuilder(); String line; do { line = buff.readLine(); builder.append(line); } while (line != null); String pageString = builder.toString(); connection.disconnect(); doc = Jsoup.parse(pageString); Elements serps = doc.select("h3[class=r]"); for (Element serp : serps) { Element link = serp.getElementsByTag("a").first(); if (link != null) { String linkref = link.attr("href"); if (linkref.startsWith("/url?q=") || linkref.startsWith("http://")) { nb_results++; if (linkref.startsWith("/url?q=")) { linkref = linkref.substring(7, linkref.indexOf("&")); } else { if (linkref.indexOf("&") != -1) { linkref = linkref.substring(0, linkref.indexOf("&")); } } } if (linkref.contains(targe_name) && !found) { my_rank = nb_results; my_url = linkref; found = true; } } } if (nb_results == 0) { System.out.println("Warning captcha"); } depth++; } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } } long taskTimeMs = System.currentTimeMillis() - startTimeMs; // System.out.println(taskTimeMs); info.setPosition(my_rank); info.setUrl(my_url); if (nb_results == 0) { System.out.println("Warning captcha"); } else { System.out.println("Number of links read in the pages : " + nb_results); } System.out.println("My rank : " + my_rank + " for keyword : " + keyword); System.out.println("My URL : " + my_url + " for keyword : " + keyword); return info; }
public static RankInfo ranking_keyword(String keyword, String targe_name) { RankInfo info = new RankInfo(); info.setKeyword(keyword); // we here fetch up to five paginations int nb_depth = 5; long startTimeMs = System.currentTimeMillis(); org.jsoup.nodes.Document doc; int depth = 0; int nb_results = 0; int my_rank = 50; String my_url = ""; boolean found = false; while (depth < nb_depth && !found) { try { // we wait between 30 and 70 seconds Thread.sleep(randInt(30, 50) * 1000); System.out.println("Fetching a new page"); doc = Jsoup.connect( "https://www.google.fr/search?q=" + keyword + "&start=" + Integer.toString(depth * 10)) .userAgent( "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13 (.NET CLR 3.5.30729)") .referrer("accounterlive.com") .ignoreHttpErrors(true) .timeout(0) .get(); Elements serps = doc.select("h3[class=r]"); for (Element serp : serps) { Element link = serp.getElementsByTag("a").first(); String linkref = link.attr("href"); if (linkref.startsWith("/url?q=")) { nb_results++; linkref = linkref.substring(7, linkref.indexOf("&")); } if (linkref.contains(targe_name)) { my_rank = nb_results; my_url = linkref; found = true; } // System.out.println("Link ref: "+linkref); // System.out.println("Title: "+serp.text()); } if (nb_results == 0) { System.out.println("Warning captcha"); } depth++; } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } long taskTimeMs = System.currentTimeMillis() - startTimeMs; // System.out.println(taskTimeMs); info.setPosition(my_rank); info.setUrl(my_url); if (nb_results == 0) { System.out.println("Warning captcha"); } else { System.out.println("Number of links : " + nb_results); } System.out.println("My rank : " + my_rank + " for keyword : " + keyword); System.out.println("My URL : " + my_url + " for keyword : " + keyword); return info; }