/**
 * Manual smoke test for the {@code Page} persistence operations.
 *
 * <p>Inserts two pages, updates the body of the second one, lists every page
 * returned by {@code Page.findAll} (printing timestamp, URL and body), and
 * finally removes the two pages it inserted.
 *
 * @throws Exception if any of the persistence or database calls fails
 */
public static void test01() throws Exception {
    Page p = new Page();
    p.setTmsp("2011-03-04 10:30:00");
    p.setUrl("http://www.google.com");
    p.setBody("my body");
    p.insert();

    Page e = new Page("2011-03-04 10:31:00", "http://www.usp.br", "o corpo");
    e.insert();
    e.setBody("usp usp usp");
    e.update();

    Database db = new Database();
    db.connect();
    try {
        ResultSet rs = Page.findAll(db);
        Page p0 = null;
        while ((p0 = Page.next(rs)) != null) {
            System.out.println(p0.getTmsp());
            System.out.println(p0.getUrl());
            System.out.println(p0.getBody());
        }
    } finally {
        // Release the connection even when listing or printing fails.
        db.close();
    }

    p.remove();
    e.remove();
}
// Chamando Robot public static void robot() throws Exception { Database db = new Database(); db.connect(); ResultSet rs = Page.findAll(db); Page p = null; while ((p = Page.next(rs)) != null) { String body = Robot.get(p.getUrl()); // procurar por urls dentro do body // buscar por essas paginas // String expr = "href=\"([^\"]*)"; String ereg = "href=\"https{0,1}:\\/\\/([^\"]*)\""; Pattern pt = Pattern.compile(ereg); Matcher m = pt.matcher(body); while (m.find()) { System.out.println(m.group()); String[] _url = m.group().split("\""); Page.newUrl(_url[1]); } p.setBody(body); p.update(); } db.close(); }
/**
 * Downloads the content at {@code url} with {@code Robot.get} and inserts it
 * as a new {@code Page}.
 *
 * <p>NOTE(review): the timestamp stored with the page is a fixed literal, not
 * the time of the fetch - confirm whether that is intended.
 *
 * @param url address of the page to download and store
 * @throws Exception if the download or the insert fails
 */
public static void newUrl(String url) throws Exception {
    final String fetchedBody = Robot.get(url);

    final Page page = new Page();
    page.setTmsp("2011-03-11 11:00:00");
    page.setUrl(url);
    page.setBody(fetchedBody);
    page.insert();
}
/**
 * Advances {@code rs} by one row and converts that row into a {@code Page}.
 *
 * <p>The page is populated from the {@code "tmsp"}, {@code "url"} and
 * {@code "body"} columns, each read as a string.
 *
 * @param rs result set to advance
 * @return the page built from the next row, or {@code null} when
 *         {@code rs.next()} reports that there are no more rows
 * @throws Exception if reading from the result set fails
 */
public static Page next(ResultSet rs) throws Exception {
    if (!rs.next()) {
        return null;
    }

    Page page = new Page();
    page.setTmsp(rs.getString("tmsp"));
    page.setUrl(rs.getString("url"));
    page.setBody(rs.getString("body"));
    return page;
}