public void getPostrResults(Keyword k) { try { int page = 1; boolean sessionEnd = false; while (!sessionEnd) { String keywordValue = k.getValue().replace(" ", "+"); String url = "http://postr.hu/?keres=" + keywordValue + "&oldal=" + page++; Document doc = getDocumentByUrl(url, 3); Elements links = doc.select(".activityflow .comment_text h3 a"); // Elements bodies = doc.select("#talalatbox p"); if (links.size() == 0) { sessionEnd = true; continue; } SearchSession searchSession = new SearchSession(); searchSession.setStartDate(Calendar.getInstance().getTime()); searchSession.setRawData(doc.html()); searchSession.setSearchText(url); searchSessionRepository.create(searchSession); for (int i = 0; i < links.size(); i++) { try { // ha a link tarlamaz #=t, akkor nem kell, mivel comment String linkHref = links.get(i).attr("href"); if (linkHref.contains("#")) { continue; } String linkText = links.get(i).text(); log.debug(linkHref); MessageDigest md = MessageDigest.getInstance("MD5"); md.update(linkHref.getBytes()); byte byteData[] = md.digest(); // convert the byte to hex format method 1 StringBuffer sb = new StringBuffer(); for (int j = 0; j < byteData.length; j++) { sb.append(Integer.toString((byteData[j] & 0xff) + 0x100, 16).substring(1)); } String sourceId = sb.toString(); log.debug("sourceId: " + sourceId); // ellenorizni kell az adatbazisban if (!dataService.isDataWithSourceIdAndKeyword(sourceId, k)) { Document bodyDoc = getDocumentByUrl(linkHref, 3); // log.debug(bodyDoc.html()); Elements bodyElements = bodyDoc.getElementsByClass("text"); if (bodyElements.size() == 0) { continue; } String body = bodyElements.get(0).text(); log.debug(body); boolean resp = dictionaryService.valideText(body, 41); if (resp) { DateFormat formatter; Date originalDate = null; formatter = new SimpleDateFormat("yyyymmdd"); // String[] cut = linkHref.split("/"); // originalDate = formatter // .parse(cut[3] + cut[4] + cut[5]); dataService.createData( sourceId, body, linkHref, linkText, "postr", searchSession, originalDate, k); } } } catch (Exception e) { log.error("Hiba a data linkek feldolgozasa soran:"); e.printStackTrace(); } } searchSession.setEndDate(Calendar.getInstance().getTime()); searchSessionRepository.update(searchSession); } // sessionEnd } catch (Exception e) { e.printStackTrace(); } }
private void getTwitterResults(Keyword k) { log.debug("name: " + k.getValue()); List<NameValuePair> params = new ArrayList<NameValuePair>(); String keywordValue = k.getValue().replace(" ", "+"); params.add(new BasicNameValuePair("q", keywordValue)); String query = URLEncodedUtils.format(params, "utf-8"); URI url = null; try { SearchSession searchSession = new SearchSession(); searchSession.setStartDate(Calendar.getInstance().getTime()); url = URIUtils.createURI("http", "search.twitter.com", 0, "search.json", query, null); searchSession.setSearchText(url.toString()); log.debug(url.toString()); Gson gson = new Gson(); String respRow = getStringFromUrl(url, 3); TwitterResponse respList = gson.fromJson(respRow, TwitterResponse.class); // System.out.println(respList); searchSession.setRawData(respRow); searchSessionRepository.create(searchSession); for (TwitterResult d : respList.results) { // ellenorzi sourceid alapjan, hogy szerepel-e az // adatbaziban String body = URLDecoder.decode(d.text, "UTF-8"); boolean validText = false; if (body.length() > 2) { boolean resp = dictionaryService.valideText(body, 50); if (resp) { validText = true; } else { log.debug("nem magyar szoveg: " + body); } } else { log.debug("a body nem eleg hosszu "); } log.debug(d.id); if (validText) { DateFormat formatter; Date originalDate; formatter = new SimpleDateFormat("E, dd MMM yyyy HH:mm:ss Z"); originalDate = formatter.parse(d.created_at); dataService.createData( d.id, body, url.toString(), d.to_user, "twitter", searchSession, originalDate, k); } } searchSession.setEndDate(Calendar.getInstance().getTime()); searchSessionRepository.update(searchSession); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
public void getBloghuResults(Keyword k) { try { int page = 1; boolean sessionEnd = false; while (!sessionEnd) { String keywordValue = k.getValue().replace(" ", "+"); String url = "http://blog.hu/cimlap/search/?sterm=" + keywordValue + "&page=" + page++; Document doc = getDocumentByUrl(url, 3); Elements links = doc.select("#talalatbox h1 a"); Elements bodies = doc.select("#talalatbox p"); if (links.size() == 0) { sessionEnd = true; continue; } SearchSession searchSession = new SearchSession(); searchSession.setStartDate(Calendar.getInstance().getTime()); searchSession.setRawData(doc.html()); searchSession.setSearchText(url); searchSessionRepository.create(searchSession); for (int i = 0; i < links.size(); i++) { String linkHref = links.get(i).attr("href"); String linkText = links.get(i).text(); log.debug(linkHref); String sourceId = createMd5(linkHref); log.debug("sourceId: " + sourceId); // ellenorizni kell az adatbazisban if (!dataService.isDataWithSourceIdAndKeyword(sourceId, k)) { Document bodyDoc = getDocumentByUrl(linkHref, 3); // log.debug(bodyDoc.html()); Element bodyElements = bodyDoc.getElementsByClass("post-content").get(0); String body = bodyElements.text(); log.debug(body); boolean resp = dictionaryService.valideText(body, 41); if (resp) { DateFormat formatter; Date originalDate; formatter = new SimpleDateFormat("yyyymmdd"); String[] cut = linkHref.split("/"); originalDate = formatter.parse(cut[3] + cut[4] + cut[5]); dataService.createData( sourceId, body, linkHref, linkText, "bloghu", searchSession, originalDate, k); } } } searchSession.setEndDate(Calendar.getInstance().getTime()); searchSessionRepository.update(searchSession); } // sessionEnd } catch (Exception e) { e.printStackTrace(); } }
private void getFacebookResults(Keyword k) { log.debug("name: " + k.getValue()); List<NameValuePair> params = new ArrayList<NameValuePair>(); String keywordValue = k.getValue().replace(" ", "+"); params.add(new BasicNameValuePair("q", keywordValue)); params.add(new BasicNameValuePair("limit", "200")); String query = URLEncodedUtils.format(params, "utf-8"); SearchSession searchSession = new SearchSession(); searchSession.setStartDate(Calendar.getInstance().getTime()); searchSession.setSearchText(query); searchSessionRepository.create(searchSession); URI url = null; try { url = URIUtils.createURI("https", "graph.facebook.com", 0, "search", query, null); log.debug(url.toString()); String respRow = getStringFromUrl(url, 3); searchSession.setRawData(respRow); Gson gson = new Gson(); FacebookResponse respList = gson.fromJson(respRow, FacebookResponse.class); for (FBData d : respList.data) { boolean validText = false; String body = new String(d.message.getBytes("UTF-8"), "UTF-8"); if (body.length() > 2) { boolean resp = dictionaryService.valideText(body, 50); if (resp) { validText = true; } else { log.debug("nem magyar szoveg: " + body); } } else { log.debug("a body nem eleg hosszu "); } if (validText) { DateFormat formatter; Date originalDate; formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss"); originalDate = formatter.parse(d.created_time); String title = (d.name != null && d.name.length() > 100) ? d.name.substring(0, 100) : d.name; dataService.createData( d.id, body, url.toString(), title, "facebook", searchSession, originalDate, k); } } searchSession.setEndDate(Calendar.getInstance().getTime()); searchSessionRepository.update(searchSession); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }