コード例 #1
0
  /**
   * Crawls the postr.hu search result pages for the given keyword and stores every new post that
   * passes {@code dictionaryService.valideText} through {@code dataService.createData}.
   *
   * <p>Pages are requested one after another, starting at page 1, until a page yields no result
   * links. For every page that has links a {@code SearchSession} is created with the page's raw
   * HTML and the request URL; it is updated with an end date once the page's links are processed.
   *
   * <p>No exception propagates to the caller: a failure on a single link is logged and the next
   * link is processed; any other failure is logged and ends the crawl.
   *
   * @param k keyword whose value is used as the search term
   */
  public void getPostrResults(Keyword k) {
    try {
      // NOTE(review): only spaces are escaped here; other reserved or non-ASCII characters are
      // put into the URL as-is. Confirm whether getDocumentByUrl encodes the URL itself.
      String keywordValue = k.getValue().replace(" ", "+");
      int page = 1;

      while (true) {
        String url = "http://postr.hu/?keres=" + keywordValue + "&oldal=" + page++;

        Document doc = getDocumentByUrl(url, 3);

        Elements links = doc.select(".activityflow .comment_text h3 a");
        if (links.size() == 0) {
          // A page without result links ends the crawl.
          break;
        }

        SearchSession searchSession = new SearchSession();
        searchSession.setStartDate(Calendar.getInstance().getTime());
        searchSession.setRawData(doc.html());
        searchSession.setSearchText(url);
        searchSessionRepository.create(searchSession);

        for (int i = 0; i < links.size(); i++) {
          try {
            String linkHref = links.get(i).attr("href");
            // Links containing '#' point to comments rather than posts, so they are skipped.
            if (linkHref.contains("#")) {
              continue;
            }

            String linkText = links.get(i).text();

            log.debug(linkHref);

            // The source id is the lower-case hex MD5 of the link. The bytes are taken as UTF-8
            // so the id does not depend on the platform default charset.
            MessageDigest md = MessageDigest.getInstance("MD5");
            byte[] digest = md.digest(linkHref.getBytes("UTF-8"));
            StringBuilder hex = new StringBuilder(digest.length * 2);
            for (int j = 0; j < digest.length; j++) {
              // Adding 0x100 and dropping the first digit keeps the leading zero of each byte.
              hex.append(Integer.toString((digest[j] & 0xff) + 0x100, 16).substring(1));
            }

            String sourceId = hex.toString();
            log.debug("sourceId: " + sourceId);

            // Only fetch the post when it is not yet stored for this keyword.
            if (dataService.isDataWithSourceIdAndKeyword(sourceId, k)) {
              continue;
            }

            Document bodyDoc = getDocumentByUrl(linkHref, 3);
            Elements bodyElements = bodyDoc.getElementsByClass("text");
            if (bodyElements.size() == 0) {
              continue;
            }

            String body = bodyElements.get(0).text();

            log.debug(body);

            if (dictionaryService.valideText(body, 41)) {
              // No publication date is extracted for postr results.
              Date originalDate = null;

              dataService.createData(
                  sourceId, body, linkHref, linkText, "postr", searchSession, originalDate, k);
            }
          } catch (Exception e) {
            // Keep going with the next link; pass the exception so the trace lands in the log.
            log.error("Hiba a data linkek feldolgozasa soran:", e);
          }
        }
        searchSession.setEndDate(Calendar.getInstance().getTime());
        searchSessionRepository.update(searchSession);
      }
    } catch (Exception e) {
      log.error("Hiba a postr kereses soran:", e);
    }
  }
コード例 #2
0
  /**
   * Runs one Twitter search for the given keyword and stores every result whose text passes
   * {@code dictionaryService.valideText} through {@code dataService.createData}.
   *
   * <p>A {@code SearchSession} holding the request URL and the raw JSON response is created
   * before the results are processed and updated with an end date afterwards.
   *
   * <p>No exception propagates to the caller: a failure on a single result is logged and the next
   * result is processed; any other failure is logged and ends the run.
   *
   * @param k keyword whose value is used as the search term
   */
  private void getTwitterResults(Keyword k) {

    log.debug("name: " + k.getValue());
    List<NameValuePair> params = new ArrayList<NameValuePair>();

    // URLEncodedUtils.format does the escaping itself (a space becomes '+'). Replacing spaces
    // with '+' beforehand would make it send an escaped literal plus sign (%2B) instead.
    params.add(new BasicNameValuePair("q", k.getValue()));

    String query = URLEncodedUtils.format(params, "utf-8");

    try {

      SearchSession searchSession = new SearchSession();
      searchSession.setStartDate(Calendar.getInstance().getTime());

      URI url = URIUtils.createURI("http", "search.twitter.com", 0, "search.json", query, null);

      searchSession.setSearchText(url.toString());

      log.debug(url.toString());

      Gson gson = new Gson();
      String respRow = getStringFromUrl(url, 3);
      TwitterResponse respList = gson.fromJson(respRow, TwitterResponse.class);

      searchSession.setRawData(respRow);
      searchSessionRepository.create(searchSession);

      // Gson returns null for a null/empty document, so guard before iterating.
      if (respList != null && respList.results != null) {
        // The pattern contains English day and month names; pin the locale so parsing does not
        // depend on the JVM default locale.
        DateFormat formatter =
            new SimpleDateFormat("E, dd MMM yyyy HH:mm:ss Z", java.util.Locale.ENGLISH);

        for (TwitterResult d : respList.results) {
          try {
            // NOTE(review): no "already stored" check is made here, unlike the page crawlers;
            // presumably createData copes with duplicates - confirm.
            // NOTE(review): URL-decoding turns '+' into a space and throws on a stray '%';
            // confirm that the result text really is URL-encoded.
            String body = URLDecoder.decode(d.text, "UTF-8");

            boolean validText = false;
            if (body.length() <= 2) {
              log.debug("a body nem eleg hosszu ");
            } else if (dictionaryService.valideText(body, 50)) {
              validText = true;
            } else {
              log.debug("nem magyar szoveg: " + body);
            }

            log.debug(d.id);

            if (validText) {
              Date originalDate = formatter.parse(d.created_at);

              dataService.createData(
                  d.id, body, url.toString(), d.to_user, "twitter", searchSession, originalDate, k);
            }
          } catch (Exception e) {
            // One malformed result must not abort the remaining ones.
            log.error("Hiba a twitter talalat feldolgozasa soran:", e);
          }
        }
      }
      searchSession.setEndDate(Calendar.getInstance().getTime());
      searchSessionRepository.update(searchSession);

    } catch (Exception e) {
      log.error("Hiba a twitter kereses soran:", e);
    }
  }
コード例 #3
0
  /**
   * Crawls the blog.hu search result pages for the given keyword and stores every new post that
   * passes {@code dictionaryService.valideText} through {@code dataService.createData}.
   *
   * <p>Pages are requested one after another, starting at page 1, until a page yields no result
   * links. For every page that has links a {@code SearchSession} is created with the page's raw
   * HTML and the request URL; it is updated with an end date once the page's links are processed.
   * The post date is taken from the 4th to 6th {@code '/'}-separated segments of the post URL.
   *
   * <p>No exception propagates to the caller: a failure on a single link is logged and the next
   * link is processed; any other failure is logged and ends the crawl.
   *
   * @param k keyword whose value is used as the search term
   */
  public void getBloghuResults(Keyword k) {
    try {
      // NOTE(review): only spaces are escaped here; other reserved or non-ASCII characters are
      // put into the URL as-is. Confirm whether getDocumentByUrl encodes the URL itself.
      String keywordValue = k.getValue().replace(" ", "+");

      // "MM" is month-in-year; the lower-case "mm" means minute-in-hour and would leave the
      // month unset.
      DateFormat urlDateFormat = new SimpleDateFormat("yyyyMMdd");

      int page = 1;

      while (true) {
        String url = "http://blog.hu/cimlap/search/?sterm=" + keywordValue + "&page=" + page++;

        Document doc = getDocumentByUrl(url, 3);

        Elements links = doc.select("#talalatbox h1 a");
        if (links.size() == 0) {
          // A page without result links ends the crawl.
          break;
        }

        SearchSession searchSession = new SearchSession();
        searchSession.setStartDate(Calendar.getInstance().getTime());
        searchSession.setRawData(doc.html());
        searchSession.setSearchText(url);
        searchSessionRepository.create(searchSession);

        for (int i = 0; i < links.size(); i++) {
          try {
            String linkHref = links.get(i).attr("href");
            String linkText = links.get(i).text();

            log.debug(linkHref);

            String sourceId = createMd5(linkHref);
            log.debug("sourceId: " + sourceId);

            // Only fetch the post when it is not yet stored for this keyword.
            if (dataService.isDataWithSourceIdAndKeyword(sourceId, k)) {
              continue;
            }

            Document bodyDoc = getDocumentByUrl(linkHref, 3);
            Elements postContent = bodyDoc.getElementsByClass("post-content");
            if (postContent.size() == 0) {
              // Page without a post body: nothing to store.
              continue;
            }

            String body = postContent.get(0).text();

            log.debug(body);

            if (dictionaryService.valideText(body, 41)) {
              // Segments 3..5 of the split URL are concatenated and parsed as yyyyMMdd.
              String[] cut = linkHref.split("/");
              Date originalDate = urlDateFormat.parse(cut[3] + cut[4] + cut[5]);

              dataService.createData(
                  sourceId, body, linkHref, linkText, "bloghu", searchSession, originalDate, k);
            }
          } catch (Exception e) {
            // One unparsable link must not abort the remaining links and pages.
            log.error("Hiba a data linkek feldolgozasa soran:", e);
          }
        }
        searchSession.setEndDate(Calendar.getInstance().getTime());
        searchSessionRepository.update(searchSession);
      }
    } catch (Exception e) {
      log.error("Hiba a bloghu kereses soran:", e);
    }
  }
コード例 #4
0
  /**
   * Runs one Facebook graph search for the given keyword and stores every entry whose message
   * passes {@code dictionaryService.valideText} through {@code dataService.createData}.
   *
   * <p>A {@code SearchSession} holding the query string is created before the request is sent;
   * the raw JSON response and an end date are written to it when processing finishes.
   *
   * <p>No exception propagates to the caller: a failure on a single entry is logged and the next
   * entry is processed; any other failure is logged and ends the run.
   *
   * @param k keyword whose value is used as the search term
   */
  private void getFacebookResults(Keyword k) {

    log.debug("name: " + k.getValue());
    List<NameValuePair> params = new ArrayList<NameValuePair>();

    // URLEncodedUtils.format does the escaping itself (a space becomes '+'). Replacing spaces
    // with '+' beforehand would make it send an escaped literal plus sign (%2B) instead.
    params.add(new BasicNameValuePair("q", k.getValue()));
    params.add(new BasicNameValuePair("limit", "200"));

    String query = URLEncodedUtils.format(params, "utf-8");

    SearchSession searchSession = new SearchSession();
    searchSession.setStartDate(Calendar.getInstance().getTime());
    searchSession.setSearchText(query);
    searchSessionRepository.create(searchSession);

    try {

      URI url = URIUtils.createURI("https", "graph.facebook.com", 0, "search", query, null);

      log.debug(url.toString());

      String respRow = getStringFromUrl(url, 3);

      searchSession.setRawData(respRow);

      Gson gson = new Gson();

      FacebookResponse respList = gson.fromJson(respRow, FacebookResponse.class);

      // Gson returns null for a null/empty document, so guard before iterating.
      if (respList != null && respList.data != null) {
        // NOTE(review): the pattern has no zone field, so any offset suffix in created_time is
        // ignored and the value is read in the JVM default time zone - confirm this is intended.
        DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");

        for (FBData d : respList.data) {
          try {
            // The message is used directly; an entry without one is treated as too short.
            String body = d.message;

            boolean validText = false;
            if (body == null || body.length() <= 2) {
              log.debug("a body nem eleg hosszu ");
            } else if (dictionaryService.valideText(body, 50)) {
              validText = true;
            } else {
              log.debug("nem magyar szoveg: " + body);
            }

            if (validText) {
              Date originalDate = formatter.parse(d.created_time);

              // Titles are capped at 100 characters; a missing name stays null.
              String title =
                  (d.name != null && d.name.length() > 100) ? d.name.substring(0, 100) : d.name;

              dataService.createData(
                  d.id, body, url.toString(), title, "facebook", searchSession, originalDate, k);
            }
          } catch (Exception e) {
            // One malformed entry must not abort the remaining ones.
            log.error("Hiba a facebook talalat feldolgozasa soran:", e);
          }
        }
      }

      searchSession.setEndDate(Calendar.getInstance().getTime());
      searchSessionRepository.update(searchSession);

    } catch (Exception e) {
      log.error("Hiba a facebook kereses soran:", e);
    }
  }