Пример #1
0
  /**
   * Sends a single HTTP request through Jsoup and hands back the raw response.
   *
   * <p>The request uses a 10 second timeout, accepts any content type and places no limit on the
   * body size.
   *
   * @param url address to request
   * @param method HTTP method to use
   * @param cookies cookies to send, or {@code null} for none
   * @param data request parameters to send, or {@code null} for none
   * @return the response, or {@code null} when the request failed with an {@link IOException}
   *     (the stack trace is printed)
   */
  private static Response execute(
      String url, Method method, Map<String, String> cookies, Map<String, String> data) {
    Connection request =
        Jsoup.connect(url).method(method).timeout(10000).ignoreContentType(true).maxBodySize(0);

    if (cookies != null) {
      request.cookies(cookies);
    }

    if (data != null) {
      for (Entry<String, String> field : data.entrySet()) {
        request.data(field.getKey(), field.getValue());
      }
    }

    try {
      return request.execute();
    } catch (IOException failure) {
      failure.printStackTrace();
      return null;
    }
  }
Пример #2
0
 /**
  * Looks up the course type shown by the Unistats widget referenced from {@code doc}.
  *
  * <p>The widget URL is assembled from the {@code data-*} attributes of the first {@code #kw}
  * element. The widget page is downloaded once (10 second timeout) and the first word of its
  * course heading is returned.
  *
  * @param doc page that may contain a {@code #kw} element
  * @return the heading text up to its first space (the whole trimmed heading when it contains no
  *     space), or {@code ""} when the {@code #kw} element or the heading is missing or the
  *     download throws (the stack trace is printed)
  */
 public static String getType(Document doc) {
   Element widget = doc.select("#kw").first();
   if (widget == null) {
     return "";
   }

   String widgetUrl =
       "http://widget.unistats.ac.uk/Widget/"
           + widget.attr("data-institution")
           + "/"
           + widget.attr("data-course")
           + "/"
           + widget.attr("data-orientation")
           + "/"
           + "null/"
           + widget.attr("data-language")
           + "/"
           + widget.attr("data-kismode");

   String type = "";
   try {
     Document widgetDoc = Jsoup.connect(widgetUrl).timeout(10000).get();
     Element heading = widgetDoc.select("#kisWidget > div.widgetCourse > h1").first();
     if (heading != null) {
       String title = heading.text().trim();
       int firstSpace = title.indexOf(" ");
       if (firstSpace > 0) {
         type = title.substring(0, firstSpace);
       } else {
         type = title;
       }
     }
   } catch (Exception ex) {
     ex.printStackTrace();
   }
   return type;
 }
Пример #3
0
  /**
   * Rebuilds {@code majorList} from the course listing found at {@code originalUrl}.
   *
   * <p>Each anchor under {@code #accordion__target-3 > div.course-listing__box} becomes one
   * {@code MajorForCollection}: the title comes from its first {@code h3}, the type from its first
   * {@code p} (everything from the first {@code -} onwards is dropped) and the URL from its
   * {@code href}.
   *
   * <p>The download and parse is attempted a limited number of times. The previous implementation
   * retried without limit and caught every {@link Exception}, so a failure that repeats on every
   * attempt (for instance an entry without an {@code h3}) kept the loop running forever. When
   * every attempt fails the list is left empty and an error is written to {@code System.err}.
   *
   * @param originalUrl address of the course listing page
   */
  public static void initMajorList(String originalUrl) {
    final int maxAttempts = 5;

    System.out.println("preparing majorList");

    boolean loaded = false;
    for (int attempt = 1; attempt <= maxAttempts && !loaded; attempt++) {
      try {
        // Start every attempt from scratch so a half-parsed page never leaks into the result.
        majorList.clear();
        Document doc = Jsoup.connect(originalUrl).timeout(10000).get();
        for (Element entry : doc.select("#accordion__target-3 > div.course-listing__box > a")) {
          MajorForCollection major = new MajorForCollection();
          major.setLevel(LEVEL);
          major.setTitle(entry.select("h3").get(0).text().trim());
          major.setType(entry.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim());
          major.setUrl(entry.select("a").get(0).attr("href"));
          majorList.add(major);
        }
        loaded = true;
      } catch (Exception failure) {
        System.err.println(
            "initMajorList: attempt " + attempt + " of " + maxAttempts + " failed");
        failure.printStackTrace();
      }
    }

    if (!loaded) {
      // Drop whatever the last failed attempt managed to add before it threw.
      majorList.clear();
      System.err.println("majorList could not be prepared from " + originalUrl);
      return;
    }

    System.out.println("majorList prepared");
    System.out.println("majorList size: " + majorList.size());
  }
Пример #4
0
 /**
  * Downloads {@code url} and extracts the fee from the fourth cell of the second row of the table
  * inside {@code #block-system-main}.
  *
  * <p>The download is attempted a limited number of times. The previous implementation retried
  * without limit, so a URL that fails on every request never let the method return.
  *
  * @param url page to read the fee from
  * @return the result of {@code getFee} applied to the cell text; or {@code url} itself when the
  *     cell is not present on the page or when every download attempt failed
  */
 public static String requestFee(String url) {
   final int maxAttempts = 5;
   final String feeCell =
       "#block-system-main > table > tbody > tr:nth-child(2) > td:nth-child(4)";

   for (int attempt = 1; attempt <= maxAttempts; attempt++) {
     try {
       Document doc = Jsoup.connect(url).timeout(5000).get();
       if (doc.select(feeCell).size() > 0) {
         return getFee(doc.select(feeCell).text());
       }
       // No fee cell on the page: hand the URL back instead of a fee.
       return url;
     } catch (IOException e) {
       System.out.println("requestFee : " + e.getMessage());
     }
   }

   // Every attempt failed; reuse the "no fee found" result so the caller still gets a value.
   System.out.println("requestFee : giving up on " + url + " after " + maxAttempts + " attempts");
   return url;
 }
Пример #5
0
 /**
  * Fetches {@code url} and returns the response body.
  *
  * <p>The request is sent with {@code USER_AGENT}, follows redirects and uses {@code GET_TIMEOUT}
  * as its timeout. The time spent in the request is measured and logged on success.
  *
  * @param url address to fetch
  * @return the response body when the status code equals {@code OK}; {@code ""} when another
  *     status code is returned or an {@link IOException} occurs (both cases are logged as errors)
  */
 public static String extractContent(String url) {
   try {
     Connection request =
         Jsoup.connect(url).userAgent(USER_AGENT).followRedirects(true).timeout(GET_TIMEOUT);

     long startedAt = System.currentTimeMillis();
     Connection.Response response = request.execute();
     long elapsed = System.currentTimeMillis() - startedAt;

     int status = response.statusCode();
     if (status != OK) {
       Logger.error("%s returned %d", url, status);
       return "";
     }

     String content = response.body();
     Logger.info(
         "%s retrieved, content length %d, time %s sec.",
         url, content.length(), FormatUtil.millis2Seconds(elapsed));
     return content;
   } catch (IOException e) {
     Logger.error(e, "%s cannot be read.", url);
     return "";
   }
 }
Пример #6
0
  /**
   * Downloads the page at {@code VOSA_URL} (30 second timeout) and prints the {@code href} and the
   * text of every {@code a.top_link} element to standard output.
   *
   * @throws IOException if the page cannot be downloaded
   */
  public static void getVOSA() throws IOException {
    Document page = Jsoup.connect(VOSA_URL).timeout(30000).get();

    for (Element anchor : page.select("a.top_link")) {
      System.out.println("\nlink : " + anchor.attr("href"));
      System.out.println("text : " + anchor.text());
    }
  }
Пример #7
0
 /**
  * Runs a Google search for {@code term} restricted to {@code filetype}, starting at result
  * offset {@code start}, and collects the addresses shown in the {@code cite} elements of the
  * result page.
  *
  * <p>Cited text that does not begin with {@code http} is prefixed with {@code http://} before it
  * is turned into a {@link URL}.
  *
  * @return the cited addresses in page order
  * @throws Exception if the search page cannot be fetched or a cited text is not a valid URL
  */
 public URL[] getURLs() throws Exception {
   String encodedQuery = URLEncoder.encode(term + " filetype:" + filetype, "UTF-8");
   URL searchUrl =
       new URL("http://www.google.com/search?q=" + encodedQuery + "&start=" + start);

   Connection request = HttpConnection.connect(searchUrl);
   request.timeout(60000);
   request.userAgent("");
   Document resultPage = request.get();

   List<URL> found = new ArrayList<>();
   for (Element cite : resultPage.select("cite")) {
     String target = cite.text();
     found.add(new URL(target.startsWith("http") ? target : "http://" + target));
   }
   return found.toArray(new URL[found.size()]);
 }
Пример #8
0
    public Document
	docGet_URL(String url)
    {
	Document doc;
	Connection conn;
	int timeout = CONNECT_TIMEOUT_INI;
	conn = null;
	try {
	    conn = Jsoup.connect(url);
	} catch (Exception _) {
	    ; // conn is null at this point
	} // end of [try]
	if (conn == null) return null;
	while (true) {
	    conn = conn.timeout(timeout);
	    try {
		return conn.get();
	    } catch (IOException _) {
		timeout = 2 * timeout;
		if (timeout > CONNECT_TIMEOUT_FIN) { _geterr(url); break; }
	    } // end of [try]
	}
	return null; // HX: max timeout reached at this point
    }
Пример #9
0
  /**
   * Downloads the course page of {@code major} and copies the details found there into it.
   *
   * <p>Fields filled from the course page, each only when its element is present: length, month
   * of entry, application fee (set to the {@code href} of the first {@code a.btn.btn-bordered}),
   * academic requirements, school and tuition fee. When the application-fee link is present and
   * is not one of the two excluded handbook pages, that page is downloaded as well and the course
   * structure is read from it. Finally {@code mark(major, true)} is called.
   *
   * <p>Compared with the previous version, a missing handbook link or a {@code #fees} element
   * without a following sibling no longer causes a {@link NullPointerException}.
   *
   * @param major course to complete; its URL is read through {@code getUrl()}
   * @throws Exception if a page cannot be downloaded or a helper fails
   */
  public static void getDetails(MajorForCollection major) throws Exception {
    Document doc = Jsoup.connect(major.getUrl()).timeout(10000).followRedirects(true).get();

    Element basics = doc.select("table.course-page__table-basic").first();
    if (basics != null) {
      // NOTE(review): the helpers receive the text of the whole table, not of the matching row.
      // Kept as it was; confirm this is what getLength/getMonthOfEntry expect.
      String tableText = basics.text();
      for (Element tr : basics.select("tr")) {
        String rowText = tr.text();
        if (rowText.contains("Duration")) {
          major.setLength(getLength(tableText));
        } else if (rowText.contains("Start date")) {
          major.setMonthOfEntry(getMonthOfEntry(tableText));
        }
      }
    }

    Element handbookLink = doc.select("a.btn.btn-bordered").first();
    if (handbookLink != null) {
      major.setApplicationFee(handbookLink.attr("href"));
    }

    Element requirements = doc.select("#entry-requirements-2").first();
    if (requirements != null) {
      major.setAcademicRequirements(requirements.text());
    }

    Element facultyLink = doc.select("div.course-page.row a").last();
    if (facultyLink != null) {
      major.setSchool(schoolNameFor(facultyLink.attr("href")));
    }

    Element feesHeading = doc.select("#fees").first();
    if (feesHeading != null) {
      Element feesBody = feesHeading.nextElementSibling();
      if (feesBody != null) {
        major.setTuitionFee(feesBody.text());
      }
    }

    String handbookUrl = major.getApplicationFee();
    boolean hasHandbook = handbookUrl != null && !handbookUrl.isEmpty();
    if (hasHandbook
        && !handbookUrl.equals("http://www.monash.edu.au/pubs/handbooks/courses/A6015.html")
        && !handbookUrl.equals("http://www.monash.edu.au/pubs/handbooks/courses/2276.html")) {
      Document handbook = WebUtils.getDocument(handbookUrl, WebUtils.METHOD_GET, 10 * 1000);
      Element requirementsBody =
          handbook == null
              ? null
              : handbook
                  .select("h2.black.pub_heading:containsOwn(Requirements) + div.pub_body_text")
                  .first();
      if (requirementsBody != null) {
        String structure =
            replaceSpecialCharacter(html2Str(requirementsBody.outerHtml())).trim();
        if (structure.contains("Part A.")) {
          structure = structure.substring(structure.indexOf("Part A."));
        } else {
          // NOTE(review): the "Structure" section is only consulted when a "Requirements"
          // section exists but lacks "Part A."; kept as it was, confirm this nesting is intended.
          Element structureBody =
              handbook
                  .select("h2.black.pub_heading:containsOwn(Structure) + div.pub_body_text")
                  .first();
          if (structureBody != null) {
            structure = replaceSpecialCharacter(html2Str(structureBody.text())).trim();
          }
        }
        major.setStructure(structure);
      }
    }

    mark(major, true);
  }

  /** Maps a known faculty link to its display name; any other link is returned unchanged. */
  private static String schoolNameFor(String facultyHref) {
    switch (facultyHref) {
      case "http://www.study.monash/media/links/faculty-websites/design-and-architecture":
        return "Monash Art Design & Architecture";
      case "http://www.study.monash/media/links/faculty-websites/business-and-economics":
        return "Monash Business School";
      case "http://www.study.monash/media/links/faculty-websites/arts":
        return "Faculty of Arts, Monash University";
      case "http://www.study.monash/media/links/faculty-websites/science":
        return "Faculty of Science";
      case "http://www.study.monash/media/links/faculty-websites/medicine":
        return "Faculty of Medicine, Nursing and Health Sciences";
      case "http://www.study.monash/media/links/faculty-websites/education":
        return "Faculty of Education - Faculty of Education";
      case "http://www.study.monash/media/links/faculty-websites/engineering":
        return "Faculty of Engineering, Monash University";
      case "http://www.study.monash/media/links/faculty-websites/information-technology":
        return "Faculty of Information Technology - Monash University";
      case "http://www.study.monash/media/links/faculty-websites/pharmacy":
        return "Faculty of Pharmacy and Pharmaceutical Sciences";
      case "http://www.study.monash/media/links/faculty-websites/law":
        return "Faculty of Law";
      default:
        return facultyHref;
    }
  }
  /**
   * Parses the feed at {@code url}, downloads the page each feed item links to, and serializes
   * the items accepted by {@code findWords} either as XML or as a JSON string.
   *
   * <p>Keys read from {@code params}: {@code "zone"}, {@code "place"} (presence only), {@code
   * "searchlist"}, {@code "blacklist"}, {@code "latitud"}, {@code "longitud"}, {@code "tagslist"}
   * and {@code "comments"}.
   *
   * @param url URL-encoded address of the feed; it is decoded as UTF-8 before use.
   * @param json when {@code false} the result is written through {@code Feed2XML}; when {@code
   *     true} the result is the string {@code "{post: ...}"} wrapping the Gson output.
   * @param params request options (see the key list above).
   * @return the serialized posts.
   * @throws Exception if decoding, feed parsing, a page download or any lookup fails; nothing is
   *     caught inside this method.
   */
  public String getParse(String url, boolean json, HashMap<String, Object> params)
      throws Exception {

    url = URLDecoder.decode(url, "UTF-8");
    URL feedURL = new URL(url);
    // Logger.getLogger(this.getClass().getName()).log(Level.INFO, "Encoding del Feed: {0}", new
    // Object[]{feedURL.openConnection().getContentEncoding()});
    Feed feed = FeedParser.parse(feedURL);

    // List<PostType> newsList = new ArrayList<PostType>();

    PostType newEntry;
    // PostType newEntryComments;
    Post newEntrySolr;
    // SyndFeed feed = null;

    Gson gson = new Gson();

    List<LinkType> links;

    Document doc;

    FeedSelectors feedSelectors;

    String extendedString = (String) params.get("zone");
    Place place = null;
    // NOTE(review): the place is looked up with the "zone" value, not with the value stored under
    // "place" - confirm this is intended.
    if (params.containsKey("place")) {
      place = placeDao.retrieveByExtendedString(extendedString);
    }
    org.zonales.tagsAndZones.objects.Zone zone = zoneDao.retrieveByExtendedString(extendedString);

    // XML branch: builds PostType entries. newsList and sw are not declared in this method.
    if (!json) {
      for (int i = 0; i < feed.getItemCount(); i++) {
        FeedItem entry = feed.getItem(i);
        Logger.getLogger(this.getClass().getName())
            .log(
                Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()});

        // Download the page the feed item links to (60 second timeout).
        Connection conn = Jsoup.connect(entry.getLink().toString());
        conn.timeout(60000);
        doc = conn.get();
        String responseURL = conn.response().url().getHost();
        //                doc = Jsoup.connect(entry.getLink().toString()).timeout(60000).get();
        Logger.getLogger(this.getClass().getName())
            .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()});
        feedSelectors = dao.retrieve(url);
        // Only items accepted by findWords are turned into posts.
        if (findWords(
            entry.getTitle(),
            doc,
            (ArrayList) params.get("searchlist"),
            (ArrayList) params.get("blacklist"),
            feedSelectors)) {
          newEntry = new PostType();
          // The source is the host of the feed header link, or of the feed URL when the header
          // or its link is missing.
          String source;
          if (feed.getHeader() == null || feed.getHeader().getLink() == null) {
            Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link");
            source = feedURL.getHost();
          } else {
            Logger.getLogger(this.getClass().getName())
                .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString());
            source = feed.getHeader().getLink().getHost();
            //                        if (source.indexOf("/") != -1) {
            //                            source = source.substring(0, source.indexOf("/") + 1);
            //                        }
          }
          newEntry.setSource(source);
          newEntry.setDocType("post");
          newEntry.setZone(
              new Zone(
                  String.valueOf(zone.getId()),
                  zone.getName(),
                  zone.getType().getName(),
                  zone.getExtendedString()));

          newEntry.setPostLatitude(Double.parseDouble((String) params.get("latitud")));
          newEntry.setPostLongitude(Double.parseDouble((String) params.get("longitud")));
          // newEntry.setId(entry.getUri());
          // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ?
          // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim());
          // Id preference: GUID, then the Atom "id" element, then the item title.
          newEntry.setId(
              entry.getGUID() != null
                  ? entry.getGUID()
                  : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null
                      ? entry.getElementValue("http://www.w3.org/2005/Atom", "id")
                      : (entry.getTitle())));
          newEntry.setFromUser(
              new User(
                  null,
                  source,
                  null,
                  null,
                  place != null
                      ? new org.zonales.entities.Place(
                          String.valueOf(place.getId()), place.getName(), place.getType().getName())
                      : null));
          newEntry.setTitle(entry.getTitle());
          newEntry.setText(entry.getDescriptionAsText());
          newEntry.setTags(new TagsType((ArrayList) params.get("tagslist")));

          if (newEntry.getLinks() == null) {
            newEntry.setLinks(new LinksType(new ArrayList<LinkType>()));
          }
          if ((links = getLinks(feedSelectors, doc, responseURL)) != null) {
            newEntry.getLinks().getLink().addAll(links);
          }
          newEntry.getLinks().getLink().add(new LinkType("source", entry.getLink().toString()));

          // The actions object created here is replaced unconditionally by the call just below.
          if (newEntry.getActions() == null) {
            newEntry.setActions(new ActionsType(new ArrayList<ActionType>()));
          }
          newEntry.setActions(
              new ActionsType(
                  getActions(
                      feedSelectors,
                      doc,
                      newEntry.getId(),
                      json,
                      (Boolean) params.get("comments"),
                      source)));

          if (entry.getPubDate() != null) {
            newEntry.setCreated(String.valueOf(entry.getPubDate().getTime()));
          }

          if (entry.getModDate() != null) {
            newEntry.setModified(String.valueOf(entry.getModDate().getTime()));
          }

          // Relevance is taken from the "comments" action, when there is one.
          for (ActionType action : newEntry.getActions().getAction()) {
            if ("comments".equals(action.getType())) {
              newEntry.setRelevance(action.getCant());
            }
          }

          // Always true here: this code is inside the !json branch.
          if (!json) {
            newEntry.setVerbatim(gson.toJson(newEntry));
          }

          newsList.add(newEntry);

          // addToMap(parseResult, feed, feedLink, entry, content, newEntry);
        }
      }

      PostsType news;

      news = new PostsType(newsList);
      completeLinks(news);
      Feed2XML(news, sw);
      return sw.toString(); // + comments.toString();
    } else {
      // JSON branch: same steps as above, building Post entries. newsListSolr is not declared in
      // this method.
      for (int i = 0; i < feed.getItemCount(); i++) {
        FeedItem entry = feed.getItem(i);
        Logger.getLogger(this.getClass().getName())
            .log(
                Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()});

        Connection conn = Jsoup.connect(entry.getLink().toString());
        conn.timeout(60000);
        doc = conn.get();
        String responseURL = conn.response().url().getHost();
        //                Logger.getLogger(this.getClass().getName()).log(Level.INFO, "RESPONSE URL:
        // {0}", responseURL);
        //                doc = Jsoup.connect(entry.getLink().toString()).timeout(60000).get();
        Logger.getLogger(this.getClass().getName())
            .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()});
        feedSelectors = dao.retrieve(url);
        if (findWords(
            entry.getTitle(),
            doc,
            (ArrayList) params.get("searchlist"),
            (ArrayList) params.get("blacklist"),
            feedSelectors)) {
          newEntrySolr = new Post();
          String source;
          if (feed.getHeader() == null || feed.getHeader().getLink() == null) {
            Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link");
            source = feedURL.getHost();
          } else {
            Logger.getLogger(this.getClass().getName())
                .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString());
            source = feed.getHeader().getLink().getHost();
            //                        if (source.indexOf("/") != -1) {
            //                            source = source.substring(0, source.indexOf("/") + 1);
            //                        }
          }
          newEntrySolr.setSource(source);
          newEntrySolr.setDocType("post");
          newEntrySolr.setZone(
              new Zone(
                  String.valueOf(zone.getId()),
                  zone.getName(),
                  zone.getType().getName(),
                  zone.getExtendedString()));

          newEntrySolr.setPostLatitude(Double.parseDouble((String) params.get("latitud")));
          newEntrySolr.setPostLongitude(Double.parseDouble((String) params.get("longitud")));
          // newEntry.setId(entry.getUri());
          // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ?
          // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim());
          newEntrySolr.setId(
              entry.getGUID() != null
                  ? entry.getGUID()
                  : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null
                      ? entry.getElementValue("http://www.w3.org/2005/Atom", "id")
                      : (entry.getTitle())));
          newEntrySolr.setFromUser(
              new User(
                  null,
                  source,
                  null,
                  null,
                  place != null
                      ? new org.zonales.entities.Place(
                          String.valueOf(place.getId()), place.getName(), place.getType().getName())
                      : null));
          newEntrySolr.setTitle(entry.getTitle());
          newEntrySolr.setText(entry.getDescriptionAsText());
          newEntrySolr.setTags(new ArrayList<String>((ArrayList) params.get("tagslist")));

          if (newEntrySolr.getLinks() == null) {
            newEntrySolr.setLinks(new ArrayList<LinkType>());
          }
          if ((links = getLinks(feedSelectors, doc, responseURL)) != null) {
            newEntrySolr.getLinks().addAll(links);
          }
          newEntrySolr.getLinks().add(new LinkType("source", entry.getLink().toString()));

          if (newEntrySolr.getActions() == null) {
            newEntrySolr.setActions(new ArrayList<ActionType>());
          }
          newEntrySolr
              .getActions()
              .addAll(
                  getActions(
                      feedSelectors,
                      doc,
                      newEntrySolr.getId(),
                      json,
                      (Boolean) params.get("comments"),
                      source));

          if (entry.getPubDate() != null) {
            newEntrySolr.setCreated((entry.getPubDate().getTime()));
          }
          if (entry.getModDate() != null) {
            newEntrySolr.setModified((entry.getModDate().getTime()));
          }

          for (ActionType action : newEntrySolr.getActions()) {
            if ("comments".equals(action.getType())) {
              newEntrySolr.setRelevance(action.getCant());
            }
          }

          // Never true here: this code is inside the json branch, so verbatim is not set.
          if (!json) {
            newEntrySolr.setVerbatim(gson.toJson(newEntrySolr));
          }

          newsListSolr.add(newEntrySolr);

          // addToMap(parseResult, feed, feedLink, entry, content, newEntry);
        }
      }
      return "{post: " + gson.toJson(newsListSolr) + "}"; // + comments.toString();
    }
  }