/**
 * Builds one {@link ArtifactVersionBean} per {@code version} tag found in the document.
 *
 * <p>The group and artifact ids are taken from the text of the {@code groupId} and
 * {@code artifactId} tags. Versions for which {@code retrieveLastUpdateDate} yields
 * {@code null} are left out of the result.
 *
 * @param doc the parsed metadata document
 * @return the collected beans; an empty list when the group id or artifact id has no text
 */
private List<ArtifactVersionBean> parseMavenMetadata(Document doc) {
    final String groupId = doc.getElementsByTag("groupId").text();
    final String artifactId = doc.getElementsByTag("artifactId").text();
    if (!(StringUtils.hasText(groupId) && StringUtils.hasText(artifactId))) {
      return Lists.newArrayListWithCapacity(0);
    }

    final Elements versionTags = doc.getElementsByTag("version");
    final List<ArtifactVersionBean> collected = Lists.newArrayList();
    for (Element versionTag : versionTags) {
      final String versionText = versionTag.text();

      // The bean must carry its full coordinates before the timestamp lookup, which receives it.
      final ArtifactVersionBean bean = new ArtifactVersionBean();
      bean.setGroupId(groupId);
      bean.setArtifactId(artifactId);
      bean.setVersion(versionText);
      bean.setId(groupId + ":" + artifactId + ":" + versionText);

      final Long lastUpdate = retrieveLastUpdateDate(bean);
      if (lastUpdate != null) {
        bean.setTimestamp(lastUpdate);
        collected.add(bean);
      }
    }

    return collected;
  }
Example #2
0
  /**
   * Parses the given HTML and removes every element matched by a fixed list of selectors
   * ({@code div#share-box}, {@code div[id^=BAIDU]}, {@code iframe[id^=360_HOT]},
   * {@code div.n_article}, {@code div#comment-box}).
   *
   * @param pcont the HTML content; may be {@code null}
   * @return the HTML of the cleaned document, or an empty string when {@code pcont} is null
   */
  public String reviseContForLieyunwang(String pcont) {
    if (pcont == null) {
      return "";
    }

    final String[] unwantedSelectors = {
      "div#share-box",
      "div[id^=BAIDU]",
      "iframe[id^=360_HOT]",
      "div.n_article",
      "div#comment-box"
    };

    Document parsed = Jsoup.parse(pcont);
    // Selectors are applied one after another, each against the already-pruned document.
    for (String selector : unwantedSelectors) {
      parsed.select(selector).remove();
    }
    return parsed.html();
  }
Example #3
0
  /**
   * Renders the latest log entries into the {@code logs.html} template found on the classpath.
   *
   * <p>One table row is appended to the element with id {@code logs} per entry; the row and its
   * first cell carry the entry's type, followed by its date and message.
   *
   * @return the rendered HTML, or {@code null} when the template could not be loaded or parsed
   */
  @GET
  @Path("/logs")
  @Produces({MediaType.TEXT_HTML})
  public String logs() {

    Document doc = null;
    try {
      ArrayList<Logger.Log> logs = Logger.getInstance().getLastLogs();

      // Read the template as a stream: URL#getFile() returns a URL-encoded path, which breaks
      // for locations containing spaces and cannot address a resource packaged inside a jar.
      try (java.io.InputStream template =
          getClass().getClassLoader().getResourceAsStream("logs.html")) {
        if (template == null) {
          throw new java.io.FileNotFoundException("logs.html not found on the classpath");
        }
        // Empty base URI: only the serialized HTML is returned, no links are resolved.
        doc = Jsoup.parse(template, "UTF-8", "");
      }

      Element tbody = doc.getElementById("logs");
      if (tbody != null) {
        for (Logger.Log log : logs) {
          Element tr = tbody.appendElement("tr").addClass(log.getType_log());

          tr.appendElement("td").addClass("type").text(log.getType_log());
          tr.appendElement("td").addClass("date").text(log.getDate().toString());
          tr.appendElement("td").addClass("message").text(log.getMessage());
        }
      }
    } catch (Exception e) {
      // Best effort: whatever was rendered before the failure (possibly nothing) is returned.
      e.printStackTrace();
    }

    if (doc != null) {
      return doc.html();
    }

    return null;
  }
Example #4
0
  /**
   * Downloads {@code url}, counts its outgoing links per host in {@code tempStats} and adds each
   * parsed link URL to {@code tempQueue}.
   *
   * @param url the page to fetch
   * @param verbose when true, prints the current thread name and the queue size afterwards
   * @throws Exception if the page cannot be fetched or a link cannot be parsed
   */
  public Worker(String url, boolean verbose) throws Exception {
    Document page = Jsoup.connect(url).get();

    // Only anchors that actually carry an href attribute are of interest.
    for (Element anchor : page.select("a[href]")) {
      // The "abs:" prefix asks jsoup for the absolute form of the link.
      String absoluteHref = anchor.attr("abs:href");
      if (absoluteHref.isEmpty()) {
        continue;
      }

      Parser parsedLink = new Parser(absoluteHref);
      String linkHost = parsedLink.getHost();

      // A host seen for the first time starts at 1; otherwise its stored count goes up by one.
      if (!tempStats.containsKey(linkHost)) {
        tempStats.put(linkHost, 1);
      } else {
        int seenSoFar = tempStats.get(linkHost);
        tempStats.put(linkHost, seenSoFar + 1);
      }

      tempQueue.add(parsedLink.getURL());
    }

    if (!verbose) {
      return;
    }
    System.out.println(
        Thread.currentThread().getName() + " : " + tempQueue.size() + " links from " + url);
  }
 /**
  * Parses the uploaded file as HTML and stores one {@code BookmarkBean} for every anchor that
  * has an {@code add_date} attribute, owned by the currently logged-in user.
  *
  * <p>Nothing is imported when the login user has no id. A failure to store a single bookmark
  * is ignored; any other failure is rethrown through {@code DataObjectException.wrapException}.
  *
  * @param compParameter supplies the login user and the request's character encoding
  * @param multipartFile the uploaded file
  * @param json not read by this method
  */
 @Override
 public void upload(
     ComponentParameter compParameter,
     IMultipartFile multipartFile,
     HashMap<String, Object> json) {
   try {
     final ID ownerId = ItSiteUtil.getLoginUser(compParameter).getId();
     if (ownerId == null) {
       return;
     }

     final Document uploaded =
         Jsoup.parse(
             multipartFile.getInputStream(), compParameter.request.getCharacterEncoding(), "");
     for (final Element anchor : uploaded.getElementsByTag("a")) {
       if (!anchor.hasAttr("add_date")) {
         continue;
       }

       final BookmarkBean bookmark = new BookmarkBean();
       // add_date is scaled by 1000 before becoming a Date (presumably seconds -> millis).
       final long addedAt = ConvertUtils.toLong(anchor.attr("add_date"), 0) * 1000;
       bookmark.setTitle(anchor.text());
       bookmark.setUrl(anchor.attr("href"));
       bookmark.setUserId(ownerId);
       bookmark.setUpdateDate(new Date(addedAt));
       try {
         BookmarkUtils.applicationModule.doUpdate(bookmark);
       } catch (Exception ignored) {
         // One bookmark failing to save does not stop the rest of the import.
       }
     }
   } catch (final Exception e) {
     throw DataObjectException.wrapException("没有权限");
   }
 }
  /**
   * Fetches a category page and collects the href of every anchor inside
   * {@code div[class=views-field views-field-title]}, with the first character of each href
   * dropped.
   *
   * @param categoryName not read by this method
   * @param categoryURL the page to download
   * @return the collected links, or {@code null} when the page has no matching anchors or the
   *     download fails (kept for callers that test for null)
   */
  @Override
  public List<String> parseCategory(String categoryName, String categoryURL) {
    List<String> linksByCategoryList = null;

    try {
      Document doc = Jsoup.connect(categoryURL).timeout(Constants.MAX_DELAY_TIME * 1000).get();
      Elements links = doc.select("div[class=views-field views-field-title]").select("a");

      if (!links.isEmpty()) {
        linksByCategoryList = new ArrayList<String>();

        for (Element element : links) {
          String newsLink = element.attr("href");
          // attr() yields "" for a missing href; substring(1) on it would throw.
          if (newsLink.isEmpty()) {
            continue;
          }
          linksByCategoryList.add(newsLink.substring(1));
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }

    return linksByCategoryList;
  }
Example #7
0
 /**
  * Downloads {@code http://jkanime.net/} on a separate thread, waits for it, and returns one
  * {@code EntryModel} per element of class {@code home_portada_bg} (anchor text, absolute anchor
  * href, image src).
  *
  * <p>Elements lacking an anchor or an image are skipped. A download failure is printed and
  * yields whatever was collected so far.
  *
  * @return the collected entries, or {@code null} if the calling thread is interrupted while
  *     waiting (the interrupt flag is restored in that case)
  */
 public static ArrayList<EntryModel> getPopularContent() {
   final ArrayList<EntryModel> result = new ArrayList<>();
   Thread thread =
       new Thread(
           () -> {
             try {
               Document document = Jsoup.connect("http://jkanime.net/").get();
               Elements elements = document.getElementsByClass("home_portada_bg");
               for (Element element : elements) {
                 Element anchor = element.getElementsByTag("a").first();
                 Element image = element.getElementsByTag("img").first();
                 if (anchor == null || image == null) {
                   // first() returns null when nothing matched; one incomplete tile must not
                   // kill the worker thread and drop the remaining tiles.
                   continue;
                 }
                 result.add(
                     new EntryModel(
                         Constants.TYPE_SHOW,
                         anchor.text(),
                         anchor.attr("abs:href"),
                         image.attr("src")));
               }
             } catch (IOException e) {
               e.printStackTrace();
             }
           });
   thread.start();
   try {
     // join() also makes the worker's writes to result visible to this thread.
     thread.join();
     return result;
   } catch (InterruptedException e) {
     // Restore the flag so callers further up can still observe the interruption.
     Thread.currentThread().interrupt();
     e.printStackTrace();
     return null;
   }
 }
  /**
   * Downloads the given search URL and turns the response into search results.
   *
   * <p>When the final location does not contain {@code adultSearch.htm}, the request landed on a
   * single movie page and exactly one result is returned for it. Otherwise every matching table
   * body of the results page becomes one result; duplicates (per {@code contains}) are dropped.
   *
   * @param searchString the URL to request
   * @return the results, possibly empty
   * @throws IOException if the page cannot be downloaded
   */
  @Override
  public SearchResult[] getSearchResults(String searchString) throws IOException {
    Document doc = Jsoup.connect(searchString).timeout(CONNECTION_TIMEOUT_VALUE).get();

    // Check for null before dereferencing; the original tested it only after calling contains().
    String location = doc.location();
    boolean onSearchResultsPage = location != null && location.contains("adultSearch.htm");

    // found the movie without a search results page
    if (location != null && !onSearchResultsPage) {
      String idOfPage = getIDStringFromDocumentLocation(doc);
      String posterPath = getPosterPreviewPathFromIDString(idOfPage);
      Element titleElement = doc.select("title").first();
      String label = titleElement != null ? titleElement.text() : "";

      // Only build a Thumb when there is a poster path to build it from.
      SearchResult result;
      if (posterPath != null) {
        result = new SearchResult(location, label, new Thumb(posterPath));
      } else {
        result = new SearchResult(location, label, null);
      }
      return new SearchResult[] {result};
    }

    Elements foundMovies = doc.select("table[width=690]:contains(Wish List) tr tbody:has(img)");
    LinkedList<SearchResult> searchList = new LinkedList<SearchResult>();

    for (Element movie : foundMovies) {
      Element anchor = movie.select("a").first();
      Element image = movie.select("img").first();
      if (anchor == null || image == null) {
        // Without a link or a thumbnail there is nothing usable to return for this row.
        continue;
      }
      SearchResult searchResult =
          new SearchResult(anchor.attr("href"), image.attr("alt"), new Thumb(image.attr("src")));
      if (!searchList.contains(searchResult)) {
        searchList.add(searchResult);
      }
    }
    return searchList.toArray(new SearchResult[searchList.size()]);
  }
Example #9
0
  /**
   * Re-stamps the {@code class} attribute of every node under {@code element}.
   *
   * <p>A first traversal removes whatever {@code nodeMarker} matches from each node's class
   * value; a second traversal appends {@code NODE_MARKER + "0_" + n}, where {@code n} is the
   * 1-based position of the node in traversal order.
   *
   * @param element the root of the subtree to mark
   * @return the same {@code element} instance
   */
  public static Element markTestElement(Element element) {
    // Pass 1: clear existing markers.
    NodeVisitor markerStripper =
        new NodeVisitor() {
          @Override
          public void head(Node node, int depth) {
            String cleaned = nodeMarker.matcher(node.attr("class")).replaceAll("");
            node.attr("class", cleaned);
          }

          @Override
          public void tail(Node node, int depth) {}
        };
    element.traverse(markerStripper);

    // Pass 2: append a fresh, sequentially numbered marker.
    NodeVisitor markerStamper =
        new NodeVisitor() {
          private int visited = 0;

          @Override
          public void head(Node node, int depth) {
            visited++;
            String stamped = node.attr("class") + " " + NODE_MARKER + "0_" + visited + " ";
            node.attr("class", stamped);
          }

          @Override
          public void tail(Node node, int depth) {}
        };
    element.traverse(markerStamper);

    return element;
  }
Example #10
0
 /**
  * getMovieActors parses through the movie's page html and returns three actors.
  *
  * <p>The names are cut out of the serialized body by plain string search between the
  * "Stars:" heading and the "See full cast and crew" text. Any slot that cannot be located
  * stays an empty string; exceptions are printed and the partially filled array is returned.
  *
  * @author defq0n
  * @param pageLink is the extended imdb url for the movie page.
  * @return movieActors String array containing three actors.
  */
 private static String[] getMovieActors(String pageLink) {
   String[] movieActors = {"", "", ""};
   try {
     Document d = Jsoup.connect("http://imdb.com" + pageLink).get();
     String html = d.body().toString();

     // Slice once with substring instead of appending char by char (which was quadratic and
     // re-ran indexOf on every iteration). The guards reproduce the empty result of the old
     // loops whenever the start is not before the end.
     final String starsHeading = "<h4 class=\"inline\">Stars:</h4>";
     int starsStart = html.indexOf(starsHeading) + starsHeading.length();
     int starsEnd = html.indexOf("See full cast and crew");
     String actorsDiv = starsStart < starsEnd ? html.substring(starsStart, starsEnd) : "";

     final String nameOpen = "itemprop=\"url\"><span class=\"itemprop\" itemprop=\"name\">";
     final String nameClose = "</span></a>";
     final String spanClose = "</span>";

     String remaining = actorsDiv;
     for (int i = 0; i < 3; i++) { // we will get the first three top actors
       int nameStart = remaining.indexOf(nameOpen) + nameOpen.length();
       int nameEnd = remaining.indexOf(nameClose);
       String actor = nameStart < nameEnd ? remaining.substring(nameStart, nameEnd) : "";
       movieActors[i] = actor;

       // Continue the search right after this actor's closing span in the full block.
       int resumeAt = actorsDiv.indexOf(actor + spanClose) + actor.length() + spanClose.length();
       remaining = resumeAt < actorsDiv.length() ? actorsDiv.substring(resumeAt) : "";
     }
   } catch (Exception e) {
     System.out.println(e.toString());
   }
   return movieActors;
 }
Example #11
0
 /**
  * Looks up a trailer link for the movie by scraping a YouTube page.
  *
  * <p>Movies with a year before 1990 yield the literal string {@code "null"}. Otherwise the
  * result is {@code http://www.youtube.com} followed by the href cut out of the first
  * {@code yt-lockup-title} entry; when that entry cannot be found, or the download fails, only
  * the bare host prefix is returned.
  *
  * @param movie supplies the year and the name used to build the request
  * @return the trailer link, the host prefix alone, or {@code "null"}
  */
 private static String getTrailer(Movie movie) {
   if (Integer.valueOf(movie.getMovieYear()) < 1990) {
     return "null";
   }

   String trailerLink = "http://www.youtube.com";
   String link = formatYoutubeString(movie.getMovieName());
   try {
     Document d = Jsoup.connect("http://www.youtube.com/" + link).get();
     String html = d.body().toString();

     final String titleMarker = "class=\"yt-lockup-title \"><a href=\"";
     int markerAt = html.indexOf(titleMarker);
     if (markerAt >= 0) {
       // Inspect at most 100 characters from the marker. Clamping to the end of the page
       // replaces the out-of-range exception the char-by-char copy relied on.
       int windowEnd = Math.min(markerAt + 100, html.length());
       String linkDiv = html.substring(markerAt, windowEnd);

       int hrefStart = linkDiv.indexOf("<a href=\"") + 9;
       // Stop two characters before the next attribute (the closing quote and the space).
       int hrefEnd = linkDiv.indexOf("class=\"yt-uix-sessionlink") - 2;
       if (hrefStart < hrefEnd) {
         trailerLink += linkDiv.substring(hrefStart, hrefEnd);
       }
     }
   } catch (Exception e) {
     System.out.println(e.toString());
   }
   return trailerLink;
 }
 /**
  * Removes every element selected by {@code *[gravityScore]} from {@code topNode} whose score
  * is negative or whose text is shorter than {@code minParagraphText}.
  */
 protected void removeNodesWithNegativeScores(Element topNode) {
   for (Element scored : topNode.select("*[gravityScore]")) {
     int gravity = Integer.parseInt(scored.attr("gravityScore"));
     if (gravity < 0) {
       scored.remove();
       continue;
     }
     // The text length is only looked at for elements with a non-negative score.
     if (scored.text().length() < minParagraphText) {
       scored.remove();
     }
   }
 }
  /**
   * Defines the flow that polls the Evernote message source at a fixed rate, splits each
   * non-empty collection into notes and forwards, per note, either the text of its single
   * {@code en-note} element or the note title to {@code wordRequestsChannel}.
   *
   * @return the assembled integration flow
   */
  @Bean
  public IntegrationFlow evernoteIntegration() {
    return IntegrationFlows.from(
            this.evernoteMessageSource(),
            configurer ->
                configurer.poller(Pollers.fixedRate(pollIntervalInSeconds, TimeUnit.SECONDS)))
        .channel(this.inputChannel())
        .filter(Collection.class, source -> !source.isEmpty())
        .split()
        .transform(
            Note.class,
            source -> {
              // The title is the fallback whenever no usable body text can be extracted.
              String content = source.getContent();
              if (!StringUtils.isNotBlank(content)) {
                return source.getTitle();
              }

              Elements noteElements = Jsoup.parse(content).select("en-note");
              if (noteElements.size() != 1) {
                return source.getTitle();
              }

              String wordsFromNote = noteElements.get(0).text();
              return StringUtils.isNotBlank(wordsFromNote) ? wordsFromNote : source.getTitle();
            },
            configurer -> configurer.requiresReply(false))
        .filter(source -> source != null)
        .channel(wordRequestsChannel)
        .get();
  }
Example #14
0
  /**
   * Rebuilds {@code majorList} from the course-listing anchors of the given page.
   *
   * <p>The whole attempt (clear, download, parse) is repeated until one pass completes without
   * an exception. NOTE(review): there is no retry limit or delay, so a permanently failing URL
   * keeps this method looping forever.
   *
   * @param originalUrl the page to download
   */
  public static void initMajorList(String originalUrl) {

    System.out.println("preparing majorList");

    boolean loaded = false;
    while (!loaded) {
      try {
        // Start from scratch on every attempt so a failed pass leaves no partial entries behind.
        majorList.clear();
        Document page = Jsoup.connect(originalUrl).timeout(10000).get();
        Elements anchors = page.select("#accordion__target-3 > div.course-listing__box > a");
        for (Element anchor : anchors) {
          MajorForCollection major = new MajorForCollection();
          major.setLevel(LEVEL);

          String title = anchor.select("h3").get(0).text().trim();
          major.setTitle(title);

          // Everything from the first '-' onwards is cut off the paragraph text.
          String type = anchor.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim();
          major.setType(type);

          String href = anchor.select("a").get(0).attr("href");
          major.setUrl(href);

          majorList.add(major);
        }
        loaded = true;
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

    System.out.println("majorList prepared");
    System.out.println("majorList size: " + majorList.size());
  }
Example #15
0
 /**
  * Parses the HTML and returns the text of its (last) {@code title} element.
  *
  * <p>If a title contains both "moved" and "permanently" (case-insensitively),
  * {@code getMovedUrl} is invoked for every {@code body} element visited afterwards and once
  * more for the whole document; the values it returns are not used here.
  *
  * @param html the markup to parse
  * @return the title text, or an empty string when there is no title element
  */
 private static String makeModular(String html) {
   Document doc = Jsoup.parse(html);
   String text = "";
   boolean moved = false;

   for (Element el : doc.getAllElements()) {
     String tag = el.nodeName();
     if (tag.equals("title")) {
       text = el.text();
       if (text.toLowerCase().contains("moved") && text.toLowerCase().contains("permanently")) {
         moved = true;
       }
     } else if (tag.equals("body")) {
       if (moved) {
         getMovedUrl(el);
       }
     }
   }

   if (moved) {
     getMovedUrl(doc);
   }
   return text;
 }
Example #16
0
  /**
   * Rewrites {@code cid:} references in {@code src} and {@code href} attributes to the file URL
   * of the matching attachment.
   *
   * <p>References whose content id has no entry in {@code attachments} are left untouched.
   *
   * @param html the markup to process
   * @param attachments attachments keyed by content id (the part after {@code cid:})
   * @return the inner HTML of the first {@code body} element, or the whole document's HTML when
   *     no body element exists
   */
  private static String replaceCidWithAttachments(
      String html, Map<String, Attachment> attachments) {
    final String cidScheme = "cid:";
    final String[] attrNames = {"src", "href"};

    Document doc = Jsoup.parse(html);

    for (String attrName : attrNames) {
      Elements tags = doc.select("*[" + attrName + "]");
      for (Element tag : tags) {
        String uriString = tag.attr(attrName).trim();

        // Case-insensitive prefix test that does not depend on the default locale
        // (toLowerCase() maps "CID:" to "cıd:" under a Turkish locale).
        if (!uriString.regionMatches(true, 0, cidScheme, 0, cidScheme.length())) {
          continue;
        }

        String cid = uriString.substring(cidScheme.length());

        // Single lookup; also skips keys that are mapped to null instead of failing on them.
        Attachment attachment = attachments.get(cid);
        if (attachment == null) {
          continue;
        }

        Long id = attachment.id;
        tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
      }
    }

    Elements bodies = doc.getElementsByTag("body");

    if (bodies.size() > 0) {
      return bodies.get(0).html();
    } else {
      return doc.html();
    }
  }
Example #17
0
  //// COMPLETELY USELESS
  /**
   * Reads a saved HTML file and fills a 31-slot array from its {@code div} texts.
   *
   * <p>A div counts when its text starts with a digit and contains "€ "; the text is split on
   * single spaces, the first token (minus one) is the array index and the third token, with
   * dots removed, is the stored value. Any exception ends the scan and is printed.
   *
   * @param path location of the HTML file
   * @return the array of values; slots that were never written stay 0
   */
  public static int[] getPrice(String path) {
    int[] month = new int[31];
    int matched = 0;
    try {
      Document doc = Jsoup.parse(new File(path), "UTF-8", "http://example.com/");
      for (Element div : doc.getElementsByTag("div")) {
        String text = div.text();
        if (text.isEmpty()) {
          continue;
        }
        if (!Character.isDigit(text.charAt(0)) || !text.contains("€ ")) {
          continue;
        }
        matched++;
        String[] tokens = text.split(" ");
        month[Integer.parseInt(tokens[0]) - 1] = Integer.parseInt(tokens[2].replace(".", ""));
      }
    } catch (Exception e) {
      System.out.println(e);
    }

    if (matched == 0) {
      // Message text (Italian): "the file was not downloaded".
      System.out.println("Non e' stato scaricato il file");
      // getPrice(path);
    }

    return month;
  }
 /**
  * Reads an EPUB, rewrites the HTML of every content resource and writes the result to
  * {@code dest}.
  *
  * <p>For each resource a content-type meta element is inserted after the document head, the
  * body is passed through {@code alterElement} and {@code modify} (whose running counters are
  * carried from one resource to the next), and the serialized document replaces the resource
  * data. Resources without a media type get {@code new MediaType("html", "html")}.
  *
  * @param bookPath path of the EPUB to read
  * @param dest path of the EPUB to write
  * @throws FileNotFoundException if either file cannot be opened
  * @throws IOException if reading or writing fails
  */
 public static void processEpub(String bookPath, String dest)
     throws FileNotFoundException, IOException {
   final Book b;
   // Close the input file as soon as the book has been read (it was previously never closed).
   // NOTE(review): assumes readEpub(InputStream) loads resource data eagerly - confirm.
   try (FileInputStream in = new FileInputStream(new File(bookPath))) {
     b = new EpubReader().readEpub(in);
   }

   Count cnt = new Count(0, 0);
   for (Resource res : b.getContents()) {
     // Decode and encode with an explicit charset instead of the platform default, matching
     // the utf-8 content type declared in the meta element below.
     String content = new String(res.getData(), java.nio.charset.StandardCharsets.UTF_8);

     // NOTE(review): in this two-argument overload the second argument is the base URI,
     // not a charset - confirm that "UTF-8" is intended here.
     Document doc = Jsoup.parse(content, "UTF-8");

     // http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"");
     Element elem = new Element(Tag.valueOf("meta"), "");
     elem.attr("http-equiv", "content-type");
     elem.attr("content", "text/html; charset=utf-8");
     doc.head().after(elem);
     System.out.println(doc.head().data());

     Element ele = doc.body();
     alterElement(ele);
     Count cTemp = modify(ele, cnt);
     cnt.setCount(cTemp.getCount());
     cnt.setPgCount(cTemp.getPgCount());
     doc.body().html(ele.html());

     res.setData(doc.html().getBytes(java.nio.charset.StandardCharsets.UTF_8));
     if (res.getMediaType() == null) {
       res.setMediaType(new MediaType("html", "html"));
     }
   }

   // Close (and thereby flush) the output file even when writing fails.
   try (FileOutputStream out = new FileOutputStream(new File(dest))) {
     new EpubWriter().write(b, out);
   }
 }
Example #19
0
 /**
  * Downloads the jkanime search page for {@code query} on a separate thread, waits for it, and
  * returns one {@code EntryModel} per element of class {@code search} (title text, absolute
  * title href, image src).
  *
  * <p>Spaces in the query are replaced by underscores. Elements lacking a {@code titl} element
  * or an image are skipped. A download failure is printed and yields whatever was collected.
  *
  * @param query the search terms
  * @return the collected entries, or {@code null} if the calling thread is interrupted while
  *     waiting (the interrupt flag is restored in that case)
  */
 public static ArrayList<EntryModel> getSearchResults(final String query) {
   final ArrayList<EntryModel> result = new ArrayList<>();
   Thread thread =
       new Thread(
           () -> {
             try {
               Document document =
                   Jsoup.connect("http://jkanime.net/buscar/" + query.replace(" ", "_")).get();
               Elements elements = document.getElementsByClass("search");
               for (Element element : elements) {
                 Element titleElement = element.getElementsByClass("titl").first();
                 Element image = element.getElementsByTag("img").first();
                 if (titleElement == null || image == null) {
                   // first() returns null when nothing matched; one incomplete hit must not
                   // kill the worker thread and drop the remaining hits.
                   continue;
                 }
                 String title = titleElement.text();
                 String url = titleElement.attr("abs:href");
                 String picUrl = image.attr("src");
                 result.add(new EntryModel(Constants.TYPE_SHOW, title, url, picUrl));
               }
             } catch (IOException e) {
               e.printStackTrace();
             }
           });
   thread.start();
   try {
     // join() also makes the worker's writes to result visible to this thread.
     thread.join();
     return result;
   } catch (InterruptedException e) {
     // Restore the flag so callers further up can still observe the interruption.
     Thread.currentThread().interrupt();
     e.printStackTrace();
     return null;
   }
 }
Example #20
0
  /**
   * Recursively writes a data source item and its children to a design.
   *
   * <p>The item becomes a {@code node} element whose {@code text} attribute is the item id's
   * string form. An {@code icon} attribute is written when the item has an icon, and an empty
   * {@code selected} attribute when the item is selected.
   *
   * @since 7.5.0
   * @param design the element into which to insert the item
   * @param itemId the id of the item to write
   * @param context the DesignContext instance used in writing
   * @return the {@code node} element created for this item
   */
  @Override
  protected Element writeItem(Element design, Object itemId, DesignContext context) {
    final Element node = design.appendElement("node");
    node.attr("text", itemId.toString());

    final Resource itemIcon = getItemIcon(itemId);
    if (itemIcon != null) {
      DesignAttributeHandler.writeAttribute(
          "icon", node.attributes(), itemIcon, null, Resource.class);
    }

    if (isSelected(itemId)) {
      node.attr("selected", "");
    }

    final Collection<?> childIds = getChildren(itemId);
    if (childIds == null) {
      return node;
    }
    // Yeah... see #5864
    for (Object childId : childIds) {
      writeItem(node, childId, context);
    }
    return node;
  }
 /**
  * Extracts the links found on a web page.
  *
  * <p>An href starting with {@code HTTP} is taken as is, one starting with "/" is prefixed with
  * {@code HTTP} and the page's host, and any other href is appended to {@code str}. A malformed
  * address or a download failure is printed and yields the links gathered so far.
  *
  * @param str the address of the page to download
  * @return the collection of links
  */
 public static List<String> getUrlsByPage(String str) {
   List<String> urls = new ArrayList<String>();
   try {
     URL pageUrl = new URL(str);
     String host = pageUrl.getHost();
     Document doc = Jsoup.parse(pageUrl, 30000);
     for (Element anchor : doc.select("a")) {
       String href = anchor.attr("href");
       String resolved;
       if (href.startsWith(HTTP)) {
         resolved = href;
       } else if (href.startsWith("/")) {
         resolved = HTTP + host + href;
       } else {
         resolved = str + href;
       }
       urls.add(resolved);
     }
   } catch (IOException e) {
     // Also covers MalformedURLException, which is an IOException.
     e.printStackTrace();
   }
   return urls;
 }
 /**
  * Copies the values of the {@code table#id_stats} rows into {@code stat}.
  *
  * <p>Each row is read as a name cell followed by a value cell. Recognised name prefixes are
  * "Win-Loss-Void" (value split on '-' into three integers), "Stake avg", "Odd avg", "Staked"
  * and "Returned". Rows with fewer than two {@code td} cells and rows with other names are
  * ignored.
  *
  * @param doc the parsed statistics page
  * @param stat the object that receives the parsed values
  */
 private static void parseStatHeaderDetails(Document doc, Statistic stat) {
   Elements statsTrs = doc.select("table#id_stats").select("tr");
   for (Element tr : statsTrs) {
     Elements tds = tr.select("td");
     if (tds.size() < 2) {
       // e.g. a header or spacer row; get(0)/get(1) would throw on it.
       continue;
     }

     String name = tds.get(0).text().trim();
     String value = tds.get(1).text().trim();

     if (name.startsWith("Win-Loss-Void")) {
       String[] values = value.split("-");
       if (values.length == 3) {
         stat.setWin(NumberParser.parseInt(values[0]));
         stat.setLose(NumberParser.parseInt(values[1]));
         stat.setVoid_(NumberParser.parseInt(values[2]));
       } else {
         logger.warn("Win-Loss-Void section doesn't contain 3 elements as expected");
       }
     } else if (name.startsWith("Stake avg")) {
       stat.setAvgStake(NumberParser.parseDouble(value));
     } else if (name.startsWith("Odd avg")) {
       stat.setAvgOdds(NumberParser.parseDouble(value));
     } else if (name.startsWith("Staked")) {
       stat.setStaked(NumberParser.parseDouble(value));
     } else if (name.startsWith("Returned")) {
       stat.setReturned(NumberParser.parseDouble(value));
     }
   }
 }
  /**
   * Checks, for each element, whether {@code getElementsByAttribute(attributeName)} finds
   * anything, and records a success or failure remark accordingly.
   *
   * <p>An empty selection is reported as {@code NOT_APPLICABLE}. Otherwise the overall solution
   * starts at {@code PASSED} and is combined with each element's outcome through
   * {@code setTestSolution}. NOTE(review): the lookup may also match the element itself rather
   * than only its children - confirm against the jsoup version in use.
   *
   * @param elements the elements to inspect
   * @param testSolutionHandler receives the overall solution
   */
  private void checkChildElementWithAttributePresence(
      Elements elements, TestSolutionHandler testSolutionHandler) {
    if (elements.isEmpty()) {
      testSolutionHandler.addTestSolution(TestSolution.NOT_APPLICABLE);
      return;
    }

    TestSolution overall = TestSolution.PASSED;
    for (Element candidate : elements) {
      boolean attributeMissing = candidate.getElementsByAttribute(attributeName).isEmpty();
      if (attributeMissing) {
        overall = setTestSolution(overall, getFailureSolution());
        addSourceCodeRemark(getFailureSolution(), candidate, getFailureMsgCode());
      } else {
        overall = setTestSolution(overall, getSuccessSolution());
        addSourceCodeRemark(getSuccessSolution(), candidate, getSuccessMsgCode());
      }
    }

    testSolutionHandler.addTestSolution(overall);
  }
Example #24
0
  /**
   * Collects the links of the result rows and, unless the page limit is reached, the link to
   * the next result page.
   *
   * <p>Rows are selected with {@code PRODUCTS_ROW_QUERY} excluding those that have an
   * {@code onclick} attribute. The next-page link is skipped when its href contains
   * {@code page_nr=7}. Links that {@code Utils.stringToURL} rejects are silently dropped.
   *
   * @param document the result page
   * @return the URLs to visit next, possibly empty
   */
  @Override
  public List<URL> getNextPages(Document document) {
    final int pageLimit = 7;
    List<URL> urls = new ArrayList<>();

    // Rows linking to the offer comparison pages.
    Elements rows = document.select(PRODUCTS_ROW_QUERY + ":not([onclick])");
    for (Element row : rows) {
      String rowHref = row.attr("abs:href");
      try {
        urls.add(Utils.stringToURL(rowHref));
      } catch (ConnectionException ignored) {
        // Unusable link: skip it.
      }
    }

    // Pagination
    Element nextLink = document.select("a[href].next").first();
    if (nextLink != null) {
      // NOTE(review): read as plain "href", unlike the "abs:href" used for the rows above.
      String nextHref = nextLink.attr("href");
      boolean limitReached = nextHref.contains("page_nr=" + pageLimit);
      if (!limitReached) {
        try {
          urls.add(Utils.stringToURL(nextHref));
        } catch (ConnectionException ignored) {
          // Unusable link: skip it.
        }
      }
    }

    logger.debug("Collected " + urls.size() + " urls to visit");
    return urls;
  }
Example #25
0
  /**
   * Serializes the design and verifies that its head contains exactly one
   * {@code package-mapping} meta element, mapping prefix "my" to "com.addon.mypackage".
   */
  @Test
  public void designIsSerializedWithCorrectPrefixesAndPackageNames() throws IOException {
    ByteArrayOutputStream out = serializeDesign(ctx);

    // Expected mapping from prefixes to package names, in document order.
    final String[] expectedPrefixes = {"my"};
    final String[] expectedPackageNames = {"com.addon.mypackage"};

    Document doc = Jsoup.parse(out.toString("UTF-8"));
    int mappingsSeen = 0;
    for (Node child : doc.head().childNodes()) {
      if (!"meta".equals(child.nodeName())) {
        continue;
      }
      if (!"package-mapping".equals(child.attributes().get("name"))) {
        continue;
      }
      // The content attribute has the form "<prefix>:<package>".
      String[] parts = child.attributes().get("content").split(":");
      assertEquals("Unexpected prefix.", expectedPrefixes[mappingsSeen], parts[0]);
      assertEquals("Unexpected package name.", expectedPackageNames[mappingsSeen], parts[1]);
      mappingsSeen++;
    }
    assertEquals("Unexpected number of prefix - package name pairs.", 1, mappingsSeen);
  }
  /**
   * Populates this entry from one table row.
   *
   * <p>The show name is derived from the {@code data-sc-params} attribute in the first cell,
   * seeds and peers come from the fifth and sixth cells, and the download link is the torrent
   * anchor's href without its query string, with ".torrent" replaced by "/temp.torrent" and a
   * leading "//" completed with "http:".
   *
   * @param source the row element; its {@code td} descendants are read by position
   */
  @Override
  protected void initialize(Element source) {
    Elements cells = source.getElementsByTag("td");
    Element firstCell = cells.get(0);

    // Strip the wrapper around the name, turn %20 into dots and cut from the first %5B.
    String rawParams = firstCell.select("[data-sc-params]").get(0).attr("data-sc-params");
    String name =
        rawParams
            .replaceAll("\\{ 'name': '", "")
            .replaceAll("', 'magnet':.*", "")
            .replaceAll("%20", "\\.")
            .replaceAll("%5B.*", "");
    initialize(ShowData.fromFilename(name));

    seeds = Integer.parseInt(cells.get(4).text());
    peers = Integer.parseInt(cells.get(5).text());

    Element torrentAnchor = firstCell.select("div a[title=Download torrent file]").get(0);
    String hrefWithoutQuery = torrentAnchor.attr("href").split("\\?")[0];
    String link = hrefWithoutQuery.replaceAll("\\.torrent", "/temp\\.torrent");
    downloadLink = link.startsWith("//") ? "http:" + link : link;
  }
Example #27
0
  /**
   * Parses the given HTML and removes every element matched by a fixed list of selectors
   * ({@code div.BAIDU_CLB_AD}, {@code ul.p_mtail}, {@code ul.p_props_tail},
   * {@code div.thread_recommend}, {@code div.j_lzl_container}).
   *
   * @param pcont the HTML content; may be {@code null}
   * @return the HTML of the cleaned document, or an empty string when {@code pcont} is null
   */
  public String reviseContForTieba(String pcont) {
    if (pcont == null) {
      return "";
    }

    final String[] unwantedSelectors = {
      "div.BAIDU_CLB_AD",
      "ul.p_mtail",
      "ul.p_props_tail",
      "div.thread_recommend",
      "div.j_lzl_container"
    };

    Document parsed = Jsoup.parse(pcont);
    // Selectors are applied one after another, each against the already-pruned document.
    for (String selector : unwantedSelectors) {
      parsed.select(selector).remove();
    }
    return parsed.html();
  }
  /**
   * Collects the retail-button link of every product-compare block; does not paginate.
   *
   * <p>Blocks are selected with {@code div#productView > div.productCompare}. Blocks without an
   * {@code a[href].buttonRetail} anchor, and links that {@code Utils.stringToURL} rejects, are
   * skipped.
   *
   * @param document the product page
   * @return the URLs to visit next, possibly empty
   */
  @Override
  public List<URL> getNextPages(Document document) {
    List<URL> urls = new ArrayList<>();

    for (Element element : document.select("div#productView > div.productCompare")) {
      Element retailButton = element.select("a[href].buttonRetail").first();
      if (retailButton == null) {
        // first() returns null when the block has no retail button; skip instead of failing.
        continue;
      }
      String href = retailButton.attr("abs:href");
      try {
        urls.add(Utils.stringToURL(href));
      } catch (ConnectionException ignored) {
        // Unusable link: skip it.
      }
    }

    return urls;
  }
Example #29
0
  /**
   * Downloads the menu page for {@code number} and extracts its meals.
   *
   * <p>For every {@code td} with {@code width=400}, the parent's first {@code div[id=ssilka]}
   * supplies the description and its last one the cost, taken as the digits before the first
   * '-'. Rows without those divs or without a '-' in the cost text are skipped.
   *
   * @param number value formatted into the {@code URL} template
   * @return the parsed meals; empty when the download fails or nothing matches
   */
  public List<MenuMeal> getMenuMeals(int number) {
    Document doc = null;
    List<MenuMeal> meals = new ArrayList<>();

    try {
      doc =
          Jsoup.connect(String.format(URL, number))
              .userAgent("Chrome/49.0.2623.112")
              .referrer("https://www.google.ru/")
              .timeout(7000)
              .get();
    } catch (IOException e) {
      e.printStackTrace();
    }
    if (doc == null) {
      return meals;
    }

    // The attribute selector was missing its closing bracket ("td[width=400").
    Elements elements = doc.select("td[width=400]");

    for (Element element : elements) {
      Elements cells = element.parent().select("div[id=ssilka]");
      if (cells.isEmpty()) {
        continue;
      }

      String cost = cells.last().text();
      int dashAt = cost.indexOf("-");
      if (dashAt < 0) {
        // No separator to cut the price at; substring(0, -1) would throw.
        continue;
      }

      MenuMeal menuMeal = new MenuMeal();
      menuMeal.setDescription(cells.first().text());
      menuMeal.setCost(Integer.valueOf(cost.substring(0, dashAt)));
      meals.add(menuMeal);
    }
    return meals;
  }
Example #30
0
  /**
   * Records {@code URL} as visited and recursively processes the links of the fetched page.
   *
   * <p>Returns immediately when {@code URL} is already present in the {@code Record} table.
   * Otherwise the URL is inserted, a page is downloaded, {@code URL} is printed if the page text
   * contains "PhD", and every link whose href contains "mit.edu" is processed in turn.
   *
   * @param URL the address to record
   * @throws SQLException if a database operation fails
   * @throws IOException if the page cannot be downloaded
   */
  public static void processPage(String URL) throws SQLException, IOException {
    // check if the given URL is already in database
    // Bound parameter instead of string concatenation: URL comes from scraped pages and must
    // never be spliced into the SQL text.
    boolean alreadyRecorded;
    try (PreparedStatement lookup =
        db.conn.prepareStatement("select * from Record where URL = ?")) {
      lookup.setString(1, URL);
      try (ResultSet rs = lookup.executeQuery()) {
        alreadyRecorded = rs.next();
      }
    }
    if (alreadyRecorded) {
      return;
    }

    // store the URL to database to avoid parsing again
    // Statements are closed before recursing so that open handles do not pile up per level.
    String sql = "INSERT INTO  test.Record " + "(URL) VALUES " + "(?);";
    try (PreparedStatement stmt =
        db.conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS)) {
      stmt.setString(1, URL);
      stmt.execute();
    }

    // get useful information
    // NOTE(review): the fetched address is hard-coded and ignores the URL argument, so every
    // call inspects the same page. Left unchanged because fetching URL instead would turn this
    // into an unbounded recursive crawl - confirm the intended behaviour before changing it.
    Document doc = Jsoup.connect("http://www.mit.edu/").get();

    if (doc.text().contains("PhD")) {
      System.out.println(URL);
    }

    // get all links and recursively call the processPage method
    Elements questions = doc.select("a[href]");
    for (Element link : questions) {
      if (link.attr("href").contains("mit.edu")) {
        processPage(link.attr("abs:href"));
      }
    }
  }