@Override public Article run(HtmlObject htmlObject) { String html = htmlObject.getHtml(); Document doc = Jsoup.parse(html); String title = doc.select(".article h1").text(); Elements contentElement = doc.select(".article_con"); String content = ""; String contentHtml = ""; if (contentElement != null) { // contentElement.select(".author").remove(); content = contentElement.text(); contentHtml = contentElement.html(); } String Ele_data = doc.select(".article h2").text(); Matcher m1 = datePattern.matcher(Ele_data); String date = ""; if (m1.find()) { date = m1.group(1); } else { Date today = new Date(); SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); date = formatter.format(today); } Article model1 = new Article(); model1.setUrl(htmlObject.getUrl()); model1.setTitle(title); model1.setContent(content); model1.setPublishDate(date); model1.setArticleType(ArticleType.News); model1.setProvider("雨果网"); return model1; }
/**
 * Applies the configured CSS selector to the given HTML.
 *
 * @param inputContent HTML text to parse
 * @return the trimmed inner HTML of the selection, or {@code null} when nothing matches
 * @throws Exception propagated from {@code validate()}
 */
@Override
public String fire(String inputContent) throws Exception {
  validate();

  Elements matched = Jsoup.parse(inputContent).select(cssSelector);
  if (matched == null || matched.size() <= 0) {
    return null;
  }
  return matched.html().trim();
}
/**
 * Copies the inner HTML of the document's {@code pre} elements into {@code homework.content}.
 *
 * @param document parsed page
 * @param homework target whose {@code content} field is overwritten
 * @throws Exception with message {@code Exceptions.PARSE_FAIL} when the page has no {@code pre}
 */
public void parseHomeworkContent(Document document, Homework homework) throws Exception {
  Elements preBlocks = document.select("pre");
  if (preBlocks.isEmpty()) {
    throw new Exception(Exceptions.PARSE_FAIL);
  }
  homework.content = preBlocks.html();
}
/**
 * Collects every match of a regular expression within the HTML of the current selection.
 *
 * <p>The expression is first passed through {@code replaceBracketPlaceHolder} and then
 * {@code replaceEqualPlaceHolder} before being compiled.
 *
 * @param regex expression, possibly containing placeholders
 * @return all full matches, in order of occurrence; empty when there are none
 */
private List<String> parseRegex(String regex) {
  String expanded = replaceEqualPlaceHolder(replaceBracketPlaceHolder(regex));
  List<String> matches = new ArrayList<String>();
  Matcher matcher = Pattern.compile(expanded).matcher(elements.html());
  for (boolean found = matcher.find(); found; found = matcher.find()) {
    matches.add(matcher.group());
  }
  return matches;
}
/**
 * Fetches listing pages 1 through 46 and prints, for each page, the combined inner HTML of the
 * post title elements followed by the text of every title on its own line.
 *
 * @param args unused
 * @throws IOException when a page cannot be fetched
 */
public static void main(String[] args) throws IOException {
  int page = 1;
  while (page <= 46) {
    String pageUrl = "http://www.importnew.com/all-posts/page/" + page;
    Elements titles =
        Jsoup.connect(pageUrl).get().select(".post.floated-thumb > .post-meta > p > .meta-title");

    System.out.println(titles.html());
    for (Element title : titles) {
      System.out.println(title.text());
    }
    page++;
  }
}
/**
 * Builds attribute records from the inner HTML of a selection.
 *
 * <p>The HTML is split on newlines and consecutive lines are consumed in pairs; each pair is
 * passed as the last two constructor arguments of {@link GoodAttribute}. A trailing line without
 * a partner is ignored (previously it caused an {@code ArrayIndexOutOfBoundsException}).
 *
 * @param product_id identifier passed through to every record
 * @param attribute_group group label passed through to every record
 * @param elements selection whose inner HTML is parsed
 * @return the records in document order; empty when fewer than two lines are present
 */
public static ArrayList<GoodAttribute> createGoodAttributeList(
    String product_id, String attribute_group, Elements elements) {
  ArrayList<GoodAttribute> result = new ArrayList<>();
  String[] lines = elements.html().split("\\n");
  // Stop as soon as a full pair is no longer available so an odd line count cannot overrun.
  for (int i = 0; i + 1 < lines.length; i += 2) {
    result.add(new GoodAttribute(product_id, attribute_group, lines[i], lines[i + 1]));
  }
  return result;
}
/**
 * Applies a chain of CSS selectors to the given HTML, each one narrowing the previous result.
 *
 * @param inputContent HTML text to parse
 * @param args selectors, applied in order; the first runs against the whole document
 * @return the trimmed inner HTML of the final selection, or {@code null} when it is empty or no
 *     selector was supplied
 * @throws Exception propagated from {@code validateCSSSelectorRuleArgs}
 */
@Override
public String fire(String inputContent, String[] args) throws Exception {
  validateCSSSelectorRuleArgs(args);
  Document document = Jsoup.parse(inputContent);

  Elements current = null;
  if (args.length > 0) {
    current = document.select(args[0]);
    int index = 1;
    while (index < args.length && current != null) {
      current = current.select(args[index]);
      index++;
    }
  }

  if (current == null || current.size() <= 0) {
    return null;
  }
  return current.html().trim();
}
public static String updateAFGXml(boolean isActivate, String target, String ectXml) { String conditionStr = isActivate ? "<cp:conditions/>" : "<cp:conditions><ss:rule-deactivated/></cp:conditions>"; Document doc = Jsoup.parse(ectXml, "UTF-8"); Elements ruleAudio = doc.select("cp|rule[id=cfu] "); Elements ruleAudioCondition = ruleAudio.select("cp|conditions"); ruleAudioCondition.remove(); // we cant change it to "<cp:conditions/> directly ruleAudio.prepend(conditionStr); Elements ruleAudioForwardTarget = ruleAudio.select("ss|forward-to>ss|target"); ruleAudioForwardTarget.html(target); String r = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"; r += doc.getElementsByTag("ss:simservs").outerHtml(); // modify for jsoup problem r = r.replaceAll("noreplytimer", "NoReplyTimer"); // r= r.replaceAll("\n", ""); r = r.replaceAll(">\\s+(.+)\\s+<", ">$1<"); return r; }
public static void sendReportAsMailBody(String filePath, String subject, String AttachmentSource) throws Throwable { File fis = new File( "C:\\SVN\\12 QuickflixProject\\QuickflixProject\\Results\\Chrome\\" + "Chrome_2015-03-25_15_54_34_292\\SummaryResults_2015-03-25_15_54_34_292.html"); // File fis = new File(filePath); FileReader fileReader = new FileReader(fis); // jsoup code Document doc = Jsoup.parse(fis, "UTF-8", "http://example.com/"); Element newsHeadlines = doc.getElementById("footer"); Elements header = doc.select("head"); String jsonStringBuffer = "<html>" + header.html() + "<body> <table id='footer'>" + newsHeadlines.html() + "</table></body></html>"; Document newFooter = Jsoup.parse(jsonStringBuffer); Element remLogos = doc.getElementById("Logos"); remLogos.remove(); Element remFooter = doc.getElementById("footer"); remFooter.remove(); // ********** BufferedReader reader = new BufferedReader(fileReader); StringBuffer stringBuffer = new StringBuffer(); String line; while ((line = reader.readLine()) != null) { // String newLine[] = line.split("(?<=footer).*?(?=</tfoot>)"); String newLine[] = line.split("(?i)(<table id='footer'.*?>)(.+?)(</tfoot>)"); // System.out.println(newLine[0]); /*String logoLess[]=newLine[0].split("(?i)(<table id='Logos'.*?>)(.+?)(main)"); System.out.println(logoLess[0]);*/ stringBuffer.append(newLine[0]); } // stringBuffer.append(stbfooter.toString()); /*System.out.println("Contents of file:"); System.out.println(stringBuffer.toString());*/ fileReader.close(); // Recipient's email ID needs to be mentioned. 
String[] toMailerList = configProps.getProperty("To").split(","); System.out.println(configProps.getProperty("To")); String[] ccMailerList = configProps.getProperty("CC").split(","); System.out.println(configProps.getProperty("CC")); final String username = configProps.getProperty("UserName"); final String password = configProps.getProperty("Password"); String from = configProps.getProperty("From"); /*Properties props = new Properties(); props.put("mail.smtp.host", "mail.quickflix.com.au"); props.put("mail.smtp.port", "25");*/ Properties props = new Properties(); props.put("mail.smtp.auth", "true"); props.put("mail.smtp.starttls.enable", "true"); props.put("mail.smtp.debug", "true"); props.put("mail.smtp.auth", "true"); props.put("mail.smtp.socketFactory.class", "javax.net.ssl.SSLSocketFactory"); props.put("mail.smtp.starttls.enable", "true"); props.put("mail.smtp.socketFactory.fallback", "false"); /*props.put("mail.smtp.host", "smtp.gmail.com"); props.put("mail.smtp.port", "465");*/ props.put("mail.smtp.host", "smtp.office365.com"); props.put("mail.smtp.port", "995"); Session session = Session.getInstance( props, new javax.mail.Authenticator() { protected PasswordAuthentication getPasswordAuthentication() { return new PasswordAuthentication(username, password); } }); // Session session = Session.getInstance(props); try { // Create a default MimeMessage object. MimeMessage message = new MimeMessage(session); // Set From: header field of the header. 
message.setFrom(new InternetAddress(from)); javax.mail.internet.InternetAddress[] addressTo = new javax.mail.internet.InternetAddress[toMailerList.length]; for (int i = 0; i < toMailerList.length; i++) { addressTo[i] = new javax.mail.internet.InternetAddress(toMailerList[i]); } message.setRecipients(javax.mail.Message.RecipientType.TO, addressTo); javax.mail.internet.InternetAddress[] addressCC = new javax.mail.internet.InternetAddress[ccMailerList.length]; for (int i = 0; i < ccMailerList.length; i++) { addressCC[i] = new javax.mail.internet.InternetAddress(ccMailerList[i]); } message.setRecipients(javax.mail.Message.RecipientType.CC, addressCC); // Set Subject: header field message.setSubject(subject); // Create the message part BodyPart messageBodyPart = new MimeBodyPart(); // Create the attachement part BodyPart AttachmentBodyPart = new MimeBodyPart(); DataSource source = new FileDataSource(AttachmentSource); AttachmentBodyPart.setDataHandler(new DataHandler(source)); AttachmentBodyPart.setFileName("QuickflixReports.zip"); // Fill the message messageBodyPart.setContent( newFooter.html() + stringBuffer.toString(), "text/html; charset=utf-8"); // Create a multipart message Multipart multipart = new MimeMultipart(); // Set Attachment part multipart.addBodyPart(AttachmentBodyPart); // Set text message part multipart.addBodyPart(messageBodyPart); message.setContent(multipart); // Send message Transport.send(message); System.out.println("Sent message successfully...."); } catch (MessagingException mex) { mex.printStackTrace(); } }
/**
 * Evaluates a result query against the current selection.
 *
 * <ul>
 *   <li>A query starting with {@code REPLACE_TAG} is delegated to {@code parseReplace}.
 *   <li>A query starting with {@code RESULT_TAG} carries a value extracted by {@code getValue}:
 *       {@code html} or {@code text} return the selection's HTML or text; {@code attr=NAME}
 *       returns the attribute of the single selected element, or a list of the non-empty
 *       attribute values when the selection does not hold exactly one element ({@code src} and
 *       {@code href} are read through the {@code abs:} prefix); {@code regex=EXPR} is delegated
 *       to {@code parseRegex}; anything else returns the selection's text.
 *   <li>Any other query is handed to {@code simpleSelect} and the selection itself is returned.
 * </ul>
 *
 * @param resultQuery the query to evaluate
 * @return a {@code String}, a {@code List<String>}, the selection, or whatever
 *     {@code parseReplace} returns
 */
private Object getResult(String resultQuery) {
  if (resultQuery.startsWith(REPLACE_TAG)) {
    return parseReplace(resultQuery);
  }
  if (!resultQuery.startsWith(RESULT_TAG)) {
    simpleSelect(resultQuery);
    outputNodeInfo();
    return elements;
  }

  String value = getValue(resultQuery, RESULT_TAG);

  if ("html".equalsIgnoreCase(value)) {
    outputNodeInfo();
    return elements.html();
  }
  if ("text".equalsIgnoreCase(value)) {
    outputNodeInfo();
    return elements.text();
  }

  if (value.startsWith("attr") && isUniqueValue(value, "=")) {
    outputNodeInfo();
    String requested = value.split("=")[1];
    // src and href are looked up with the abs: prefix; every other attribute is read as named.
    boolean isLinkAttribute = "src".equals(requested) || "href".equals(requested);
    String attributeKey = isLinkAttribute ? "abs:" + requested : requested;

    if (elements.size() == 1) {
      return elements.first().attr(attributeKey);
    }
    List<String> collected = new ArrayList<String>();
    for (Element element : elements) {
      String attributeValue = element.attr(attributeKey);
      if (!StringUtils.isEmpty(attributeValue)) {
        collected.add(attributeValue);
      }
    }
    return collected;
  }

  if (value.startsWith("regex") && isUniqueValue(value, "=")) {
    outputNodeInfo();
    return parseRegex(value.split("=")[1]);
  }

  outputNodeInfo();
  return elements.text();
}
public SiteData getCrawlerModel(Document doc) throws HttpStatusException { SiteData siteData = new SiteData(); // Title Elements newsHeadlines = doc.select("h1[itemprop=name]"); String title = newsHeadlines.get(0).html(); siteData.setTitle(title); // System.out.println("title " + title); // Contact Name Elements contactNameElm = doc.select(".poster span"); String contactName = contactNameElm.get(0).html(); siteData.setOwnerName(contactName); // System.out.println("contactName " + contactName); // Contact Number Elements contactNumbers = doc.select(".item-contact-more.is-showable ul li span"); siteData.setTpNumbers(new ArrayList<String>()); for (Element contactNumberElm : contactNumbers) { String contactNumber = contactNumberElm.html(); siteData.getTpNumbers().add(contactNumber); } // Location Elements locationElm = doc.select(".location"); String location = locationElm.html(); siteData.setLocation("Sri lanka, " + location); // Date Elements dateElm = doc.select(".date"); String date = dateElm.html(); siteData.setPostDateTimeS(date); // 6 Oct 4:27 pm"; DateFormat formatter = new SimpleDateFormat("dd MMM HH:mm a"); try { Date postDate = (Date) formatter.parse(date); postDate.setYear(Calendar.getInstance().getTime().getYear()); siteData.setPostDateTime(postDate); } catch (ParseException e) { // e.printStackTrace(); } Elements amountElement = doc.select(".amount"); String amount = amountElement.html(); if (amount != null) { amount = amount.replace(",", ""); } try { siteData.setPrice(Double.parseDouble(amount)); } catch (Exception e) { } Elements descriptionElm = doc.select("div[itemprop=description]"); String description = descriptionElm.text(); siteData.setContent(description); List<String> images = new ArrayList(); Elements imageElms = doc.select("img[data-srcset]"); L1: for (Element imgElm : imageElms) { String image = imgElm.attr("data-srcset"); String[] rowImages = image.replace("//", "").split(","); for (String rowImage : rowImages) { // 
System.out.println(rowImage); if (rowImage.contains("i.ikman-st.com") && rowImage.contains("fitted")) { String imageUrl = rowImage.replace("1x", "").trim(); images.add("http://" + imageUrl); } break; } } siteData.setImages(images); return siteData; }