/**
 * Builds an {@link Article} from a fetched page.
 *
 * <p>The title is read from {@code .article h1}, the body text from {@code .article_con}, and
 * the publish date from the first capture group of {@code datePattern} applied to the text of
 * {@code .article h2}. When no date is matched, today's date formatted as {@code yyyy-MM-dd}
 * is used instead.
 *
 * @param htmlObject source of the raw HTML and the page URL
 * @return the populated article, typed as {@code ArticleType.News}
 */
@Override
  public Article run(HtmlObject htmlObject) {
    Document doc = Jsoup.parse(htmlObject.getHtml());
    String title = doc.select(".article h1").text();
    // select() yields an empty Elements (never null) when nothing matches, and text() on an
    // empty selection is "", so no null guard is needed here.
    String content = doc.select(".article_con").text();

    String dateLine = doc.select(".article h2").text();
    Matcher dateMatcher = datePattern.matcher(dateLine);
    String date;
    if (dateMatcher.find()) {
      date = dateMatcher.group(1);
    } else {
      // No date on the page: fall back to the current day.
      date = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
    }

    Article article = new Article();
    article.setUrl(htmlObject.getUrl());
    article.setTitle(title);
    article.setContent(content);
    article.setPublishDate(date);
    article.setArticleType(ArticleType.News);
    article.setProvider("雨果网");
    return article;
  }
Ejemplo n.º 2
0
 /**
  * Parses {@code inputContent} as HTML and returns the trimmed inner HTML of the elements
  * matched by {@code cssSelector}.
  *
  * @param inputContent HTML text to parse
  * @return the trimmed inner HTML of the matches, or {@code null} when nothing matches
  * @throws Exception propagated from {@code validate()}
  */
 @Override
 public String fire(String inputContent) throws Exception {
   validate();
   Elements matched = Jsoup.parse(inputContent).select(cssSelector);
   if (matched == null || matched.size() <= 0) {
     return null;
   }
   return matched.html().trim();
 }
Ejemplo n.º 3
0
  /**
   * Copies the inner HTML of the document's {@code pre} elements into {@code homework.content}.
   *
   * @param document parsed page to read from
   * @param homework target whose {@code content} field is overwritten
   * @throws Exception with message {@code Exceptions.PARSE_FAIL} when the document has no
   *     {@code pre} element
   */
  public void parseHomeworkContent(Document document, Homework homework) throws Exception {
    Elements preBlocks = document.select("pre");
    if (preBlocks.size() == 0) {
      throw new Exception(Exceptions.PARSE_FAIL);
    }

    homework.content = preBlocks.html();
  }
Ejemplo n.º 4
0
 /**
  * Collects every match of {@code regex} found in the inner HTML of the current
  * {@code elements}.
  *
  * <p>Before compilation the expression is passed through {@code replaceBracketPlaceHolder}
  * and then {@code replaceEqualPlaceHolder}.
  *
  * @param regex expression, possibly containing placeholders
  * @return the matched substrings in order of occurrence; empty when there is no match
  */
 private List<String> parseRegex(String regex) {
   String expanded = replaceBracketPlaceHolder(regex);
   expanded = replaceEqualPlaceHolder(expanded);

   List<String> matches = new ArrayList<String>();
   Matcher matcher = Pattern.compile(expanded).matcher(elements.html());
   for (boolean found = matcher.find(); found; found = matcher.find()) {
     matches.add(matcher.group());
   }
   return matches;
 }
Ejemplo n.º 5
0
  /**
   * Fetches listing pages 1 through 46 and prints, for each page, the combined inner HTML of
   * the matched title elements followed by the text of each title on its own line.
   *
   * @param args unused
   * @throws IOException if a page cannot be fetched
   */
  public static void main(String[] args) throws IOException {
    final int lastPage = 46;
    int page = 1;
    while (page <= lastPage) {
      String pageUrl = "http://www.importnew.com/all-posts/page/" + page;
      Elements titles =
          Jsoup.connect(pageUrl)
              .get()
              .select(".post.floated-thumb > .post-meta > p > .meta-title");
      System.out.println(titles.html());
      for (Element title : titles) {
        System.out.println(title.text());
      }
      page++;
    }
  }
Ejemplo n.º 6
0
 /**
  * Builds attribute records from the inner HTML of {@code elements}.
  *
  * <p>The HTML is split on newlines and consumed two lines at a time; each pair is passed to
  * the {@code GoodAttribute} constructor as its third and fourth arguments. A trailing line
  * without a partner is skipped (previously it caused an
  * {@code ArrayIndexOutOfBoundsException}).
  *
  * @param product_id identifier passed through to every created attribute
  * @param attribute_group group passed through to every created attribute
  * @param elements selection whose inner HTML holds the lines to pair up
  * @return the created attributes; empty when fewer than two lines are present
  */
 public static ArrayList<GoodAttribute> createGoodAttributeList(
     String product_id, String attribute_group, Elements elements) {
   ArrayList<GoodAttribute> result = new ArrayList<>();
   String[] attributes = elements.html().split("\\n");
   // Only complete pairs are consumed: the bound i + 1 < length keeps attributes[i + 1]
   // in range even when the number of lines is odd.
   for (int i = 0; i + 1 < attributes.length; i += 2) {
     result.add(
         new GoodAttribute(product_id, attribute_group, attributes[i], attributes[i + 1]));
   }
   return result;
 }
Ejemplo n.º 7
0
 /**
  * Parses {@code inputContent} as HTML and applies the CSS selectors in {@code args} one after
  * another, each narrowing the result of the previous one.
  *
  * @param inputContent HTML text to parse
  * @param args selectors to apply in order; the first runs against the whole document
  * @return the trimmed inner HTML of the final selection, or {@code null} when no selector was
  *     given or nothing matched
  * @throws Exception propagated from {@code validateCSSSelectorRuleArgs}
  */
 @Override
 public String fire(String inputContent, String[] args) throws Exception {
   validateCSSSelectorRuleArgs(args);
   Document document = Jsoup.parse(inputContent);

   Elements selected = null;
   if (args.length > 0) {
     // The first selector is applied to the document, the rest to the running selection.
     selected = document.select(args[0]);
     for (int idx = 1; idx < args.length && selected != null; idx++) {
       selected = selected.select(args[idx]);
     }
   }

   if (selected == null || selected.size() <= 0) {
     return null;
   }
   return selected.html().trim();
 }
Ejemplo n.º 8
0
  /**
   * Rewrites the {@code cfu} rule of a simservs document and returns the result as an XML
   * string.
   *
   * <p>The rule's existing {@code cp:conditions} element is removed and replaced by either an
   * empty conditions element (activate) or one containing {@code ss:rule-deactivated}
   * (deactivate). The content of the rule's forward-to target is set to {@code target}.
   *
   * @param isActivate {@code true} to activate the rule, {@code false} to deactivate it
   * @param target new inner HTML for the {@code ss:forward-to > ss:target} element
   * @param ectXml source document text
   * @return an XML declaration followed by the serialized {@code ss:simservs} element(s), with
   *     whitespace around element text stripped
   */
  public static String updateAFGXml(boolean isActivate, String target, String ectXml) {
    String conditionStr;
    if (isActivate) {
      conditionStr = "<cp:conditions/>";
    } else {
      conditionStr = "<cp:conditions><ss:rule-deactivated/></cp:conditions>";
    }

    // NOTE(review): in jsoup's two-String parse overload the second argument is the base URI,
    // not a charset - confirm that passing "UTF-8" here is intentional.
    Document doc = Jsoup.parse(ectXml, "UTF-8");
    Elements cfuRule = doc.select("cp|rule[id=cfu] ");

    // The old conditions element is dropped and a fresh one prepended rather than edited in
    // place (the original author noted it could not be changed directly).
    cfuRule.select("cp|conditions").remove();
    cfuRule.prepend(conditionStr);

    cfuRule.select("ss|forward-to>ss|target").html(target);

    String xml =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
            + doc.getElementsByTag("ss:simservs").outerHtml();

    // Restore the mixed-case element name (original comment: "modify for jsoup problem").
    xml = xml.replaceAll("noreplytimer", "NoReplyTimer");
    // Strip whitespace between a tag and the text it encloses.
    return xml.replaceAll(">\\s+(.+)\\s+<", ">$1<");
  }
  /**
   * Mails an HTML summary report as the message body together with a zip attachment.
   *
   * <p>The body is made of two parts: a small HTML document rebuilt from the report's
   * {@code head} and its {@code footer} element, followed by the raw report lines with
   * everything from the footer table onwards cut off. Recipients, credentials and the sender
   * address are read from {@code configProps} (keys {@code To}, {@code CC}, {@code UserName},
   * {@code Password}, {@code From}). Messaging failures are printed and not rethrown.
   *
   * @param filePath currently unused, see the review note on the hard-coded path below
   * @param subject subject line of the mail
   * @param AttachmentSource path of the file attached as {@code QuickflixReports.zip}
   * @throws Throwable if the report cannot be read or parsed
   */
  public static void sendReportAsMailBody(String filePath, String subject, String AttachmentSource)
      throws Throwable {

    // NOTE(review): the report location is hard-coded and the filePath parameter is ignored;
    // this looks like leftover debugging - confirm before switching to new File(filePath).
    File fis =
        new File(
            "C:\\SVN\\12 QuickflixProject\\QuickflixProject\\Results\\Chrome\\"
                + "Chrome_2015-03-25_15_54_34_292\\SummaryResults_2015-03-25_15_54_34_292.html");

    // Rebuild a stand-alone footer document from the parsed report.
    Document doc = Jsoup.parse(fis, "UTF-8", "http://example.com/");
    Element newsHeadlines = doc.getElementById("footer");
    Elements header = doc.select("head");
    // A report without a footer element yields an empty footer table instead of a
    // NullPointerException.
    String footerRows = newsHeadlines != null ? newsHeadlines.html() : "";
    String footerHtml =
        "<html>"
            + header.html()
            + "<body> <table id='footer'>"
            + footerRows
            + "</table></body></html>";
    Document newFooter = Jsoup.parse(footerHtml);

    // These removals only touch the in-memory document, which is not read again below; they
    // are kept for parity and guarded so that a missing element is not fatal.
    Element remLogos = doc.getElementById("Logos");
    if (remLogos != null) {
      remLogos.remove();
    }
    Element remFooter = doc.getElementById("footer");
    if (remFooter != null) {
      remFooter.remove();
    }

    // Copy the raw report, keeping only the part of each line that precedes the footer table.
    // try-with-resources closes the reader even when reading fails.
    StringBuilder reportBody = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(new FileReader(fis))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] parts = line.split("(?i)(<table id='footer'.*?>)(.+?)(</tfoot>)");
        // split() returns an empty array when the whole line is the delimiter; in that case
        // there is nothing to keep.
        if (parts.length > 0) {
          reportBody.append(parts[0]);
        }
      }
    }

    // Recipient's email ID needs to be mentioned.
    String[] toMailerList = configProps.getProperty("To").split(",");
    System.out.println(configProps.getProperty("To"));
    String[] ccMailerList = configProps.getProperty("CC").split(",");
    System.out.println(configProps.getProperty("CC"));
    final String username = configProps.getProperty("UserName");
    final String password = configProps.getProperty("Password");
    String from = configProps.getProperty("From");

    Properties props = new Properties();
    props.put("mail.smtp.auth", "true");
    props.put("mail.smtp.starttls.enable", "true");
    props.put("mail.smtp.debug", "true");
    props.put("mail.smtp.socketFactory.class", "javax.net.ssl.SSLSocketFactory");
    props.put("mail.smtp.socketFactory.fallback", "false");
    props.put("mail.smtp.host", "smtp.office365.com");
    // NOTE(review): 995 is conventionally the POP3-over-SSL port - confirm that this is the
    // intended SMTP port for the configured host.
    props.put("mail.smtp.port", "995");
    Session session =
        Session.getInstance(
            props,
            new javax.mail.Authenticator() {
              protected PasswordAuthentication getPasswordAuthentication() {
                return new PasswordAuthentication(username, password);
              }
            });

    try {
      // Create a default MimeMessage object.
      MimeMessage message = new MimeMessage(session);

      // Set From: header field of the header.
      message.setFrom(new InternetAddress(from));

      InternetAddress[] addressTo = new InternetAddress[toMailerList.length];
      for (int i = 0; i < toMailerList.length; i++) {
        addressTo[i] = new InternetAddress(toMailerList[i]);
      }
      message.setRecipients(javax.mail.Message.RecipientType.TO, addressTo);

      InternetAddress[] addressCC = new InternetAddress[ccMailerList.length];
      for (int i = 0; i < ccMailerList.length; i++) {
        addressCC[i] = new InternetAddress(ccMailerList[i]);
      }
      message.setRecipients(javax.mail.Message.RecipientType.CC, addressCC);

      // Set Subject: header field
      message.setSubject(subject);

      // Create the message part
      BodyPart messageBodyPart = new MimeBodyPart();
      // Create the attachment part
      BodyPart AttachmentBodyPart = new MimeBodyPart();
      DataSource source = new FileDataSource(AttachmentSource);
      AttachmentBodyPart.setDataHandler(new DataHandler(source));
      AttachmentBodyPart.setFileName("QuickflixReports.zip");
      // Fill the message: rebuilt footer first, then the truncated report.
      messageBodyPart.setContent(
          newFooter.html() + reportBody.toString(), "text/html; charset=utf-8");
      // Create a multipart message
      Multipart multipart = new MimeMultipart();
      // Set Attachment part
      multipart.addBodyPart(AttachmentBodyPart);
      // Set text message part
      multipart.addBodyPart(messageBodyPart);
      message.setContent(multipart);
      // Send message
      Transport.send(message);
      System.out.println("Sent message successfully....");
    } catch (MessagingException mex) {
      mex.printStackTrace();
    }
  }
Ejemplo n.º 10
0
 /**
  * Evaluates one result query against the current {@code elements}.
  *
  * <ul>
  *   <li>Queries starting with {@code REPLACE_TAG} are delegated to {@code parseReplace}.
  *   <li>Queries starting with {@code RESULT_TAG} produce, depending on the extracted value:
  *       the inner HTML ({@code html}), the text ({@code text}), an attribute value or list of
  *       non-empty attribute values ({@code attr=name}; {@code src} and {@code href} are read
  *       through the {@code abs:} prefix), or the regex matches ({@code regex=...}). Any other
  *       value falls back to the text.
  *   <li>Anything else is run through {@code simpleSelect} and the resulting {@code elements}
  *       are returned.
  * </ul>
  *
  * @param resultQuery the query to evaluate
  * @return a {@code String}, a {@code List<String>}, or the {@code elements} themselves
  */
 private Object getResult(String resultQuery) {
   if (resultQuery.startsWith(REPLACE_TAG)) {
     return parseReplace(resultQuery);
   }

   if (!resultQuery.startsWith(RESULT_TAG)) {
     simpleSelect(resultQuery);
     outputNodeInfo();
     return elements;
   }

   String value = getValue(resultQuery, RESULT_TAG);

   if ("html".equalsIgnoreCase(value)) {
     outputNodeInfo();
     return elements.html();
   }

   if ("text".equalsIgnoreCase(value)) {
     outputNodeInfo();
     return elements.text();
   }

   if (value.startsWith("attr") && isUniqueValue(value, "=")) {
     outputNodeInfo();
     String attrName = value.split("=")[1];
     // src and href are read with the "abs:" prefix; every other attribute is read as named.
     boolean isLinkAttr = "src".equals(attrName) || "href".equals(attrName);
     String attrKey = isLinkAttr ? "abs:" + attrName : attrName;

     // A single element yields its attribute directly, even when it is empty.
     if (elements.size() == 1) {
       return elements.first().attr(attrKey);
     }

     List<String> attrValues = new ArrayList<String>();
     for (Element ele : elements) {
       String attrValue = ele.attr(attrKey);
       if (!StringUtils.isEmpty(attrValue)) {
         attrValues.add(attrValue);
       }
     }
     return attrValues;
   }

   if (value.startsWith("regex") && isUniqueValue(value, "=")) {
     outputNodeInfo();
     return parseRegex(value.split("=")[1]);
   }

   outputNodeInfo();
   return elements.text();
 }
  /**
   * Extracts listing details from a parsed advert page into a {@link SiteData}.
   *
   * <p>Reads the title, poster name, contact numbers, location, post date, price, description
   * and image URLs via CSS selectors. The post date is parsed from text such as
   * {@code "6 Oct 4:27 pm"} and assigned the current year; an unparseable date or price leaves
   * the corresponding field unset.
   *
   * @param doc parsed advert page
   * @return the populated model
   * @throws HttpStatusException declared by the signature; not thrown by the visible code
   * @throws IndexOutOfBoundsException if the page has no title or no poster name element
   */
  public SiteData getCrawlerModel(Document doc) throws HttpStatusException {
    SiteData siteData = new SiteData();

    // Title and contact name are mandatory: get(0) fails when the element is missing.
    siteData.setTitle(doc.select("h1[itemprop=name]").get(0).html());
    siteData.setOwnerName(doc.select(".poster span").get(0).html());

    // Contact numbers
    Elements contactNumbers = doc.select(".item-contact-more.is-showable ul li span");
    siteData.setTpNumbers(new ArrayList<String>());
    for (Element contactNumberElm : contactNumbers) {
      siteData.getTpNumbers().add(contactNumberElm.html());
    }

    // Location
    siteData.setLocation("Sri lanka, " + doc.select(".location").html());

    // Date, e.g. "6 Oct 4:27 pm"
    String date = doc.select(".date").html();
    siteData.setPostDateTimeS(date);
    // "hh" (1-12) is required for the am/pm marker to take effect; with "HH" the marker is
    // ignored and afternoon times come out twelve hours early. The locale is fixed to
    // English so that month names parse regardless of the platform default.
    DateFormat formatter = new SimpleDateFormat("dd MMM hh:mm a", java.util.Locale.ENGLISH);
    try {
      Date postDate = (Date) formatter.parse(date);
      // The page omits the year, so the current one is assumed.
      postDate.setYear(Calendar.getInstance().getTime().getYear());
      siteData.setPostDateTime(postDate);
    } catch (ParseException ignored) {
      // Best effort: the raw date string is still available via setPostDateTimeS above.
    }

    // Price: thousands separators are stripped before parsing.
    String amount = doc.select(".amount").html().replace(",", "");
    try {
      siteData.setPrice(Double.parseDouble(amount));
    } catch (Exception ignored) {
      // Best effort: a missing or non-numeric amount leaves the price unset.
    }

    // Description
    siteData.setContent(doc.select("div[itemprop=description]").text());

    // Images
    List<String> images = new ArrayList<String>();
    for (Element imgElm : doc.select("img[data-srcset]")) {
      String[] rowImages = imgElm.attr("data-srcset").replace("//", "").split(",");
      // Only the first srcset candidate is examined, as before.
      // NOTE(review): the original loop broke unconditionally after its first iteration -
      // confirm that later candidates are meant to be skipped.
      if (rowImages.length == 0) {
        continue;
      }
      String rowImage = rowImages[0];
      if (rowImage.contains("i.ikman-st.com") && rowImage.contains("fitted")) {
        images.add("http://" + rowImage.replace("1x", "").trim());
      }
    }
    siteData.setImages(images);
    return siteData;
  }