/**
 * Manual smoke test: loads a saved article page from a fixed local path,
 * runs it through {@link CifNewsParser#run(HtmlObject)} and prints the
 * extracted title, content and publish date to standard output.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the sample file cannot be read or parsing fails
 */
public static void main(String[] args) throws Exception {
    HtmlObject htmlObject = new HtmlObject();

    StringBuilder html = new StringBuilder();
    // try-with-resources guarantees the file handle is released even when reading fails.
    // NOTE(review): FileReader decodes with the platform default charset — confirm the
    // sample file's encoding matches it.
    try (BufferedReader reader = new BufferedReader(new FileReader("/home/chen/example/cifNews.html"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // readLine() drops the line terminator; restore it so text on adjacent
            // source lines is not glued together before the HTML is parsed.
            html.append(line).append('\n');
        }
    }

    htmlObject.setHtml(html.toString());
    htmlObject.setUrl("http://www.cifnews.com/Article/14175");

    CifNewsParser parser = new CifNewsParser();
    Article article = parser.run(htmlObject);

    System.out.println("title:" + article.getTitle());
    System.out.println("content:" + article.getContent());
    System.out.println("date:" + article.getPublishDate());
}
@Override public Article run(HtmlObject htmlObject) { String html = htmlObject.getHtml(); Document doc = Jsoup.parse(html); String title = doc.select(".article h1").text(); Elements contentElement = doc.select(".article_con"); String content = ""; String contentHtml = ""; if (contentElement != null) { // contentElement.select(".author").remove(); content = contentElement.text(); contentHtml = contentElement.html(); } String Ele_data = doc.select(".article h2").text(); Matcher m1 = datePattern.matcher(Ele_data); String date = ""; if (m1.find()) { date = m1.group(1); } else { Date today = new Date(); SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); date = formatter.format(today); } Article model1 = new Article(); model1.setUrl(htmlObject.getUrl()); model1.setTitle(title); model1.setContent(content); model1.setPublishDate(date); model1.setArticleType(ArticleType.News); model1.setProvider("雨果网"); return model1; }