public static List<RssUrlBean> getRssUrlBeanListFromPage(int rssCompo_id, String url) { List<RssUrlBean> rubList = new ArrayList<RssUrlBean>(); if (url.equals("")) return rubList; try { URL feedUrl = new URL(url); // SyndFeedInput:从远程读到xml结构的内容转成SyndFeedImpl实例 SyndFeedInput input = new SyndFeedInput(); // rome按SyndFeed类型生成rss和atom的实例, // SyndFeed是rss和atom实现类SyndFeedImpl的接口 SyndFeed syndFeed = input.build(new XmlReader(feedUrl)); List<SyndEntry> entryList = syndFeed.getEntries(); for (SyndEntry entry : entryList) { RssUrlBean rub = new RssUrlBean(); rub.setRssCompo_id(rssCompo_id); rub.setTitle(entry.getTitle()); rub.setLink(entry.getUri()); rub.setPublishedDate(CommonUtil.getStandardDate(entry.getPublishedDate().toLocaleString())); rub.setDescription(entry.getDescription().getValue()); if (entry.getUpdatedDate() != null) rub.setUpdatedDate(CommonUtil.getStandardDate(entry.getUpdatedDate().toLocaleString())); rub.setAuthors(entry.getAuthor()); rubList.add(rub); } } catch (Exception ex) { ex.printStackTrace(); } return rubList; }
/**
 * Writes a textual dump of a feed entry to the trace log.
 *
 * <p>The dump lists the URI, title, published and updated dates, author names, links,
 * description, content sections (raw and as plain text) and categories. Nothing is computed when
 * trace logging is disabled.
 *
 * @param entry the entry to dump
 */
public static void printEntry(SyndEntry entry) {
  if (!log.isTraceEnabled()) {
    return;
  }

  StringBuilder dump = new StringBuilder();
  dump.append("URI: ").append(entry.getUri()).append("\n");
  dump.append("Title: ").append(entry.getTitle()).append("\n").append("\n");
  dump.append("Date: ").append(entry.getPublishedDate()).append("\n");
  dump.append("Modified: ").append(entry.getUpdatedDate()).append("\n");

  dump.append("Creators: \n");
  for (SyndPerson creator : FeedHelper.getAuthors(entry)) {
    dump.append(" - ").append(creator.getName()).append("\n");
  }

  dump.append("Links: \n");
  for (SyndLink entryLink : FeedHelper.getLinks(entry)) {
    dump.append(" - ").append(entryLink.getTitle()).append(": ");
    dump.append(entryLink.getHref()).append("\n");
  }

  SyndContent summary = entry.getDescription();
  if (summary != null) {
    dump.append("\nDescription(").append(summary.getType()).append("): ");
    dump.append(summary.getValue());
  }

  dump.append("Contents: \n");
  for (SyndContent section : FeedHelper.getContents(entry)) {
    dump.append(" Type: ").append(section.getType());
    dump.append(" Body: ").append(section.getValue());
    try {
      // Build the whole fragment first so that nothing is appended when extraction fails.
      dump.append(
          " Body (Plain text): "
              + PlainTextExtractor.getPlainText(section.getType(), section.getValue()));
    } catch (ParserException e) {
      dump.append("Failed to parse content");
    }
  }

  dump.append("Categories: \n");
  for (SyndCategory entryCategory : FeedHelper.getCategories(entry)) {
    dump.append(entryCategory.getName());
    dump.append("(").append(entryCategory.getTaxonomyUri()).append(")");
  }

  log.trace(dump.toString());
}
/**
 * Builds an RSS {@link Item} from a syndication entry.
 *
 * <p>The item receives a clone of the entry's modules together with its title, link and URI.
 * When the entry has a description, its value is copied into a new {@link Description}.
 *
 * @param sEntry the entry to convert
 * @return the newly created item
 */
protected Item createRSSItem(SyndEntry sEntry) {
  Item rssItem = new Item();
  rssItem.setModules(ModuleUtils.cloneModules(sEntry.getModules()));
  rssItem.setTitle(sEntry.getTitle());
  rssItem.setLink(sEntry.getLink());
  rssItem.setUri(sEntry.getUri());

  // Entries without a description produce an item without one.
  if (sEntry.getDescription() == null) {
    return rssItem;
  }

  Description rssDescription = new Description();
  rssDescription.setValue(sEntry.getDescription().getValue());
  rssItem.setDescription(rssDescription);
  return rssItem;
}
/**
 * Announces the first feed entry that has not been announced before.
 *
 * <p>The feed is read in order; the first entry whose GUID is unknown to {@code dao} is recorded
 * there and returned as a headline line (bold "EXCLU!" marker, title, shortened link). When the
 * entry's description contains at least one {@code <p>} element, the text of the first one is
 * returned as a second line. Entries without a description or without a paragraph are still
 * announced, headline only; previously they were recorded as seen and then lost to an exception.
 *
 * <p>If every entry is already known, a single "nothing new" line is returned. Any exception is
 * passed to {@code LOG.handle} and whatever was collected up to that point is returned.
 *
 * @return the lines to send, never {@code null}
 */
@SuppressWarnings("unchecked")
@Trigger("!buzz")
@Help("Fetches one of the latest posts from jeanmarcmorandini.com")
public List<String> getLatestBuzz() {
  List<String> toReturn = new ArrayList<String>();
  XmlReader reader = null;
  try {
    URL url = new URL("http://www.jeanmarcmorandini.com/rss.php");
    SyndFeedInput input = new SyndFeedInput();
    reader = new XmlReader(url);
    SyndFeed rss = input.build(reader);
    Iterator<SyndEntry> it = rss.getEntries().iterator();
    String message = null;
    while (it.hasNext()) {
      SyndEntry item = it.next();
      String guid = item.getUri();
      RSSFeed buzz = dao.findByGUID(guid);
      if (buzz == null) {
        buzz = new RSSFeed();
        buzz.setGuid(guid);
        dao.save(buzz);
        String urlBitly = utilsService.bitly(item.getLink());
        message = IRCUtils.bold("EXCLU!") + " " + item.getTitle() + " - " + urlBitly;
        toReturn.add(message);
        // The teaser is optional: the description may be missing or contain no <p> element.
        if (item.getDescription() != null && item.getDescription().getValue() != null) {
          org.jsoup.select.Elements paragraphs =
              Jsoup.parse(item.getDescription().getValue()).select("p");
          if (!paragraphs.isEmpty()) {
            toReturn.add(paragraphs.get(0).text());
          }
        }
        break;
      }
    }
    if (message == null) {
      toReturn.add("Pas d'exclus pour le moment.");
    }
  } catch (Exception e) {
    LOG.handle(e);
  } finally {
    // The reader wraps the open HTTP stream; release it whether or not parsing succeeded.
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception closeFailure) {
        LOG.handle(closeFailure);
      }
    }
  }
  return toReturn;
}
/**
 * Removes the first entry with the given URI from the feed returned by {@code read()}.
 *
 * <p>A {@code null} or blank {@code uri} leaves the feed unchanged. Entries that carry no URI
 * are skipped instead of causing a {@code NullPointerException}.
 *
 * @param uri URI of the entry to remove; compared exactly (not trimmed) against each entry's URI
 * @return the feed, with the matching entry removed if one was found
 */
@SuppressWarnings("unchecked")
public SyndFeed removeEntry(String uri) {
  SyndFeed feed = read();
  List<SyndEntry> entries = feed.getEntries();
  if (uri != null && uri.trim().length() > 0) {
    // Index-based removal avoids mutating the list from inside a for-each iteration.
    for (int i = 0; i < entries.size(); i++) {
      // uri is known to be non-null here, so comparing from its side tolerates URI-less entries.
      if (uri.equals(entries.get(i).getUri())) {
        entries.remove(i);
        break;
      }
    }
  }
  feed.setEntries(entries);
  return feed;
}
/**
 * Parses the feed stored in {@code c:\temp\google.xml} and logs its contents at debug level.
 *
 * <p>Feed-level authors and categories are logged first, then for every entry the HTML-unescaped
 * title and description, the URI and both dates. Entries without a description are logged with
 * a {@code null} description instead of aborting the dump.
 *
 * @throws Exception if the file cannot be read or parsed
 */
@SuppressWarnings("unchecked")
public void test() throws Exception {
  final SyndFeedInput input = new SyndFeedInput(true);
  final SyndFeed feed = input.build(new File("c:\\temp\\google.xml"));
  logger.debug("Successfully parsed the RSS feed");
  logger.debug("Author = " + feed.getAuthors());
  logger.debug("Categories = " + feed.getCategories());
  final List<SyndEntry> entries = feed.getEntries();
  for (final SyndEntry entry : entries) {
    logger.debug("Title = " + StringEscapeUtils.unescapeHtml(entry.getTitle()));
    // The description is optional, notably in Atom feeds.
    final String rawDescription =
        entry.getDescription() == null ? null : entry.getDescription().getValue();
    logger.debug("Description = " + StringEscapeUtils.unescapeHtml(rawDescription));
    logger.debug(entry.getUri());
    logger.debug("Updated date = " + entry.getUpdatedDate());
    logger.debug("Published date = " + entry.getPublishedDate());
    logger.debug("====================================================");
  }
}
/**
 * Filters {@code entries} in place, dropping every entry whose URI equals the name of one of the
 * nodes in {@code listRemove}.
 *
 * <p>The surviving entries are collected first and only then written back, so {@code entries} is
 * left untouched if reading a node name fails.
 *
 * @param entries the list to filter; modified in place
 * @param listRemove nodes whose names identify the entries to drop
 * @throws RepositoryException if a node name cannot be read
 */
protected void removeItem(List<SyndEntry> entries, List<Node> listRemove)
    throws RepositoryException {
  List<SyndEntry> survivors = new ArrayList<SyndEntry>();
  nextEntry:
  for (SyndEntry candidate : entries) {
    for (Node doomed : listRemove) {
      if (candidate.getUri().equals(doomed.getName())) {
        // Matched a node scheduled for removal: do not keep this entry.
        continue nextEntry;
      }
    }
    survivors.add(candidate);
  }
  entries.clear();
  entries.addAll(survivors);
}
// @Transactional // @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void importEntry( final SyndFeed feed, final SyndEntry entry, final Set<KiWiUriResource> types, final Set<ContentItem> tags, User user, final Collection<ContentItem> output) { if (user == null && entry.getAuthor() != null && !"".equals(entry.getAuthor())) { if (userService.userExists(entry.getAuthor())) { user = userService.getUserByLogin(entry.getAuthor()); } else { // user = userService.createUser(entry.getAuthor()); /* In my opinion, it is not ok to create a user entity * without asking the person if he/she wants to be * created and persisted in the KiWi dataset. * Thus I'm changing the user to 'anonymous', * if he/she is'nt registered with the same nick that * is given in the rss entry. */ user = userService.getUserByLogin("anonymous"); kiwiEntityManager.persist(user); } } log.debug("feed entry: #0 (#1)", entry.getTitle(), entry.getUri()); // create a new content item and copy all data from the feed entry ContentItem item; if (entry.getLink() != null) { item = contentItemService.createExternContentItem(entry.getLink()); } else if (entry.getUri() != null) { try { // try parsing URI; if it is not valid, URI uri = new URI(entry.getUri()); item = contentItemService.createExternContentItem(entry.getUri()); } catch (URISyntaxException e) { item = contentItemService.createExternContentItem(feed.getLink() + "#" + entry.getUri()); } } else { item = contentItemService.createContentItem(); } contentItemService.updateTitle(item, entry.getTitle()); if (feed.getLanguage() != null) item.setLanguage(new Locale(feed.getLanguage())); if (entry.getPublishedDate() != null) { item.setCreated(entry.getPublishedDate()); item.setModified(entry.getPublishedDate()); } if (entry.getUpdatedDate() != null) { if (entry.getPublishedDate() == null) { item.setCreated(entry.getUpdatedDate()); } item.setModified(entry.getUpdatedDate()); } item.setAuthor(user); // read feed content and set it as item's text 
content List<SyndContent> contents = entry.getContents(); if (contents.size() == 1) { log.debug("using RSS content section provided by item"); contentItemService.updateTextContentItem(item, "<p>" + contents.get(0).getValue() + "</p>"); } else if (contents.size() > 1) { log.warn("feed entry contained more than one content section"); contentItemService.updateTextContentItem(item, "<p>" + contents.get(0).getValue() + "</p>"); } else if (contents.size() == 0) { if (entry.getDescription() != null && entry.getDescription().getValue() != null) { log.debug("using RSS description as no content section was available"); contentItemService.updateTextContentItem( item, "<p>" + entry.getDescription().getValue() + "</p>"); } } // save before tagging contentItemService.saveContentItem(item); // read feed categories and use them as tags for (SyndCategory cat : (List<SyndCategory>) entry.getCategories()) { ContentItem _cat; if (!taggingService.hasTag(item, cat.getName())) { if (cat.getTaxonomyUri() != null) { _cat = contentItemService.getContentItemByUri(cat.getTaxonomyUri()); if (_cat == null) { _cat = contentItemService.createExternContentItem(cat.getTaxonomyUri()); contentItemService.updateTitle(_cat, cat.getName()); _cat.setAuthor(user); contentItemService.saveContentItem(_cat); } taggingService.createTagging(cat.getName(), item, _cat, user); } else { _cat = contentItemService.getContentItemByTitle(cat.getName()); if (_cat == null) { _cat = contentItemService.createContentItem(); contentItemService.updateTitle(_cat, cat.getName()); _cat.setAuthor(user); contentItemService.saveContentItem(_cat); } taggingService.createTagging(cat.getName(), item, _cat, user); } } } // scan for Twitter-style hash tags in title (e.g. 
#kiwiknows, see KIWI-622) Matcher m_hashtag = p_hashtag.matcher(entry.getTitle()); while (m_hashtag.find()) { String tag_label = m_hashtag.group(1); if (!taggingService.hasTag(item, tag_label)) { ContentItem tag = contentItemService.getContentItemByTitle(tag_label); if (tag == null) { tag = contentItemService.createContentItem(); contentItemService.updateTitle(tag, tag_label); tag.setAuthor(user); contentItemService.saveContentItem(tag); } taggingService.createTagging(tag_label, item, tag, user); } } // check for geo information GeoRSSModule geoRSSModule = GeoRSSUtils.getGeoRSS(entry); if (geoRSSModule != null && geoRSSModule.getPosition() != null) { POI poi = kiwiEntityManager.createFacade(item, POI.class); poi.setLatitude(geoRSSModule.getPosition().getLatitude()); poi.setLongitude(geoRSSModule.getPosition().getLongitude()); kiwiEntityManager.persist(poi); } // check for media information MediaEntryModule mediaModule = (MediaEntryModule) entry.getModule(MediaModule.URI); if (mediaModule != null) { MediaContent[] media = mediaModule.getMediaContents(); if (media.length > 0) { MediaContent m = media[0]; if (m.getReference() instanceof UrlReference) { URL url = ((UrlReference) m.getReference()).getUrl(); String type = m.getType(); String name = url.getFile(); if (name.lastIndexOf("/") > 0) { name = name.substring(name.lastIndexOf("/") + 1); } log.debug("importing media data from URL #0", url.toString()); try { InputStream is = url.openStream(); ByteArrayOutputStream bout = new ByteArrayOutputStream(); int c; while ((c = is.read()) != -1) { bout.write(c); } byte[] data = bout.toByteArray(); contentItemService.updateMediaContentItem(item, data, type, name); is.close(); bout.close(); } catch (IOException ex) { log.error("error importing media content from RSS stream"); } } else { log.info("RSS importer can only import media with URL references"); } } else { log.warn("media module found without content"); } Category[] cats = mediaModule.getMetadata().getCategories(); for 
(Category cat : cats) { ContentItem _cat; String label = cat.getLabel() != null ? cat.getLabel() : cat.getValue(); if (!taggingService.hasTag(item, label)) { if (cat.getScheme() != null) { _cat = contentItemService.getContentItemByUri(cat.getScheme() + cat.getValue()); if (_cat == null) { _cat = contentItemService.createExternContentItem(cat.getScheme() + cat.getValue()); contentItemService.updateTitle(_cat, label); _cat.setAuthor(user); contentItemService.saveContentItem(_cat); } taggingService.createTagging(label, item, _cat, user); } else { _cat = contentItemService.getContentItemByTitle(label); if (_cat == null) { _cat = contentItemService.createContentItem(); contentItemService.updateTitle(_cat, label); _cat.setAuthor(user); contentItemService.saveContentItem(_cat); } taggingService.createTagging(label, item, _cat, user); } } } } // add parameter categories as tags for (ContentItem tag : tags) { if (!taggingService.hasTag(item, tag.getTitle())) { taggingService.createTagging(tag.getTitle(), item, tag, user); } } // add parameter types as types for (KiWiUriResource type : types) { item.addType(type); } // add kiwi:FeedPost type item.addType(tripleStore.createUriResource(Constants.NS_KIWI_CORE + "FeedPost")); /* the flush is necessary, because CIs or tags will * otherwise be created multiple times when they * appear more than once in one RSS feed */ entityManager.flush(); log.debug("imported content item '#0' with URI '#1'", item.getTitle(), item.getResource()); }
public Vector<SyndEntry> crawl() { if (feedid == 1301) { int z = 0; z++; } SyndFeedInput input = new SyndFeedInput(); XmlReader reader = null; Vector<SyndEntry> ret = new Vector<SyndEntry>(); Document doc; try { /* * Document doc = Jsoup.parse(feedurl,10000); * getLogger().info(doc.toString()); */ SyndFeed feed = null; try { feed = input.build(reader = new XmlReader(feedurl)); } catch (Exception fe) { try { System.err.println("from url: " + feedurl); fe.printStackTrace(); doc = Jsoup.parse(feedurl, 10000); feed = input.build(new StringReader(doc.toString())); getLogger().info("Could fix it woith jsoup"); } catch (ExceptionInInitializerError ed) { System.err.println("from url: " + feedurl); ed.printStackTrace(); try { URL url = feedurl; String feedpage = FeedCrawler.readPage(url); feedpage = feedpage .replaceAll("\\&ldquo;", "\"") .replaceAll("“", "\"") .replaceAll("\\&rdquo;", "\"") .replaceAll("”", "\""); System.out.println(feedpage); // feedpage=feedpage.replaceAll("\\&ldquo;", "\""); // reader=new XmlReader(new InputSource(new // StringReader(feedpage)).getCharacterStream()); feed = input.build(new InputSource(new StringReader(feedpage))); getLogger().info("Could fix it with complicatedreader"); } catch (URISyntaxException e) { // TODO Auto-generated catch block System.err.println("from url: " + feedurl); e.printStackTrace(); System.out.println("URL does not work: " + feedurl); } catch (FeedException e) { // TODO Auto-generated catch block System.err.println("from url: " + feedurl); e.printStackTrace(); System.out.println("URL does not work: " + feedurl); } catch (Exception ex) { System.err.println("from url: " + feedurl); ex.printStackTrace(); System.out.println("URL does not work: " + feedurl); } } } for (Iterator<SyndEntry> i = feed.getEntries().iterator(); i.hasNext(); ) { SyndEntry entry = (SyndEntry) i.next(); System.out.println("\t INFOR: Entry\t" + entry.getUri()); if (!entry.getUri().startsWith("http")) { System.out.println("\t ERROR: \t" + 
entry.getLink()); System.out.println("\t DEBUG: \t" + feedurl); System.out.println("\t FIX: \t"); if (domain.length() > 0) { entry.setLink(domain + entry.getLink()); entry.setUri(domain + entry.getUri()); ret.add(entry); } } else ret.add(entry); getLogger().info(entry.getTitle()); } } catch (IllegalArgumentException e) { // TODO Auto-generated catch block getLogger().info("Feed error: " + feedurl); getLogger().info("try with jsoup"); System.err.println("from url: " + feedurl); e.printStackTrace(); try { doc = Jsoup.parse(feedurl, 10000); } catch (IOException e1) { // TODO Auto-generated catch block System.err.println("from url: " + feedurl); e1.printStackTrace(); } } catch (FeedException fe) { getLogger().info("Even jsoup did not work: " + feedurl); System.err.println("from url: " + feedurl); fe.printStackTrace(); } catch (IOException e) { getLogger().info("Feed error: " + feedurl); getLogger().info("try with jsoup"); // TODO Auto-generated catch block System.err.println("from url: " + feedurl); e.printStackTrace(); try { doc = Jsoup.parse(feedurl, 10000); } catch (IOException e2) { // TODO Auto-generated catch block System.err.println("from url: " + feedurl); e2.printStackTrace(); try { getLogger().info("try with jsoup"); doc = Jsoup.parse(feedurl, 10000); getLogger().info(doc.toString()); } catch (IOException e1) { // TODO Auto-generated catch block getLogger().info("once again, jsoup cant do it"); System.err.println("from url: " + feedurl); e1.printStackTrace(); } } } finally { if (reader != null) try { reader.close(); } catch (IOException eg) { getLogger().info("Feed error: " + feedurl); // TODO Auto-generated catch block System.err.println("from url: " + feedurl); eg.printStackTrace(); } } return ret; }