/**
 * Turns the site-relative links of every post into absolute URLs, in place.
 *
 * <p>A link whose URL already starts with {@code http://} or {@code https://} (in any letter
 * case) is left untouched. Any other URL is treated as a path on the post's source and rewritten
 * to {@code "http://" + source + "/" + path}, with exactly one slash between the source and the
 * path regardless of whether the source ends with a slash or the path starts with one.
 *
 * <p>Posts without links, posts without a source and links without a URL are skipped, because
 * there is nothing meaningful to build for them.
 *
 * @param posts the posts whose links are completed
 */
private void completeLinks(PostsType posts) {
  for (PostType post : posts.getPost()) {
    if (post.getLinks() == null) {
      continue;
    }
    String source = post.getSource();
    if (source == null || source.isEmpty()) {
      // Without a source there is no host to resolve relative links against.
      continue;
    }
    // The source may or may not carry a trailing slash; normalise so that exactly one is emitted.
    String separator = source.endsWith("/") ? "" : "/";
    for (LinkType link : post.getLinks().getLink()) {
      String url = link.getUrl();
      if (url == null || isAbsoluteHttpUrl(url)) {
        continue;
      }
      String path = url.startsWith("/") ? url.substring(1) : url;
      link.setUrl("http://" + source + separator + path);
    }
  }
}

/** Returns whether {@code url} starts with an {@code http://} or {@code https://} scheme. */
private static boolean isAbsoluteHttpUrl(String url) {
  // URI schemes are case-insensitive, hence the ignore-case comparison.
  return url.regionMatches(true, 0, "http://", 0, 7)
      || url.regionMatches(true, 0, "https://", 0, 8);
}
/** * Parses the given feed and extracts out and parsers all linked items within the feed, using the * underlying ROME feed parsing library. * * @param rss A {@link Content} object representing the feed that is being parsed by this {@link * Parser}. * @return A {@link ParseResult} containing all {@link Parse}d feeds that were present in the feed * file that this {@link Parser} dealt with. */ public String getParse(String url, boolean json, HashMap<String, Object> params) throws Exception { url = URLDecoder.decode(url, "UTF-8"); URL feedURL = new URL(url); // Logger.getLogger(this.getClass().getName()).log(Level.INFO, "Encoding del Feed: {0}", new // Object[]{feedURL.openConnection().getContentEncoding()}); Feed feed = FeedParser.parse(feedURL); // List<PostType> newsList = new ArrayList<PostType>(); PostType newEntry; // PostType newEntryComments; Post newEntrySolr; // SyndFeed feed = null; Gson gson = new Gson(); List<LinkType> links; Document doc; FeedSelectors feedSelectors; String extendedString = (String) params.get("zone"); Place place = null; if (params.containsKey("place")) { place = placeDao.retrieveByExtendedString(extendedString); } org.zonales.tagsAndZones.objects.Zone zone = zoneDao.retrieveByExtendedString(extendedString); if (!json) { for (int i = 0; i < feed.getItemCount(); i++) { FeedItem entry = feed.getItem(i); Logger.getLogger(this.getClass().getName()) .log( Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()}); Connection conn = Jsoup.connect(entry.getLink().toString()); conn.timeout(60000); doc = conn.get(); String responseURL = conn.response().url().getHost(); // doc = Jsoup.connect(entry.getLink().toString()).timeout(60000).get(); Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()}); feedSelectors = dao.retrieve(url); if (findWords( entry.getTitle(), doc, (ArrayList) params.get("searchlist"), (ArrayList) params.get("blacklist"), 
feedSelectors)) { newEntry = new PostType(); String source; if (feed.getHeader() == null || feed.getHeader().getLink() == null) { Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link"); source = feedURL.getHost(); } else { Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString()); source = feed.getHeader().getLink().getHost(); // if (source.indexOf("/") != -1) { // source = source.substring(0, source.indexOf("/") + 1); // } } newEntry.setSource(source); newEntry.setDocType("post"); newEntry.setZone( new Zone( String.valueOf(zone.getId()), zone.getName(), zone.getType().getName(), zone.getExtendedString())); newEntry.setPostLatitude(Double.parseDouble((String) params.get("latitud"))); newEntry.setPostLongitude(Double.parseDouble((String) params.get("longitud"))); // newEntry.setId(entry.getUri()); // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ? // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim()); newEntry.setId( entry.getGUID() != null ? entry.getGUID() : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null ? entry.getElementValue("http://www.w3.org/2005/Atom", "id") : (entry.getTitle()))); newEntry.setFromUser( new User( null, source, null, null, place != null ? 
new org.zonales.entities.Place( String.valueOf(place.getId()), place.getName(), place.getType().getName()) : null)); newEntry.setTitle(entry.getTitle()); newEntry.setText(entry.getDescriptionAsText()); newEntry.setTags(new TagsType((ArrayList) params.get("tagslist"))); if (newEntry.getLinks() == null) { newEntry.setLinks(new LinksType(new ArrayList<LinkType>())); } if ((links = getLinks(feedSelectors, doc, responseURL)) != null) { newEntry.getLinks().getLink().addAll(links); } newEntry.getLinks().getLink().add(new LinkType("source", entry.getLink().toString())); if (newEntry.getActions() == null) { newEntry.setActions(new ActionsType(new ArrayList<ActionType>())); } newEntry.setActions( new ActionsType( getActions( feedSelectors, doc, newEntry.getId(), json, (Boolean) params.get("comments"), source))); if (entry.getPubDate() != null) { newEntry.setCreated(String.valueOf(entry.getPubDate().getTime())); } if (entry.getModDate() != null) { newEntry.setModified(String.valueOf(entry.getModDate().getTime())); } for (ActionType action : newEntry.getActions().getAction()) { if ("comments".equals(action.getType())) { newEntry.setRelevance(action.getCant()); } } if (!json) { newEntry.setVerbatim(gson.toJson(newEntry)); } newsList.add(newEntry); // addToMap(parseResult, feed, feedLink, entry, content, newEntry); } } PostsType news; news = new PostsType(newsList); completeLinks(news); Feed2XML(news, sw); return sw.toString(); // + comments.toString(); } else { for (int i = 0; i < feed.getItemCount(); i++) { FeedItem entry = feed.getItem(i); Logger.getLogger(this.getClass().getName()) .log( Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()}); Connection conn = Jsoup.connect(entry.getLink().toString()); conn.timeout(60000); doc = conn.get(); String responseURL = conn.response().url().getHost(); // Logger.getLogger(this.getClass().getName()).log(Level.INFO, "RESPONSE URL: // {0}", responseURL); // doc = 
Jsoup.connect(entry.getLink().toString()).timeout(60000).get(); Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()}); feedSelectors = dao.retrieve(url); if (findWords( entry.getTitle(), doc, (ArrayList) params.get("searchlist"), (ArrayList) params.get("blacklist"), feedSelectors)) { newEntrySolr = new Post(); String source; if (feed.getHeader() == null || feed.getHeader().getLink() == null) { Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link"); source = feedURL.getHost(); } else { Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString()); source = feed.getHeader().getLink().getHost(); // if (source.indexOf("/") != -1) { // source = source.substring(0, source.indexOf("/") + 1); // } } newEntrySolr.setSource(source); newEntrySolr.setDocType("post"); newEntrySolr.setZone( new Zone( String.valueOf(zone.getId()), zone.getName(), zone.getType().getName(), zone.getExtendedString())); newEntrySolr.setPostLatitude(Double.parseDouble((String) params.get("latitud"))); newEntrySolr.setPostLongitude(Double.parseDouble((String) params.get("longitud"))); // newEntry.setId(entry.getUri()); // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ? // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim()); newEntrySolr.setId( entry.getGUID() != null ? entry.getGUID() : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null ? entry.getElementValue("http://www.w3.org/2005/Atom", "id") : (entry.getTitle()))); newEntrySolr.setFromUser( new User( null, source, null, null, place != null ? 
new org.zonales.entities.Place( String.valueOf(place.getId()), place.getName(), place.getType().getName()) : null)); newEntrySolr.setTitle(entry.getTitle()); newEntrySolr.setText(entry.getDescriptionAsText()); newEntrySolr.setTags(new ArrayList<String>((ArrayList) params.get("tagslist"))); if (newEntrySolr.getLinks() == null) { newEntrySolr.setLinks(new ArrayList<LinkType>()); } if ((links = getLinks(feedSelectors, doc, responseURL)) != null) { newEntrySolr.getLinks().addAll(links); } newEntrySolr.getLinks().add(new LinkType("source", entry.getLink().toString())); if (newEntrySolr.getActions() == null) { newEntrySolr.setActions(new ArrayList<ActionType>()); } newEntrySolr .getActions() .addAll( getActions( feedSelectors, doc, newEntrySolr.getId(), json, (Boolean) params.get("comments"), source)); if (entry.getPubDate() != null) { newEntrySolr.setCreated((entry.getPubDate().getTime())); } if (entry.getModDate() != null) { newEntrySolr.setModified((entry.getModDate().getTime())); } for (ActionType action : newEntrySolr.getActions()) { if ("comments".equals(action.getType())) { newEntrySolr.setRelevance(action.getCant()); } } if (!json) { newEntrySolr.setVerbatim(gson.toJson(newEntrySolr)); } newsListSolr.add(newEntrySolr); // addToMap(parseResult, feed, feedLink, entry, content, newEntry); } } return "{post: " + gson.toJson(newsListSolr) + "}"; // + comments.toString(); } }