/**
 * Turns every relative link URL of every post into an absolute one.
 *
 * <p>A URL that already starts with {@code http://} or {@code https://} (compared ignoring case)
 * is left untouched. Any other non-null URL is rewritten to {@code "http://" + source + "/" +
 * path}, where {@code source} is the post's source and {@code path} is the original URL without
 * its leading slash. Links whose URL is {@code null} are skipped.
 *
 * @param posts the posts whose links are completed in place
 */
private void completeLinks(PostsType posts) {
  for (PostType post : posts.getPost()) {
    if (post.getLinks() == null) {
      continue;
    }
    for (LinkType link : post.getLinks().getLink()) {
      String linkUrl = link.getUrl();
      if (linkUrl == null) {
        // Nothing to complete; the previous code failed with a NullPointerException here.
        continue;
      }
      boolean absolute =
          linkUrl.regionMatches(true, 0, "http://", 0, 7)
              || linkUrl.regionMatches(true, 0, "https://", 0, 8);
      if (absolute) {
        // https links used to get "http://<source>" glued in front of them.
        continue;
      }

      String path = linkUrl.startsWith("/") ? linkUrl.substring(1) : linkUrl;
      // String.valueOf keeps the "null" text the plain concatenation produced for a null source.
      String base = "http://" + String.valueOf(post.getSource());
      // The source is a bare host name when it comes from URL.getHost(), so the separator that
      // was stripped from the path has to be put back exactly once.
      if (!base.endsWith("/")) {
        base = base + "/";
      }
      link.setUrl(base + path);
    }
  }
}
/** * Parses the given feed and extracts out and parsers all linked items within the feed, using the * underlying ROME feed parsing library. * * @param rss A {@link Content} object representing the feed that is being parsed by this {@link * Parser}. * @return A {@link ParseResult} containing all {@link Parse}d feeds that were present in the feed * file that this {@link Parser} dealt with. */ public String getParse(String url, boolean json, HashMap<String, Object> params) throws Exception { url = URLDecoder.decode(url, "UTF-8"); URL feedURL = new URL(url); // Logger.getLogger(this.getClass().getName()).log(Level.INFO, "Encoding del Feed: {0}", new // Object[]{feedURL.openConnection().getContentEncoding()}); Feed feed = FeedParser.parse(feedURL); // List<PostType> newsList = new ArrayList<PostType>(); PostType newEntry; // PostType newEntryComments; Post newEntrySolr; // SyndFeed feed = null; Gson gson = new Gson(); List<LinkType> links; Document doc; FeedSelectors feedSelectors; String extendedString = (String) params.get("zone"); Place place = null; if (params.containsKey("place")) { place = placeDao.retrieveByExtendedString(extendedString); } org.zonales.tagsAndZones.objects.Zone zone = zoneDao.retrieveByExtendedString(extendedString); if (!json) { for (int i = 0; i < feed.getItemCount(); i++) { FeedItem entry = feed.getItem(i); Logger.getLogger(this.getClass().getName()) .log( Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()}); Connection conn = Jsoup.connect(entry.getLink().toString()); conn.timeout(60000); doc = conn.get(); String responseURL = conn.response().url().getHost(); // doc = Jsoup.connect(entry.getLink().toString()).timeout(60000).get(); Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()}); feedSelectors = dao.retrieve(url); if (findWords( entry.getTitle(), doc, (ArrayList) params.get("searchlist"), (ArrayList) params.get("blacklist"), 
feedSelectors)) { newEntry = new PostType(); String source; if (feed.getHeader() == null || feed.getHeader().getLink() == null) { Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link"); source = feedURL.getHost(); } else { Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString()); source = feed.getHeader().getLink().getHost(); // if (source.indexOf("/") != -1) { // source = source.substring(0, source.indexOf("/") + 1); // } } newEntry.setSource(source); newEntry.setDocType("post"); newEntry.setZone( new Zone( String.valueOf(zone.getId()), zone.getName(), zone.getType().getName(), zone.getExtendedString())); newEntry.setPostLatitude(Double.parseDouble((String) params.get("latitud"))); newEntry.setPostLongitude(Double.parseDouble((String) params.get("longitud"))); // newEntry.setId(entry.getUri()); // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ? // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim()); newEntry.setId( entry.getGUID() != null ? entry.getGUID() : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null ? entry.getElementValue("http://www.w3.org/2005/Atom", "id") : (entry.getTitle()))); newEntry.setFromUser( new User( null, source, null, null, place != null ? 
new org.zonales.entities.Place( String.valueOf(place.getId()), place.getName(), place.getType().getName()) : null)); newEntry.setTitle(entry.getTitle()); newEntry.setText(entry.getDescriptionAsText()); newEntry.setTags(new TagsType((ArrayList) params.get("tagslist"))); if (newEntry.getLinks() == null) { newEntry.setLinks(new LinksType(new ArrayList<LinkType>())); } if ((links = getLinks(feedSelectors, doc, responseURL)) != null) { newEntry.getLinks().getLink().addAll(links); } newEntry.getLinks().getLink().add(new LinkType("source", entry.getLink().toString())); if (newEntry.getActions() == null) { newEntry.setActions(new ActionsType(new ArrayList<ActionType>())); } newEntry.setActions( new ActionsType( getActions( feedSelectors, doc, newEntry.getId(), json, (Boolean) params.get("comments"), source))); if (entry.getPubDate() != null) { newEntry.setCreated(String.valueOf(entry.getPubDate().getTime())); } if (entry.getModDate() != null) { newEntry.setModified(String.valueOf(entry.getModDate().getTime())); } for (ActionType action : newEntry.getActions().getAction()) { if ("comments".equals(action.getType())) { newEntry.setRelevance(action.getCant()); } } if (!json) { newEntry.setVerbatim(gson.toJson(newEntry)); } newsList.add(newEntry); // addToMap(parseResult, feed, feedLink, entry, content, newEntry); } } PostsType news; news = new PostsType(newsList); completeLinks(news); Feed2XML(news, sw); return sw.toString(); // + comments.toString(); } else { for (int i = 0; i < feed.getItemCount(); i++) { FeedItem entry = feed.getItem(i); Logger.getLogger(this.getClass().getName()) .log( Level.INFO, "Intentando conectar a {0}", new Object[] {entry.getLink().toString()}); Connection conn = Jsoup.connect(entry.getLink().toString()); conn.timeout(60000); doc = conn.get(); String responseURL = conn.response().url().getHost(); // Logger.getLogger(this.getClass().getName()).log(Level.INFO, "RESPONSE URL: // {0}", responseURL); // doc = 
Jsoup.connect(entry.getLink().toString()).timeout(60000).get(); Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Parseando la URL: {0}", new Object[] {entry.getLink().toString()}); feedSelectors = dao.retrieve(url); if (findWords( entry.getTitle(), doc, (ArrayList) params.get("searchlist"), (ArrayList) params.get("blacklist"), feedSelectors)) { newEntrySolr = new Post(); String source; if (feed.getHeader() == null || feed.getHeader().getLink() == null) { Logger.getLogger(this.getClass().getName()).log(Level.INFO, "NULL: Link"); source = feedURL.getHost(); } else { Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "NO NULL: {0}", feed.getHeader().getLink().toString()); source = feed.getHeader().getLink().getHost(); // if (source.indexOf("/") != -1) { // source = source.substring(0, source.indexOf("/") + 1); // } } newEntrySolr.setSource(source); newEntrySolr.setDocType("post"); newEntrySolr.setZone( new Zone( String.valueOf(zone.getId()), zone.getName(), zone.getType().getName(), zone.getExtendedString())); newEntrySolr.setPostLatitude(Double.parseDouble((String) params.get("latitud"))); newEntrySolr.setPostLongitude(Double.parseDouble((String) params.get("longitud"))); // newEntry.setId(entry.getUri()); // newEntry.setId(entry.getUri() != null && entry.getUri().length() > 0 ? // entry.getUri().trim() : entry.getLink().trim()+entry.getTitle().trim()); newEntrySolr.setId( entry.getGUID() != null ? entry.getGUID() : (entry.getElementValue("http://www.w3.org/2005/Atom", "id") != null ? entry.getElementValue("http://www.w3.org/2005/Atom", "id") : (entry.getTitle()))); newEntrySolr.setFromUser( new User( null, source, null, null, place != null ? 
new org.zonales.entities.Place( String.valueOf(place.getId()), place.getName(), place.getType().getName()) : null)); newEntrySolr.setTitle(entry.getTitle()); newEntrySolr.setText(entry.getDescriptionAsText()); newEntrySolr.setTags(new ArrayList<String>((ArrayList) params.get("tagslist"))); if (newEntrySolr.getLinks() == null) { newEntrySolr.setLinks(new ArrayList<LinkType>()); } if ((links = getLinks(feedSelectors, doc, responseURL)) != null) { newEntrySolr.getLinks().addAll(links); } newEntrySolr.getLinks().add(new LinkType("source", entry.getLink().toString())); if (newEntrySolr.getActions() == null) { newEntrySolr.setActions(new ArrayList<ActionType>()); } newEntrySolr .getActions() .addAll( getActions( feedSelectors, doc, newEntrySolr.getId(), json, (Boolean) params.get("comments"), source)); if (entry.getPubDate() != null) { newEntrySolr.setCreated((entry.getPubDate().getTime())); } if (entry.getModDate() != null) { newEntrySolr.setModified((entry.getModDate().getTime())); } for (ActionType action : newEntrySolr.getActions()) { if ("comments".equals(action.getType())) { newEntrySolr.setRelevance(action.getCant()); } } if (!json) { newEntrySolr.setVerbatim(gson.toJson(newEntrySolr)); } newsListSolr.add(newEntrySolr); // addToMap(parseResult, feed, feedLink, entry, content, newEntry); } } return "{post: " + gson.toJson(newsListSolr) + "}"; // + comments.toString(); } }
/** ********************************** */ public List<ActionType> getActions( FeedSelectors feedSelectors, Document doc, String idPost, boolean json, boolean comments, String source) throws IOException, BadLocationException { List<ActionType> list = new ArrayList<ActionType>(); String selectorAuthor = null, selectorText = null, selectorDate = null; String expressionAuthor = null, expressionText = null, expressionDate = null; String author = null, text = null, id = null, timestamp = null; String patternDate = null; Matcher matcher; Pattern patron; // FileInputStream datos= new FileInputStream (ConDatos); /** ********** */ if (feedSelectors == null || feedSelectors.getSelectors() == null || feedSelectors.getSelectors().isEmpty()) { feedSelectors = dao.retrieve("default"); } if (feedSelectors == null || feedSelectors.getSelectors() == null || feedSelectors.getSelectors().isEmpty()) { return null; } Elements elmts = null; boolean commenters = false; for (FeedSelector feedSelector : feedSelectors.getSelectors()) { if ("commentsAuthor".equals(feedSelector.getType())) { selectorAuthor = feedSelector.getSelector(); if (!feedSelector.getValidator().equals("")) { expressionAuthor = feedSelector.getValidator(); } } if ("commentsText".equals(feedSelector.getType())) { selectorText = feedSelector.getSelector(); if (!feedSelector.getValidator().equals("")) { expressionText = feedSelector.getValidator(); } } if ("commentsDate".equals(feedSelector.getType())) { selectorDate = feedSelector.getSelector(); if (!feedSelector.getValidator().equals("")) { expressionDate = feedSelector.getValidator(); } } if ("comments".equals(feedSelector.getType())) { elmts = doc.select(feedSelector.getSelector()); commenters = true; } if (feedSelector.getFormat() != null && !feedSelector.getFormat().equals("")) { patternDate = feedSelector.getFormat(); } } newEntryComments = new PostType(); newEntryCommentsSolr = new Post(); if (commenters && elmts.size() > 0) { Integer cantCommnents = 0; for (Element 
comment : elmts) { Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Comentario: {0}", comment.html()); Boolean validComment = false; if (selectorAuthor != null) { for (Element element : comment.select(selectorAuthor)) { author = element.text(); if (expressionAuthor == null) { break; } else { if (author != null && !"".equals(author)) { patron = Pattern.compile(expressionAuthor); matcher = patron.matcher(author); if (matcher.find()) { author = author.replaceAll(matcher.group(1), ""); break; } } } } } Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "SelectorText; {0}", selectorText); if (selectorText != null) { for (Element element : comment.select(selectorText)) { if (expressionText != null) { String html = element.html(); Logger.getLogger(this.getClass().getName()).log(Level.INFO, "Text HTML: {0}", html); if (html != null && !"".equals(html)) { patron = Pattern.compile("^[^<].*"); matcher = patron.matcher(html); if (matcher.find()) { text = element.text(); Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Matchea, Text; {0}", text); validComment = true; break; } else { validComment = false; } } else { validComment = false; } } else { text = element.text(); validComment = true; break; } } } if (selectorDate != null) { Logger.getLogger(this.getClass().getName()) .log(Level.INFO, "Selector Fecha: {0}", selectorDate); for (Element element : comment.select(selectorDate)) { timestamp = element.text(); if (expressionDate != null && timestamp != null && !"".equals(timestamp)) { Logger.getLogger(this.getClass().getName()) .log( Level.INFO, "Validador Fecha: {0}, Texto Fecha: {1}", new Object[] {expressionDate, timestamp}); patron = Pattern.compile(expressionDate); matcher = patron.matcher(timestamp); if (matcher.find()) { timestamp = timestamp.replaceAll(matcher.group(1), ""); break; } } else { break; } } } if (validComment && comments) { Logger.getLogger(this.getClass().getName()).log(Level.INFO, "El comentario es válido"); 
cantCommnents++; if (!json) { newEntryComments.setId(idPost + author + timestamp); newEntryComments.setSourcePost(idPost); newEntryComments.setDocType("comment"); newEntryComments.setFromUser(new User(id, author, null, null, null)); newEntryComments.setText(text); newEntryComments.setCreated(timestamp); newEntryComments.setSource(source); newsList.add(newEntryComments); } else { newEntryCommentsSolr.setId(idPost + author + timestamp); newEntryCommentsSolr.setSourcePost(idPost); newEntryCommentsSolr.setDocType("comment"); newEntryCommentsSolr.setFromUser(new User(id, author, null, null, null)); newEntryCommentsSolr.setText(text); newEntryCommentsSolr.setSource(source); if (patternDate != null) newEntryCommentsSolr.setCreated(getDate(timestamp, patternDate)); newsListSolr.add(newEntryCommentsSolr); } } // comments.add(new Comment(id , new User (id,author,null,null,null) , text , new // Date(timestamp))); } list.add(new ActionType("comments", cantCommnents)); } if (list.isEmpty()) { return new ArrayList(); } else { return new ArrayList(list); } }