protected Item createRSSItem(SyndEntry sEntry) {
   Item item = new Item();
   item.setModules(ModuleUtils.cloneModules(sEntry.getModules()));
   item.setTitle(sEntry.getTitle());
   item.setLink(sEntry.getLink());
   item.setUri(sEntry.getUri());
   if (sEntry.getDescription() != null) {
     Description description = new Description();
     description.setValue(sEntry.getDescription().getValue());
     item.setDescription(description);
   }
   return item;
 }
  @SuppressWarnings("unchecked")
  private boolean findFeedEntry(SyndFeed feed, String title, String[] bodyPortions) {
    List<SyndEntry> entries = feed.getEntries();

    for (SyndEntry entry : entries) {
      if (entry.getTitle().equals(title)) {
        if (bodyPortions == null) {
          return true;
        }

        boolean missingPortion = false;

        SyndContent description = entry.getDescription();
        String value = description.getValue();
        for (int i = 0; i < bodyPortions.length; i++) {
          if (!value.contains(bodyPortions[i])) {
            missingPortion = true;
            break;
          }
        }

        if (!missingPortion) {
          return true;
        }
      }
    }

    return false;
  }
  public static List<RssUrlBean> getRssUrlBeanListFromPage(int rssCompo_id, String url) {
    List<RssUrlBean> rubList = new ArrayList<RssUrlBean>();
    if (url.equals("")) return rubList;
    try {
      URL feedUrl = new URL(url);
      // SyndFeedInput:从远程读到xml结构的内容转成SyndFeedImpl实例
      SyndFeedInput input = new SyndFeedInput();
      // rome按SyndFeed类型生成rss和atom的实例,
      // SyndFeed是rss和atom实现类SyndFeedImpl的接口
      SyndFeed syndFeed = input.build(new XmlReader(feedUrl));

      List<SyndEntry> entryList = syndFeed.getEntries();
      for (SyndEntry entry : entryList) {
        RssUrlBean rub = new RssUrlBean();
        rub.setRssCompo_id(rssCompo_id);
        rub.setTitle(entry.getTitle());
        rub.setLink(entry.getUri());
        rub.setPublishedDate(CommonUtil.getStandardDate(entry.getPublishedDate().toLocaleString()));
        rub.setDescription(entry.getDescription().getValue());
        if (entry.getUpdatedDate() != null)
          rub.setUpdatedDate(CommonUtil.getStandardDate(entry.getUpdatedDate().toLocaleString()));
        rub.setAuthors(entry.getAuthor());
        rubList.add(rub);
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    }
    return rubList;
  }
Beispiel #4
0
 public Item(String source, SyndEntry entry) {
   this(
       source,
       entry.getLink(),
       entry.getTitle(),
       entry.getDescription().getValue(),
       entry.getPublishedDate(),
       entry.getEnclosures());
 }
  /**
   * A helper function to print out the contents of an entry to log.trace
   *
   * @param entry
   */
  public static void printEntry(SyndEntry entry) {
    if (!log.isTraceEnabled()) {
      return;
    }

    StringBuffer pr =
        new StringBuffer(
            "URI: "
                + entry.getUri()
                + "\n"
                + "Title: "
                + entry.getTitle()
                + "\n"
                + "\n"
                + "Date: "
                + entry.getPublishedDate()
                + "\n"
                + "Modified: "
                + entry.getUpdatedDate()
                + "\n");

    pr.append("Creators: \n");

    for (SyndPerson author : FeedHelper.getAuthors(entry)) {
      pr.append("  - " + author.getName() + "\n");
    }

    pr.append("Links: \n");
    for (SyndLink link : FeedHelper.getLinks(entry)) {
      pr.append("  - " + link.getTitle() + ": " + link.getHref() + "\n");
    }

    SyndContent description = entry.getDescription();
    if (description != null) {
      pr.append("\nDescription(" + description.getType() + "): " + description.getValue());
    }

    pr.append("Contents: \n");
    for (SyndContent content : FeedHelper.getContents(entry)) {
      pr.append(" Type: " + content.getType());
      pr.append(" Body: " + content.getValue());
      try {
        pr.append(
            " Body (Plain text): "
                + PlainTextExtractor.getPlainText(content.getType(), content.getValue()));
      } catch (ParserException e) {
        pr.append("Failed to parse content");
      }
    }

    pr.append("Categories: \n");
    for (SyndCategory category : FeedHelper.getCategories(entry)) {
      pr.append(category.getName() + "(" + category.getTaxonomyUri() + ")");
    }
    log.trace(pr.toString());
  }
  protected void _testItem(int i) throws Exception {
    super._testItem(i);
    List items = getCachedSyndFeed().getEntries();
    SyndEntry entry = (SyndEntry) items.get(i);

    assertProperty(entry.getTitle(), "channel.item[" + i + "].title");
    assertProperty(entry.getLink(), "channel.item[" + i + "].link");
    assertProperty(entry.getDescription().getValue(), "channel.item[" + i + "].description");
    _testCategories(entry.getCategories(), "channel.item[" + i + "]");
    _testEnclosures(entry.getEnclosures(), "channel.item[" + i + "]");
  }
  @Override
  protected void testItem(final int i) throws Exception {
    super.testItem(i);
    final List<SyndEntry> items = this.getCachedSyndFeed().getEntries();
    final SyndEntry entry = items.get(i);

    assertProperty(entry.getTitle(), "channel.item[" + i + "].title");
    assertProperty(entry.getLink(), "channel.item[" + i + "].link");
    assertProperty(entry.getDescription().getValue(), "channel.item[" + i + "].description");
    testCategories(entry.getCategories(), "channel.item[" + i + "]");
    testEnclosures(entry.getEnclosures(), "channel.item[" + i + "]");
  }
 public static List<News> getNews(String categoryLink) {
   ArrayList<SyndEntry> syndEntrys = getSyndEntrys(categoryLink);
   String link, description, title;
   List<News> listNews = new ArrayList<News>();
   for (SyndEntry entry : syndEntrys) {
     title = entry.getTitle();
     link = entry.getLink();
     description = entry.getDescription().getValue();
     listNews.add(new News(title, link, description));
   }
   return listNews;
 }
  public void mouseClicked(MouseEvent e) {
    Point p = e.getPoint();

    int row = appWindow.feedItems.rowAtPoint(p);
    int column = appWindow.feedItems.columnAtPoint(p);

    ListFeed channel = (ListFeed) appWindow.channelsList.getSelectedValue();
    SyndFeed feed = channel.feed;
    SyndEntry item = (SyndEntry) feed.getEntries().get(row);

    if (e.getClickCount() == 2) {
      // open in browser window
    } else if (e.getClickCount() == 1) {
      appWindow.itemDetails.setText(item.getDescription().getValue());
    }
  }
  protected Item createRSSItem(SyndEntry sEntry) {
    Item item = super.createRSSItem(sEntry);

    SyndContent sContent = sEntry.getDescription();
    if (sContent != null) {
      item.setDescription(createItemDescription(sContent));
    }
    List contents = sEntry.getContents();
    if (contents != null && contents.size() > 0) {
      SyndContent syndContent = (SyndContent) contents.get(0);
      Content cont = new Content();
      cont.setValue(syndContent.getValue());
      cont.setType(syndContent.getType());
      item.setContent(cont);
    }
    return item;
  }
Beispiel #11
0
  @SuppressWarnings("unchecked")
  public void test() throws Exception {
    final SyndFeedInput input = new SyndFeedInput(true);
    final SyndFeed feed = input.build(new File("c:\\temp\\google.xml"));

    logger.debug("Successfully parsed the RSS feed");
    logger.debug("Author      = " + feed.getAuthors());
    logger.debug("Categories  = " + feed.getCategories());
    final List<SyndEntry> entries = feed.getEntries();
    for (final SyndEntry entry : entries) {
      logger.debug("Title = " + StringEscapeUtils.unescapeHtml(entry.getTitle()));
      logger.debug(
          "Description = " + StringEscapeUtils.unescapeHtml(entry.getDescription().getValue()));
      logger.debug(entry.getUri());
      logger.debug("Updated date = " + entry.getUpdatedDate());
      logger.debug("Published date = " + entry.getPublishedDate());
      logger.debug("====================================================");
    }
  }
  private Article mapArticle(SyndEntry syndEntry) {
    StringBuilder sb = new StringBuilder();
    for (Object obj : syndEntry.getContents()) {
      if (!(obj instanceof SyndContent)) {
        continue;
      }
      SyndContent syndContent = (SyndContent) obj;
      sb.append(syndContent.getValue());
    }

    return Article.builder()
        .num(0)
        .title(syndEntry.getTitle())
        .content(sb.toString())
        .description(syndEntry.getDescription().getValue())
        .author(syndEntry.getAuthor())
        .image("")
        .writtenDate(syndEntry.getPublishedDate())
        .build();
  }
  /** @see com.elia.rssexample.data.NewsDao */
  @SuppressWarnings("unchecked")
  public List<NewsItem> getNewsList() {

    // TODO: exception handling

    log.trace("Enter getNewsList().");

    List<NewsItem> newsList = new ArrayList<NewsItem>();

    XmlReader reader = null;
    try {
      for (String rssUrl : rssUrlList) {

        reader = new XmlReader(new URL(rssUrl));
        SyndFeed feed = new SyndFeedInput().build(reader);

        for (SyndEntry entry : (List<SyndEntry>) feed.getEntries()) {

          NewsItem item = new NewsItem();

          item.setTitle(entry.getTitle());
          item.setDescription(entry.getDescription().getValue());
          item.setLink(entry.getLink());
          item.setPublished(entry.getPublishedDate());

          newsList.add(item);
        }
      }
    } catch (Exception e) {
      log.error("Error reading feed.", e);
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
        log.warn("Unable to close xml reader.", e);
      }
    }

    return newsList;
  }
Beispiel #14
0
  @SuppressWarnings("unchecked")
  @Trigger("!buzz")
  @Help("Fetches one of the latest posts from jeanmarcmorandini.com")
  public List<String> getLatestBuzz() {
    List<String> toReturn = new ArrayList<String>();
    try {
      URL url = new URL("http://www.jeanmarcmorandini.com/rss.php");
      SyndFeedInput input = new SyndFeedInput();
      SyndFeed rss = input.build(new XmlReader(url));

      Iterator<SyndEntry> it = rss.getEntries().iterator();
      String message = null;
      while (it.hasNext()) {
        SyndEntry item = it.next();
        String guid = item.getUri();
        RSSFeed buzz = dao.findByGUID(guid);
        if (buzz == null) {
          buzz = new RSSFeed();
          buzz.setGuid(item.getUri());
          dao.save(buzz);
          String urlBitly = utilsService.bitly(item.getLink());
          String content = Jsoup.parse(item.getDescription().getValue()).select("p").get(0).text();
          message = IRCUtils.bold("EXCLU!") + " " + item.getTitle() + " - " + urlBitly;
          toReturn.add(message);
          toReturn.add(content);
          break;
        }
      }

      if (message == null) {
        toReturn.add("Pas d'exclus pour le moment.");
      }

    } catch (Exception e) {
      LOG.handle(e);
    }

    return toReturn;
  }
Beispiel #15
0
 public String getDescription() {
   return entry.getDescription().getValue();
 }
Beispiel #16
0
  /**
   * Generate an ebook from an RSS DOM Document.
   *
   * @param url The URL from where the Document was fetched (used only to set the author metadata)
   * @param doc The DOM Document of the feed.
   * @return An ebook.
   * @throws IllegalArgumentException
   * @throws FeedException
   * @throws IOException
   */
  private static Book createBookFromFeed(URL url, Document doc, List<Keyword> keywords)
      throws IllegalArgumentException, FeedException, IOException {
    Book book = new Book();
    // start parsing our feed and have the above onItem methods called
    SyndFeedInput input = new SyndFeedInput();
    SyndFeed feed = input.build(doc);

    System.out.println(feed);

    // Set the title
    book.getMetadata().addTitle(feed.getTitle());

    // Add an Author
    String author = feed.getAuthor();
    if (author == null || "".equals(author.trim())) {
      author = url.getHost();
    }
    book.getMetadata().addAuthor(new Author(author));

    if (feed.getPublishedDate() != null) {
      book.getMetadata().addDate(new nl.siegmann.epublib.domain.Date(feed.getPublishedDate()));
    }

    if (feed.getDescription() != null) {
      book.getMetadata().addDescription(feed.getDescription());
    }

    if (feed.getCopyright() != null) {
      book.getMetadata().getRights().add(feed.getCopyright());
    }

    // Set cover image - This has never worked.
    // if (feed.getImage() != null) {
    // System.out.println("There is an image for the feed");

    // Promise<HttpResponse> futureImgResponse =
    // WS.url(feed.getImage().getUrl()).getAsync();
    // HttpResponse imgResponse = await(futureImgResponse);
    // System.out.println("Content-type: " + imgResponse.getContentType());
    // if (imgResponse.getContentType().startsWith("image/")) {
    // String extension =
    // imgResponse.getContentType().substring("image/".length());
    // InputStream imageStream = imgResponse.getStream();
    // book.getMetadata().setCoverImage(new Resource(imageStream, "cover." +
    // extension));

    // System.out.println("Using default cover");
    // imageStream =
    // VirtualFile.fromRelativePath("assets/cover.png").inputstream();
    // if (imageStream != null) {
    // System.out.println("Using default cover");
    // book.getMetadata().setCoverImage(new Resource(imageStream,
    // "cover.png"));
    // } else {
    // System.out.println("Could not load default cover");
    // }

    // }
    // }

    int entryNumber = 0;
    List<SyndEntry> entries = feed.getEntries();

    for (SyndEntry entry : entries) {
      if (matchesKeyword(entry, keywords)) {

        StringBuilder title = new StringBuilder(100);
        if (entry.getTitle() != null) {
          title.append(entry.getTitle());
        }
        if (entry.getAuthor() != null) {
          title.append(" - ").append(entry.getAuthor());
        }
        StringBuilder content = new StringBuilder();

        // Add title inside text
        content.append("<h2>").append(title).append("</h2>");

        if (entry.getDescription() != null) {
          SyndContent syndContent = (SyndContent) entry.getDescription();
          if (!syndContent.getType().contains("html")) {
            content.append("<pre>\n");
          }
          content.append(syndContent.getValue());
          if (!syndContent.getType().contains("html")) {
            content.append("\n</pre>");
          }
          content.append("<hr/>");
        }

        if (entry.getContents().size() > 0) {
          SyndContent syndContent = (SyndContent) entry.getContents().get(0);
          if (!syndContent.getType().contains("html")) {
            content.append("<pre>\n");
          }
          content.append(syndContent.getValue());
          if (!syndContent.getType().contains("html")) {
            content.append("\n</pre>");
          }
        }
        String strContent = clean(content.toString());
        // Add Chapter
        try {
          entryNumber++;
          book.addSection(
              title.toString(),
              new Resource(new StringReader(strContent), "entry" + entryNumber + ".xhtml"));
        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }

    return book;
  }
Beispiel #17
0
  // Build the feed list
  @SuppressWarnings("unchecked")
  private void buildFeedList(LinkedList<SyndFeed> syndFeeds, SourcePojo source) {
    // If there's a max number of sources to get per harvest, configure that here:
    long nWaitTime_ms = props.getWebCrawlWaitTime();
    long nMaxTime_ms =
        props.getMaxTimePerFeed(); // (can't override this, too easy to break the system...)
    int nMaxDocsPerSource = props.getMaxDocsPerSource();
    long nNow = new Date().getTime();
    if (null != source.getRssConfig()) {
      if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
        nWaitTime_ms = source.getRssConfig().getWaitTimeOverride_ms();
      }
    }
    long nMaxDocs = Long.MAX_VALUE;
    if (nWaitTime_ms > 0) {
      nMaxDocs = nMaxTime_ms / nWaitTime_ms;
    }
    if (nMaxDocs > nMaxDocsPerSource) { // (another limit, take the smaller of the 2)
      nMaxDocs = nMaxDocsPerSource;
    }
    // (end per feed configuration)

    // Add extra docs
    List<SyndEntry> tmpList = null;
    boolean bCreatedAggregateList = false;
    int nRealSyndEntries = 0;

    for (SyndFeed feed : syndFeeds) {
      if (0 == nRealSyndEntries) {
        tmpList = feed.getEntries();
      } else if (!bCreatedAggregateList) {
        bCreatedAggregateList = true;
        tmpList = new LinkedList<SyndEntry>(tmpList);
        tmpList.addAll(feed.getEntries());
      } else {
        tmpList.addAll(feed.getEntries());
      }
      nRealSyndEntries += feed.getEntries().size();
    }
    if (null == tmpList) {
      tmpList = new LinkedList<SyndEntry>();
    }
    // TESTED

    if ((null != source.getRssConfig()) && (null != source.getRssConfig().getExtraUrls())) {
      for (ExtraUrlPojo extraUrl : source.getRssConfig().getExtraUrls()) {
        if (null == extraUrl.title) {
          continue; // (this is an RSS feed not a URL)
        } // TESTED
        SyndEntryImpl synd = new SyndEntryImpl();
        synd.setLink(extraUrl.url);
        if (null != extraUrl.description) {
          SyndContentImpl description = new SyndContentImpl();
          description.setValue(extraUrl.description);
          synd.setDescription(description);
        }
        synd.setTitle(extraUrl.title);
        if (null != extraUrl.publishedDate) {
          try {
            synd.setPublishedDate(new Date(DateUtility.parseDate(extraUrl.publishedDate)));
          } catch (Exception e) {
          } // do nothign will use now as pub date
        }
        tmpList.add((SyndEntry) synd);

        if (null != extraUrl.fullText) {
          SyndFeedImpl fullTextContainer = new SyndFeedImpl();
          fullTextContainer.setDescription(extraUrl.fullText);
          synd.setSource(fullTextContainer);
        }
      }
    }

    // Then begin looping over entries

    LinkedList<String> duplicateSources = new LinkedList<String>();
    try {
      Map<String, List<SyndEntry>> urlDups = new HashMap<String, List<SyndEntry>>();
      int nSyndEntries = 0;
      for (Object synd : tmpList) {
        nSyndEntries++; // (keep count so we know we're accessing our own fake SyndEntryImpls)
        final SyndEntry entry = (SyndEntry) synd;

        if (null != entry.getLink()) // if url returns null, skip this entry
        {
          String url = this.cleanUrlStart(entry.getLink());

          if (null != source.getRssConfig()) { // Some RSS specific logic
            // If an include is specified, must match
            Matcher includeMatcher = source.getRssConfig().getIncludeMatcher(url);
            if (null != includeMatcher) {
              if (!includeMatcher.find()) {
                continue;
              }
            }
            // If an exclude is specified, must not match
            Matcher excludeMatcher = source.getRssConfig().getExcludeMatcher(url);
            if (null != excludeMatcher) {
              if (excludeMatcher.find()) {
                continue;
              }
            }
          }

          // Some error checking:
          // sometimes the URL seems to have some characters in front of the HTTP - remove these
          this.nTmpDocsSubmitted++;
          if (null == url) {
            this.nTmpHttpErrors++;
            continue;
          }

          // Also save the title and description:
          String title = "";
          if (null != entry.getTitle()) {
            title = entry.getTitle();
          }
          String desc = "";
          if (null != entry.getDescription()) {
            desc = entry.getDescription().getValue();
          }
          boolean duplicate = false;

          // Look for duplicates within the current set of sources
          List<SyndEntry> possDups = null;
          if (null == (possDups = urlDups.get(url))) { // (new URL)
            possDups = new LinkedList<SyndEntry>();
            possDups.add(entry);
            urlDups.put(url, possDups);
          } else { // (old URL, check if this is a duplicate...)
            int nCount = 0;
            for (SyndEntry possDup : possDups) {
              if (possDup.getTitle().equals(title)
                  || ((null != possDup.getDescription())
                      && possDup.getDescription().getValue().equals(desc))
                  || ((null != possDup.getDescription()) && (null == entry.getDescription()))) {
                // If *either* the title or the description matches as well as the URL...
                duplicate = true;
                break;
              }
              nCount++;
            }

            if (!duplicate) {
              possDups.add(entry);
            } else { // DUPLICATE: ensure we have minimal set of data to cover all cases:
              boolean bTitleMatch = false;
              boolean bDescMatch = false;
              for (SyndEntry possDup : possDups) {
                if (!bTitleMatch
                    && possDup
                        .getTitle()
                        .equals(title)) { // (don't bother if already have a title match)
                  bTitleMatch = true;
                } else if (!bDescMatch) { // (don't yet have a desc match(
                  if (null != entry.getDescription()) {
                    if (null != possDup.getDescription()) { // (neither desc is null)
                      if (possDup.getDescription().getValue().equals(desc)) {
                        bDescMatch = true;
                      }
                    }
                  } else { // curr desc is null
                    if (null == possDup.getDescription()) { // dup desc is null
                      bDescMatch = true;
                    }
                  } // (end various title match/desc match/both have no desc cases
                } // (end if no desc match)
                if (bTitleMatch && bDescMatch) {
                  break; // (no way can fire)
                }
              } // (end loop over dups)

              if (!bTitleMatch || !bDescMatch) {
                possDups.add(entry);
              }
            } // (end is duplicate, nasty logic to add minimal set to dup list to cover all titles,
              // descs)
          }
          if (duplicate) {
            continue;
          }

          try {
            DuplicateManager qr = _context.getDuplicateManager();
            if (null != entry.getDescription()) {
              duplicate =
                  qr.isDuplicate_UrlTitleDescription(
                      url,
                      title.replaceAll("\\<.*?\\>", "").trim(),
                      desc.replaceAll("\\<.*?\\>", "").trim(),
                      source,
                      duplicateSources);
            } else {
              duplicate =
                  qr.isDuplicate_UrlTitleDescription(
                      url,
                      title.replaceAll("\\<.*?\\>", "").trim(),
                      null,
                      source,
                      duplicateSources);
              // ^^^(this is different to isDuplicate_UrlTitle because it enforces that the
              // description be null, vs just checking the title)
            }
            if (duplicate
                && (null != source.getRssConfig())
                && (null != source.getRssConfig().getUpdateCycle_secs())) {
              // Check modified times...
              Date dupModDate = qr.getLastDuplicateModifiedTime();
              ObjectId dupId = qr.getLastDuplicateId();

              if ((null != dupModDate) && (null != dupId)) {
                if (dupModDate.getTime() + source.getRssConfig().getUpdateCycle_secs() * 1000
                    < nNow) {

                  DocumentPojo doc = buildDocument(entry, source, duplicateSources);
                  if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                    // (Use dummy TitleEx to create a "fake" full text block)
                    doc.setFullText(entry.getSource().getDescription());
                  }
                  doc.setUpdateId(dupId); // (set _id to document I'm going to overwrite)
                  this.docsToUpdate.add(doc);

                  if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                    source.setReachedMaxDocs();
                    break; // (that's enough documents)
                  }
                }
              }
            } // TESTED (duplicates we update instead of ignoring)

            if (!duplicate) {
              DocumentPojo doc = buildDocument(entry, source, duplicateSources);
              if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                // (Use dummy TitleEx to create a "fake" full text block)
                doc.setFullText(entry.getSource().getDescription());
              }
              this.docsToAdd.add(doc);

              if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                source.setReachedMaxDocs();
                break; // (that's enough documents)
              }
            }
            if (this.nTmpDocsSubmitted > 20) { // (some arbitrary "significant" number)
              if (nTmpHttpErrors == this.nTmpDocsSubmitted) {
                break;
              }
            }
          } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
          }
        }
      } // (end loop over feeds in a syndicate)
    } catch (Exception e) {
      // If an exception occurs log the error
      logger.error("Exception Message: " + e.getMessage(), e);
    }
  }
Beispiel #18
0
  private DocumentPojo buildDocument(
      SyndEntry entry, SourcePojo source, LinkedList<String> duplicateSources) {

    String tmpURL = this.cleanUrlStart(entry.getLink().toString());
    // (can't return null because called from code which checks this)

    // create the feed pojo
    DocumentPojo doc = new DocumentPojo();

    doc.setUrl(tmpURL);
    doc.setCreated(new Date());
    doc.setModified(new Date());

    // Strip out html if it is present
    if (entry.getTitle() != null) doc.setTitle(entry.getTitle().replaceAll("\\<.*?\\>", "").trim());
    if (entry.getDescription() != null)
      doc.setDescription(entry.getDescription().getValue().replaceAll("\\<.*?\\>", "").trim());
    if (entry.getPublishedDate() != null) {
      doc.setPublishedDate(entry.getPublishedDate());
    } else {
      doc.setPublishedDate(new Date());
    }

    // Clone from an existing source if we can:
    if (!duplicateSources.isEmpty()
        && (null == doc.getUpdateId())) { // (can't duplicate updating document)
      doc.setDuplicateFrom(duplicateSources.getFirst());
    }

    // GeoRSS
    GeoRSSModule geoRSSModule =
        GeoRSSUtils.getGeoRSS(entry); // currently does not handle <georss:circle>
    if (null != geoRSSModule) {
      if (null != geoRSSModule.getPosition()) {
        double lat = geoRSSModule.getPosition().getLatitude();
        double lon = geoRSSModule.getPosition().getLongitude();
        GeoPojo gp = new GeoPojo();
        gp.lat = lat;
        gp.lon = lon;
        doc.setDocGeo(gp);
      }
      if (null != geoRSSModule.getGeometry()) {
        AbstractGeometry ag = geoRSSModule.getGeometry();
        if (ag.getClass().equals(new LineString().getClass())) { // <georss:line>
          LineString ls = ((LineString) geoRSSModule.getGeometry());

          double latAvg = 0.0;
          double lonAvg = 0.0;
          int length = ls.getPositionList().size();
          for (int i = 0; i < length; i++) {
            latAvg += ls.getPositionList().getLatitude(i);
            lonAvg += ls.getPositionList().getLongitude(i);
          }
          latAvg = latAvg / length;
          lonAvg = lonAvg / length;
          GeoPojo gp = new GeoPojo();
          gp.lat = latAvg;
          gp.lon = lonAvg;
          doc.setDocGeo(gp);
        } else if (ag.getClass().equals(new Polygon().getClass())) // <georss:polygon>
        {
          Polygon poly = ((Polygon) geoRSSModule.getGeometry());
          AbstractRing ar = poly.getExterior();
          LinearRing lr = (LinearRing) ar;

          double latAvg = 0.0;
          double lonAvg = 0.0;
          int length = lr.getPositionList().size();
          for (int i = 0; i < length; i++) {
            latAvg += lr.getPositionList().getLatitude(i);
            lonAvg += lr.getPositionList().getLongitude(i);
          }
          latAvg = latAvg / length;
          lonAvg = lonAvg / length;
          GeoPojo gp = new GeoPojo();
          gp.lat = latAvg;
          gp.lon = lonAvg;
          doc.setDocGeo(gp);
        } else if (ag.getClass().equals(new Envelope().getClass())) { // <georss:box>
          Envelope env = ((Envelope) geoRSSModule.getGeometry());

          double latAvg = (env.getMaxLatitude() + env.getMinLatitude()) / 2;
          double lonAvg = (env.getMaxLongitude() + env.getMinLongitude()) / 2;

          GeoPojo gp = new GeoPojo();
          gp.lat = latAvg;
          gp.lon = lonAvg;
          doc.setDocGeo(gp);
        }
      }
    } // end if GeoRSS

    // Arbitrary other metadata:

    if (null != entry.getForeignMarkup()) {
      JSONObject rssMetadata = new JSONObject();

      @SuppressWarnings("unchecked")
      List<Element> fms = (List<Element>) entry.getForeignMarkup();
      for (Element fm : fms) {
        try {
          JSONObject subObj = XML.toJSONObject(new XMLOutputter().outputString(fm));
          if (1 == subObj.length()) {
            for (String name : JSONObject.getNames(subObj)) {
              rssMetadata.put(name, subObj.get(name));
            }
          } else { // (this will never happen in practice?)
            rssMetadata.put(fm.getName(), subObj);
          }
        } catch (JSONException e) {
        } // (do nothing just carry on)
      }
      if (!fms.isEmpty()) {
        doc.addToMetadata(
            "_FEED_METADATA_", XmlToMetadataParser.convertJsonObjectToLinkedHashMap(rssMetadata));
      }
    } // TESTED (longs converted to string, eg edgar:assistantDirector from
      // "http.www.sec.gov.archives.edgar.usgaap.rss.xml")

    return doc;
  }
Beispiel #19
0
  //	@Transactional
  //	@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
  public void importEntry(
      final SyndFeed feed,
      final SyndEntry entry,
      final Set<KiWiUriResource> types,
      final Set<ContentItem> tags,
      User user,
      final Collection<ContentItem> output) {
    if (user == null && entry.getAuthor() != null && !"".equals(entry.getAuthor())) {
      if (userService.userExists(entry.getAuthor())) {
        user = userService.getUserByLogin(entry.getAuthor());
      } else {

        //				user = userService.createUser(entry.getAuthor());
        /* In my opinion, it is not ok to create a user entity
         * without asking the person if he/she wants to be
         * created and persisted in the KiWi dataset.
         * Thus I'm changing the user to 'anonymous',
         * if he/she is'nt registered with the same nick that
         * is given in the rss entry.
         */
        user = userService.getUserByLogin("anonymous");
        kiwiEntityManager.persist(user);
      }
    }

    log.debug("feed entry: #0 (#1)", entry.getTitle(), entry.getUri());

    // create a new content item and copy all data from the feed entry
    ContentItem item;
    if (entry.getLink() != null) {
      item = contentItemService.createExternContentItem(entry.getLink());
    } else if (entry.getUri() != null) {
      try {
        // try parsing URI; if it is not valid,
        URI uri = new URI(entry.getUri());
        item = contentItemService.createExternContentItem(entry.getUri());
      } catch (URISyntaxException e) {
        item = contentItemService.createExternContentItem(feed.getLink() + "#" + entry.getUri());
      }
    } else {
      item = contentItemService.createContentItem();
    }
    contentItemService.updateTitle(item, entry.getTitle());

    if (feed.getLanguage() != null) item.setLanguage(new Locale(feed.getLanguage()));

    if (entry.getPublishedDate() != null) {
      item.setCreated(entry.getPublishedDate());
      item.setModified(entry.getPublishedDate());
    }

    if (entry.getUpdatedDate() != null) {
      if (entry.getPublishedDate() == null) {
        item.setCreated(entry.getUpdatedDate());
      }
      item.setModified(entry.getUpdatedDate());
    }

    item.setAuthor(user);

    // read feed content and set it as item's text content
    List<SyndContent> contents = entry.getContents();
    if (contents.size() == 1) {
      log.debug("using RSS content section provided by item");
      contentItemService.updateTextContentItem(item, "<p>" + contents.get(0).getValue() + "</p>");
    } else if (contents.size() > 1) {
      log.warn("feed entry contained more than one content section");
      contentItemService.updateTextContentItem(item, "<p>" + contents.get(0).getValue() + "</p>");
    } else if (contents.size() == 0) {
      if (entry.getDescription() != null && entry.getDescription().getValue() != null) {
        log.debug("using RSS description as no content section was available");
        contentItemService.updateTextContentItem(
            item, "<p>" + entry.getDescription().getValue() + "</p>");
      }
    }

    // save before tagging
    contentItemService.saveContentItem(item);

    // read feed categories and use them as tags
    for (SyndCategory cat : (List<SyndCategory>) entry.getCategories()) {
      ContentItem _cat;
      if (!taggingService.hasTag(item, cat.getName())) {
        if (cat.getTaxonomyUri() != null) {
          _cat = contentItemService.getContentItemByUri(cat.getTaxonomyUri());
          if (_cat == null) {
            _cat = contentItemService.createExternContentItem(cat.getTaxonomyUri());
            contentItemService.updateTitle(_cat, cat.getName());
            _cat.setAuthor(user);
            contentItemService.saveContentItem(_cat);
          }
          taggingService.createTagging(cat.getName(), item, _cat, user);
        } else {
          _cat = contentItemService.getContentItemByTitle(cat.getName());
          if (_cat == null) {
            _cat = contentItemService.createContentItem();
            contentItemService.updateTitle(_cat, cat.getName());
            _cat.setAuthor(user);
            contentItemService.saveContentItem(_cat);
          }
          taggingService.createTagging(cat.getName(), item, _cat, user);
        }
      }
    }
    // scan for Twitter-style hash tags in title (e.g. #kiwiknows, see KIWI-622)
    Matcher m_hashtag = p_hashtag.matcher(entry.getTitle());
    while (m_hashtag.find()) {
      String tag_label = m_hashtag.group(1);
      if (!taggingService.hasTag(item, tag_label)) {
        ContentItem tag = contentItemService.getContentItemByTitle(tag_label);
        if (tag == null) {
          tag = contentItemService.createContentItem();
          contentItemService.updateTitle(tag, tag_label);
          tag.setAuthor(user);
          contentItemService.saveContentItem(tag);
        }
        taggingService.createTagging(tag_label, item, tag, user);
      }
    }

    // check for geo information
    GeoRSSModule geoRSSModule = GeoRSSUtils.getGeoRSS(entry);
    if (geoRSSModule != null && geoRSSModule.getPosition() != null) {
      POI poi = kiwiEntityManager.createFacade(item, POI.class);
      poi.setLatitude(geoRSSModule.getPosition().getLatitude());
      poi.setLongitude(geoRSSModule.getPosition().getLongitude());
      kiwiEntityManager.persist(poi);
    }

    // check for media information
    MediaEntryModule mediaModule = (MediaEntryModule) entry.getModule(MediaModule.URI);
    if (mediaModule != null) {
      MediaContent[] media = mediaModule.getMediaContents();
      if (media.length > 0) {
        MediaContent m = media[0];
        if (m.getReference() instanceof UrlReference) {
          URL url = ((UrlReference) m.getReference()).getUrl();

          String type = m.getType();
          String name = url.getFile();
          if (name.lastIndexOf("/") > 0) {
            name = name.substring(name.lastIndexOf("/") + 1);
          }

          log.debug("importing media data from URL #0", url.toString());

          try {
            InputStream is = url.openStream();

            ByteArrayOutputStream bout = new ByteArrayOutputStream();

            int c;
            while ((c = is.read()) != -1) {
              bout.write(c);
            }

            byte[] data = bout.toByteArray();

            contentItemService.updateMediaContentItem(item, data, type, name);

            is.close();
            bout.close();
          } catch (IOException ex) {
            log.error("error importing media content from RSS stream");
          }
        } else {
          log.info("RSS importer can only import media with URL references");
        }
      } else {
        log.warn("media module found without content");
      }

      Category[] cats = mediaModule.getMetadata().getCategories();
      for (Category cat : cats) {
        ContentItem _cat;

        String label = cat.getLabel() != null ? cat.getLabel() : cat.getValue();

        if (!taggingService.hasTag(item, label)) {
          if (cat.getScheme() != null) {
            _cat = contentItemService.getContentItemByUri(cat.getScheme() + cat.getValue());
            if (_cat == null) {
              _cat = contentItemService.createExternContentItem(cat.getScheme() + cat.getValue());
              contentItemService.updateTitle(_cat, label);
              _cat.setAuthor(user);
              contentItemService.saveContentItem(_cat);
            }
            taggingService.createTagging(label, item, _cat, user);
          } else {
            _cat = contentItemService.getContentItemByTitle(label);
            if (_cat == null) {
              _cat = contentItemService.createContentItem();
              contentItemService.updateTitle(_cat, label);
              _cat.setAuthor(user);
              contentItemService.saveContentItem(_cat);
            }
            taggingService.createTagging(label, item, _cat, user);
          }
        }
      }
    }

    // add parameter categories as tags
    for (ContentItem tag : tags) {
      if (!taggingService.hasTag(item, tag.getTitle())) {
        taggingService.createTagging(tag.getTitle(), item, tag, user);
      }
    }

    // add parameter types as types
    for (KiWiUriResource type : types) {
      item.addType(type);
    }

    // add kiwi:FeedPost type
    item.addType(tripleStore.createUriResource(Constants.NS_KIWI_CORE + "FeedPost"));

    /* the flush is necessary, because CIs or tags will
     * otherwise be created multiple times when they
     * appear more than once in one RSS feed */
    entityManager.flush();
    log.debug("imported content item '#0' with URI '#1'", item.getTitle(), item.getResource());
  }
Beispiel #20
0
  // build a SubscriptionEntry from Rome SyndEntry and SyndFeed
  private SubscriptionEntry buildEntry(SyndEntry romeEntry) {

    // if we don't have a permalink then we can't continue
    if (romeEntry.getLink() == null) {
      return null;
    }

    SubscriptionEntry newEntry = new SubscriptionEntry();

    newEntry.setTitle(romeEntry.getTitle());
    newEntry.setPermalink(romeEntry.getLink());

    // Play some games to get the author
    DCModule entrydc = (DCModule) romeEntry.getModule(DCModule.URI);
    if (romeEntry.getAuthor() != null) {
      newEntry.setAuthor(romeEntry.getAuthor());
    } else {
      newEntry.setAuthor(entrydc.getCreator()); // use <dc:creator>
    }

    // Play some games to get the updated date
    if (romeEntry.getUpdatedDate() != null) {
      newEntry.setUpdateTime(new Timestamp(romeEntry.getUpdatedDate().getTime()));
    }
    // TODO: should we set a default update time here?

    // And more games getting publish date
    if (romeEntry.getPublishedDate() != null) {
      newEntry.setPubTime(new Timestamp(romeEntry.getPublishedDate().getTime())); // use <pubDate>
    } else if (entrydc != null && entrydc.getDate() != null) {
      newEntry.setPubTime(new Timestamp(entrydc.getDate().getTime())); // use <dc:date>
    } else {
      newEntry.setPubTime(newEntry.getUpdateTime());
    }

    // get content and unescape if it is 'text/plain'
    if (romeEntry.getContents().size() > 0) {
      SyndContent content = (SyndContent) romeEntry.getContents().get(0);
      if (content != null && content.getType().equals("text/plain")) {
        newEntry.setText(StringEscapeUtils.unescapeHtml(content.getValue()));
      } else if (content != null) {
        newEntry.setText(content.getValue());
      }
    }

    // no content, try summary
    if (newEntry.getText() == null || newEntry.getText().trim().length() == 0) {
      if (romeEntry.getDescription() != null) {
        newEntry.setText(romeEntry.getDescription().getValue());
      }
    }

    // copy categories
    if (romeEntry.getCategories().size() > 0) {
      List list = new ArrayList();
      Iterator cats = romeEntry.getCategories().iterator();
      while (cats.hasNext()) {
        SyndCategory cat = (SyndCategory) cats.next();
        list.add(cat.getName());
      }
      newEntry.setCategoriesString(list);
    }

    return newEntry;
  }