/**
 * Copies the attributes of a parsed feed entry onto an episode.
 *
 * <p>Sets, in order: description (markup tags and line breaks stripped, truncated to
 * {@code MAX_LENGTH_DESCRIPTION}), author, title (double quotes removed, truncated to
 * {@code MAX_PERMITTED_TITLE_LENGTH}), the URL-friendly title (truncated to
 * {@code TITLE_IN_URL_MAX_LENGTH}), link, media type, media URL / length / enclosure type
 * from the entry's enclosures, and publication date. Afterwards
 * {@code updatePodcastPublicationDateAndLastMediaUrl(episode, podcast)} is invoked.
 *
 * <p>When no media URL could be determined the episode gets the placeholder
 * {@code "noMediaUrl"} and a warning is logged; a warning is also logged when the entry
 * carries no publication date.
 *
 * @param episode the episode to populate
 * @param podcast the podcast the episode belongs to; supplies the default media type and
 *     is used for log context
 * @param entry the feed entry the values are read from
 */
public void setEpisodeAttributes(Episode episode, Podcast podcast, SyndEntry entry) {
    // set DESCRIPTION for episode - used in search
    String episodeDesc = null;
    if (entry.getDescription() != null) {
      // a description element may be present without any text value
      episodeDesc = entry.getDescription().getValue();
    }
    if (episodeDesc != null) {
      // tags are removed from description
      String descWithoutTags = episodeDesc.replaceAll("\\<[^>]*>", "");
      // line breaks (both \r and \n) are removed from description - for player
      String descWithoutEndOfLine = descWithoutTags.replaceAll("[\\r\\n]", "");
      if (descWithoutEndOfLine.length() > MAX_LENGTH_DESCRIPTION) {
        descWithoutEndOfLine = descWithoutEndOfLine.substring(0, MAX_LENGTH_DESCRIPTION);
      }
      episode.setDescription(descWithoutEndOfLine);
    }

    // set author
    episode.setAuthor(entry.getAuthor());

    // set title for episode - used in search
    String episodeTitle = entry.getTitle();
    if (episodeTitle != null) {
      // removes quotes to display properly in player
      episodeTitle = episodeTitle.replaceAll("\"", "");
      if (episodeTitle.length() > MAX_PERMITTED_TITLE_LENGTH) {
        episodeTitle = episodeTitle.substring(0, MAX_PERMITTED_TITLE_LENGTH);
      }
      episode.setTitle(episodeTitle);

      // keep only letters, digits, dashes, whitespace and dots ...
      String titleInUrl = episodeTitle.trim().replaceAll("[^a-zA-Z0-9\\-\\s\\.]", "");
      // ... then collapse every run of dashes, whitespace (tabs and line breaks included,
      // not just the space character) and dots into a single dash
      titleInUrl = titleInUrl.replaceAll("[\\-\\s\\.]+", "-");
      if (titleInUrl.length() > TITLE_IN_URL_MAX_LENGTH) {
        titleInUrl = titleInUrl.substring(0, TITLE_IN_URL_MAX_LENGTH);
      }
      episode.setTitleInUrl(titleInUrl);
    }

    // set link attribute
    episode.setLink(entry.getLink());

    // in the beginning inherit the media type from the podcast
    episode.setMediaType(podcast.getMediaType());

    // get the list of enclosures
    @SuppressWarnings("unchecked")
    List<SyndEnclosure> enclosures = (List<SyndEnclosure>) entry.getEnclosures();

    List<String> audioMimeTypesList = Arrays.asList(audioMimeTypesArray);
    List<String> videoMimeTypesList = Arrays.asList(videoMimeTypesArray);

    // placeholder stored when the entry offers no usable media url
    final String noMediaUrl = "noMediaUrl";

    // set media url for the episode - this will be played in the player
    if (enclosures != null) {
      // the url/length/type of each enclosure overwrite the previous one until an
      // enclosure with a known audio or video mime type is found, which then wins
      for (SyndEnclosure enclosure : enclosures) {
        if (enclosure == null) {
          continue;
        }
        episode.setMediaUrl(enclosure.getUrl());
        if (enclosure.getLength() >= 0) {
          episode.setLength(enclosure.getLength());
        }
        if (enclosure.getType() == null) {
          continue;
        }
        // the podcast's media type was chosen when the podcast was added, but the
        // podcast may mix audio and video, so the enclosure's own type has priority
        String enclosureType = enclosure.getType().trim();
        episode.setEnclosureType(enclosureType);
        if (audioMimeTypesList.contains(enclosureType)) {
          episode.setMediaType(MediaType.Audio);
          break;
        }
        if (videoMimeTypesList.contains(enclosureType)) {
          episode.setMediaType(MediaType.Video);
          break;
        }
      }
    } else {
      episode.setMediaUrl(noMediaUrl);
    }

    if (episode.getMediaUrl() == null) {
      episode.setMediaUrl(noMediaUrl);
    }

    if (noMediaUrl.equals(episode.getMediaUrl())) {
      LOG.warn(
          "PodcastId["
              + podcast.getPodcastId()
              + "] - "
              + "COULD NOT SET MEDIA URL - "
              + "epTitle["
              + entry.getTitle()
              + "]"
              + "feed["
              + podcast.getUrl()
              + "]");
    }

    episode.setPublicationDate(entry.getPublishedDate());
    updatePodcastPublicationDateAndLastMediaUrl(episode, podcast);

    if (episode.getPublicationDate() == null) {
      LOG.warn(
          "PodcastId["
              + podcast.getPodcastId()
              + "] - "
              + "COULD NOT SET publication date "
              + "epTitle["
              + entry.getTitle()
              + "]"
              + "feed["
              + podcast.getUrl()
              + "]");
    }
  }
// Example no. 2
// 0
  /**
   * Parses {@code content} as a syndication feed and turns its entries into outlinks.
   *
   * <p>For every entry the link is resolved against {@code url}, passed through
   * {@code urlFilters}, and wrapped in an {@link Outlink} whose metadata comes from
   * {@code metadataTransfer}. The entry's title, published date and description are
   * stored under the keys {@code feed.title}, {@code feed.publishedDate} and
   * {@code feed.description} when present. Entries without a link, with a link that
   * cannot be resolved, or rejected by the filters are skipped. When
   * {@code filterHoursSincePub} is not {@code -1}, entries published more than that many
   * hours ago are skipped as well.
   *
   * @param url the URL the feed was fetched from; used as the base for relative links
   * @param content the raw bytes of the feed document
   * @param parentMetadata metadata of the feed page, handed to the filters and to the
   *     metadata transfer
   * @return the outlinks found; empty when the content cannot be parsed as a feed
   * @throws MalformedURLException if {@code url} itself is not a valid URL
   */
  private List<Outlink> parseFeed(String url, byte[] content, Metadata parentMetadata)
      throws MalformedURLException {
    List<Outlink> links = new ArrayList<>();

    SyndFeed feed;
    try (ByteArrayInputStream is = new ByteArrayInputStream(content)) {
      SyndFeedInput input = new SyndFeedInput();
      feed = input.build(new InputSource(is));
    } catch (Exception e) {
      // pass the exception as the last argument so the cause and stack trace are logged
      LOG.error("Exception parsing feed from DOM {}", url, e);
      return links;
    }

    URL sURL = new URL(url);

    // the age cutoff is the same for every entry, so compute it once;
    // null means no filtering on the published date
    Date oldestAllowed = null;
    if (filterHoursSincePub != -1) {
      Calendar cutoff = Calendar.getInstance();
      cutoff.add(Calendar.HOUR, -filterHoursSincePub);
      oldestAllowed = cutoff.getTime();
    }

    List<SyndEntry> entries = feed.getEntries();
    for (SyndEntry entry : entries) {
      String targetURL = entry.getLink();

      // some feeds carry entries without a link (e.g. only a guid); nothing to follow
      if (StringUtils.isBlank(targetURL)) {
        continue;
      }

      // build an absolute URL
      try {
        targetURL = URLUtil.resolveURL(sURL, targetURL).toExternalForm();
      } catch (MalformedURLException e) {
        LOG.debug("MalformedURLException on {}", targetURL);
        continue;
      }

      targetURL = urlFilters.filter(sURL, parentMetadata, targetURL);

      if (StringUtils.isBlank(targetURL)) {
        continue;
      }

      Outlink newLink = new Outlink(targetURL);

      Metadata targetMD = metadataTransfer.getMetaForOutlink(targetURL, url, parentMetadata);
      newLink.setMetadata(targetMD);

      String title = entry.getTitle();
      if (StringUtils.isNotBlank(title)) {
        targetMD.setValue("feed.title", title.trim());
      }

      Date publishedDate = entry.getPublishedDate();
      if (publishedDate != null) {
        // filter based on the published date
        if (oldestAllowed != null && publishedDate.before(oldestAllowed)) {
          LOG.info(
              "{} has a published date {} which is more than {} hours old",
              targetURL,
              publishedDate,
              filterHoursSincePub);
          continue;
        }
        targetMD.setValue("feed.publishedDate", publishedDate.toString());
      }

      SyndContent description = entry.getDescription();
      if (description != null && StringUtils.isNotBlank(description.getValue())) {
        targetMD.setValue("feed.description", description.getValue());
      }

      links.add(newLink);
    }

    return links;
  }