Пример #1
0
  /**
   * Builds a {@link SiteMap} from an XML document that is expected to be a syndication feed.
   *
   * <p>The document is handled as Atom when it contains at least one <b>feed</b> element, and as
   * RSS when it instead contains at least one <b>channel</b> element. A document with neither
   * element is rejected.
   *
   * @param sitemapUrl URL used to construct the returned sitemap and quoted in the error message
   * @param doc XML document to inspect and parse
   * @return the sitemap, flagged as processed once the Atom or RSS parsing step has run
   * @throws UnknownFormatException if the document has neither a feed nor a channel element
   */
  private SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc)
      throws UnknownFormatException {

    final SiteMap feedSiteMap = new SiteMap(sitemapUrl);

    // Atom documents are recognised by the presence of a "feed" element; the first one found
    // is handed to the Atom parser.
    final NodeList feedElements = doc.getElementsByTagName("feed");
    if (feedElements.getLength() > 0) {
      parseAtom(feedSiteMap, (Element) feedElements.item(0), doc);
      feedSiteMap.setProcessed(true);
      return feedSiteMap;
    }

    // RSS documents are recognised by a "channel" element rather than by the outer tag: the
    // outer tag may be <rss> or <rdf:RDF>, but both wrap a <channel>, so matching on it avoids
    // the failure seen with <rdf:RDF> documents.
    // See https://github.com/crawler-commons/crawler-commons/issues/87
    // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec
    final NodeList channelElements = doc.getElementsByTagName("channel");
    if (channelElements.getLength() > 0) {
      parseRSS(feedSiteMap, doc);
      feedSiteMap.setProcessed(true);
      return feedSiteMap;
    }

    // Neither marker element was found.
    throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl);
  }
Пример #2
0
  /**
   * Builds a {@link SiteMap} of type {@code SitemapType.XML} from the {@code url} elements of a
   * parsed XML sitemap. Example of such a document:
   *
   * <pre>{@code
   * <?xml version="1.0" encoding="UTF-8"?>
   * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   *   <url>
   *     <loc>http://www.example.com/</loc>
   *     <lastmod>2005-01-01</lastmod>
   *     <changefreq>monthly</changefreq>
   *     <priority>0.8</priority>
   *   </url>
   *   <url>
   *     <loc>http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc>
   *     <changefreq>weekly</changefreq>
   *   </url>
   * </urlset>
   * }</pre>
   *
   * @param sitemapUrl URL used to construct the returned sitemap
   * @param doc parsed XML document whose {@code url} elements are read
   * @return the sitemap, flagged as processed after every {@code url} element has been visited
   */
  private SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) {

    final SiteMap xmlSiteMap = new SiteMap(sitemapUrl);
    xmlSiteMap.setType(SitemapType.XML);

    final NodeList urlNodes = doc.getElementsByTagName("url");

    // Visit each <url> entry in document order.
    for (int position = 0; position < urlNodes.getLength(); position++) {

      final Node candidate = urlNodes.item(position);
      if (candidate.getNodeType() != Node.ELEMENT_NODE) {
        // Only element nodes carry the child values read below.
        continue;
      }

      final Element urlElement = (Element) candidate;
      final String lastModValue = getElementValue(urlElement, "lastmod");
      final String changeFreqValue = getElementValue(urlElement, "changefreq");
      final String priorityValue = getElementValue(urlElement, "priority");
      final String locValue = getElementValue(urlElement, "loc");

      // The zero-based position of the entry in the node list is passed along with its values.
      addUrlIntoSitemap(
          locValue, xmlSiteMap, lastModValue, changeFreqValue, priorityValue, position);
    }

    xmlSiteMap.setProcessed(true);
    return xmlSiteMap;
  }
Пример #3
0
  /**
   * Process a text-based Sitemap. Text sitemaps only list URLs, one per line, with no priorities,
   * last-modified dates, etc.
   *
   * <p>The content is decoded as UTF-8 after being wrapped in a {@code BOMInputStream}
   * (presumably to skip a leading byte-order mark; confirm against the Commons IO version in
   * use). Empty lines are skipped, and at most {@code MAX_URLS} non-empty lines are passed to
   * {@code addUrlIntoSitemap}, each with a 1-based running index.
   *
   * @param sitemapUrl URL of the sitemap, used to construct the returned {@link SiteMap}
   * @param content raw bytes of the text sitemap
   * @return the sitemap of type {@code SitemapType.TEXT}, flagged as processed
   * @throws IOException if reading the decoded content fails
   */
  private SiteMap processText(String sitemapUrl, byte[] content) throws IOException {
    LOG.debug("Processing textual Sitemap");

    SiteMap textSiteMap = new SiteMap(sitemapUrl);
    textSiteMap.setType(SitemapType.TEXT);

    // try-with-resources closes the whole reader chain on every exit path, so no resource
    // warning needs to be suppressed. The charset constant avoids a lookup by name.
    try (BufferedReader reader =
        new BufferedReader(
            new InputStreamReader(
                new BOMInputStream(new ByteArrayInputStream(content)),
                java.nio.charset.StandardCharsets.UTF_8))) {

      String line;
      int urlIndex = 1;
      // Stop reading as soon as MAX_URLS entries have been added: any further lines would be
      // ignored anyway, so scanning them is wasted work.
      while (urlIndex <= MAX_URLS && (line = reader.readLine()) != null) {
        if (line.length() > 0) {
          addUrlIntoSitemap(line, textSiteMap, null, null, null, urlIndex++);
        }
      }
    }
    textSiteMap.setProcessed(true);

    return textSiteMap;
  }