/** * Parse the XML document, looking for a <b>feed</b> element to determine if it's an <b>Atom * doc</b> <b>rss</b> to determine if it's an <b>RSS doc</b>. * * @param sitemapUrl * @param doc - XML document to parse * @throws UnknownFormatException if XML does not appear to be Atom or RSS */ private SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc) throws UnknownFormatException { SiteMap sitemap = new SiteMap(sitemapUrl); // See if this is an Atom feed by looking for "feed" element NodeList list = doc.getElementsByTagName("feed"); if (list.getLength() > 0) { parseAtom(sitemap, (Element) list.item(0), doc); sitemap.setProcessed(true); return sitemap; } else { // See if it is a RSS feed by looking for a "channel" element. This avoids the issue // of having the outer tag named <rdf:RDF> that was causing this code to fail. Inside of // the <rss> or <rdf> tag is a <channel> tag, so we can use that. // See https://github.com/crawler-commons/crawler-commons/issues/87 // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec list = doc.getElementsByTagName("channel"); if (list.getLength() > 0) { parseRSS(sitemap, doc); sitemap.setProcessed(true); return sitemap; } else { throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl); } } }
/** * Parse XML that contains a valid Sitemap. Example of a Sitemap: <?xml version="1.0" * encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url> * <loc>http://www.example.com/</loc> <lastmod>2005-01-01</lastmod> * <changefreq>monthly</changefreq> <priority>0.8</priority> </url> <url> <loc * >http://www.example.com/catalog?item=12&desc=vacation_hawaii</loc> * <changefreq>weekly</changefreq> </url> </urlset> * * @param doc */ private SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) { SiteMap sitemap = new SiteMap(sitemapUrl); sitemap.setType(SitemapType.XML); NodeList list = doc.getElementsByTagName("url"); // Loop through the <url>s for (int i = 0; i < list.getLength(); i++) { Node n = list.item(i); if (n.getNodeType() == Node.ELEMENT_NODE) { Element elem = (Element) n; String lastMod = getElementValue(elem, "lastmod"); String changeFreq = getElementValue(elem, "changefreq"); String priority = getElementValue(elem, "priority"); String loc = getElementValue(elem, "loc"); addUrlIntoSitemap(loc, sitemap, lastMod, changeFreq, priority, i); } } sitemap.setProcessed(true); return sitemap; }
/**
 * Process a text-based Sitemap. Text sitemaps only list URLs (one per line) but no priorities,
 * last mods, etc.
 *
 * <p>The content is decoded as UTF-8 after being wrapped in a {@code BOMInputStream}
 * (presumably to drop a leading byte-order mark). Empty lines are skipped. Each non-empty line
 * is passed to {@code addUrlIntoSitemap} together with a 1-based counter, and reading stops as
 * soon as {@code MAX_URLS} lines have been passed on.
 *
 * @param sitemapUrl URL of the sitemap; used to create the resulting sitemap
 * @param content raw bytes of the text sitemap
 * @return the sitemap of type {@code SitemapType.TEXT}, marked as processed
 * @throws IOException if reading the content fails
 */
private SiteMap processText(String sitemapUrl, byte[] content) throws IOException {
    LOG.debug("Processing textual Sitemap");

    SiteMap textSiteMap = new SiteMap(sitemapUrl);
    textSiteMap.setType(SitemapType.TEXT);

    BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(content));
    BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs, "UTF-8"));
    try {
        String line;
        int i = 1;
        // Check the limit before reading: once MAX_URLS lines were handed over nothing more
        // can be added, so there is no point in decoding the rest of the content.
        while (i <= MAX_URLS && (line = reader.readLine()) != null) {
            if (line.length() > 0) {
                addUrlIntoSitemap(line, textSiteMap, null, null, null, i++);
            }
        }
    } finally {
        // Closing the reader also closes the wrapped streams.
        reader.close();
    }

    textSiteMap.setProcessed(true);
    return textSiteMap;
}